NMachine committed on
Commit
4eb1a13
·
verified ·
1 Parent(s): 000c9b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -19
app.py CHANGED
@@ -4,19 +4,19 @@ from snac import SNAC
4
  import soundfile as sf
5
  import gradio as gr
6
 
7
- # Load models
 
8
  model = AutoModelForCausalLM.from_pretrained(
9
  "maya-research/maya1",
10
- torch_dtype=torch.bfloat16,
11
- device_map="auto"
12
  )
13
  tokenizer = AutoTokenizer.from_pretrained("maya-research/maya1")
14
- snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to("cuda")
15
 
16
- # Main generation function
17
  def generate_voice(description, text):
18
  prompt = f'<description="{description}"> {text}'
19
- inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
20
  with torch.inference_mode():
21
  outputs = model.generate(
22
  **inputs,
@@ -25,28 +25,30 @@ def generate_voice(description, text):
25
  top_p=0.9,
26
  do_sample=True
27
  )
28
-
29
  generated_ids = outputs[0, inputs['input_ids'].shape[1]:]
30
  snac_tokens = [t.item() for t in generated_ids if 128266 <= t <= 156937]
31
-
32
  frames = len(snac_tokens) // 7
33
  codes = [[], [], []]
34
  for i in range(frames):
35
- s = snac_tokens[i*7:(i+1)*7]
36
- codes[0].append((s[0]-128266) % 4096)
37
- codes[1].extend([(s[1]-128266) % 4096, (s[4]-128266) % 4096])
38
- codes[2].extend([(s[2]-128266) % 4096, (s[3]-128266) % 4096,
39
- (s[5]-128266) % 4096, (s[6]-128266) % 4096])
40
-
41
- codes_tensor = [torch.tensor(c, dtype=torch.long, device="cuda").unsqueeze(0) for c in codes]
 
 
 
 
 
 
42
  with torch.inference_mode():
43
  audio = snac_model.decoder(snac_model.quantizer.from_codes(codes_tensor))[0, 0].cpu().numpy()
44
-
45
  out_path = "output.wav"
46
  sf.write(out_path, audio, 24000)
47
  return out_path
48
 
49
- # Gradio interface — no preset text, fully user-controlled
50
  demo = gr.Interface(
51
  fn=generate_voice,
52
  inputs=[
@@ -54,8 +56,8 @@ demo = gr.Interface(
54
  gr.Textbox(label="Text to Speak (type anything you want)")
55
  ],
56
  outputs=gr.Audio(label="Generated Speech"),
57
- title="🎙️ Maya1 Voice Generator",
58
- description="Generate expressive emotional speech using the open-source Maya1 + SNAC pipeline."
59
  )
60
 
61
  if __name__ == "__main__":
 
4
  import soundfile as sf
5
  import gradio as gr
6
 
7
+ device = "cpu"
8
+
9
  model = AutoModelForCausalLM.from_pretrained(
10
  "maya-research/maya1",
11
+ dtype=torch.bfloat16,
12
+ device_map=None
13
  )
14
  tokenizer = AutoTokenizer.from_pretrained("maya-research/maya1")
15
+ snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(device)
16
 
 
17
  def generate_voice(description, text):
18
  prompt = f'<description="{description}"> {text}'
19
+ inputs = tokenizer(prompt, return_tensors="pt").to(device)
20
  with torch.inference_mode():
21
  outputs = model.generate(
22
  **inputs,
 
25
  top_p=0.9,
26
  do_sample=True
27
  )
 
28
  generated_ids = outputs[0, inputs['input_ids'].shape[1]:]
29
  snac_tokens = [t.item() for t in generated_ids if 128266 <= t <= 156937]
 
30
  frames = len(snac_tokens) // 7
31
  codes = [[], [], []]
32
  for i in range(frames):
33
+ s = snac_tokens[i * 7:(i + 1) * 7]
34
+ codes[0].append((s[0] - 128266) % 4096)
35
+ codes[1].extend([(s[1] - 128266) % 4096, (s[4] - 128266) % 4096])
36
+ codes[2].extend([
37
+ (s[2] - 128266) % 4096,
38
+ (s[3] - 128266) % 4096,
39
+ (s[5] - 128266) % 4096,
40
+ (s[6] - 128266) % 4096
41
+ ])
42
+ codes_tensor = [
43
+ torch.tensor(c, dtype=torch.long, device=device).unsqueeze(0)
44
+ for c in codes
45
+ ]
46
  with torch.inference_mode():
47
  audio = snac_model.decoder(snac_model.quantizer.from_codes(codes_tensor))[0, 0].cpu().numpy()
 
48
  out_path = "output.wav"
49
  sf.write(out_path, audio, 24000)
50
  return out_path
51
 
 
52
  demo = gr.Interface(
53
  fn=generate_voice,
54
  inputs=[
 
56
  gr.Textbox(label="Text to Speak (type anything you want)")
57
  ],
58
  outputs=gr.Audio(label="Generated Speech"),
59
+ title="🎙️ Maya1 Voice Generator (CPU-only)",
60
+ description="Generate expressive emotional speech using Maya1 + SNAC on CPU."
61
  )
62
 
63
  if __name__ == "__main__":