from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

# The processor (text/audio pre-processing) and the acoustic model are both
# restored from the same pretrained checkpoint.
TTS_CHECKPOINT = "microsoft/speecht5_tts"

processor = SpeechT5Processor.from_pretrained(TTS_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(TTS_CHECKPOINT)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.

# Tokenize the input text. SpeechT5Processor bundles two components: a feature
# extractor (for audio) and a SentencePiece-based tokenizer (for text).
# The `text=` argument is handled by the tokenizer.
inputs = processor(text="hello all, welcome to speech synthesis tutorial", return_tensors="pt")

# Display the token IDs as a PyTorch tensor (requested via return_tensors="pt").
inputs.input_ids

tensor([[ 4, 11,  5, 15, 15,  8,  4,  7, 15, 15, 23,  4, 20,  5, 15, 17,  8, 18,
          5,  4,  6,  8,  4, 12, 24,  5,  5, 17, 11,  4, 12, 22,  9,  6, 11,  5,
         12, 10, 12,  4,  6, 16,  6,  8, 13, 10,  7, 15,  2]])

%%capture
!pip install datasets

from datasets import load_dataset
import torch

# Load the validation split of the precomputed CMU ARCTIC speaker x-vectors
# from the Hugging Face Hub.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

# Evaluating the dataset as the last expression displays its features and row count.
embeddings_dataset

Dataset({
    features: ['filename', 'xvector'],
    num_rows: 7931
})

# Inspect a single row: the length of its x-vector and the utterance it came from.
row = embeddings_dataset[7500]
print(len(row['xvector']))
print(row['filename'])

512
cmu_us_slt_arctic-wav-arctic_b0109

# Use an x-vector of the "slt" (US female) speaker as the target voice.
slt_xvector = embeddings_dataset[7306]["xvector"]
# unsqueeze(0) prepends a dimension of size 1 to the vector.
speaker_embeddings = torch.tensor(slt_xvector).unsqueeze(0)
speaker_embeddings.shape

torch.Size([1, 512])

# No vocoder is passed here, so the result is a spectrogram rather than audio.
spectrogram = model.generate_speech(
    inputs.input_ids,
    speaker_embeddings=speaker_embeddings,
)

import matplotlib.pyplot as plt
import librosa.display

# Convert the tensor to a numpy array (moved to CPU first so this also works
# when the model runs on an accelerator).
spectrogram_np = spectrogram.cpu().numpy()

# Display the spectrogram.
# The array is transposed so that time runs along the x-axis.
# sr / hop_length / fmin / fmax follow the SpeechT5 feature-extractor defaults
# (16 kHz audio, 16 ms hop = 256 samples, 80-7600 Hz mel filterbank) so the
# time and mel axes are labelled correctly.
plt.figure(figsize=(10, 4))
librosa.display.specshow(
    spectrogram_np.T,
    sr=16000,
    hop_length=256,
    fmin=80,
    fmax=7600,
    x_axis="time",
    y_axis="mel",
)
# SpeechT5 produces a log10 mel spectrogram, not decibels, so the colorbar is
# labelled accordingly; one decimal keeps ticks distinct over the small range.
plt.colorbar(format="%+.1f", label="log10 mel amplitude")
plt.title("Spectrogram")
plt.show()

# Shape is (number of frames, number of mel bins).
spectrogram.shape

torch.Size([216, 80])

# Load the pretrained HiFi-GAN vocoder checkpoint for SpeechT5.
from transformers import SpeechT5HifiGan

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Same call as for the spectrogram, but with the vocoder supplied so the
# result is audio that can be played back directly.
speech = model.generate_speech(
    inputs.input_ids,
    speaker_embeddings=speaker_embeddings,
    vocoder=vocoder,
)

from IPython.display import Audio

# Render an inline audio player for the generated speech at 16 kHz.
Audio(data=speech, rate=16000)

Introduction to Language and Speech Technology - ReMA (RU)

SpeechT5