%%capture
# Placing %%capture at the beginning of a code cell prevents that cell's output from being displayed in the notebook unless you explicitly request it later.
# %%capture is one of the IPython "magic commands"; Colab supports many of them. You can learn more about them in the notebook linked below.
# https://colab.research.google.com/github/jdwittenauer/ipython-notebooks/blob/master/notebooks/language/IPythonMagic.ipynb
! pip install transformers
! pip install datasets
! pip install librosa
! pip install soundfile
! pip install jiwer

from huggingface_hub import notebook_login
notebook_login()

!wget --no-check-certificate 'https://upload.wikimedia.org/wikipedia/commons/f/f6/Appuru.wav'

--2024-12-18 15:39:34--  https://upload.wikimedia.org/wikipedia/commons/f/f6/Appuru.wav
Resolving upload.wikimedia.org (upload.wikimedia.org)... 198.35.26.112, 2620:0:863:ed1a::2:b
Connecting to upload.wikimedia.org (upload.wikimedia.org)|198.35.26.112|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 300098 (293K) [audio/x-wav]
Saving to: ‘Appuru.wav’

Appuru.wav          100%[===================>] 293.06K  1.73MB/s    in 0.2s    

2024-12-18 15:39:34 (1.73 MB/s) - ‘Appuru.wav’ saved [300098/300098]

from IPython.display import Audio

# Play the audio file
Audio('/content/Appuru.wav')

# Import required libraries
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor  # Libraries to use the Wav2Vec2 model for speech recognition
import librosa    # Library to handle audio files
import torch      # PyTorch library for tensor operations
from transformers import logging # Managing the warnings
logging.set_verbosity_error()


# Load the pre-trained Wav2Vec2 model and its matching processor.
# The checkpoint name indicates an XLSR-53 model fine-tuned for English speech recognition.
model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")

# Read the audio file with librosa.
# 'sr=16000' resamples the clip to 16 kHz; 'rate' is the resulting sample rate.
audio, rate = librosa.load("/content/Appuru.wav", sr=16000)

# Convert the waveform into model input values.
# The sample rate returned by librosa is passed on so the two can never disagree.
# 'padding="longest"' pads inputs to the longest item in the batch (a single clip here).
input_values = processor(audio, sampling_rate=rate, return_tensors="pt", padding="longest").input_values

# Run the model to get the logits (unnormalized per-frame scores for each vocabulary entry).
# This is inference only, so gradient tracking is switched off to avoid building an autograd graph.
with torch.no_grad():
    logits = model(input_values).logits

# Pick the highest-scoring vocabulary id for every time step.
prediction = torch.argmax(logits, dim=-1)

# Decode the per-frame ids into text; [0] selects the first item of the batch.
transcription = processor.batch_decode(prediction)[0]

transcription

'the apple does not fall far from the tree'

# Normalize the logits along the last axis so the scores of each frame sum to 1.
probs = torch.nn.functional.softmax(logits, dim=-1)

tensor([[[1.0000e+00, 6.3755e-15, 5.1981e-15,  ..., 1.3486e-09,
          1.1472e-08, 6.0117e-09],
         [1.0000e+00, 4.0153e-16, 3.3276e-16,  ..., 4.1281e-10,
          3.5719e-09, 6.4843e-10],
         [1.0000e+00, 2.7224e-17, 2.1026e-17,  ..., 1.0491e-10,
          1.3314e-09, 1.1527e-10],
         ...,
         [1.0000e+00, 1.3515e-16, 1.1106e-16,  ..., 3.7944e-10,
          2.6893e-09, 5.3555e-10],
         [1.0000e+00, 1.8683e-15, 1.4673e-15,  ..., 1.5056e-09,
          7.8347e-09, 2.4061e-09],
         [2.9775e-06, 1.2535e-11, 1.2222e-11,  ..., 1.3962e-08,
          9.4920e-08, 2.1285e-09]]])

probs.shape

torch.Size([1, 156, 33])

probs[0][21]

tensor([4.6639e-07, 2.3321e-09, 2.4124e-09, 3.0503e-07, 1.7871e-06, 6.3960e-07,
        5.3728e-08, 7.8180e-06, 6.0511e-08, 9.6134e-07, 5.6119e-07, 9.9985e-01,
        2.9169e-07, 5.4118e-07, 5.9517e-07, 1.1887e-04, 8.2797e-08, 6.2184e-07,
        2.1768e-07, 3.9737e-07, 2.7391e-06, 6.7354e-07, 1.8673e-07, 6.0831e-08,
        4.7723e-07, 5.7472e-06, 2.2915e-06, 1.9844e-07, 4.0904e-07, 6.9178e-08,
        1.5162e-07, 1.5691e-06, 9.7995e-08])

prediction

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         26, 14,  0, 11,  0,  4,  4,  0,  0,  0,  0,  7,  0,  0, 22,  0,  0,  0,
          0, 22,  0,  0, 18,  0,  0,  0, 11,  0,  0,  0,  4,  4,  0,  0,  0,  0,
         10,  0,  0, 21, 21, 11, 11,  0, 25,  0,  4,  0,  0,  0, 20,  0,  0, 21,
          0,  0,  0, 26,  0,  4,  0, 12,  0,  0,  0,  7, 18,  0,  0, 18,  0,  4,
          0, 12,  0,  0,  0,  0,  7,  0,  0, 24,  0,  0,  4,  0, 12,  0, 24,  0,
         21,  0, 19,  0,  4,  4,  0, 26, 14,  0, 11,  0,  4,  4, 26,  0,  0,  0,
         24,  0,  0, 11,  0,  0,  0,  0, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  4]])

# Fetch the tokenizer's token -> id table and invert it into an id -> token lookup
vocab_dict = processor.tokenizer.get_vocab()
id_to_token = dict(zip(vocab_dict.values(), vocab_dict.keys()))

# Look up the token for every frame-level id of the first batch item
predicted_tokens = list(map(id_to_token.__getitem__, prediction[0].tolist()))

# Join the tokens with single spaces and show the raw per-frame sequence
Predicted_sequence = " ".join(predicted_tokens)
Predicted_sequence

'<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> t h <pad> e <pad> | | <pad> <pad> <pad> <pad> a <pad> <pad> p <pad> <pad> <pad> <pad> p <pad> <pad> l <pad> <pad> <pad> e <pad> <pad> <pad> | | <pad> <pad> <pad> <pad> d <pad> <pad> o o e e <pad> s <pad> | <pad> <pad> <pad> n <pad> <pad> o <pad> <pad> <pad> t <pad> | <pad> f <pad> <pad> <pad> a l <pad> <pad> l <pad> | <pad> f <pad> <pad> <pad> <pad> a <pad> <pad> r <pad> <pad> | <pad> f <pad> r <pad> o <pad> m <pad> | | <pad> t h <pad> e <pad> | | t <pad> <pad> <pad> r <pad> <pad> e <pad> <pad> <pad> <pad> e <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> |'

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa
import torch
import warnings
import numpy as np

warnings.filterwarnings("ignore")

# Instantiate the English Wav2Vec2 checkpoint together with its processor
model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")

# Read the clip, resampled to 16 kHz
audio, rate = librosa.load("/content/Appuru.wav", sr=16000)

# Turn the waveform into padded model inputs
input_values = processor(
    audio,
    sampling_rate=16_000,
    return_tensors="pt",
    padding="longest",
).input_values

# Forward pass without gradient tracking
with torch.no_grad():
    logits = model(input_values).logits  # raw per-frame scores

# Softmax over the last axis, then move the result to a NumPy array
probs = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()

# Take the best id per frame for the first batch item and decode it with word offsets
outputs = processor.decode(probs.argmax(axis=-1)[0], output_word_offsets=True)

# Seconds covered by one logit frame: input samples per frame / samples per second
time_offset = model.config.inputs_to_logits_ratio / processor.feature_extractor.sampling_rate
word_offsets = [
    dict(
        word=entry["word"],
        start_time=round(entry["start_offset"] * time_offset, 2),
        end_time=round(entry["end_offset"] * time_offset, 2),
    )
    for entry in outputs["word_offsets"]
]

# Show the timed words followed by the plain transcription
print(word_offsets)
print(outputs["text"])

[{'word': 'the', 'start_time': 0.36, 'end_time': 0.44}, {'word': 'apple', 'start_time': 0.58, 'end_time': 0.9}, {'word': 'does', 'start_time': 1.08, 'end_time': 1.26}, {'word': 'not', 'start_time': 1.36, 'end_time': 1.52}, {'word': 'fall', 'start_time': 1.58, 'end_time': 1.76}, {'word': 'far', 'start_time': 1.82, 'end_time': 2.0}, {'word': 'from', 'start_time': 2.08, 'end_time': 2.22}, {'word': 'the', 'start_time': 2.3, 'end_time': 2.38}, {'word': 'tree', 'start_time': 2.44, 'end_time': 2.7}]
the apple does not fall far from the tree

#@title
from IPython.display import Javascript, display
from google.colab import output
import base64
import io

def record():
    """Record microphone audio in the browser and send it to the kernel.

    Displays a JavaScript snippet that requests microphone access, starts a
    ``MediaRecorder``, shows an alert, waits 7000 ms, stops the recorder,
    releases the microphone, and passes the base64-encoded recording to the
    kernel callback registered under ``'notebook.recorded_audio'``.

    The Blob is created without an explicit MIME type, so the payload is in
    whatever container the browser's ``MediaRecorder`` produced.
    """
    js = Javascript("""
    async function recordAudio() {
        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
        const mediaRecorder = new MediaRecorder(stream);
        let audioChunks = [];

        mediaRecorder.ondataavailable = event => {
            audioChunks.push(event.data);
        };

        mediaRecorder.start();

        alert('Recording started! Please start speaking in English, around 7 seconds.');  // Alert to inform the user
        await new Promise(resolve => setTimeout(resolve, 7000)); // Record for 7 seconds
        mediaRecorder.stop();

        await new Promise(resolve => mediaRecorder.onstop = resolve);

        // Release the microphone so the browser stops capturing once recording is done
        stream.getTracks().forEach(track => track.stop());

        const audioBlob = new Blob(audioChunks);
        const reader = new FileReader();
        reader.readAsDataURL(audioBlob);
        reader.onloadend = () => {
            const base64data = reader.result.split(',')[1];
            google.colab.kernel.invokeFunction('notebook.recorded_audio', [base64data], {});
        };
    }

    recordAudio();
    """)
    display(js)

audio_data = None  # raw bytes of the most recent recording; set by save_audio


def save_audio(data):
    """Decode a base64 recording, keep it in ``audio_data`` and write it to disk.

    Args:
        data: Base64-encoded audio payload (the JavaScript recorder sends the
            part of the data URL after the comma).

    The decoded bytes are written unchanged to 'recorded_English_audio.wav'
    in the current working directory; no audio conversion is performed.
    """
    # Bind the module-level variable so the decoded bytes stay available
    # after the callback returns.
    global audio_data
    audio_data = base64.b64decode(data)
    with open('recorded_English_audio.wav', 'wb') as f:
        f.write(audio_data)
    print("Audio saved as 'recorded_English_audio.wav'")

# Expose save_audio to the browser under the name the JavaScript recorder invokes.
output.register_callback('notebook.recorded_audio', save_audio)

# Display the recorder snippet, which starts the recording flow in the browser.
record()

Audio saved as 'recorded_English_audio.wav'

from IPython.display import Audio

# Play the audio file
Audio('/content/recorded_English_audio.wav')

from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
import warnings
import librosa

# Suppress FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Load model and processor
# WhisperProcessor prepares the audio (and decodes token ids) for the model.
# WhisperForConditionalGeneration is the pre-trained Whisper model used for speech-to-text.
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

# Clear any decoder prompt ids stored in the model config so that generate()
# is not pinned to a preset language/task and can pick those tokens itself.
# NOTE(review): what this checkpoint's config ships by default is not visible here — confirm against the model card.
model.config.forced_decoder_ids = None


# Load the recorded clip resampled to 16 kHz and convert it to model input features
audio, rate = librosa.load("/content/recorded_English_audio.wav", sr=16000)
input_features = processor(audio, sampling_rate=rate, return_tensors="pt").input_features

# Generate token ids
# The model consumes the input features and produces the predicted text as token ids.
predicted_ids = model.generate(input_features)

# Decode token ids to text, keeping special tokens
# skip_special_tokens=False keeps special tokens (such as language and task markers) in the output.
transcription_with_special_tokens = processor.batch_decode(predicted_ids, skip_special_tokens=False)

# Decode token ids to text, without special tokens
# The same ids are decoded again with special tokens dropped, giving a cleaner transcription.
transcription_clean = processor.batch_decode(predicted_ids, skip_special_tokens=True)

# Output both transcriptions for comparison
print("Transcription with special tokens:", transcription_with_special_tokens)
print("Clean transcription:", transcription_clean)

<ipython-input-9-f53abb2c8390>:21: UserWarning: PySoundFile failed. Trying audioread instead.
  audio, rate = librosa.load("/content/recorded_English_audio.wav", sr=16000)

Transcription with special tokens: ['<|startoftranscript|><|en|><|transcribe|><|notimestamps|> And yes, it will take around like 7 seconds and your audio will be saved.']
Clean transcription: [' And yes, it will take around like 7 seconds and your audio will be saved.']

from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import Audio, load_dataset

# Load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

# Build decoder prompt ids that pin the language to Dutch and the task to "transcribe".
# These are passed to generate() below so the model does not choose the language itself.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="dutch", task="transcribe")

# Load the local recording resampled to 16 kHz and convert it to model input features.
# librosa is not imported in this cell; it must already be imported by an earlier cell.
# NOTE(review): the file name suggests an English recording, so the forced language presumably does not match the audio on purpose — confirm.
audio, rate = librosa.load("/content/recorded_English_audio.wav", sr=16000)
input_features = processor(audio, sampling_rate=rate, return_tensors="pt").input_features

# Generate token ids using the forced Dutch/transcribe prompt
predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)

# Decode token ids to text, keeping special tokens
# skip_special_tokens=False keeps special tokens (such as language and task markers) in the output.
transcription_with_special_tokens = processor.batch_decode(predicted_ids, skip_special_tokens=False)

# Decode token ids to text, without special tokens
# The same ids are decoded again with special tokens dropped, giving a cleaner transcription.
transcription_clean = processor.batch_decode(predicted_ids, skip_special_tokens=True)

# Output both transcriptions for comparison
print("Transcription with special tokens:", transcription_with_special_tokens)
print("Clean transcription:", transcription_clean)

<ipython-input-10-5718825a4854>:14: UserWarning: PySoundFile failed. Trying audioread instead.
  audio, rate = librosa.load("/content/recorded_English_audio.wav", sr=16000)

Transcription with special tokens: ['<|startoftranscript|><|nl|><|transcribe|><|notimestamps|> En ja, het wil ik een keer op een 7 seconds. En je wil bezien.']
Clean transcription: [' En ja, het wil ik een keer op een 7 seconds. En je wil bezien.']

#@title
from IPython.display import Javascript, display
from google.colab import output
import base64
import io

def record():
    """Record microphone audio in the browser and send it to the kernel.

    Displays a JavaScript snippet that requests microphone access, starts a
    ``MediaRecorder``, shows an alert, waits 7000 ms, stops the recorder,
    releases the microphone, and passes the base64-encoded recording to the
    kernel callback registered under ``'notebook.recorded_audio'``.

    The Blob is created without an explicit MIME type, so the payload is in
    whatever container the browser's ``MediaRecorder`` produced.
    """
    js = Javascript("""
    async function recordAudio() {
        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
        const mediaRecorder = new MediaRecorder(stream);
        let audioChunks = [];

        mediaRecorder.ondataavailable = event => {
            audioChunks.push(event.data);
        };

        mediaRecorder.start();

        alert('Recording started! Please start speaking in Dutch, around 7 seconds.');  // Alert to inform the user
        await new Promise(resolve => setTimeout(resolve, 7000)); // Record for 7 seconds
        mediaRecorder.stop();

        await new Promise(resolve => mediaRecorder.onstop = resolve);

        // Release the microphone so the browser stops capturing once recording is done
        stream.getTracks().forEach(track => track.stop());

        const audioBlob = new Blob(audioChunks);
        const reader = new FileReader();
        reader.readAsDataURL(audioBlob);
        reader.onloadend = () => {
            const base64data = reader.result.split(',')[1];
            google.colab.kernel.invokeFunction('notebook.recorded_audio', [base64data], {});
        };
    }

    recordAudio();
    """)
    display(js)

audio_data = None  # raw bytes of the most recent recording; set by save_audio


def save_audio(data):
    """Decode a base64 recording, keep it in ``audio_data`` and write it to disk.

    Args:
        data: Base64-encoded audio payload (the JavaScript recorder sends the
            part of the data URL after the comma).

    The decoded bytes are written unchanged to 'recorded_Dutch_audio.wav'
    in the current working directory; no audio conversion is performed.
    """
    # Bind the module-level variable so the decoded bytes stay available
    # after the callback returns.
    global audio_data
    audio_data = base64.b64decode(data)
    with open('recorded_Dutch_audio.wav', 'wb') as f:
        f.write(audio_data)
    print("Audio saved as 'recorded_Dutch_audio.wav'")

# Expose save_audio to the browser under the name the JavaScript recorder invokes.
# Registering the same name again replaces whichever callback was registered before.
output.register_callback('notebook.recorded_audio', save_audio)

# Display the recorder snippet, which starts the recording flow in the browser.
record()

from IPython.display import Audio

# Play the audio file
Audio('/content/recorded_Dutch_audio.wav')

import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

# Build decoder prompt ids for the "translate" task instead of "transcribe".
# In Whisper, 'language' names the language spoken in the audio (Dutch here), not the output language.
# NOTE(review): per the Whisper documentation the translate task produces English text — confirm with the run output.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="dutch", task="translate")

# Load the local Dutch recording resampled to 16 kHz and convert it to model input features

audio, rate = librosa.load("/content/recorded_Dutch_audio.wav", sr=16000)
input_features = processor(audio, sampling_rate=rate, return_tensors="pt").input_features

# Generate token ids using the forced Dutch/translate prompt
predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)

# Decode token ids to text, keeping special tokens
# skip_special_tokens=False keeps special tokens (such as language and task markers) in the output.
# The variable names say "transcription", but with task="translate" they hold the translated text.
transcription_with_special_tokens = processor.batch_decode(predicted_ids, skip_special_tokens=False)

# Decode token ids to text, without special tokens
# The same ids are decoded again with special tokens dropped, giving a cleaner result.
transcription_clean = processor.batch_decode(predicted_ids, skip_special_tokens=True)

# Output both decodings for comparison
print("Transcription with special tokens:", transcription_with_special_tokens)
print("Clean transcription:", transcription_clean)

! pip install TTS

Collecting TTS
  Downloading TTS-0.22.0-cp310-cp310-manylinux1_x86_64.whl.metadata (21 kB)
Requirement already satisfied: cython>=0.29.30 in /usr/local/lib/python3.10/dist-packages (from TTS) (3.0.11)
Requirement already satisfied: scipy>=1.11.2 in /usr/local/lib/python3.10/dist-packages (from TTS) (1.13.1)
Requirement already satisfied: torch>=2.1 in /usr/local/lib/python3.10/dist-packages (from TTS) (2.5.0+cu121)
Requirement already satisfied: torchaudio in /usr/local/lib/python3.10/dist-packages (from TTS) (2.5.0+cu121)
Requirement already satisfied: soundfile>=0.12.0 in /usr/local/lib/python3.10/dist-packages (from TTS) (0.12.1)
Requirement already satisfied: librosa>=0.10.0 in /usr/local/lib/python3.10/dist-packages (from TTS) (0.10.2.post1)
Requirement already satisfied: scikit-learn>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from TTS) (1.5.2)
Requirement already satisfied: inflect>=5.6.0 in /usr/local/lib/python3.10/dist-packages (from TTS) (7.4.0)
Requirement already satisfied: tqdm>=4.64.1 in /usr/local/lib/python3.10/dist-packages (from TTS) (4.66.6)
Collecting anyascii>=0.3.0 (from TTS)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Requirement already satisfied: pyyaml>=6.0 in /usr/local/lib/python3.10/dist-packages (from TTS) (6.0.2)
Requirement already satisfied: fsspec>=2023.6.0 in /usr/local/lib/python3.10/dist-packages (from TTS) (2024.10.0)
Requirement already satisfied: aiohttp>=3.8.1 in /usr/local/lib/python3.10/dist-packages (from TTS) (3.10.10)
Requirement already satisfied: packaging>=23.1 in /usr/local/lib/python3.10/dist-packages (from TTS) (24.1)
Requirement already satisfied: flask>=2.0.1 in /usr/local/lib/python3.10/dist-packages (from TTS) (2.2.5)
Collecting pysbd>=0.3.4 (from TTS)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting umap-learn>=0.5.1 (from TTS)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pandas<2.0,>=1.4 (from TTS)
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Requirement already satisfied: matplotlib>=3.7.0 in /usr/local/lib/python3.10/dist-packages (from TTS) (3.8.0)
Collecting trainer>=0.0.32 (from TTS)
  Downloading trainer-0.0.36-py3-none-any.whl.metadata (8.1 kB)
Collecting coqpit>=0.0.16 (from TTS)
  Downloading coqpit-0.0.17-py3-none-any.whl.metadata (11 kB)
Requirement already satisfied: jieba in /usr/local/lib/python3.10/dist-packages (from TTS) (0.42.1)
Collecting pypinyin (from TTS)
  Downloading pypinyin-0.53.0-py2.py3-none-any.whl.metadata (12 kB)
Collecting hangul-romanize (from TTS)
  Downloading hangul_romanize-0.1.0-py3-none-any.whl.metadata (1.2 kB)
Collecting gruut==2.2.3 (from gruut[de,es,fr]==2.2.3->TTS)
  Downloading gruut-2.2.3.tar.gz (73 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 73.5/73.5 kB 6.1 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting jamo (from TTS)
  Downloading jamo-0.4.1-py3-none-any.whl.metadata (2.3 kB)
Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from TTS) (3.8.1)
Collecting g2pkk>=0.1.1 (from TTS)
  Downloading g2pkk-0.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting bangla (from TTS)
  Downloading bangla-0.0.2-py2.py3-none-any.whl.metadata (4.5 kB)
Collecting bnnumerizer (from TTS)
  Downloading bnnumerizer-0.0.2.tar.gz (4.7 kB)
  Preparing metadata (setup.py) ... done
Collecting bnunicodenormalizer (from TTS)
  Downloading bnunicodenormalizer-0.1.7-py3-none-any.whl.metadata (22 kB)
Requirement already satisfied: einops>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from TTS) (0.8.0)
Requirement already satisfied: transformers>=4.33.0 in /usr/local/lib/python3.10/dist-packages (from TTS) (4.44.2)
Collecting encodec>=0.1.1 (from TTS)
  Downloading encodec-0.1.1.tar.gz (3.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.7/3.7 MB 72.7 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting unidecode>=1.3.2 (from TTS)
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Collecting num2words (from TTS)
  Downloading num2words-0.5.13-py3-none-any.whl.metadata (12 kB)
Requirement already satisfied: spacy>=3 in /usr/local/lib/python3.10/dist-packages (from spacy[ja]>=3->TTS) (3.7.5)
Collecting numpy==1.22.0 (from TTS)
  Downloading numpy-1.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB)
Requirement already satisfied: numba>=0.57.0 in /usr/local/lib/python3.10/dist-packages (from TTS) (0.60.0)
Requirement already satisfied: Babel<3.0.0,>=2.8.0 in /usr/local/lib/python3.10/dist-packages (from gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS) (2.16.0)
Collecting dateparser~=1.1.0 (from gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS)
  Downloading dateparser-1.1.8-py2.py3-none-any.whl.metadata (27 kB)
Collecting gruut-ipa<1.0,>=0.12.0 (from gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS)
  Downloading gruut-ipa-0.13.0.tar.gz (101 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 101.6/101.6 kB 7.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting gruut_lang_en~=2.0.0 (from gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS)
  Downloading gruut_lang_en-2.0.1.tar.gz (15.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 15.3/15.3 MB 108.4 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting jsonlines~=1.2.0 (from gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS)
  Downloading jsonlines-1.2.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting networkx<3.0.0,>=2.5.0 (from gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS)
  Downloading networkx-2.8.8-py3-none-any.whl.metadata (5.1 kB)
Collecting python-crfsuite~=0.9.7 (from gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting gruut_lang_de~=2.0.0 (from gruut[de,es,fr]==2.2.3->TTS)
  Downloading gruut_lang_de-2.0.1.tar.gz (18.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.1/18.1 MB 20.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting gruut_lang_fr~=2.0.0 (from gruut[de,es,fr]==2.2.3->TTS)
  Downloading gruut_lang_fr-2.0.2.tar.gz (10.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.9/10.9 MB 22.0 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting gruut_lang_es~=2.0.0 (from gruut[de,es,fr]==2.2.3->TTS)
  Downloading gruut_lang_es-2.0.1.tar.gz (31.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 31.4/31.4 MB 16.1 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp>=3.8.1->TTS) (2.4.3)
Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp>=3.8.1->TTS) (1.3.1)
Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp>=3.8.1->TTS) (24.2.0)
Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp>=3.8.1->TTS) (1.5.0)
Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp>=3.8.1->TTS) (6.1.0)
Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp>=3.8.1->TTS) (1.17.0)
Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp>=3.8.1->TTS) (4.0.3)
Requirement already satisfied: Werkzeug>=2.2.2 in /usr/local/lib/python3.10/dist-packages (from flask>=2.0.1->TTS) (3.0.6)
Requirement already satisfied: Jinja2>=3.0 in /usr/local/lib/python3.10/dist-packages (from flask>=2.0.1->TTS) (3.1.4)
Requirement already satisfied: itsdangerous>=2.0 in /usr/local/lib/python3.10/dist-packages (from flask>=2.0.1->TTS) (2.2.0)
Requirement already satisfied: click>=8.0 in /usr/local/lib/python3.10/dist-packages (from flask>=2.0.1->TTS) (8.1.7)
Requirement already satisfied: more-itertools>=8.5.0 in /usr/local/lib/python3.10/dist-packages (from inflect>=5.6.0->TTS) (10.5.0)
Requirement already satisfied: typeguard>=4.0.1 in /usr/local/lib/python3.10/dist-packages (from inflect>=5.6.0->TTS) (4.4.0)
Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.10.0->TTS) (3.0.1)
INFO: pip is looking at multiple versions of librosa to determine which version is compatible with other requirements. This could take a while.
Collecting librosa>=0.10.0 (from TTS)
  Downloading librosa-0.10.2-py3-none-any.whl.metadata (8.6 kB)
  Downloading librosa-0.10.1-py3-none-any.whl.metadata (8.3 kB)
  Downloading librosa-0.10.0.post2-py3-none-any.whl.metadata (8.3 kB)
  Downloading librosa-0.10.0.post1-py3-none-any.whl.metadata (8.3 kB)
  Downloading librosa-0.10.0-py3-none-any.whl.metadata (8.3 kB)
Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.10.0->TTS) (1.4.2)
Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.10.0->TTS) (4.4.2)
Requirement already satisfied: pooch>=1.0 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.10.0->TTS) (1.8.2)
Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.10.0->TTS) (0.5.0.post1)
Requirement already satisfied: typing-extensions>=4.1.1 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.10.0->TTS) (4.12.2)
Requirement already satisfied: lazy-loader>=0.1 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.10.0->TTS) (0.4)
Requirement already satisfied: msgpack>=1.0 in /usr/local/lib/python3.10/dist-packages (from librosa>=0.10.0->TTS) (1.1.0)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.7.0->TTS) (1.3.0)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.7.0->TTS) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.7.0->TTS) (4.54.1)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.7.0->TTS) (1.4.7)
Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.7.0->TTS) (10.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.7.0->TTS) (3.2.0)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.7.0->TTS) (2.8.2)
Collecting docopt>=0.6.2 (from num2words->TTS)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... done
Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba>=0.57.0->TTS) (0.43.0)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas<2.0,>=1.4->TTS) (2024.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=1.3.0->TTS) (3.5.0)
INFO: pip is looking at multiple versions of scipy to determine which version is compatible with other requirements. This could take a while.
Collecting scipy>=1.11.2 (from TTS)
  Downloading scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.8/60.8 kB 4.9 MB/s eta 0:00:00
  Downloading scipy-1.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.8/60.8 kB 5.0 MB/s eta 0:00:00
  Downloading scipy-1.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.6/60.6 kB 4.8 MB/s eta 0:00:00
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.4/60.4 kB 4.4 MB/s eta 0:00:00
  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.4/60.4 kB 4.7 MB/s eta 0:00:00
Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.10/dist-packages (from soundfile>=0.12.0->TTS) (1.17.1)
Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS) (3.0.12)
Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS) (1.0.5)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS) (1.0.10)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS) (2.0.8)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS) (3.0.9)
Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS) (8.2.5)
Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS) (1.1.3)
Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS) (2.4.8)
Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS) (2.0.10)
Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS) (0.4.1)
Requirement already satisfied: typer<1.0.0,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS) (0.12.5)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS) (2.32.3)
Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS) (2.9.2)
Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS) (75.1.0)
Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.10/dist-packages (from spacy>=3->spacy[ja]>=3->TTS) (3.4.1)
Collecting sudachipy!=0.6.1,>=0.5.2 (from spacy[ja]>=3->TTS)
  Downloading SudachiPy-0.6.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting sudachidict-core>=20211220 (from spacy[ja]>=3->TTS)
  Downloading SudachiDict_core-20241021-py3-none-any.whl.metadata (2.5 kB)
Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=2.1->TTS) (3.16.1)
Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch>=2.1->TTS) (1.13.1)
Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch>=2.1->TTS) (1.3.0)
Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from trainer>=0.0.32->TTS) (5.9.5)
Requirement already satisfied: tensorboard in /usr/local/lib/python3.10/dist-packages (from trainer>=0.0.32->TTS) (2.17.0)
Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.33.0->TTS) (0.24.7)
Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.33.0->TTS) (2024.9.11)
Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.33.0->TTS) (0.4.5)
Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.33.0->TTS) (0.19.1)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.1->TTS)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.0->soundfile>=0.12.0->TTS) (2.22)
INFO: pip is looking at multiple versions of contourpy to determine which version is compatible with other requirements. This could take a while.
Collecting contourpy>=1.0.1 (from matplotlib>=3.7.0->TTS)
  Downloading contourpy-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)
Requirement already satisfied: tzlocal in /usr/local/lib/python3.10/dist-packages (from dateparser~=1.1.0->gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS) (5.2)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from Jinja2>=3.0->flask>=2.0.1->TTS) (3.0.2)
Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from jsonlines~=1.2.0->gruut==2.2.3->gruut[de,es,fr]==2.2.3->TTS) (1.16.0)
Requirement already satisfied: language-data>=1.2 in /usr/local/lib/python3.10/dist-packages (from langcodes<4.0.0,>=3.2.0->spacy>=3->spacy[ja]>=3->TTS) (1.2.0)
Requirement already satisfied: platformdirs>=2.5.0 in /usr/local/lib/python3.10/dist-packages (from pooch>=1.0->librosa>=0.10.0->TTS) (4.3.6)
Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy>=3->spacy[ja]>=3->TTS) (0.7.0)
Requirement already satisfied: pydantic-core==2.23.4 in /usr/local/lib/python3.10/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy>=3->spacy[ja]>=3->TTS) (2.23.4)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3->spacy[ja]>=3->TTS) (3.4.0)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3->spacy[ja]>=3->TTS) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3->spacy[ja]>=3->TTS) (2.2.3)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=3->spacy[ja]>=3->TTS) (2024.8.30)
Requirement already satisfied: blis<0.8.0,>=0.7.8 in /usr/local/lib/python3.10/dist-packages (from thinc<8.3.0,>=8.2.2->spacy>=3->spacy[ja]>=3->TTS) (0.7.11)
Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.10/dist-packages (from thinc<8.3.0,>=8.2.2->spacy>=3->spacy[ja]>=3->TTS) (0.1.5)
Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0.0,>=0.3.0->spacy>=3->spacy[ja]>=3->TTS) (1.5.4)
Requirement already satisfied: rich>=10.11.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0.0,>=0.3.0->spacy>=3->spacy[ja]>=3->TTS) (13.9.3)
Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from weasel<0.5.0,>=0.1.0->spacy>=3->spacy[ja]>=3->TTS) (0.20.0)
Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /usr/local/lib/python3.10/dist-packages (from weasel<0.5.0,>=0.1.0->spacy>=3->spacy[ja]>=3->TTS) (7.0.5)
Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from yarl<2.0,>=1.12.0->aiohttp>=3.8.1->TTS) (0.2.0)
Requirement already satisfied: absl-py>=0.4 in /usr/local/lib/python3.10/dist-packages (from tensorboard->trainer>=0.0.32->TTS) (1.4.0)
Requirement already satisfied: grpcio>=1.48.2 in /usr/local/lib/python3.10/dist-packages (from tensorboard->trainer>=0.0.32->TTS) (1.64.1)
Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.10/dist-packages (from tensorboard->trainer>=0.0.32->TTS) (3.7)
Requirement already satisfied: protobuf!=4.24.0,<5.0.0,>=3.19.6 in /usr/local/lib/python3.10/dist-packages (from tensorboard->trainer>=0.0.32->TTS) (3.20.3)
Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from tensorboard->trainer>=0.0.32->TTS) (0.7.2)
Requirement already satisfied: marisa-trie>=0.7.7 in /usr/local/lib/python3.10/dist-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy>=3->spacy[ja]>=3->TTS) (1.2.1)
Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy>=3->spacy[ja]>=3->TTS) (3.0.0)
Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy>=3->spacy[ja]>=3->TTS) (2.18.0)
Requirement already satisfied: wrapt in /usr/local/lib/python3.10/dist-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy>=3->spacy[ja]>=3->TTS) (1.16.0)
Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy>=3->spacy[ja]>=3->TTS) (0.1.2)
Downloading TTS-0.22.0-cp310-cp310-manylinux1_x86_64.whl (938 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 938.0/938.0 kB 37.0 MB/s eta 0:00:00
Downloading numpy-1.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 16.8/16.8 MB 38.8 MB/s eta 0:00:00
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 289.9/289.9 kB 19.3 MB/s eta 0:00:00
Downloading coqpit-0.0.17-py3-none-any.whl (13 kB)
Downloading g2pkk-0.1.2-py3-none-any.whl (25 kB)
Downloading librosa-0.10.0-py3-none-any.whl (252 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 252.9/252.9 kB 20.3 MB/s eta 0:00:00
Downloading num2words-0.5.13-py3-none-any.whl (143 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 143.3/143.3 kB 13.7 MB/s eta 0:00:00
Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.1/12.1 MB 60.4 MB/s eta 0:00:00
Downloading pysbd-0.3.4-py3-none-any.whl (71 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 71.1/71.1 kB 5.5 MB/s eta 0:00:00
Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.4 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 36.4/36.4 MB 10.7 MB/s eta 0:00:00
Downloading trainer-0.0.36-py3-none-any.whl (51 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 51.2/51.2 kB 3.7 MB/s eta 0:00:00
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 88.8/88.8 kB 8.2 MB/s eta 0:00:00
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 235.5/235.5 kB 13.7 MB/s eta 0:00:00
Downloading bangla-0.0.2-py2.py3-none-any.whl (6.2 kB)
Downloading bnunicodenormalizer-0.1.7-py3-none-any.whl (23 kB)
Downloading hangul_romanize-0.1.0-py3-none-any.whl (4.6 kB)
Downloading jamo-0.4.1-py3-none-any.whl (9.5 kB)
Downloading pypinyin-0.53.0-py2.py3-none-any.whl (834 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 834.7/834.7 kB 38.8 MB/s eta 0:00:00
Downloading contourpy-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (305 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 305.2/305.2 kB 20.9 MB/s eta 0:00:00
Downloading dateparser-1.1.8-py2.py3-none-any.whl (293 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 293.8/293.8 kB 23.5 MB/s eta 0:00:00
Downloading jsonlines-1.2.0-py2.py3-none-any.whl (7.6 kB)
Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 60.5 MB/s eta 0:00:00
Downloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.9/56.9 kB 5.6 MB/s eta 0:00:00
Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 MB 53.5 MB/s eta 0:00:00
Downloading SudachiDict_core-20241021-py3-none-any.whl (72.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 72.1/72.1 MB 9.0 MB/s eta 0:00:00
Downloading SudachiPy-0.6.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.6 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.6/2.6 MB 54.5 MB/s eta 0:00:00
Building wheels for collected packages: gruut, encodec, bnnumerizer, docopt, gruut-ipa, gruut_lang_de, gruut_lang_en, gruut_lang_es, gruut_lang_fr
  Building wheel for gruut (setup.py) ... done
  Created wheel for gruut: filename=gruut-2.2.3-py3-none-any.whl size=75788 sha256=f00a88cd072ab6c49d5841137fca7f9f153f91e2f8141ebc2f15d9e09ffa321b
  Stored in directory: /root/.cache/pip/wheels/fc/57/a8/f9de532daf5214f53644f20f3a9e6f69269453c87df9c0a817
  Building wheel for encodec (setup.py) ... done
  Created wheel for encodec: filename=encodec-0.1.1-py3-none-any.whl size=45760 sha256=bc7d1d6ee28b60c9efd78b1df08011487b7bfb39bacb4686bc84b457042dbd26
  Stored in directory: /root/.cache/pip/wheels/fc/36/cb/81af8b985a5f5e0815312d5e52b41263237af07b977e6bcbf3
  Building wheel for bnnumerizer (setup.py) ... done
  Created wheel for bnnumerizer: filename=bnnumerizer-0.0.2-py3-none-any.whl size=5261 sha256=e018527512ba68674c51e671877e32b0bf057f380b8f58cfac63272b7cc28dc8
  Stored in directory: /root/.cache/pip/wheels/59/6b/e8/223172e7d5c9f72df3ea1a0d9258f3a8ab5b28e827728edef5
  Building wheel for docopt (setup.py) ... done
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=5b14d60903a04bb394f00d9bf153ca7ca3300f62be9ad294ede723cdf1cbabb2
  Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac
  Building wheel for gruut-ipa (setup.py) ... done
  Created wheel for gruut-ipa: filename=gruut_ipa-0.13.0-py3-none-any.whl size=104873 sha256=010e27d72664ac8a6ca0118741f6c091af7a37c8207cf0d79eb97e3d52a633b6
  Stored in directory: /root/.cache/pip/wheels/7b/18/49/e4f500ecdf0babe757953f844e4d7cd1ea81c5503c09bfe984
  Building wheel for gruut_lang_de (setup.py) ... done
  Created wheel for gruut_lang_de: filename=gruut_lang_de-2.0.1-py3-none-any.whl size=18498313 sha256=b26cb008cb440d4945db0ca953fa225b4cd63144f03ec5ce765d11494e613e24
  Stored in directory: /root/.cache/pip/wheels/83/80/5f/775b357ae61d7cb68793327c7470d848715cbc60bb373af8dd
  Building wheel for gruut_lang_en (setup.py) ... done
  Created wheel for gruut_lang_en: filename=gruut_lang_en-2.0.1-py3-none-any.whl size=15326857 sha256=63b99b73d233808b0530b537e9ec59f1e2719a38ddfa0a0d50175c7d18be1641
  Stored in directory: /root/.cache/pip/wheels/64/8d/b7/d484d224facd899ed188e00374f25dd3f19d1a3f53da6517bd
  Building wheel for gruut_lang_es (setup.py) ... done
  Created wheel for gruut_lang_es: filename=gruut_lang_es-2.0.1-py3-none-any.whl size=32173928 sha256=b6d330065784cdd302b4a87a96adc1e08869a397dbd1182ebd9d3b6ba0413d90
  Stored in directory: /root/.cache/pip/wheels/ab/bd/96/5ddde14e8e6932a96f12c5ab5de62b619d39e2507d7daf5188
  Building wheel for gruut_lang_fr (setup.py) ... done
  Created wheel for gruut_lang_fr: filename=gruut_lang_fr-2.0.2-py3-none-any.whl size=10968766 sha256=18c927001ec6e9a3d32612ca119c4e027822a314b84209745be65d397fed1e4f
  Stored in directory: /root/.cache/pip/wheels/db/21/be/d0436e3f1cf9bf38b9bb9b4a476399c77a1ab19f7172b45e19
Successfully built gruut encodec bnnumerizer docopt gruut-ipa gruut_lang_de gruut_lang_en gruut_lang_es gruut_lang_fr
Installing collected packages: sudachipy, jamo, hangul-romanize, gruut_lang_fr, gruut_lang_es, gruut_lang_en, gruut_lang_de, docopt, bnunicodenormalizer, bnnumerizer, bangla, unidecode, sudachidict-core, python-crfsuite, pysbd, pypinyin, numpy, num2words, networkx, jsonlines, gruut-ipa, coqpit, anyascii, scipy, pandas, g2pkk, dateparser, contourpy, trainer, gruut, pynndescent, librosa, encodec, umap-learn, TTS
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
  Attempting uninstall: networkx
    Found existing installation: networkx 3.4.2
    Uninstalling networkx-3.4.2:
      Successfully uninstalled networkx-3.4.2
  Attempting uninstall: scipy
    Found existing installation: scipy 1.13.1
    Uninstalling scipy-1.13.1:
      Successfully uninstalled scipy-1.13.1
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.2
    Uninstalling pandas-2.2.2:
      Successfully uninstalled pandas-2.2.2
  Attempting uninstall: contourpy
    Found existing installation: contourpy 1.3.0
    Uninstalling contourpy-1.3.0:
      Successfully uninstalled contourpy-1.3.0
  Attempting uninstall: librosa
    Found existing installation: librosa 0.10.2.post1
    Uninstalling librosa-0.10.2.post1:
      Successfully uninstalled librosa-0.10.2.post1
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albucore 0.0.19 requires numpy>=1.24.4, but you have numpy 1.22.0 which is incompatible.
albumentations 1.4.20 requires numpy>=1.24.4, but you have numpy 1.22.0 which is incompatible.
arviz 0.20.0 requires numpy>=1.23.0, but you have numpy 1.22.0 which is incompatible.
astropy 6.1.4 requires numpy>=1.23, but you have numpy 1.22.0 which is incompatible.
bigframes 1.25.0 requires numpy>=1.24.0, but you have numpy 1.22.0 which is incompatible.
chex 0.1.87 requires numpy>=1.24.1, but you have numpy 1.22.0 which is incompatible.
cudf-cu12 24.10.1 requires numpy<3.0a0,>=1.23, but you have numpy 1.22.0 which is incompatible.
cudf-cu12 24.10.1 requires pandas<2.2.3dev0,>=2.0, but you have pandas 1.5.3 which is incompatible.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 1.5.3 which is incompatible.
ibis-framework 9.2.0 requires numpy<3,>=1.23.2, but you have numpy 1.22.0 which is incompatible.
jax 0.4.33 requires numpy>=1.24, but you have numpy 1.22.0 which is incompatible.
jaxlib 0.4.33 requires numpy>=1.24, but you have numpy 1.22.0 which is incompatible.
mizani 0.13.0 requires numpy>=1.23.5, but you have numpy 1.22.0 which is incompatible.
mizani 0.13.0 requires pandas>=2.2.0, but you have pandas 1.5.3 which is incompatible.
numexpr 2.10.1 requires numpy>=1.23.0, but you have numpy 1.22.0 which is incompatible.
nx-cugraph-cu12 24.10.0 requires networkx>=3.0, but you have networkx 2.8.8 which is incompatible.
nx-cugraph-cu12 24.10.0 requires numpy<3.0a0,>=1.23, but you have numpy 1.22.0 which is incompatible.
pandas-stubs 2.2.2.240909 requires numpy>=1.23.5, but you have numpy 1.22.0 which is incompatible.
plotnine 0.14.0 requires numpy>=1.23.5, but you have numpy 1.22.0 which is incompatible.
plotnine 0.14.0 requires pandas>=2.2.0, but you have pandas 1.5.3 which is incompatible.
pylibraft-cu12 24.10.0 requires numpy<3.0a0,>=1.23, but you have numpy 1.22.0 which is incompatible.
rmm-cu12 24.10.0 requires numpy<3.0a0,>=1.23, but you have numpy 1.22.0 which is incompatible.
scikit-image 0.24.0 requires numpy>=1.23, but you have numpy 1.22.0 which is incompatible.
statsmodels 0.14.4 requires numpy<3,>=1.22.3, but you have numpy 1.22.0 which is incompatible.
tensorflow 2.17.0 requires numpy<2.0.0,>=1.23.5; python_version <= "3.11", but you have numpy 1.22.0 which is incompatible.
xarray 2024.10.0 requires numpy>=1.24, but you have numpy 1.22.0 which is incompatible.
xarray 2024.10.0 requires pandas>=2.1, but you have pandas 1.5.3 which is incompatible.
xarray-einstats 0.8.0 requires numpy>=1.23, but you have numpy 1.22.0 which is incompatible.
Successfully installed TTS-0.22.0 anyascii-0.3.2 bangla-0.0.2 bnnumerizer-0.0.2 bnunicodenormalizer-0.1.7 contourpy-1.2.1 coqpit-0.0.17 dateparser-1.1.8 docopt-0.6.2 encodec-0.1.1 g2pkk-0.1.2 gruut-2.2.3 gruut-ipa-0.13.0 gruut_lang_de-2.0.1 gruut_lang_en-2.0.1 gruut_lang_es-2.0.1 gruut_lang_fr-2.0.2 hangul-romanize-0.1.0 jamo-0.4.1 jsonlines-1.2.0 librosa-0.10.0 networkx-2.8.8 num2words-0.5.13 numpy-1.22.0 pandas-1.5.3 pynndescent-0.5.13 pypinyin-0.53.0 pysbd-0.3.4 python-crfsuite-0.9.11 scipy-1.11.4 sudachidict-core-20241021 sudachipy-0.6.8 trainer-0.0.36 umap-learn-0.5.7 unidecode-1.3.8

#@title
from IPython.display import Javascript, display
from google.colab import output
import base64
import io

def record():
    """Record microphone audio in the browser and hand it to the Python kernel.

    Displays a JavaScript snippet in the notebook output. The snippet requests
    microphone access, records with ``MediaRecorder`` for 7 seconds after the
    user dismisses the alert, base64-encodes the captured bytes and invokes the
    kernel callback registered under the name ``notebook.recorded_audio`` with
    that string as its only argument.

    The Python call returns as soon as the script has been displayed; the
    recording itself runs asynchronously in the browser, so the callback must
    already be registered before this function is called.
    """
    js = Javascript("""
    async function recordAudio() {
        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
        const mediaRecorder = new MediaRecorder(stream);
        let audioChunks = [];

        mediaRecorder.ondataavailable = event => {
            audioChunks.push(event.data);
        };

        mediaRecorder.start();

        alert('Recording started! Please start speaking in English, around 7 seconds.');  // Alert to inform the user
        await new Promise(resolve => setTimeout(resolve, 7000)); // Record for 7 seconds
        mediaRecorder.stop();

        await new Promise(resolve => mediaRecorder.onstop = resolve);

        // Release the microphone; stopping the recorder alone leaves the stream's tracks live.
        stream.getTracks().forEach(track => track.stop());

        const audioBlob = new Blob(audioChunks);
        const reader = new FileReader();
        reader.readAsDataURL(audioBlob);
        reader.onloadend = () => {
            // Strip the "data:...;base64," prefix so only the payload is sent to Python.
            const base64data = reader.result.split(',')[1];
            google.colab.kernel.invokeFunction('notebook.recorded_audio', [base64data], {});
        };
    }

    recordAudio();
    """)
    display(js)

# Raw bytes of the most recent recording; stays None until save_audio() has run.
audio_data = None


def save_audio(data):
    """Decode a base64-encoded recording, keep it in memory and write it to disk.

    Args:
        data: Base64-encoded string containing the recorded audio bytes.

    Side effects:
        Stores the decoded bytes in the module-level ``audio_data``, writes
        them to ``recorded_English_audio.wav`` in the current working
        directory and prints a confirmation message.
    """
    # Must name the module-level variable exactly; a misspelled name here makes
    # the assignment below create a local and leaves the global as None.
    global audio_data
    audio_data = base64.b64decode(data)
    # NOTE(review): the bytes are written exactly as received; whether they are
    # really WAV-encoded depends on what the browser recorder produced - confirm.
    with open('recorded_English_audio.wav', 'wb') as f:
        f.write(audio_data)
    print("Audio saved as 'recorded_English_audio.wav'")

# Expose save_audio to the browser under the name the recording script invokes.
# This has to happen before record() so the callback exists when the audio arrives.
output.register_callback('notebook.recorded_audio', save_audio)

# Start the in-browser recording; the result is delivered later via the callback above.
record()

Audio saved as 'recorded_English_audio.wav'

import torch
from TTS.api import TTS

# Use the GPU when one is available and fall back to CPU otherwise, so the
# cell also runs on CPU-only runtimes.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The `gpu=True` constructor argument is deprecated (it emits a UserWarning
# asking for `tts.to(device)`), so move the model explicitly instead.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
RuntimeError: module was compiled against NumPy C-API version 0x10 (NumPy 1.23) but the running NumPy has C-API version 0xf. Check the section C-API incompatibility at the Troubleshooting ImportError section at https://numpy.org/devdocs/user/troubleshooting-importerror.html#c-api-incompatibility for indications on how to solve this problem.

/usr/local/lib/python3.10/dist-packages/TTS/api.py:70: UserWarning: `gpu` will be deprecated. Please use `tts.to(device)` instead.
  warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")

 > You must confirm the following:
 | > "I have purchased a commercial license from Coqui: licensing@coqui.ai"
 | > "Otherwise, I agree to the terms of the non-commercial CPML: https://coqui.ai/cpml" - [y/n]
 | | > y
 > Downloading model to /root/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2

100%|█████████▉| 1.87G/1.87G [00:43<00:00, 43.0MiB/s]
100%|██████████| 1.87G/1.87G [00:44<00:00, 42.2MiB/s]
100%|██████████| 4.37k/4.37k [00:00<00:00, 18.3kiB/s]
 75%|███████▌  | 271k/361k [00:00<00:00, 2.12MiB/s]
100%|██████████| 361k/361k [00:00<00:00, 957kiB/s] 
100%|██████████| 32.0/32.0 [00:00<00:00, 109iB/s]
 67%|██████▋   | 5.19M/7.75M [00:00<00:00, 28.9MiB/s]

 > Model's license - CPML
 > Check https://coqui.ai/cpml.txt for more info.
 > Using model: xtts

100%|██████████| 7.75M/7.75M [00:14<00:00, 28.9MiB/s]/usr/local/lib/python3.10/dist-packages/TTS/tts/layers/xtts/xtts_manager.py:5: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  self.speakers = torch.load(speaker_file_path)
/usr/local/lib/python3.10/dist-packages/TTS/utils/io.py:54: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
  return torch.load(f, map_location=map_location, **kwargs)

# Synthesize the English text below into output.wav, cloning the voice from the
# recording passed as speaker_wav; all other synthesis settings stay at their defaults.
# NOTE(review): speaker_wav uses an absolute /content path while the recording was
# saved with a relative path - this assumes the working directory is /content; confirm.
tts.tts_to_file(text="how are you students? I hope you find this course interesting",
                file_path="output.wav",
                speaker_wav="/content/recorded_English_audio.wav",
                language="en")

 > Text splitted to sentences.
['how are you students?', 'I hope you find this course interesting']

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.

 > Processing time: 7.699343204498291
 > Real-time factor: 0.8288281013669121

'output.wav'

from IPython.display import Audio

# Play the synthesized speech; as the last expression in the cell, the Audio
# object is rendered as an inline player.
# NOTE(review): output.wav was written with a relative path, so this absolute
# path assumes the working directory is /content - confirm.
Audio('/content/output.wav')

Introduction to Language and Speech Technology - REma (RU)¶

Google Colab¶

Key Features:¶

Hands-on Exercise¶

In case you are interested: What is Hugging Face🤗?¶

Setup¶

Installing Required Libraries¶

Automatic Speech Recognition (ASR)¶

Whisper ASR¶

Text-to-Speech (TTS)¶

Insights/Discussion/Conclusion¶

Introduction to Language and Speech Technology - REma (RU)¶

Google Colab¶

Key Features:¶

Saving and Sharing Notebooks:¶

Hands-on Exercise¶

In case you are interested: What is Hugging Face🤗?¶

Setup¶

Installing Required Libraries¶

Automatic Speech Recognition (ASR)¶

Whisper ASR¶

Text-to-Speech (TTS)¶

Insights/Discussion/Conclusion¶