%%capture
!pip install transformers datasets torchaudio evaluate jiwer accelerate
!apt install git-lfs ##to upload your model on huggingface

from huggingface_hub import notebook_login
notebook_login()

repo_name = "wav2vec2-bert-speechocean-762-tutorial"

from datasets import load_dataset
speechocean = load_dataset("mispeech/speechocean762")
print(speechocean)

DatasetDict({
    train: Dataset({
        features: ['accuracy', 'completeness', 'fluency', 'prosodic', 'text', 'total', 'words', 'speaker', 'gender', 'age', 'audio'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['accuracy', 'completeness', 'fluency', 'prosodic', 'text', 'total', 'words', 'speaker', 'gender', 'age', 'audio'],
        num_rows: 2500
    })
})

speechocean

DatasetDict({
    train: Dataset({
        features: ['accuracy', 'completeness', 'fluency', 'prosodic', 'text', 'total', 'words', 'speaker', 'gender', 'age', 'audio'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['accuracy', 'completeness', 'fluency', 'prosodic', 'text', 'total', 'words', 'speaker', 'gender', 'age', 'audio'],
        num_rows: 2500
    })
})

speechocean['train']

Dataset({
    features: ['accuracy', 'completeness', 'fluency', 'prosodic', 'text', 'total', 'words', 'speaker', 'gender', 'age', 'audio'],
    num_rows: 2500
})

speechocean["train"][15]

{'accuracy': 8,
 'completeness': 10.0,
 'fluency': 9,
 'prosodic': 9,
 'text': 'DORA IS NOT A CLEANER',
 'total': 8,
 'words': [{'accuracy': 10,
   'phones': ['D', 'AO1', 'R', 'AH0'],
   'phones-accuracy': [2.0, 1.8, 2.0, 2.0],
   'stress': 10,
   'text': 'DORA',
   'total': 10,
   'mispronunciations': []},
  {'accuracy': 10,
   'phones': ['IH0', 'Z'],
   'phones-accuracy': [2.0, 2.0],
   'stress': 10,
   'text': 'IS',
   'total': 10,
   'mispronunciations': []},
  {'accuracy': 10,
   'phones': ['N', 'AA0', 'T'],
   'phones-accuracy': [2.0, 1.6, 2.0],
   'stress': 10,
   'text': 'NOT',
   'total': 10,
   'mispronunciations': []},
  {'accuracy': 10,
   'phones': ['AH0'],
   'phones-accuracy': [2.0],
   'stress': 10,
   'text': 'A',
   'total': 10,
   'mispronunciations': []},
  {'accuracy': 10,
   'phones': ['K', 'L', 'IY1', 'N', 'ER0'],
   'phones-accuracy': [2.0, 2.0, 2.0, 2.0, 2.0],
   'stress': 10,
   'text': 'CLEANER',
   'total': 10,
   'mispronunciations': []}],
 'speaker': '0001',
 'gender': 'm',
 'age': 6,
 'audio': {'path': '000010140.wav',
  'array': array([ 0.00021362, -0.0005188 , -0.00186157, ...,  0.00164795,
          0.00048828, -0.00079346]),
  'sampling_rate': 16000}}

def add_phonetic_transcription(entry):
    """Attach a space-separated phone string to a dataset entry.

    The phones of every item in ``entry['words']`` are joined with single
    spaces, and the per-word strings are joined with single spaces again.
    The result is stored under ``entry['phonetic_transcription']``.

    Args:
        entry: Mapping with a ``'words'`` list whose items each carry a
            ``'phones'`` list of strings.

    Returns:
        The same ``entry`` object, modified in place.
    """
    per_word_phones = []
    for word in entry['words']:
        # One string per word, e.g. ['W', 'IY0'] -> 'W IY0'
        per_word_phones.append(" ".join(word['phones']))

    entry['phonetic_transcription'] = " ".join(per_word_phones)
    return entry

speechocean['train'] = speechocean['train'].map(add_phonetic_transcription)

speechocean['test'] = speechocean['test'].map(add_phonetic_transcription)

speechocean['train'][0]

{'accuracy': 8,
 'completeness': 10.0,
 'fluency': 9,
 'prosodic': 9,
 'text': 'WE CALL IT BEAR',
 'total': 8,
 'words': [{'accuracy': 10,
   'phones': ['W', 'IY0'],
   'phones-accuracy': [2.0, 2.0],
   'stress': 10,
   'text': 'WE',
   'total': 10,
   'mispronunciations': []},
  {'accuracy': 10,
   'phones': ['K', 'AO0', 'L'],
   'phones-accuracy': [2.0, 1.8, 1.8],
   'stress': 10,
   'text': 'CALL',
   'total': 10,
   'mispronunciations': []},
  {'accuracy': 10,
   'phones': ['IH0', 'T'],
   'phones-accuracy': [2.0, 2.0],
   'stress': 10,
   'text': 'IT',
   'total': 10,
   'mispronunciations': []},
  {'accuracy': 6,
   'phones': ['B', 'EH0', 'R'],
   'phones-accuracy': [2.0, 1.0, 1.0],
   'stress': 10,
   'text': 'BEAR',
   'total': 6,
   'mispronunciations': []}],
 'speaker': '0001',
 'gender': 'm',
 'age': 6,
 'audio': {'path': '000010011.wav',
  'array': array([-9.46044922e-04, -2.38037109e-03, -1.31225586e-03, ...,
         -9.15527344e-05,  3.05175781e-04, -2.44140625e-04]),
  'sampling_rate': 16000},
 'phonetic_transcription': 'W IY0 K AO0 L IH0 T B EH0 R'}

speechocean = speechocean.remove_columns(['accuracy', 'completeness', 'fluency', 'prosodic','total', 'words', 'speaker', 'gender', 'age','phonetic_transcription'])

speechocean

DatasetDict({
    train: Dataset({
        features: ['text', 'audio'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['text', 'audio'],
        num_rows: 2500
    })
})

# Your code here for Task 2
# Write code to remove any special characters from the train/test datasets
import re

# Character class of the punctuation marks stripped from the transcripts.
# Written as a raw string: the previous non-raw literal relied on invalid
# escape sequences such as "\," which emit a SyntaxWarning on Python 3.12+.
# The set of matched characters is unchanged: , ? . ! - ; : "
chars_to_ignore_regex = r'[,?.!\-;:"]'


def remove_special_characters(batch):
    """Strip the ignored punctuation marks from ``batch["text"]``.

    Args:
        batch: Mapping with a string under the ``"text"`` key (a single
            example, since the function is mapped without ``batched=True``).

    Returns:
        The same ``batch`` object with ``"text"`` replaced by the cleaned
        string. Apostrophes and spaces are kept.
    """
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"])
    return batch

speechocean = speechocean.map(remove_special_characters)

# Your code here for Task 3
# Your output should look like the dictionary shown below: every unique character is mapped to a number.
def extract_all_chars(batch):
    """Collect the distinct characters that occur in a batch of transcripts.

    Args:
        batch: Mapping whose ``"text"`` value is a list of strings.

    Returns:
        A dict with two single-element lists: ``"vocab"`` holds the list of
        unique characters (in arbitrary set order) and ``"all_text"`` holds
        all transcripts joined by single spaces.
    """
    joined_text = " ".join(batch["text"])
    unique_chars = set(joined_text)
    return {"vocab": [list(unique_chars)], "all_text": [joined_text]}

# Run extract_all_chars over each split. batch_size=-1 is meant to hand the
# whole split to the function as one batch (see the datasets `map` docs), and
# the original columns are dropped so only "vocab" / "all_text" remain.
vocabs = speechocean.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=speechocean.column_names["train"],
)

# Union of the characters seen in the train and test splits.
train_chars = set(vocabs["train"]["vocab"][0])
test_chars = set(vocabs["test"]["vocab"][0])
vocab_list = list(train_chars | test_chars)

# Map every character to its position in vocab_list.
vocab_dict = dict(zip(vocab_list, range(len(vocab_list))))
vocab_dict

{'T': 0,
 'K': 1,
 'X': 2,
 'M': 3,
 'L': 4,
 'E': 5,
 'H': 6,
 'B': 7,
 'W': 8,
 'C': 9,
 'J': 10,
 'Y': 11,
 'V': 12,
 'P': 13,
 'D': 14,
 'G': 15,
 'F': 16,
 'R': 17,
 'N': 18,
 "'": 19,
 'I': 20,
 'S': 21,
 'O': 22,
 ' ': 23,
 'Z': 24,
 'U': 25,
 'A': 26,
 'Q': 27}

# If you were not able to complete all three tasks, please finish them later. For now, you can download the vocab.json file directly.

import json
with open('/content/vocab.json', 'r', encoding='utf-8') as file:
    vocab = json.load(file)

vocab

{'H': 0,
 'Q': 1,
 'F': 2,
 'Z': 3,
 'X': 4,
 'K': 5,
 'Y': 6,
 'R': 7,
 'N': 8,
 'M': 9,
 'A': 10,
 'C': 11,
 'O': 12,
 'J': 13,
 'T': 14,
 "'": 15,
 ' ': 16,
 'P': 17,
 'W': 18,
 'L': 19,
 'S': 20,
 'V': 21,
 'U': 22,
 'I': 23,
 'B': 24,
 'E': 25,
 'D': 26,
 'G': 27}

# Append the special tokens; each one receives the next free id.
for special_token in ("[UNK]", "[PAD]"):
    vocab[special_token] = len(vocab)
# For convenience, replace the " " token with "|" (keeping its id)
# so that the word delimiter is more visible.
vocab["|"] = vocab.pop(" ")

vocab

{'H': 0,
 'Q': 1,
 'F': 2,
 'Z': 3,
 'X': 4,
 'K': 5,
 'Y': 6,
 'R': 7,
 'N': 8,
 'M': 9,
 'A': 10,
 'C': 11,
 'O': 12,
 'J': 13,
 'T': 14,
 "'": 15,
 'P': 17,
 'W': 18,
 'L': 19,
 'S': 20,
 'V': 21,
 'U': 22,
 'I': 23,
 'B': 24,
 'E': 25,
 'D': 26,
 'G': 27,
 '[UNK]': 28,
 '[PAD]': 29,
 '|': 16}

len(vocab)

30

# Save this vocab.json
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab, vocab_file)

from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.

# Push tokenizer to repository
tokenizer.push_to_hub(repo_name)

CommitInfo(commit_url='https://huggingface.co/Aditya3107/wav2vec2-bert-speechocean-762-tutorial/commit/11612c1aa4fa0b0093e45a73791a35df34b504f9', commit_message='Upload tokenizer', commit_description='', oid='11612c1aa4fa0b0093e45a73791a35df34b504f9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Aditya3107/wav2vec2-bert-speechocean-762-tutorial', endpoint='https://huggingface.co', repo_type='model', repo_id='Aditya3107/wav2vec2-bert-speechocean-762-tutorial'), pr_revision=None, pr_num=None)

from transformers import SeamlessM4TFeatureExtractor

feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")

from transformers import Wav2Vec2BertProcessor

processor = Wav2Vec2BertProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.push_to_hub(repo_name)

CommitInfo(commit_url='https://huggingface.co/Aditya3107/wav2vec2-bert-speechocean-762-tutorial/commit/098be93ab22606d8fba9b33ca3b052a6f175d4d7', commit_message='Upload processor', commit_description='', oid='098be93ab22606d8fba9b33ca3b052a6f175d4d7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Aditya3107/wav2vec2-bert-speechocean-762-tutorial', endpoint='https://huggingface.co', repo_type='model', repo_id='Aditya3107/wav2vec2-bert-speechocean-762-tutorial'), pr_revision=None, pr_num=None)

# The speechocean audio already has a 16 kHz sampling rate (see the output below), so we do not need to do anything (e.g. resample) here.
speechocean['train'][20]['audio']

{'path': '000050003.wav',
 'array': array([-0.02224731, -0.02105713, -0.0227356 , ...,  0.0010376 ,
        -0.00030518,  0.00030518]),
 'sampling_rate': 16000}

import numpy as np
print("Target text:", speechocean["train"][50]["text"])
print("Input array shape:", np.asarray(speechocean["train"][50]["audio"]["array"]).shape)
print("Sampling rate:", speechocean["train"][50]["audio"]["sampling_rate"])

Target text: TOM LIKES THE OLD SWEATER
Input array shape: (50880,)
Sampling rate: 16000

def prepare_dataset(batch):
    """Turn one raw example into model inputs and CTC labels.

    Uses the module-level ``processor`` twice: once on the audio array to
    obtain ``input_features`` and once on the transcript to obtain the label
    ids.

    Args:
        batch: A single example with an ``"audio"`` dict (``"array"`` and
            ``"sampling_rate"``) and a ``"text"`` string.

    Returns:
        The same ``batch`` object with ``"input_features"``,
        ``"input_length"`` and ``"labels"`` added.
    """
    audio = batch["audio"]
    processed_audio = processor(audio["array"], sampling_rate=audio["sampling_rate"])
    # Only one example is passed in, so keep the first (only) feature entry.
    features = processed_audio.input_features[0]

    batch["input_features"] = features
    batch["input_length"] = len(features)
    batch["labels"] = processor(text=batch["text"]).input_ids
    return batch

speechocean = speechocean.map(prepare_dataset)

import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """Collate examples into a padded batch for CTC training.

    Inputs and labels are padded in two separate ``processor.pad`` calls, and
    padded label positions are replaced with -100.
    """

    # Processor whose ``pad`` method is used for both the inputs and the labels.
    processor: Wav2Vec2BertProcessor
    # Padding argument forwarded unchanged to ``processor.pad``.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples and return the batch as PyTorch tensors.

        Args:
            features: Examples that each provide ``"input_features"`` and
                ``"labels"``.

        Returns:
            The padded input batch with an added ``"labels"`` entry.
        """
        # Inputs and labels have different lengths and need different padding
        # methods, so they are separated before padding.
        audio_inputs = [{"input_features": example["input_features"]} for example in features]
        label_inputs = [{"input_ids": example["labels"]} for example in features]

        batch = self.processor.pad(audio_inputs, padding=self.padding, return_tensors="pt")
        padded_labels = self.processor.pad(labels=label_inputs, padding=self.padding, return_tensors="pt")

        # Replace padding with -100 so those positions are ignored by the loss.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

from evaluate import load
wer_metric = load("wer")

def compute_metrics(pred):
    """Compute the word error rate for a set of predictions.

    Args:
        pred: Object exposing ``predictions`` (logits) and ``label_ids``.
            ``label_ids`` is modified in place.

    Returns:
        A dict with the single key ``"wer"``.
    """
    # Most likely token id along the last (vocabulary) axis.
    predicted_ids = np.argmax(pred.predictions, axis=-1)

    # -100 marks ignored label positions; put the pad token id back (in place)
    # so that the labels can be decoded.
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    predictions = processor.batch_decode(predicted_ids)
    # We do not want to group tokens when decoding the reference labels.
    references = processor.batch_decode(label_ids, group_tokens=False)

    return {"wer": wer_metric.compute(predictions=predictions, references=references)}

from transformers import Wav2Vec2BertForCTC

# Keyword overrides passed to from_pretrained alongside the checkpoint name.
model_overrides = {
    # All dropout / masking / layer-drop probabilities are set to zero.
    "attention_dropout": 0.0,
    "hidden_dropout": 0.0,
    "feat_proj_dropout": 0.0,
    "mask_time_prob": 0.0,
    "layerdrop": 0.0,
    "ctc_loss_reduction": "mean",
    "add_adapter": True,
    # Pad id and vocabulary size are taken from the tokenizer built above.
    "pad_token_id": processor.tokenizer.pad_token_id,
    "vocab_size": len(processor.tokenizer),
}

model = Wav2Vec2BertForCTC.from_pretrained("facebook/w2v-bert-2.0", **model_overrides)

Some weights of Wav2Vec2BertForCTC were not initialized from the model checkpoint at facebook/w2v-bert-2.0 and are newly initialized: ['adapter.layers.0.ffn.intermediate_dense.bias', 'adapter.layers.0.ffn.intermediate_dense.weight', 'adapter.layers.0.ffn.output_dense.bias', 'adapter.layers.0.ffn.output_dense.weight', 'adapter.layers.0.ffn_layer_norm.bias', 'adapter.layers.0.ffn_layer_norm.weight', 'adapter.layers.0.residual_conv.bias', 'adapter.layers.0.residual_conv.weight', 'adapter.layers.0.residual_layer_norm.bias', 'adapter.layers.0.residual_layer_norm.weight', 'adapter.layers.0.self_attn.linear_k.bias', 'adapter.layers.0.self_attn.linear_k.weight', 'adapter.layers.0.self_attn.linear_out.bias', 'adapter.layers.0.self_attn.linear_out.weight', 'adapter.layers.0.self_attn.linear_q.bias', 'adapter.layers.0.self_attn.linear_q.weight', 'adapter.layers.0.self_attn.linear_v.bias', 'adapter.layers.0.self_attn.linear_v.weight', 'adapter.layers.0.self_attn_conv.bias', 'adapter.layers.0.self_attn_conv.weight', 'adapter.layers.0.self_attn_layer_norm.bias', 'adapter.layers.0.self_attn_layer_norm.weight', 'lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

from transformers import TrainingArguments

# Hyper-parameters for the fine-tuning run. The output directory is named
# after the Hub repository, and push_to_hub=True is set.
training_args = TrainingArguments(
  output_dir=repo_name,
  group_by_length=True,
  per_device_train_batch_size=2,
  gradient_accumulation_steps=2,  # 2 x 2 = 4 examples per optimizer step on each device
  eval_strategy="steps",  # replaces the deprecated `evaluation_strategy` argument
  num_train_epochs=10,
  gradient_checkpointing=True,
  fp16=True,
  save_steps=600,
  eval_steps=300,
  logging_steps=300,
  learning_rate=5e-5,
  warmup_steps=500,
  save_total_limit=2,
  push_to_hub=True,
  #report_to="wandb" # Uncomment if using a Weights & Biases account
)

/usr/local/lib/python3.10/dist-packages/transformers/training_args.py:1568: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead
  warnings.warn(

from transformers import Trainer

# Wire the model, data collator, metrics and datasets together.
# `processing_class` replaces the deprecated `tokenizer` argument of
# Trainer.__init__; the feature extractor is passed, exactly as before.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=speechocean['train'],
    eval_dataset=speechocean['test'],
    processing_class=processor.feature_extractor,
)

<ipython-input-25-64f8541731cf>:3: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.
  trainer = Trainer(

trainer.train()

wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: aditya3107. Use `wandb login --relogin` to force relogin

trainer.push_to_hub()

Introduction to Language and Speech Technology - ReMA (RU)¶

1. Load Dataset¶

Add phonetic transcription¶

2. Prepare Data¶

¶

¶

¶

¶

3. Create Tokenizer¶

4. Feature Extractor¶

5. Prepare for training¶

Data Collator¶

Load pretrained model¶

Introduction to Weights & Biases.¶