%%capture
! pip install evaluate jiwer

import evaluate

# Fetch the word error rate (WER) metric through the `evaluate` library.
wer = evaluate.load("wer")

# One ground-truth transcript and the ASR output to score against it.
# Both are wrapped in lists because the metric takes parallel lists.
reference = ["the cat sat on the mat"]
hypothesis = ["the cat is on mat"]

# Score the hypothesis against the reference and print it with two decimals.
result = wer.compute(predictions=hypothesis, references=reference)
print(f"WER: {result:.2f}")

/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: 
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  warnings.warn(

WER: 0.33

# Each pair is (reference transcript, ASR hypothesis); keeping them side by
# side makes it easy to see which hypothesis belongs to which reference.
sentence_pairs = [
    ("the cat sat on the mat", "the cat is on mat"),
    ("hello world this is a test", "hello this is test"),
    ("huggingface is awesome", "huggingface awesome"),
    ("introduction to language and speech technology", "introdution language and speech"),
]
references = [ref for ref, _ in sentence_pairs]
hypotheses = [hyp for _, hyp in sentence_pairs]

# A single WER value is computed for the whole batch.
result = wer.compute(predictions=hypotheses, references=references)
print(f"Batch WER: {result:.2f}")

Batch WER: 0.38

%%capture
!pip install transformers datasets torchaudio evaluate jiwer accelerate

# Load the speechocean762 dataset by its Hugging Face Hub id.
# Later cells read the "test" split of the returned object.
from datasets import load_dataset
speechocean = load_dataset("mispeech/speechocean762")

!pip install git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-06h0afie
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-06h0afie
  Resolved https://github.com/openai/whisper.git to commit 90db0de1896c23cbfaf0c58bc2d30665f709f170
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Requirement already satisfied: numba in /usr/local/lib/python3.10/dist-packages (from openai-whisper==20240930) (0.60.0)
Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from openai-whisper==20240930) (1.26.4)
Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from openai-whisper==20240930) (2.5.1+cu121)
Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from openai-whisper==20240930) (4.66.6)
Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from openai-whisper==20240930) (10.5.0)
Collecting tiktoken (from openai-whisper==20240930)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting triton>=2.0.0 (from openai-whisper==20240930)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from triton>=2.0.0->openai-whisper==20240930) (3.16.1)
Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba->openai-whisper==20240930) (0.43.0)
Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken->openai-whisper==20240930) (2024.9.11)
Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.10/dist-packages (from tiktoken->openai-whisper==20240930) (2.32.3)
Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch->openai-whisper==20240930) (4.12.2)
Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->openai-whisper==20240930) (3.4.2)
Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->openai-whisper==20240930) (3.1.4)
Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch->openai-whisper==20240930) (2024.10.0)
Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch->openai-whisper==20240930) (1.13.1)
Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch->openai-whisper==20240930) (1.3.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken->openai-whisper==20240930) (3.4.0)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken->openai-whisper==20240930) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken->openai-whisper==20240930) (2.2.3)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.26.0->tiktoken->openai-whisper==20240930) (2024.8.30)
Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->openai-whisper==20240930) (3.0.2)
Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.5/209.5 MB 6.5 MB/s eta 0:00:00
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 MB 57.5 MB/s eta 0:00:00
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... done
  Created wheel for openai-whisper: filename=openai_whisper-20240930-py3-none-any.whl size=803583 sha256=5506187910a71b73d3e5fb97030ee465541878616e8c82a49e3f9af83b182e1d
  Stored in directory: /tmp/pip-ephem-wheel-cache-_1qpp00i/wheels/8b/6c/d0/622666868c179f156cf595c8b6f06f88bc5d80c4b31dccaa03
Successfully built openai-whisper
Installing collected packages: triton, tiktoken, openai-whisper
Successfully installed openai-whisper-20240930 tiktoken-0.8.0 triton-3.1.0

from transformers import WhisperProcessor, WhisperForConditionalGeneration
from whisper.tokenizer import get_tokenizer

# Load the Whisper processor (feature extractor + tokenizer) and model.
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
# NOTE(review): presumably cleared so generation is not pinned to preset
# language/task prompt tokens stored in the config - confirm against the
# installed transformers version.
model.config.forced_decoder_ids = None


def _is_number_token(text):
    """Return True if *text* is non-empty and consists only of ASCII digits.

    Surrounding whitespace is ignored. The explicit emptiness check matters:
    ``all(...)`` over an empty string is True, so without it every token that
    decodes to nothing (or to whitespace only) would count as a number.
    """
    stripped = text.strip()
    return bool(stripped) and all(c in "0123456789" for c in stripped)


# IDs of every ordinary (pre-EOT) token whose decoded text is purely digits.
tokenizer = get_tokenizer(multilingual=True)
number_tokens = [
    i for i in range(tokenizer.eot)
    if _is_number_token(tokenizer.decode([i]))
]

def transcribe_audio(entry):
    """Transcribe one dataset entry with Whisper and store the text on it.

    Args:
        entry: A dataset row. ``entry["audio"]["array"]`` is the waveform and
            ``entry["audio"]["sampling_rate"]`` its sample rate.

    Returns:
        The same ``entry`` with an added ``"whisper_tiny_transcripts"`` key
        holding the decoded transcription (special tokens removed).
    """
    audio = entry["audio"]

    # Turn the raw waveform into the model's input features.
    input_features = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
    ).input_features

    # Pin the language and task. The reference transcripts are English, and
    # without this a multilingual Whisper auto-detects the language per
    # utterance, which can yield output in a different language for short or
    # noisy clips.
    predicted_ids = model.generate(
        input_features,
        language="en",
        task="transcribe",
    )
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    # Attach the transcription to the row so Dataset.map() adds it as a column.
    entry["whisper_tiny_transcripts"] = transcription
    return entry

subset_test = speechocean["test"].select(range(10))  # Keep only the first 10 entries of the test split
# Run transcribe_audio on every selected entry; each returned entry carries the
# new "whisper_tiny_transcripts" field.
subset_test = subset_test.map(transcribe_audio)

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.

subset_test[0]

{'accuracy': 9,
 'completeness': 10.0,
 'fluency': 9,
 'prosodic': 9,
 'text': 'MARK IS GOING TO SEE ELEPHANT',
 'total': 9,
 'words': [{'accuracy': 10,
   'phones': ['M', 'AA0', 'R', 'K'],
   'phones-accuracy': [2.0, 2.0, 1.8, 2.0],
   'stress': 10,
   'text': 'MARK',
   'total': 10,
   'mispronunciations': []},
  {'accuracy': 10,
   'phones': ['IH0', 'Z'],
   'phones-accuracy': [2.0, 1.8],
   'stress': 10,
   'text': 'IS',
   'total': 10,
   'mispronunciations': []},
  {'accuracy': 10,
   'phones': ['G', 'OW0', 'IH0', 'NG'],
   'phones-accuracy': [2.0, 2.0, 2.0, 2.0],
   'stress': 10,
   'text': 'GOING',
   'total': 10,
   'mispronunciations': []},
  {'accuracy': 10,
   'phones': ['T', 'UW0'],
   'phones-accuracy': [2.0, 2.0],
   'stress': 10,
   'text': 'TO',
   'total': 10,
   'mispronunciations': []},
  {'accuracy': 10,
   'phones': ['S', 'IY0'],
   'phones-accuracy': [2.0, 2.0],
   'stress': 10,
   'text': 'SEE',
   'total': 10,
   'mispronunciations': []},
  {'accuracy': 10,
   'phones': ['EH1', 'L', 'IH0', 'F', 'AH0', 'N', 'T'],
   'phones-accuracy': [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0],
   'stress': 10,
   'text': 'ELEPHANT',
   'total': 10,
   'mispronunciations': []}],
 'speaker': '0003',
 'gender': 'm',
 'age': 6,
 'audio': {'path': None,
  'array': array([-0.00119019, -0.00500488, -0.00283813, ...,  0.00274658,
          0.        ,  0.00125122]),
  'sampling_rate': 16000},
 'whisper_tiny_transcripts': ' Mark is going to see elephant.'}

import re
# Punctuation removed from transcripts before scoring: , ? . ! - ; : "
# Written as a raw string so no backslash is interpreted by the Python
# string parser (the non-raw form relies on invalid escape sequences, which
# newer Python versions warn about). Only the hyphen needs escaping inside
# a character class.
chars_to_ignore_regex = r'[,?.!\-;:"]'


def remove_special_characters(batch):
    """Strip punctuation from the Whisper transcript of one dataset entry.

    Args:
        batch: A mapping with a ``"whisper_tiny_transcripts"`` string.

    Returns:
        The same ``batch`` with every character matched by
        ``chars_to_ignore_regex`` deleted from that string. Whitespace and
        apostrophes are left untouched.
    """
    batch["whisper_tiny_transcripts"] = re.sub(
        chars_to_ignore_regex, "", batch["whisper_tiny_transcripts"]
    )
    return batch

subset_test = subset_test.map(remove_special_characters)

# Lower-case the Whisper transcripts. The column is read directly rather than
# iterating over full rows, so the audio of each entry does not have to be
# loaded just to get at its text.
hypothesis_transcripts = [text.lower() for text in subset_test["whisper_tiny_transcripts"]]

hypothesis_transcripts

[' mark is going to see elephant.',
 ' ketan loves china.',
 ' 2648.',
 ' 7, 3, 4, 2.',
 ' 2-8-9-1',
 '往揭着往揭着',
 ' 2, 5, 9, 9.',
 ' tomato, spring, photo.',
 ' lera is good at streaming.',
 " and i'll give that book scene."]

# Lower-case both sides so that casing differences are not counted as errors.
# Columns are read directly rather than iterating over full rows, so the audio
# of each entry does not have to be loaded just to get at its text.
reference_transcripts = [text.lower() for text in subset_test["text"]]
hypothesis_transcripts = [text.lower() for text in subset_test["whisper_tiny_transcripts"]]

reference_transcripts

['mark is going to see elephant',
 'kate loves china',
 'two six four eight',
 'seven three four two',
 'two eight nine one',
 'one zero one zero',
 'two five nine nine',
 'tomato spring photo',
 'layla is good at swimming',
 'alice give up boxing']

hypothesis_transcripts

[' mark is going to see elephant',
 ' ketan loves china',
 ' two six four eight',
 ' seven three four two',
 ' two eight nine one',
 '往揭着往揭着',
 ' two five nine nine',
 ' tomato spring photo',
 ' lera is good at streaming',
 " and i'll give that book scene"]

# Score the lower-cased Whisper hypotheses against the lower-cased reference
# transcripts and print the WER with two decimals.
result = wer.compute(references=reference_transcripts, predictions=hypothesis_transcripts)
print(f"WER: {result:.2f}")

WER: 0.29

!git clone https://github.com/usnistgov/SCTK.git

Cloning into 'SCTK'...
remote: Enumerating objects: 5487, done.
remote: Counting objects: 100% (344/344), done.
remote: Compressing objects: 100% (223/223), done.
remote: Total 5487 (delta 143), reused 286 (delta 117), pack-reused 5143 (from 1)
Receiving objects: 100% (5487/5487), 7.61 MiB | 16.76 MiB/s, done.
Resolving deltas: 100% (3827/3827), done.

%%capture
!cd SCTK && make config
!cd SCTK && make all
!cd SCTK && make check
!cd SCTK && make install
!cd SCTK && make doc

# Build the lines fed to sclite: one transcript per line followed by an ID in
# parentheses. The IDs are dummies of the form SPEAKER_<n>, numbered from 1;
# the same numbering is used for references and hypotheses so the two files
# line up entry by entry.
reference_transcripts_sclite = [
    f"{sentence} (SPEAKER_{i})"
    for i, sentence in enumerate(reference_transcripts, 1)
]

# Write with an explicit encoding: the transcripts may contain non-ASCII
# characters, and the platform default encoding is not guaranteed to be UTF-8.
with open("reference.txt", "w", encoding="utf-8") as file:
    file.writelines(f"{sentence}\n" for sentence in reference_transcripts_sclite)

hypothesis_transcripts_sclite = [
    f"{sentence} (SPEAKER_{i})"
    for i, sentence in enumerate(hypothesis_transcripts, 1)
]

with open("hypothesis.txt", "w", encoding="utf-8") as file:
    file.writelines(f"{sentence}\n" for sentence in hypothesis_transcripts_sclite)

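# Make sure the paths passed to -r and -h below point to the reference.txt and hypothesis.txt files written above (in Colab the working directory is /content).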

! /content/SCTK/bin/sclite -s -i rm -r /content/reference.txt -h /content/hypothesis.txt -o all

sclite: 2.10 TK Version 1.3
Begin alignment of Ref File: '/content/reference.txt' and Hyp File: '/content/hypothesis.txt'

    Alignment# 1 for speaker SPEAKER          
    Alignment# 2 for speaker SPEAKER          
    Alignment# 3 for speaker SPEAKER          
    Alignment# 4 for speaker SPEAKER          
    Alignment# 5 for speaker SPEAKER          
    Alignment# 6 for speaker SPEAKER          
    Alignment# 7 for speaker SPEAKER          
    Alignment# 8 for speaker SPEAKER          
    Alignment# 9 for speaker SPEAKER          
    Alignment# 10 for speaker SPEAKER          

    Writing scoring report to '/content/hypothesis.txt.sys'
    Writing raw scoring report to '/content/hypothesis.txt.raw'
    Writing string alignments to '/content/hypothesis.txt.pra'

Successful Completion

# Your code here

# Your code here

Introduction to Language and Speech Technology - ReMA (RU)¶

Formula¶

Levenshtein Distance¶

Example of WER Calculation¶

Reference Transcript:¶

Hypothesis (ASR Output):¶

Step 1: Align the sequences¶

Step 2: Identify the Errors¶

Step 3: Apply the Formula¶

Task 1¶

SCLITE¶

Answer Key¶

Task 2¶

Task 3¶