import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

<torch._C.Generator at 0x7c272c600170>

!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-12-04 16:19:37--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


input.txt             0%[                    ]       0  --.-KB/s               
input.txt           100%[===================>]   1.06M  --.-KB/s    in 0.05s   

2024-12-04 16:19:37 (22.9 MB/s) - ‘input.txt’ saved [1115394/1115394]

# Read the whole corpus into memory as a single string
with open('input.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

# The vocabulary is the sorted set of distinct characters in the corpus.
# sorted() accepts any iterable, so the set is passed directly.
chars = sorted(set(raw_text))
vocab_size = len(chars)
print("Unique Characters")
print(''.join(chars))
print("Vocab Size: ", vocab_size)

Unique Characters

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab Size:  65

def build_vocab(chars):
    """Build the character <-> integer id lookup tables.

    Args:
        chars: Ordered collection of characters; each character's position
            becomes its integer id.

    Returns:
        A ``(string_to_int, int_to_string)`` tuple of dicts.
    """
    string_to_int = {}
    for index, symbol in enumerate(chars):
        string_to_int[symbol] = index
    int_to_string = dict(enumerate(chars))
    return string_to_int, int_to_string


# Module-level lookup tables built from `chars`; passed explicitly to encode() / decode() below
string_to_int, int_to_string = build_vocab(chars)

# Make an encoder function: Converts a string to a list of integer indices
def encode(text, string_to_int):
    """Return the list of integer ids for the characters of ``text``.

    Each character is looked up in ``string_to_int``; a character missing
    from the mapping propagates the mapping's lookup error.
    """
    return list(map(string_to_int.__getitem__, text))

# Decoder function: Converts a list of integer indices back to a string
def decode(indices, int_to_string):
    """Return the string spelled by ``indices``.

    Each id is looked up in ``int_to_string`` and the resulting characters
    are concatenated in order.
    """
    symbols = [int_to_string[index] for index in indices]
    return ''.join(symbols)

# Encode the full corpus as a 1-D tensor of integer ids (int64)
encoded_data = torch.tensor(encode(raw_text,string_to_int), dtype=torch.long)

# Contiguous split with no shuffling: the tail of the text is held out for validation
n = int(0.9*len(encoded_data)) # first 90% will be train, rest val
train_data = encoded_data[:n]
val_data = encoded_data[n:]

# Re-seed so the random batches sampled below are reproducible
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context window

# One illustrative (input, target) pair: y is x shifted one position to the right,
# so y[t] is the character that follows x[t]
x = train_data[:block_size]
y = train_data[1:block_size+1]

def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Reads the module-level ``train_data``, ``val_data``, ``block_size`` and
    ``batch_size``.

    Args:
        split: ``'train'`` selects the training data; any other value selects
            the validation data.

    Returns:
        ``(x, y)``, two stacked tensors of ``batch_size`` windows of length
        ``block_size``; ``y`` is ``x`` shifted one position to the right.
    """
    source = val_data if split != 'train' else train_data
    # randint's upper bound is exclusive, so start + block_size + 1 never runs past the data
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and print it: xb holds the contexts and yb the
# next-character targets (the same windows shifted by one position)
xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram model: next-token logits are looked up from the current token alone."""

    def __init__(self, vocab_size):
        super().__init__()
        # Row i of this (vocab_size, vocab_size) table holds the logits for
        # the token that follows token i.
        self.token_embedding_table = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=vocab_size,
        )

    def forward(self, idx, targets=None):
        """Return the next-token logits for every position of ``idx``.

        Args:
            idx: Integer tensor of token ids.
            targets: Accepted for call compatibility; this version of the
                model does not use it.

        Returns:
            Logits with the shape of ``idx`` plus a trailing vocab dimension.
        """
        return self.token_embedding_table(idx)

# Smoke test: push the sampled batch through a freshly initialised model.
# yb is passed along, but this version of forward() ignores its targets argument.
m = BigramLanguageModel(vocab_size)
logits = m(xb, yb)
print(logits.shape)

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram model that also scores its predictions against targets."""

    def __init__(self, vocab_size):
        super().__init__()
        # Row i of this (vocab_size, vocab_size) table holds the logits for
        # the token that follows token i.
        self.token_embedding_table = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=vocab_size,
        )

    def forward(self, idx, targets):
        """Compute next-token logits and their cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: (B, T) integer tensor of the expected next tokens.

        Returns:
            ``(logits, loss)`` where ``logits`` has been flattened to
            (B*T, C) and ``loss`` is the cross-entropy over all B*T positions.
        """
        scores = self.token_embedding_table(idx)  # (B, T, C)
        batch, steps, channels = scores.shape
        # F.cross_entropy takes (N, C) scores with (N,) class ids, so batch
        # and time are folded into a single leading dimension.
        flat_scores = scores.view(batch * steps, channels)
        flat_targets = targets.view(batch * steps)
        return flat_scores, F.cross_entropy(flat_scores, flat_targets)

# Smoke test of the loss-returning model on the current batch.
# logits come back flattened to (B*T, vocab_size); loss is a scalar tensor.
m = BigramLanguageModel(vocab_size)
logits,loss = m(xb, yb)
print(logits.shape)
print('loss:',loss)

torch.Size([256, 65])
loss: tensor(4.6425, grad_fn=<NllLossBackward0>)

class BigramLanguageModel(nn.Module):
    """Bigram model whose forward pass can optionally compute a loss."""

    def __init__(self, vocab_size):
        super().__init__()
        # Row i of this (vocab_size, vocab_size) table holds the logits for
        # the token that follows token i.
        self.token_embedding_table = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=vocab_size,
        )

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: Optional (B, T) integer tensor of expected next tokens.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` is (B, T, C) and
            ``loss`` is ``None``. With targets, ``logits`` is flattened to
            (B*T, C) and ``loss`` is the cross-entropy over all positions.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        # Inference path: nothing to score against
        if targets is None:
            return logits, None

        batch, steps, channels = logits.shape
        flat_logits = logits.view(batch * steps, channels)
        flat_targets = targets.view(batch * steps)
        loss = F.cross_entropy(flat_logits, flat_targets)
        return flat_logits, loss

@torch.no_grad()
def generate(model, idx, max_new_tokens):
    """
    Autoregressively sample new tokens from a language model.

    Sampling never needs gradients, so the whole function runs under
    ``torch.no_grad()`` to avoid building an autograd graph on every step.

    Args:
        model: Callable such that ``model(idx)`` returns ``(logits, loss)``
            with ``logits`` of shape (B, T, C).
        idx: (B, T) tensor of indices in the current context.
        max_new_tokens: Number of tokens to generate; a value <= 0 returns
            ``idx`` unchanged.

    Returns:
        (B, T + max_new_tokens) tensor: the original context followed by the
        sampled tokens.
    """
    for _ in range(max_new_tokens):
        # Get the predictions for every position of the running sequence
        logits, _loss = model(idx)
        # Only the last time step predicts the token to be appended
        logits = logits[:, -1, :]  # Becomes (B, C)
        # Turn logits into a probability distribution over the vocabulary
        probs = F.softmax(logits, dim=-1)  # (B, C)
        # Sample (rather than argmax) one token id per sequence
        idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
        # Append the sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
    return idx

# Fresh, untrained model: the text sampled below comes from randomly initialised weights
m = BigramLanguageModel(vocab_size)

# Example inputs
idx = torch.zeros((1, 1), dtype=torch.long)  # A single sequence holding one token, id 0, as the start context

max_new_tokens = 100  # Number of tokens to generate
# Generate sequence
generated_sequence = generate(m, idx, max_new_tokens)

# Take the only sequence in the batch and map its ids back to characters
decoded_output = decode(generated_sequence[0].tolist(), int_to_string)
print(decoded_output)

# create a PyTorch optimizer over the parameters of the model `m` built above
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

# get_batch() reads the module-level batch_size at call time, so rebinding it
# here changes the size of every batch sampled from now on
batch_size = 32 #we increased the batch size to 32
for steps in range(500): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass: logits and cross-entropy loss for this batch
    logits, loss = m(xb, yb) #evaluate the loss
    optimizer.zero_grad(set_to_none=True) #Zeroing out all the gradients from previous step
    loss.backward() #getting gradients for all the parameters
    optimizer.step() #using gradients to update the parameters

# loss of the final training batch only (a single noisy sample, not an average)
print(loss.item())

!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-12-04 16:19:57--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


input.txt.1           0%[                    ]       0  --.-KB/s               
input.txt.1         100%[===================>]   1.06M  --.-KB/s    in 0.04s   

2024-12-04 16:19:57 (24.8 MB/s) - ‘input.txt.1’ saved [1115394/1115394]

import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 32  # independent sequences processed in parallel
block_size = 8  # context length
max_iters = 5000  # number of optimisation steps
eval_interval = 300  # evaluate train/val loss every this many steps
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200  # batches averaged per split when estimating the loss
n_embd = 32  # embedding size (NOTE(review): not referenced by the code visible in this section)

torch.manual_seed(1337)

# read the data
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# extract all the unique characters that occur in the text;
# sorted() accepts the set directly, no intermediate list needed
chars = sorted(set(text))
vocab_size = len(chars)

# create mappings between characters and integer ids
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Convert a string to a list of integer ids using ``string_to_int``."""
    return [string_to_int[c] for c in s]


def decode(l):
    """Convert a list of integer ids back to a string using ``int_to_string``."""
    return ''.join([int_to_string[i] for i in l])


# train/validation split: first 90% of the encoded text is train, the rest is val
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    """Sample a random batch of (input, target) windows and move it to ``device``.

    Reads the module-level ``train_data``, ``val_data``, ``block_size``,
    ``batch_size`` and ``device``.

    Args:
        split: ``'train'`` selects the training data; any other value selects
            the validation data.

    Returns:
        ``(x, y)``, two stacked tensors of ``batch_size`` windows of length
        ``block_size``; ``y`` is ``x`` shifted one position to the right.
    """
    source = val_data if split != 'train' else train_data
    # randint's upper bound is exclusive, so every window (plus its shifted
    # target) fits inside the data
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)


@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    Averages the loss of the module-level ``model`` over ``eval_iters``
    random batches per split, with gradients disabled and the model in eval
    mode. The model's previous train/eval mode is restored on exit, even if
    evaluation raises.

    Returns:
        Dict with keys ``'train'`` and ``'val'`` mapping to 0-dim tensors
        holding the mean loss for that split.
    """
    out = {}
    # Remember the current mode so evaluation does not silently flip a model
    # that the caller had put in eval mode back to train mode
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        model.train(was_training)
    return out

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token."""

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: Optional (B, T) integer tensor of expected next tokens.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` is (B, T, C) and
            ``loss`` is ``None``. With targets, ``logits`` is flattened to
            (B*T, C) and ``loss`` is the cross-entropy over all positions.
        """
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # F.cross_entropy takes (N, C) scores with (N,) class ids
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Runs under ``torch.no_grad()``: sampling never backpropagates, so
        there is no reason to build an autograd graph on every step.

        Args:
            idx: (B, T) tensor of indices in the current context.
            max_new_tokens: Number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the
            sampled tokens.
        """
        for _ in range(max_new_tokens):
            # get the predictions
            logits, _loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


model = BigramLanguageModel(vocab_size)
# nn.Module.to() moves the parameters and returns the module itself,
# so `m` and `model` refer to the same object
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# `step` rather than `iter`: a module-level loop variable named `iter` would
# shadow the builtin iter() for all code that runs afterwards
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then clear stale gradients, backpropagate and update
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model, starting from a single sequence holding token id 0
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 4.7305, val loss 4.7241
step 300: train loss 2.8110, val loss 2.8249
step 600: train loss 2.5434, val loss 2.5682
step 900: train loss 2.4932, val loss 2.5088
step 1200: train loss 2.4863, val loss 2.5035
step 1500: train loss 2.4665, val loss 2.4921
step 1800: train loss 2.4683, val loss 2.4936
step 2100: train loss 2.4696, val loss 2.4846
step 2400: train loss 2.4638, val loss 2.4879
step 2700: train loss 2.4738, val loss 2.4911
step 3000: train loss 2.4613, val loss 2.4897
step 3300: train loss 2.4689, val loss 2.4793
step 3600: train loss 2.4554, val loss 2.4919
step 3900: train loss 2.4682, val loss 2.4906
step 4200: train loss 2.4634, val loss 2.4882
step 4500: train loss 2.4563, val loss 2.4804
step 4800: train loss 2.4557, val loss 2.4852


My, g: ir'de wherethiszDos he ye tsthicur foreles!
KI I n m hitof mas JUTUngnobressuch s ane Sl:
The g! inoes mechindo, t hateforeorle ey ch ny eptourveet hat as heyo hur s wa f s is sthecithate I k.
F s'demath IORONTEL:

LO:
MIUK:
S:
INRIsenta d ar, the nghim it INCithifour bje:
Thans w bornowhalll are s s that le we hat
Cliver?
ARI k.
To tom.
BRABucownar, lant sthe fryo nod thte be.
Theito d asdssD:
FO, qun,
ONETENThencrs?
HAD whorke!
shifa han:
Frdard sen,
VIfon: y the, k'sut s ane cr t s ho

Transformer-Based Models - ReMA (RU)

Tutorial 3

Last update: 2024/11/28

Aditya Parikh (aditya.parikh@ru.nl)

Task 1

Answer key