Transformer-Based Models - ReMA (RU)
In this tutorial, we will assemble all the modules of the transformer architecture that we created in the last tutorial (I know it was very quick) and try to generate some sequences with our tiny-shakespeare dataset.
The goal of this tutorial is to understand how transformer-based models are trained and to compare different architectures, i.e. encoder-only, decoder-only, and encoder-decoder.
Start with importing our dataset
! wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
--2024-12-12 05:50:03-- https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ... Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 1115394 (1.1M) [text/plain] Saving to: ‘input.txt’ input.txt 0%[ ] 0 --.-KB/s input.txt 100%[===================>] 1.06M --.-KB/s in 0.04s 2024-12-12 05:50:03 (27.6 MB/s) - ‘input.txt’ saved [1115394/1115394]
Below is the complete code for the decoder-only transformer architecture. Run the cell below to start training; once training finishes, the model generates a sample sequence. Monitor the loss during training. Previously, with the bigram model, the loss plateaued around ~2.50. This model will significantly reduce the loss.
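The loss measures how far the model's predicted outputs are from the actual target values during training (here, the cross-entropy between the predicted next-character distribution and the true next character). It is a key metric used to evaluate how well a machine learning model is performing; lower is better.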
import torch
import torch.nn as nn
from torch.nn import functional as F
# Hyperparameters for the small model.
batch_size = 16      # number of independent sequences processed in parallel
block_size = 32      # maximum context length (in characters) used for a prediction
max_iters = 5000     # number of optimisation steps
eval_interval = 100  # evaluate train/val loss every this many steps
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200     # batches averaged per split when estimating the loss
n_embd = 64          # embedding width
n_head = 4           # attention heads per transformer block
n_layer = 4          # number of transformer blocks
dropout = 0.0        # dropout probability
# ------------

torch.manual_seed(1337)  # seed PyTorch's RNG before any sampling or weight init

# Load the whole corpus into memory as a single string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# The vocabulary is the sorted set of distinct characters in the corpus.
chars = sorted(set(text))
vocab_size = len(chars)

# Two-way mapping between characters and integer ids.
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}


def encode(s):
    """Encoder: take a string, return the list of its character ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of character ids, return the string."""
    return ''.join(itos[i] for i in l)


# Train/validation split: the first 90% of the encoded corpus is used for
# training, the remainder for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]
# data loading
def get_batch(split):
    """Sample a random batch of (input, target) sequences.

    Args:
        split: 'train' selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A pair ``(x, y)`` of tensors, each of shape (batch_size, block_size)
        and moved to ``device``. ``y`` holds the same windows as ``x``
        shifted one position to the right, i.e. the next-token targets.
    """
    source = train_data if split == 'train' else val_data
    # One random start offset per sequence; the upper bound leaves room for
    # the one-step-shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)
@torch.no_grad()
def estimate_loss():
out = {}
model.eval()
for split in ['train', 'val']:
losses = torch.zeros(eval_iters)
for k in range(eval_iters):
X, Y = get_batch(split)
logits, loss = model(X, Y)
losses[k] = loss.item()
out[split] = losses.mean()
model.train()
return out
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: output width of the key, query and value projections.

    The input to ``forward`` is a (B, T, n_embd) tensor with
    T <= block_size; the output is (B, T, head_size).
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask; a buffer (not a parameter) so it follows the
        # module across devices without being trained.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        _, T, _ = x.shape
        k = self.key(x)    # (B, T, hs)
        q = self.query(x)  # (B, T, hs)
        # Attention scores ("affinities"). Scale by 1/sqrt(head_size), the
        # width of the vectors entering the dot product, not by the embedding
        # width of x.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # Position t may only attend to positions <= t.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # Weighted aggregation of the values.
        v = self.value(x)  # (B, T, hs)
        out = wei @ v      # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out
class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention run in parallel.

    Args:
        num_heads: number of ``Head`` modules to run side by side.
        head_size: output width of each head.

    The head outputs are concatenated along the last dimension, projected to
    ``n_embd`` features and passed through dropout.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The projection consumes the concatenated head outputs, whose width
        # is num_heads * head_size. That equals n_embd only when n_embd is
        # divisible by the number of heads.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # (B, T, num_heads * head_size)
        out = self.dropout(self.proj(out))                   # (B, T, n_embd)
        return out
class FeedFoward(nn.Module):
    """Per-position MLP: widen to 4 * n_embd, ReLU, project back, dropout.

    Args:
        n_embd: input and output feature width.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        y = self.net(x)
        return y
class Block(nn.Module):
    """Transformer block: self-attention (communication), then an MLP (computation).

    Both sub-layers are applied to a layer-normalised copy of their input and
    added back to it (residual connections).

    Args:
        n_embd: embedding dimension.
        n_head: number of attention heads; each head gets n_embd // n_head
            features.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))
# Decoder-only transformer language model. The class keeps the name
# BigramLanguageModel so that existing references to it continue to work.
class BigramLanguageModel(nn.Module):
    """Character-level, decoder-only transformer language model.

    Token and learned position embeddings are summed, passed through
    ``n_layer`` transformer blocks and a final LayerNorm, and projected to
    ``vocab_size`` logits per position.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # One learned vector per position; its size is the longest sequence
        # the model can process.
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits is (B, T, vocab_size)
            and loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the mean cross-entropy.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # Build the position ids on the input's own device so the model does
        # not depend on a module-level ``device`` variable.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)
        x = tok_emb + pos_emb     # (B, T, C)
        x = self.blocks(x)        # (B, T, C)
        x = self.ln_f(x)          # (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)
        if targets is None:
            return logits, None
        # cross_entropy expects (N, C) logits and (N,) targets.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Sampling runs without gradient tracking and with the module
        temporarily in eval mode, so dropout does not perturb the predicted
        distribution; the previous train/eval mode is restored afterwards.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) integer tensor.
        """
        # The position table bounds how much context the model can see.
        context_len = self.position_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop the context to the last context_len tokens
                idx_cond = idx[:, -context_len:]
                logits, _ = self(idx_cond)
                # focus only on the last time step
                logits = logits[:, -1, :]  # (B, C)
                probs = F.softmax(logits, dim=-1)  # (B, C)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                # append the sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        finally:
            self.train(was_training)
        return idx
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is called ``step`` so it does not shadow the builtin iter().
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss and take one optimisation step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
# Switch to eval mode so dropout is disabled while sampling, and skip gradient
# tracking since no backward pass follows. The model stays in eval mode after this.
model.eval()
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():
    print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))
0.209729 M parameters step 0: train loss 4.4116, val loss 4.4022 step 100: train loss 2.6568, val loss 2.6670 step 200: train loss 2.5091, val loss 2.5058 step 300: train loss 2.4197, val loss 2.4336 step 400: train loss 2.3501, val loss 2.3562 step 500: train loss 2.2963, val loss 2.3125 step 600: train loss 2.2407, val loss 2.2496 step 700: train loss 2.2054, val loss 2.2187 step 800: train loss 2.1633, val loss 2.1866 step 900: train loss 2.1241, val loss 2.1504 step 1000: train loss 2.1036, val loss 2.1306 step 1100: train loss 2.0698, val loss 2.1180 step 1200: train loss 2.0380, val loss 2.0791 step 1300: train loss 2.0248, val loss 2.0634 step 1400: train loss 1.9926, val loss 2.0359 step 1500: train loss 1.9697, val loss 2.0287 step 1600: train loss 1.9627, val loss 2.0477 step 1700: train loss 1.9403, val loss 2.0115 step 1800: train loss 1.9090, val loss 1.9941 step 1900: train loss 1.9092, val loss 1.9858 step 2000: train loss 1.8847, val loss 1.9925 step 2100: train loss 1.8724, val loss 1.9757 step 2200: train loss 1.8580, val loss 1.9594 step 2300: train loss 1.8560, val loss 1.9537 step 2400: train loss 1.8412, val loss 1.9427 step 2500: train loss 1.8141, val loss 1.9402 step 2600: train loss 1.8292, val loss 1.9397 step 2700: train loss 1.8116, val loss 1.9322 step 2800: train loss 1.8032, val loss 1.9218 step 2900: train loss 1.8022, val loss 1.9285 step 3000: train loss 1.7955, val loss 1.9195 step 3100: train loss 1.7672, val loss 1.9192 step 3200: train loss 1.7568, val loss 1.9138 step 3300: train loss 1.7551, val loss 1.9059 step 3400: train loss 1.7549, val loss 1.8945 step 3500: train loss 1.7383, val loss 1.8956 step 3600: train loss 1.7242, val loss 1.8868 step 3700: train loss 1.7273, val loss 1.8822 step 3800: train loss 1.7176, val loss 1.8923 step 3900: train loss 1.7219, val loss 1.8750 step 4000: train loss 1.7131, val loss 1.8603 step 4100: train loss 1.7105, val loss 1.8777 step 4200: train loss 1.7033, val loss 1.8675 step 4300: 
train loss 1.7038, val loss 1.8556 step 4400: train loss 1.7057, val loss 1.8643 step 4500: train loss 1.6875, val loss 1.8528 step 4600: train loss 1.6887, val loss 1.8405 step 4700: train loss 1.6834, val loss 1.8501 step 4800: train loss 1.6675, val loss 1.8437 step 4900: train loss 1.6684, val loss 1.8407 step 4999: train loss 1.6645, val loss 1.8286 KING RICHARD II: Shal lifest made to bub, to take Our my dagatants: Whith foul his vetward that a endrer, my fears' to zorm heavens, Oof it heart my would but With ensengmin latest in ov the doest not. WARWICK: Welll now, and thus quechiry: there's speak you love. In Bodiet, and whom the sclittle Enout-now what evily well most rive with is compon to the me Town danters, If so; Ange to shall do aleous, for dear? KING HENRY VI: Hark, but a ards bring Edward? GROKE: As is no Rurnts I am you! who neet. Pom mary thou contrantym so a thense. QUEEN VINCENTIO: O, sir, may in God't well ow, whom confessy. Which migh. ARCHILINIUS: Dithul seaze Peed me: very it passce of's cruport; How what make you fear tals: there loves Tunkistren in deed, is xment. CORIONIUS: What comforts me. I with self From the walt I? GRINION: Which ushold. KING HENRY Gindner: Withrief I doot, is onter now. Securming: Intande whose no crown some Eiverely marry sold; For for me watch the our torguet! Goy, know our her and brut what I, I huself as humsell. APTOLYCUM: Laitance and toarth or word As beherefitions so me worting. CORIOLINA: What a wouldds, An but branedy wouldIng my a canity: Was you be any in Becausing watcess the Regreast men is what see would in thas jury your Hrannertandless; As there'erliacter me band frind through he crown, I she love is stay just torment: Slaw you behoth unserving of vonby the post, Whave baste hold; I they nengety may's fries To there's fince, I heave arrow old, Thee best sincess soul be that Lord, as; River thou a-latsteer: Out. PORALLINA: Where but Braight gentle, drieven the know you for that to this mack a rishn. 
Prawity arm as is infectely, Ah, sinstats o' no, this send; commant to love, Go fly this fathal I cortuns cold, offrong to old, the courtly thee? before a gace. KING RICHARD III: A life he pusict It. Vitters, and were not fanturs, thy promind thy awonse than a braute comforn, Will Roman! you brain shown'd for a dresss me; he heavison! MENE
This output is way better than our previous output from a simple bi-gram language model. We also printed out total parameters of the model i.e. 0.209729 M parameters. But you can also develop this at a larger scale. You just need to change the hyper-parameters. We do it below. Let's check it out.
At this point: do not forget to change the runtime and start the GPU on Google Colab.
Hyperparameters
- batch_size = 64
  How many independent sequences will we process in parallel?
- block_size = 256
  What is the maximum context length for predictions? Initially it was 8.
- max_iters = 5000
- eval_interval = 500
- learning_rate = 3e-4
- eval_iters = 200
- n_embd = 384
- n_head = 6 (Number of heads)
- n_layer = 6 (6 transformer blocks)
- dropout = 0.2
We also enabled dropout: with dropout = 0.2, at every forward-backward pass a random 20% of the intermediate activations are disabled (set to 0).
Now we will add these hyperparameters and run the code again.
import torch
import torch.nn as nn
from torch.nn import functional as F
# Hyperparameters for the scaled-up model (first-run values in parentheses).
batch_size = 64      # number of independent sequences processed in parallel (16)
block_size = 256     # maximum context length (in characters) used for a prediction (32)
max_iters = 5000     # number of optimisation steps
eval_interval = 500  # evaluate train/val loss every this many steps
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200     # batches averaged per split when estimating the loss
n_embd = 384         # embedding width (64)
n_head = 6           # attention heads per transformer block (4)
n_layer = 6          # number of transformer blocks (4)
dropout = 0.2        # dropout probability (0.0)
# ------------

torch.manual_seed(1337)  # seed PyTorch's RNG before any sampling or weight init

# Load the whole corpus into memory as a single string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# The vocabulary is the sorted set of distinct characters in the corpus.
chars = sorted(set(text))
vocab_size = len(chars)

# Two-way mapping between characters and integer ids.
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}


def encode(s):
    """Encoder: take a string, return the list of its character ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of character ids, return the string."""
    return ''.join(itos[i] for i in l)


# Train/validation split: the first 90% of the encoded corpus is used for
# training, the remainder for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]
# data loading
def get_batch(split):
    """Sample a random batch of (input, target) sequences.

    Args:
        split: 'train' selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A pair ``(x, y)`` of tensors, each of shape (batch_size, block_size)
        and moved to ``device``. ``y`` holds the same windows as ``x``
        shifted one position to the right, i.e. the next-token targets.
    """
    source = train_data if split == 'train' else val_data
    # One random start offset per sequence; the upper bound leaves room for
    # the one-step-shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)
@torch.no_grad()
def estimate_loss():
out = {}
model.eval()
for split in ['train', 'val']:
losses = torch.zeros(eval_iters)
for k in range(eval_iters):
X, Y = get_batch(split)
logits, loss = model(X, Y)
losses[k] = loss.item()
out[split] = losses.mean()
model.train()
return out
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: output width of the key, query and value projections.

    The input to ``forward`` is a (B, T, n_embd) tensor with
    T <= block_size; the output is (B, T, head_size).
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask; a buffer (not a parameter) so it follows the
        # module across devices without being trained.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        _, T, _ = x.shape
        k = self.key(x)    # (B, T, hs)
        q = self.query(x)  # (B, T, hs)
        # Attention scores ("affinities"). Scale by 1/sqrt(head_size), the
        # width of the vectors entering the dot product, not by the embedding
        # width of x.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # Position t may only attend to positions <= t.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # Weighted aggregation of the values.
        v = self.value(x)  # (B, T, hs)
        out = wei @ v      # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out
class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention run in parallel.

    Args:
        num_heads: number of ``Head`` modules to run side by side.
        head_size: output width of each head.

    The head outputs are concatenated along the last dimension, projected to
    ``n_embd`` features and passed through dropout.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The projection consumes the concatenated head outputs, whose width
        # is num_heads * head_size. That equals n_embd only when n_embd is
        # divisible by the number of heads.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # (B, T, num_heads * head_size)
        out = self.dropout(self.proj(out))                   # (B, T, n_embd)
        return out
class FeedFoward(nn.Module):
    """Per-position MLP: widen to 4 * n_embd, ReLU, project back, dropout.

    Args:
        n_embd: input and output feature width.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        y = self.net(x)
        return y
class Block(nn.Module):
    """Transformer block: self-attention (communication), then an MLP (computation).

    Both sub-layers are applied to a layer-normalised copy of their input and
    added back to it (residual connections).

    Args:
        n_embd: embedding dimension.
        n_head: number of attention heads; each head gets n_embd // n_head
            features.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))
# Decoder-only transformer language model. The class keeps the name
# BigramLanguageModel so that existing references to it continue to work.
class BigramLanguageModel(nn.Module):
    """Character-level, decoder-only transformer language model.

    Token and learned position embeddings are summed, passed through
    ``n_layer`` transformer blocks and a final LayerNorm, and projected to
    ``vocab_size`` logits per position.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # One learned vector per position; its size is the longest sequence
        # the model can process.
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits is (B, T, vocab_size)
            and loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the mean cross-entropy.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # Build the position ids on the input's own device so the model does
        # not depend on a module-level ``device`` variable.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)
        x = tok_emb + pos_emb     # (B, T, C)
        x = self.blocks(x)        # (B, T, C)
        x = self.ln_f(x)          # (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)
        if targets is None:
            return logits, None
        # cross_entropy expects (N, C) logits and (N,) targets.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Sampling runs without gradient tracking and with the module
        temporarily in eval mode, so dropout does not perturb the predicted
        distribution; the previous train/eval mode is restored afterwards.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) integer tensor.
        """
        # The position table bounds how much context the model can see.
        context_len = self.position_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop the context to the last context_len tokens
                idx_cond = idx[:, -context_len:]
                logits, _ = self(idx_cond)
                # focus only on the last time step
                logits = logits[:, -1, :]  # (B, C)
                probs = F.softmax(logits, dim=-1)  # (B, C)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                # append the sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        finally:
            self.train(was_training)
        return idx
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is called ``step`` so it does not shadow the builtin iter().
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss and take one optimisation step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
# Switch to eval mode so dropout (0.2 in this configuration) is disabled while
# sampling, and skip gradient tracking since no backward pass follows.
# The model stays in eval mode after this.
model.eval()
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():
    print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))
10.788929 M parameters step 0: train loss 4.2849, val loss 4.2823 step 500: train loss 2.0112, val loss 2.0971 step 1000: train loss 1.6021, val loss 1.7830 step 1500: train loss 1.4412, val loss 1.6396 step 2000: train loss 1.3430, val loss 1.5724 step 2500: train loss 1.2809, val loss 1.5330 step 3000: train loss 1.2268, val loss 1.5094 step 3500: train loss 1.1824, val loss 1.4881 step 4000: train loss 1.1475, val loss 1.4869 step 4500: train loss 1.1108, val loss 1.4805 step 4999: train loss 1.0779, val loss 1.4920 But with prison: I will stead with you. ISABELLA: Carress, all do; and I'll say your honour self good: Then I'll regn your highness and Compell'd by my sweet gates that you may: Valiant make how I heard of you. ANGELO: Nay, sir, Isay! ISABELLA: I am sweet men sister as you steed. LUCIO: As it if you in the case would princily, I'll rote, sir, I did cannot now at me? That look thence, thy children shall be you called. DUKE VINCENTIO: Marry, though I do read you! LUCIO: O that mufflest than that should do worse a mode, By good clopHelden brick, your petite infect, Give mattering summour; I pray you have an eanning of you, May be past a press'd, so we show with my walls. I slept, I play; for I am, but will. Second Peter: Hold Claudio you that sees to meet you, Tranio; Her well with my wounds shall see ht; but he were a smoth way his eweary wanto-mou rich on our A rose faitter gash; parce mo know that he did. DUCHESS OF YORY: Why, how far, ay? see, methought be not upon't? RICHMOND: No more! Beseech you are I turn the banish Nothing removed and turn'd the king sight. RICHARD: It mean us grows muture with now, like some tailor, That gliers make him speeding leaves ranted this law, And may as the noble liberty hatch, Something me high all buy, as well her she carried As young my demands? is clear? Love in my loyal pleggage? LEONTES: Fair God's fordship's noble could. Pray, sir, for my weeds, stips: old the wisestray follow. 
WELBOW: Pray, you are not honour. POLIXENES: My lords leve, you'll not so be. give me leave your father, for the offence but set this taughting war the su glass. Here in blood me with oath. What jot she'll cont many in meny fool Have yielege it, I acconds say 'tis beyn, but Loopless it dark, the plantage: would I, as we Hath thy son a word, year my husband; who shall meet there comet; and, even out the noble warter, As the whore our grace would slave, so much, He chides me loves me in for usurpians, Which the beggars he with no presently t
This model has 10.788929 M parameters. That is so cool! It took around 40-50 minutes to train the model, and training it on a CPU would not be practical.
Now let's check out the parameters in the GPT paper. Check out this paper: Language Models are Few-Shot Learners https://arxiv.org/pdf/2005.14165
One big difference between what we did and the GPT models above is the tokenizer. We simply used individual characters as tokens. In real-world conditions, however, a byte-pair encoding (BPE) tokenizer or a SentencePiece tokenizer is used.
Using the above code, you can create a decoder-only pre-trained model. This model will generate text (which may not necessarily make sense). If you ask it a question, it might keep generating sentences that look like questions, but it will not provide any meaningful answers.
In real-world applications, the model needs to provide answers, which requires a process different from pre-training. This process is called fine-tuning. Pre-trained models (also called checkpoints) need to be fine-tuned for specific tasks to achieve meaningful results.
You can learn how ChatGPT generates human-like responses here: https://openai.com/index/chatgpt/
Now is a good time to look at the different transformer architectures of pretrained models on HuggingFace 🤗
from transformers import AutoModel
# Load the pretrained BERT checkpoint by its Hugging Face Hub name.
checkpoint = "bert-base-uncased"
model = AutoModel.from_pretrained(checkpoint)

# Printing the model shows its architecture as a tree of sub-modules.
print(model)
/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: The secret `HF_TOKEN` does not exist in your Colab secrets. To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session. You will be able to reuse this secret in all of your notebooks. Please note that authentication is recommended but still optional to access public models or datasets. warnings.warn(
BertModel(
(embeddings): BertEmbeddings(
(word_embeddings): Embedding(30522, 768, padding_idx=0)
(position_embeddings): Embedding(512, 768)
(token_type_embeddings): Embedding(2, 768)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(encoder): BertEncoder(
(layer): ModuleList(
(0-11): 12 x BertLayer(
(attention): BertAttention(
(self): BertSdpaSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
(pooler): BertPooler(
(dense): Linear(in_features=768, out_features=768, bias=True)
(activation): Tanh()
)
)
Visualize Encoder-Decoder based transformer model, BART-base
from transformers import AutoModel
# Load the pretrained BART-base checkpoint by its Hugging Face Hub name.
checkpoint = "facebook/bart-base"
model = AutoModel.from_pretrained(checkpoint)

# Printing the model shows its architecture as a tree of sub-modules.
print(model)
BartModel(
(shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
(encoder): BartEncoder(
(embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
(embed_positions): BartLearnedPositionalEmbedding(1026, 768)
(layers): ModuleList(
(0-5): 6 x BartEncoderLayer(
(self_attn): BartSdpaAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(activation_fn): GELUActivation()
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
(final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
(layernorm_embedding): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(decoder): BartDecoder(
(embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
(embed_positions): BartLearnedPositionalEmbedding(1026, 768)
(layers): ModuleList(
(0-5): 6 x BartDecoderLayer(
(self_attn): BartSdpaAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(activation_fn): GELUActivation()
(self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(encoder_attn): BartSdpaAttention(
(k_proj): Linear(in_features=768, out_features=768, bias=True)
(v_proj): Linear(in_features=768, out_features=768, bias=True)
(q_proj): Linear(in_features=768, out_features=768, bias=True)
(out_proj): Linear(in_features=768, out_features=768, bias=True)
)
(encoder_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(fc2): Linear(in_features=3072, out_features=768, bias=True)
(final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
(layernorm_embedding): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
We took the code as reference from the famous tutorial by Andrej Karpathy. Please check out the nanoGPT GitHub repo to learn more.
Some more references:
- The official GPT-2 TensorFlow implementation released by OpenAI: https://github.com/openai/gpt-2/blob/master/src/model.py
- Huggingface/transformers PyTorch implementation: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py