"""
|
|
The MIT License (MIT) Copyright (c) 2020 Andrej Karpathy
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
|
|
|
|
References:
|
|
1) "minGPT" implemented by Andrej Karpathy
|
|
https://github.com/karpathy/minGPT
|
|
2) the official GPT-2 TensorFlow implementation released by OpenAI:
|
|
https://github.com/openai/gpt-2/blob/master/src/model.py
|
|
3) huggingface/transformers PyTorch implementation:
|
|
https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
|
|
"""
|
|
|
|
|
|
import torch
import torch.nn as nn
from torch.nn import functional as F

from models import Attention


class Transformer_Block(nn.Module):
    """
    This class builds the basic transformer block from a self-attention layer,
    a linear layer, and two LayerNorm layers.
    """

    def __init__(self, n_embd, block_size):
        super().__init__()

        self.attn_block = Attention(n_embd, block_size)  # attention block, imported from models
        self.norm_1 = nn.LayerNorm(n_embd)
        self.linear_1 = nn.Linear(n_embd, n_embd)
        self.norm_2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """YOUR CODE HERE"""
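        # A minimal sketch of one possible wiring (an illustration, not the intended
        # solution): a residual pre-LayerNorm block, assuming self.attn_block returns
        # a tensor with the same (batch, time, n_embd) shape as its input.
        x = x + self.attn_block(self.norm_1(x))   # residual connection around attention
        x = x + self.linear_1(self.norm_2(x))     # residual connection around the linear layer
        return x
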
class Character_GPT(nn.Module):
    """
    A character-level GPT: an embedding layer, a stack of transformer blocks,
    a final LayerNorm, and a linear output head producing per-character logits.
    """

    def __init__(self, block_size, n_embd, n_layer, vocab_size):
        super().__init__()
        self.block_size = block_size
        self.embed = nn.Embedding(vocab_size, n_embd)  # embedding layer; think of this as similar to a linear layer

        self.transformer_blocks = nn.ModuleList([Transformer_Block(n_embd, block_size) for _ in range(n_layer)])  # you can treat this as a Python list
        self.norm = nn.LayerNorm(n_embd)  # normalization layer
        self.output_layer = nn.Linear(n_embd, vocab_size, bias=False)  # projects back to vocabulary logits

    def get_loss(self, input, target):
        # flatten the (b, t, vocab_size) logits and (b, t) targets for cross-entropy;
        # target positions equal to -1 are ignored
        output = self(input)
        return F.cross_entropy(output.view(-1, output.size(-1)), target.view(-1), ignore_index=-1)

    def forward(self, input):
        """
        This function should take in an input representing a sequence of characters and output
        an array representing the likelihood of each character appearing next.

        All necessary layers have been initialized for you in the __init__() function; pay special
        attention to the self.transformer_blocks variable. Since we have multiple transformer blocks
        in our final model, you will have to pass the input through every object in this list.
        """
        b, t = input.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"

        """YOUR CODE HERE"""
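        # A minimal sketch of one possibility (an illustration, not the intended solution).
        # It only uses the layers created in __init__(): embed the characters, run them
        # through every transformer block, normalize, then project to per-character logits.
        x = self.embed(input)                   # (b, t) -> (b, t, n_embd)
        for block in self.transformer_blocks:   # pass through every transformer block
            x = block(x)
        x = self.norm(x)                        # final LayerNorm
        return self.output_layer(x)             # (b, t, n_embd) -> (b, t, vocab_size)
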
    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b, t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits = self(idx_cond)
            # pluck the logits at the final step
            logits = logits[:, -1, :]
            # optionally crop the logits to only the top k options
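            # (sketch only: this generate() takes no top_k argument, but for some
            #  hypothetical k the cropping would look like:
            #     v, _ = torch.topk(logits, k)
            #     logits[logits < v[:, [-1]]] = -float('inf')
            # )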
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # either sample from the distribution or take the most likely element
            idx_next = torch.multinomial(probs, num_samples=1)
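            # (greedy alternative, not used here: idx_next = torch.argmax(probs, dim=-1, keepdim=True))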
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx
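

if __name__ == "__main__":
    # Quick smoke test with made-up hyperparameters (illustrative values only; the real
    # vocab_size and model sizes come from whatever training script drives this module).
    # It assumes models.Attention is importable and the forward passes above are filled in.
    torch.manual_seed(0)
    model = Character_GPT(block_size=32, n_embd=64, n_layer=2, vocab_size=65)
    x = torch.randint(0, 65, (4, 16))  # batch of 4 sequences, 16 characters each
    y = torch.randint(0, 65, (4, 16))  # next-character targets
    print("loss:", model.get_loss(x, y).item())
    print("generated shape:", model.generate(x[:, :8], max_new_tokens=8).shape)  # (4, 16)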