# cs188/proj5/gpt_model.py
"""
The MIT License (MIT) Copyright (c) 2020 Andrej Karpathy
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
References:
1) "minGPT" implemented by Andrej Karpathy
https://github.com/karpathy/minGPT
2) the official GPT-2 TensorFlow implementation released by OpenAI:
https://github.com/openai/gpt-2/blob/master/src/model.py
3) huggingface/transformers PyTorch implementation:
https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
"""
import torch
import torch.nn as nn
from torch.nn import functional as F
from models import Attention


class Transformer_Block(nn.Module):
"""
This class builds the basic transformer block.
"""
def __init__(self, n_embd, block_size):
super().__init__()
self.attn_block = Attention(n_embd, block_size)
self.norm_1 = nn.LayerNorm(n_embd)
self.linear_1 = nn.Linear(n_embd, n_embd)
        self.norm_2 = nn.LayerNorm(n_embd)

    def forward(self, x):
"""YOUR CODE HERE"""
class Character_GPT(nn.Module):
    """
    A character-level GPT: a token embedding layer followed by a stack of
    transformer blocks, a final layer norm, and a linear projection to
    per-character logits.
    """

    def __init__(self, block_size, n_embd, n_layer, vocab_size):
        super().__init__()
        self.block_size = block_size
        self.embed = nn.Embedding(vocab_size, n_embd)  # Token embedding layer; maps each character index to an n_embd-dimensional vector (similar in spirit to a linear layer)
        self.transformer_blocks = nn.ModuleList([Transformer_Block(n_embd, block_size) for _ in range(n_layer)])  # Behaves like a Python list of Transformer_Block modules
        self.norm = nn.LayerNorm(n_embd)  # Final normalization layer
        self.output_layer = nn.Linear(n_embd, vocab_size, bias=False)  # Projects embeddings to per-character logits

def get_loss(self, input, target):
output = self(input)
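        # The logits have shape (batch, time, vocab_size) and the targets (batch, time);
        # both are flattened so the cross-entropy is computed per character, and
        # positions labeled -1 are ignored.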
        return F.cross_entropy(output.view(-1, output.size(-1)), target.view(-1), ignore_index=-1)

    def forward(self, input):
"""
This function should take in an input representing a sequence of characters, and output
an array representing the likelihood of any character appearing next.
All necessary layers have been initialized for you in the __init__() function, you should pay special
attention to the self.transformer_blocks variable. Since we have multiple transformer blocks in our
final model, you will have to pass the input through every object in this list.
"""
b, t = input.size()
assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
"""YOUR CODE HERE"""
@torch.no_grad()
def generate(self, idx, max_new_tokens):
"""
Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
the sequence max_new_tokens times, feeding the predictions back into the model each time.
"""
for _ in range(max_new_tokens):
# if the sequence context is growing too long we must crop it at block_size
idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            # forward the model to get the logits for every position in the sequence
            logits = self(idx_cond)
            # keep only the logits for the final position
            logits = logits[:, -1, :]
            # apply softmax to convert the logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample the next character index from the predicted distribution
            idx_next = torch.multinomial(probs, num_samples=1)
# append sampled index to the running sequence and continue
idx = torch.cat((idx, idx_next), dim=1)
return idx
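

if __name__ == "__main__":
    # Hedged usage sketch: the hyperparameter values and the starting index below are
    # hypothetical, chosen only for illustration; the real vocabulary, hyperparameters,
    # and training pipeline live elsewhere in the project.
    model = Character_GPT(block_size=32, n_embd=64, n_layer=2, vocab_size=65)
    context = torch.zeros((1, 1), dtype=torch.long)  # a single (arbitrary) start-character index
    completion = model.generate(context, max_new_tokens=20)
    print(completion.shape)  # expected: torch.Size([1, 21])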