proj5 report initialized

proj5/models.py (new executable file)
@@ -0,0 +1,507 @@
from torch import no_grad, stack
from torch.utils.data import DataLoader
from torch.nn import Module


"""
Functions you should use.
Please avoid importing any other functions or modules.
Your code will not pass if the gradescope autograder detects any changed imports.
"""
import torch
from torch.nn import Parameter, Linear
from torch import optim, tensor, tensordot, ones, matmul
from torch.nn.functional import cross_entropy, relu, mse_loss, softmax
from torch import movedim


class PerceptronModel(Module):
    def __init__(self, dimensions):
        """
        Initialize a new Perceptron instance.

        A perceptron classifies data points as either belonging to a particular
        class (+1) or not (-1). `dimensions` is the dimensionality of the data.
        For example, dimensions=2 would mean that the perceptron must classify
        2D points.

        In order for our autograder to detect your weight, initialize it as a
        pytorch Parameter object as follows:

        Parameter(weight_vector)

        where weight_vector is a pytorch Tensor of dimension 'dimensions'

        Hint: You can use ones(dim) to create a tensor of dimension dim.
        """
        super(PerceptronModel, self).__init__()

        self.w = Parameter(ones(1, dimensions))

    def get_weights(self):
        """
        Return a Parameter instance with the current weights of the perceptron.
        """
        return self.w

    def run(self, x):
        """
        Calculates the score assigned by the perceptron to a data point x.

        Inputs:
            x: a node with shape (1 x dimensions)
        Returns: a node containing a single number (the score)

        The pytorch function `tensordot` may be helpful here.
        """
        return tensordot(self.w, x, dims=2)

    def get_prediction(self, x):
        """
        Calculates the predicted class for a single data point `x`.

        Returns: 1 or -1
        """
        score = self.run(x)
        return 1 if score.item() >= 0 else -1

    def train(self, dataset):
        """
        Train the perceptron until convergence.
        You can iterate through DataLoader in order to
        retrieve all the batches you need to train on.

        Each sample in the dataloader is in the form {'x': features, 'label': label},
        where label is the item we need to predict based on its features.
        """
        with no_grad():
            dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
            while True:
                converged = True
                for batch in dataloader:
                    x = batch['x']
                    y = batch['label']
                    prediction = self.get_prediction(x)
                    if prediction != y.item():
                        # Classic perceptron update: nudge the weights toward
                        # (or away from) the misclassified point.
                        self.w.data += y.item() * x
                        converged = False
                if converged:
                    break


class RegressionModel(Module):
    """
    A neural network model for approximating a function that maps from real
    numbers to real numbers. The network should be sufficiently large to be able
    to approximate sin(x) on the interval [-2pi, 2pi] to reasonable precision.
    """
    def __init__(self):
        # Initialize your model parameters here
        "*** YOUR CODE HERE ***"
        super().__init__()
        self.layer1 = Linear(1, 256)
        self.layer2 = Linear(256, 256)
        self.layer3 = Linear(256, 1)

    def forward(self, x):
        """
        Runs the model for a batch of examples.

        Inputs:
            x: a node with shape (batch_size x 1)
        Returns:
            A node with shape (batch_size x 1) containing predicted y-values
        """
        "*** YOUR CODE HERE ***"
        x = relu(self.layer1(x))
        x = relu(self.layer2(x))
        return self.layer3(x)

    def get_loss(self, x, y):
        """
        Computes the loss for a batch of examples.

        Inputs:
            x: a node with shape (batch_size x 1)
            y: a node with shape (batch_size x 1), containing the true y-values
                to be used for training
        Returns: a tensor of size 1 containing the loss
        """
        "*** YOUR CODE HERE ***"
        return mse_loss(self.forward(x), y)

    def train(self, dataset):
        """
        Trains the model.

        In order to create batches, create a DataLoader object and pass in `dataset` as well as your required
        batch size. You can look at PerceptronModel as a guideline for how you should implement the DataLoader.

        Each sample in the dataloader object will be in the form {'x': features, 'label': label},
        where label is the item we need to predict based on its features.

        Inputs:
            dataset: a PyTorch dataset object containing data to be trained on
        """
        "*** YOUR CODE HERE ***"
        batch_size = 40
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        optimizer = optim.Adam(self.parameters(), lr=0.001)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=250, gamma=0.1)

        for epoch in range(1000):  # Train for up to 1000 epochs, stopping early once the loss is low enough
            total_loss = 0
            num_samples = 0
            for batch in dataloader:
                optimizer.zero_grad()
                loss = self.get_loss(batch['x'], batch['label'])
                loss.backward()
                optimizer.step()
                total_loss += loss.item() * len(batch['x'])
                num_samples += len(batch['x'])

            scheduler.step()

            avg_loss = total_loss / num_samples
            if avg_loss < 0.02:
                break


class DigitClassificationModel(Module):
    """
    A model for handwritten digit classification using the MNIST dataset.

    Each handwritten digit is a 28x28 pixel grayscale image, which is flattened
    into a 784-dimensional vector for the purposes of this model. Each entry in
    the vector is a floating point number between 0 and 1.

    The goal is to sort each digit into one of 10 classes (number 0 through 9).

    (See RegressionModel for more information about the APIs of different
    methods here. We recommend that you implement the RegressionModel before
    working on this part of the project.)
    """
    def __init__(self):
        # Initialize your model parameters here
        super().__init__()
        input_size = 28 * 28
        output_size = 10
        "*** YOUR CODE HERE ***"
        self.layer1 = Linear(input_size, 256)
        self.layer2 = Linear(256, 128)
        self.layer3 = Linear(128, output_size)

    def run(self, x):
        """
        Runs the model for a batch of examples.

        Your model should predict a node with shape (batch_size x 10),
        containing scores. Higher scores correspond to greater probability of
        the image belonging to a particular class.

        Inputs:
            x: a tensor with shape (batch_size x 784)
        Output:
            A node with shape (batch_size x 10) containing predicted scores
                (also called logits)
        """
        """ YOUR CODE HERE """
        x = relu(self.layer1(x))
        x = relu(self.layer2(x))
        return self.layer3(x)

    def get_loss(self, x, y):
        """
        Computes the loss for a batch of examples.

        The correct labels `y` are represented as a tensor with shape
        (batch_size x 10). Each row is a one-hot vector encoding the correct
        digit class (0-9).

        Inputs:
            x: a node with shape (batch_size x 784)
            y: a node with shape (batch_size x 10)
        Returns: a loss tensor
        """
        """ YOUR CODE HERE """
        return cross_entropy(self.run(x), y)

    def train(self, dataset):
        """
        Trains the model.
        """
        """ YOUR CODE HERE """
        batch_size = 100
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        optimizer = optim.Adam(self.parameters(), lr=0.001)

        for epoch in range(5):  # Train for 5 epochs
            for batch in dataloader:
                optimizer.zero_grad()
                loss = self.get_loss(batch['x'], batch['label'])
                loss.backward()
                optimizer.step()


class LanguageIDModel(Module):
    """
    A model for language identification at a single-word granularity.

    (See RegressionModel for more information about the APIs of different
    methods here. We recommend that you implement the RegressionModel before
    working on this part of the project.)
    """
    def __init__(self):
        # Our dataset contains words from five different languages, and the
        # combined alphabets of the five languages contain a total of 47 unique
        # characters.
        # You can refer to self.num_chars or len(self.languages) in your code
        self.num_chars = 47
        self.languages = ["English", "Spanish", "Finnish", "Dutch", "Polish"]
        super(LanguageIDModel, self).__init__()
        "*** YOUR CODE HERE ***"
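        # A minimal sketch of one possible design, not the required one: a
        # simple Elman-style RNN built from the Linear layers imported above.
        # hidden_size and the names initial_layer / hidden_layer / output_layer
        # are illustrative choices made here, not part of the assignment spec.
        self.hidden_size = 256
        self.initial_layer = Linear(self.num_chars, self.hidden_size)
        self.hidden_layer = Linear(self.num_chars + self.hidden_size, self.hidden_size)
        self.output_layer = Linear(self.hidden_size, len(self.languages))
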
    def run(self, xs):
        """
        Runs the model for a batch of examples.

        Although words have different lengths, our data processing guarantees
        that within a single batch, all words will be of the same length (L).

        Here `xs` will be a list of length L. Each element of `xs` will be a
        tensor with shape (batch_size x self.num_chars), where every row in the
        array is a one-hot vector encoding of a character. For example, if we
        have a batch of 8 three-letter words where the last word is "cat", then
        xs[1] will be a tensor that contains a 1 at position (7, 0). Here the
        index 7 reflects the fact that "cat" is the last word in the batch, and
        the index 0 reflects the fact that the letter "a" is the initial (0th)
        letter of our combined alphabet for this task.

        Your model should use a Recurrent Neural Network to summarize the list
        `xs` into a single tensor of shape (batch_size x hidden_size), for your
        choice of hidden_size. It should then calculate a tensor of shape
        (batch_size x 5) containing scores, where higher scores correspond to
        greater probability of the word originating from a particular language.

        Inputs:
            xs: a list with L elements (one per character), where each element
                is a node with shape (batch_size x self.num_chars)
        Returns:
            A node with shape (batch_size x 5) containing predicted scores
                (also called logits)
        """
        "*** YOUR CODE HERE ***"
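        # A minimal sketch, assuming the illustrative layers defined in
        # __init__ above: fold each character into a running hidden state,
        # then map the final state to per-language scores.
        h = relu(self.initial_layer(xs[0]))
        for x in xs[1:]:
            h = relu(self.hidden_layer(torch.cat((x, h), dim=1)))
        return self.output_layer(h)
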
    def get_loss(self, xs, y):
        """
        Computes the loss for a batch of examples.

        The correct labels `y` are represented as a node with shape
        (batch_size x 5). Each row is a one-hot vector encoding the correct
        language.

        Inputs:
            xs: a list with L elements (one per character), where each element
                is a node with shape (batch_size x self.num_chars)
            y: a node with shape (batch_size x 5)
        Returns: a loss node
        """
        "*** YOUR CODE HERE ***"
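        # A minimal sketch, mirroring DigitClassificationModel.get_loss above;
        # cross_entropy accepts the one-hot rows of `y` as class probabilities.
        return cross_entropy(self.run(xs), y)
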
    def train(self, dataset):
        """
        Trains the model.

        Note that when you iterate through the dataloader, each batch will be returned as its own tensor of shape
        (batch_size x length of word x self.num_chars). However, in order to run multiple samples at the same time,
        get_loss() and run() expect each batch to be in the form (length of word x batch_size x self.num_chars), meaning
        that you need to switch the first two dimensions of every batch. This can be done with the movedim() function
        as follows:

        movedim(input_vector, initial_dimension_position, final_dimension_position)

        For more information, look at the pytorch documentation of torch.movedim()
        """
        "*** YOUR CODE HERE ***"
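        # A minimal sketch; the batch size, learning rate, and epoch count are
        # illustrative choices, not prescribed values. Iterating over the
        # reordered (L x batch_size x num_chars) tensor yields one
        # (batch_size x num_chars) slice per character, which is what run()
        # expects.
        dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
        optimizer = optim.Adam(self.parameters(), lr=0.001)
        for epoch in range(20):
            for batch in dataloader:
                optimizer.zero_grad()
                xs = movedim(batch['x'], 0, 1)  # swap batch and length dims
                loss = self.get_loss(xs, batch['label'])
                loss.backward()
                optimizer.step()
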
def Convolve(input: tensor, weight: tensor):
    """
    Acts as a convolution layer by applying a 2d convolution with the given inputs and weights.
    DO NOT import any pytorch methods to directly do this; the convolution must be done with only the functions
    already imported.

    There are multiple ways to complete this function. One possible solution would be to use 'tensordot'.
    If you would like to index a tensor, you can do it as such:

    tensor[y:y+height, x:x+width]

    This returns a subtensor whose first element is tensor[y, x] and which has height 'height' and width 'width'.
    """
    input_h, input_w = input.shape
    weight_h, weight_w = weight.shape
    output_h = input_h - weight_h + 1
    output_w = input_w - weight_w + 1

    output_tensor = torch.zeros((output_h, output_w))

    # Slide the kernel over every valid position and take the elementwise
    # dot product of each window with the weights.
    for y in range(output_h):
        for x in range(output_w):
            sub_tensor = input[y:y+weight_h, x:x+weight_w]
            output_tensor[y, x] = tensordot(sub_tensor, weight, dims=2)

    return output_tensor
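# Illustrative quick check (hypothetical values, not part of the assignment):
# convolving a 4x4 tensor of ones with a 2x2 kernel of ones should give a
# 3x3 tensor in which every entry is 4.0, since each 2x2 window sums to 4.
#
#     Convolve(torch.ones(4, 4), torch.ones(2, 2))
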
class DigitConvolutionalModel(Module):
    """
    A model for handwritten digit classification using the MNIST dataset.

    This class is a convolutional model which has already been trained on MNIST.
    If Convolve() has been correctly implemented, this model should be able to achieve a high accuracy
    on the MNIST dataset given the pretrained weights.

    Note that this class looks different from a standard pytorch model since we don't need to train it,
    as it will be run on preset weights.
    """

    def __init__(self):
        # Initialize your model parameters here
        super().__init__()
        output_size = 10

        self.convolution_weights = Parameter(ones((3, 3)))
        """ YOUR CODE HERE """
        # A 3x3 kernel over a 28x28 image leaves a 26x26 feature map.
        conv_output_size = 26 * 26
        hidden_size = 100
        self.layer1 = Linear(conv_output_size, hidden_size)
        self.layer2 = Linear(hidden_size, output_size)
    def run(self, x):
        return self(x)

    def forward(self, x):
        """
        The convolutional layer is already applied, and the output is flattened for you. You should treat x as
        a regular 1-dimensional datapoint now, similar to the previous questions.
        """
        x = x.reshape(len(x), 28, 28)
        x = stack(list(map(lambda sample: Convolve(sample, self.convolution_weights), x)))
        x = x.flatten(start_dim=1)
        """ YOUR CODE HERE """
        x = relu(self.layer1(x))
        return self.layer2(x)

    def get_loss(self, x, y):
        """
        Computes the loss for a batch of examples.

        The correct labels `y` are represented as a tensor with shape
        (batch_size x 10). Each row is a one-hot vector encoding the correct
        digit class (0-9).

        Inputs:
            x: a node with shape (batch_size x 784)
            y: a node with shape (batch_size x 10)
        Returns: a loss tensor
        """
        """ YOUR CODE HERE """
        return cross_entropy(self.forward(x), y)

    def train(self, dataset):
        """
        Trains the model.
        """
        """ YOUR CODE HERE """
        batch_size = 64
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        optimizer = optim.Adam(self.parameters(), lr=0.001)

        for epoch in range(3):
            for batch in dataloader:
                optimizer.zero_grad()
                loss = self.get_loss(batch['x'], batch['label'])
                loss.backward()
                optimizer.step()


class Attention(Module):
    def __init__(self, layer_size, block_size):
        super().__init__()
        """
        All the layers you should use are defined here.

        In order to pass the autograder, make sure each linear layer matches up with its corresponding matrix,
        i.e. use self.k_layer to generate the K matrix.
        """
        self.k_layer = Linear(layer_size, layer_size)
        self.q_layer = Linear(layer_size, layer_size)
        self.v_layer = Linear(layer_size, layer_size)

        # Masking part of attention layer
        self.register_buffer("mask", torch.tril(torch.ones(block_size, block_size))
                             .view(1, 1, block_size, block_size))

        self.layer_size = layer_size
    def forward(self, input):
        """
        Applies the attention mechanism to input. All necessary layers have
        been defined in __init__()

        In order to apply the causal mask to a given matrix M, you should update
        it as such:

        M = M.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))[0]

        For the softmax activation, it should be applied to the last dimension of the input.
        Take a look at the "dim" argument of torch.nn.functional.softmax to figure out how to do this.
        """
        B, T, C = input.size()

        """YOUR CODE HERE"""
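        # A minimal sketch of causal scaled dot-product self-attention, using
        # only the imports above and the masking recipe from the docstring.
        # Scaling by sqrt(layer_size) is the standard choice assumed here.
        Q = self.q_layer(input)
        K = self.k_layer(input)
        V = self.v_layer(input)
        scores = matmul(Q, movedim(K, 1, 2)) / (self.layer_size ** 0.5)
        scores = scores.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))[0]
        weights = softmax(scores, dim=-1)
        return matmul(weights, V)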