"""
The MIT License (MIT) Copyright (c) 2020 Andrej Karpathy
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.


References:
1) "minGPT" implemented by Andrej Karpathy
https://github.com/karpathy/minGPT
2) the official GPT-2 TensorFlow implementation released by OpenAI:
https://github.com/openai/gpt-2/blob/master/src/model.py
3) huggingface/transformers PyTorch implementation:
https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
"""


import torch
import torch.nn as nn
from torch.nn import functional as F
from models import Attention


class Transformer_Block(nn.Module):
    """
    This class builds the basic transformer block.
    """
    def __init__(self, n_embd, block_size):
        super().__init__()

        self.attn_block = Attention(n_embd, block_size)
        self.norm_1 = nn.LayerNorm(n_embd)
        self.linear_1 = nn.Linear(n_embd, n_embd)
        self.norm_2 = nn.LayerNorm(n_embd)


    def forward(self, x):
        """YOUR CODE HERE"""

       
class Character_GPT(nn.Module):
   
    def __init__(self, block_size, n_embd, n_layer, vocab_size):
        super().__init__()
        self.block_size = block_size
        self.embed = nn.Embedding(vocab_size, n_embd) #Embedding layer, think of this as similar to a linear layer

        
        self.transformer_blocks = nn.ModuleList([Transformer_Block(n_embd, block_size) for _ in range(n_layer)]) #You can treat this as a python list
        self.norm = nn.LayerNorm(n_embd) #Normalization Layer
        self.output_layer = nn.Linear(n_embd, vocab_size, bias=False)


    def get_loss(self, input, target):
        output = self(input)
        return F.cross_entropy(output.view(-1, output.size(-1)), target.view(-1), ignore_index=-1)
        
    def forward(self, input):
        """
        This function should take in an input representing a sequence of characters, and output
        an array representing the likelihood of any character appearing next.

        All necessary layers have been initialized for you in the __init__() function, you should pay special
        attention to the self.transformer_blocks variable. Since we have multiple transformer blocks in our
        final model, you will have to pass the input through every object in this list.
        """
        b, t = input.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"

        """YOUR CODE HERE"""


    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :]
            # optionally crop the logits to only the top k options

            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # either sample from the distribution or take the most likely element
            
            idx_next = torch.multinomial(probs, num_samples=1)
            
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx