253 lines
9.7 KiB
Python
253 lines
9.7 KiB
Python
import sys
|
|
import re
|
|
import struct
|
|
|
|
# Y86-64 Constants
|
|
# Register mapping from name to number
|
|
# Y86-64 register file: each register name maps to its 4-bit identifier.
# The identifiers are simply the position of the name in the tuple below
# (%rax = 0x0 ... %r14 = 0xE).
REG = {
    name: number
    for number, name in enumerate((
        '%rax', '%rcx', '%rdx', '%rbx',
        '%rsp', '%rbp', '%rsi', '%rdi',
        '%r8', '%r9', '%r10', '%r11',
        '%r12', '%r13', '%r14',
    ))
}
# 0xF is the "no register" marker placed in an unused register field.
REG['F'] = 0xF
|
|
|
|
# Instruction mapping from mnemonic to (icode, ifun)
|
|
# icode is the instruction code, ifun is the function code
|
|
# Instruction table: mnemonic -> (icode, ifun).
# Each row below lists the mnemonics that share one icode; the row index is
# the icode and the position inside the row is the ifun.
INS = {
    mnemonic: (icode, ifun)
    for icode, family in enumerate((
        ('halt',),                                            # 0x0
        ('nop',),                                             # 0x1
        ('rrmovq', 'cmovle', 'cmovl', 'cmove',
         'cmovne', 'cmovge', 'cmovg'),                        # 0x2
        ('irmovq',),                                          # 0x3
        ('rmmovq',),                                          # 0x4
        ('mrmovq',),                                          # 0x5
        ('addq', 'subq', 'andq', 'xorq'),                     # 0x6
        ('jmp', 'jle', 'jl', 'je', 'jne', 'jge', 'jg'),       # 0x7
        ('call',),                                            # 0x8
        ('ret',),                                             # 0x9
        ('pushq',),                                           # 0xA
        ('popq',),                                            # 0xB
    ))
    for ifun, mnemonic in enumerate(family)
}
|
|
|
|
class Y86Assembler:
    """
    A two-pass assembler for the Y86-64 instruction set.

    Pass one walks the source to assign an address to every label; pass two
    encodes each instruction and records a listing entry for it.  The class
    relies on the module-level ``REG`` (register name -> number) and ``INS``
    (mnemonic -> (icode, ifun)) tables.

    Besides the standard instructions it accepts a custom ``addq $imm, rB``
    form, encoded with icode 0xC as a 10-byte instruction.

    Attributes:
        symbol_table: Maps label name to the address it was defined at.
        pc: Current location counter while a pass is running.
        byte_code: Maps address to (size, hex_string, original_line).  When
            several source lines share an address, the last one recorded
            wins here; the complete listing is kept separately.
    """

    def __init__(self):
        self.symbol_table = {}
        self.pc = 0
        self.byte_code = {}  # Maps address to (size, hex_string, original_line)
        # Every listing line as (address, size, hex_string, original_line), in
        # source order.  Kept next to byte_code because a label-only line and
        # the instruction that follows it share one address, which a dict
        # keyed by address cannot hold at the same time.
        self._listing = []

    def assemble(self, filepath):
        """
        Assembles a Y86-64 source file.

        Args:
            filepath (str): The path to the input .txt or .ys file.

        Returns:
            str: The formatted machine code output, or an
            "Error: File not found ..." message when the file is missing.
        """
        try:
            with open(filepath, 'r') as f:
                lines = f.readlines()
        except FileNotFoundError:
            return f"Error: File not found at '{filepath}'"

        # Start from a clean slate so an assembler instance can be reused
        # without labels or code from a previous file leaking into this one.
        self.symbol_table = {}
        self.byte_code = {}
        self._listing = []

        print("🚀 Starting assembly process...")
        self._first_pass(lines)
        print("✅ First pass complete. Symbol table built.")
        self._second_pass(lines)
        print("✅ Second pass complete. Machine code generated.")
        return self._format_output()

    def _parse_line(self, line):
        """
        Strips comments and splits a line into label, instruction, and operands.

        Everything after the first '#' or '|' is discarded.

        Returns:
            tuple: (label, instruction, operands) where label and instruction
            are None when absent and operands is a list of stripped strings.
        """
        line = line.split('#')[0].split('|')[0].strip()
        if not line:
            return None, None, []

        label, instruction, operands_str = None, None, ''
        if ':' in line:
            label, rest = line.split(':', 1)
            # "loop :" must define the same symbol as "loop:".
            label = label.strip()
            line = rest.strip()

        parts = line.split(maxsplit=1)
        if parts:
            instruction = parts[0]
            if len(parts) > 1:
                operands_str = parts[1]

        # Split operands by comma, but not inside parentheses
        operands = re.split(r',\s*(?![^()]*\))', operands_str) if operands_str else []

        return label, instruction, [op.strip() for op in operands]

    def _get_instruction_size(self, instruction, operands):
        """
        Calculates the size of an instruction in bytes.

        Raises:
            KeyError: If the mnemonic is not a directive and not in INS.
        """
        if not instruction:
            return 0
        if instruction == '.quad':
            return 8
        if instruction in ('.pos', '.align'):
            return 0

        # Custom `addq $val, rB`: 1 (icode) + 1 (reg) + 8 (val)
        if instruction == 'addq' and operands and operands[0].startswith('$'):
            return 10

        icode = INS[instruction][0]
        if icode in (0x0, 0x1, 0x9):
            return 1
        if icode in (0x2, 0x6, 0xA, 0xB):
            return 2
        if icode in (0x7, 0x8):
            return 9
        if icode in (0x3, 0x4, 0x5):
            return 10
        return 0

    @staticmethod
    def _parse_directive_int(text):
        """Parses a `.pos`/`.align` operand; accepts decimal and 0x-prefixed hex."""
        text = text.strip()
        try:
            return int(text, 0)
        except ValueError:
            # Base 0 rejects zero-padded decimals such as "08"; accept them too.
            return int(text, 10)

    @staticmethod
    def _align_up(address, boundary):
        """Rounds address up to a multiple of boundary (a power of two)."""
        return (address + boundary - 1) & -boundary

    @staticmethod
    def _pack_quad(value):
        """
        Encodes value as 8 little-endian bytes.

        Negative values are stored in 64-bit two's complement, so anything in
        the signed or unsigned 64-bit range is accepted.

        Raises:
            struct.error: If value does not fit in 64 bits.
        """
        if -(1 << 63) <= value < 0:
            value += 1 << 64
        return struct.pack('<Q', value)

    def _first_pass(self, lines):
        """
        Builds the symbol table by mapping labels to addresses.

        A label that is defined more than once keeps its first address and a
        warning is printed for each later definition.
        """
        self.pc = 0
        for line_num, line in enumerate(lines, 1):
            label, instruction, operands = self._parse_line(line)

            if label:
                if label in self.symbol_table:
                    print(f"Warning: Duplicate label '{label}' on line {line_num}. Using first definition.")
                else:
                    self.symbol_table[label] = self.pc

            if not instruction:
                continue

            if instruction == '.pos':
                self.pc = self._parse_directive_int(operands[0])
            elif instruction == '.align':
                self.pc = self._align_up(self.pc, self._parse_directive_int(operands[0]))
            else:
                self.pc += self._get_instruction_size(instruction, operands)

    def _record(self, address, size, hex_code, line):
        """Stores one listing line in both byte_code and the ordered listing."""
        self.byte_code[address] = (size, hex_code, line)
        self._listing.append((address, size, hex_code, line))

    def _encode(self, instruction, operands):
        """
        Encodes one instruction (or `.quad`) into its machine-code bytes.

        Returns:
            bytearray: The encoded bytes.

        Raises:
            KeyError: Unknown mnemonic or register name.
            IndexError: Missing operand.
            ValueError: Malformed immediate, label, or memory operand.
        """
        if instruction == '.quad':
            return bytearray(self._pack_quad(self._parse_value(operands[0])))

        code = bytearray()

        # Custom `addq $imm, rB`: unused icode 0xC, rA field set to F (none).
        if instruction == 'addq' and operands[0].startswith('$'):
            code.append((0xC << 4) | 0x0)
            code.append((REG['F'] << 4) | REG[operands[1]])
            code.extend(self._pack_quad(self._parse_value(operands[0])))
            return code

        icode, ifun = INS[instruction]
        code.append((icode << 4) | ifun)

        if icode in (0x2, 0x6):
            # rrmovq / cmovXX / OPq: rA, rB
            code.append((REG[operands[0]] << 4) | REG[operands[1]])
        elif icode in (0xA, 0xB):
            # pushq / popq: rA only, rB field is F
            code.append((REG[operands[0]] << 4) | 0xF)
        elif icode in (0x3, 0x4, 0x5):
            if icode == 0x3:
                # irmovq $V, rB
                rA, rB = REG['F'], REG[operands[1]]
                value = self._parse_value(operands[0])
            else:
                # rmmovq rA, D(rB)  /  mrmovq D(rB), rA
                if icode == 0x4:
                    reg_operand, mem_operand = operands[0], operands[1]
                else:
                    reg_operand, mem_operand = operands[1], operands[0]
                rA = REG[reg_operand]
                value, base_name = self._parse_mem(mem_operand)
                rB = REG[base_name]
            code.append((rA << 4) | rB)
            code.extend(self._pack_quad(value))
        elif icode in (0x7, 0x8):
            # jXX / call: 8-byte destination
            code.extend(self._pack_quad(self._parse_value(operands[0])))
        # halt / nop / ret consist of the opcode byte only.

        return code

    def _second_pass(self, lines):
        """
        Generates machine code, using the symbol table from the first pass.

        Every emitted instruction, label-only line, and `.align` that moves
        the location counter is recorded for the listing.  `.pos` lines are
        not listed.
        """
        self.pc = 0
        self._listing = []
        for line in lines:
            original_line = line.strip()
            label, instruction, operands = self._parse_line(line)

            if instruction == '.pos':
                self.pc = self._parse_directive_int(operands[0])
                continue

            if instruction == '.align':
                new_pc = self._align_up(self.pc, self._parse_directive_int(operands[0]))
                if new_pc != self.pc:
                    self._record(self.pc, 0, '', original_line)
                self.pc = new_pc
                continue

            if not instruction:
                if label:
                    self._record(self.pc, 0, '', original_line)
                continue

            start_pc = self.pc
            size = self._get_instruction_size(instruction, operands)
            code = self._encode(instruction, operands)
            if code:
                self._record(start_pc, size, code.hex(), original_line)
            self.pc += size

    def _parse_value(self, s):
        """
        Converts a string operand to an integer, resolving labels.

        A leading '$' is ignored.  Labels take precedence over numeric
        parsing.

        Raises:
            ValueError: If s is neither a known label nor an integer literal.
        """
        s = s.strip()
        if s.startswith('$'):
            s = s[1:]

        if s in self.symbol_table:
            return self.symbol_table[s]
        try:
            return int(s, 0)  # Handles decimal and '0x' hex
        except ValueError:
            raise ValueError(f"Invalid immediate value or unresolved label: {s}")

    def _parse_mem(self, s):
        """
        Parses memory operands like 'D(%rB)' or '(%rB)'.

        The displacement D may be decimal or 0x-prefixed hexadecimal, with an
        optional leading minus sign.

        Returns:
            tuple: (displacement, register_name); displacement is 0 if omitted.

        Raises:
            ValueError: If s does not look like a memory operand.
        """
        match = re.match(
            r'\s*(-?(?:0[xX][0-9a-fA-F]+|\d+))?\s*\(\s*(%r\w+)\s*\)', s
        )
        if not match:
            raise ValueError(f"Invalid memory operand: {s}")

        disp_str, reg = match.groups()
        if not disp_str:
            disp = 0
        elif 'x' in disp_str.lower():
            disp = int(disp_str, 16)
        else:
            disp = int(disp_str)
        return disp, reg

    def _format_output(self):
        """
        Formats the final output string.

        Lines are ordered by address; lines that share an address keep their
        source order, so a label is printed before the instruction it names.
        """
        listing = getattr(self, '_listing', None)
        if listing:
            # sorted() is stable, which preserves source order per address.
            entries = sorted(listing, key=lambda entry: entry[0])
        else:
            # No ordered listing available: fall back to the address map.
            entries = [(addr, *self.byte_code[addr]) for addr in sorted(self.byte_code)]

        output = []
        for addr, size, hex_code, line in entries:
            addr_hex = f"0x{addr:03x}:"
            if size == 0:
                # Lines that generate no code (labels, .align)
                output.append(f"{addr_hex: <10}| {line}")
            else:
                output.append(f"{addr_hex: <10}{hex_code:<20}| {line}")

        return '\n'.join(output)
|
|
|
|
|
|
# Command-line entry point: assemble the file named by the single argument
# and print the resulting listing.
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: python Y86_64_assembler.py <source_file.txt>")
        sys.exit(1)

    assembler = Y86Assembler()
    result = assembler.assemble(sys.argv[1])

    # assemble() reports a missing input file by returning an "Error: ..."
    # string instead of a listing.  Send that to stderr and exit non-zero so
    # shells and build scripts can detect the failure.
    if result.startswith("Error:"):
        print(result, file=sys.stderr)
        sys.exit(1)

    print("\n--- Assembled Code ---")
    print(result)
|