From 0fd65c721c711fb2e723c61808644ee136e5d4df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Kn=C3=BCttel?=
Date: Tue, 9 Oct 2018 11:43:42 +0200
Subject: [PATCH] initial assembler package

---
Note: the blob ids on the "index" lines below predate this revision of the
patch and no longer match the file contents.

 assembler/__init__.py   |   0
 assembler/assembler.py  | 335 ++++++++++++++++++++++++++++++++++++++++
 assembler/context.py    |  48 ++++++
 assembler/directives.py |  57 +++++++
 assembler/opcodes.py    |  52 +++++++
 assembler/tokenize.py   |  71 +++++++++
 assembler/util.py       |  81 ++++++++++
 7 files changed, 644 insertions(+)
 create mode 100644 assembler/__init__.py
 create mode 100644 assembler/assembler.py
 create mode 100644 assembler/context.py
 create mode 100644 assembler/directives.py
 create mode 100644 assembler/opcodes.py
 create mode 100644 assembler/tokenize.py
 create mode 100644 assembler/util.py

diff --git a/assembler/__init__.py b/assembler/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/assembler/assembler.py b/assembler/assembler.py
new file mode 100644
index 0000000..53f556c
--- /dev/null
+++ b/assembler/assembler.py
@@ -0,0 +1,335 @@
+"""
+Assembler front end: translates assembly source text into a list of words.
+"""
+
+from collections import deque
+
+from .context import FileContext
+from .tokenize import Tokenizer, WHITESPACE
+from .opcodes import make_opcodes
+from .util import can_be_mark, can_convert_to_int, autoint
+from .directives import SetDirective
+
+
+class ParsingError(Exception):
+    """Raised when the assembly source cannot be parsed."""
+
+
+class Assembler(object):
+    """
+    This class is used for generating bytecode from a file containing assembly.
+
+    Also required is the memory definition of the interpreter given as a dict::
+
+        {
+            "registers": <int>,
+            "memory": <int>,
+            "program_memory": <int>
+        }
+
+    Each value is the largest operand accepted for that kind of argument
+    (the range checks in ``check_and_convert_argument`` are inclusive).
+
+    And the definition of the commands. This is a list of dicts::
+
+        [
+            {
+                "mnemonic": <str>,
+                "args": [
+                    ("register"|"memory"|"program_memory"|"direct_input"), ...
+                ]
+            }
+        ]
+
+    ``custom_directives`` maps directive names to objects providing
+    ``parse(assembler, tokenizer)``; it is merged over the built-in ``set``
+    directive.
+
+    The method ``parse`` will parse the input file and ``bindump`` will dump
+    the binary bytecode into a file.
+    """
+
+    def __init__(self, file_, memory_definition, command_definition,
+                 custom_directives):
+        self._file_context = FileContext(file_)
+        self._code_objects = deque()
+        self._memory_definition = memory_definition
+        self._command_definition = command_definition
+        # Number of words emitted so far, i.e. the address of the next word.
+        self._word_count = 0
+        # Mark name -> {"target": address, or -1 while still undefined,
+        #               "target_line": line of the definition,
+        #               "references": lines on which the mark is used}
+        self._marks = {}
+
+        self._opcodes = make_opcodes(
+            [cmd["mnemonic"] for cmd in command_definition])
+        self._commands_by_mnemonic = {
+            cmd["mnemonic"]: cmd for cmd in command_definition}
+        self._tokenizer = Tokenizer(self._file_context)
+
+        self._directives = {"set": SetDirective()}
+        self._directives.update(custom_directives)
+
+    def parse(self):
+        """
+        Parse the whole input and replace every mark name by its address.
+
+        Raises ``ParsingError`` on malformed input, when the input ends in
+        the middle of a statement, and when a referenced mark is never
+        defined.
+        """
+        try:
+            for token in self._tokenizer:
+
+                # Comments run until the next newline token.
+                if token == ";":
+                    while token != "\n":
+                        token = next(self._tokenizer)
+                    continue
+
+                # Commands
+                if token in self._commands_by_mnemonic:
+                    self.parse_command(token)
+                    continue
+
+                # Directives
+                if token == ".":
+                    self.parse_directive()
+                    continue
+
+                # The default is trying to parse a mark
+                if not can_be_mark(token):
+                    self.raise_unexpected_token(
+                        token, "comment, command, directive or mark", token)
+                self.parse_mark(token)
+        except StopIteration:
+            # A nested next() ran dry, so a statement was cut off by EOF.
+            raise ParsingError("Unexpected EOF while parsing.") from None
+
+        for mark, mark_data in self._marks.items():
+            if mark_data["target"] < 0:
+                raise ParsingError(
+                    "Mark {} undefined. Referenced in lines: {}".format(
+                        mark, mark_data["references"]))
+
+        # Forward references were stored as mark names; patch in addresses.
+        self._code_objects = [
+            self._marks[c]["target"] if c in self._marks else c
+            for c in self._code_objects]
+
+    def bindump(self, file_):
+        """
+        Write the parsed words to the binary file ``file_``.
+
+        Returns whatever ``file_.write`` returns.
+        """
+        # FIXME:
+        # Make this work for tons of data.
+        # Or is that necessary?
+        # NOTE(review): bytes() only accepts values in range(256), while the
+        # opcodes built by make_opcodes are wider than that -- the intended
+        # word size and byte order need to be confirmed before fixing this.
+        return file_.write(bytes(self._code_objects))
+
+    def parse_mark(self, token):
+        """
+        Parse the rest of a mark definition and record the mark's address.
+
+        ``token`` is the mark name; the mark points at the next word to be
+        emitted. Raises ``ParsingError`` if the colon or the newline is
+        missing or if the mark was already defined.
+        """
+        should_be_colon = next(self._tokenizer)
+        if should_be_colon != ":":
+            self.raise_unexpected_token(token, "':'", should_be_colon)
+
+        # The newline is consumed exactly once: asking the tokenizer for a
+        # second one would swallow the first token of the next statement.
+        should_be_newline = next(self._tokenizer)
+        if should_be_newline != "\n":
+            self.raise_unexpected_token(token + ":", "'\\n'", should_be_newline)
+
+        line = self._file_context._line
+        if token in self._marks:
+            if self._marks[token]["target"] != -1:
+                raise ParsingError(
+                    "Error in line {} column {} mark already defined: '{}'."
+                    " Previous was in line {}.".format(
+                        line,
+                        self._file_context._column,
+                        token,
+                        self._marks[token]["target_line"]))
+            # The mark was referenced before; fill in its definition.
+            self._marks[token]["target"] = self._word_count
+            self._marks[token]["target_line"] = line
+        else:
+            self._marks[token] = {
+                "target": self._word_count,
+                "target_line": line,
+                "references": [],
+            }
+
+    def parse_directive(self):
+        """
+        Parse a directive; the leading ``.`` has already been consumed.
+
+        The words returned by the directive's ``parse`` method are emitted.
+        Raises ``ParsingError`` for an unknown directive name and for a
+        missing newline after the directive.
+        """
+        should_be_name = next(self._tokenizer)
+        if should_be_name not in self._directives:
+            self.raise_unexpected_token(".", "directive name", should_be_name)
+
+        words = self._directives[should_be_name].parse(self, self._tokenizer)
+
+        self._word_count += len(words)
+        self._code_objects.extend(words)
+
+        should_be_newline = next(self._tokenizer)
+        if should_be_newline != "\n":
+            self.raise_unexpected_token(".", "newline", should_be_newline)
+
+    def parse_command(self, cmd):
+        """
+        Emit the opcode of ``cmd`` followed by one word per argument.
+
+        The first argument is separated from the mnemonic by whitespace,
+        further arguments by commas, and the command ends at the newline.
+        Raises ``ParsingError`` on any deviation.
+        """
+        argument_definitions = self._commands_by_mnemonic[cmd]["args"]
+
+        self._code_objects.append(self._opcodes[cmd])
+        self._word_count += 1
+
+        if not argument_definitions:
+            token = next(self._tokenizer)
+            if token != "\n":
+                self.raise_unexpected_token(cmd, "newline", token)
+            return
+
+        should_be_whitespace = next(self._tokenizer)
+        if (should_be_whitespace not in WHITESPACE
+                or should_be_whitespace == "\n"):
+            self.raise_unexpected_token(cmd, "' '", should_be_whitespace)
+
+        for position, argument_definition in enumerate(argument_definitions):
+            if position:
+                should_be_comma = next(self._tokenizer)
+                if should_be_comma != ",":
+                    self.raise_unexpected_token(cmd, "','", should_be_comma)
+
+            argument = self.check_and_convert_argument(
+                cmd, next(self._tokenizer), argument_definition)
+            self._word_count += 1
+            self._code_objects.append(argument)
+
+        should_be_newline = next(self._tokenizer)
+        if should_be_newline != "\n":
+            self.raise_unexpected_token(cmd, "newline", should_be_newline)
+
+    def raise_unexpected_token(self, after, expected, got):
+        """Raise a ``ParsingError`` describing an unexpected token."""
+        raise ParsingError(
+            "Error in line {} column {} after '{}': expected {}, got '{}'".format(
+                self._file_context._line,
+                self._file_context._column,
+                after,
+                expected,
+                got))
+
+    def raise_invalid_address(self, after, memtype, maxval, got):
+        """Raise a ``ParsingError`` describing an out-of-range operand."""
+        raise ParsingError(
+            "Error in line {} column {} after '{}': value {} is invalid for {}"
+            " (max is {})".format(
+                self._file_context._line,
+                self._file_context._column,
+                after,
+                got,
+                memtype,
+                maxval))
+
+    def _reference_mark(self, name):
+        """
+        Record a use of the mark ``name`` on the current line.
+
+        Returns the mark's address if it is already defined, otherwise the
+        name itself, which ``parse`` replaces once all marks are known.
+        """
+        line = self._file_context._line
+        if name not in self._marks:
+            self._marks[name] = {
+                "target": -1,
+                "target_line": 0,
+                "references": [line],
+            }
+            return name
+
+        self._marks[name]["references"].append(line)
+        if self._marks[name]["target"] != -1:
+            return self._marks[name]["target"]
+        return name
+
+    def check_and_convert_argument(self, cmd, argument, argument_definition):
+        """
+        Validate ``argument`` against ``argument_definition`` and convert it.
+
+        Returns an integer, or the mark name for a mark that is not defined
+        yet. Definitions other than ``"register"``, ``"memory"`` and
+        ``"program_memory"`` accept any integer, character or mark. Raises
+        ``ParsingError`` for malformed or out-of-range arguments.
+        """
+        if argument_definition == "register":
+            if not argument.startswith("r"):
+                self.raise_unexpected_token(cmd, "register name", argument)
+            try:
+                register_offset = int(argument[1:])
+            except ValueError:
+                register_offset = None
+            if register_offset is None:
+                self.raise_unexpected_token(cmd, "register name", argument)
+
+            if (register_offset > self._memory_definition["registers"]
+                    or register_offset < 0):
+                self.raise_invalid_address(
+                    cmd, "register", self._memory_definition["registers"],
+                    register_offset)
+            return register_offset
+
+        if argument_definition == "memory":
+            if not can_convert_to_int(argument):
+                self.raise_unexpected_token(cmd, "integer address", argument)
+            argument = autoint(argument)
+
+            if argument < 0 or argument > self._memory_definition["memory"]:
+                self.raise_invalid_address(
+                    cmd, "memory", self._memory_definition["memory"], argument)
+            return argument
+
+        if argument_definition == "program_memory":
+            if not can_convert_to_int(argument):
+                if not can_be_mark(argument):
+                    self.raise_unexpected_token(
+                        cmd, "integer address or mark", argument)
+                return self._reference_mark(argument)
+
+            argument = autoint(argument)
+
+            if (argument < 0
+                    or argument > self._memory_definition["program_memory"]):
+                self.raise_invalid_address(
+                    cmd, "program_memory",
+                    self._memory_definition["program_memory"], argument)
+            return argument
+
+        if can_convert_to_int(argument):
+            return autoint(argument)
+
+        if not can_be_mark(argument):
+            self.raise_unexpected_token(cmd, "integer, char or mark", argument)
+        # Marks are recorded the same way for every argument kind so that
+        # ``parse`` can resolve them uniformly.
+        return self._reference_mark(argument)
diff --git a/assembler/context.py b/assembler/context.py
new file mode 100644
index 0000000..3863a00
--- /dev/null
+++ b/assembler/context.py
@@ -0,0 +1,48 @@
+"""
+Character-level access to the source file with position tracking.
+"""
+
+from collections import deque
+
+
+class FileContext(object):
+    """
+    Wrap a file object and keep track of the current line and column.
+
+    ``file_`` must support ``read``, ``seek`` and ``tell``. Lines and
+    columns are counted from zero.
+    """
+
+    def __init__(self, file_):
+        self._file = file_
+        self._line = 0
+        self._column = 0
+        # Column reached on each finished line, so ungetc can step back
+        # across a newline.
+        self._column_stack = deque()
+
+    def getc(self):
+        """Read one character, update the position and return it."""
+        c = self._file.read(1)
+        if c == "\n":
+            self._line += 1
+            self._column_stack.append(self._column)
+            self._column = 0
+        else:
+            self._column += 1
+
+        return c
+
+    def ungetc(self, c):
+        """
+        Push back the character ``c`` that was just returned by ``getc``.
+
+        The file position moves back by one and the line/column counters
+        are restored.
+        """
+        self._file.seek(self._file.tell() - 1, 0)
+        if c == "\n":
+            self._line -= 1
+            self._column = self._column_stack.pop()
+        else:
+            self._column -= 1
diff --git a/assembler/directives.py b/assembler/directives.py
new file mode 100644
index 0000000..24b6c99
--- /dev/null
+++ b/assembler/directives.py
@@ -0,0 +1,57 @@
+"""
+Directives for explicitly modifying the program memory.
+"""
+
+from abc import ABC, abstractmethod
+from collections import deque
+
+from .util import autoint, can_convert_to_int
+
+
+class AbstractDirective(ABC):
+    """Interface every assembler directive has to implement."""
+
+    @abstractmethod
+    def parse(self, assembler, tokenizer):
+        """
+        Parse the directive by converting the text to a list of words.
+        Returns a list of 16bit words.
+        """
+        pass
+
+
+class SetDirective(AbstractDirective):
+    """
+    ``.set[value, value, ...]``: place literal words into the program.
+
+    Every value is an integer or character literal as understood by
+    ``autoint``.
+    """
+
+    def parse(self, assembler, tokenizer):
+        """
+        Read the bracketed value list and return the values as a list.
+
+        Syntax errors are reported through
+        ``assembler.raise_unexpected_token``.
+        """
+        words = deque()
+        should_be_bracket = next(tokenizer)
+        if should_be_bracket != "[":
+            assembler.raise_unexpected_token(".set", "'['", should_be_bracket)
+
+        while True:
+            should_be_value = next(tokenizer)
+            if not can_convert_to_int(should_be_value):
+                assembler.raise_unexpected_token(
+                    ".set", "integer or character value", should_be_value)
+            words.append(autoint(should_be_value))
+
+            comma_or_bracket = next(tokenizer)
+            if comma_or_bracket not in (",", "]"):
+                assembler.raise_unexpected_token(
+                    ".set", "',' or ']'", comma_or_bracket)
+
+            if comma_or_bracket == "]":
+                break
+        return list(words)
diff --git a/assembler/opcodes.py b/assembler/opcodes.py
new file mode 100644
index 0000000..15c9ba3
--- /dev/null
+++ b/assembler/opcodes.py
@@ -0,0 +1,52 @@
+"""
+Automatic assignment of opcodes to mnemonics.
+"""
+
+
+class Autoinserter(object):
+    """
+    Endless iterator over opcodes.
+
+    Each opcode is the 10 bit value ``0b1111111111`` scaled by the next
+    fraction of the sequence 1/2, 1/4, 3/4, 1/8, 3/8, ... (rounded down)
+    and shifted left by six bits.
+    """
+
+    def __init__(self):
+        # Denominator and numerator of the current fraction.
+        self.mayor = 2
+        self.minor = 1
+
+    def __next__(self):
+        """Generate the next opcode"""
+        # 10bit opcode
+        opcode = 0b1111111111
+
+        # Calculate this opcode.
+        opcode *= self.minor
+        opcode //= self.mayor
+
+        # The lower 6 bits are reserved.
+        opcode <<= 6
+        # 16 bits in total. Should not matter.
+        opcode &= 0xffff
+
+        # Update the state.
+        self.minor = (self.minor + 2) % self.mayor
+        if self.minor == 1:
+            self.mayor *= 2
+
+        return opcode
+
+    def __iter__(self):
+        return self
+
+
+def make_opcodes(mnemonics_in_order):
+    """
+    Return a dict mapping each mnemonic to an automatically chosen opcode.
+
+    Opcodes are handed out in the order of ``mnemonics_in_order``.
+    """
+    ai = Autoinserter()
+    return dict(zip(mnemonics_in_order, ai))
diff --git a/assembler/tokenize.py b/assembler/tokenize.py
new file mode 100644
index 0000000..71be23f
--- /dev/null
+++ b/assembler/tokenize.py
@@ -0,0 +1,71 @@
+"""
+Tokenizer for the assembly language.
+"""
+
+from collections import deque
+
+WHITESPACE = {" ", "\t", "\n", "\r"}
+TOKENS_END_OF_WORD = WHITESPACE | {";", ":", ",", ".", "[", "]"}
+
+TOKENS_EXPECT_NEWLINE = {":", "]"}
+
+
+class Tokenizer(object):
+    """
+    Iterator over the tokens read from a ``FileContext``.
+
+    Tokens are words, the single characters ``; : , . [ ]`` and, only where
+    they carry meaning, whitespace: the next newline after ``:`` or ``]``
+    and the whitespace character directly following a word.
+    """
+
+    def __init__(self, context):
+        self._context = context
+        # Set after ':' or ']': the next newline is returned as a token.
+        self._expect_newline = False
+        # Set after a word that was ended by whitespace: that whitespace
+        # character is returned as a token.
+        self._expect_whitespace = False
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        """
+        Return the next token.
+
+        Raises ``StopIteration`` at the end of the input; characters of a
+        word that is cut off by the end of the input are dropped.
+        """
+        current_token = deque()
+
+        while True:
+            c = self._context.getc()
+
+            # Sometimes we need the explicit newline.
+            if self._expect_newline and c == "\n":
+                self._expect_newline = False
+                return c
+
+            # Skip multiple whitespaces.
+            if c in WHITESPACE and not current_token:
+                if self._expect_whitespace:
+                    self._expect_whitespace = False
+                    return c
+                continue
+
+            if c in TOKENS_END_OF_WORD:
+                if current_token:
+                    self._context.ungetc(c)
+                    if c in WHITESPACE:
+                        self._expect_whitespace = True
+                    return "".join(current_token)
+                else:
+                    if c in TOKENS_EXPECT_NEWLINE:
+                        self._expect_newline = True
+                    return c
+
+            if not c:
+                raise StopIteration()
+
+            current_token.append(c)
diff --git a/assembler/util.py b/assembler/util.py
new file mode 100644
index 0000000..60d043b
--- /dev/null
+++ b/assembler/util.py
@@ -0,0 +1,81 @@
+"""
+Utility functions used for parsing.
+"""
+
+import string
+
+_MARK_FIRST_CHARS = frozenset(string.ascii_lowercase)
+_MARK_CHARS = frozenset(string.ascii_letters + string.digits + "_")
+_ESCAPE_SEQUENCES = {"\\n": "\n", "\\r": "\r", "\\t": "\t"}
+
+
+def can_be_mark(argument):
+    """
+    The ``argument`` can be interpreted as a Mark.
+
+    A mark starts with a lowercase ASCII letter, followed by ASCII letters,
+    digits and underscores. The empty string is not a mark.
+    """
+    if not argument or argument[0] not in _MARK_FIRST_CHARS:
+        return False
+    return all(char in _MARK_CHARS for char in argument[1:])
+
+
+def can_convert_to_int(value):
+    """
+    ``value`` can be converted to an integer.
+
+    **Note** that this returns ``True`` if the value is a
+    character definition like ``'a'``.
+    """
+    if value.startswith("0x"):
+        try:
+            int(value[2:], 16)
+            return True
+        except ValueError:
+            return False
+
+    if value.startswith("0b"):
+        try:
+            int(value[2:], 2)
+            return True
+        except ValueError:
+            return False
+
+    if value.startswith("'") and value.endswith("'"):
+        if len(value) == 3:
+            return True
+        if len(value) == 4:
+            if value[1:-1] in _ESCAPE_SEQUENCES:
+                return True
+        return False
+
+    try:
+        int(value)
+        return True
+    except ValueError:
+        return False
+
+
+def autoint(value):
+    """
+    Convert ``value`` to an integer automatically.
+
+    Understands ``0x`` and ``0b`` prefixes, character definitions such as
+    ``'a'`` or ``'\\n'`` and plain decimal numbers. Raises ``ValueError``
+    if ``value`` is none of these.
+    """
+    if value.startswith("0x"):
+        return int(value[2:], 16)
+
+    if value.startswith("0b"):
+        return int(value[2:], 2)
+
+    if value.startswith("'") and value.endswith("'"):
+        if len(value) == 3:
+            return ord(value[1:-1])
+        if len(value) == 4:
+            if value[1:-1] in _ESCAPE_SEQUENCES:
+                return ord(_ESCAPE_SEQUENCES[value[1:-1]])
+
+    return int(value)