initial assembler package
This commit is contained in:
parent
481c4ff5ba
commit
0fd65c721c
0
assembler/__init__.py
Normal file
0
assembler/__init__.py
Normal file
280
assembler/assembler.py
Normal file
280
assembler/assembler.py
Normal file
|
@ -0,0 +1,280 @@
|
|||
from collections import deque
|
||||
|
||||
from .context import FileContext
|
||||
from .tokenize import Tokenizer, WHITESPACE
|
||||
from .opcodes import make_opcodes
|
||||
from .util import can_be_mark, can_convert_to_int, autoint
|
||||
from .directives import SetDirective
|
||||
|
||||
class ParsingError(Exception):
    """Raised when the assembly source cannot be parsed."""
|
||||
|
||||
class Assembler(object):
    """
    This class is used for generating bytecode from a file containing assembly.

    Also required is the memory definition of the interpreter given as a dict::

        {
            "registers": <register_count>,
            "memory": <number of memory words>,
            "program_memory": <number of words available for the program>
        }

    And the definition of the commands. This is a list of dicts::

        [
            {
                "mnemonic": <mnemonic>,
                "args": [
                    ("register"|"memory"|"program_memory"|"direct_input"), ...
                ]
            }
        ]

    The method ``parse`` will parse the input file and ``bindump`` will dump the binary
    bytecode into a file.
    """
    def __init__(self, file_, memory_definition, command_definition, custom_directives):
        self._file_context = FileContext(file_)
        self._code_objects = deque()
        self._memory_definition = memory_definition
        self._command_definition = command_definition
        self._word_count = 0
        # mark name -> {"target": word offset (-1 while undefined),
        #               "target_line": line of the definition,
        #               "references": lines that reference the mark}
        self._marks = {}

        self._opcodes = make_opcodes([cmd["mnemonic"] for cmd in command_definition])
        self._commands_by_mnemonic = {cmd["mnemonic"]: cmd for cmd in command_definition}
        self._tokenizer = Tokenizer(self._file_context)

        self._directives = {"set": SetDirective()}
        self._directives.update(custom_directives)

    def parse(self):
        """
        Parse the whole input into ``self._code_objects`` and resolve marks.

        Raises ``ParsingError`` on malformed input, premature EOF, or marks
        that are referenced but never defined.
        """
        try:
            for token in self._tokenizer:

                # Comments run to the end of the line.
                if token == ";":
                    while token != "\n":
                        token = next(self._tokenizer)
                    continue

                # Commands
                if token in self._commands_by_mnemonic:
                    self.parse_command(token)
                    continue

                # Directives
                if token == ".":
                    self.parse_directive()
                    continue

                # The default is trying to parse a mark.
                if not can_be_mark(token):
                    self.raise_unexpected_token(token,
                            "comment, command, directive or mark", token)
                self.parse_mark(token)
        except StopIteration:
            raise ParsingError("Unexpected EOF while parsing.")

        # Every referenced mark must have been defined by now.
        for mark, mark_data in self._marks.items():
            if mark_data["target"] < 0:
                raise ParsingError("Mark {} undefined. Referenced in lines: {}".format(
                        mark, mark_data["references"]))

        # Replace forward mark references by the now-known target addresses.
        self._code_objects = [self._marks[c]["target"] if c in self._marks else c
                for c in self._code_objects]

    def bindump(self, file_):
        """Write the generated bytecode to the binary file ``file_``."""
        # FIXME:
        # Make this work for tons of data.
        # Or is that necessary?
        return file_.write(bytes(self._code_objects))

    def parse_mark(self, token):
        """Parse a mark definition ``<token>:`` followed by a newline."""
        should_be_colon = next(self._tokenizer)
        if should_be_colon != ":":
            self.raise_unexpected_token(token, "':'", should_be_colon)

        should_be_newline = next(self._tokenizer)
        if should_be_newline != "\n":
            self.raise_unexpected_token(token + ":", "'\\n'", should_be_newline)

        if token in self._marks:
            # Mark was forward-referenced (or already defined).
            if self._marks[token]["target"] != -1:
                # BUG FIX: error message used self._line / self._column,
                # which do not exist on Assembler.
                raise ParsingError("Error in line {} column {} mark already defined: '{}'. Previous was in line {}.".format(
                        self._file_context._line
                        , self._file_context._column
                        , token
                        , self._marks[token]["target_line"]))
            self._marks[token]["target"] = self._word_count
            self._marks[token]["target_line"] = self._file_context._line
        else:
            self._marks[token] = {
                    "target": self._word_count
                    , "target_line": self._file_context._line
                    , "references": []
            }
        # BUG FIX: the original read a second token here and required another
        # newline, although the mark's trailing newline was already consumed
        # above — every valid mark definition would have raised an error.

    def parse_directive(self):
        """Parse ``.<name> ...`` by delegating to the registered directive."""
        should_be_name = next(self._tokenizer)

        if should_be_name not in self._directives:
            self.raise_unexpected_token(".", "directive name", should_be_name)

        words = self._directives[should_be_name].parse(self, self._tokenizer)

        self._word_count += len(words)
        self._code_objects.extend(words)

        should_be_newline = next(self._tokenizer)
        if should_be_newline != "\n":
            self.raise_unexpected_token(".", "newline", should_be_newline)

    def parse_command(self, cmd):
        """Parse one command: emit its opcode and its comma-separated args."""
        self._code_objects.append(self._opcodes[cmd])
        self._word_count += 1

        # Commands without arguments must be followed by a newline.
        if not self._commands_by_mnemonic[cmd]["args"]:
            token = next(self._tokenizer)
            if token != "\n":
                self.raise_unexpected_token(cmd, "newline", token)
            return

        # Mnemonic and first argument are separated by non-newline whitespace.
        should_be_whitespace = next(self._tokenizer)
        if (should_be_whitespace not in WHITESPACE
                or should_be_whitespace == "\n"):
            self.raise_unexpected_token(cmd, "' '", should_be_whitespace)

        should_be_an_argument = next(self._tokenizer)
        argument = self.check_and_convert_argument(cmd
                , should_be_an_argument
                , self._commands_by_mnemonic[cmd]["args"][0])
        self._word_count += 1
        self._code_objects.append(argument)

        # Remaining arguments are each prefixed with a comma.
        for argument_definition in self._commands_by_mnemonic[cmd]["args"][1:]:
            should_be_comma = next(self._tokenizer)
            if should_be_comma != ",":
                # BUG FIX: original call was missing the ``expected`` argument
                # (raise_unexpected_token takes three arguments).
                self.raise_unexpected_token(cmd, "','", should_be_comma)

            self._word_count += 1
            self._code_objects.append(
                    self.check_and_convert_argument(
                        cmd
                        , next(self._tokenizer)
                        , argument_definition))

        should_be_newline = next(self._tokenizer)
        if should_be_newline != "\n":
            self.raise_unexpected_token(cmd, "newline", should_be_newline)

    def raise_unexpected_token(self, after, expected, got):
        """Raise a ParsingError for an unexpected token at the current position."""
        raise ParsingError("Error in line {} column {} after '{}': expected {}, got '{}'".format(
                self._file_context._line
                , self._file_context._column
                , after
                , expected
                , got))

    def raise_invalid_address(self, after, memtype, maxval, got):
        """Raise a ParsingError for an address outside the valid range."""
        raise ParsingError("Error in line {} column {} after '{}': value {} is invalid for {} (max is {})".format(
                self._file_context._line
                , self._file_context._column
                , after
                , got
                , memtype
                , maxval))

    def _reference_mark(self, mark):
        """
        Record a reference to ``mark`` and return its target if already known,
        otherwise the mark name itself (resolved later by ``parse``).
        """
        # BUG FIX: the original used self._line (nonexistent) and, in the
        # direct_input case, stored a bare int instead of the mark record
        # dict used everywhere else, breaking the final resolution pass.
        line = self._file_context._line
        if mark in self._marks:
            self._marks[mark]["references"].append(line)
            if self._marks[mark]["target"] != -1:
                return self._marks[mark]["target"]
            return mark
        self._marks[mark] = {
                "target": -1
                , "target_line": 0
                , "references": [line]
        }
        return mark

    def check_and_convert_argument(self, cmd, argument, argument_definition):
        """
        Validate ``argument`` against ``argument_definition`` ("register",
        "memory", "program_memory"; anything else is treated as direct input)
        and convert it to its word value. Mark references may stay unresolved
        (the mark name is returned) until ``parse`` substitutes them.
        """
        if argument_definition == "register":
            if not argument.startswith("r"):
                self.raise_unexpected_token(cmd, "register name", argument)
            try:
                register_offset = int(argument[1:])
            except ValueError:
                # BUG FIX: was a bare ``except`` routed through a flag.
                self.raise_unexpected_token(cmd, "register name", argument)

            if (register_offset > self._memory_definition["registers"]
                    or register_offset < 0):
                self.raise_invalid_address(cmd
                        , "register"
                        , self._memory_definition["registers"]
                        , register_offset)
            return register_offset

        if argument_definition == "memory":
            if not can_convert_to_int(argument):
                self.raise_unexpected_token(cmd, "integer address", argument)
            argument = autoint(argument)

            if argument < 0 or argument > self._memory_definition["memory"]:
                self.raise_invalid_address(cmd
                        , "memory"
                        , self._memory_definition["memory"]
                        , argument)
            return argument

        if argument_definition == "program_memory":
            if not can_convert_to_int(argument):
                if not can_be_mark(argument):
                    self.raise_unexpected_token(cmd, "integer address or mark", argument)
                return self._reference_mark(argument)

            argument = autoint(argument)

            if argument < 0 or argument > self._memory_definition["program_memory"]:
                self.raise_invalid_address(cmd
                        , "program_memory"
                        , self._memory_definition["program_memory"]
                        , argument)
            return argument

        # direct_input: a literal value or a mark reference.
        if can_convert_to_int(argument):
            return autoint(argument)

        if not can_be_mark(argument):
            self.raise_unexpected_token(cmd, "integer, char or mark", argument)
        return self._reference_mark(argument)
|
||||
|
||||
|
||||
|
||||
|
28
assembler/context.py
Normal file
28
assembler/context.py
Normal file
|
@ -0,0 +1,28 @@
|
|||
from collections import deque
|
||||
|
||||
class FileContext(object):
    """Tracks the line/column position while reading characters from a file."""

    def __init__(self, file_):
        self._file = file_
        self._line = 0
        self._column = 0
        # Column widths of the lines already passed, so that ungetc("\n")
        # can restore the previous line's column.
        self._column_stack = deque()

    def getc(self):
        """Read one character and advance the position counters."""
        char = self._file.read(1)
        if char != "\n":
            self._column += 1
        else:
            self._column_stack.append(self._column)
            self._line += 1
            self._column = 0
        return char

    def ungetc(self, char):
        """Push ``char`` back onto the stream and rewind the counters."""
        self._file.seek(self._file.tell() - 1, 0)
        if char != "\n":
            self._column -= 1
        else:
            self._line -= 1
            self._column = self._column_stack.pop()
|
||||
|
42
assembler/directives.py
Normal file
42
assembler/directives.py
Normal file
|
@ -0,0 +1,42 @@
|
|||
"""
|
||||
Directives for explicitly modifying the program memory.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
from collections import deque

from .util import autoint, can_convert_to_int
|
||||
|
||||
class AbstractDirective(ABC):
    """Base class for assembler directives."""

    @abstractmethod
    def parse(self, assembler, tokenizer):
        """
        Parse the directive by converting the text to a list of words.
        Returns a list of 16bit words.
        """
|
||||
|
||||
|
||||
|
||||
class SetDirective(AbstractDirective):
    """
    ``.set [v1, v2, ...]`` -- place literal values directly into program memory.
    """

    def parse(self, assembler, tokenizer):
        """
        Parse ``[ value (, value)* ]`` and return the values as a list of
        16bit words. Reports malformed input through
        ``assembler.raise_unexpected_token`` (which raises a ParsingError).

        BUG FIX: ``can_convert_to_int`` and ``autoint`` were used without
        being imported in this module (NameError at runtime); the import is
        added at the top of the file.
        """
        words = deque()
        should_be_bracket = next(tokenizer)
        if should_be_bracket != "[":
            assembler.raise_unexpected_token(".set", "'['", should_be_bracket)

        while True:
            should_be_value = next(tokenizer)
            if not can_convert_to_int(should_be_value):
                assembler.raise_unexpected_token(".set"
                        , "integer or character value"
                        , should_be_value)
            words.append(autoint(should_be_value))

            # Values are separated by commas; "]" terminates the list.
            comma_or_bracket = next(tokenizer)
            if comma_or_bracket not in (",", "]"):
                assembler.raise_unexpected_token(".set"
                        , "',' or ']'"
                        , comma_or_bracket)

            if comma_or_bracket == "]":
                break
        return list(words)
|
33
assembler/opcodes.py
Normal file
33
assembler/opcodes.py
Normal file
|
@ -0,0 +1,33 @@
|
|||
|
||||
|
||||
class Autoinserter(object):
    """
    Infinite iterator yielding 16bit opcode values, spreading the fractions
    ``minor / mayor`` over the 10bit opcode space.
    """

    def __init__(self):
        self.mayor = 2
        self.minor = 1

    def __iter__(self):
        return self

    def __next__(self):
        """Generate the next opcode"""
        # Scale the fraction minor/mayor into the 10bit opcode range.
        opcode = (0b1111111111 * self.minor) // self.mayor

        # The lower 6 bits are reserved; clamp to 16 bits in total
        # (should not matter).
        opcode = (opcode << 6) & 0xffff

        # Advance to the next fraction; once minor wraps back to 1,
        # refine the denominator.
        self.minor = (self.minor + 2) % self.mayor
        if self.minor == 1:
            self.mayor *= 2

        return opcode


def make_opcodes(mnemonics_in_order):
    """Assign a generated opcode to each mnemonic, in the given order."""
    return dict(zip(mnemonics_in_order, Autoinserter()))
|
55
assembler/tokenize.py
Normal file
55
assembler/tokenize.py
Normal file
|
@ -0,0 +1,55 @@
|
|||
from collections import deque
|
||||
|
||||
# Characters treated as token separators.
WHITESPACE = {" ", "\t", "\n", "\r"}
# Any of these ends the word being accumulated; the non-whitespace ones
# are tokens in their own right.
TOKENS_END_OF_WORD = WHITESPACE | {";", ":", ",", ".", "[", "]"}

# After these tokens the following "\n" is reported explicitly instead of
# being swallowed as inter-token whitespace.
TOKENS_EXPECT_NEWLINE = {":", "]"}


class Tokenizer(object):
    """Split a character stream (via a FileContext-like object) into tokens."""

    def __init__(self, context):
        self._context = context
        # One-shot flags: report the next newline / whitespace character
        # as a token instead of silently skipping it.
        self._expect_newline = False
        self._expect_whitespace = False

    def __iter__(self):
        return self

    def __next__(self):
        """
        Return the next token.
        """
        chars = []

        while True:
            c = self._context.getc()

            # An explicitly requested newline is a token of its own.
            if c == "\n" and self._expect_newline:
                self._expect_newline = False
                return c

            # Leading whitespace is skipped, unless one was requested.
            if not chars and c in WHITESPACE:
                if self._expect_whitespace:
                    self._expect_whitespace = False
                    return c
                continue

            if c in TOKENS_END_OF_WORD:
                if not chars:
                    # The separator itself is the token.
                    if c in TOKENS_EXPECT_NEWLINE:
                        self._expect_newline = True
                    return c
                # Finish the accumulated word; re-read the separator later.
                self._context.ungetc(c)
                if c in WHITESPACE:
                    self._expect_whitespace = True
                return "".join(chars)

            # getc() returns "" at end of input.
            if not c:
                raise StopIteration()

            chars.append(c)
|
||||
|
||||
|
85
assembler/util.py
Normal file
85
assembler/util.py
Normal file
|
@ -0,0 +1,85 @@
|
|||
"""
|
||||
Utility functions used for parsing.
|
||||
"""
|
||||
|
||||
|
||||
def can_be_mark(argument):
    """
    The ``argument`` can be interpreted as a Mark.

    A mark starts with a lowercase ASCII letter; the remaining characters
    may be ASCII letters, digits or underscores.

    BUG FIX: the original raised IndexError on an empty string and rebuilt
    three alphabet lists on every call.
    """
    if not argument:
        return False

    if not ("a" <= argument[0] <= "z"):
        return False

    return all(
            "a" <= char <= "z"
            or "A" <= char <= "Z"
            or "0" <= char <= "9"
            or char == "_"
            for char in argument[1:])
|
||||
|
||||
|
||||
|
||||
def can_convert_to_int(value):
    """
    ``value`` can be converted to an integer.

    **Note** that this returns ``True`` if the value is a
    character definition like ``'a'``.

    BUG FIX: the bare ``except:`` clauses (which would also swallow
    KeyboardInterrupt and the like) are narrowed to ``except ValueError``.
    """
    if value.startswith("0x"):
        try:
            int(value[2:], 16)
            return True
        except ValueError:
            return False

    if value.startswith("0b"):
        try:
            int(value[2:], 2)
            return True
        except ValueError:
            return False

    if value.startswith("'") and value.endswith("'"):
        # A single quoted character, or one of the supported escape
        # sequences (the token still contains the literal backslash).
        if len(value) == 3:
            return True
        if len(value) == 4:
            if value[1:-1] in {"\\n", "\\r", "\\t"}:
                return True
        return False

    try:
        int(value)
        return True
    except ValueError:
        return False
|
||||
|
||||
def autoint(value):
    """
    Convert ``value`` to an integer automatically.

    Accepts hex (``0x..``), binary (``0b..``), character literals
    (``'a'`` or ``'\\n'``) and plain decimal notation.
    """
    escape_sequences = {"\\n": "\n", "\\r": "\r", "\\t": "\t"}

    if value[:2] == "0x":
        return int(value[2:], 16)

    if value[:2] == "0b":
        return int(value[2:], 2)

    if len(value) >= 2 and value[0] == "'" and value[-1] == "'":
        inner = value[1:-1]
        if len(inner) == 1:
            return ord(inner)
        if inner in escape_sequences:
            return ord(escape_sequences[inner])

    # Fall back to plain decimal conversion.
    return int(value)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user