BCI-assembler/assembler/tokenize.py

56 lines
1.1 KiB
Python

from collections import deque
# Blank characters: they separate words and are normally skipped between
# tokens by the Tokenizer.
WHITESPACE = {"\n", "\r", "\t", " "}

# Characters that end the word currently being accumulated: every blank
# character plus the single-character punctuation tokens.
TOKENS_END_OF_WORD = WHITESPACE.union({"[", "]", ",", ".", ":", ";"})

# Punctuation tokens after which the Tokenizer emits the next "\n" it
# reads as an explicit token.
TOKENS_EXPECT_NEWLINE = {"]", ":"}
class Tokenizer(object):
    """
    Iterator that splits a character stream into tokens.

    Characters are pulled one at a time from ``context``, which must
    provide ``getc()`` (return the next character, or a falsy value at
    end of input) and ``ungetc(c)`` (push one character back).

    Tokens produced:

    * words: maximal runs of characters not in ``TOKENS_END_OF_WORD``;
    * each non-whitespace member of ``TOKENS_END_OF_WORD`` as a
      one-character token;
    * a single whitespace character, when it directly terminates a word;
    * the next ``"\\n"`` read after a token in ``TOKENS_EXPECT_NEWLINE``.

    All other whitespace is skipped.
    """

    def __init__(self, context):
        """
        Create a tokenizer reading from ``context``.

        :param context: object providing ``getc()`` and ``ungetc(c)``.
        """
        self._context = context
        # True after a TOKENS_EXPECT_NEWLINE token, until a "\n" is emitted.
        self._expect_newline = False
        # True right after a word whose terminating whitespace was pushed
        # back; that whitespace character is then emitted as a token.
        self._expect_whitespace = False
        # True once the context has reported end of input.
        self._exhausted = False

    def __iter__(self):
        """Return the tokenizer itself; it is its own iterator."""
        return self

    def __next__(self):
        """
        Return the next token.

        :returns: the next token as a string.
        :raises StopIteration: when the input is exhausted; every later
            call raises again without touching the context.
        """
        if self._exhausted:
            raise StopIteration()

        pending = []
        while True:
            c = self._context.getc()

            if not c:
                # End of input. Flush a word that is still being built
                # instead of discarding it, and remember the end so the
                # following call stops immediately.
                self._exhausted = True
                if pending:
                    return "".join(pending)
                raise StopIteration()

            # Sometimes we need the explicit newline.  Only take this path
            # when no word is pending; otherwise the word would be lost.
            # A pending word is returned by the end-of-word branch below,
            # which pushes the newline back so it is emitted next.
            if self._expect_newline and c == "\n" and not pending:
                self._expect_newline = False
                # The newline may be the pushed-back terminator of the
                # previous word; it is consumed here, so clear that flag.
                self._expect_whitespace = False
                return c

            # Skip multiple whitespaces.
            if c in WHITESPACE and not pending:
                if self._expect_whitespace:
                    self._expect_whitespace = False
                    return c
                continue

            if c in TOKENS_END_OF_WORD:
                if pending:
                    # The delimiter ends the word; push it back so the
                    # next call can emit (or skip) it on its own.
                    self._context.ungetc(c)
                    if c in WHITESPACE:
                        self._expect_whitespace = True
                    return "".join(pending)
                if c in TOKENS_EXPECT_NEWLINE:
                    self._expect_newline = True
                return c

            pending.append(c)