# BCI-assembler/assembler/assembler/tokenize.py

from collections import deque

# Characters the tokenizer treats as blank space.
WHITESPACE = set(" \t\n\r")
# Any of these characters terminates the word currently being read.
TOKENS_END_OF_WORD = WHITESPACE.union(";:,.[]")

# After returning one of these tokens the tokenizer reports the next newline.
TOKENS_EXPECT_NEWLINE = set(":]")


class Tokenizer(object):
	"""
	Iterator that splits the characters of a context into tokens.

	The context is only used through two methods: getc(), which returns the
	next character (a falsy value signals the end of the input), and
	ungetc(c), which pushes one character back.

	Words are runs of characters that are not in TOKENS_END_OF_WORD.
	Non-whitespace characters of TOKENS_END_OF_WORD are returned as
	single-character tokens. Whitespace is skipped, with two exceptions:
	the whitespace character that ends a word is returned as its own token,
	and the first newline seen after a token from TOKENS_EXPECT_NEWLINE is
	returned as well.
	"""

	def __init__(self, context):
		self._context = context
		# Set after returning a token from TOKENS_EXPECT_NEWLINE; cleared
		# once the next newline has been returned.
		self._expect_newline = False
		# Set when a word was ended by a whitespace character that has been
		# pushed back; that character is returned by the following call.
		self._expect_whitespace = False
		# Set when the end of the input was hit while a word was pending:
		# the word is returned first, StopIteration on the following call.
		self._pending_eof = False

	def __iter__(self):
		return self

	def __next__(self):
		"""
		Return the next token.

		Raises StopIteration when getc() reports the end of the input and
		no word is pending.
		"""

		# The previous call returned the last word of the input; report the
		# end of the input now without reading from the context again.
		if(self._pending_eof):
			self._pending_eof = False
			raise StopIteration()

		current_token = deque()

		while(True):
			c = self._context.getc()

			# Sometimes we need the explicit newline.
			if(self._expect_newline and c == "\n"):
				if(current_token):
					# A word is still pending: return it first and keep the
					# newline (and the flag) for the next call, so the word
					# is not dropped.
					self._context.ungetc(c)
					return "".join(current_token)
				self._expect_newline = False
				return c

			# Skip multiple whitespaces.
			if(c in WHITESPACE and not current_token):
				if(self._expect_whitespace):
					self._expect_whitespace = False
					return c
				continue

			if(c in TOKENS_END_OF_WORD):
				if(current_token):
					# The delimiter ends the pending word; push it back so
					# the next call handles it as a token of its own.
					self._context.ungetc(c)
					if(c in WHITESPACE):
						self._expect_whitespace = True
					return "".join(current_token)
				else:
					if(c in TOKENS_EXPECT_NEWLINE):
						self._expect_newline = True
					return c

			if(not c):
				if(current_token):
					# End of input directly after a word: hand out the word
					# and defer StopIteration to the next call.
					self._pending_eof = True
					return "".join(current_token)
				raise StopIteration()

			current_token.append(c)