from collections import deque

# Characters treated as insignificant separators between tokens.
WHITESPACE = {" ", "\t", "\n", "\r"}
# Any character that terminates the word currently being accumulated.
TOKENS_END_OF_WORD = WHITESPACE | {";", ":", ",", ".", "[", "]"}
# Punctuation after which the next explicit "\n" is emitted as its own token.
TOKENS_EXPECT_NEWLINE = {":", "]"}


class Tokenizer:
    """Iterator that splits a character stream into word and punctuation tokens.

    ``context`` must provide ``getc()`` — return the next character, or a
    falsy value (e.g. ``""``) at end of input — and ``ungetc(c)`` — push a
    character back so the next ``getc()`` returns it again.
    """

    def __init__(self, context):
        self._context = context
        # When True, the next "\n" read is returned as a standalone token.
        self._expect_newline = False
        # When True, the next whitespace character is returned as a token
        # instead of being skipped (set after a word ends on whitespace).
        self._expect_whitespace = False

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next token; raise StopIteration at end of input."""
        current_token = deque()
        while True:
            c = self._context.getc()

            # Sometimes we need the explicit newline (after ":" or "]"):
            # return it as its own token instead of collapsing it.
            if self._expect_newline and c == "\n":
                self._expect_newline = False
                return c

            # Collapse runs of whitespace, except the single whitespace
            # token emitted right after a word ends on whitespace.
            if c in WHITESPACE and not current_token:
                if self._expect_whitespace:
                    self._expect_whitespace = False
                    return c
                continue

            if c in TOKENS_END_OF_WORD:
                if current_token:
                    # The terminator belongs to the next token: push it back.
                    self._context.ungetc(c)
                    if c in WHITESPACE:
                        self._expect_whitespace = True
                    return "".join(current_token)
                if c in TOKENS_EXPECT_NEWLINE:
                    self._expect_newline = True
                return c

            if not c:
                # End of input. BUG FIX: previously a word still being
                # accumulated was silently discarded here; emit it first,
                # and let the next call raise StopIteration.
                if current_token:
                    return "".join(current_token)
                raise StopIteration()

            current_token.append(c)