diff --git a/exam/ex06/finite_state_machine.py b/exam/ex06/finite_state_machine.py
new file mode 100644
index 0000000..27fca05
--- /dev/null
+++ b/exam/ex06/finite_state_machine.py
@@ -0,0 +1,40 @@
+
+
+class FiniteStateMachine(object):
+    def __init__(self, start: int, valid: list, default: int, transitions: dict):
+        self._start = start
+        self._valid = valid
+
+        # Flatten {state: {(word, ...): target}} into {state: {word: target}}.
+        self._transitions = dict()
+        for state, trans in transitions.items():
+            self._transitions[state] = dict()
+            for words, target in trans.items():
+                for word in words:
+                    self._transitions[state][word] = target
+
+        self._default = default
+        self._state = start
+
+    def reset(self):
+        self._state = self._start
+
+    def make_transition(self, word):
+        # A state without outgoing transitions (such as the default state)
+        # absorbs every word.
+        if self._state not in self._transitions:
+            self._state = self._default
+            return
+        if word not in self._transitions[self._state]:
+            self._state = self._default
+            return
+
+        self._state = self._transitions[self._state][word]
+
+    def check(self, sequence):
+        for word in sequence:
+            self.make_transition(word)
+        is_valid = self._state in self._valid
+        self.reset()
+        return is_valid
+
diff --git a/exam/ex06/main.py b/exam/ex06/main.py
new file mode 100644
index 0000000..220eaec
--- /dev/null
+++ b/exam/ex06/main.py
@@ -0,0 +1,15 @@
+from io import StringIO
+
+from parser import Parser
+from tokenio import TokenStream
+
+texts = ["one plus one"
+        , "one plus two"
+        , "thirtytwo plus eleven"
+        , "four times four"
+        , "(eight plus eleven) times two"
+        , "twohundred through eleven"]
+
+for text in texts:
+    print(text, "=", Parser(TokenStream(StringIO(text))).parse())
+
diff --git a/exam/ex06/parser.py b/exam/ex06/parser.py
new file mode 100644
index 0000000..5a60bea
--- /dev/null
+++ b/exam/ex06/parser.py
@@ -0,0 +1,92 @@
+from collections import deque
+
+from tokens import NumberTokenParser, OperatorTokenParser
+
+class ParsingException(Exception):
+    pass
+
+class Parser(object):
+    def __init__(self, token_stream):
+        self._token_stream = token_stream
+        self._stack = deque()
+        self._current_list = deque()
+
+    def parse(self):
+        # state == 0: expecting a number or an opening parenthesis,
+        # state == 1: expecting an operator or a closing parenthesis.
+        state = 0
+        while True:
+            token = self._token_stream.get_token()
+            if token == "(":
+                if state == 1:
+                    raise ParsingException(
+                        "expected operator, not parenthesis at offset {} (near '{}')".format(
+                            self._token_stream._offset, token))
+
+                # Save the enclosing expression and start a fresh one for
+                # the parenthesized subexpression.
+                self._stack.append(self._current_list)
+                self._current_list = deque()
+                continue
+
+            if NumberTokenParser.can_parse(token):
+                if state == 1:
+                    raise ParsingException(
+                        "expected operator, not number at offset {} (near '{}')".format(
+                            self._token_stream._offset, token))
+                self._current_list.append(NumberTokenParser(token).parse())
+                state = 1
+                continue
+
+            if OperatorTokenParser.can_parse(token):
+                if state != 1:
+                    raise ParsingException(
+                        "expected number or parenthesis, not operator at offset {} (near '{}')".format(
+                            self._token_stream._offset, token))
+
+                self._current_list.append(OperatorTokenParser(token).parse())
+                state = 0
+                continue
+
+            if token == ")":
+                if not self._stack:
+                    raise ParsingException(
+                        "unbalanced closing parenthesis at offset {}".format(
+                            self._token_stream._offset))
+                state = 1
+
+                # Evaluate the subexpression and feed its value back into
+                # the enclosing expression.
+                result = self.execute_branch(self._current_list)
+                self._current_list = self._stack.pop()
+                self._current_list.append(result)
+                continue
+
+            if not token:
+                if self._stack:
+                    raise ParsingException("unexpected EOF while parsing")
+                return self.execute_branch(self._current_list)
+
+            raise ParsingException("unknown token at offset {} (near '{}')".format(
+                self._token_stream._offset, token))
+
+    def execute_branch(self, branch):
+        # Evaluate strictly left to right; there is no operator precedence.
+        result = None
+        current_operator = None
+
+        for element in branch:
+            if result is None:
+                result = element
+                continue
+
+            if not isinstance(element, (float, int, complex)):
+                # Not a number, so it has to be an operator.
+                current_operator = element
+                continue
+
+            if current_operator:
+                result = current_operator(result, element)
+                current_operator = None
+        return result
+
diff --git a/exam/ex06/tokenio.py b/exam/ex06/tokenio.py
new file mode 100644
index 0000000..1f00df9
--- /dev/null
+++ b/exam/ex06/tokenio.py
@@ -0,0 +1,56 @@
+from collections import deque
+
+base_chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+end_of_token_chars = "() \t"
+whitespace_chars = " \t"
+
+class UnexpectedCharacterException(Exception):
+    def __init__(self, msg, offset, char, *args):
+        # Pass msg on so the exception prints something useful.
+        Exception.__init__(self, msg, *args)
+        self._msg = msg
+        self._offset = offset
+        self._char = char
+
+
+class TokenStream(object):
+    def __init__(self, file_):
+        self._file = file_
+        self._file.seek(0, 0)
+        self._offset = 0
+
+    def _getc(self):
+        c = self._file.read(1)
+        if c:
+            self._offset += 1
+        return c
+
+    def _ungetc(self):
+        self._file.seek(self._offset - 1, 0)
+        self._offset -= 1
+
+    def get_token(self):
+        result = deque()
+        while True:
+            c = self._getc()
+            if not c:
+                # EOF.
+                break
+            if c in base_chars:
+                result.append(c)
+                continue
+            if c in end_of_token_chars:
+                if not result:
+                    # We are not inside a token.
+                    if c in whitespace_chars:
+                        # Some whitespace. Ignore it.
+                        continue
+                    # A parenthesis.
+                    return c
+
+                # End of token; push the delimiter back for the next call.
+                self._ungetc()
+                break
+            raise UnexpectedCharacterException("Unexpected character while tokenizing", self._offset, c)
+        return "".join(result)
+
diff --git a/exam/ex06/tokens.py b/exam/ex06/tokens.py
new file mode 100644
index 0000000..2e32648
--- /dev/null
+++ b/exam/ex06/tokens.py
@@ -0,0 +1,153 @@
+from collections import deque
+from abc import ABCMeta, abstractmethod
+
+from finite_state_machine import FiniteStateMachine
+
+BASE_NUMBER_TOKENS = {"one": 1
+        , "two": 2
+        , "three": 3
+        , "four": 4
+        , "five": 5
+        , "six": 6
+        , "seven": 7
+        , "eight": 8
+        , "nine": 9}
+DECI_NUMBER_TOKENS = {"twenty": 20
+        , "thirty": 30
+        , "forty": 40
+        , "fifty": 50
+        , "sixty": 60
+        , "seventy": 70
+        , "eighty": 80
+        , "ninety": 90}
+TEEN_NUMBER_TOKENS = {"ten": 10
+        , "eleven": 11
+        , "twelve": 12
+        , "thirteen": 13
+        , "fourteen": 14
+        , "fifteen": 15
+        , "sixteen": 16
+        , "seventeen": 17
+        , "eighteen": 18
+        , "nineteen": 19}
+HUNDRED_NUMBER_TOKENS = {"hundred": 100}
+ZERO_NUMBER_TOKENS = {"zero": 0
+        , "null": 0}
+
+OPERATOR_TOKENS = {"plus": lambda x, y: x + y
+        , "minus": lambda x, y: x - y
+        , "times": lambda x, y: x * y
+        , "through": lambda x, y: x / y}
+
+# States: 0 = start, 2 = tens read, 3 = units read,
+# 4 = hundreds read, 1 = number complete; -1 = error sink.
+transitions = {
+    0: {tuple(ZERO_NUMBER_TOKENS) + tuple(TEEN_NUMBER_TOKENS): 1
+        , tuple(DECI_NUMBER_TOKENS): 2
+        , tuple(BASE_NUMBER_TOKENS): 3}
+    , 2: {tuple(BASE_NUMBER_TOKENS): 1}
+    , 3: {tuple(HUNDRED_NUMBER_TOKENS): 4}
+    , 4: {tuple(DECI_NUMBER_TOKENS): 2}
+}
+valid_states = [1, 2, 3, 4]
+default_transition = -1
+
+
+class TokenParsingException(Exception):
+    pass
+
+class SubtokenizingException(TokenParsingException):
+    pass
+
+
+class TokenParser(metaclass=ABCMeta):
+    @classmethod
+    def can_parse(cls, token):
+        try:
+            cls(token).parse()
+            return True
+        except TokenParsingException:
+            return False
+
+    @abstractmethod
+    def parse(self):
+        pass
+
+class NumberTokenParser(TokenParser):
+    def __init__(self, token):
+        self._token = token.lower()
+        self._token_length = len(token)
+
+        self._finite_state_machine = FiniteStateMachine(
+            0, valid_states, default_transition, transitions)
+
+    def get_token_of_class_or_none(self, offset, token_class):
+        for token in token_class:
+            if len(token) + offset > self._token_length:
+                continue
+
+            if self._token[offset: offset + len(token)] == token:
+                return token
+        return None
+
+    def get_next_token_part(self, offset):
+        token_classes = [ZERO_NUMBER_TOKENS
+                , HUNDRED_NUMBER_TOKENS
+                , TEEN_NUMBER_TOKENS
+                , DECI_NUMBER_TOKENS
+                , BASE_NUMBER_TOKENS]
+
+        result = None
+        for token_class in token_classes:
+            result = self.get_token_of_class_or_none(offset, token_class)
+            if result:
+                break
+
+        return result
+
+    def subtokenize(self):
+        token_parts = deque()
+
+        offset = 0
+        while True:
+            subtoken = self.get_next_token_part(offset)
+            if subtoken is None:
+                if offset != self._token_length:
+                    raise SubtokenizingException("part of the token is dangling: '{}'".format(self._token[offset:]))
+                return list(token_parts)
+
+            offset += len(subtoken)
+            token_parts.append(subtoken)
+
+    def parse(self):
+        token_parts = self.subtokenize()
+
+        if not self._finite_state_machine.check(token_parts):
+            raise TokenParsingException("token '{}' is invalid".format(self._token))
+
+        # This is ugly but it works: units, teens and tens add to the
+        # result, while "hundred" multiplies everything read so far.
+        result = 0
+        for subtoken in token_parts:
+            if subtoken in BASE_NUMBER_TOKENS:
+                result += BASE_NUMBER_TOKENS[subtoken]
+            if subtoken in TEEN_NUMBER_TOKENS:
+                result += TEEN_NUMBER_TOKENS[subtoken]
+            if subtoken in DECI_NUMBER_TOKENS:
+                result += DECI_NUMBER_TOKENS[subtoken]
+
+            if subtoken in HUNDRED_NUMBER_TOKENS:
+                result *= HUNDRED_NUMBER_TOKENS[subtoken]
+
+        return result
+
+
+class OperatorTokenParser(TokenParser):
+    def __init__(self, token):
+        self._token = token.lower()
+
+    def parse(self):
+        if self._token not in OPERATOR_TOKENS:
+            raise TokenParsingException("token '{}' is not an operator".format(self._token))
+        return OPERATOR_TOKENS[self._token]
+
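Quick sanity check (a sketch, not part of the diff; assumes it is run from within exam/ex06/ so the modules above are importable):

    # Exercise the number grammar on its own, then the full parser.
    from io import StringIO

    from parser import Parser
    from tokenio import TokenStream
    from tokens import NumberTokenParser

    # "onehundredtwentythree" -> 1, *100, +20, +3 = 123.
    print(NumberTokenParser("onehundredtwentythree").parse())

    # Evaluation is strictly left to right, so this is
    # (two plus three) times four = 20, not 2 + 12.
    print(Parser(TokenStream(StringIO("two plus three times four"))).parse())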