Commit 353c7ce

Add a lexer and a couple tests

1 parent 6540c28

File tree: 2 files changed, +96 -0 lines

  tests/test_lexer.py
  typelanguage/lexer.py

tests/test_lexer.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
import unittest
import logging

from ply.lex import LexToken

from typelanguage.lexer import TypeLexer
from typelanguage.types import *

class TestLexer(unittest.TestCase):

    def token(self, value, ty=None):
        t = LexToken()
        t.type = ty if ty is not None else value
        t.value = value
        t.lineno = -1
        t.lexpos = -1
        return t

    def assert_lex_equiv(self, s, stream2):
        # NOTE: lexer fails to reset after call?
        l = TypeLexer(debug=True)
        stream1 = list(l.tokenize(s))  # Save the stream for debug output when a test fails
        stream2 = list(stream2)
        assert len(stream1) == len(stream2)
        for token1, token2 in zip(stream1, stream2):
            print token1, token2
            assert token1.type == token2.type
            assert token1.value == token2.value

    @classmethod
    def setup_class(cls):
        logging.basicConfig()

    def test_simple_inputs(self):
        self.assert_lex_equiv('int', [self.token('int', 'ID')])
        self.assert_lex_equiv('[int]', [self.token('['), self.token('int', 'ID'), self.token(']')])
        self.assert_lex_equiv('int -> int', [self.token('int', 'ID'), self.token('->', 'ARROW'), self.token('int', 'ID')])
        self.assert_lex_equiv('*a', [self.token('*'), self.token('a', 'ID')])
        self.assert_lex_equiv('**a', [self.token('**', 'KWARGS'), self.token('a', 'ID')])
        self.assert_lex_equiv('*x, **a', [self.token('*'), self.token('x', 'ID'), self.token(','), self.token('**', 'KWARGS'), self.token('a', 'ID')])
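
For reference, a quick interactive check of the stream the last assertion expects. This is a sketch that assumes the package is importable from the repo root; note that PLY reports a literal character with the character itself as both type and value, which is why token() defaults the type to the value:

    from typelanguage.lexer import TypeLexer

    lexer = TypeLexer()
    for tok in lexer.tokenize('*x, **a'):
        print '%-8s%r' % (tok.type, tok.value)

    # Prints, matching test_simple_inputs:
    #   *       '*'
    #   ID      'x'
    #   ,       ','
    #   KWARGS  '**'
    #   ID      'a'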

typelanguage/lexer.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
import sys
import logging

import ply.lex

logger = logging.getLogger(__name__)

class TypeLexer(object):
    '''
    A lexical analyzer for the Python typelanguage.
    '''

    def __init__(self, debug=False):
        self.debug = debug

    def tokenize(self, string):
        '''
        Maps a string to an iterator over tokens. In other words: [char] -> [token]
        '''

        new_lexer = ply.lex.lex(module=self, debug=self.debug, errorlog=logger)
        new_lexer.latest_newline = 0
        new_lexer.input(string)

        while True:
            t = new_lexer.token()
            if t is None: break
            yield t

    # ============== PLY Lexer specification ==================
    # This probably should be private, but things like "literals"
    # might be a legitimate part of the public interface. Anyhow,
    # it is pythonic to give some rope to hang oneself with :-)

    literals = ['|', '(', ')', '{', '}', '[', ']', ':', '*', ',', ';']

    tokens = ['ID', 'TYVAR', 'ARROW', 'KWARGS']

    t_ARROW = r'->'
    t_KWARGS = r'\*\*'
    t_ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
    t_ignore = ' \t'

    def t_newline(self, t):
        r'\n'
        t.lexer.lineno += 1
        t.lexer.latest_newline = t.lexpos

    def t_error(self, t):
        # latest_newline is tracked on the lexer object (set in tokenize and t_newline)
        raise Exception('Error on line %s, col %s: Unexpected character: %s ' % (t.lexer.lineno, t.lexpos - t.lexer.latest_newline, t.value[0]))

if __name__ == '__main__':
    logging.basicConfig()
    lexer = TypeLexer(debug=True)
    for token in lexer.tokenize(sys.stdin.read()):
        print '%-20s%s' % (token.value, token.type)
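
Since columns are computed against latest_newline, an unexpected character on a later line is reported with a line-relative column. A small sketch of that error path (the input string here is made up for illustration; '$' matches no token rule and is not in literals):

    from typelanguage.lexer import TypeLexer

    lexer = TypeLexer()
    try:
        list(lexer.tokenize('int ->\n  $oops'))
    except Exception as exc:
        print exc  # Error on line 2, col 3: Unexpected character: $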
