track positions and lengths on tokens
dylanscott committed Oct 18, 2024
commit 5d6f2a15d4ce5cbc260a7194408b08ae75bc0bcd
4 changes: 2 additions & 2 deletions sqlparse/engine/statement_splitter.py
@@ -84,7 +84,7 @@ def process(self, stream):
EOS_TTYPE = T.Whitespace, T.Comment.Single

# Run over all stream tokens
for ttype, value in stream:
for ttype, value, pos in stream:
# Yield token if we finished a statement and there's no whitespaces
# It will count newline token as a non whitespace. In this context
# whitespace ignores newlines.
@@ -99,7 +99,7 @@ def process(self, stream):
self.level += self._change_splitlevel(ttype, value)

# Append the token to the current statement
self.tokens.append(sql.Token(ttype, value))
self.tokens.append(sql.Token(ttype, value, pos))

# Check if we get the end of a statement
# Issue762: Allow GO (or "GO 2") as statement splitter.
14 changes: 7 additions & 7 deletions sqlparse/filters/tokens.py
@@ -16,10 +16,10 @@ def __init__(self, case=None):
self.convert = getattr(str, case)

def process(self, stream):
for ttype, value in stream:
for ttype, value, pos in stream:
if ttype in self.ttype:
value = self.convert(value)
yield ttype, value
yield ttype, value, pos


class KeywordCaseFilter(_CaseFilter):
@@ -30,10 +30,10 @@ class IdentifierCaseFilter(_CaseFilter):
ttype = T.Name, T.String.Symbol

def process(self, stream):
for ttype, value in stream:
for ttype, value, pos in stream:
if ttype in self.ttype and value.strip()[0] != '"':
value = self.convert(value)
yield ttype, value
yield ttype, value, pos


class TruncateStringFilter:
@@ -42,9 +42,9 @@ def __init__(self, width, char):
self.char = char

def process(self, stream):
for ttype, value in stream:
for ttype, value, pos in stream:
if ttype != T.Literal.String.Single:
yield ttype, value
yield ttype, value, pos
continue

if value[:2] == "''":
@@ -56,4 +56,4 @@ def process(self, stream):

if len(inner) > self.width:
value = ''.join((quote, inner[:self.width], self.char, quote))
yield ttype, value
yield ttype, value, pos
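
These filters only rewrite `value` and forward the new `pos` element untouched. A minimal sketch of a custom stream filter following the same convention, assuming the 3-tuple stream introduced in this commit (`CommentUpperFilter` is a hypothetical name, not part of sqlparse):

from sqlparse import tokens as T

class CommentUpperFilter:
    # Hypothetical example filter: rewrite the value where needed, but
    # always yield `pos` unchanged so downstream consumers keep the
    # offsets into the original text.
    def process(self, stream):
        for ttype, value, pos in stream:
            if ttype in T.Comment:
                value = value.upper()
            yield ttype, value, pos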
14 changes: 7 additions & 7 deletions sqlparse/lexer.py
@@ -106,14 +106,14 @@ def is_keyword(self, value):

def get_tokens(self, text, encoding=None):
"""
Return an iterable of (tokentype, value) pairs generated from
Return an iterable of (tokentype, value, pos) tuples generated from
`text`. If `unfiltered` is set to `True`, the filtering mechanism
is bypassed even if filters are defined.

Also preprocess the text, i.e. expand tabs and strip it if
wanted and applies registered filters.

Split ``text`` into (tokentype, text) pairs.
Split ``text`` into (tokentype, text, pos) tuples.

``stack`` is the initial stack (default: ``['root']``)
"""
@@ -142,20 +142,20 @@ def get_tokens(self, text, encoding=None):
if not m:
continue
elif isinstance(action, tokens._TokenType):
yield action, m.group()
yield action, m.group(), pos
elif action is keywords.PROCESS_AS_KEYWORD:
yield self.is_keyword(m.group())
yield (*self.is_keyword(m.group()), pos)

consume(iterable, m.end() - pos - 1)
break
else:
yield tokens.Error, char
yield tokens.Error, char, pos


def tokenize(sql, encoding=None):
"""Tokenize sql.

Tokenize *sql* using the :class:`Lexer` and return a 2-tuple stream
of ``(token type, value)`` items.
Tokenize *sql* using the :class:`Lexer` and return a 3-tuple stream
of ``(token type, value, pos)`` items.
"""
return Lexer.get_default_instance().get_tokens(sql, encoding)
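
With this change the token stream carries source offsets, so callers can map each value back to its place in the input. A minimal sketch, assuming the 3-tuple stream added here and an input that needs no tab expansion or stripping:

from sqlparse import lexer

sql = 'select * from foo;'
for ttype, value, pos in lexer.tokenize(sql):
    # Each token's text should start at `pos` in the original string.
    assert sql[pos:pos + len(value)] == value
    print(pos, ttype, repr(value))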
25 changes: 22 additions & 3 deletions sqlparse/sql.py
@@ -45,10 +45,10 @@ class Token:
the type of the token.
"""

__slots__ = ('value', 'ttype', 'parent', 'normalized', 'is_keyword',
'is_group', 'is_whitespace', 'is_newline')
__slots__ = ('value', 'ttype', 'parent', 'pos', 'length', 'normalized',
'is_keyword', 'is_group', 'is_whitespace', 'is_newline')

def __init__(self, ttype, value):
def __init__(self, ttype, value, pos=None):
value = str(value)
self.value = value
self.ttype = ttype
@@ -59,6 +59,11 @@ def __init__(self, ttype, value):
self.is_newline = self.ttype in T.Newline
self.normalized = value.upper() if self.is_keyword else value

# TokenList overrides these with @property getters
if not hasattr(self, 'pos'):
self.pos = pos
self.length = len(value)

def __str__(self):
return self.value

@@ -163,6 +168,20 @@ def __init__(self, tokens=None):
super().__init__(None, str(self))
self.is_group = True

@property
def pos(self):
if len(self.tokens) > 0:
return self.tokens[0].pos

@property
def length(self):
if len(self.tokens) > 0:
first, last = self.tokens[0], self.tokens[-1]
if first.pos is not None and last.pos is not None:
return last.length + (last.pos - first.pos)

return len(str(self))

def __str__(self):
return ''.join(token.value for token in self.flatten())

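
Because `TokenList` derives `pos` from its first child and `length` from the span between its first and last children, a parsed group can be mapped back onto the original statement as well. A minimal sketch, assuming the attributes added in this commit and an input left untouched by preprocessing:

import sqlparse

sql = 'select a, b from foo'
stmt = sqlparse.parse(sql)[0]
for tok in stmt.tokens:
    if tok.pos is not None:
        # The (pos, length) span of each top-level token should
        # reproduce that token's own text.
        assert sql[tok.pos:tok.pos + tok.length] == str(tok)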
8 changes: 4 additions & 4 deletions tests/test_tokenize.py
@@ -14,16 +14,16 @@ def test_tokenize_simple():
assert isinstance(stream, types.GeneratorType)
tokens = list(stream)
assert len(tokens) == 8
assert len(tokens[0]) == 2
assert tokens[0] == (T.Keyword.DML, 'select')
assert tokens[-1] == (T.Punctuation, ';')
assert len(tokens[0]) == 3
assert tokens[0] == (T.Keyword.DML, 'select', 0)
assert tokens[-1] == (T.Punctuation, ';', 17)


def test_tokenize_backticks():
s = '`foo`.`bar`'
tokens = list(lexer.tokenize(s))
assert len(tokens) == 3
assert tokens[0] == (T.Name, '`foo`')
assert tokens[0] == (T.Name, '`foo`', 0)


@pytest.mark.parametrize('s', ['foo\nbar\n', 'foo\rbar\r',