track positions and lengths on tokens
dylanscott committed Oct 18, 2024
commit 5d6f2a15d4ce5cbc260a7194408b08ae75bc0bcd
4 changes: 2 additions & 2 deletions sqlparse/engine/statement_splitter.py
@@ -84,7 +84,7 @@ def process(self, stream):
EOS_TTYPE = T.Whitespace, T.Comment.Single

# Run over all stream tokens
for ttype, value in stream:
for ttype, value, pos in stream:
# Yield token if we finished a statement and there's no whitespaces
# It will count newline token as a non whitespace. In this context
# whitespace ignores newlines.
@@ -99,7 +99,7 @@ def process(self, stream):
self.level += self._change_splitlevel(ttype, value)

# Append the token to the current statement
self.tokens.append(sql.Token(ttype, value))
self.tokens.append(sql.Token(ttype, value, pos))

# Check if we get the end of a statement
# Issue762: Allow GO (or "GO 2") as statement splitter.
14 changes: 7 additions & 7 deletions sqlparse/filters/tokens.py
@@ -16,10 +16,10 @@ def __init__(self, case=None):
self.convert = getattr(str, case)

def process(self, stream):
for ttype, value in stream:
for ttype, value, pos in stream:
if ttype in self.ttype:
value = self.convert(value)
yield ttype, value
yield ttype, value, pos


class KeywordCaseFilter(_CaseFilter):
@@ -30,10 +30,10 @@ class IdentifierCaseFilter(_CaseFilter):
ttype = T.Name, T.String.Symbol

def process(self, stream):
for ttype, value in stream:
for ttype, value, pos in stream:
if ttype in self.ttype and value.strip()[0] != '"':
value = self.convert(value)
yield ttype, value
yield ttype, value, pos


class TruncateStringFilter:
@@ -42,9 +42,9 @@ def __init__(self, width, char):
self.char = char

def process(self, stream):
for ttype, value in stream:
for ttype, value, pos in stream:
if ttype != T.Literal.String.Single:
yield ttype, value
yield ttype, value, pos
continue

if value[:2] == "''":
@@ -56,4 +56,4 @@ def process(self, stream):

if len(inner) > self.width:
value = ''.join((quote, inner[:self.width], self.char, quote))
yield ttype, value
yield ttype, value, pos
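
These filters only rewrite `value` and forward the new `pos` element untouched. A minimal sketch of a custom stream filter following the same convention, assuming the 3-tuple stream introduced in this commit (`CommentUpperFilter` is a hypothetical name, not part of sqlparse):

from sqlparse import tokens as T

class CommentUpperFilter:
    # Hypothetical example filter: rewrite the value where needed, but
    # always yield `pos` unchanged so downstream consumers keep the
    # offsets into the original text.
    def process(self, stream):
        for ttype, value, pos in stream:
            if ttype in T.Comment:
                value = value.upper()
            yield ttype, value, pos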
14 changes: 7 additions & 7 deletions sqlparse/lexer.py
@@ -106,14 +106,14 @@ def is_keyword(self, value):

def get_tokens(self, text, encoding=None):
"""
Return an iterable of (tokentype, value) pairs generated from
Return an iterable of (tokentype, value, pos) tuples generated from
`text`. If `unfiltered` is set to `True`, the filtering mechanism
is bypassed even if filters are defined.

Also preprocess the text, i.e. expand tabs and strip it if
wanted and applies registered filters.

Split ``text`` into (tokentype, text) pairs.
Split ``text`` into (tokentype, text, pos) tuples.

``stack`` is the initial stack (default: ``['root']``)
"""
@@ -142,20 +142,20 @@ def get_tokens(self, text, encoding=None):
if not m:
continue
elif isinstance(action, tokens._TokenType):
yield action, m.group()
yield action, m.group(), pos
elif action is keywords.PROCESS_AS_KEYWORD:
yield self.is_keyword(m.group())
yield (*self.is_keyword(m.group()), pos)

consume(iterable, m.end() - pos - 1)
break
else:
yield tokens.Error, char
yield tokens.Error, char, pos


def tokenize(sql, encoding=None):
"""Tokenize sql.

Tokenize *sql* using the :class:`Lexer` and return a 2-tuple stream
of ``(token type, value)`` items.
Tokenize *sql* using the :class:`Lexer` and return a 3-tuple stream
of ``(token type, value, pos)`` items.
"""
return Lexer.get_default_instance().get_tokens(sql, encoding)
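
With this change the token stream carries source offsets, so callers can map each value back to its place in the input. A minimal sketch, assuming the 3-tuple stream added here and an input that needs no tab expansion or stripping:

from sqlparse import lexer

sql = 'select * from foo;'
for ttype, value, pos in lexer.tokenize(sql):
    # Each token's text should start at `pos` in the original string.
    assert sql[pos:pos + len(value)] == value
    print(pos, ttype, repr(value))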
25 changes: 22 additions & 3 deletions sqlparse/sql.py
@@ -45,10 +45,10 @@ class Token:
the type of the token.
"""

__slots__ = ('value', 'ttype', 'parent', 'normalized', 'is_keyword',
'is_group', 'is_whitespace', 'is_newline')
__slots__ = ('value', 'ttype', 'parent', 'pos', 'length', 'normalized',
'is_keyword', 'is_group', 'is_whitespace', 'is_newline')

def __init__(self, ttype, value):
def __init__(self, ttype, value, pos=None):
value = str(value)
self.value = value
self.ttype = ttype
@@ -59,6 +59,11 @@ def __init__(self, ttype, value):
self.is_newline = self.ttype in T.Newline
self.normalized = value.upper() if self.is_keyword else value

# TokenList overrides these with @property getters
if not hasattr(self, 'pos'):
self.pos = pos
self.length = len(value)

def __str__(self):
return self.value

@@ -163,6 +168,20 @@ def __init__(self, tokens=None):
super().__init__(None, str(self))
self.is_group = True

@property
def pos(self):
if len(self.tokens) > 0:
return self.tokens[0].pos

@property
def length(self):
if len(self.tokens) > 0:
first, last = self.tokens[0], self.tokens[-1]
if first.pos is not None and last.pos is not None:
return last.length + (last.pos - first.pos)

return len(str(self))

def __str__(self):
return ''.join(token.value for token in self.flatten())

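
Because `TokenList` derives `pos` from its first child and `length` from the span between its first and last children, a parsed group can be mapped back onto the original statement as well. A minimal sketch, assuming the attributes added in this commit and an input left untouched by preprocessing:

import sqlparse

sql = 'select a, b from foo'
stmt = sqlparse.parse(sql)[0]
for tok in stmt.tokens:
    if tok.pos is not None:
        # The (pos, length) span of each top-level token should
        # reproduce that token's own text.
        assert sql[tok.pos:tok.pos + tok.length] == str(tok)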
8 changes: 4 additions & 4 deletions tests/test_tokenize.py
@@ -14,16 +14,16 @@ def test_tokenize_simple():
assert isinstance(stream, types.GeneratorType)
tokens = list(stream)
assert len(tokens) == 8
assert len(tokens[0]) == 2
assert tokens[0] == (T.Keyword.DML, 'select')
assert tokens[-1] == (T.Punctuation, ';')
assert len(tokens[0]) == 3
assert tokens[0] == (T.Keyword.DML, 'select', 0)
assert tokens[-1] == (T.Punctuation, ';', 17)


def test_tokenize_backticks():
s = '`foo`.`bar`'
tokens = list(lexer.tokenize(s))
assert len(tokens) == 3
assert tokens[0] == (T.Name, '`foo`')
assert tokens[0] == (T.Name, '`foo`', 0)


@pytest.mark.parametrize('s', ['foo\nbar\n', 'foo\rbar\r',