From 89306c767617a6eeb948fa1b442b9cd23107800d Mon Sep 17 00:00:00 2001 From: brandonspark Date: Wed, 24 Apr 2024 14:47:43 -0700 Subject: [PATCH] feat: augment python grammar with semgrep stuff --- .../src/semgrep-python/grammar.js | 57 +++++- .../semgrep-python/test/corpus/semgrep.txt | 184 ++++++++++++++++-- 2 files changed, 221 insertions(+), 20 deletions(-) diff --git a/lang/semgrep-grammars/src/semgrep-python/grammar.js b/lang/semgrep-grammars/src/semgrep-python/grammar.js index ccba03d8..ade141bd 100644 --- a/lang/semgrep-grammars/src/semgrep-python/grammar.js +++ b/lang/semgrep-grammars/src/semgrep-python/grammar.js @@ -10,6 +10,8 @@ module.exports = grammar(base_grammar, { name: 'python', conflicts: ($, previous) => previous.concat([ + [$.expression, $.pair], + [$.ellipsis, $.pair] ]), /* @@ -25,16 +27,57 @@ module.exports = grammar(base_grammar, { ...previous.members ), */ + + semgrep_deep_expression: $ => seq('<...', $.expression, '...>'), + semgrep_typed_metavar: $ => seq('(', $.identifier, ':', $.type, ')'), + semgrep_ellipsis_metavar: $ => /\$\.\.\.[a-zA-Z_][a-zA-Z_0-9]*/, + + expression: ($, previous) => choice( + ...previous.members, + $.semgrep_deep_expression, + $.semgrep_typed_metavar, + $.semgrep_ellipsis_metavar, + ), + + _statement: ($, previous) => choice( + ...previous.members, + prec(1, $.semgrep_ellipsis_metavar), + ), + + attribute: ($, previous) => choice( + previous, + // This precedence is hard-coded here because we cannot reference the PREC that + // is used within the official tree-sitter-python grammar. + // At the time of this update, PREC.call is 22. 
+ prec(22,seq( + field('object', $.primary_expression), + '.', + field('attribute', choice('...', $.semgrep_ellipsis_metavar)) + )) + ), + + parameter: ($, previous) => choice( + previous, + '...', + $.semgrep_ellipsis_metavar + ), + + pair: ($, previous) => choice( + previous, + '...', + $.semgrep_ellipsis_metavar + ), + // Metavariables - // Rather than creating a separate metavariable term - // and adding it to identifiers, this instead overrides the - // regex that is defined in the original tree-sitter grammar. - // this is needed since currently in the original tree-sitter grammar, - // identifier is a terminal, and thus can't do - // the usual choice/previous shadowing definition. + // Rather than creating a separate metavariable term + // and adding it to identifiers, this instead overrides the + // regex that is defined in the original tree-sitter grammar. + // This is needed because, in the original tree-sitter grammar, + // identifier is a terminal, and thus cannot use + // the usual choice/previous shadowing definition. 
identifier: $ => /\$?[_\p{XID_Start}][_\p{XID_Continue}]*/, - + } }); diff --git a/lang/semgrep-grammars/src/semgrep-python/test/corpus/semgrep.txt b/lang/semgrep-grammars/src/semgrep-python/test/corpus/semgrep.txt index 557a60c1..95a87db9 100644 --- a/lang/semgrep-grammars/src/semgrep-python/test/corpus/semgrep.txt +++ b/lang/semgrep-grammars/src/semgrep-python/test/corpus/semgrep.txt @@ -6,17 +6,175 @@ Metavariable in match case str(): print("oh no") --- - (module - (match_statement + +(module + (match_statement + (identifier) + (case_clause + (case_pattern + (call + (identifier) + (argument_list))) + (block + (expression_statement + (call + (identifier) + (argument_list + (string)))))))) + +==================================== +Typed metavariable +==================================== + +($X: T) + +--- + +(module + (expression_statement + (semgrep_typed_metavar + (identifier) + (type + (identifier))))) + +==================================== +Standalone ellipsis +==================================== + +... + +--- + +(module + (expression_statement + (ellipsis))) + +==================================== +Standalone ellipsis metavariable +==================================== + +$...X + +--- + +(module + (expression_statement + (semgrep_ellipsis_metavar))) + +==================================== +Standalone deep expression +==================================== + +<... foo() ...> + +--- + +(module + (expression_statement + (semgrep_deep_expression + (call + (identifier) + (argument_list))))) + +==================================== +Method chaining +==================================== + +A. ... .C + +--- + +(module + (expression_statement + (attribute + (attribute + (identifier)) + (identifier)))) + +==================================== +Method chaining with ellipsis metavar +==================================== + +A. 
$...STUFF .C + +--- + +(module + (expression_statement + (attribute + (attribute (identifier) - (case_clause - (case_pattern - (call - (identifier) - (argument_list))) - (block - (expression_statement - (call - (identifier) - (argument_list - (string)))))))) + (semgrep_ellipsis_metavar)) + (identifier)))) + +==================================== +Ellipsis as function parameter +==================================== + +def $FUNC(..., x, ...): + ... + +--- + +(module + (function_definition + (identifier) + (parameters + (identifier)) + (block + (expression_statement + (ellipsis))))) + +==================================== +Ellipsis between statements +==================================== + +x = 1 +... +y = 2 + +--- + +(module + (expression_statement + (assignment + (identifier) + (integer))) + (expression_statement + (ellipsis)) + (expression_statement + (assignment + (identifier) + (integer)))) + +==================================== +Ellipsis in set +==================================== + +{ ..., $X, ... } + +--- + +(module + (expression_statement + (set + (ellipsis) + (identifier) + (ellipsis)))) + +==================================== +Ellipsis in dictionary +==================================== + +{ ..., "x": 2, ... } + +--- + +(module + (expression_statement + (dictionary + (pair) + (pair + (string) + (integer)) + (pair))))