Skip to content

Commit baf3748

Browse files
Support Unicode in lexer_parse_identifier_or_keyword (renamed from parse_name).
JerryScript-DCO-1.0-Signed-off-by: Ruben Ayrapetyan [email protected]
1 parent 6432e4d commit baf3748

File tree

1 file changed

+61
-22
lines changed

1 file changed

+61
-22
lines changed

jerry-core/parser/js/lexer.cpp

Lines changed: 61 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@ static token saved_token, prev_token, sent_token, empty_token;
2828
static bool allow_dump_lines = false, strict_mode;
2929
static size_t buffer_size = 0;
3030

31+
/*
32+
* FIXME:
33+
* jerry_api_char_t should not be used outside of API implementation
34+
*/
35+
3136
/* Represents the contents of a script. */
3237
static const jerry_api_char_t *buffer_start = NULL;
3338
static const jerry_api_char_t *token_start;
@@ -157,6 +162,34 @@ lexer_create_token_for_charset (token_type tt, /**< token type */
157162
return create_token_from_lit (tt, lit);
158163
} /* lexer_create_token_for_charset */
159164

165+
/**
166+
* Check if the character falls into IdentifierStart group (ECMA-262 v5, 7.6)
*
* The character is accepted if it is a unicode letter (as reported by
* lit_char_is_unicode_letter), a dollar sign, an underscore or a backslash.
*
* Note: only the single character is examined. A backslash is accepted as the
*       possible beginning of a UnicodeEscapeSequence; the escape sequence
*       itself is not validated here and has to be checked by the caller.
167+
*
168+
* @return true - if the character can begin an identifier,
*         false - otherwise
169+
*/
170+
static bool
171+
lexer_is_char_can_be_identifier_start (ecma_char_t c) /**< character to check */
172+
{
173+
return (lit_char_is_unicode_letter (c)
174+
|| c == LIT_CHAR_DOLLAR_SIGN
175+
|| c == LIT_CHAR_UNDERSCORE
176+
|| c == LIT_CHAR_BACKSLASH); /* backslash may open a UnicodeEscapeSequence */
177+
} /* lexer_is_char_can_be_identifier_start */
178+
179+
/**
180+
* Check if the character falls into IdentifierPart group (ECMA-262 v5, 7.6)
*
* The character is accepted if it passes lexer_is_char_can_be_identifier_start,
* or is a unicode combining mark, a unicode digit or a unicode connector
* punctuation character (as reported by the corresponding lit_char_is_unicode_*
* helpers).
*
* Note: because the check delegates to lexer_is_char_can_be_identifier_start,
*       a backslash character is also reported as a valid identifier part.
*
* NOTE(review): ECMA-262 v5, 7.6 also lists <ZWNJ> and <ZWJ> as IdentifierPart;
*               they are not checked here - confirm whether this is intended.
181+
*
182+
* @return true - if the character can continue an identifier,
*         false - otherwise
183+
*/
184+
static bool
185+
lexer_is_char_can_be_identifier_part (ecma_char_t c) /**< character to check */
186+
{
187+
return (lexer_is_char_can_be_identifier_start (c) /* letter, '$', '_' or '\\' */
188+
|| lit_char_is_unicode_combining_mark (c)
189+
|| lit_char_is_unicode_digit (c)
190+
|| lit_char_is_unicode_connector_punctuation (c));
191+
} /* lexer_is_char_can_be_identifier_part */
192+
160193
/**
161194
* Try to decode specified character as SingleEscapeCharacter (ECMA-262, v5, 7.8.4)
162195
*
@@ -652,28 +685,29 @@ consume_char (void)
652685
* TOK_BOOL - for BooleanLiteral
653686
*/
654687
static token
655-
parse_name (void)
688+
lexer_parse_identifier_or_keyword (void)
656689
{
657-
ecma_char_t c = (ecma_char_t) LA (0);
690+
ecma_char_t c = LA (0);
658691

659-
JERRY_ASSERT (isalpha (c) || c == '$' || c == '_' || c == '\\');
692+
JERRY_ASSERT (lexer_is_char_can_be_identifier_start (c));
660693

661694
new_token ();
662695

696+
bool is_correct_identifier_name = true;
663697
bool is_escape_sequence_occured = false;
664698
bool is_all_chars_were_lowercase_ascii = true;
665699

666700
while (true)
667701
{
668-
c = (ecma_char_t) LA (0);
702+
c = LA (0);
669703

670-
if (c == '\\')
704+
if (c == LIT_CHAR_BACKSLASH)
671705
{
672706
consume_char ();
673707

674708
is_escape_sequence_occured = true;
675709

676-
bool is_unicode_escape_sequence = (LA (0) == 'u');
710+
bool is_unicode_escape_sequence = (LA (0) == LIT_CHAR_LOWERCASE_U);
677711
consume_char ();
678712

679713
if (is_unicode_escape_sequence)
@@ -684,36 +718,35 @@ parse_name (void)
684718
true,
685719
&c))
686720
{
687-
PARSE_ERROR ("Malformed escape sequence", token_start - buffer_start);
721+
is_correct_identifier_name = false;
722+
break;
688723
}
689724
else
690725
{
691726
/* c now contains character, encoded in the UnicodeEscapeSequence */
692-
if (!isalpha (c)
693-
&& !isdigit (c)
694-
&& c != '$'
695-
&& c != '_')
727+
728+
// Check character, converted from UnicodeEscapeSequence
729+
if (!lexer_is_char_can_be_identifier_part (c))
696730
{
697-
PARSE_ERROR ("Invalid character in identifier", token_start - buffer_start);
731+
is_correct_identifier_name = false;
732+
break;
698733
}
699734
}
700735
}
701736
else
702737
{
703-
PARSE_ERROR ("Only unicode escape sequences are allowed in identifiers",
704-
token_start - buffer_start);
738+
is_correct_identifier_name = false;
739+
break;
705740
}
706741
}
707-
else if (!isalpha (c)
708-
&& !isdigit (c)
709-
&& c != '$'
710-
&& c != '_')
742+
else if (!lexer_is_char_can_be_identifier_part (c))
711743
{
712744
break;
713745
}
714746
else
715747
{
716-
if (!islower (c))
748+
if (!(c >= LIT_CHAR_ASCII_LOWERCASE_LETTERS_BEGIN
749+
&& c <= LIT_CHAR_ASCII_LOWERCASE_LETTERS_END))
717750
{
718751
is_all_chars_were_lowercase_ascii = false;
719752
}
@@ -722,6 +755,11 @@ parse_name (void)
722755
}
723756
}
724757

758+
if (!is_correct_identifier_name)
759+
{
760+
PARSE_ERROR ("Illegal identifier name", lit_utf8_iterator_get_offset (&src_iter));
761+
}
762+
725763
const lit_utf8_size_t charset_size = (lit_utf8_size_t) (lit_utf8_iterator_get_ptr (&src_iter) - token_start);
726764

727765
token ret = empty_token;
@@ -754,7 +792,7 @@ parse_name (void)
754792
token_start = NULL;
755793

756794
return ret;
757-
} /* parse_name */
795+
} /* lexer_parse_identifier_or_keyword */
758796

759797
/* In this function we cannot use strtol function
760798
since there is no octal literals in ECMAscript. */
@@ -1199,9 +1237,10 @@ lexer_next_token_private (void)
11991237

12001238
JERRY_ASSERT (token_start == NULL);
12011239

1202-
if (isalpha (c) || c == '$' || c == '_' || c == '\\')
1240+
/* ECMA-262 v5, 7.6, Identifier */
1241+
if (lexer_is_char_can_be_identifier_start (c))
12031242
{
1204-
return parse_name ();
1243+
return lexer_parse_identifier_or_keyword ();
12051244
}
12061245

12071246
if (isdigit (c) || (c == '.' && isdigit (LA (1))))

0 commit comments

Comments
 (0)