Skip to content

Commit fdb3a2b

Browse files
committed
Parser: Parse surrogate pairs and es6 unicode code point escapes only when u flag is set
1 parent 4cd836a commit fdb3a2b

File tree

4 files changed

+225
-92
lines changed

4 files changed

+225
-92
lines changed

README.md

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -884,7 +884,6 @@ Unicode char started with `\u`, followed by a hex number:
884884

885885
```
886886
\u003B
887-
\u{003B}
888887
```
889888

890889
Node:
@@ -899,6 +898,43 @@ Node:
899898
}
900899
```
901900

901+
When using the `u` flag, unicode chars can also be represented using `\u` followed by a hex number between curly braces:
902+
903+
```
904+
\u{1F680}
905+
```
906+
907+
Node:
908+
909+
```js
910+
{
911+
type: 'Char',
912+
value: '\\u{1F680}',
913+
symbol: '🚀',
914+
kind: 'unicode',
915+
codePoint: 128640
916+
}
917+
```
918+
919+
When using the `u` flag, unicode chars can also be represented using a surrogate pair:
920+
921+
```
922+
\ud83d\ude80
923+
```
924+
925+
Node:
926+
927+
```js
928+
{
929+
type: 'Char',
930+
value: '\\ud83d\\ude80',
931+
symbol: '🚀',
932+
kind: 'unicode',
933+
codePoint: 128640,
934+
isSurrogatePair: true
935+
}
936+
```
937+
902938
#### Character class
903939

904940
Character classes define a _set_ of characters. A set may include simple characters as well as _character ranges_. A class can be _positive_ (any of the characters in the class matches) or _negative_ (any character _but_ those in the class matches).

src/parser/__tests__/parser-basic-test.js

Lines changed: 60 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -933,26 +933,67 @@ describe('basic', () => {
933933
codePoint: 0x000001d306
934934
});
935935

936-
// TODO: without `u` flag \u{1234} should be parsed NOT as
937-
// a unicode code point, but as an (escaped) `u` character,
938-
// repeated 1234 times.
936+
// Without the `u` flag, this is parsed not as a unicode code point,
937+
// but as an (escaped) `u` character repeated 1234 times.
938+
expect(re(/\u{1234}/).body).toEqual({
939+
type: 'Repetition',
940+
expression: {
941+
type: 'Char',
942+
value: 'u',
943+
symbol: 'u',
944+
kind: 'simple',
945+
escaped: true,
946+
codePoint: 'u'.codePointAt(0)
947+
},
948+
quantifier: {
949+
type: 'Quantifier',
950+
kind: 'Range',
951+
from: 1234,
952+
to: 1234,
953+
greedy: true
954+
}
955+
});
939956

940-
// expect(re(/\u{1234}/).body).toEqual({
941-
// type: 'Repetition',
942-
// expression: {
943-
// type: 'Char',
944-
// value: 'u',
945-
// kind: 'simple',
946-
// escaped: true
947-
// },
948-
// quantifier: {
949-
// type: 'Quantifier',
950-
// kind: 'Range',
951-
// from: 1234,
952-
// to: 1234,
953-
// greedy: true
954-
// }
955-
// });
957+
// Using `u` flag, surrogate pairs.
958+
expect(re(/\ud83d\ude80/u).body).toEqual({
959+
type: 'Char',
960+
value: '\\ud83d\\ude80',
961+
kind: 'unicode',
962+
symbol: String.fromCodePoint(0x1F680),
963+
codePoint: 0x1F680,
964+
isSurrogatePair: true
965+
});
966+
967+
// Using `u` flag, surrogate pairs in character class.
968+
expect(re(/[\ud83d\ude80]/u).body).toEqual({
969+
type: 'CharacterClass',
970+
expressions: [{
971+
type: 'Char',
972+
value: '\\ud83d\\ude80',
973+
kind: 'unicode',
974+
symbol: String.fromCodePoint(0x1F680),
975+
codePoint: 0x1F680,
976+
isSurrogatePair: true
977+
}]
978+
});
979+
980+
// Not using `u` flag, surrogate pairs are treated as two characters
981+
expect(re(/\ud83d\ude80/).body).toEqual({
982+
type: 'Alternative',
983+
expressions: [{
984+
type: 'Char',
985+
value: '\\ud83d',
986+
kind: 'unicode',
987+
symbol: String.fromCodePoint(0xd83d),
988+
codePoint: 0xd83d
989+
}, {
990+
type: 'Char',
991+
value: '\\ude80',
992+
kind: 'unicode',
993+
symbol: String.fromCodePoint(0xde80),
994+
codePoint: 0xde80
995+
}]
996+
});
956997
});
957998

958999
it('valid sorted flags', () => {

0 commit comments

Comments
 (0)