Skip to content

Commit fa5737f

Browse files
committed
Improve parser support for regexp group names
1 parent 5e745ed commit fa5737f

File tree

4 files changed

+185
-5
lines changed

4 files changed

+185
-5
lines changed
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
// based on https://github.com/microsoft/TypeScript/tree/master/scripts/regenerate-unicode-identifier-parts.js
2+
3+
/**
 * Formats a number as exactly four lowercase hexadecimal digits, left-padded
 * with zeros. Callers in this file prepend `\u` to the result to build a
 * regular-expression escape.
 *
 * @param {number} i Integer in the range 0x0000-0xFFFF.
 * @returns {string} Four hex digits, e.g. `03c0` for 0x3C0.
 * @throws {Error} If `i` is not an integer in the range 0x0000-0xFFFF.
 */
function toHex4Digits(i) {
  // Reject negatives, fractions and NaN up front: `(-1).toString(16)` is "-1"
  // and would otherwise be zero-padded into the bogus "00-1".
  if (!Number.isInteger(i) || i < 0 || i > 0xffff) {
    throw new Error('Invalid Hex4Digits value');
  }
  return i.toString(16).padStart(4, '0');
}
12+
13+
/**
 * An inclusive run of consecutive code points, each rendered with a single
 * `\uXXXX` escape. A new instance covers exactly one code point; the owner
 * widens it by assigning `lastCodePoint`.
 */
class NonSurrogateRange {
  /** @param {number} codePoint First (and initially last) code point of the run. */
  constructor(codePoint) {
    this.firstCodePoint = codePoint;
    this.lastCodePoint = codePoint;
  }

  /**
   * @returns {string} `\uXXXX` for a single code point, or `\uXXXX-\uYYYY`
   *   when the run spans more than one.
   */
  toString() {
    const { firstCodePoint, lastCodePoint } = this;
    const start = '\\u' + toHex4Digits(firstCodePoint);
    if (lastCodePoint === firstCodePoint) {
      return start;
    }
    return start + '-\\u' + toHex4Digits(lastCodePoint);
  }
}
27+
28+
/**
 * Groups every trail-surrogate run that shares one lead surrogate, so the
 * group can be rendered as the lead escape followed by a character class.
 */
class LeadSurrogateRange {
  /** @param {number} leadSurrogate Lead surrogate code unit shared by all ranges. */
  constructor(leadSurrogate) {
    this.leadSurrogate = leadSurrogate;
    /** @type {TrailSurrogateRange[]} */
    this.ranges = [];
  }

  /**
   * @returns {string} The lead escape followed by the bracketed,
   *   concatenated trail ranges: `\uXXXX[...]`.
   */
  toString() {
    const leadEscape = '\\u' + toHex4Digits(this.leadSurrogate);
    const trailClass = this.ranges.join('');
    return leadEscape + '[' + trailClass + ']';
  }
}
46+
47+
/**
 * An inclusive run of consecutive trail-surrogate code units. A new instance
 * covers exactly one unit; the owner widens it by assigning
 * `lastTrailSurrogate`.
 */
class TrailSurrogateRange {
  /** @param {number} trailSurrogate First (and initially last) trail surrogate of the run. */
  constructor(trailSurrogate) {
    this.firstTrailSurrogate = trailSurrogate;
    this.lastTrailSurrogate = trailSurrogate;
  }

  /**
   * @returns {string} `\uXXXX` for a single unit, or `\uXXXX-\uYYYY` when
   *   the run spans more than one.
   */
  toString() {
    const { firstTrailSurrogate, lastTrailSurrogate } = this;
    const start = '\\u' + toHex4Digits(firstTrailSurrogate);
    if (lastTrailSurrogate === firstTrailSurrogate) {
      return start;
    }
    return start + '-\\u' + toHex4Digits(lastTrailSurrogate);
  }
}
61+
62+
/**
 * Collects code points pushed in strictly ascending order and renders them
 * as regular-expression source text.
 *
 * Code points up to MAX_UNICODE_NON_SURROGATE are merged into
 * NonSurrogateRange runs. Larger code points are split into a UTF-16
 * lead/trail surrogate pair and grouped per lead surrogate, with the trail
 * surrogates merged into TrailSurrogateRange runs.
 */
class Writer {
  constructor() {
    /** @type {number} Most recently pushed code point; -1 before the first push. */
    this.lastCodePoint = -1;
    /** @type {NonSurrogateRange[]} */
    this.nonSurrogateRanges = [];
    /** @type {LeadSurrogateRange[]} */
    this.surrogateRanges = [];
    /** @type {NonSurrogateRange | undefined} Run currently being extended. */
    this.nonSurrogateRange = undefined;
    /** @type {LeadSurrogateRange | undefined} Lead-surrogate group currently being filled. */
    this.leadSurrogateRange = undefined;
    /** @type {TrailSurrogateRange | undefined} Trail run currently being extended. */
    this.trailSurrogateRange = undefined;
  }

  /**
   * Adds one code point, extending the current run when it is adjacent to
   * the previously pushed value and opening a new run otherwise.
   *
   * @param {number} codePoint Must be greater than every code point pushed so far.
   * @throws {Error} If `codePoint` is not greater than the last pushed code point.
   */
  push(codePoint) {
    if (codePoint <= this.lastCodePoint) {
      throw new Error('Code points must be added in order.');
    }
    this.lastCodePoint = codePoint;

    // Inclusive bound: 0xFFFF still fits in one UTF-16 code unit. With a
    // strict `<` it would reach the surrogate arithmetic below, where
    // (codePoint - 0x10000) is negative and yields 0xD7FF / 0xDBFF.
    if (codePoint <= MAX_UNICODE_NON_SURROGATE) {
      if (
        this.nonSurrogateRange &&
        this.nonSurrogateRange.lastCodePoint === codePoint - 1
      ) {
        this.nonSurrogateRange.lastCodePoint = codePoint;
        return;
      }
      this.nonSurrogateRange = new NonSurrogateRange(codePoint);
      this.nonSurrogateRanges.push(this.nonSurrogateRange);
      return;
    }

    // UTF-16 surrogate pair for a code point above 0xFFFF.
    const offset = codePoint - 0x10000;
    const leadSurrogate = Math.floor(offset / 0x400) + 0xd800;
    const trailSurrogate = (offset % 0x400) + 0xdc00;

    if (
      !this.leadSurrogateRange ||
      this.leadSurrogateRange.leadSurrogate !== leadSurrogate
    ) {
      // A trail run never continues across a change of lead surrogate.
      this.trailSurrogateRange = undefined;
      this.leadSurrogateRange = new LeadSurrogateRange(leadSurrogate);
      this.surrogateRanges.push(this.leadSurrogateRange);
    }

    if (
      this.trailSurrogateRange &&
      this.trailSurrogateRange.lastTrailSurrogate === trailSurrogate - 1
    ) {
      this.trailSurrogateRange.lastTrailSurrogate = trailSurrogate;
      return;
    }

    this.trailSurrogateRange = new TrailSurrogateRange(trailSurrogate);
    this.leadSurrogateRange.ranges.push(this.trailSurrogateRange);
  }

  /**
   * @returns {string} `([bmp]|pair|pair...)` when both kinds of range are
   *   present, `[bmp]` or `(pair|pair...)` when only one kind is, and the
   *   empty string when nothing was pushed.
   */
  toString() {
    const first = this.nonSurrogateRanges.join('');
    const second = this.surrogateRanges.join('|');
    if (first && second) {
      return `([${first}]|${second})`;
    }
    if (first) {
      return `[${first}]`;
    }
    if (second) {
      return `(${second})`;
    }
    return '';
  }
}
131+
132+
// Largest code point that fits in a single UTF-16 code unit.
const MAX_UNICODE_NON_SURROGATE = 0xffff;
// Largest valid Unicode code point.
const MAX_UNICODE_CODEPOINT = 0x10ffff;

// Membership tests backed by the running engine's Unicode property tables.
const isStart = c => /\p{ID_Start}/u.test(c);
const isContinue = c => /\p{ID_Continue}/u.test(c);

const idStartWriter = new Writer();
const idContinueWriter = new Writer();

// Walk every code point in ascending order, as Writer.push requires.
// The upper bound is inclusive so that U+10FFFF itself is examined too.
for (let cp = 0; cp <= MAX_UNICODE_CODEPOINT; cp++) {
  const ch = String.fromCodePoint(cp);
  if (isStart(ch)) {
    idStartWriter.push(cp);
  }
  if (isContinue(ch)) {
    idContinueWriter.push(cp);
  }
}

// Banner recording which Node.js and Unicode versions produced the tables.
// NOTE(review): the banner text is reproduced exactly as it appears in this
// view, which shows no leading whitespace on the continuation lines - confirm
// against the checked-in script.
console.log(`/**
* Generated by scripts/generate-unicode-id-parts.js on node ${
  process.version
} with unicode ${process.versions.unicode}
* based on http://www.unicode.org/reports/tr31/ and https://www.ecma-international.org/ecma-262/6.0/#sec-names-and-keywords
* U_ID_START corresponds to the ID_Start property, and U_ID_CONTINUE corresponds to ID_Continue property.
*/`);
console.log('U_ID_START ' + idStartWriter.toString());
console.log('U_ID_CONTINUE ' + idContinueWriter.toString());

src/parser/__tests__/parser-test262-test.js

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ function ut(message) {
3030
}
3131

3232
describe('test262', () => {
33-
3433
it('invalid char range', () => {
3534
// 15.10.2.15-6-1
3635
invalid('/^[z-a]$/', 'out of order');
@@ -135,4 +134,15 @@ describe('test262', () => {
135134
it('invalid unicode escape', () => {
136135
invalid('/\\u{11FFFF}/u', 'Bad character escape');
137136
});
138-
});
137+
138+
// Named capture groups whose names go beyond plain ASCII letters.
// NOTE(review): `valid` is defined outside this view; presumably it asserts
// that the pattern parses without error - confirm against the helper.
it('unicode group names', () => {
139+
// Group name written as a literal non-ASCII letter.
valid('/(?<π>a)/u');
140+
// Group name written as a \u{...} code point escape.
valid('/(?<\\u{03C0}>a)/u');
141+
// `$` followed by a character outside the Basic Multilingual Plane.
valid('/(?<$𐒤>a)/u');
142+
// `_` followed by an escaped U+200C.
valid('/(?<_\\u200C>a)/u');
143+
// `_` followed by an escaped U+200D.
valid('/(?<_\\u200D>a)/u');
144+
// Non-ASCII letters on both sides of an underscore.
valid('/(?<ಠ_ಠ>a)/u');
145+
// `$` alone as the whole group name.
valid('/(?<$>a)/u');
146+
// `_` alone as the whole group name.
valid('/(?<_>a)/u');
147+
});
148+
});

0 commit comments

Comments
 (0)