Skip to content

Commit 89e92dd

Browse files
authored
Merge pull request #54 from fglock/fix-unicode-prop-tests
Fix unicode prop tests
2 parents eb819fb + e760ea7 commit 89e92dd

File tree

2 files changed

+232
-0
lines changed

2 files changed

+232
-0
lines changed

dev/tools/perl_test_runner.pl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ sub run_single_test {
243243
re/pat_rt_report.t
244244
| re/pat.t
245245
| re/regex_sets.t
246+
| re/regexp_unicode_prop.t
246247
| op/pack.t
247248
| op/index.t
248249
| op/split.t

src/main/java/org/perlonjava/regex/UnicodeResolver.java

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
import com.ibm.icu.lang.UCharacter;
44
import com.ibm.icu.lang.UProperty;
55
import com.ibm.icu.text.UnicodeSet;
6+
import org.perlonjava.runtime.*;
7+
8+
import java.util.HashSet;
9+
import java.util.Set;
610

711
public class UnicodeResolver {
812
/**
@@ -50,8 +54,230 @@ public static int getCodePointFromName(String name) {
5054
return codePoint;
5155
}
5256

57+
/**
58+
* Parses a user-defined property definition string and returns a character class pattern.
59+
* The format is hex ranges separated by tabs/newlines:
60+
* - "0009\t000D\n0020" means ranges U+0009 to U+000D and single char U+0020
61+
* - Lines starting with # are comments
62+
* - Lines starting with + add another property
63+
* - Lines starting with - or ! remove a property
64+
* - Lines starting with & intersect with a property
65+
*
66+
* @param definition The property definition string
67+
* @param recursionSet Set to track recursive property calls
68+
* @param propertyName The name of the property being parsed (for error messages)
69+
* @return A character class pattern
70+
*/
71+
private static String parseUserDefinedProperty(String definition, Set<String> recursionSet, String propertyName) {
72+
UnicodeSet resultSet = new UnicodeSet();
73+
boolean hasIntersection = false;
74+
UnicodeSet intersectionSet = null;
75+
76+
String[] lines = definition.split("\\n");
77+
for (String line : lines) {
78+
line = line.trim();
79+
80+
// Skip empty lines and comments
81+
if (line.isEmpty() || line.startsWith("#")) {
82+
continue;
83+
}
84+
85+
// Handle property references
86+
if (line.startsWith("+")) {
87+
// Add another property
88+
String propName = line.substring(1).trim();
89+
String propPattern = resolvePropertyReference(propName, recursionSet, propertyName);
90+
UnicodeSet propSet = new UnicodeSet(propPattern);
91+
resultSet.addAll(propSet);
92+
} else if (line.startsWith("-") || line.startsWith("!")) {
93+
// Remove a property
94+
String propName = line.substring(1).trim();
95+
String propPattern = resolvePropertyReference(propName, recursionSet, propertyName);
96+
UnicodeSet propSet = new UnicodeSet(propPattern);
97+
resultSet.removeAll(propSet);
98+
} else if (line.startsWith("&")) {
99+
// Intersection with a property
100+
String propName = line.substring(1).trim();
101+
String propPattern = resolvePropertyReference(propName, recursionSet, propertyName);
102+
if (!hasIntersection) {
103+
intersectionSet = new UnicodeSet(propPattern);
104+
hasIntersection = true;
105+
} else {
106+
intersectionSet.retainAll(new UnicodeSet(propPattern));
107+
}
108+
} else {
109+
// Parse hex range - extract the hex part before any comments
110+
String hexPart = line.split("#")[0].trim();
111+
// Split by tabs or multiple spaces
112+
String[] parts = hexPart.split("\\t+|\\s{2,}");
113+
if (parts.length == 1 && !parts[0].isEmpty()) {
114+
// Single character
115+
String hexStr = parts[0].trim();
116+
// Check if it's a valid hex string
117+
if (!hexStr.matches("[0-9A-Fa-f]+")) {
118+
throw new IllegalArgumentException("Can't find Unicode property definition \"" + line.trim() + "\" in expansion of " + propertyName);
119+
}
120+
try {
121+
long codePoint = Long.parseLong(hexStr, 16);
122+
if (codePoint > 0x10FFFF) {
123+
throw new IllegalArgumentException("Code point too large in \"" + line.trim() + "\" in expansion of " + propertyName);
124+
}
125+
resultSet.add((int) codePoint);
126+
} catch (NumberFormatException e) {
127+
throw new IllegalArgumentException("Can't find Unicode property definition \"" + line.trim() + "\" in expansion of " + propertyName);
128+
}
129+
} else if (parts.length >= 2) {
130+
// Range
131+
String startHex = parts[0].trim();
132+
String endHex = parts[1].trim();
133+
134+
// Check if they're valid hex strings
135+
if (!startHex.matches("[0-9A-Fa-f]+") || !endHex.matches("[0-9A-Fa-f]+")) {
136+
throw new IllegalArgumentException("Can't find Unicode property definition \"" + line.trim() + "\" in expansion of " + propertyName);
137+
}
138+
139+
try {
140+
long start = Long.parseLong(startHex, 16);
141+
long end = Long.parseLong(endHex, 16);
142+
143+
if (start > 0x10FFFF) {
144+
throw new IllegalArgumentException("Code point too large in \"" + line.trim() + "\" in expansion of " + propertyName);
145+
}
146+
if (end > 0x10FFFF) {
147+
throw new IllegalArgumentException("Code point too large in \"" + line.trim() + "\" in expansion of " + propertyName);
148+
}
149+
if (start > end) {
150+
throw new IllegalArgumentException("Illegal range in \"" + line.trim() + "\" in expansion of " + propertyName);
151+
}
152+
153+
resultSet.add((int) start, (int) end);
154+
} catch (NumberFormatException e) {
155+
throw new IllegalArgumentException("Can't find Unicode property definition \"" + line.trim() + "\" in expansion of " + propertyName);
156+
}
157+
}
158+
}
159+
}
160+
161+
// Apply intersection if any
162+
if (hasIntersection) {
163+
resultSet.retainAll(intersectionSet);
164+
}
165+
166+
return resultSet.toPattern(false);
167+
}
168+
169+
/**
170+
* Resolves a property reference (like utf8::InHiragana or main::IsMyProp).
171+
*
172+
* @param propRef The property reference
173+
* @param recursionSet Set to track recursive property calls
174+
* @param parentProperty The parent property name (for error messages)
175+
* @return A character class pattern
176+
*/
177+
private static String resolvePropertyReference(String propRef, Set<String> recursionSet, String parentProperty) {
178+
// Check for recursion
179+
if (recursionSet.contains(propRef)) {
180+
// Build recursion chain for error message
181+
StringBuilder chain = new StringBuilder();
182+
for (String prop : recursionSet) {
183+
if (chain.length() > 0) {
184+
chain.append(" in expansion of ");
185+
}
186+
chain.append(prop);
187+
}
188+
if (chain.length() > 0) {
189+
chain.append(" in expansion of ");
190+
}
191+
chain.append(propRef);
192+
throw new IllegalArgumentException("Infinite recursion in user-defined property \"" + propRef + "\" in expansion of " + chain);
193+
}
194+
195+
// Remove utf8:: prefix if present
196+
if (propRef.startsWith("utf8::")) {
197+
String stdProp = propRef.substring(6);
198+
try {
199+
// Try as standard property
200+
return translateUnicodeProperty(stdProp, false, recursionSet);
201+
} catch (IllegalArgumentException e) {
202+
// Fall through to user-defined property lookup
203+
propRef = "main::" + stdProp;
204+
}
205+
}
206+
207+
// Try as user-defined property
208+
return translateUnicodeProperty(propRef, false, recursionSet);
209+
}
210+
211+
/**
212+
* Tries to look up a user-defined property by calling a Perl subroutine.
213+
*
214+
* @param property The property name (e.g., "IsMyUpper" or "main::IsMyUpper")
215+
* @param recursionSet Set to track recursive property calls
216+
* @return The property definition string, or null if not found
217+
*/
218+
private static String tryUserDefinedProperty(String property, Set<String> recursionSet) {
219+
// Add to recursion set
220+
Set<String> newRecursionSet = new HashSet<>(recursionSet);
221+
newRecursionSet.add(property);
222+
223+
// Build the full subroutine name
224+
String subName = property;
225+
if (!subName.contains("::")) {
226+
// Try in main package
227+
subName = "main::" + subName;
228+
}
229+
230+
// Look up the subroutine
231+
RuntimeScalar codeRef = GlobalVariable.getGlobalCodeRef(subName);
232+
if (codeRef == null || !codeRef.getDefinedBoolean()) {
233+
return null;
234+
}
235+
236+
try {
237+
// Call the subroutine with an empty argument list
238+
RuntimeArray args = new RuntimeArray();
239+
RuntimeList result = RuntimeCode.apply(codeRef, args, RuntimeContextType.SCALAR);
240+
241+
if (result.elements.isEmpty()) {
242+
return "";
243+
}
244+
245+
String definition = result.elements.getFirst().toString();
246+
247+
// Parse and return the property definition
248+
return parseUserDefinedProperty(definition, newRecursionSet, subName);
249+
250+
} catch (PerlCompilerException e) {
251+
// Re-throw Perl exceptions (like die in IsDeath)
252+
String msg = e.getMessage();
253+
if (msg != null && !msg.contains("in expansion of")) {
254+
throw new IllegalArgumentException("Died" + (msg.isEmpty() ? "" : ": " + msg) + " in expansion of " + subName, e);
255+
}
256+
throw e;
257+
} catch (IllegalArgumentException e) {
258+
// Re-throw validation errors from parseUserDefinedProperty
259+
throw e;
260+
} catch (Exception e) {
261+
// Wrap other errors
262+
throw new IllegalArgumentException("Error in user-defined property " + subName + ": " + e.getMessage(), e);
263+
}
264+
}
265+
53266
public static String translateUnicodeProperty(String property, boolean negated) {
267+
return translateUnicodeProperty(property, negated, new HashSet<>());
268+
}
269+
270+
private static String translateUnicodeProperty(String property, boolean negated, Set<String> recursionSet) {
54271
try {
272+
// Check for user-defined properties (Is... or In...)
273+
if (property.matches("^(.*::)?(Is|In)[A-Z].*")) {
274+
String userProp = tryUserDefinedProperty(property, recursionSet);
275+
if (userProp != null) {
276+
return wrapCharClass(userProp, negated);
277+
}
278+
// Property not found - fall through to throw error below
279+
}
280+
55281
// Special cases - Perl XPosix properties not natively supported in Java
56282
switch (property) {
57283
case "lb=cr":
@@ -166,6 +392,11 @@ public static String translateUnicodeProperty(String property, boolean negated)
166392
return wrapCharClass(pattern, negated);
167393

168394
} catch (IllegalArgumentException e) {
395+
// If the error message already contains "in expansion of", it's a user-defined property error
396+
// that should be propagated as-is
397+
if (e.getMessage() != null && e.getMessage().contains("in expansion of")) {
398+
throw e;
399+
}
169400
throw new IllegalArgumentException("Invalid or unsupported Unicode property: " + property, e);
170401
}
171402
}

0 commit comments

Comments
 (0)