|
3 | 3 | import com.ibm.icu.lang.UCharacter; |
4 | 4 | import com.ibm.icu.lang.UProperty; |
5 | 5 | import com.ibm.icu.text.UnicodeSet; |
| 6 | +import org.perlonjava.runtime.*; |
| 7 | + |
| 8 | +import java.util.HashSet; |
| 9 | +import java.util.Set; |
6 | 10 |
|
7 | 11 | public class UnicodeResolver { |
8 | 12 | /** |
@@ -50,8 +54,230 @@ public static int getCodePointFromName(String name) { |
50 | 54 | return codePoint; |
51 | 55 | } |
52 | 56 |
|
| 57 | + /** |
| 58 | + * Parses a user-defined property definition string and returns a character class pattern. |
| 59 | + * The format is hex ranges separated by tabs/newlines: |
| 60 | + * - "0009\t000D\n0020" means ranges U+0009 to U+000D and single char U+0020 |
| 61 | + * - Lines starting with # are comments |
| 62 | + * - Lines starting with + add another property |
| 63 | + * - Lines starting with - or ! remove a property |
| 64 | + * - Lines starting with & intersect with a property |
| 65 | + * |
| 66 | + * @param definition The property definition string |
| 67 | + * @param recursionSet Set to track recursive property calls |
| 68 | + * @param propertyName The name of the property being parsed (for error messages) |
| 69 | + * @return A character class pattern |
| 70 | + */ |
| 71 | + private static String parseUserDefinedProperty(String definition, Set<String> recursionSet, String propertyName) { |
| 72 | + UnicodeSet resultSet = new UnicodeSet(); |
| 73 | + boolean hasIntersection = false; |
| 74 | + UnicodeSet intersectionSet = null; |
| 75 | + |
| 76 | + String[] lines = definition.split("\\n"); |
| 77 | + for (String line : lines) { |
| 78 | + line = line.trim(); |
| 79 | + |
| 80 | + // Skip empty lines and comments |
| 81 | + if (line.isEmpty() || line.startsWith("#")) { |
| 82 | + continue; |
| 83 | + } |
| 84 | + |
| 85 | + // Handle property references |
| 86 | + if (line.startsWith("+")) { |
| 87 | + // Add another property |
| 88 | + String propName = line.substring(1).trim(); |
| 89 | + String propPattern = resolvePropertyReference(propName, recursionSet, propertyName); |
| 90 | + UnicodeSet propSet = new UnicodeSet(propPattern); |
| 91 | + resultSet.addAll(propSet); |
| 92 | + } else if (line.startsWith("-") || line.startsWith("!")) { |
| 93 | + // Remove a property |
| 94 | + String propName = line.substring(1).trim(); |
| 95 | + String propPattern = resolvePropertyReference(propName, recursionSet, propertyName); |
| 96 | + UnicodeSet propSet = new UnicodeSet(propPattern); |
| 97 | + resultSet.removeAll(propSet); |
| 98 | + } else if (line.startsWith("&")) { |
| 99 | + // Intersection with a property |
| 100 | + String propName = line.substring(1).trim(); |
| 101 | + String propPattern = resolvePropertyReference(propName, recursionSet, propertyName); |
| 102 | + if (!hasIntersection) { |
| 103 | + intersectionSet = new UnicodeSet(propPattern); |
| 104 | + hasIntersection = true; |
| 105 | + } else { |
| 106 | + intersectionSet.retainAll(new UnicodeSet(propPattern)); |
| 107 | + } |
| 108 | + } else { |
| 109 | + // Parse hex range - extract the hex part before any comments |
| 110 | + String hexPart = line.split("#")[0].trim(); |
| 111 | + // Split by tabs or multiple spaces |
| 112 | + String[] parts = hexPart.split("\\t+|\\s{2,}"); |
| 113 | + if (parts.length == 1 && !parts[0].isEmpty()) { |
| 114 | + // Single character |
| 115 | + String hexStr = parts[0].trim(); |
| 116 | + // Check if it's a valid hex string |
| 117 | + if (!hexStr.matches("[0-9A-Fa-f]+")) { |
| 118 | + throw new IllegalArgumentException("Can't find Unicode property definition \"" + line.trim() + "\" in expansion of " + propertyName); |
| 119 | + } |
| 120 | + try { |
| 121 | + long codePoint = Long.parseLong(hexStr, 16); |
| 122 | + if (codePoint > 0x10FFFF) { |
| 123 | + throw new IllegalArgumentException("Code point too large in \"" + line.trim() + "\" in expansion of " + propertyName); |
| 124 | + } |
| 125 | + resultSet.add((int) codePoint); |
| 126 | + } catch (NumberFormatException e) { |
| 127 | + throw new IllegalArgumentException("Can't find Unicode property definition \"" + line.trim() + "\" in expansion of " + propertyName); |
| 128 | + } |
| 129 | + } else if (parts.length >= 2) { |
| 130 | + // Range |
| 131 | + String startHex = parts[0].trim(); |
| 132 | + String endHex = parts[1].trim(); |
| 133 | + |
| 134 | + // Check if they're valid hex strings |
| 135 | + if (!startHex.matches("[0-9A-Fa-f]+") || !endHex.matches("[0-9A-Fa-f]+")) { |
| 136 | + throw new IllegalArgumentException("Can't find Unicode property definition \"" + line.trim() + "\" in expansion of " + propertyName); |
| 137 | + } |
| 138 | + |
| 139 | + try { |
| 140 | + long start = Long.parseLong(startHex, 16); |
| 141 | + long end = Long.parseLong(endHex, 16); |
| 142 | + |
| 143 | + if (start > 0x10FFFF) { |
| 144 | + throw new IllegalArgumentException("Code point too large in \"" + line.trim() + "\" in expansion of " + propertyName); |
| 145 | + } |
| 146 | + if (end > 0x10FFFF) { |
| 147 | + throw new IllegalArgumentException("Code point too large in \"" + line.trim() + "\" in expansion of " + propertyName); |
| 148 | + } |
| 149 | + if (start > end) { |
| 150 | + throw new IllegalArgumentException("Illegal range in \"" + line.trim() + "\" in expansion of " + propertyName); |
| 151 | + } |
| 152 | + |
| 153 | + resultSet.add((int) start, (int) end); |
| 154 | + } catch (NumberFormatException e) { |
| 155 | + throw new IllegalArgumentException("Can't find Unicode property definition \"" + line.trim() + "\" in expansion of " + propertyName); |
| 156 | + } |
| 157 | + } |
| 158 | + } |
| 159 | + } |
| 160 | + |
| 161 | + // Apply intersection if any |
| 162 | + if (hasIntersection) { |
| 163 | + resultSet.retainAll(intersectionSet); |
| 164 | + } |
| 165 | + |
| 166 | + return resultSet.toPattern(false); |
| 167 | + } |
| 168 | + |
| 169 | + /** |
| 170 | + * Resolves a property reference (like utf8::InHiragana or main::IsMyProp). |
| 171 | + * |
| 172 | + * @param propRef The property reference |
| 173 | + * @param recursionSet Set to track recursive property calls |
| 174 | + * @param parentProperty The parent property name (for error messages) |
| 175 | + * @return A character class pattern |
| 176 | + */ |
| 177 | + private static String resolvePropertyReference(String propRef, Set<String> recursionSet, String parentProperty) { |
| 178 | + // Check for recursion |
| 179 | + if (recursionSet.contains(propRef)) { |
| 180 | + // Build recursion chain for error message |
| 181 | + StringBuilder chain = new StringBuilder(); |
| 182 | + for (String prop : recursionSet) { |
| 183 | + if (chain.length() > 0) { |
| 184 | + chain.append(" in expansion of "); |
| 185 | + } |
| 186 | + chain.append(prop); |
| 187 | + } |
| 188 | + if (chain.length() > 0) { |
| 189 | + chain.append(" in expansion of "); |
| 190 | + } |
| 191 | + chain.append(propRef); |
| 192 | + throw new IllegalArgumentException("Infinite recursion in user-defined property \"" + propRef + "\" in expansion of " + chain); |
| 193 | + } |
| 194 | + |
| 195 | + // Remove utf8:: prefix if present |
| 196 | + if (propRef.startsWith("utf8::")) { |
| 197 | + String stdProp = propRef.substring(6); |
| 198 | + try { |
| 199 | + // Try as standard property |
| 200 | + return translateUnicodeProperty(stdProp, false, recursionSet); |
| 201 | + } catch (IllegalArgumentException e) { |
| 202 | + // Fall through to user-defined property lookup |
| 203 | + propRef = "main::" + stdProp; |
| 204 | + } |
| 205 | + } |
| 206 | + |
| 207 | + // Try as user-defined property |
| 208 | + return translateUnicodeProperty(propRef, false, recursionSet); |
| 209 | + } |
| 210 | + |
| 211 | + /** |
| 212 | + * Tries to look up a user-defined property by calling a Perl subroutine. |
| 213 | + * |
| 214 | + * @param property The property name (e.g., "IsMyUpper" or "main::IsMyUpper") |
| 215 | + * @param recursionSet Set to track recursive property calls |
| 216 | + * @return The property definition string, or null if not found |
| 217 | + */ |
| 218 | + private static String tryUserDefinedProperty(String property, Set<String> recursionSet) { |
| 219 | + // Add to recursion set |
| 220 | + Set<String> newRecursionSet = new HashSet<>(recursionSet); |
| 221 | + newRecursionSet.add(property); |
| 222 | + |
| 223 | + // Build the full subroutine name |
| 224 | + String subName = property; |
| 225 | + if (!subName.contains("::")) { |
| 226 | + // Try in main package |
| 227 | + subName = "main::" + subName; |
| 228 | + } |
| 229 | + |
| 230 | + // Look up the subroutine |
| 231 | + RuntimeScalar codeRef = GlobalVariable.getGlobalCodeRef(subName); |
| 232 | + if (codeRef == null || !codeRef.getDefinedBoolean()) { |
| 233 | + return null; |
| 234 | + } |
| 235 | + |
| 236 | + try { |
| 237 | + // Call the subroutine with an empty argument list |
| 238 | + RuntimeArray args = new RuntimeArray(); |
| 239 | + RuntimeList result = RuntimeCode.apply(codeRef, args, RuntimeContextType.SCALAR); |
| 240 | + |
| 241 | + if (result.elements.isEmpty()) { |
| 242 | + return ""; |
| 243 | + } |
| 244 | + |
| 245 | + String definition = result.elements.getFirst().toString(); |
| 246 | + |
| 247 | + // Parse and return the property definition |
| 248 | + return parseUserDefinedProperty(definition, newRecursionSet, subName); |
| 249 | + |
| 250 | + } catch (PerlCompilerException e) { |
| 251 | + // Re-throw Perl exceptions (like die in IsDeath) |
| 252 | + String msg = e.getMessage(); |
| 253 | + if (msg != null && !msg.contains("in expansion of")) { |
| 254 | + throw new IllegalArgumentException("Died" + (msg.isEmpty() ? "" : ": " + msg) + " in expansion of " + subName, e); |
| 255 | + } |
| 256 | + throw e; |
| 257 | + } catch (IllegalArgumentException e) { |
| 258 | + // Re-throw validation errors from parseUserDefinedProperty |
| 259 | + throw e; |
| 260 | + } catch (Exception e) { |
| 261 | + // Wrap other errors |
| 262 | + throw new IllegalArgumentException("Error in user-defined property " + subName + ": " + e.getMessage(), e); |
| 263 | + } |
| 264 | + } |
| 265 | + |
53 | 266 | public static String translateUnicodeProperty(String property, boolean negated) { |
| 267 | + return translateUnicodeProperty(property, negated, new HashSet<>()); |
| 268 | + } |
| 269 | + |
| 270 | + private static String translateUnicodeProperty(String property, boolean negated, Set<String> recursionSet) { |
54 | 271 | try { |
| 272 | + // Check for user-defined properties (Is... or In...) |
| 273 | + if (property.matches("^(.*::)?(Is|In)[A-Z].*")) { |
| 274 | + String userProp = tryUserDefinedProperty(property, recursionSet); |
| 275 | + if (userProp != null) { |
| 276 | + return wrapCharClass(userProp, negated); |
| 277 | + } |
| 278 | + // Property not found - fall through to throw error below |
| 279 | + } |
| 280 | + |
55 | 281 | // Special cases - Perl XPosix properties not natively supported in Java |
56 | 282 | switch (property) { |
57 | 283 | case "lb=cr": |
@@ -166,6 +392,11 @@ public static String translateUnicodeProperty(String property, boolean negated) |
166 | 392 | return wrapCharClass(pattern, negated); |
167 | 393 |
|
168 | 394 | } catch (IllegalArgumentException e) { |
| 395 | + // If the error message already contains "in expansion of", it's a user-defined property error |
| 396 | + // that should be propagated as-is |
| 397 | + if (e.getMessage() != null && e.getMessage().contains("in expansion of")) { |
| 398 | + throw e; |
| 399 | + } |
169 | 400 | throw new IllegalArgumentException("Invalid or unsupported Unicode property: " + property, e); |
170 | 401 | } |
171 | 402 | } |
|
0 commit comments