Skip to content

Commit f8c6fbc

Browse files
committed
Add partial \G anchor support for regex matching and substitution
- Add \G escape sequence handling in RegexPreprocessorHelper to remove \G from patterns (since Java regex doesn't support it natively)
- Set useGAssertion flag in RuntimeRegex based on pattern contents
- Implement position validation for patterns starting with \G in matchRegex
- Implement position validation for patterns starting/ending with \G in replaceRegex
- Add matcher.region() support for patterns starting with \G

This is a partial implementation that improves \G support but does not fully handle all edge cases, particularly:
- Patterns ending with \G (e.g., \d\d\G) need more work
- Complex interactions with pos() and string modification during substitution

Contributes to fixing t/re/subst.t tests 96-99 and 165-188.
1 parent b93bb66 commit f8c6fbc

File tree

2 files changed

+98
-5
lines changed

2 files changed

+98
-5
lines changed

src/main/java/org/perlonjava/regex/RegexPreprocessorHelper.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,13 @@ static int handleEscapeSequences(String s, StringBuilder sb, int c, int offset)
198198
sb.setLength(sb.length() - 1); // Remove the backslash
199199
sb.append("[^\\n\\x0B\\f\\r\\x85\\x{2028}\\x{2029}]");
200200
return offset;
201+
} else if (nextChar == 'G') {
202+
// \G - matches at the position where the last match ended
203+
// Java regex doesn't support \G, so we remove it and handle it manually in RuntimeRegex
204+
// The useGAssertion flag will be set and we'll validate match positions manually
205+
sb.setLength(sb.length() - 1); // Remove the backslash
206+
// Don't add anything - \G is handled in RuntimeRegex.matchRegex/replaceRegex
207+
return offset;
201208
} else if (nextChar == 'K') {
202209
// \K - keep assertion (reset start of match)
203210
// Convert to positive lookbehind for everything before this point

src/main/java/org/perlonjava/regex/RuntimeRegex.java

Lines changed: 91 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ protected boolean removeEldestEntry(Map.Entry<String, RuntimeRegex> eldest) {
5353
// ${^LAST_SUCCESSFUL_PATTERN}
5454
public static RuntimeRegex lastSuccessfulPattern = null;
5555
// Indicates if \G assertion is used
56-
private final boolean useGAssertion = false;
56+
private boolean useGAssertion = false;
5757
// Compiled regex pattern
5858
public Pattern pattern;
5959
int patternFlags;
@@ -96,6 +96,7 @@ public static RuntimeRegex compile(String patternString, String modifiers) {
9696

9797
regex.regexFlags = fromModifiers(modifiers, patternString);
9898
regex.patternFlags = regex.regexFlags.toPatternFlags();
99+
regex.useGAssertion = regex.regexFlags.useGAssertion();
99100

100101
String javaPattern = null;
101102
try {
@@ -377,10 +378,35 @@ public static RuntimeBase matchRegex(RuntimeScalar quotedRegex, RuntimeScalar st
377378
lastMatchStart = -1;
378379
lastMatchEnd = -1;
379380

381+
// For patterns ending with \G, we need to validate match end position
382+
boolean startsWithGMatch = false;
383+
boolean endsWithGMatch = false;
384+
if (regex.useGAssertion) {
385+
String origPattern = regex.patternString;
386+
startsWithGMatch = origPattern != null && origPattern.startsWith("\\G");
387+
endsWithGMatch = origPattern != null && origPattern.endsWith("\\G");
388+
}
389+
380390
while (matcher.find()) {
381-
// If \G is used, ensure the match starts at the expected position
382-
if (regex.useGAssertion && isPosDefined && matcher.start() != startPos) {
383-
break;
391+
// If \G is used, validate match position
392+
if (regex.useGAssertion) {
393+
if (startsWithGMatch && !endsWithGMatch) {
394+
// Pattern starts with \G: match must start at startPos
395+
if (matcher.start() != startPos) {
396+
break;
397+
}
398+
} else if (endsWithGMatch && !startsWithGMatch) {
399+
// Pattern ends with \G: match must end at startPos
400+
if (matcher.end() != startPos) {
401+
// Skip this match, keep looking
402+
continue;
403+
}
404+
} else if (startsWithGMatch && endsWithGMatch) {
405+
// Pattern both starts and ends with \G: zero-width match at startPos
406+
if (matcher.start() != startPos || matcher.end() != startPos) {
407+
break;
408+
}
409+
}
384410
}
385411

386412
found = true;
@@ -551,6 +577,11 @@ public static RuntimeBase replaceRegex(RuntimeScalar quotedRegex, RuntimeScalar
551577
Pattern pattern = regex.pattern;
552578
Matcher matcher = pattern.matcher(inputStr);
553579

580+
// Use RuntimePosLvalue to get the current position for \G handling
581+
RuntimeScalar posScalar = RuntimePosLvalue.pos(string);
582+
boolean isPosDefined = posScalar.getDefinedBoolean();
583+
int gPos = isPosDefined ? posScalar.getInt() : 0;
584+
554585
// The result string after substitutions
555586
StringBuilder resultBuffer = new StringBuilder();
556587
int found = 0;
@@ -561,11 +592,55 @@ public static RuntimeBase replaceRegex(RuntimeScalar quotedRegex, RuntimeScalar
561592
// Don't reset globalMatcher here - only reset it if we actually find a match
562593
// This preserves capture variables from previous matches when substitution doesn't match
563594

595+
// Track the position for \G handling in global matches
596+
int currentGPos = gPos;
597+
598+
// For patterns ending with \G, we need to find matches ending at specific positions
599+
// Check pattern structure once before the loop
600+
boolean startsWithG = false;
601+
boolean endsWithG = false;
602+
if (regex.useGAssertion) {
603+
String origPattern = regex.patternString;
604+
startsWithG = origPattern != null && origPattern.startsWith("\\G");
605+
endsWithG = origPattern != null && origPattern.endsWith("\\G");
606+
607+
// For patterns starting with \G, set the matcher region to start from gPos
608+
if (startsWithG && isPosDefined) {
609+
matcher.region(gPos, inputStr.length());
610+
// When using region(), we need to manually add the prefix part
611+
// that comes before the region start
612+
if (gPos > 0) {
613+
resultBuffer.append(inputStr, 0, gPos);
614+
}
615+
}
616+
}
617+
564618
// Perform the substitution
565619
while (matcher.find()) {
620+
// For \G anchor, validate match position
621+
if (regex.useGAssertion) {
622+
if (startsWithG && !endsWithG) {
623+
// Pattern starts with \G: match must start at currentGPos
624+
if (matcher.start() != currentGPos) {
625+
break;
626+
}
627+
} else if (endsWithG && !startsWithG) {
628+
// Pattern ends with \G: match must end at currentGPos
629+
if (matcher.end() != currentGPos) {
630+
// This match doesn't end at \G, skip it
631+
continue;
632+
}
633+
} else if (startsWithG && endsWithG) {
634+
// Pattern both starts and ends with \G: must be a zero-width match at currentGPos
635+
if (matcher.start() != currentGPos || matcher.end() != currentGPos) {
636+
break;
637+
}
638+
}
639+
}
640+
566641
found++;
567642

568-
// Initialize $1, $2, @+, @- only when we have a match
643+
// Initialize $1, $2, @+, @-, $`, $&, $' only when we have a match
569644
globalMatcher = matcher;
570645
globalMatchString = inputStr;
571646
// Store match information
@@ -599,6 +674,17 @@ public static RuntimeBase replaceRegex(RuntimeScalar quotedRegex, RuntimeScalar
599674
matcher.appendReplacement(resultBuffer, Matcher.quoteReplacement(replacementStr));
600675
}
601676

677+
// Update currentGPos for \G in global matches
678+
if (regex.useGAssertion && regex.regexFlags.isGlobalMatch()) {
679+
// For \G in global matches, update the \G position for the next iteration
680+
// The \G position moves by the difference between replacement and match length
681+
int matchLength = matcher.end() - matcher.start();
682+
int replacementLength = replacementStr != null ? replacementStr.length() : 0;
683+
int lengthDiff = replacementLength - matchLength;
684+
// Update \G position: it moves forward by the size of the replacement
685+
currentGPos = matcher.end() + lengthDiff;
686+
}
687+
602688
// If not a global match, break after the first replacement
603689
if (!regex.regexFlags.isGlobalMatch()) {
604690
break;

0 commit comments

Comments
 (0)