diff --git a/AGENTS.md b/AGENTS.md index ec9209add..373f86ae3 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -75,6 +75,29 @@ Example format at the end of a design doc: ./jperl --int -e 'code' # Interpreter ``` +### Perl Test Runner + +Use `dev/tools/perl_test_runner.pl` to run Perl test files and get pass/fail counts. **Run with `perl` (not `jperl`)** because it needs fork support. + +```bash +# Run specific test files +perl dev/tools/perl_test_runner.pl perl5_t/t/re/regexp.t perl5_t/t/op/utfhash.t + +# Run all tests in a directory +perl dev/tools/perl_test_runner.pl perl5_t/t/op/ + +# Common test directories +perl dev/tools/perl_test_runner.pl perl5_t/t/re/ # Regex tests +perl dev/tools/perl_test_runner.pl perl5_t/t/op/ # Operator tests +perl dev/tools/perl_test_runner.pl perl5_t/t/uni/ # Unicode tests +``` + +The runner: +- Executes tests in parallel (5 jobs by default) +- Has a 300s timeout per test +- Reports pass/fail counts in format: `passed/total` +- Saves results to `test_results_YYYYMMDD_HHMMSS.txt` + ### Git Workflow **IMPORTANT: Never push directly to master. Always use feature branches and PRs.** diff --git a/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java b/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java index f3ab33410..8a3c0cb28 100644 --- a/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java +++ b/src/main/java/org/perlonjava/backend/bytecode/BytecodeCompiler.java @@ -1067,7 +1067,16 @@ public void visit(StringNode node) { } else if (emitterContext != null && emitterContext.symbolTable != null && !emitterContext.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8) && !emitterContext.compilerOptions.isUnicodeSource) { - opcode = Opcodes.LOAD_BYTE_STRING; + // Under `no utf8` - use BYTE_STRING, unless it contains wide characters (> 255) + // Wide characters (like \x{100}) force the string to be UTF-8 even without `use utf8` + boolean hasWideChars = false; + for (int i = 0; i < node.value.length(); i++) { + if (node.value.charAt(i) > 255) { + hasWideChars = true; + break; + } + } + opcode = hasWideChars ? Opcodes.LOAD_STRING : Opcodes.LOAD_BYTE_STRING; } else { opcode = Opcodes.LOAD_STRING; } diff --git a/src/main/java/org/perlonjava/backend/jvm/EmitLiteral.java b/src/main/java/org/perlonjava/backend/jvm/EmitLiteral.java index 433bddaf0..b44c20405 100644 --- a/src/main/java/org/perlonjava/backend/jvm/EmitLiteral.java +++ b/src/main/java/org/perlonjava/backend/jvm/EmitLiteral.java @@ -199,38 +199,49 @@ public static void emitString(EmitterContext ctx, StringNode node) { } if (!ctx.symbolTable.isStrictOptionEnabled(HINT_UTF8) && !ctx.compilerOptions.isUnicodeSource) { - // Under `no utf8` - create a octet string - - int stringIndex = RuntimeScalarCache.getOrCreateByteStringIndex(node.value); - - if (stringIndex >= 0) { - // Use cached RuntimeScalar - mv.visitLdcInsn(stringIndex); - mv.visitMethodInsn( - Opcodes.INVOKESTATIC, - "org/perlonjava/runtime/runtimetypes/RuntimeScalarCache", - "getScalarByteString", - "(I)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;", - false); - return; - } else { - // String is too long for cache or null, create new object - mv.visitTypeInsn(Opcodes.NEW, "org/perlonjava/runtime/runtimetypes/RuntimeScalarReadOnly"); - mv.visitInsn(Opcodes.DUP); - emitStringValue(mv, node.value); - mv.visitMethodInsn( - Opcodes.INVOKESPECIAL, - "org/perlonjava/runtime/runtimetypes/RuntimeScalarReadOnly", - "", - "(Ljava/lang/String;)V", - false); - - // Set the Perl scalar type to BYTE_STRING - mv.visitInsn(Opcodes.DUP); - mv.visitLdcInsn(RuntimeScalarType.BYTE_STRING); - mv.visitFieldInsn(Opcodes.PUTFIELD, "org/perlonjava/runtime/runtimetypes/RuntimeScalarReadOnly", "type", "I"); - return; + // Under `no utf8` - create an octet string, unless it contains wide characters (> 255) + // Wide characters (like \x{100}) force the string to be UTF-8 even without `use utf8` + boolean hasWideChars = false; + for (int i = 0; i < node.value.length(); i++) { + if (node.value.charAt(i) > 255) { + hasWideChars = true; + break; + } + } + + if (!hasWideChars) { + int stringIndex = RuntimeScalarCache.getOrCreateByteStringIndex(node.value); + + if (stringIndex >= 0) { + // Use cached RuntimeScalar + mv.visitLdcInsn(stringIndex); + mv.visitMethodInsn( + Opcodes.INVOKESTATIC, + "org/perlonjava/runtime/runtimetypes/RuntimeScalarCache", + "getScalarByteString", + "(I)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;", + false); + return; + } else { + // String is too long for cache or null, create new object + mv.visitTypeInsn(Opcodes.NEW, "org/perlonjava/runtime/runtimetypes/RuntimeScalarReadOnly"); + mv.visitInsn(Opcodes.DUP); + emitStringValue(mv, node.value); + mv.visitMethodInsn( + Opcodes.INVOKESPECIAL, + "org/perlonjava/runtime/runtimetypes/RuntimeScalarReadOnly", + "", + "(Ljava/lang/String;)V", + false); + + // Set the Perl scalar type to BYTE_STRING + mv.visitInsn(Opcodes.DUP); + mv.visitLdcInsn(RuntimeScalarType.BYTE_STRING); + mv.visitFieldInsn(Opcodes.PUTFIELD, "org/perlonjava/runtime/runtimetypes/RuntimeScalarReadOnly", "type", "I"); + return; + } } + // Fall through to create UTF-8 string if hasWideChars } // Use cache for regular strings diff --git a/src/main/java/org/perlonjava/core/Configuration.java b/src/main/java/org/perlonjava/core/Configuration.java index c83adca3a..ba9d3e49f 100644 --- a/src/main/java/org/perlonjava/core/Configuration.java +++ b/src/main/java/org/perlonjava/core/Configuration.java @@ -33,7 +33,7 @@ public final class Configuration { * Automatically populated by Gradle/Maven during build. * DO NOT EDIT MANUALLY - this value is replaced at build time. */ - public static final String gitCommitId = "c8049e1a7"; + public static final String gitCommitId = "204b2f32c"; /** * Git commit date of the build (ISO format: YYYY-MM-DD). diff --git a/src/main/java/org/perlonjava/frontend/parser/StringParser.java b/src/main/java/org/perlonjava/frontend/parser/StringParser.java index d25fb1470..0497bc8b8 100644 --- a/src/main/java/org/perlonjava/frontend/parser/StringParser.java +++ b/src/main/java/org/perlonjava/frontend/parser/StringParser.java @@ -14,6 +14,8 @@ import java.util.Map; import static org.perlonjava.runtime.perlmodule.Strict.HINT_UTF8; +import static org.perlonjava.runtime.perlmodule.Strict.HINT_RE_ASCII; +import static org.perlonjava.runtime.perlmodule.Strict.HINT_RE_UNICODE; import static org.perlonjava.runtime.runtimetypes.ScalarUtils.printable; /* @@ -508,6 +510,20 @@ public static OperatorNode parseRegexReplace(EmitterContext ctx, ParsedString ra public static OperatorNode parseRegexMatch(EmitterContext ctx, String operator, ParsedString rawStr, Parser parser) { operator = operator.equals("qr") ? "quoteRegex" : "matchRegex"; String modStr = rawStr.buffers.get(1); + + // Add default modifiers from `use re` pragma if not already present + if (ctx.symbolTable != null) { + if (ctx.symbolTable.isStrictOptionEnabled(HINT_RE_ASCII)) { + if (!modStr.contains("a") && !modStr.contains("u")) { + modStr = "a" + modStr; + } + } else if (ctx.symbolTable.isStrictOptionEnabled(HINT_RE_UNICODE)) { + if (!modStr.contains("u") && !modStr.contains("a")) { + modStr = "u" + modStr; + } + } + } + Node parsed = parseRegexString(ctx, rawStr, parser, modStr); if (rawStr.startDelim == '?') { // `m?PAT?` matches exactly once diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/Re.java b/src/main/java/org/perlonjava/runtime/perlmodule/Re.java index 71326cfa2..f37a2c1ee 100644 --- a/src/main/java/org/perlonjava/runtime/perlmodule/Re.java +++ b/src/main/java/org/perlonjava/runtime/perlmodule/Re.java @@ -1,12 +1,37 @@ package org.perlonjava.runtime.perlmodule; +import org.perlonjava.frontend.semantic.ScopedSymbolTable; import org.perlonjava.runtime.runtimetypes.RuntimeArray; import org.perlonjava.runtime.runtimetypes.RuntimeList; import org.perlonjava.runtime.runtimetypes.RuntimeScalar; import org.perlonjava.runtime.runtimetypes.RuntimeScalarType; +import static org.perlonjava.frontend.parser.SpecialBlockParser.getCurrentScope; + /** * The Re class provides functionalities similar to the Perl re module. + * + *

Currently implemented features: + *

+ * + *

TODO: Features not yet implemented (see {@code perldoc re}): + *

*/ public class Re extends PerlModuleBase { @@ -48,35 +73,57 @@ public static RuntimeList isRegexp(RuntimeArray args, int ctx) { } /** - * Handle `use re ...` import. Recognizes: 'strict'. + * Handle `use re ...` import. Recognizes: 'strict', '/a', '/u', '/aa'. * Enables appropriate experimental warning categories so our regex preprocessor can emit them. */ public static RuntimeList importRe(RuntimeArray args, int ctx) { + ScopedSymbolTable symbolTable = getCurrentScope(); + for (int i = 0; i < args.size(); i++) { String opt = args.get(i).toString(); // Normalize quotes if present opt = opt.replace("\"", "").replace("'", "").trim(); + if (opt.equalsIgnoreCase("strict")) { // Enable categories used by our preprocessor warnings Warnings.warningManager.enableWarning("experimental::re_strict"); Warnings.warningManager.enableWarning("experimental::uniprop_wildcards"); Warnings.warningManager.enableWarning("experimental::vlb"); + } else if (opt.equals("/a")) { + // use re '/a' - ASCII-restrict regex character classes + symbolTable.enableStrictOption(Strict.HINT_RE_ASCII); + symbolTable.disableStrictOption(Strict.HINT_RE_UNICODE | Strict.HINT_RE_ASCII_AA); + } else if (opt.equals("/aa")) { + // use re '/aa' - Strict ASCII-restrict (also affects case folding) + symbolTable.enableStrictOption(Strict.HINT_RE_ASCII | Strict.HINT_RE_ASCII_AA); + symbolTable.disableStrictOption(Strict.HINT_RE_UNICODE); + } else if (opt.equals("/u")) { + // use re '/u' - Unicode semantics for regex + symbolTable.enableStrictOption(Strict.HINT_RE_UNICODE); + symbolTable.disableStrictOption(Strict.HINT_RE_ASCII | Strict.HINT_RE_ASCII_AA); } } return new RuntimeList(); } /** - * Handle `no re ...` unimport. Recognizes: 'strict'. + * Handle `no re ...` unimport. Recognizes: 'strict', '/a', '/u', '/aa'. */ public static RuntimeList unimportRe(RuntimeArray args, int ctx) { + ScopedSymbolTable symbolTable = getCurrentScope(); + for (int i = 0; i < args.size(); i++) { String opt = args.get(i).toString(); opt = opt.replace("\"", "").replace("'", "").trim(); + if (opt.equalsIgnoreCase("strict")) { Warnings.warningManager.disableWarning("experimental::re_strict"); Warnings.warningManager.disableWarning("experimental::uniprop_wildcards"); Warnings.warningManager.disableWarning("experimental::vlb"); + } else if (opt.equals("/a") || opt.equals("/aa")) { + symbolTable.disableStrictOption(Strict.HINT_RE_ASCII | Strict.HINT_RE_ASCII_AA); + } else if (opt.equals("/u")) { + symbolTable.disableStrictOption(Strict.HINT_RE_UNICODE); } } return new RuntimeList(); diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/Strict.java b/src/main/java/org/perlonjava/runtime/perlmodule/Strict.java index ff0325e80..4b4fff0ee 100644 --- a/src/main/java/org/perlonjava/runtime/perlmodule/Strict.java +++ b/src/main/java/org/perlonjava/runtime/perlmodule/Strict.java @@ -30,6 +30,11 @@ public class Strict extends PerlModuleBase { // Bitmask for utf8 source code public static final int HINT_UTF8 = 0x00800000; + // Bitmask for `use re` regex modifiers + public static final int HINT_RE_ASCII = 0x01000000; // use re '/a' + public static final int HINT_RE_UNICODE = 0x02000000; // use re '/u' + public static final int HINT_RE_ASCII_AA = 0x04000000; // use re '/aa' + /** * Constructor for Strict. * Initializes the module with the name "strict". diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/Utf8.java b/src/main/java/org/perlonjava/runtime/perlmodule/Utf8.java index 5c7382d22..d23a3d4d4 100644 --- a/src/main/java/org/perlonjava/runtime/perlmodule/Utf8.java +++ b/src/main/java/org/perlonjava/runtime/perlmodule/Utf8.java @@ -320,10 +320,18 @@ public static RuntimeList isUtf8(RuntimeArray args, int ctx) { throw new IllegalStateException("Bad number of arguments for is_utf8() method"); } RuntimeScalar scalar = args.get(0); - if (scalar.type == BYTE_STRING) { - return RuntimeScalarCache.scalarFalse.getList(); - } - return RuntimeScalarCache.scalarTrue.getList(); + return isUtf8(scalar) ? RuntimeScalarCache.scalarTrue.getList() : RuntimeScalarCache.scalarFalse.getList(); + } + + /** + * Internal helper to check if a scalar has the UTF-8 flag set. + * This is used by regex matching to determine whether to use Unicode semantics. + * + * @param scalar The scalar to check. + * @return true if the scalar is a UTF-8 string (not BYTE_STRING), false otherwise. + */ + public static boolean isUtf8(RuntimeScalar scalar) { + return scalar.type != BYTE_STRING; } /** diff --git a/src/main/java/org/perlonjava/runtime/perlmodule/Version.java b/src/main/java/org/perlonjava/runtime/perlmodule/Version.java index c3389fea6..8ea6142c0 100644 --- a/src/main/java/org/perlonjava/runtime/perlmodule/Version.java +++ b/src/main/java/org/perlonjava/runtime/perlmodule/Version.java @@ -7,6 +7,7 @@ import static org.perlonjava.runtime.runtimetypes.GlobalVariable.getGlobalVariable; import static org.perlonjava.runtime.runtimetypes.RuntimeScalarCache.*; import static org.perlonjava.runtime.runtimetypes.RuntimeScalarType.DOUBLE; +import static org.perlonjava.runtime.runtimetypes.RuntimeScalarType.VSTRING; // TODO - create test cases // $ perl -E ' use version; say version->declare("v1.2.3"); say version->declare("1.2.3"); say version->declare("1.2"); say version->declare("1.2.3.4"); say version->declare("1"); say version->declare(" 1.2.4 ")->normal; say version->new(1.2); say version->new(1.2)->normal; say version->new("1.200000"); say version->new("1.2"); ' @@ -63,48 +64,106 @@ public static void initialize() { * Parses a version string into a version object. */ public static RuntimeList parse(RuntimeArray args, int ctx) { + return parseInternal(args, ctx, false); + } + + /** + * Internal parse method with option to force qv mode. + * @param args The arguments array + * @param ctx The runtime context + * @param forceQv If true, always set qv=true (used by qv() function) + */ + private static RuntimeList parseInternal(RuntimeArray args, int ctx, boolean forceQv) { if (args.size() < 2) { throw new IllegalStateException("version->parse() requires an argument"); } RuntimeScalar versionStr = args.get(1); - String version = versionStr.toString(); - - version = version.trim(); - if (version.isEmpty()) { - throw new PerlCompilerException("Invalid version format (version required)"); - } + String version; // Preserve the original version string before any modifications - RuntimeScalar originalVersionStr = versionStr; + RuntimeScalar originalVersionStr; - // Track whether the original input was a v-string (starts with 'v') - boolean originalIsVString = version.startsWith("v"); + // Track whether the original input was a v-string + boolean isVString = false; - if (versionStr.type == DOUBLE) { - // Format with enough precision but strip trailing zeros - version = String.format("%.6f", versionStr.getDouble()); - // Remove trailing zeros after decimal point, but keep at least one decimal place - version = version.replaceAll("0+$", "").replaceAll("\\.$", ".0"); - // Actually, Perl keeps the exact representation, so just strip trailing zeros - if (version.contains(".")) { - version = version.replaceAll("0+$", ""); - // Remove trailing dot if all decimals were zeros (e.g., "1." -> "1") - if (version.endsWith(".")) { - version = version.substring(0, version.length() - 1); + // Handle VSTRING type (bare v-strings like v1.2.3) + if (versionStr.type == VSTRING) { + isVString = true; + // Convert VSTRING to dotted format + String vstringValue = versionStr.value.toString(); + StringBuilder dotted = new StringBuilder("v"); + for (int i = 0; i < vstringValue.length(); i++) { + if (i > 0) dotted.append("."); + dotted.append((int) vstringValue.charAt(i)); + } + version = dotted.toString(); + originalVersionStr = new RuntimeScalar(version); + } else { + version = versionStr.toString().trim(); + + if (version.isEmpty()) { + throw new PerlCompilerException("Invalid version format (version required)"); + } + + // Check if original starts with 'v' + isVString = version.startsWith("v"); + + // Validate version format - check for multiple underscores + int underscoreCount = 0; + for (char c : version.toCharArray()) { + if (c == '_') underscoreCount++; + } + if (underscoreCount > 1) { + throw new PerlCompilerException("Invalid version format (multiple underscores)"); + } + + // Validate version format - must contain at least one digit + // and be a valid version pattern (digits, dots, underscores, optional v prefix) + String checkVersion = isVString ? version.substring(1) : version; + checkVersion = checkVersion.replace("_", ""); + + // Version must start with a digit and only contain digits and dots + // (after removing v prefix and underscores) + if (!checkVersion.matches("\\d+(\\.\\d+)*")) { + throw new PerlCompilerException("Invalid version format (non-numeric data)"); + } + + if (versionStr.type == DOUBLE) { + // Format with enough precision but strip trailing zeros + version = String.format("%.6f", versionStr.getDouble()); + // Remove trailing zeros after decimal point + if (version.contains(".")) { + version = version.replaceAll("0+$", ""); + // Remove trailing dot if all decimals were zeros (e.g., "1." -> "1") + if (version.endsWith(".")) { + version = version.substring(0, version.length() - 1); + } } + originalVersionStr = new RuntimeScalar(version); + } else { + originalVersionStr = versionStr; } + } + + // For qv(), prepend 'v' if not already present and set original with v prefix + if (forceQv) { + isVString = true; + if (!version.startsWith("v")) { + version = "v" + version; + } + // For qv(), the original is the v-prefixed version originalVersionStr = new RuntimeScalar(version); } else if (!version.startsWith("v")) { // Count the number of dots long dotCount = version.chars().filter(ch -> ch == '.').count(); - // If exactly one dot, prepend "v" for internal processing + // If exactly one dot and short, prepend "v" for internal processing // but keep the original for stringify() and qv flag if (dotCount == 1 && version.length() < 4) { version = "v" + version; // Note: originalVersionStr stays as the user's input (e.g., "1.0") - // Note: originalIsVString remains false - this is a decimal version + // Note: isVString remains false - this is a decimal version } } @@ -112,12 +171,10 @@ public static RuntimeList parse(RuntimeArray args, int ctx) { RuntimeHash versionObj = new RuntimeHash(); // Parse the version string - // Use originalIsVString to determine qv, not the modified version string if (version.startsWith("v")) { // v-string format (either originally or for internal processing) versionObj.put("alpha", scalarFalse); - // qv is true only if the ORIGINAL input was a v-string - versionObj.put("qv", getScalarBoolean(originalIsVString)); + versionObj.put("qv", getScalarBoolean(isVString)); // Parse components String normalized = VersionHelper.normalizeVersion(new RuntimeScalar(version)); @@ -163,17 +220,18 @@ public static RuntimeList declare(RuntimeArray args, int ctx) { /** * qv() - creates a dotted-decimal version object. * Always receives class name as first argument due to how it's exported. + * qv() always sets is_qv to true, ensuring the version is treated as a v-string. */ public static RuntimeList qv(RuntimeArray args, int ctx) { if (args.isEmpty()) { throw new IllegalStateException("qv() requires an argument"); } - // Create version object via parse + // Create version object via parseInternal with forceQv=true RuntimeArray parseArgs = new RuntimeArray(); parseArgs.push(new RuntimeScalar("version")); // class name parseArgs.push(RuntimeArray.pop(args)); - return parse(parseArgs, ctx); + return parseInternal(parseArgs, ctx, true); // forceQv=true } /** diff --git a/src/main/java/org/perlonjava/runtime/regex/RegexFlags.java b/src/main/java/org/perlonjava/runtime/regex/RegexFlags.java index 1c5eb5906..a7ee22d30 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RegexFlags.java +++ b/src/main/java/org/perlonjava/runtime/regex/RegexFlags.java @@ -19,11 +19,13 @@ * @param isExtended x flag - ignore whitespace and # comments in pattern * @param preservesMatch p flag - preserve match after failed matches * @param isUnicode u flag - Unicode semantics (\w, \d, \s match Unicode) + * @param isAscii a flag - ASCII-restrict (\w, \d, \s match only ASCII) */ public record RegexFlags(boolean isGlobalMatch, boolean keepCurrentPosition, boolean isNonDestructive, boolean isMatchExactlyOnce, boolean useGAssertion, boolean isExtendedWhitespace, boolean isNonCapturing, boolean isOptimized, boolean isCaseInsensitive, boolean isMultiLine, - boolean isDotAll, boolean isExtended, boolean preservesMatch, boolean isUnicode) { + boolean isDotAll, boolean isExtended, boolean preservesMatch, boolean isUnicode, + boolean isAscii) { public static RegexFlags fromModifiers(String modifiers, String patternString) { return new RegexFlags( @@ -40,7 +42,8 @@ public static RegexFlags fromModifiers(String modifiers, String patternString) { modifiers.contains("s"), modifiers.contains("x"), modifiers.contains("p"), - modifiers.contains("u") + modifiers.contains("u"), + modifiers.contains("a") ); } @@ -67,16 +70,18 @@ public int toPatternFlags() { int flags = 0; // /u flag enables Unicode semantics for \w, \d, \s - if (isUnicode) { + // /a flag (ASCII-restrict) disables Unicode semantics + if (isUnicode && !isAscii) { flags |= UNICODE_CHARACTER_CLASS; } if (isCaseInsensitive) { - // For proper Unicode case-insensitive matching, we need both flags: - // - CASE_INSENSITIVE: enables case-insensitive matching - // - UNICODE_CASE: enables Unicode-aware case folding (not just ASCII) - // Without UNICODE_CASE, only ASCII A-Z matches a-z - flags |= CASE_INSENSITIVE | UNICODE_CASE; + flags |= CASE_INSENSITIVE; + // For Unicode case-insensitive matching, add UNICODE_CASE + // But NOT if /a flag (ASCII-restrict) is set - /a restricts case folding to ASCII + if (!isAscii) { + flags |= UNICODE_CASE; + } } if (isMultiLine) { flags |= MULTILINE; @@ -98,6 +103,7 @@ public RegexFlags with(String positiveFlags, String negativeFlags) { boolean newIsExtended = this.isExtended; boolean newPreservesMatch = this.preservesMatch; boolean newIsUnicode = this.isUnicode; + boolean newIsAscii = this.isAscii; // Handle positive flags if (positiveFlags.indexOf('n') >= 0) newFlagN = true; @@ -107,6 +113,7 @@ public RegexFlags with(String positiveFlags, String negativeFlags) { if (positiveFlags.indexOf('x') >= 0) newIsExtended = true; if (positiveFlags.indexOf('p') >= 0) newPreservesMatch = true; if (positiveFlags.indexOf('u') >= 0) newIsUnicode = true; + if (positiveFlags.indexOf('a') >= 0) newIsAscii = true; // Handle negative flags if (negativeFlags.indexOf('n') >= 0) newFlagN = false; @@ -115,6 +122,7 @@ public RegexFlags with(String positiveFlags, String negativeFlags) { if (negativeFlags.indexOf('s') >= 0) newIsDotAll = false; if (negativeFlags.indexOf('x') >= 0) newIsExtended = false; if (negativeFlags.indexOf('u') >= 0) newIsUnicode = false; + if (negativeFlags.indexOf('a') >= 0) newIsAscii = false; return new RegexFlags( this.isGlobalMatch, @@ -130,7 +138,8 @@ public RegexFlags with(String positiveFlags, String negativeFlags) { newIsDotAll, newIsExtended, newPreservesMatch, - newIsUnicode + newIsUnicode, + newIsAscii ); } @@ -146,6 +155,7 @@ public String toFlagString() { if (isExtended) flagString.append('x'); if (isNonDestructive) flagString.append('r'); if (isUnicode) flagString.append('u'); + if (isAscii) flagString.append('a'); return flagString.toString(); } diff --git a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java index 73d53b256..09bea7374 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java +++ b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessor.java @@ -801,7 +801,7 @@ static int handleRegex(String s, int offset, StringBuilder sb, RegexFlags regexF break; case '\\': // Handle escape sequences - offset = RegexPreprocessorHelper.handleEscapeSequences(s, sb, c, offset); + offset = RegexPreprocessorHelper.handleEscapeSequences(s, sb, c, offset, regexFlags); lastWasQuantifiable = true; break; diff --git a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java index 11e33ef78..bfa999900 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java +++ b/src/main/java/org/perlonjava/runtime/regex/RegexPreprocessorHelper.java @@ -7,7 +7,7 @@ import static org.perlonjava.runtime.regex.UnicodeResolver.translateUnicodeProperty; public class RegexPreprocessorHelper { - static int handleEscapeSequences(String s, StringBuilder sb, int c, int offset) { + static int handleEscapeSequences(String s, StringBuilder sb, int c, int offset, RegexFlags regexFlags) { sb.append(Character.toChars(c)); // This appends the backslash final int length = s.length(); @@ -171,25 +171,35 @@ static int handleEscapeSequences(String s, StringBuilder sb, int c, int offset) return end - 1; } } else if (nextChar == 's' || nextChar == 'S') { - // Perl's \s matches Unicode whitespace, Java's \s only matches ASCII whitespace - // Expand \s to match all Perl whitespace characters: - // \t \n \f \r space (ASCII: 09-0D, 20) - // U+000B (vertical tab - Perl includes this) - // U+1680 (OGHAM SPACE MARK) - // U+2000-U+200A (EN QUAD through HAIR SPACE) - // U+2028 (LINE SEPARATOR) - // U+2029 (PARAGRAPH SEPARATOR) - // U+202F (NARROW NO-BREAK SPACE) - // U+205F (MEDIUM MATHEMATICAL SPACE) - // U+3000 (IDEOGRAPHIC SPACE) + // Handle \s and \S based on ASCII mode sb.setLength(sb.length() - 1); // Remove the backslash - if (nextChar == 's') { - // Positive: matches whitespace - // Use \x20 instead of literal space to avoid issues with /x modifier - sb.append("[\\t\\n\\u000B\\f\\r\\x20\\u1680\\u2000-\\u200A\\u2028\\u2029\\u202F\\u205F\\u3000]"); + if (regexFlags.isAscii()) { + // ASCII mode: \s matches only ASCII whitespace + if (nextChar == 's') { + sb.append("[\\t\\n\\u000B\\f\\r\\x20]"); + } else { + sb.append("[^\\t\\n\\u000B\\f\\r\\x20]"); + } } else { - // Negative: matches non-whitespace - sb.append("[^\\t\\n\\u000B\\f\\r\\x20\\u1680\\u2000-\\u200A\\u2028\\u2029\\u202F\\u205F\\u3000]"); + // Unicode mode: Perl's \s matches Unicode whitespace + // Expand \s to match all Perl whitespace characters: + // \t \n \f \r space (ASCII: 09-0D, 20) + // U+000B (vertical tab - Perl includes this) + // U+1680 (OGHAM SPACE MARK) + // U+2000-U+200A (EN QUAD through HAIR SPACE) + // U+2028 (LINE SEPARATOR) + // U+2029 (PARAGRAPH SEPARATOR) + // U+202F (NARROW NO-BREAK SPACE) + // U+205F (MEDIUM MATHEMATICAL SPACE) + // U+3000 (IDEOGRAPHIC SPACE) + if (nextChar == 's') { + // Positive: matches whitespace + // Use \x20 instead of literal space to avoid issues with /x modifier + sb.append("[\\t\\n\\u000B\\f\\r\\x20\\u1680\\u2000-\\u200A\\u2028\\u2029\\u202F\\u205F\\u3000]"); + } else { + // Negative: matches non-whitespace + sb.append("[^\\t\\n\\u000B\\f\\r\\x20\\u1680\\u2000-\\u200A\\u2028\\u2029\\u202F\\u205F\\u3000]"); + } } return offset; } else if (nextChar == 'h') { @@ -367,6 +377,24 @@ static int handleEscapeSequences(String s, StringBuilder sb, int c, int offset) } else { RegexPreprocessor.regexError(s, offset, "Missing right brace on \\o{}"); } + } else if ((nextChar == 'w' || nextChar == 'W' || nextChar == 'd' || nextChar == 'D') && regexFlags.isAscii()) { + // In ASCII mode (/a flag), restrict \w, \W, \d, \D to ASCII only + sb.setLength(sb.length() - 1); // Remove the backslash + switch (nextChar) { + case 'w': + sb.append("[a-zA-Z0-9_]"); + break; + case 'W': + sb.append("[^a-zA-Z0-9_]"); + break; + case 'd': + sb.append("[0-9]"); + break; + case 'D': + sb.append("[^0-9]"); + break; + } + return offset; } else { int c2 = s.codePointAt(offset); if (c2 >= '0' && c2 <= '7') { diff --git a/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java b/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java index 0e9188b25..28b8d8891 100644 --- a/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java +++ b/src/main/java/org/perlonjava/runtime/regex/RuntimeRegex.java @@ -2,6 +2,7 @@ import org.perlonjava.runtime.operators.Time; import org.perlonjava.runtime.operators.WarnDie; +import org.perlonjava.runtime.perlmodule.Utf8; import org.perlonjava.runtime.runtimetypes.*; import java.util.Iterator; @@ -62,9 +63,12 @@ protected boolean removeEldestEntry(Map.Entry eldest) { // Capture groups from the last successful match that had captures. // In Perl 5, $1/$2/etc persist across non-capturing matches. public static String[] lastCaptureGroups = null; - // Compiled regex pattern + // Compiled regex pattern (for byte strings - ASCII-only \w, \d) public Pattern pattern; + // Compiled regex pattern for Unicode strings (Unicode \w, \d) + public Pattern patternUnicode; int patternFlags; + int patternFlagsUnicode; String patternString; boolean hasPreservesMatch = false; // True if /p was used (outer or inline (?p)) // Indicates if \G assertion is used (set from regexFlags during compilation) @@ -118,6 +122,15 @@ public static RuntimeRegex compile(String patternString, String modifiers) { regex.regexFlags = fromModifiers(modifiers, patternString); regex.useGAssertion = regex.regexFlags.useGAssertion(); regex.patternFlags = regex.regexFlags.toPatternFlags(); + + // Always compute Unicode flags - we need the Unicode variant for when + // the input string contains non-ASCII characters (auto-Unicode detection) + // Only skip Unicode variant if /a flag is explicitly used + if (!regex.regexFlags.isAscii()) { + regex.patternFlagsUnicode = regex.patternFlags | Pattern.UNICODE_CHARACTER_CLASS; + } else { + regex.patternFlagsUnicode = regex.patternFlags; + } String javaPattern = null; try { @@ -135,8 +148,16 @@ public static RuntimeRegex compile(String patternString, String modifiers) { regex.patternString = patternString; - // Compile the regex pattern + // Compile the regex pattern for byte strings (ASCII-only \w, \d) regex.pattern = Pattern.compile(javaPattern, regex.patternFlags); + + // Compile the Unicode variant for Unicode strings + // Only compile separately if the flags differ (saves memory when /a or /u is used) + if (regex.patternFlagsUnicode != regex.patternFlags) { + regex.patternUnicode = Pattern.compile(javaPattern, regex.patternFlagsUnicode); + } else { + regex.patternUnicode = regex.pattern; + } // Check if pattern has code block captures for $^R optimization // Code blocks are encoded as named captures like (?) @@ -164,6 +185,7 @@ public static RuntimeRegex compile(String patternString, String modifiers) { } WarnDie.warn(new RuntimeScalar(errorMessage), new RuntimeScalar()); regex.pattern = Pattern.compile(Character.toString(0) + "ERROR" + Character.toString(0), Pattern.DOTALL); + regex.patternUnicode = regex.pattern; // Error pattern - same for both } else { if (e instanceof PerlCompilerException) { throw e; @@ -195,6 +217,7 @@ private static RuntimeRegex ensureCompiledForRuntime(RuntimeRegex regex) { regex.deferredUserDefinedUnicodeProperties = false; RuntimeRegex recompiled = compile(regex.patternString, regex.regexFlags == null ? "" : regex.regexFlags.toFlagString()); regex.pattern = recompiled.pattern; + regex.patternUnicode = recompiled.patternUnicode; regex.patternFlags = recompiled.patternFlags; regex.regexFlags = recompiled.regexFlags; regex.useGAssertion = recompiled.useGAssertion; @@ -266,6 +289,7 @@ public static RuntimeScalar getQuotedRegex(RuntimeScalar patternString, RuntimeS // Create a new regex with merged flags RuntimeRegex regex = new RuntimeRegex(); regex.pattern = originalRegex.pattern; + regex.patternUnicode = originalRegex.patternUnicode; regex.patternString = originalRegex.patternString; regex.hasPreservesMatch = originalRegex.hasPreservesMatch; regex.regexFlags = mergeRegexFlags(originalRegex.regexFlags, modifierStr, originalRegex.patternString); @@ -294,6 +318,7 @@ public static RuntimeScalar getQuotedRegex(RuntimeScalar patternString, RuntimeS // Create a new regex with merged flags RuntimeRegex regex = new RuntimeRegex(); regex.pattern = originalRegex.pattern; + regex.patternUnicode = originalRegex.patternUnicode; regex.patternString = originalRegex.patternString; regex.hasPreservesMatch = originalRegex.hasPreservesMatch; regex.regexFlags = mergeRegexFlags(originalRegex.regexFlags, modifierStr, originalRegex.patternString); @@ -366,6 +391,7 @@ public static RuntimeScalar getReplacementRegex(RuntimeScalar patternString, Run // Always start with the resolved regex properties regex.pattern = resolvedRegex.pattern; + regex.patternUnicode = resolvedRegex.patternUnicode; regex.patternString = resolvedRegex.patternString; regex.regexFlags = resolvedRegex.regexFlags; regex.hasPreservesMatch = resolvedRegex.hasPreservesMatch; @@ -388,6 +414,7 @@ public static RuntimeScalar getReplacementRegex(RuntimeScalar patternString, Run if (flagsChanged) { RuntimeRegex recompiledRegex = compile(resolvedRegex.patternString, newFlags.toFlagString()); regex.pattern = recompiledRegex.pattern; + regex.patternUnicode = recompiledRegex.patternUnicode; regex.patternString = recompiledRegex.patternString; regex.regexFlags = recompiledRegex.regexFlags; regex.hasPreservesMatch = recompiledRegex.hasPreservesMatch; @@ -461,6 +488,7 @@ private static RuntimeBase matchRegexDirect(RuntimeScalar quotedRegex, RuntimeSc // Create a temporary regex with the right pattern and current flags RuntimeRegex tempRegex = new RuntimeRegex(); tempRegex.pattern = pattern; + tempRegex.patternUnicode = lastSuccessfulPattern.patternUnicode; tempRegex.patternString = lastSuccessfulPattern.patternString; tempRegex.hasPreservesMatch = lastSuccessfulPattern.hasPreservesMatch || (originalFlags != null && originalFlags.preservesMatch()); tempRegex.regexFlags = originalFlags; @@ -489,30 +517,52 @@ private static RuntimeBase matchRegexDirect(RuntimeScalar quotedRegex, RuntimeSc Pattern pattern = regex.pattern; String inputStr = string.toString(); + + // Select appropriate pattern based on string's UTF-8 flag and content: + // - /a flag or inline (?a): always use ASCII-only pattern + // - BYTE_STRING: use ASCII-only pattern (Perl's "bytes" semantics) + // - UTF-8 string with Unicode chars (> 255): use Unicode pattern + // - UTF-8 string with only Latin-1 chars: use ASCII pattern (avoids false matches) + // This mimics Perl's behavior where \w, \d, \s semantics depend on UTF-8 flag + if (regex.patternUnicode != null && regex.patternUnicode != regex.pattern) { + if (regex.regexFlags != null && regex.regexFlags.isAscii()) { + // /a flag - always ASCII + pattern = regex.pattern; + } else if (hasInlineAsciiModifier(regex.patternString)) { + // Inline (?a...) in pattern - use ASCII to be safe + pattern = regex.pattern; + } else if (Utf8.isUtf8(string) && RuntimePosLvalue.hasUnicodeChars(string, inputStr)) { + // UTF-8 string with true Unicode content (> 255) - use Unicode matching + pattern = regex.patternUnicode; + } + // else: BYTE_STRING or Latin-1 only content - keep ASCII pattern (default) + } + CharSequence matchInput = new RegexTimeoutCharSequence(inputStr); Matcher matcher = pattern.matcher(matchInput); // hexPrinter(inputStr); - // Use RuntimePosLvalue to get the current position - RuntimeScalar posScalar = RuntimePosLvalue.pos(string); - boolean isPosDefined = posScalar.getDefinedBoolean(); - int startPos = isPosDefined ? posScalar.getInt() : 0; - - // Only use pos() for /g matches - non-/g matches always start from 0 - if (!regex.regexFlags.isGlobalMatch()) { - isPosDefined = false; - startPos = 0; - } - - // Check if previous call had zero-length match at this position (for SCALAR context) - // This prevents infinite loops in: while ($str =~ /pat/g) - if (regex.regexFlags.isGlobalMatch() && ctx == RuntimeContextType.SCALAR) { - String patternKey = regex.patternString; - if (RuntimePosLvalue.hadZeroLengthMatchAt(string, startPos, patternKey)) { - // Previous match was zero-length at this position - fail to break loop - posScalar.set(scalarUndef); - return RuntimeScalarCache.scalarFalse; + // Only look up pos() for /g matches - non-/g matches always start from 0 + RuntimeScalar posScalar = null; + boolean isPosDefined = false; + int startPos = 0; + + if (regex.regexFlags.isGlobalMatch()) { + // Use RuntimePosLvalue to get the current position + posScalar = RuntimePosLvalue.pos(string); + isPosDefined = posScalar.getDefinedBoolean(); + startPos = isPosDefined ? posScalar.getInt() : 0; + + // Check if previous call had zero-length match at this position (for SCALAR context) + // This prevents infinite loops in: while ($str =~ /pat/g) + if (ctx == RuntimeContextType.SCALAR) { + String patternKey = regex.patternString; + if (RuntimePosLvalue.hadZeroLengthMatchAt(string, startPos, patternKey)) { + // Previous match was zero-length at this position - fail to break loop + posScalar.set(scalarUndef); + return RuntimeScalarCache.scalarFalse; + } } } @@ -597,17 +647,22 @@ private static RuntimeBase matchRegexDirect(RuntimeScalar quotedRegex, RuntimeSc if (ctx == RuntimeContextType.SCALAR || ctx == RuntimeContextType.VOID) { // Set pos to the end of the current match to prepare for the next search - posScalar.set(matchEnd); - // Record zero-length match for cross-call tracking - if (matchEnd == matchStart) { - RuntimePosLvalue.recordZeroLengthMatch(string, matchEnd, regex.patternString); - } else { - RuntimePosLvalue.recordNonZeroLengthMatch(string); + // (only for global matches - posScalar is null for non-global) + if (posScalar != null) { + posScalar.set(matchEnd); + // Record zero-length match for cross-call tracking + if (matchEnd == matchStart) { + RuntimePosLvalue.recordZeroLengthMatch(string, matchEnd, regex.patternString); + } else { + RuntimePosLvalue.recordNonZeroLengthMatch(string); + } } break; // Break out of the loop after the first match in SCALAR context } else { startPos = matchEnd; - posScalar.set(startPos); + if (posScalar != null) { + posScalar.set(startPos); + } // Update matcher region if we advanced past a zero-length match if (startPos > matchStart) { matcher.region(startPos, inputStr.length()); @@ -625,7 +680,7 @@ private static RuntimeBase matchRegexDirect(RuntimeScalar quotedRegex, RuntimeSc } // Reset pos() on failed match with /g, unless /c is set - if (!found && regex.regexFlags.isGlobalMatch() && !regex.regexFlags.keepCurrentPosition()) { + if (!found && regex.regexFlags.isGlobalMatch() && !regex.regexFlags.keepCurrentPosition() && posScalar != null) { posScalar.set(scalarUndef); } @@ -666,7 +721,7 @@ private static RuntimeBase matchRegexDirect(RuntimeScalar quotedRegex, RuntimeSc } // Reset pos() after global match in LIST context (matches Perl behavior) - if (regex.regexFlags.isGlobalMatch() && ctx == RuntimeContextType.LIST) { + if (regex.regexFlags.isGlobalMatch() && ctx == RuntimeContextType.LIST && posScalar != null) { posScalar.set(scalarUndef); } // System.err.println("DEBUG: Match completed, globalMatcher is " + (globalMatcher == null ? "null" : "set")); @@ -778,6 +833,7 @@ public static RuntimeBase replaceRegex(RuntimeScalar quotedRegex, RuntimeScalar // Create a temporary regex with the right pattern and current flags RuntimeRegex tempRegex = new RuntimeRegex(); tempRegex.pattern = pattern; + tempRegex.patternUnicode = lastSuccessfulPattern.patternUnicode; tempRegex.patternString = lastSuccessfulPattern.patternString; tempRegex.hasPreservesMatch = lastSuccessfulPattern.hasPreservesMatch || (originalFlags != null && originalFlags.preservesMatch()); tempRegex.regexFlags = originalFlags; @@ -790,6 +846,7 @@ public static RuntimeBase replaceRegex(RuntimeScalar quotedRegex, RuntimeScalar RuntimeRegex tempRegex = new RuntimeRegex(); int flags = originalFlags != null ? originalFlags.toPatternFlags() : 0; tempRegex.pattern = Pattern.compile("", flags); + tempRegex.patternUnicode = tempRegex.pattern; // Empty pattern - same for both tempRegex.patternString = ""; tempRegex.regexFlags = originalFlags; tempRegex.useGAssertion = originalFlags != null && originalFlags.useGAssertion(); @@ -799,6 +856,22 @@ public static RuntimeBase replaceRegex(RuntimeScalar quotedRegex, RuntimeScalar } Pattern pattern = regex.pattern; + + // Select appropriate pattern based on string's UTF-8 flag and content (same logic as matchRegex) + if (regex.patternUnicode != null && regex.patternUnicode != regex.pattern) { + if (regex.regexFlags != null && regex.regexFlags.isAscii()) { + // /a flag - always ASCII + pattern = regex.pattern; + } else if (hasInlineAsciiModifier(regex.patternString)) { + // Inline (?a...) in pattern - use ASCII to be safe + pattern = regex.pattern; + } else if (Utf8.isUtf8(string) && RuntimePosLvalue.hasUnicodeChars(string, inputStr)) { + // UTF-8 string with true Unicode content (> 255) - use Unicode matching + pattern = regex.patternUnicode; + } + // else: BYTE_STRING or Latin-1 only content - keep ASCII pattern (default) + } + CharSequence matchInput = new RegexTimeoutCharSequence(inputStr); Matcher matcher = pattern.matcher(matchInput); @@ -1029,6 +1102,70 @@ public static int matcherSize() { return globalMatcher.groupCount() + 1; } + /** + * Check if a string contains any non-ASCII characters (code point > 127). + * Used to determine if Unicode matching should be used. + * + * @param s The string to check + * @return true if the string contains non-ASCII characters + */ + private static boolean hasNonAscii(String s) { + for (int i = 0; i < s.length(); i++) { + if (s.charAt(i) > 127) { + return true; // Early exit at first non-ASCII + } + } + return false; + } + + /** + * Check if a string contains any Unicode characters (code point > 255). + * Characters 128-255 are extended ASCII and don't require Unicode semantics. + * Characters > 255 are true Unicode and should trigger Unicode \w, \d, \s. + * + * @param s The string to check + * @return true if the string contains Unicode characters (> 255) + */ + private static boolean hasUnicodeChars(String s) { + for (int i = 0; i < s.length(); i++) { + if (s.charAt(i) > 255) { + return true; // Early exit at first Unicode char + } + } + return false; + } + + /** + * Check if a pattern contains inline ASCII modifier (?a...). + * When present, we should use ASCII matching even for UTF-8 strings with non-ASCII content. + * + * @param pattern The pattern string to check + * @return true if the pattern contains inline (?a...) modifier + */ + private static boolean hasInlineAsciiModifier(String pattern) { + if (pattern == null) { + return false; + } + // Check for (?a...) inline modifier - matches (?a, (?a:, (?ai, (?ia, etc. + // The 'a' must appear in the modifier position after (? + int idx = 0; + while ((idx = pattern.indexOf("(?", idx)) >= 0) { + idx += 2; + // Scan modifier characters until we hit : or ) + while (idx < pattern.length()) { + char c = pattern.charAt(idx); + if (c == 'a') { + return true; // Found inline ASCII modifier + } + if (c == ':' || c == ')' || c == '-' || c == '<' || c == '=' || c == '!' || c == '{' || c == '#') { + break; // End of modifier section + } + idx++; + } + } + return false; + } + /** * Resolves a scalar to a RuntimeRegex, handling qr overloading if necessary. * diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/GlobalContext.java b/src/main/java/org/perlonjava/runtime/runtimetypes/GlobalContext.java index 999ce9250..bd73c35b5 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/GlobalContext.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/GlobalContext.java @@ -84,7 +84,14 @@ public static void initializeGlobals(CompilerOptions compilerOptions) { GlobalVariable.getGlobalVariable("main::0").set(compilerOptions.fileName); } GlobalVariable.getGlobalVariable(GLOBAL_PHASE).set(""); // ${^GLOBAL_PHASE} - GlobalVariable.globalVariables.put(encodeSpecialVar("TAINT"), RuntimeScalarCache.scalarZero); // ${^TAINT} - read-only, always 0 (taint mode not implemented) + // ${^TAINT} - set to 1 if -T (taint mode) was specified, 0 otherwise + // Only initialize if not already set (to avoid overwriting during re-initialization) + String taintVarName = encodeSpecialVar("TAINT"); + if (!GlobalVariable.globalVariables.containsKey(taintVarName) || + (compilerOptions.taintMode && GlobalVariable.globalVariables.get(taintVarName) == RuntimeScalarCache.scalarZero)) { + GlobalVariable.globalVariables.put(taintVarName, + compilerOptions.taintMode ? RuntimeScalarCache.scalarOne : RuntimeScalarCache.scalarZero); + } GlobalVariable.getGlobalVariable("main::>"); // TODO GlobalVariable.getGlobalVariable("main::<"); // TODO GlobalVariable.getGlobalVariable("main::;").set("\034"); // initialize $; (SUBSEP) to \034 diff --git a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimePosLvalue.java b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimePosLvalue.java index 9ac8697b9..4474af17c 100644 --- a/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimePosLvalue.java +++ b/src/main/java/org/perlonjava/runtime/runtimetypes/RuntimePosLvalue.java @@ -175,6 +175,7 @@ private static class CacheEntry { boolean lastMatchWasZeroLength; // Track if last match was zero-length int lastMatchPosition; // Position of last zero-length match String lastMatchPattern; // Pattern that had the zero-length match + Boolean hasUnicodeChars; // Cached result of Unicode character check (null = not computed) CacheEntry(int valueHash, RuntimeScalar regexPosition) { this.valueHash = valueHash; @@ -182,6 +183,54 @@ private static class CacheEntry { this.lastMatchWasZeroLength = false; this.lastMatchPosition = -1; this.lastMatchPattern = null; + this.hasUnicodeChars = null; } } + + /** + * Check if a string contains Unicode characters (code points > 255). + * Results are cached per-scalar to avoid re-scanning on every regex match. + * + * @param perlVariable the scalar to check + * @param stringValue the string value (already extracted from the scalar) + * @return true if the string contains characters > 255 + */ + public static boolean hasUnicodeChars(RuntimeScalar perlVariable, String stringValue) { + if (perlVariable == null || stringValue == null) { + return false; + } + + CacheEntry cachedEntry = positionCache.get(perlVariable); + // Use the same hash calculation as pos() for consistency + int code = perlVariable.value == null ? 0 : perlVariable.value.hashCode(); + + // If cache entry exists and value hasn't changed, use cached result + if (cachedEntry != null && cachedEntry.valueHash == code && cachedEntry.hasUnicodeChars != null) { + return cachedEntry.hasUnicodeChars; + } + + // Compute hasUnicodeChars + boolean result = false; + for (int i = 0; i < stringValue.length(); i++) { + if (stringValue.charAt(i) > 255) { + result = true; + break; + } + } + + // Cache the result - but only update hasUnicodeChars, don't replace the whole entry + // if only hasUnicodeChars was missing (to preserve pos) + if (cachedEntry != null && cachedEntry.valueHash == code) { + // Entry exists with same hash, just update hasUnicodeChars + cachedEntry.hasUnicodeChars = result; + } else { + // Need to create new cache entry (value changed or no entry) + RuntimeScalar position = new PosLvalueScalar(perlVariable); + cachedEntry = new CacheEntry(code, position); + cachedEntry.hasUnicodeChars = result; + positionCache.put(perlVariable, cachedEntry); + } + + return result; + } } \ No newline at end of file diff --git a/src/main/perl/lib/Config.pm b/src/main/perl/lib/Config.pm index d81dd171b..be61c83c6 100644 --- a/src/main/perl/lib/Config.pm +++ b/src/main/perl/lib/Config.pm @@ -87,6 +87,10 @@ $os_name =~ s/\s+/_/g; # Compiler settings (Java instead of C) cc => 'javac', ld => 'javac', + # ccflags includes -DSILENT_NO_TAINT_SUPPORT because PerlOnJava does not + # implement full taint checking. This allows tests that check for taint + # support to skip gracefully. + ccflags => '-DSILENT_NO_TAINT_SUPPORT', # Library/path configuration path_sep => $path_separator,