Skip to content
Merged
23 changes: 23 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,29 @@ Example format at the end of a design doc:
./jperl --int -e 'code' # Interpreter
```

### Perl Test Runner

Use `dev/tools/perl_test_runner.pl` to run Perl test files and get pass/fail counts. **Run with `perl` (not `jperl`)** because it needs fork support.

```bash
# Run specific test files
perl dev/tools/perl_test_runner.pl perl5_t/t/re/regexp.t perl5_t/t/op/utfhash.t

# Run all tests in a directory
perl dev/tools/perl_test_runner.pl perl5_t/t/op/

# Common test directories
perl dev/tools/perl_test_runner.pl perl5_t/t/re/ # Regex tests
perl dev/tools/perl_test_runner.pl perl5_t/t/op/ # Operator tests
perl dev/tools/perl_test_runner.pl perl5_t/t/uni/ # Unicode tests
```

The runner:
- Executes tests in parallel (5 jobs by default)
- Has a 300s timeout per test
- Reports pass/fail counts in the format `passed/total`
- Saves results to `test_results_YYYYMMDD_HHMMSS.txt`

### Git Workflow

**IMPORTANT: Never push directly to master. Always use feature branches and PRs.**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1067,7 +1067,16 @@ public void visit(StringNode node) {
} else if (emitterContext != null && emitterContext.symbolTable != null
&& !emitterContext.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8)
&& !emitterContext.compilerOptions.isUnicodeSource) {
opcode = Opcodes.LOAD_BYTE_STRING;
// Under `no utf8` - use BYTE_STRING, unless it contains wide characters (> 255)
// Wide characters (like \x{100}) force the string to be UTF-8 even without `use utf8`
boolean hasWideChars = false;
for (int i = 0; i < node.value.length(); i++) {
if (node.value.charAt(i) > 255) {
hasWideChars = true;
break;
}
}
opcode = hasWideChars ? Opcodes.LOAD_STRING : Opcodes.LOAD_BYTE_STRING;
} else {
opcode = Opcodes.LOAD_STRING;
}
Expand Down
73 changes: 42 additions & 31 deletions src/main/java/org/perlonjava/backend/jvm/EmitLiteral.java
Original file line number Diff line number Diff line change
Expand Up @@ -199,38 +199,49 @@ public static void emitString(EmitterContext ctx, StringNode node) {
}

if (!ctx.symbolTable.isStrictOptionEnabled(HINT_UTF8) && !ctx.compilerOptions.isUnicodeSource) {
// Under `no utf8` - create a octet string

int stringIndex = RuntimeScalarCache.getOrCreateByteStringIndex(node.value);

if (stringIndex >= 0) {
// Use cached RuntimeScalar
mv.visitLdcInsn(stringIndex);
mv.visitMethodInsn(
Opcodes.INVOKESTATIC,
"org/perlonjava/runtime/runtimetypes/RuntimeScalarCache",
"getScalarByteString",
"(I)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;",
false);
return;
} else {
// String is too long for cache or null, create new object
mv.visitTypeInsn(Opcodes.NEW, "org/perlonjava/runtime/runtimetypes/RuntimeScalarReadOnly");
mv.visitInsn(Opcodes.DUP);
emitStringValue(mv, node.value);
mv.visitMethodInsn(
Opcodes.INVOKESPECIAL,
"org/perlonjava/runtime/runtimetypes/RuntimeScalarReadOnly",
"<init>",
"(Ljava/lang/String;)V",
false);

// Set the Perl scalar type to BYTE_STRING
mv.visitInsn(Opcodes.DUP);
mv.visitLdcInsn(RuntimeScalarType.BYTE_STRING);
mv.visitFieldInsn(Opcodes.PUTFIELD, "org/perlonjava/runtime/runtimetypes/RuntimeScalarReadOnly", "type", "I");
return;
// Under `no utf8` - create an octet string, unless it contains wide characters (> 255)
// Wide characters (like \x{100}) force the string to be UTF-8 even without `use utf8`
boolean hasWideChars = false;
for (int i = 0; i < node.value.length(); i++) {
if (node.value.charAt(i) > 255) {
hasWideChars = true;
break;
}
}

if (!hasWideChars) {
int stringIndex = RuntimeScalarCache.getOrCreateByteStringIndex(node.value);

if (stringIndex >= 0) {
// Use cached RuntimeScalar
mv.visitLdcInsn(stringIndex);
mv.visitMethodInsn(
Opcodes.INVOKESTATIC,
"org/perlonjava/runtime/runtimetypes/RuntimeScalarCache",
"getScalarByteString",
"(I)Lorg/perlonjava/runtime/runtimetypes/RuntimeScalar;",
false);
return;
} else {
// String is too long for cache or null, create new object
mv.visitTypeInsn(Opcodes.NEW, "org/perlonjava/runtime/runtimetypes/RuntimeScalarReadOnly");
mv.visitInsn(Opcodes.DUP);
emitStringValue(mv, node.value);
mv.visitMethodInsn(
Opcodes.INVOKESPECIAL,
"org/perlonjava/runtime/runtimetypes/RuntimeScalarReadOnly",
"<init>",
"(Ljava/lang/String;)V",
false);

// Set the Perl scalar type to BYTE_STRING
mv.visitInsn(Opcodes.DUP);
mv.visitLdcInsn(RuntimeScalarType.BYTE_STRING);
mv.visitFieldInsn(Opcodes.PUTFIELD, "org/perlonjava/runtime/runtimetypes/RuntimeScalarReadOnly", "type", "I");
return;
}
}
// Fall through to create UTF-8 string if hasWideChars
}

// Use cache for regular strings
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/perlonjava/core/Configuration.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public final class Configuration {
* Automatically populated by Gradle/Maven during build.
* DO NOT EDIT MANUALLY - this value is replaced at build time.
*/
public static final String gitCommitId = "c8049e1a7";
public static final String gitCommitId = "204b2f32c";

/**
* Git commit date of the build (ISO format: YYYY-MM-DD).
Expand Down
16 changes: 16 additions & 0 deletions src/main/java/org/perlonjava/frontend/parser/StringParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
import java.util.Map;

import static org.perlonjava.runtime.perlmodule.Strict.HINT_UTF8;
import static org.perlonjava.runtime.perlmodule.Strict.HINT_RE_ASCII;
import static org.perlonjava.runtime.perlmodule.Strict.HINT_RE_UNICODE;
import static org.perlonjava.runtime.runtimetypes.ScalarUtils.printable;

/*
Expand Down Expand Up @@ -508,6 +510,20 @@ public static OperatorNode parseRegexReplace(EmitterContext ctx, ParsedString ra
public static OperatorNode parseRegexMatch(EmitterContext ctx, String operator, ParsedString rawStr, Parser parser) {
operator = operator.equals("qr") ? "quoteRegex" : "matchRegex";
String modStr = rawStr.buffers.get(1);

// Add default modifiers from `use re` pragma if not already present
if (ctx.symbolTable != null) {
if (ctx.symbolTable.isStrictOptionEnabled(HINT_RE_ASCII)) {
if (!modStr.contains("a") && !modStr.contains("u")) {
modStr = "a" + modStr;
}
} else if (ctx.symbolTable.isStrictOptionEnabled(HINT_RE_UNICODE)) {
if (!modStr.contains("u") && !modStr.contains("a")) {
modStr = "u" + modStr;
}
}
}

Node parsed = parseRegexString(ctx, rawStr, parser, modStr);
if (rawStr.startDelim == '?') {
// `m?PAT?` matches exactly once
Expand Down
51 changes: 49 additions & 2 deletions src/main/java/org/perlonjava/runtime/perlmodule/Re.java
Original file line number Diff line number Diff line change
@@ -1,12 +1,37 @@
package org.perlonjava.runtime.perlmodule;

import org.perlonjava.frontend.semantic.ScopedSymbolTable;
import org.perlonjava.runtime.runtimetypes.RuntimeArray;
import org.perlonjava.runtime.runtimetypes.RuntimeList;
import org.perlonjava.runtime.runtimetypes.RuntimeScalar;
import org.perlonjava.runtime.runtimetypes.RuntimeScalarType;

import static org.perlonjava.frontend.parser.SpecialBlockParser.getCurrentScope;

/**
* The Re class provides functionalities similar to the Perl re module.
*
* <p>Currently implemented features:
* <ul>
* <li>{@code use re '/a'} - ASCII-restrict \w, \d, \s, \b</li>
* <li>{@code use re '/aa'} - ASCII-restrict including case folding</li>
* <li>{@code use re '/u'} - Unicode semantics for character classes</li>
* <li>{@code use re 'strict'} - Enables experimental regex warnings</li>
* <li>{@code re::is_regexp($ref)} - Check if reference is a compiled regex</li>
* </ul>
*
* <p>TODO: Features not yet implemented (see {@code perldoc re}):
* <ul>
* <li>{@code use re '/l'} - Locale-aware matching</li>
* <li>{@code use re '/d'} - Default/legacy semantics</li>
* <li>{@code use re 'eval'} - Allow (?{}) code blocks in interpolated patterns</li>
* <li>{@code use re 'debug'} - Regex debugging output</li>
* <li>{@code use re 'debugcolor'} - Colorized regex debugging</li>
* <li>{@code use re 'taint'} - Taint mode for regex</li>
* <li>{@code re::regexp_pattern($ref)} - Return pattern and modifiers from qr//</li>
* <li>Combining multiple flags: {@code use re '/xms'}</li>
* <li>Scoped flag restoration with {@code no re '/flags'}</li>
* </ul>
*/
public class Re extends PerlModuleBase {

Expand Down Expand Up @@ -48,35 +73,57 @@ public static RuntimeList isRegexp(RuntimeArray args, int ctx) {
}

/**
* Handle `use re ...` import. Recognizes: 'strict'.
* Handle `use re ...` import. Recognizes: 'strict', '/a', '/u', '/aa'.
* For 'strict', enables the experimental warning categories our regex preprocessor emits; for '/a', '/aa' and '/u', sets the matching regex hint bits on the current scope.
*/
public static RuntimeList importRe(RuntimeArray args, int ctx) {
    // Scope returned by getCurrentScope(); the regex hint bits below are toggled on it.
    ScopedSymbolTable scope = getCurrentScope();

    int index = 0;
    while (index < args.size()) {
        // Strip any quote characters first, then surrounding whitespace.
        String option = args.get(index).toString()
                .replace("\"", "")
                .replace("'", "")
                .trim();
        index++;

        // 'strict' is matched case-insensitively and only touches warning categories.
        if (option.equalsIgnoreCase("strict")) {
            String[] categories = {
                "experimental::re_strict",
                "experimental::uniprop_wildcards",
                "experimental::vlb"
            };
            for (String category : categories) {
                Warnings.warningManager.enableWarning(category);
            }
            continue;
        }

        // The flag options are matched exactly; each one switches its own bit(s) on
        // and then clears the bits belonging to the competing modes.
        switch (option) {
            case "/a":
                scope.enableStrictOption(Strict.HINT_RE_ASCII);
                scope.disableStrictOption(Strict.HINT_RE_UNICODE | Strict.HINT_RE_ASCII_AA);
                break;
            case "/aa":
                scope.enableStrictOption(Strict.HINT_RE_ASCII | Strict.HINT_RE_ASCII_AA);
                scope.disableStrictOption(Strict.HINT_RE_UNICODE);
                break;
            case "/u":
                scope.enableStrictOption(Strict.HINT_RE_UNICODE);
                scope.disableStrictOption(Strict.HINT_RE_ASCII | Strict.HINT_RE_ASCII_AA);
                break;
            default:
                // Any other option is ignored.
                break;
        }
    }
    return new RuntimeList();
}

/**
* Handle `no re ...` unimport. Recognizes: 'strict'.
* Handle `no re ...` unimport. Recognizes: 'strict', '/a', '/u', '/aa'.
*/
public static RuntimeList unimportRe(RuntimeArray args, int ctx) {
ScopedSymbolTable symbolTable = getCurrentScope();

for (int i = 0; i < args.size(); i++) {
String opt = args.get(i).toString();
opt = opt.replace("\"", "").replace("'", "").trim();

if (opt.equalsIgnoreCase("strict")) {
Warnings.warningManager.disableWarning("experimental::re_strict");
Warnings.warningManager.disableWarning("experimental::uniprop_wildcards");
Warnings.warningManager.disableWarning("experimental::vlb");
} else if (opt.equals("/a") || opt.equals("/aa")) {
symbolTable.disableStrictOption(Strict.HINT_RE_ASCII | Strict.HINT_RE_ASCII_AA);
} else if (opt.equals("/u")) {
symbolTable.disableStrictOption(Strict.HINT_RE_UNICODE);
}
}
return new RuntimeList();
Expand Down
5 changes: 5 additions & 0 deletions src/main/java/org/perlonjava/runtime/perlmodule/Strict.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ public class Strict extends PerlModuleBase {
// Bitmask for utf8 source code
public static final int HINT_UTF8 = 0x00800000;

// Bitmasks for the default regex modifiers selected by the `use re` pragma.
// Re.importRe keeps the ASCII and Unicode modes mutually exclusive: enabling one clears the other.
public static final int HINT_RE_ASCII = 0x01000000; // use re '/a' (also set by use re '/aa')
public static final int HINT_RE_UNICODE = 0x02000000; // use re '/u'
public static final int HINT_RE_ASCII_AA = 0x04000000; // use re '/aa' (set together with HINT_RE_ASCII)

/**
* Constructor for Strict.
* Initializes the module with the name "strict".
Expand Down
16 changes: 12 additions & 4 deletions src/main/java/org/perlonjava/runtime/perlmodule/Utf8.java
Original file line number Diff line number Diff line change
Expand Up @@ -320,10 +320,18 @@ public static RuntimeList isUtf8(RuntimeArray args, int ctx) {
throw new IllegalStateException("Bad number of arguments for is_utf8() method");
}
RuntimeScalar scalar = args.get(0);
if (scalar.type == BYTE_STRING) {
return RuntimeScalarCache.scalarFalse.getList();
}
return RuntimeScalarCache.scalarTrue.getList();
return isUtf8(scalar) ? RuntimeScalarCache.scalarTrue.getList() : RuntimeScalarCache.scalarFalse.getList();
}

/**
 * Reports whether a scalar counts as a UTF-8 (character) string.
 *
 * <p>The decision is based solely on the scalar's {@code type} field: every type other
 * than {@code BYTE_STRING} is reported as UTF-8. The argument is not null-checked.
 *
 * @param scalar The scalar to inspect.
 * @return false when {@code scalar.type} is {@code BYTE_STRING}, true for any other type.
 */
public static boolean isUtf8(RuntimeScalar scalar) {
    boolean isByteString = scalar.type == BYTE_STRING;
    return !isByteString;
}

/**
Expand Down
Loading
Loading