-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathStringParser.java
More file actions
752 lines (662 loc) · 31.9 KB
/
StringParser.java
File metadata and controls
752 lines (662 loc) · 31.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
package org.perlonjava.frontend.parser;
import org.perlonjava.app.cli.CompilerOptions;
import org.perlonjava.backend.jvm.EmitterContext;
import org.perlonjava.frontend.astnode.*;
import org.perlonjava.frontend.lexer.Lexer;
import org.perlonjava.frontend.lexer.LexerToken;
import org.perlonjava.frontend.lexer.LexerTokenType;
import org.perlonjava.runtime.runtimetypes.PerlCompilerException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import static org.perlonjava.runtime.perlmodule.Strict.HINT_UTF8;
import static org.perlonjava.runtime.perlmodule.Strict.HINT_RE_ASCII;
import static org.perlonjava.runtime.perlmodule.Strict.HINT_RE_UNICODE;
import static org.perlonjava.runtime.runtimetypes.ScalarUtils.printable;
/*
* StringParser is used to parse domain-specific languages within Perl, such as Regex and string interpolation.
*
* Perl has a complex quoting mechanism for strings, which cannot be fully implemented in the Lexer
* due to insufficient context.
* This module reprocesses the tokens according to the current context to extract the quoted string.
* The string is then passed on to the respective domain-specific compilers.
*/
public class StringParser {
// States for the finite state machine (FSM) that drives parseRawStringWithDelimiter.
// START: no delimiter seen yet; the next character becomes the opening delimiter.
private static final int START = 0;
// STRING: inside the quoted body; characters are collected until the closing delimiter.
private static final int STRING = 1;
// ESCAPE: the previous character was a backslash; the next character is copied verbatim.
private static final int ESCAPE = 2;
// END_TOKEN: the closing delimiter was seen; leftover characters of the current token are kept aside.
private static final int END_TOKEN = 3;
// Map to hold pairs of matching delimiters (opening -> closing).
// A delimiter that is not a key of this map is closed by the same character.
private static final Map<Character, Character> QUOTE_PAIR = Map.of(
'<', '>',
'{', '}',
'(', ')',
'[', ']'
);
/**
 * Parses a raw string with delimiters from a list of tokens.
 * <p>
 * The first character found at {@code index} is taken as the opening delimiter. If it is a key of
 * {@code QUOTE_PAIR} the matching closing bracket ends the string and nested pairs are balanced;
 * otherwise the same character ends the string. A backslash always protects the character that
 * follows it; both characters are kept in the result.
 * <p>
 * If the closing delimiter is found in the middle of a token, the rest of that token's text is
 * written back into the token and the returned {@code next} index points at it.
 *
 * @param ctx    Emitter context; supplies error reporting, debug logging and the source-encoding options.
 * @param tokens List of lexer tokens.
 * @param index  Starting index in the tokens list.
 * @param redo   Flag to indicate if the parsing should be redone; example: s/.../.../ . It only has an
 *               effect for non-paired delimiters: the first closing delimiter then ends the first
 *               buffer and opens a second one that shares the delimiter.
 * @param parser Parser whose pending heredocs are processed when a newline is met; may be null.
 * @return ParsedString object containing the parsed string(s) and updated token index.
 * @throws PerlCompilerException if EOF is reached before the closing delimiter.
 */
public static ParsedString parseRawStringWithDelimiter(EmitterContext ctx, List<LexerToken> tokens, int index, boolean redo, Parser parser) {
    int tokPos = index; // Current position in the tokens list
    char startDelim = 0; // Starting delimiter
    char endDelim = 0; // Ending delimiter
    int state = START; // Initial state of the FSM
    int parenLevel = 0; // Nesting level of paired delimiters
    boolean isPair = false; // Flag to indicate if the delimiters are a pair
    StringBuilder buffer = new StringBuilder(); // Buffer to hold the parsed string
    StringBuilder remain = new StringBuilder(); // Text of the last token that follows the closing delimiter
    ArrayList<String> buffers = new ArrayList<>();
    StringBuilder pendingBuffer = new StringBuilder(); // Buffer for content pending heredoc check

    while (state != END_TOKEN) {
        LexerToken currentToken = tokens.get(tokPos);
        if (currentToken.type == LexerTokenType.EOF) {
            String errorMsg = endDelim == '/'
                    ? "Search pattern not terminated"
                    : "Can't find string terminator " + endDelim + " anywhere before EOF";
            throw new PerlCompilerException(tokPos, errorMsg, ctx.errorUtil);
        }

        // Process heredocs at newlines during string parsing
        if (currentToken.type == LexerTokenType.NEWLINE) {
            if (CompilerOptions.DEBUG_ENABLED) ctx.logDebug("parseRawStringWithDelimiter: Found NEWLINE at tokPos=" + tokPos +
                    ", parser=" + (parser != null) +
                    ", heredocCount=" + (parser != null ? parser.getHeredocNodes().size() : 0));
            if (parser != null && !parser.getHeredocNodes().isEmpty()) {
                if (CompilerOptions.DEBUG_ENABLED) ctx.logDebug("parseRawStringWithDelimiter: Processing heredocs");
                // Save the current parser position
                int savedIndex = parser.tokenIndex;
                int beforeHeredocTokPos = tokPos;
                parser.tokenIndex = tokPos;
                // Process pending heredocs
                ParseHeredoc.parseHeredocAfterNewline(parser);
                // Calculate how many tokens were consumed by heredoc processing
                int afterHeredocTokPos = parser.tokenIndex;
                int tokensConsumed = afterHeredocTokPos - beforeHeredocTokPos;
                if (CompilerOptions.DEBUG_ENABLED) ctx.logDebug("parseRawStringWithDelimiter: Heredoc consumed " + tokensConsumed + " tokens");
                // If heredoc consumed more than just the newline, we need to handle it
                if (tokensConsumed > 1) {
                    // Add any pending content up to the newline
                    buffer.append(pendingBuffer);
                    pendingBuffer.setLength(0);
                    // Skip the newline (it triggered heredoc) and all consumed content
                    tokPos = afterHeredocTokPos - 1; // -1 because tokPos is incremented below
                } else {
                    // Heredoc only consumed the newline, add pending content including newline
                    pendingBuffer.append(currentToken.text);
                    buffer.append(pendingBuffer);
                    pendingBuffer.setLength(0);
                }
                // Restore parser position
                parser.tokenIndex = savedIndex;
                // Continue to next token
                tokPos++;
                continue;
            }
        }

        // Process token characters
        for (char ch : currentToken.text.toCharArray()) {
            switch (state) {
                case START:
                    startDelim = ch;
                    endDelim = startDelim;
                    if (QUOTE_PAIR.containsKey(startDelim)) { // Check if the delimiter is a pair
                        isPair = true;
                        endDelim = QUOTE_PAIR.get(startDelim);
                    }
                    state = STRING; // Move to STRING state
                    break;
                case STRING:
                    if (isPair && ch == startDelim) {
                        parenLevel++; // Increase nesting level for starting delimiter
                    } else if (ch == endDelim) {
                        if (parenLevel == 0) {
                            if (redo && !isPair) {
                                redo = false;
                                // First of two strings sharing one delimiter is complete:
                                // store it and keep collecting the second one.
                                buffer.append(pendingBuffer); // Flush pending
                                buffers.add(buffer.toString());
                                buffer = new StringBuilder();
                                pendingBuffer.setLength(0);
                                // Leaves the switch only (the delimiter is not stored);
                                // the FSM stays in STRING for the following characters.
                                break;
                            } else {
                                state = END_TOKEN; // End parsing
                            }
                            continue; // Next character; the closing delimiter is not stored
                        }
                        parenLevel--; // Decrease nesting level for ending delimiter
                    } else if (ch == '\\') {
                        state = ESCAPE; // Move to ESCAPE state
                    }
                    pendingBuffer.append(ch); // Append to pending buffer
                    break;
                case ESCAPE:
                    pendingBuffer.append(ch); // Append escaped character to pending buffer
                    state = STRING; // Return to STRING state
                    break;
                case END_TOKEN:
                    remain.append(ch); // Append remaining characters to remain buffer
                    break;
            }
        }

        // If we haven't hit a newline, flush pending buffer to main buffer
        if (currentToken.type != LexerTokenType.NEWLINE) {
            buffer.append(pendingBuffer);
            pendingBuffer.setLength(0);
        }
        tokPos++; // Move to the next token
    }

    // Final flush of any pending content
    buffer.append(pendingBuffer);

    if (ctx.symbolTable.isStrictOptionEnabled(HINT_UTF8)
            || ctx.compilerOptions.isUnicodeSource) {
        // utf8 source code is true - keep Unicode string as-is
        buffers.add(buffer.toString());
    } else if (ctx.compilerOptions.isEvalbytes) {
        // evalbytes context - characters <= 255 represent byte values directly,
        // anything larger is replaced by its UTF-8 bytes.
        String str = buffer.toString();
        StringBuilder octetString = new StringBuilder(str.length());
        // Walk by code point, not by UTF-16 unit: encoding the two halves of a surrogate
        // pair separately would yield "??" instead of the character's 4 UTF-8 bytes.
        for (int i = 0; i < str.length(); ) {
            int codePoint = str.codePointAt(i);
            if (codePoint <= 255) {
                octetString.append((char) codePoint);
            } else {
                octetString.append(utf8Octets(Character.toString(codePoint)));
            }
            i += Character.charCount(codePoint);
        }
        buffers.add(octetString.toString());
    } else if (ctx.compilerOptions.isByteStringSource) {
        // Source code originated from a BYTE_STRING scalar (e.g. eval STRING where STRING is bytes).
        // In this case buffer already represents raw bytes as chars 0..255.
        buffers.add(buffer.toString());
    } else {
        // utf8 source code is false - convert the Unicode string back to UTF-8 bytes
        // to simulate reading the source file as raw bytes
        buffers.add(utf8Octets(buffer.toString()));
    }

    if (!remain.isEmpty()) {
        tokPos--;
        tokens.get(tokPos).text = remain.toString(); // Put the remaining string back in the tokens list
    }
    return new ParsedString(index, tokPos, buffers, startDelim, endDelim, ' ', ' ');
}

/** Encodes {@code text} as UTF-8 and returns a string holding one char (0..255) per byte. */
private static String utf8Octets(String text) {
    byte[] utf8Bytes = text.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    StringBuilder octets = new StringBuilder(utf8Bytes.length);
    for (byte b : utf8Bytes) {
        octets.append((char) (b & 0xFF));
    }
    return octets.toString();
}
/**
 * Parses the 1, 2 or 3 string parts of a quote-like operator.
 * <ul>
 *   <li>1 part: just the delimited string.</li>
 *   <li>2 parts: the delimited string followed by optional modifiers, e.g. {@code m{str}modifier}.</li>
 *   <li>3 parts: two delimited strings followed by optional modifiers, e.g. {@code s{str}{str}modifier}.</li>
 * </ul>
 * For 2 and 3 parts the modifier string (possibly empty) is appended as the last buffer.
 *
 * @param parser      Parser, used for heredoc processing and whitespace skipping.
 * @param ctx         Emitter context.
 * @param tokens      List of lexer tokens.
 * @param tokenIndex  Index of the token that holds the opening delimiter.
 * @param stringCount Number of parts to extract (1, 2 or 3).
 * @return the ParsedString holding all buffers; {@code next} is the index after the last consumed token.
 */
public static ParsedString parseRawStrings(Parser parser, EmitterContext ctx, List<LexerToken> tokens, int tokenIndex, int stringCount) {
    int pos = tokenIndex;
    boolean redo = (stringCount == 3);
    // With non-paired delimiters the redo flag extracts both strings in one pass: s/aaa/bbb/
    ParsedString ast = parseRawStringWithDelimiter(ctx, tokens, pos, redo, parser);
    if (stringCount == 1) {
        return ast;
    }
    pos = ast.next;

    if (stringCount == 3) {
        char delim = ast.startDelim; // / or {
        if (QUOTE_PAIR.containsKey(delim)) {
            // Paired delimiters: fetch the second of 3 strings separately: s{aaa}{SECOND}ig
            pos = Whitespace.skipWhitespace(parser, pos, tokens);
            ParsedString ast2 = parseRawStringWithDelimiter(ctx, tokens, pos, false, parser);
            ast.buffers.add(ast2.buffers.getFirst());
            ast.next = ast2.next;
            ast.secondBufferStartDelim = ast2.startDelim;
            ast.secondBufferEndDelim = ast2.endDelim;
            pos = ast.next;
        } else {
            // Non-paired delimiters: both strings share the same delimiter. Record it so that
            // callers testing the second delimiter (e.g. for single quotes in s'a'b' or
            // tr'a'b') see the real character instead of the ' ' placeholder.
            ast.secondBufferStartDelim = ast.startDelim;
            ast.secondBufferEndDelim = ast.endDelim;
        }
    }

    // fetch the last string: s/aaa/bbb/LAST
    String modifier = "";
    if (tokens.get(pos).type == LexerTokenType.IDENTIFIER) {
        modifier = tokens.get(pos).text;
        ast.next = pos + 1;
    }
    ast.buffers.add(modifier);
    return ast;
}
/**
 * Builds the AST node for the pattern part of a regex operator.
 *
 * @param ctx       Emitter context.
 * @param rawStr    Parsed raw string; its first buffer is the pattern.
 * @param parser    Parser that owns the shared heredoc nodes; may be null.
 * @param modifiers Regex modifiers; may be null.
 * @return a plain StringNode for a single-quote delimiter, otherwise the interpolated pattern.
 */
static Node parseRegexString(EmitterContext ctx, ParsedString rawStr, Parser parser, String modifiers) {
    // A single-quote delimiter means: use the pattern text as-is
    if (rawStr.startDelim == '\'') {
        return new StringNode(rawStr.buffers.getFirst(), rawStr.index);
    }

    boolean extendedSyntax = modifiers != null && modifiers.contains("x");
    String pattern = rawStr.buffers.getFirst();
    ParsedString source = rawStr;
    if (extendedSyntax) {
        // Under /x, comments are removed before variable interpolation.
        // Work on a copy so the caller's buffers stay untouched.
        ArrayList<String> strippedBuffers = new ArrayList<>(rawStr.buffers);
        strippedBuffers.set(0, stripRegexComments(pattern));
        source = new ParsedString(rawStr.index, rawStr.next, strippedBuffers,
                rawStr.startDelim, rawStr.endDelim,
                rawStr.secondBufferStartDelim, rawStr.secondBufferEndDelim);
    }

    // Interpolate variables, but ignore the escapes, keep `\$` if present.
    // The shared heredoc nodes are passed along to handle heredocs inside regex patterns.
    return StringDoubleQuoted.parseDoubleQuotedString(ctx, source, false, true, true,
            parser == null ? null : parser.getHeredocNodes());
}
/**
 * Strip comments from a regex pattern for /x mode.
 * Comments start with # and extend to newline; the newline itself is kept.
 * But # is NOT a comment when:
 * - Inside [...] character classes. A ']' placed first in the class (after an optional '^')
 *   is a literal and does not close it, and a POSIX class such as [:alpha:] does not close
 *   the enclosing class either.
 * - Escaped as \#
 * - Inside (?{...}) or (??{...}) code blocks
 * - Part of (?#...) inline comments (these are preserved)
 *
 * @param pattern the raw regex source
 * @return the pattern with /x comments removed
 */
private static String stripRegexComments(String pattern) {
    int len = pattern.length();
    StringBuilder result = new StringBuilder(len);
    int i = 0;
    boolean inCharClass = false;
    int codeBlockDepth = 0; // Track nested braces of (?{...}) code blocks

    while (i < len) {
        char c = pattern.charAt(i);

        if (c == '\\' && i + 1 < len) {
            // Escaped character - copy both chars
            result.append(c).append(pattern.charAt(i + 1));
            i += 2;
            continue;
        }

        if (inCharClass) {
            // Everything inside a character class is literal for our purposes
            if (c == '[') {
                int posixEnd = posixClassEnd(pattern, i);
                if (posixEnd > 0) {
                    // Copy [:name:] as a unit so that its ']' does not end the class
                    result.append(pattern, i, posixEnd);
                    i = posixEnd;
                    continue;
                }
            } else if (c == ']') {
                inCharClass = false;
            }
            result.append(c);
            i++;
            continue;
        }

        if (c == '[' && codeBlockDepth == 0) {
            inCharClass = true;
            result.append(c);
            i++;
            // Optional negation
            if (i < len && pattern.charAt(i) == '^') {
                result.append('^');
                i++;
            }
            // A ']' in first position is a literal member of the class
            if (i < len && pattern.charAt(i) == ']') {
                result.append(']');
                i++;
            }
            continue;
        }

        // Check for special (?...) sequences
        if (c == '(' && codeBlockDepth == 0 && i + 2 < len && pattern.charAt(i + 1) == '?') {
            char afterQ = pattern.charAt(i + 2);
            if (afterQ == '#') {
                // (?#...) is an inline comment - copy through the closing ')'
                int close = pattern.indexOf(')', i + 3);
                int end = close < 0 ? len : close + 1;
                result.append(pattern, i, end);
                i = end;
                continue;
            }
            boolean codeBlock = afterQ == '{';
            boolean deferredCodeBlock = afterQ == '?' && i + 3 < len && pattern.charAt(i + 3) == '{';
            if (codeBlock || deferredCodeBlock) {
                // Copy "(?{" or "(??{" and start tracking braces
                int end = codeBlock ? i + 3 : i + 4;
                result.append(pattern, i, end);
                i = end;
                codeBlockDepth++;
                continue;
            }
        }

        // Track brace nesting inside code blocks; their content is copied untouched
        if (codeBlockDepth > 0) {
            if (c == '{') {
                codeBlockDepth++;
            } else if (c == '}') {
                codeBlockDepth--;
            }
            result.append(c);
            i++;
            continue;
        }

        if (c == '#') {
            // Start of /x comment - skip until newline, keeping the newline if present
            int newline = pattern.indexOf('\n', i);
            if (newline < 0) {
                i = len;
            } else {
                result.append('\n');
                i = newline + 1;
            }
            continue;
        }

        result.append(c);
        i++;
    }
    return result.toString();
}

/**
 * Returns the index just past a POSIX class such as [:alpha:] or [:^digit:] that starts
 * at {@code start} (which must point at '['), or -1 if there is none.
 */
private static int posixClassEnd(String pattern, int start) {
    int len = pattern.length();
    if (start + 1 >= len || pattern.charAt(start + 1) != ':') {
        return -1;
    }
    int j = start + 2;
    if (j < len && pattern.charAt(j) == '^') {
        j++;
    }
    int nameStart = j;
    while (j < len) {
        char ch = pattern.charAt(j);
        boolean asciiLetter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
        if (!asciiLetter) {
            break;
        }
        j++;
    }
    boolean closed = j > nameStart && j + 1 < len
            && pattern.charAt(j) == ':' && pattern.charAt(j + 1) == ']';
    return closed ? j + 2 : -1;
}
/**
 * Builds the word list of a qw-style string.
 * <p>
 * A backslash is removed when it precedes another backslash or one of the delimiters; before
 * any other character it is kept. A backslash that ends the input is dropped. The result is
 * trimmed and split on runs of whitespace.
 *
 * @param rawStr Parsed raw string; its first buffer holds the words.
 * @return a ListNode with one StringNode per word (empty when there are no words).
 */
public static ListNode parseWordsString(ParsedString rawStr) {
    final String raw = rawStr.buffers.getFirst();
    final char open = rawStr.startDelim;
    final char close = rawStr.endDelim;
    final int n = raw.length();

    StringBuilder unescaped = new StringBuilder();
    for (int i = 0; i < n; i++) {
        char current = raw.charAt(i);
        if (current != '\\') {
            unescaped.append(current);
            continue;
        }
        // Look at the character after the backslash, if any
        if (++i >= n) {
            break;
        }
        char escaped = raw.charAt(i);
        boolean dropsBackslash = escaped == '\\' || escaped == open || escaped == close;
        if (!dropsBackslash) {
            unescaped.append('\\');
        }
        unescaped.append(escaped);
    }

    ListNode words = new ListNode(rawStr.index);
    String body = unescaped.toString().trim();
    if (!body.isEmpty()) {
        for (String word : body.split("\\s+")) {
            words.elements.add(new StringNode(word, rawStr.index));
        }
    }
    return words;
}
/**
 * Builds the "replaceRegex" operator node for a substitution.
 * <p>
 * Expects three buffers in {@code rawStr}: pattern, replacement and modifiers. Note that the
 * first buffer is removed from {@code rawStr} unless the modifiers contain "e".
 *
 * @param ctx    Emitter context.
 * @param rawStr Parsed raw string holding the three buffers.
 * @param parser Parser whose heredoc nodes are shared with the replacement code parser.
 * @return OperatorNode "replaceRegex" with operands (pattern, replacement, modifiers).
 */
public static OperatorNode parseRegexReplace(EmitterContext ctx, ParsedString rawStr, Parser parser) {
    String replacementSource = rawStr.buffers.get(1);
    String flags = rawStr.buffers.get(2);
    Node pattern = parseRegexString(ctx, rawStr, parser, flags);

    Node replacement;
    if (flags.contains("e")) {
        // With the `e` modifier the replacement is code: parse it as a block
        if (CompilerOptions.DEBUG_ENABLED) ctx.logDebug("regex e-modifier: " + replacementSource);
        Parser blockParser = new Parser(ctx, new Lexer(replacementSource).tokenize(), parser.getHeredocNodes());
        replacement = ParseBlock.parseBlock(blockParser);
    } else {
        // Drop the pattern so the replacement becomes the first buffer
        rawStr.buffers.removeFirst();
        if (rawStr.secondBufferStartDelim == '\'') {
            // single quoted: no interpolation
            replacement = StringSingleQuoted.parseSingleQuotedString(rawStr);
        } else {
            // string interpolation
            replacement = StringDoubleQuoted.parseDoubleQuotedString(ctx, rawStr, true, true, true);
        }
    }

    if (flags.contains("ee")) {
        replacement = new OperatorNode("eval", new ListNode(List.of(replacement), rawStr.index), rawStr.index);
    }

    // Anything but a plain string is wrapped in an anonymous subroutine
    boolean plainString = replacement instanceof StringNode;
    if (!plainString) {
        Node body = replacement;
        if (!(body instanceof BlockNode)) {
            List<Node> statements = new ArrayList<>();
            statements.add(body);
            body = new BlockNode(statements, rawStr.index);
        }
        replacement = new SubroutineNode(null, null, null, body, false, rawStr.index);
    }

    List<Node> operands = new ArrayList<>();
    operands.add(pattern);
    operands.add(replacement);
    operands.add(new StringNode(flags, rawStr.index));
    return new OperatorNode("replaceRegex", new ListNode(operands, rawStr.index), rawStr.index);
}
/**
 * Builds the operator node for a regex match or a quoted regex.
 *
 * @param ctx      Emitter context; its symbol table may supply a default a/u modifier.
 * @param operator Source operator; "qr" yields "quoteRegex", anything else "matchRegex".
 * @param rawStr   Parsed raw string: buffer 0 is the pattern, buffer 1 the modifiers.
 * @param parser   Parser passed on to the pattern parser.
 * @return OperatorNode with operands (pattern, modifiers).
 */
public static OperatorNode parseRegexMatch(EmitterContext ctx, String operator, ParsedString rawStr, Parser parser) {
    String nodeName = operator.equals("qr") ? "quoteRegex" : "matchRegex";
    String flags = rawStr.buffers.get(1);

    // Default modifier from the HINT_RE_ASCII / HINT_RE_UNICODE options, unless a or u was given
    if (ctx.symbolTable != null) {
        boolean charsetGiven = flags.contains("a") || flags.contains("u");
        if (ctx.symbolTable.isStrictOptionEnabled(HINT_RE_ASCII)) {
            if (!charsetGiven) {
                flags = "a" + flags;
            }
        } else if (ctx.symbolTable.isStrictOptionEnabled(HINT_RE_UNICODE)) {
            if (!charsetGiven) {
                flags = "u" + flags;
            }
        }
    }

    Node pattern = parseRegexString(ctx, rawStr, parser, flags);

    // `m?PAT?` matches exactly once: record that as an internal flag in the modifier string
    if (rawStr.startDelim == '?') {
        flags += '?';
    }

    List<Node> operands = new ArrayList<>();
    operands.add(pattern);
    operands.add(new StringNode(flags, rawStr.index));
    return new OperatorNode(nodeName, new ListNode(operands, rawStr.index), rawStr.index);
}
/**
 * Builds the "qx" operator node for a system command string.
 *
 * @param ctx      Emitter context.
 * @param operator Source operator; ignored, the node is always named "qx".
 * @param rawStr   Parsed raw string holding the command text.
 * @return OperatorNode "qx" whose single operand is the interpolated command.
 */
public static OperatorNode parseSystemCommand(EmitterContext ctx, String operator, ParsedString rawStr) {
    // The command is an interpolated string (like double quotes)
    Node command = StringDoubleQuoted.parseDoubleQuotedString(ctx, rawStr, true, true, false);
    List<Node> operands = new ArrayList<>();
    operands.add(command);
    return new OperatorNode("qx", new ListNode(operands, rawStr.index), rawStr.index);
}
/**
 * Builds the "tr" operator node for a transliteration.
 *
 * @param ctx    Emitter context.
 * @param rawStr Parsed raw string: buffer 0 is the search list, buffer 1 the replacement
 *               list and buffer 2 the modifiers.
 * @return OperatorNode "tr" with operands (search list, replacement list, modifiers).
 */
public static OperatorNode parseTransliteration(EmitterContext ctx, ParsedString rawStr) {
    String searchList = rawStr.buffers.get(0);
    String replacementList = rawStr.buffers.get(1);
    String modifiers = rawStr.buffers.get(2);

    // Each list is handled according to its own delimiter
    Node searchNode = parseTransliterationList(ctx, rawStr, searchList,
            rawStr.startDelim, rawStr.endDelim);
    Node replacementNode = parseTransliterationList(ctx, rawStr, replacementList,
            rawStr.secondBufferStartDelim, rawStr.secondBufferEndDelim);

    List<Node> operands = new ArrayList<>();
    operands.add(searchNode);
    operands.add(replacementNode);
    operands.add(new StringNode(modifiers, rawStr.index));
    return new OperatorNode("tr", new ListNode(operands, rawStr.index), rawStr.index);
}

/**
 * Parses one list of a transliteration. With a single-quote start delimiter only `\\` pairs
 * are reduced to `\`; otherwise the list goes through the double-quoted string parser with
 * all three of its boolean options switched off.
 */
private static Node parseTransliterationList(EmitterContext ctx, ParsedString rawStr, String list,
                                             char startDelim, char endDelim) {
    if (startDelim == '\'') {
        return new StringNode(list.replace("\\\\", "\\"), rawStr.index);
    }
    ParsedString single = new ParsedString(
            rawStr.index,
            rawStr.next,
            new ArrayList<>(List.of(list)),
            startDelim,
            endDelim,
            ' ', ' '
    );
    return StringDoubleQuoted.parseDoubleQuotedString(ctx, single, false, false, false);
}
/**
 * Parses a quote-like construct and dispatches to the matching sub-parser.
 * Handles the special quotes for operators: q qq qx qw // s/// m// tr/// y///
 *
 * @param parser   Parser positioned after the operator token; its tokenIndex is advanced
 *                 past the parsed construct.
 * @param operator Operator or quote character that introduced the string.
 * @return the AST node for the construct.
 */
public static Node parseRawString(Parser parser, String operator) {
    // When the operator token is itself the opening delimiter, step back so it is reparsed
    switch (operator) {
        case "<", "<<" -> {
            parser.tokenIndex--;
            operator = "<>";
        }
        case "'", "\"", "/", "//", "/=", "`" -> parser.tokenIndex--;
        default -> {
        }
    }

    int stringParts = switch (operator) {
        case "s", "tr", "y" -> 3;               // s{str}{str}modifier
        case "m", "qr", "/", "//", "/=" -> 2;   // m{str}modifier
        default -> 1;
    };

    ParsedString rawStr = parseRawStrings(parser, parser.ctx, parser.tokens, parser.tokenIndex, stringParts);
    parser.tokenIndex = rawStr.next;

    return switch (operator) {
        case "`", "qx" -> parseSystemCommand(parser.ctx, operator, rawStr);
        case "'", "q" -> StringSingleQuoted.parseSingleQuotedString(rawStr);
        case "m", "qr", "/", "//", "/=" -> parseRegexMatch(parser.ctx, operator, rawStr, parser);
        case "s" -> parseRegexReplace(parser.ctx, rawStr, parser);
        case "\"", "qq" ->
                StringDoubleQuoted.parseDoubleQuotedString(parser.ctx, rawStr, true, true, false, parser.getHeredocNodes(), parser);
        case "qw" -> parseWordsString(rawStr);
        case "tr", "y" -> parseTransliteration(parser.ctx, rawStr);
        default -> {
            // Any other operator: wrap every buffer in a StringNode
            ListNode list = new ListNode(rawStr.index);
            for (String part : rawStr.buffers) {
                list.elements.add(new StringNode(part, rawStr.index));
            }
            yield new OperatorNode(operator, list, rawStr.index);
        }
    };
}
/**
 * Parses a v-string such as {@code v1.22.333} into a string with one character per number.
 *
 * @param parser       Parser positioned right after the first part; following ".NUMBER"
 *                     tokens are consumed.
 * @param vStringPart  First part of the v-string, with or without the leading "v".
 * @param currentIndex Token index used for the resulting node and for error reporting.
 * @return a StringNode holding the v-string.
 * @throws PerlCompilerException if any part is not a decimal integer or is not a valid code point.
 */
static StringNode parseVstring(Parser parser, String vStringPart, int currentIndex) {
    // Start constructing the v-string
    StringBuilder vStringBuilder = new StringBuilder();
    if (vStringPart.startsWith("v")) {
        vStringPart = vStringPart.substring(1);
    }
    appendVstringCodePoint(vStringBuilder, vStringPart, parser, currentIndex);

    // Continue parsing while the next token is a dot followed immediately by a number
    while (true) {
        // Get the next immediate token without skipping whitespace
        LexerToken nextToken = parser.tokens.get(parser.tokenIndex);
        if (!nextToken.text.equals(".")) {
            break; // Not a dot
        }
        // Get the token immediately following the dot
        LexerToken numberToken = parser.tokens.get(parser.tokenIndex + 1);
        if (numberToken.type != LexerTokenType.NUMBER) {
            break; // The dot is not followed by a number
        }
        // Consume the dot
        TokenUtils.consume(parser);
        // Consume the number, convert it to a character, and append it
        String num = TokenUtils.consume(parser).text.replace("_", "");
        appendVstringCodePoint(vStringBuilder, num, parser, currentIndex);
    }

    if (CompilerOptions.DEBUG_ENABLED) parser.ctx.logDebug("v-string: " + printable(vStringBuilder.toString()) + " next:" + TokenUtils.peek(parser));
    // Create a StringNode with the constructed v-string
    return new StringNode(vStringBuilder.toString(), true, currentIndex);
}

/**
 * Appends the character whose decimal code is {@code part}; reports a compiler error when
 * {@code part} is not an int or not a valid Unicode code point.
 */
private static void appendVstringCodePoint(StringBuilder target, String part, Parser parser, int index) {
    try {
        target.appendCodePoint(Integer.parseInt(part));
    } catch (IllegalArgumentException e) {
        // Covers NumberFormatException (a subclass) from parseInt and the range check of
        // appendCodePoint, so every part fails with the same compiler error.
        throw new PerlCompilerException(index, "Invalid v-string format: " + part, parser.ctx.errorUtil);
    }
}
/**
 * Verifies that every char of {@code toWrite} fits in one byte (value 0..255).
 *
 * @param toWrite the string to check
 * @param message text appended to "Wide character in " in the error message
 * @throws PerlCompilerException if a char above 255 is found
 */
public static void assertNoWideCharacters(String toWrite, String message) {
    boolean hasWideChar = toWrite.chars().anyMatch(unit -> unit > 255);
    if (hasWideChar) {
        throw new PerlCompilerException("Wide character in " + message);
    }
}
/**
 * Result of parsing a delimited string: the extracted buffers, the delimiters that
 * enclosed them and the token positions where the string starts and ends.
 * All fields are public and mutable.
 */
public static class ParsedString {
    public int index; // Starting index of the parsed string
    public int next; // Next index in the tokens list
    public ArrayList<String> buffers; // Parsed string
    public char startDelim;
    public char endDelim;
    public char secondBufferStartDelim; // Start delimiter of the second buffer
    public char secondBufferEndDelim; // End delimiter of the second buffer

    public ParsedString(int index, int next, ArrayList<String> buffers, char startDelim, char endDelim, char secondBufferStartDelim, char secondBufferEndDelim) {
        // positions
        this.index = index;
        this.next = next;
        // content
        this.buffers = buffers;
        // delimiters
        this.startDelim = startDelim;
        this.endDelim = endDelim;
        this.secondBufferStartDelim = secondBufferStartDelim;
        this.secondBufferEndDelim = secondBufferEndDelim;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("ParsedString{");
        text.append("index=").append(index);
        text.append(", next=").append(next);
        text.append(", buffers=").append(buffers);
        text.append(", startDelim=").append(startDelim);
        text.append(", endDelim=").append(endDelim);
        text.append(", secondBufferStartDelim=").append(secondBufferStartDelim);
        text.append(", secondBufferEndDelim=").append(secondBufferEndDelim);
        return text.append('}').toString();
    }
}
}