Skip to content

Commit b978f8e

Browse files
committed
experiment: try to avoid a char[] and a string allocation when searching for the end tag
1 parent 0aea545 commit b978f8e

4 files changed

Lines changed: 248 additions & 28 deletions

File tree

src/main/java/org/htmlunit/cyberneko/HTMLScanner.java

Lines changed: 38 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,7 @@ public class HTMLScanner implements XMLDocumentSource, XMLLocator, HTMLComponent
507507
final XMLString fScanComment = new XMLString();
508508

509509
private final XMLString fScanLiteral = new XMLString();
510+
private final XMLString fNextContent = new XMLString(10);
510511

511512
/**
512513
* Reusable single-element boolean array used as an out-parameter.
@@ -1827,41 +1828,47 @@ int read() throws IOException {
18271828
}
18281829

18291830
/**
1830-
* Reads the next characters WITHOUT impacting the buffer content up to current
1831-
* offset.
1831+
* Performs a non-destructive lookahead read of up to {@code len} characters,
1832+
* filling {@code result} without advancing the current position. The offset,
1833+
* column number, and character offset are all restored after the read, so
1834+
* subsequent reads continue from the same position as before this call.
18321835
*
1833-
* @param len the number of characters to read
1834-
* @return the read string (length may be smaller if EOF is encountered)
1835-
* @throws IOException in case of io problems
1836+
* <p>If EOF is reached before {@code len} characters have been read, the
1837+
* result is shorter than requested; callers must check {@code result.length()}
1838+
* rather than assuming it equals {@code len}.
1839+
*
1840+
* <p>The {@code result} buffer is cleared before filling, so any previous
1841+
* content is discarded. The caller is expected to pass a shared, reusable
1842+
* {@link XMLString} instance to avoid allocation on every call.
1843+
*
1844+
* @param result the buffer to fill with the lookahead characters; must not
1845+
* be null. Cleared before use.
1846+
* @param len the maximum number of characters to read
1847+
* @throws IOException if an I/O error occurs while reading
18361848
*/
1837-
String nextContent(final int len) throws IOException {
1849+
void nextContent(final XMLString result, final int len) throws IOException {
1850+
result.clear();
1851+
18381852
final int originalOffset = offset_;
18391853
final int originalColumnNumber = getColumnNumber();
18401854
final int originalCharacterOffset = getCharacterOffset();
18411855

1842-
final char[] buff = new char[len];
1843-
int nbRead;
1844-
for (nbRead = 0; nbRead < len; ++nbRead) {
1845-
// load(length_) should not clear the buffer
1856+
for (int i = 0; i < len; i++) {
18461857
if (offset_ == length_) {
18471858
if (load(length_) == -1) {
18481859
break;
18491860
}
18501861
}
1851-
18521862
final int c = read();
18531863
if (c == -1) {
18541864
break;
18551865
}
1856-
buff[nbRead] = (char) c;
1866+
result.append((char) c);
18571867
}
18581868

1859-
// restore position
18601869
offset_ = originalOffset;
18611870
columnNumber_ = originalColumnNumber;
18621871
characterOffset_ = originalCharacterOffset;
1863-
1864-
return new String(buff, 0, nbRead);
18651872
}
18661873

18671874
// Reads a single character, preserving the old buffer content
@@ -2514,12 +2521,11 @@ private void scanUntilEndTag(final String tagNameWithLeadingSlash) throws IOExce
25142521
break;
25152522
}
25162523
if (c == '<') {
2517-
final String next = fCurrentEntity.nextContent(lengthToScan) + " ";
2518-
if (next.length() >= lengthToScan
2519-
&& tagNameWithLeadingSlash.equalsIgnoreCase(
2520-
next.substring(0, tagNameWithLeadingSlash.length()))
2521-
&& ('>' == next.charAt(lengthToScan - 1)
2522-
|| Character.isWhitespace(next.charAt(lengthToScan - 1)))) {
2524+
fCurrentEntity.nextContent(fNextContent, lengthToScan);
2525+
if (fNextContent.length() >= lengthToScan
2526+
&& fNextContent.startsWithLowerCase(tagNameWithLeadingSlash)
2527+
&& ('>' == fNextContent.charAt(lengthToScan - 1, ' ')
2528+
|| Character.isWhitespace(fNextContent.charAt(lengthToScan - 1, ' ')))) {
25232529
fCurrentEntity.rewind();
25242530
break;
25252531
}
@@ -3211,6 +3217,7 @@ protected int scanAttribute(final XMLAttributesImpl attributes, final boolean[]
32113217
empty[0] = fCurrentEntity.skipMarkup(false);
32123218
return SCAN_FALSE;
32133219
}
3220+
// TODO add test and maybe fix me by using fNamesAttrs as second param
32143221
aname = '=' + scanName(false, fNamesElems);
32153222
}
32163223
if (fReportErrors_ && !skippedSpaces) {
@@ -3731,9 +3738,11 @@ public int scan(final boolean complete) throws IOException {
37313738
state = ScanScriptState.ESCAPED;
37323739
}
37333740
else if (c == '<') {
3734-
final String next = fCurrentEntity.nextContent(8) + " ";
3735-
if (next.length() >= 8 && "/script".equalsIgnoreCase(next.substring(0, 7))
3736-
&& ('>' == next.charAt(7) || Character.isWhitespace(next.charAt(7)))) {
3741+
fCurrentEntity.nextContent(fNextContent, 8);
3742+
if (fNextContent.length() >= 8
3743+
&& fNextContent.startsWithLowerCase("/script")
3744+
&& ('>' == fNextContent.charAt(7, ' ')
3745+
|| Character.isWhitespace(fNextContent.charAt(7, ' ')))) {
37373746
fCurrentEntity.rewind();
37383747
break OUTER;
37393748
}
@@ -3750,9 +3759,11 @@ else if (fScanScriptContent.endsWith("--!")) {
37503759
}
37513760
}
37523761
else if (c == '<') {
3753-
final String next = fCurrentEntity.nextContent(8) + " ";
3754-
if (next.length() >= 8 && "/script".equalsIgnoreCase(next.substring(0, 7))
3755-
&& ('>' == next.charAt(7) || Character.isWhitespace(next.charAt(7)))) {
3762+
fCurrentEntity.nextContent(fNextContent, 8);
3763+
if (fNextContent.length() >= 8
3764+
&& fNextContent.startsWithLowerCase("/script")
3765+
&& ('>' == fNextContent.charAt(7, ' ')
3766+
|| Character.isWhitespace(fNextContent.charAt(7, ' ')))) {
37563767
fCurrentEntity.rewind();
37573768
break OUTER;
37583769
}

src/main/java/org/htmlunit/cyberneko/xerces/xni/XMLString.java

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1121,4 +1121,37 @@ public void ignorableWhitespace(final ContentHandler contentHandler) throws SAXE
11211121
public void comment(final LexicalHandler lexicalHandler) throws SAXException {
11221122
lexicalHandler.comment(data_, 0, length_);
11231123
}
1124+
1125+
/**
1126+
* Returns true if the first {@code prefix.length()} chars of this buffer
1127+
* match {@code prefix} case-insensitively, where {@code prefix} is assumed
1128+
* to be already lowercase. This is faster than equalsIgnoreCase because
1129+
* it only needs one case conversion per char instead of two.
1130+
*
1131+
* @param lowercasePrefix a lowercase string to match against the start of this buffer
1132+
* @return true if this buffer starts with prefix (case-insensitive)
1133+
*/
1134+
public boolean startsWithLowerCase(final String lowercasePrefix) {
1135+
if (length_ < lowercasePrefix.length()) {
1136+
return false;
1137+
}
1138+
for (int i = 0; i < lowercasePrefix.length(); i++) {
1139+
if (Character.toLowerCase(data_[i]) != lowercasePrefix.charAt(i)) {
1140+
return false;
1141+
}
1142+
}
1143+
return true;
1144+
}
1145+
1146+
/**
1147+
* Returns the char at {@code index}, or {@code defaultChar} if index is
1148+
* out of bounds.
1149+
*
1150+
* @param index the position to read
1151+
* @param defaultChar value to return when index >= length
1152+
* @return the char at index, or defaultChar
1153+
*/
1154+
public char charAt(final int index, final char defaultChar) {
1155+
return index < length_ ? data_[index] : defaultChar;
1156+
}
11241157
}

src/test/java/org/htmlunit/cyberneko/HTMLScannerTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ class MyContentScanner extends HTMLScanner.ContentScanner {
209209
@Override
210210
protected int scanComment() throws IOException {
211211
// bug was here: calling nextContent() at the end of the buffer/input
212-
fCurrentEntity.nextContent(30);
212+
fCurrentEntity.nextContent(null, 30);
213213
return super.scanComment();
214214
}
215215
}

src/test/java/org/htmlunit/cyberneko/xerces/xni/XMLStringTest.java

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1436,4 +1436,180 @@ public void contains_XMLString() {
14361436
assertEquals(true, t.apply("abc", "bc"));
14371437
assertEquals(true, t.apply("abcabc", "abc"));
14381438
}
1439+
1440+
// -------------------------------------------------------------------------
1441+
// startsWithLowerCase
1442+
// -------------------------------------------------------------------------
1443+
1444+
@Test
1445+
public void startsWithLowerCase_exactMatchAllLower() {
1446+
final XMLString s = new XMLString("/script");
1447+
assertTrue(s.startsWithLowerCase("/script"));
1448+
}
1449+
1450+
@Test
1451+
public void startsWithLowerCase_exactMatchUpperCaseInBuffer() {
1452+
final XMLString s = new XMLString("/SCRIPT");
1453+
assertTrue(s.startsWithLowerCase("/script"));
1454+
}
1455+
1456+
@Test
1457+
public void startsWithLowerCase_exactMatchMixedCaseInBuffer() {
1458+
final XMLString s = new XMLString("/Script");
1459+
assertTrue(s.startsWithLowerCase("/script"));
1460+
}
1461+
1462+
@Test
1463+
public void startsWithLowerCase_prefixShorterThanBuffer() {
1464+
final XMLString s = new XMLString("/script>");
1465+
assertTrue(s.startsWithLowerCase("/script"));
1466+
}
1467+
1468+
@Test
1469+
public void startsWithLowerCase_prefixShorterThanBufferUpperCase() {
1470+
final XMLString s = new XMLString("/SCRIPT>");
1471+
assertTrue(s.startsWithLowerCase("/script"));
1472+
}
1473+
1474+
@Test
1475+
public void startsWithLowerCase_noMatch() {
1476+
final XMLString s = new XMLString("/style");
1477+
assertFalse(s.startsWithLowerCase("/script"));
1478+
}
1479+
1480+
@Test
1481+
public void startsWithLowerCase_noMatchPartialOverlap() {
1482+
final XMLString s = new XMLString("/scrivener");
1483+
assertFalse(s.startsWithLowerCase("/script"));
1484+
}
1485+
1486+
@Test
1487+
public void startsWithLowerCase_bufferShorterThanPrefix() {
1488+
final XMLString s = new XMLString("/scr");
1489+
assertFalse(s.startsWithLowerCase("/script"));
1490+
}
1491+
1492+
@Test
1493+
public void startsWithLowerCase_emptyPrefix() {
1494+
final XMLString s = new XMLString("/script");
1495+
assertTrue(s.startsWithLowerCase(""));
1496+
}
1497+
1498+
@Test
1499+
public void startsWithLowerCase_emptyPrefixAgainstEmptyBuffer() {
1500+
final XMLString s = new XMLString(0);
1501+
assertTrue(s.startsWithLowerCase(""));
1502+
}
1503+
1504+
@Test
1505+
public void startsWithLowerCase_emptyBuffer_nonEmptyPrefix() {
1506+
final XMLString s = new XMLString(0);
1507+
assertFalse(s.startsWithLowerCase("/script"));
1508+
}
1509+
1510+
@Test
1511+
public void startsWithLowerCase_singleCharMatchLower() {
1512+
final XMLString s = new XMLString("a");
1513+
assertTrue(s.startsWithLowerCase("a"));
1514+
}
1515+
1516+
@Test
1517+
public void startsWithLowerCase_singleCharMatchUpper() {
1518+
final XMLString s = new XMLString("A");
1519+
assertTrue(s.startsWithLowerCase("a"));
1520+
}
1521+
1522+
@Test
1523+
public void startsWithLowerCase_singleCharNoMatch() {
1524+
final XMLString s = new XMLString("b");
1525+
assertFalse(s.startsWithLowerCase("a"));
1526+
}
1527+
1528+
@Test
1529+
public void startsWithLowerCase_realWorldScriptLowerCase() {
1530+
final XMLString s = new XMLString("/script>");
1531+
assertTrue(s.startsWithLowerCase("/script"));
1532+
assertEquals('>', s.charAt(7, ' '));
1533+
}
1534+
1535+
@Test
1536+
public void startsWithLowerCase_realWorldScriptUpperCase() {
1537+
final XMLString s = new XMLString("/SCRIPT>");
1538+
assertTrue(s.startsWithLowerCase("/script"));
1539+
assertEquals('>', s.charAt(7, ' '));
1540+
}
1541+
1542+
@Test
1543+
public void startsWithLowerCase_realWorldScriptWhitespaceSeparator() {
1544+
final XMLString s = new XMLString("/SCRIPT ");
1545+
assertTrue(s.startsWithLowerCase("/script"));
1546+
assertTrue(Character.isWhitespace(s.charAt(7, ' ')));
1547+
}
1548+
1549+
// -------------------------------------------------------------------------
1550+
// charAt(int, char)
1551+
// -------------------------------------------------------------------------
1552+
1553+
@Test
1554+
public void charAtWithDefault_withinBounds() {
1555+
final XMLString s = new XMLString("hello");
1556+
assertEquals('h', s.charAt(0, ' '));
1557+
assertEquals('e', s.charAt(1, ' '));
1558+
assertEquals('o', s.charAt(4, ' '));
1559+
}
1560+
1561+
@Test
1562+
public void charAtWithDefault_exactlyAtLastIndex() {
1563+
final XMLString s = new XMLString("hi");
1564+
assertEquals('i', s.charAt(1, ' '));
1565+
}
1566+
1567+
@Test
1568+
public void charAtWithDefault_onePastEnd_returnsDefault() {
1569+
final XMLString s = new XMLString("hi");
1570+
assertEquals(' ', s.charAt(2, ' '));
1571+
}
1572+
1573+
@Test
1574+
public void charAtWithDefault_wellBeyondEnd_returnsDefault() {
1575+
final XMLString s = new XMLString("hi");
1576+
assertEquals(' ', s.charAt(99, ' '));
1577+
}
1578+
1579+
@Test
1580+
public void charAtWithDefault_emptyBuffer_returnsDefault() {
1581+
final XMLString s = new XMLString(0);
1582+
assertEquals(' ', s.charAt(0, ' '));
1583+
}
1584+
1585+
@Test
1586+
public void charAtWithDefault_customDefault() {
1587+
final XMLString s = new XMLString("ab");
1588+
assertEquals('X', s.charAt(5, 'X'));
1589+
}
1590+
1591+
@Test
1592+
public void charAtWithDefault_sentinelPatternGt() {
1593+
// replicates: '>' == fNextContent.charAt(7, ' ')
1594+
// buffer has exactly 7 chars — index 7 is out of bounds, returns ' '
1595+
// which is whitespace, satisfying the isWhitespace branch
1596+
final XMLString s = new XMLString("/script"); // length 7
1597+
final char sentinel = s.charAt(7, ' ');
1598+
assertEquals(' ', sentinel);
1599+
assertTrue(Character.isWhitespace(sentinel));
1600+
}
1601+
1602+
@Test
1603+
public void charAtWithDefault_sentinelPatternExactGt() {
1604+
// buffer has 8 chars ending with '>'
1605+
final XMLString s = new XMLString("/script>"); // length 8
1606+
assertEquals('>', s.charAt(7, ' '));
1607+
}
1608+
1609+
@Test
1610+
public void charAtWithDefault_bufferFilledToRequestedLen() {
1611+
// EOF case: nextContent asked for 8, got only 4 — index 7 out of bounds
1612+
final XMLString s = new XMLString("/scr"); // length 4
1613+
assertEquals(' ', s.charAt(7, ' '));
1614+
}
14391615
}

0 commit comments

Comments
 (0)