diff --git a/parquet-column/src/main/java/org/apache/parquet/filter2/columnindex/RowRanges.java b/parquet-column/src/main/java/org/apache/parquet/filter2/columnindex/RowRanges.java new file mode 100644 index 0000000000..5666e049db --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/filter2/columnindex/RowRanges.java @@ -0,0 +1,400 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.filter2.columnindex; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.PrimitiveIterator; +import java.util.Set; +import org.apache.parquet.filter2.compat.FilterCompat.Filter; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; + +/** + * Class representing row ranges in a row-group. These row ranges are calculated as a result of the column index based + * filtering. To be used iterate over the matching row indexes to be read from a row-group, retrieve the count of the + * matching rows or check overlapping of a row index range. + * + * @see org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter#calculateRowRanges(Filter, + * org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore, Set, long) + */ +public class RowRanges { + // Make it public because some uppler layer application need to access it + public static class Range { + + // Returns the union of the two ranges or null if there are elements between them. + private static Range union(Range left, Range right) { + if (left.from <= right.from) { + if (left.to + 1 >= right.from) { + return new Range(left.from, Math.max(left.to, right.to)); + } + } else if (right.to + 1 >= left.from) { + return new Range(right.from, Math.max(left.to, right.to)); + } + return null; + } + + // Returns the intersection of the two ranges of null if they are not overlapped. + private static Range intersection(Range left, Range right) { + if (left.from <= right.from) { + if (left.to >= right.from) { + return new Range(right.from, Math.min(left.to, right.to)); + } + } else if (right.to >= left.from) { + return new Range(left.from, Math.min(left.to, right.to)); + } + return null; + } + + public final long from; + public final long to; + + // Creates a range of [from, to] (from and to are inclusive; empty ranges are not valid) + Range(long from, long to) { + assert from <= to; + this.from = from; + this.to = to; + } + + long count() { + return to - from + 1; + } + + boolean isBefore(Range other) { + return to < other.from; + } + + boolean isAfter(Range other) { + return from > other.to; + } + + @Override + public String toString() { + return "[" + from + ", " + to + ']'; + } + } + + public static final RowRanges EMPTY = new RowRanges(Collections.emptyList()); + + private final List ranges; + + // Visible for the deprecated org.apache.parquet.internal.filter2.columnindex.RowRanges shim, + // which subclasses this type so the released ParquetFileReader#readFilteredRowGroup(int, RowRanges) + // signature keeps working. Remove once that shim is dropped (2.0). + protected RowRanges() { + this(new ArrayList<>()); + } + + private RowRanges(Range range) { + this(Collections.singletonList(range)); + } + + private RowRanges(List ranges) { + this.ranges = ranges; + } + + /** + * Creates an immutable RowRanges object with the single range [0, rowCount - + * 1]. + * + * @param rowCount a single row count + * @return an immutable RowRanges + */ + public static RowRanges createSingle(long rowCount) { + return new RowRanges(new Range(0L, rowCount - 1L)); + } + + /** + * Creates a mutable RowRanges object with the following ranges: + *
+   * [firstRowIndex[0], lastRowIndex[0]],
+   * [firstRowIndex[1], lastRowIndex[1]],
+   * ...,
+   * [firstRowIndex[n], lastRowIndex[n]]
+   * 
+ * (See OffsetIndex.getFirstRowIndex and OffsetIndex.getLastRowIndex for details.) + *

+ * The union of the ranges are calculated so the result ranges always contain the disjunct ranges. See union for + * details. + * + * @param rowCount row count + * @param pageIndexes pageIndexes + * @param offsetIndex offsetIndex + * @return a mutable RowRanges + */ + public static RowRanges create(long rowCount, PrimitiveIterator.OfInt pageIndexes, OffsetIndex offsetIndex) { + RowRanges ranges = new RowRanges(); + while (pageIndexes.hasNext()) { + int pageIndex = pageIndexes.nextInt(); + ranges.add(new Range( + offsetIndex.getFirstRowIndex(pageIndex), offsetIndex.getLastRowIndex(pageIndex, rowCount))); + } + return ranges; + } + + /** + * Calculates the union of the two specified RowRanges object. The union of two range is calculated if there are no + * elements between them. Otherwise, the two disjunct ranges are stored separately. + *

+   * For example:
+   * [113, 241] ∪ [221, 340] = [113, 340]
+   * [113, 230] ∪ [231, 340] = [113, 340]
+   * while
+   * [113, 230] ∪ [232, 340] = [113, 230], [232, 340]
+   * 
+ * The result RowRanges object will contain all the row indexes that were contained in one of the specified objects. + * + * @param left left RowRanges + * @param right right RowRanges + * @return a mutable RowRanges contains all the row indexes that were contained in one of the specified objects + */ + public static RowRanges union(RowRanges left, RowRanges right) { + RowRanges result = new RowRanges(); + Iterator it1 = left.ranges.iterator(); + Iterator it2 = right.ranges.iterator(); + if (it2.hasNext()) { + Range range2 = it2.next(); + while (it1.hasNext()) { + Range range1 = it1.next(); + if (range1.isAfter(range2)) { + result.add(range2); + range2 = range1; + Iterator tmp = it1; + it1 = it2; + it2 = tmp; + } else { + result.add(range1); + } + } + result.add(range2); + } else { + it2 = it1; + } + while (it2.hasNext()) { + result.add(it2.next()); + } + + return result; + } + + /** + * Calculates the intersection of the two specified RowRanges object. Two ranges intersect if they have common + * elements otherwise the result is empty. + *
+   * For example:
+   * [113, 241] ∩ [221, 340] = [221, 241]
+   * while
+   * [113, 230] ∩ [231, 340] = <EMPTY>
+   * 
+ * + * @param left left RowRanges + * @param right right RowRanges + * @return a mutable RowRanges contains all the row indexes that were contained in both of the specified objects + */ + public static RowRanges intersection(RowRanges left, RowRanges right) { + RowRanges result = new RowRanges(); + + int rightIndex = 0; + for (Range l : left.ranges) { + for (int i = rightIndex, n = right.ranges.size(); i < n; ++i) { + Range r = right.ranges.get(i); + if (l.isBefore(r)) { + break; + } else if (l.isAfter(r)) { + rightIndex = i + 1; + continue; + } + result.add(Range.intersection(l, r)); + } + } + + return result; + } + + /* + * Adds a range to the end of the list of ranges. It maintains the disjunct ascending order(*) of the ranges by + * trying to union the specified range to the last ranges in the list. The specified range shall be larger(*) than + * the last one or might be overlapped with some of the last ones. + * (*) [a, b] < [c, d] if b < c + */ + private void add(Range range) { + Range rangeToAdd = range; + for (int i = ranges.size() - 1; i >= 0; --i) { + Range last = ranges.get(i); + assert !last.isAfter(range); + Range u = Range.union(last, rangeToAdd); + if (u == null) { + break; + } + rangeToAdd = u; + ranges.remove(i); + } + ranges.add(rangeToAdd); + } + + /** + * @return the number of rows in the ranges + */ + public long rowCount() { + long cnt = 0; + for (Range range : ranges) { + cnt += range.count(); + } + return cnt; + } + + /** + * @return the ascending iterator of the row indexes contained in the ranges + */ + public PrimitiveIterator.OfLong iterator() { + return new PrimitiveIterator.OfLong() { + private int currentRangeIndex = -1; + private Range currentRange; + private long next = findNext(); + + private long findNext() { + if (currentRange == null || next + 1 > currentRange.to) { + if (currentRangeIndex + 1 < ranges.size()) { + currentRange = ranges.get(++currentRangeIndex); + next = currentRange.from; + } else { + return -1; + } + } else { + ++next; + } + return next; + } + + @Override + public boolean hasNext() { + return next >= 0; + } + + @Override + public long nextLong() { + long ret = next; + if (ret < 0) { + throw new NoSuchElementException(); + } + next = findNext(); + return ret; + } + }; + } + + /** + * @param from the first row of the range to be checked for connection + * @param to the last row of the range to be checked for connection + * @return {@code true} if the specified range is overlapping (have common elements) with one of the ranges + */ + public boolean isOverlapping(long from, long to) { + return Collections.binarySearch( + ranges, new Range(from, to), (r1, r2) -> r1.isBefore(r2) ? -1 : r1.isAfter(r2) ? 1 : 0) + >= 0; + } + + public List getRanges() { + return ranges; + } + + @Override + public String toString() { + return ranges.toString(); + } + + /** + * @return a new {@link Builder} for constructing a {@link RowRanges} from a sequence of + * selected row indices. + */ + public static Builder builder() { + return new Builder(); + } + + /** + * Constructs a {@link RowRanges} by appending selected row indices in strictly increasing + * order. Consecutive indices are coalesced into a single {@link Range}; gaps close the + * current run and start a new one. + * + *

Usage: + *

{@code
+   * RowRanges.Builder builder = RowRanges.builder();
+   * for (long row : selectedRowsInOrder) {
+   *   builder.addSelectedRow(row);
+   * }
+   * RowRanges ranges = builder.build();
+   * }
+ */ + public static final class Builder { + private final List ranges = new ArrayList<>(); + private long runStart = -1; // -1 = no active run + private long runEnd = -1; // valid iff runStart >= 0 + + private Builder() {} + + /** + * Marks {@code rowIndex} as selected. The value is a 0-based row index within the current row + * group. Must be called in strictly increasing order; calling with a value less than or equal + * to the previous call's value throws {@link IllegalArgumentException}. + * + * @param rowIndex the 0-based row index to mark selected (must be {@code >} the last value + * passed and non-negative) + * @return this builder for chaining + */ + public Builder addSelectedRow(long rowIndex) { + if (rowIndex < 0) { + throw new IllegalArgumentException("addSelectedRow requires a non-negative row index; got " + rowIndex); + } + if (runStart < 0) { + runStart = rowIndex; + runEnd = rowIndex; + } else if (rowIndex == runEnd + 1) { + runEnd = rowIndex; + } else if (rowIndex > runEnd + 1) { + ranges.add(new Range(runStart, runEnd)); + runStart = rowIndex; + runEnd = rowIndex; + } else { + throw new IllegalArgumentException("addSelectedRow requires strictly increasing row indices; got " + + rowIndex + " after " + runEnd); + } + return this; + } + + /** + * Returns a snapshot of the rows selected so far. The returned {@link RowRanges} is independent + * of this builder, so the builder may continue to be used afterwards without affecting it. + * + * @return the constructed {@link RowRanges}, or {@link RowRanges#EMPTY} when no rows were + * selected. + */ + public RowRanges build() { + List snapshot = new ArrayList<>(ranges); + if (runStart >= 0) { + snapshot.add(new Range(runStart, runEnd)); + } + if (snapshot.isEmpty()) { + return RowRanges.EMPTY; + } + return new RowRanges(snapshot); + } + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java b/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java index fd26e54d7f..e58b258157 100644 --- a/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java +++ b/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/ColumnIndexFilter.java @@ -21,6 +21,7 @@ import java.util.PrimitiveIterator; import java.util.Set; import java.util.function.Function; +import org.apache.parquet.filter2.columnindex.RowRanges; import org.apache.parquet.filter2.compat.FilterCompat; import org.apache.parquet.filter2.compat.FilterCompat.FilterPredicateCompat; import org.apache.parquet.filter2.compat.FilterCompat.NoOpFilter; diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/RowRanges.java b/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/RowRanges.java index 0b2257a6bc..30709618e4 100644 --- a/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/RowRanges.java +++ b/parquet-column/src/main/java/org/apache/parquet/internal/filter2/columnindex/RowRanges.java @@ -18,302 +18,11 @@ */ package org.apache.parquet.internal.filter2.columnindex; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import java.util.NoSuchElementException; -import java.util.PrimitiveIterator; -import java.util.Set; -import org.apache.parquet.filter2.compat.FilterCompat.Filter; -import org.apache.parquet.internal.column.columnindex.OffsetIndex; - /** - * Class representing row ranges in a row-group. These row ranges are calculated as a result of the column index based - * filtering. To be used iterate over the matching row indexes to be read from a row-group, retrieve the count of the - * matching rows or check overlapping of a row index range. - * - * @see ColumnIndexFilter#calculateRowRanges(Filter, ColumnIndexStore, Set, long) + * @deprecated moved to {@link org.apache.parquet.filter2.columnindex.RowRanges}. This type is + * retained only so that the released + * {@code ParquetFileReader#readFilteredRowGroup(int, RowRanges)} signature keeps linking; it + * will be removed in 2.0. Use {@link org.apache.parquet.filter2.columnindex.RowRanges} instead. */ -public class RowRanges { - // Make it public because some uppler layer application need to access it - public static class Range { - - // Returns the union of the two ranges or null if there are elements between them. - private static Range union(Range left, Range right) { - if (left.from <= right.from) { - if (left.to + 1 >= right.from) { - return new Range(left.from, Math.max(left.to, right.to)); - } - } else if (right.to + 1 >= left.from) { - return new Range(right.from, Math.max(left.to, right.to)); - } - return null; - } - - // Returns the intersection of the two ranges of null if they are not overlapped. - private static Range intersection(Range left, Range right) { - if (left.from <= right.from) { - if (left.to >= right.from) { - return new Range(right.from, Math.min(left.to, right.to)); - } - } else if (right.to >= left.from) { - return new Range(left.from, Math.min(left.to, right.to)); - } - return null; - } - - public final long from; - public final long to; - - // Creates a range of [from, to] (from and to are inclusive; empty ranges are not valid) - Range(long from, long to) { - assert from <= to; - this.from = from; - this.to = to; - } - - long count() { - return to - from + 1; - } - - boolean isBefore(Range other) { - return to < other.from; - } - - boolean isAfter(Range other) { - return from > other.to; - } - - @Override - public String toString() { - return "[" + from + ", " + to + ']'; - } - } - - public static final RowRanges EMPTY = new RowRanges(Collections.emptyList()); - - private final List ranges; - - private RowRanges() { - this(new ArrayList<>()); - } - - private RowRanges(Range range) { - this(Collections.singletonList(range)); - } - - private RowRanges(List ranges) { - this.ranges = ranges; - } - - /** - * Creates an immutable RowRanges object with the single range [0, rowCount - - * 1]. - * - * @param rowCount a single row count - * @return an immutable RowRanges - */ - public static RowRanges createSingle(long rowCount) { - return new RowRanges(new Range(0L, rowCount - 1L)); - } - - /** - * Creates a mutable RowRanges object with the following ranges: - *
-   * [firstRowIndex[0], lastRowIndex[0]],
-   * [firstRowIndex[1], lastRowIndex[1]],
-   * ...,
-   * [firstRowIndex[n], lastRowIndex[n]]
-   * 
- * (See OffsetIndex.getFirstRowIndex and OffsetIndex.getLastRowIndex for details.) - *

- * The union of the ranges are calculated so the result ranges always contain the disjunct ranges. See union for - * details. - * - * @param rowCount row count - * @param pageIndexes pageIndexes - * @param offsetIndex offsetIndex - * @return a mutable RowRanges - */ - public static RowRanges create(long rowCount, PrimitiveIterator.OfInt pageIndexes, OffsetIndex offsetIndex) { - RowRanges ranges = new RowRanges(); - while (pageIndexes.hasNext()) { - int pageIndex = pageIndexes.nextInt(); - ranges.add(new Range( - offsetIndex.getFirstRowIndex(pageIndex), offsetIndex.getLastRowIndex(pageIndex, rowCount))); - } - return ranges; - } - - /** - * Calculates the union of the two specified RowRanges object. The union of two range is calculated if there are no - * elements between them. Otherwise, the two disjunct ranges are stored separately. - *

-   * For example:
-   * [113, 241] ∪ [221, 340] = [113, 340]
-   * [113, 230] ∪ [231, 340] = [113, 340]
-   * while
-   * [113, 230] ∪ [232, 340] = [113, 230], [232, 340]
-   * 
- * The result RowRanges object will contain all the row indexes that were contained in one of the specified objects. - * - * @param left left RowRanges - * @param right right RowRanges - * @return a mutable RowRanges contains all the row indexes that were contained in one of the specified objects - */ - public static RowRanges union(RowRanges left, RowRanges right) { - RowRanges result = new RowRanges(); - Iterator it1 = left.ranges.iterator(); - Iterator it2 = right.ranges.iterator(); - if (it2.hasNext()) { - Range range2 = it2.next(); - while (it1.hasNext()) { - Range range1 = it1.next(); - if (range1.isAfter(range2)) { - result.add(range2); - range2 = range1; - Iterator tmp = it1; - it1 = it2; - it2 = tmp; - } else { - result.add(range1); - } - } - result.add(range2); - } else { - it2 = it1; - } - while (it2.hasNext()) { - result.add(it2.next()); - } - - return result; - } - - /** - * Calculates the intersection of the two specified RowRanges object. Two ranges intersect if they have common - * elements otherwise the result is empty. - *
-   * For example:
-   * [113, 241] ∩ [221, 340] = [221, 241]
-   * while
-   * [113, 230] ∩ [231, 340] = <EMPTY>
-   * 
- * - * @param left left RowRanges - * @param right right RowRanges - * @return a mutable RowRanges contains all the row indexes that were contained in both of the specified objects - */ - public static RowRanges intersection(RowRanges left, RowRanges right) { - RowRanges result = new RowRanges(); - - int rightIndex = 0; - for (Range l : left.ranges) { - for (int i = rightIndex, n = right.ranges.size(); i < n; ++i) { - Range r = right.ranges.get(i); - if (l.isBefore(r)) { - break; - } else if (l.isAfter(r)) { - rightIndex = i + 1; - continue; - } - result.add(Range.intersection(l, r)); - } - } - - return result; - } - - /* - * Adds a range to the end of the list of ranges. It maintains the disjunct ascending order(*) of the ranges by - * trying to union the specified range to the last ranges in the list. The specified range shall be larger(*) than - * the last one or might be overlapped with some of the last ones. - * (*) [a, b] < [c, d] if b < c - */ - private void add(Range range) { - Range rangeToAdd = range; - for (int i = ranges.size() - 1; i >= 0; --i) { - Range last = ranges.get(i); - assert !last.isAfter(range); - Range u = Range.union(last, rangeToAdd); - if (u == null) { - break; - } - rangeToAdd = u; - ranges.remove(i); - } - ranges.add(rangeToAdd); - } - - /** - * @return the number of rows in the ranges - */ - public long rowCount() { - long cnt = 0; - for (Range range : ranges) { - cnt += range.count(); - } - return cnt; - } - - /** - * @return the ascending iterator of the row indexes contained in the ranges - */ - public PrimitiveIterator.OfLong iterator() { - return new PrimitiveIterator.OfLong() { - private int currentRangeIndex = -1; - private Range currentRange; - private long next = findNext(); - - private long findNext() { - if (currentRange == null || next + 1 > currentRange.to) { - if (currentRangeIndex + 1 < ranges.size()) { - currentRange = ranges.get(++currentRangeIndex); - next = currentRange.from; - } else { - return -1; - } - } else { - ++next; - } - return next; - } - - @Override - public boolean hasNext() { - return next >= 0; - } - - @Override - public long nextLong() { - long ret = next; - if (ret < 0) { - throw new NoSuchElementException(); - } - next = findNext(); - return ret; - } - }; - } - - /** - * @param from the first row of the range to be checked for connection - * @param to the last row of the range to be checked for connection - * @return {@code true} if the specified range is overlapping (have common elements) with one of the ranges - */ - public boolean isOverlapping(long from, long to) { - return Collections.binarySearch( - ranges, new Range(from, to), (r1, r2) -> r1.isBefore(r2) ? -1 : r1.isAfter(r2) ? 1 : 0) - >= 0; - } - - public List getRanges() { - return ranges; - } - - @Override - public String toString() { - return ranges.toString(); - } -} +@Deprecated +public class RowRanges extends org.apache.parquet.filter2.columnindex.RowRanges {} diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestRowRanges.java b/parquet-column/src/test/java/org/apache/parquet/filter2/columnindex/TestRowRanges.java similarity index 56% rename from parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestRowRanges.java rename to parquet-column/src/test/java/org/apache/parquet/filter2/columnindex/TestRowRanges.java index 9c6b9f737c..c977eede24 100644 --- a/parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestRowRanges.java +++ b/parquet-column/src/test/java/org/apache/parquet/filter2/columnindex/TestRowRanges.java @@ -16,10 +16,10 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.parquet.internal.filter2.columnindex; +package org.apache.parquet.filter2.columnindex; -import static org.apache.parquet.internal.filter2.columnindex.RowRanges.intersection; -import static org.apache.parquet.internal.filter2.columnindex.RowRanges.union; +import static org.apache.parquet.filter2.columnindex.RowRanges.intersection; +import static org.apache.parquet.filter2.columnindex.RowRanges.union; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -152,4 +152,138 @@ public void testIntersection() { assertAllRowsEqual(intersection(empty, ranges2).iterator()); assertAllRowsEqual(intersection(empty, empty).iterator()); } + + @Test + public void testBuilderBasic() { + // Select rows 2, 3, 4, 5 (one contiguous run) + RowRanges ranges = RowRanges.builder() + .addSelectedRow(2) + .addSelectedRow(3) + .addSelectedRow(4) + .addSelectedRow(5) + .build(); + assertAllRowsEqual(ranges.iterator(), 2, 3, 4, 5); + assertEquals(4, ranges.rowCount()); + } + + @Test + public void testBuilderMultipleRanges() { + // Two runs: 1-2 and 5-7 + RowRanges ranges = RowRanges.builder() + .addSelectedRow(1) + .addSelectedRow(2) + .addSelectedRow(5) + .addSelectedRow(6) + .addSelectedRow(7) + .build(); + assertAllRowsEqual(ranges.iterator(), 1, 2, 5, 6, 7); + assertEquals(5, ranges.rowCount()); + assertTrue(ranges.isOverlapping(1, 2)); + assertTrue(ranges.isOverlapping(5, 7)); + assertFalse(ranges.isOverlapping(3, 4)); + } + + @Test + public void testBuilderEmpty() { + // No rows selected + RowRanges ranges = RowRanges.builder().build(); + assertEquals(RowRanges.EMPTY, ranges); + assertEquals(0, ranges.rowCount()); + assertAllRowsEqual(ranges.iterator()); + } + + @Test + public void testBuilderAllSelected() { + // Five contiguous rows starting at 0 + RowRanges.Builder builder = RowRanges.builder(); + for (long i = 0; i < 5; i++) { + builder.addSelectedRow(i); + } + RowRanges ranges = builder.build(); + assertAllRowsEqual(ranges.iterator(), 0, 1, 2, 3, 4); + assertEquals(5, ranges.rowCount()); + } + + @Test + public void testBuilderSingleRow() { + RowRanges ranges = RowRanges.builder().addSelectedRow(3).build(); + assertAllRowsEqual(ranges.iterator(), 3); + assertEquals(1, ranges.rowCount()); + assertTrue(ranges.isOverlapping(3, 3)); + assertFalse(ranges.isOverlapping(0, 2)); + assertFalse(ranges.isOverlapping(4, 10)); + } + + @Test + public void testBuilderAlternating() { + // Every other row selected: 0, 2, 4, 6, 8 — five singleton runs. + RowRanges.Builder builder = RowRanges.builder(); + for (long i = 0; i < 10; i += 2) { + builder.addSelectedRow(i); + } + RowRanges ranges = builder.build(); + assertAllRowsEqual(ranges.iterator(), 0, 2, 4, 6, 8); + assertEquals(5, ranges.rowCount()); + } + + @Test + public void testBuilderFirstAndLast() { + RowRanges ranges = + RowRanges.builder().addSelectedRow(0).addSelectedRow(99).build(); + assertAllRowsEqual(ranges.iterator(), 0, 99); + assertEquals(2, ranges.rowCount()); + } + + @Test + public void testBuilderRejectsOutOfOrder() { + RowRanges.Builder builder = RowRanges.builder().addSelectedRow(5).addSelectedRow(7); + try { + builder.addSelectedRow(6); + org.junit.Assert.fail("expected IllegalArgumentException for out-of-order index"); + } catch (IllegalArgumentException expected) { + // expected + } + } + + @Test + public void testBuilderRejectsDuplicate() { + RowRanges.Builder builder = RowRanges.builder().addSelectedRow(3); + try { + builder.addSelectedRow(3); + org.junit.Assert.fail("expected IllegalArgumentException for duplicate index"); + } catch (IllegalArgumentException expected) { + // expected + } + } + + @Test + public void testBuilderRejectsNegativeRow() { + RowRanges.Builder builder = RowRanges.builder(); + try { + builder.addSelectedRow(-1); + org.junit.Assert.fail("expected IllegalArgumentException for negative index"); + } catch (IllegalArgumentException expected) { + // expected + } + } + + @Test + public void testBuilderBuildReturnsSnapshot() { + // build() must return a snapshot: continuing to use the builder afterwards must not + // mutate a previously built result. + RowRanges.Builder builder = RowRanges.builder().addSelectedRow(0).addSelectedRow(1); + RowRanges first = builder.build(); + assertAllRowsEqual(first.iterator(), 0, 1); + assertEquals(2, first.rowCount()); + + builder.addSelectedRow(5); + RowRanges second = builder.build(); + + // The first result is unchanged. + assertAllRowsEqual(first.iterator(), 0, 1); + assertEquals(2, first.rowCount()); + // The second result reflects the additional row. + assertAllRowsEqual(second.iterator(), 0, 1, 5); + assertEquals(3, second.rowCount()); + } } diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestColumnIndexFilter.java b/parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestColumnIndexFilter.java index cf91ef4d70..91327b3357 100644 --- a/parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestColumnIndexFilter.java +++ b/parquet-column/src/test/java/org/apache/parquet/internal/filter2/columnindex/TestColumnIndexFilter.java @@ -61,6 +61,7 @@ import java.util.Set; import java.util.stream.LongStream; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.filter2.columnindex.RowRanges; import org.apache.parquet.filter2.compat.FilterCompat; import org.apache.parquet.filter2.predicate.Statistics; import org.apache.parquet.filter2.predicate.UserDefinedPredicate; diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageReadStore.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageReadStore.java index 5ddd6e443d..dfdd6c5d82 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageReadStore.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageReadStore.java @@ -43,9 +43,9 @@ import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor; import org.apache.parquet.crypto.AesCipher; import org.apache.parquet.crypto.ModuleCipherFactory.ModuleType; +import org.apache.parquet.filter2.columnindex.RowRanges; import org.apache.parquet.format.BlockCipher; import org.apache.parquet.internal.column.columnindex.OffsetIndex; -import org.apache.parquet.internal.filter2.columnindex.RowRanges; import org.apache.parquet.io.ParquetDecodingException; import org.apache.parquet.util.AutoCloseables; import org.slf4j.Logger; diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexFilterUtils.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexFilterUtils.java index e783815747..9c9e2a6cfa 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexFilterUtils.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnIndexFilterUtils.java @@ -28,9 +28,9 @@ import java.util.Formatter; import java.util.List; import java.util.Optional; +import org.apache.parquet.filter2.columnindex.RowRanges; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.internal.column.columnindex.OffsetIndex; -import org.apache.parquet.internal.filter2.columnindex.RowRanges; /** * Internal utility class to help at column index based filtering. diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java index 8355a1554a..9af4b4ac60 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java @@ -85,6 +85,7 @@ import org.apache.parquet.crypto.InternalFileDecryptor; import org.apache.parquet.crypto.ModuleCipherFactory.ModuleType; import org.apache.parquet.crypto.ParquetCryptoRuntimeException; +import org.apache.parquet.filter2.columnindex.RowRanges; import org.apache.parquet.filter2.compat.FilterCompat; import org.apache.parquet.filter2.compat.RowGroupFilter; import org.apache.parquet.format.BlockCipher; @@ -111,7 +112,6 @@ import org.apache.parquet.internal.column.columnindex.OffsetIndex; import org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter; import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; -import org.apache.parquet.internal.filter2.columnindex.RowRanges; import org.apache.parquet.internal.hadoop.metadata.IndexReference; import org.apache.parquet.io.InputFile; import org.apache.parquet.io.ParquetDecodingException; @@ -1268,6 +1268,22 @@ public ColumnChunkPageReadStore readFilteredRowGroup(int blockIndex, RowRanges r return internalReadFilteredRowGroup(block, rowRanges, getColumnIndexStore(blockIndex)); } + /** + * @param blockIndex the index of the requested block + * @param rowRanges the row ranges to be read from the requested block + * @return the PageReadStore which can provide PageReaders for each column or null if there are no rows in this block + * @throws IOException if an error occurs while reading + * @throws IllegalArgumentException if the {@code blockIndex} is invalid or the {@code rowRanges} is null + * @deprecated use {@link #readFilteredRowGroup(int, RowRanges)} with + * {@link org.apache.parquet.filter2.columnindex.RowRanges} instead. This overload is retained + * for backward compatibility and will be removed in 2.0. + */ + @Deprecated + public ColumnChunkPageReadStore readFilteredRowGroup( + int blockIndex, org.apache.parquet.internal.filter2.columnindex.RowRanges rowRanges) throws IOException { + return readFilteredRowGroup(blockIndex, (RowRanges) rowRanges); + } + /** * Read data in all parts via either vectored IO or serial IO. * @param allParts all parts to be read. diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileReaderRowRanges.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileReaderRowRanges.java index e445caf2bc..72fdd37180 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileReaderRowRanges.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileReaderRowRanges.java @@ -31,10 +31,10 @@ import org.apache.parquet.ParquetReadOptions; import org.apache.parquet.example.data.Group; import org.apache.parquet.example.data.simple.SimpleGroupFactory; +import org.apache.parquet.filter2.columnindex.RowRanges; import org.apache.parquet.hadoop.example.ExampleParquetWriter; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.util.HadoopInputFile; -import org.apache.parquet.internal.filter2.columnindex.RowRanges; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageTypeParser; import org.junit.Before; diff --git a/pom.xml b/pom.xml index f0af8f49ec..c689e6b10b 100644 --- a/pom.xml +++ b/pom.xml @@ -611,6 +611,15 @@ org.apache.parquet.avro.AvroReadSupport#AVRO_REQUESTED_PROJECTION org.apache.parquet.avro.AvroReadSupport#AVRO_DATA_SUPPLIER org.apache.parquet.hadoop.ParquetFileReader#PARQUET_READ_PARALLELISM + + org.apache.parquet.internal.filter2.columnindex.RowRanges + org.apache.parquet.internal.filter2.columnindex.RowRanges$Range + org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter