Skip to content
This repository was archived by the owner on Jun 14, 2024. It is now read-only.

Commit e6a4931

Browse files
author
Chungmin Lee
committed
Data Skipping Index Part 5: ValueListSketch
1 parent 99304dc commit e6a4931

10 files changed

Lines changed: 984 additions & 3 deletions

File tree

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
/*
2+
* Copyright (2021) The Hyperspace Project Authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.microsoft.hyperspace.index.dataskipping.expressions
18+
19+
import org.apache.spark.sql.catalyst.InternalRow
20+
import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, Expression, Predicate}
21+
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode}
22+
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
23+
import org.apache.spark.sql.catalyst.util.{ArrayData, TypeUtils}
24+
25+
/**
26+
* Returns true if the sorted array (left) contains the value (right).
27+
*
28+
* If the value (right) is null, null is returned.
29+
*
30+
* Preconditions (unchecked):
31+
* - The array must not be null.
32+
* - Elements in the array must be in ascending order.
33+
* - The array must not contain null elements.
34+
* - The array must not contain duplicate elements.
35+
*/
36+
/**
 * Returns true if the sorted array (left) contains the value (right).
 *
 * If the value (right) is null, null is returned.
 *
 * Preconditions (unchecked):
 *   - The array must not be null.
 *   - Elements in the array must be in ascending order.
 *   - The array must not contain null elements.
 *   - The array must not contain duplicate elements.
 */
private[dataskipping] case class SortedArrayContains(left: Expression, right: Expression)
  extends BinaryExpression
  with Predicate {

  override def prettyName: String = "sorted_array_contains"

  // Result is null exactly when the search value (right) evaluates to null.
  override def nullable: Boolean = true

  override def eval(input: InternalRow): Any = {
    val value = right.eval(input)
    if (value == null) {
      null
    } else {
      val arr = left.eval(input).asInstanceOf[ArrayData]
      val dt = right.dataType
      val numElems = arr.numElements()
      // Cheap bounds check first: the value can only be present when it lies
      // between the smallest (first) and largest (last) elements.
      val withinBounds = numElems > 0 &&
        ordering.lteq(arr.get(0, dt), value) &&
        ordering.lteq(value, arr.get(numElems - 1, dt))
      withinBounds &&
        SortedArrayUtils.binarySearch(arr, dt, ordering, 0, numElems, value)._1
    }
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val arrayGen = left.genCode(ctx)
    val arr = arrayGen.value
    val valueGen = right.genCode(ctx)
    val value = valueGen.value
    val dt = right.dataType
    val n = ctx.freshName("n")
    val binarySearch = SortedArrayUtils.binarySearchCodeGen(ctx, dt)
    // Mirrors eval: when the value is null the result stays null; otherwise the
    // binary search runs only if the value falls within [first, last].
    val resultCode =
      s"""
         |if (!(${valueGen.isNull})) {
         |  ${arrayGen.code}
         |  ${ev.isNull} = false;
         |  int $n = $arr.numElements();
         |  if ($n > 0 &&
         |      !(${ctx.genGreater(dt, CodeGenerator.getValue(arr, dt, "0"), value)}) &&
         |      !(${ctx.genGreater(dt, value, CodeGenerator.getValue(arr, dt, s"$n - 1"))})) {
         |    ${ev.value} = $binarySearch($arr, 0, $n, $value).found();
         |  }
         |}
       """.stripMargin
    ev.copy(code = code"""
      ${valueGen.code}
      boolean ${ev.isNull} = true;
      boolean ${ev.value} = false;
      $resultCode""")
  }

  // Interpreted ordering for the element type; lazily created because it is
  // only needed on the non-codegen (eval) path.
  @transient private lazy val ordering: Ordering[Any] =
    TypeUtils.getInterpretedOrdering(right.dataType)
}
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
/*
2+
* Copyright (2021) The Hyperspace Project Authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.microsoft.hyperspace.index.dataskipping.expressions
18+
19+
import org.apache.spark.sql.catalyst.InternalRow
20+
import org.apache.spark.sql.catalyst.expressions.{Expression, Predicate, UnaryExpression}
21+
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral}
22+
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
23+
import org.apache.spark.sql.catalyst.util.{ArrayData, TypeUtils}
24+
import org.apache.spark.sql.types.DataType
25+
26+
/**
27+
* Returns true if the sorted array (child) contains any of the values.
28+
*
29+
* If either array is empty, false is returned.
30+
*
31+
* Preconditions (unchecked):
32+
* - Both arrays must not be null.
33+
* - Elements in the arrays must be in ascending order.
34+
* - The left array should not contain duplicate elements.
35+
* - The arrays must not contain null elements.
36+
*
37+
* If the element type can be represented as a primitive type in Scala,
38+
* then the right array must be an array of the primitive type.
39+
*/
40+
/**
 * Returns true if the sorted array (child) contains any of the values.
 *
 * If either array is empty, false is returned.
 *
 * Preconditions (unchecked):
 *   - Both arrays must not be null.
 *   - Elements in the arrays must be in ascending order.
 *   - The left array should not contain duplicate elements.
 *   - The arrays must not contain null elements.
 *
 * If the element type can be represented as a primitive type in Scala,
 * then the right array must be an array of the primitive type.
 */
private[dataskipping] case class SortedArrayContainsAny(
    child: Expression,
    values: Any,
    elementType: DataType)
  extends UnaryExpression
  with Predicate {

  override def prettyName: String = "sorted_array_contains_any"

  // Inputs are never null by precondition, so the result never is either.
  override def nullable: Boolean = false

  override def eval(input: InternalRow): Boolean = {
    val sorted = child.eval(input).asInstanceOf[ArrayData]
    val probes = values.asInstanceOf[Array[_]]
    val dt = elementType
    val sortedLen = sorted.numElements()
    val probeLen = probes.length
    // Quick overlap test: the two sorted ranges can only intersect when
    // sorted.first <= probes.last and probes.first <= sorted.last.
    if (sortedLen > 0 && probeLen > 0 &&
      ordering.lteq(sorted.get(0, dt), probes(probeLen - 1)) &&
      ordering.lteq(probes(0), sorted.get(sortedLen - 1, dt))) {
      var lo = 0 // lower bound into the sorted array for the next search
      var p = 0 // cursor into the probe values
      do {
        val floor = sorted.get(lo, dt)
        // Skip probe values smaller than anything left in the sorted array.
        while (p < probeLen && ordering.lt(probes(p), floor)) p += 1
        if (p == probeLen) return false
        val candidate = probes(p)
        p += 1
        // Binary search only in the not-yet-ruled-out suffix [lo, sortedLen).
        val (found, pos) = SortedArrayUtils.binarySearch(sorted, dt, ordering, lo, sortedLen, candidate)
        if (found) return true
        if (pos == sortedLen) return false
        lo = pos
      } while (p < probeLen)
    }
    false
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val arrGen = child.genCode(ctx)
    val sortedRef = arrGen.value
    val probesRef = ctx.freshName("values")
    val dt = elementType
    val javaType = CodeGenerator.javaType(dt)
    // Boxed element types are carried as Object[]; primitives as e.g. int[].
    val arrayType = if (values.isInstanceOf[Array[Any]]) "java.lang.Object[]" else s"$javaType[]"
    val valuesRef = ctx.addReferenceObj("values", values, arrayType)
    val nVar = ctx.freshName("n")
    val mVar = ctx.freshName("m")
    val iVar = ctx.freshName("i")
    val jVar = ctx.freshName("j")
    val vVar = ctx.freshName("v")
    val uVar = ctx.freshName("u")
    val resultVar = ctx.freshName("result")
    val binarySearchResultType =
      SortedArrayUtils.BinarySearchResult.getClass.getCanonicalName.stripSuffix("$")
    val binarySearch = SortedArrayUtils.binarySearchCodeGen(ctx, dt)
    import CodeGenerator.getValue
    // Generated Java follows the same dual-cursor walk as eval above.
    val resultCode =
      s"""
         |int $nVar = $sortedRef.numElements();
         |int $mVar = $probesRef.length;
         |if ($nVar > 0 && $mVar > 0 &&
         |    !(${ctx.genGreater(dt, getValue(sortedRef, dt, "0"), s"(($javaType) $probesRef[$mVar - 1])")}) &&
         |    !(${ctx.genGreater(dt, s"(($javaType)$probesRef[0])", getValue(sortedRef, dt, s"$nVar - 1"))})) {
         |  int $iVar = 0;
         |  int $jVar = 0;
         |  do {
         |    $javaType $vVar = ${getValue(sortedRef, dt, iVar)};
         |    while ($jVar < $mVar && ${ctx.genGreater(dt, vVar, s"(($javaType) $probesRef[$jVar])")}) $jVar += 1;
         |    if ($jVar == $mVar) break;
         |    $javaType $uVar = ($javaType) $probesRef[$jVar];
         |    $jVar += 1;
         |    $binarySearchResultType $resultVar = $binarySearch($sortedRef, $iVar, $nVar, $uVar);
         |    if ($resultVar.found()) {
         |      ${ev.value} = true;
         |      break;
         |    }
         |    if ($resultVar.index() == $nVar) break;
         |    $iVar = $resultVar.index();
         |  } while ($jVar < $mVar);
         |}
       """.stripMargin
    ev.copy(
      code = code"""
      ${arrGen.code}
      $arrayType $probesRef = $valuesRef;
      boolean ${ev.value} = false;
      $resultCode""",
      isNull = FalseLiteral)
  }

  // Interpreted ordering for the element type; used only on the eval path.
  @transient private lazy val ordering: Ordering[Any] =
    TypeUtils.getInterpretedOrdering(elementType)
}
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
/*
2+
* Copyright (2021) The Hyperspace Project Authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.microsoft.hyperspace.index.dataskipping.sketches
18+
19+
import org.apache.spark.sql.catalyst.expressions._
20+
import org.apache.spark.sql.catalyst.expressions.aggregate.CollectSet
21+
import org.apache.spark.sql.catalyst.util.TypeUtils
22+
import org.apache.spark.sql.types.{ArrayType, DataType}
23+
24+
import com.microsoft.hyperspace.index.dataskipping.expressions._
25+
import com.microsoft.hyperspace.index.dataskipping.util.ArrayUtils
26+
27+
/**
28+
* Sketch based on distinct values for a given expression.
29+
*
30+
* This is not really a sketch, as it stores all distinct values for a given
31+
* expression. It can be useful when the number of distinct values is expected to
32+
* be small and each file tends to store only a subset of the values.
33+
*/
34+
/**
 * Sketch based on distinct values for a given expression.
 *
 * This is not really a sketch, as it stores all distinct values for a given
 * expression. It can be useful when the number of distinct values is expected to
 * be small and each file tends to store only a subset of the values.
 */
case class ValueListSketch(
    override val expr: String,
    override val dataType: Option[DataType] = None)
  extends SingleExprSketch[ValueListSketch](expr, dataType) {
  override def name: String = "ValueList"

  override def withNewExpression(newExpr: (String, Option[DataType])): ValueListSketch = {
    copy(expr = newExpr._1, dataType = newExpr._2)
  }

  // Stored sketch value: the sorted array of distinct values per file.
  override def aggregateFunctions: Seq[Expression] =
    new ArraySort(CollectSet(parsedExpr).toAggregateExpression()) :: Nil

  override def convertPredicate(
      predicate: Expression,
      resolvedExprs: Seq[Expression],
      sketchValues: Seq[Expression],
      nameMap: Map[ExprId, String],
      valueExtractor: ExpressionExtractor): Option[Expression] = {
    val sortedValues = sketchValues.head
    // element_at is 1-based; -1 addresses the last element, so these are the
    // minimum and maximum of the sorted distinct-value list.
    val minValue = ElementAt(sortedValues, Literal(1))
    val maxValue = ElementAt(sortedValues, Literal(-1))
    // TODO: Consider shared sketches
    // HasNullSketch as described in MinMaxSketch.convertPredicate
    // can be useful for ValueListSketch too, as it can be used
    // to optimize Not(EqualTo) as well as IsNull.
    val resolvedExpr = resolvedExprs.head
    val elemType = resolvedExpr.dataType
    val exprExtractor = NormalizedExprExtractor(resolvedExpr, nameMap)
    val ExprIsTrue = IsTrueExtractor(exprExtractor)
    val ExprIsFalse = IsFalseExtractor(exprExtractor)
    val ExprIsNotNull = IsNotNullExtractor(exprExtractor)
    val ExprEqualTo = EqualToExtractor(exprExtractor, valueExtractor)
    val ExprEqualNullSafe = EqualNullSafeExtractor(exprExtractor, valueExtractor)
    val ExprLessThan = LessThanExtractor(exprExtractor, valueExtractor)
    val ExprLessThanOrEqualTo = LessThanOrEqualExtractor(exprExtractor, valueExtractor)
    val ExprGreaterThan = GreaterThanExtractor(exprExtractor, valueExtractor)
    val ExprGreaterThanOrEqualTo = GreaterThanOrEqualExtractor(exprExtractor, valueExtractor)
    val ExprIn = InExtractor(exprExtractor, valueExtractor)
    val ExprInSet = InSetExtractor(exprExtractor)
    // True when the per-file distinct-value list is empty (all values null).
    def Empty(arr: Expression) = EqualTo(Size(arr), Literal(0))
    Option(predicate).collect {
      case ExprIsTrue(_) => ArrayContains(sortedValues, Literal(true))
      case ExprIsFalse(_) => ArrayContains(sortedValues, Literal(false))
      case ExprIsNotNull(_) => Not(Empty(sortedValues))
      case ExprEqualTo(_, v) => SortedArrayContains(sortedValues, v)
      case ExprEqualNullSafe(_, v) => Or(IsNull(v), SortedArrayContains(sortedValues, v))
      // A file can be skipped for `expr != v` only when its single distinct
      // value equals v; keep it when the list is larger or differs.
      case Not(ExprEqualTo(_, v)) =>
        And(
          IsNotNull(v),
          Or(
            GreaterThan(Size(sortedValues), Literal(1)),
            Not(EqualTo(ElementAt(sortedValues, Literal(1)), v))))
      case ExprLessThan(_, v) => LessThan(minValue, v)
      case ExprLessThanOrEqualTo(_, v) => LessThanOrEqual(minValue, v)
      case ExprGreaterThan(_, v) => GreaterThan(maxValue, v)
      case ExprGreaterThanOrEqualTo(_, v) => GreaterThanOrEqual(maxValue, v)
      case ExprIn(_, vs) =>
        vs.map(v => SortedArrayContains(sortedValues, v)).reduceLeft(Or)
      case ExprInSet(_, vs) =>
        SortedArrayContainsAny(
          sortedValues,
          ArrayUtils.toArray(
            vs.filter(_ != null).toArray.sorted(TypeUtils.getInterpretedOrdering(elemType)),
            elemType),
          elemType)
      // TODO: StartsWith, Like with constant prefix
    }
  }
}

src/test/scala/com/microsoft/hyperspace/index/dataskipping/DataSkippingIndexConfigTest.scala

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
package com.microsoft.hyperspace.index.dataskipping
1818

1919
import org.apache.hadoop.fs.Path
20-
import org.apache.spark.sql.functions.{input_file_name, max, min}
20+
import org.apache.spark.sql.functions.{array_sort, collect_set, input_file_name, max, min}
2121
import org.apache.spark.sql.types.{IntegerType, LongType, StringType}
2222
import org.apache.spark.util.sketch.BloomFilter
2323

@@ -86,6 +86,19 @@ class DataSkippingIndexConfigTest extends DataSkippingSuite with BloomFilterTest
8686
checkAnswer(indexData, withFileId(expectedSketchValues))
8787
}
8888

89+
test("createIndex works correctly with a ValueListSketch.") {
  // 100 rows, 10 distinct values of A (0..9), so each file stores a small
  // subset of the distinct values — the intended use case for ValueListSketch.
  val source =
    createSourceData(spark.range(100).selectExpr("cast(id / 10 as int) as A").toDF)
  val config = DataSkippingIndexConfig("MyIndex", ValueListSketch("A"))
  val (index, indexData) = config.createIndex(ctx, source, Map())
  // The sketch's data type must have been resolved from the source schema.
  assert(index.sketches === Seq(ValueListSketch("A", Some(IntegerType))))
  // Index data should hold the sorted distinct values of A per source file.
  val expected = source
    .groupBy(input_file_name().as(fileNameCol))
    .agg(array_sort(collect_set("A")))
  checkAnswer(indexData, withFileId(expected))
  assert(indexData.columns === Seq(IndexConstants.DATA_FILE_NAME_ID, "ValueList_A__0"))
}
101+
89102
test("createIndex works correctly with a BloomFilterSketch.") {
90103
val sourceData = createSourceData(spark.range(100).toDF("A"))
91104
val indexConfig = DataSkippingIndexConfig("MyIndex", BloomFilterSketch("A", 0.001, 20))

0 commit comments

Comments
 (0)