Skip to content

Commit e825678

Browse files
committed
compute: intra-ts thinning for monotonic topk
This PR implements a pre-arrangement thinning of monotonic collections that are on their way to a topk computation. This thinning has the advantage that it can be performed in a streaming fashion, even for single timestamps that might contain a lot of data. With this change, a top 3 computation over a monotonic collection whose snapshot is 10GB can be performed on machines with very little RAM, as we incrementally discard records that cannot possibly be in the top 3 as rows flow in.
1 parent 505bf1f commit e825678

3 files changed

Lines changed: 155 additions & 1 deletion

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/compute/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ once_cell = "1.16.0"
3232
prometheus = { version = "0.13.3", default-features = false }
3333
scopeguard = "1.1.0"
3434
serde = { version = "1.0.152", features = ["derive"] }
35+
smallvec = { version = "1.10.0", features = ["serde", "union"] }
3536
timely = { git = "https://github.com/TimelyDataflow/timely-dataflow", default-features = false, features = ["bincode"] }
3637
tokio = { version = "1.23.0", features = ["fs", "rt", "sync", "net"] }
3738
tracing = "0.1.37"

src/compute/src/render/top_k.rs

Lines changed: 153 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
//!
1212
//! Consult [TopKPlan] documentation for details.
1313
14+
use std::collections::HashMap;
15+
1416
use differential_dataflow::hashable::Hashable;
1517
use differential_dataflow::lattice::Lattice;
1618
use differential_dataflow::operators::arrange::ArrangeBySelf;
@@ -19,6 +21,8 @@ use differential_dataflow::operators::Consolidate;
1921
use differential_dataflow::trace::implementations::ord::OrdValSpine;
2022
use differential_dataflow::AsCollection;
2123
use differential_dataflow::Collection;
24+
use timely::dataflow::channels::pact::Pipeline;
25+
use timely::dataflow::operators::Operator;
2226
use timely::dataflow::Scope;
2327

2428
use mz_compute_client::plan::top_k::{
@@ -56,6 +60,17 @@ where
5660
arity,
5761
limit,
5862
}) => {
63+
// For monotonic inputs, we are able to thin the input relation in two stages:
64+
// 1. First, we can do an intra-timestamp thinning which has the advantage of
65+
// being computed in a streaming fashion, even for the initial snapshot.
66+
// 2. Then, we can do inter-timestamp thinning by feeding back negations for
67+
// any records that have been invalidated.
68+
let ok_input = if let Some(limit) = limit {
69+
render_intra_ts_thinning(ok_input, order_key.clone(), limit)
70+
} else {
71+
ok_input
72+
};
73+
5974
// For monotonic inputs, we are able to retract inputs that can no longer be produced
6075
// as outputs. Any inputs beyond `offset + limit` will never again be produced as
6176
// outputs, and can be removed. The simplest form of this is when `offset == 0` and
@@ -65,7 +80,7 @@ where
6580
// of `offset` and `limit`, discarding only the records not produced in the intermediate
6681
// stage.
6782
use differential_dataflow::operators::iterate::Variable;
68-
let delay = std::time::Duration::from_nanos(10_000_000_000);
83+
let delay = std::time::Duration::from_secs(10);
6984
let retractions = Variable::new(
7085
&mut ok_input.scope(),
7186
<G::Timestamp as crate::render::RenderTimestamp>::system_delay(
@@ -317,6 +332,143 @@ where
317332
// TODO(#7331): Here we discard the arranged output.
318333
result.as_collection(|_k, v| v.clone())
319334
}
335+
336+
/// Thins a monotonic collection on its way to a top-k computation, within each
/// timestamp, by incrementally discarding records that can no longer be among
/// the first `limit` entries under `order_key`. Because the accumulation is
/// bounded per timestamp, the full snapshot never needs to be resident at once.
///
/// Records are wrapped in `monoids::Top1Monoid` so that `Ord` on the wrapper
/// reflects `order_key`, and accumulated into a `topk_agg::TopKBatch` per
/// record time; the batch caps retained diffs at `limit`.
fn render_intra_ts_thinning<G>(
    collection: Collection<G, Row, Diff>,
    order_key: Vec<mz_expr::ColumnOrder>,
    limit: usize,
) -> Collection<G, Row, Diff>
where
    G: Scope,
    G::Timestamp: Lattice,
{
    // Pending state: capability time -> (record time -> top-k accumulation).
    // Entries are drained when the capability's time is complete.
    let mut aggregates = HashMap::new();
    // Reused buffer for incoming batches, to avoid reallocating per batch.
    let mut vector = Vec::new();
    collection
        .inner
        .unary_notify(
            Pipeline,
            "TopKIntraTimeThinning",
            [],
            move |input, output, notificator| {
                while let Some((time, data)) = input.next() {
                    data.swap(&mut vector);
                    let agg_time = aggregates
                        .entry(time.time().clone())
                        .or_insert_with(HashMap::new);
                    for (row, record_time, diff) in vector.drain(..) {
                        // NOTE(review): `order_key` is cloned per record here;
                        // presumably acceptable for now, but worth confirming
                        // it does not dominate the thinning cost.
                        let monoid = monoids::Top1Monoid {
                            row,
                            order_key: order_key.clone(),
                        };
                        let topk = agg_time.entry(record_time).or_insert_with(move || {
                            topk_agg::TopKBatch::new(limit.try_into().expect("must fit"))
                        });
                        topk.update(monoid, diff);
                    }
                    // Hold the capability so we can emit once `time` is complete.
                    notificator.notify_at(time.retain());
                }

                // Pop completed aggregates and emit whatever each batch retained.
                notificator.for_each(|time, _, _| {
                    if let Some(aggs) = aggregates.remove(time.time()) {
                        let mut session = output.session(&time);
                        for (record_time, topk) in aggs {
                            session.give_iterator(topk.into_iter().map(
                                |(monoid, diff)| (monoid.row, record_time.clone(), diff),
                            ))
                        }
                    }
                });
            },
        )
        .as_collection()
}
387+
}
388+
}
389+
390+
/// Types for in-place intra-ts aggregation of monotonic streams.
391+
pub mod topk_agg {
392+
use smallvec::SmallVec;
393+
394+
pub struct TopKBatch<T> {
395+
updates: SmallVec<[(T, i64); 16]>,
396+
clean: usize,
397+
limit: i64,
398+
}
399+
400+
impl<T: Ord> TopKBatch<T> {
401+
pub fn new(limit: i64) -> Self {
402+
Self {
403+
updates: SmallVec::new(),
404+
clean: 0,
405+
limit,
406+
}
407+
}
408+
409+
/// Adds a new update, for `item` with `value`.
410+
///
411+
/// This could be optimized to perform compaction when the number of "dirty" elements exceeds
412+
/// half the length of the list, which would keep the total footprint within reasonable bounds
413+
/// even under an arbitrary number of updates. This has a cost, and it isn't clear whether it
414+
/// is worth paying without some experimentation.
415+
#[inline]
416+
pub fn update(&mut self, item: T, value: i64) {
417+
self.updates.push((item, value));
418+
self.maintain_bounds();
419+
}
420+
421+
/// Compact the internal representation.
422+
///
423+
/// This method sort `self.updates` and consolidates elements with equal item, discarding
424+
/// any whose accumulation is zero. It is optimized to only do this if the number of dirty
425+
/// elements is non-zero.
426+
#[inline]
427+
pub fn compact(&mut self) {
428+
if self.clean < self.updates.len() && self.updates.len() > 1 {
429+
self.updates.sort_by(|x, y| x.0.cmp(&y.0));
430+
for i in 0..self.updates.len() - 1 {
431+
if self.updates[i].0 == self.updates[i + 1].0 {
432+
self.updates[i + 1].1 += self.updates[i].1;
433+
self.updates[i].1 = 0;
434+
}
435+
}
436+
let mut limit = self.limit;
437+
self.updates.retain(|x| {
438+
if limit > 0 {
439+
limit -= x.1;
440+
true
441+
} else {
442+
false
443+
}
444+
});
445+
// Adjust the diff of the last record that was retained so that we have exactly K
446+
// records
447+
if let Some(item) = self.updates.last_mut() {
448+
item.1 -= -limit;
449+
}
450+
}
451+
self.clean = self.updates.len();
452+
}
453+
454+
/// Maintain the bounds of pending (non-compacted) updates versus clean (compacted) data.
455+
/// This function tries to minimize work by only compacting if enough work has accumulated.
456+
fn maintain_bounds(&mut self) {
457+
// if we have more than 32 elements and at least half of them are not clean, compact
458+
if self.updates.len() > 32 && self.updates.len() >> 1 >= self.clean {
459+
self.compact()
460+
}
461+
}
462+
}
463+
464+
impl<T: Ord> IntoIterator for TopKBatch<T> {
465+
type Item = (T, i64);
466+
type IntoIter = smallvec::IntoIter<[(T, i64); 16]>;
467+
468+
fn into_iter(mut self) -> Self::IntoIter {
469+
self.compact();
470+
self.updates.into_iter()
471+
}
320472
}
321473
}
322474

0 commit comments

Comments
 (0)