Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 69 additions & 34 deletions datafusion/physical-plan/src/sorts/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
use crate::spill::get_record_batch_memory_size;
use arrow::compute::interleave;
use arrow::datatypes::SchemaRef;
use arrow::error::ArrowError;
use arrow::record_batch::RecordBatch;
use datafusion_common::Result;
use datafusion_common::{DataFusionError, Result};
use datafusion_execution::memory_pool::MemoryReservation;
use std::sync::Arc;

Expand Down Expand Up @@ -104,53 +105,87 @@ impl BatchBuilder {
&self.schema
}

/// Attempt to interleave every column of the stored batches according to
/// `indices`, producing one output array per schema field.
///
/// Each `(batch_idx, row_idx)` entry in `indices` selects a single row;
/// any error from the underlying `interleave` kernel (e.g. offset
/// overflow) is propagated to the caller.
fn try_interleave_columns(
    &self,
    indices: &[(usize, usize)],
) -> Result<Vec<Arc<dyn arrow::array::Array>>> {
    let field_count = self.schema.fields.len();
    let mut columns = Vec::with_capacity(field_count);
    for column_idx in 0..field_count {
        // Gather this column from every in-flight batch, in order, so the
        // (batch, row) pairs in `indices` resolve against the right arrays.
        let sources: Vec<_> = self
            .batches
            .iter()
            .map(|(_, batch)| batch.column(column_idx).as_ref())
            .collect();
        columns.push(interleave(&sources, indices)?);
    }
    Ok(columns)
}

/// Drains the in_progress row indexes, and builds a new RecordBatch from them
///
/// Will then drop any batches for which all rows have been yielded to the output
/// Will then drop any batches for which all rows have been yielded to the output.
/// If an offset overflow occurs (e.g. string/list offsets exceed i32::MAX),
/// retries with progressively fewer rows until it succeeds.
///
/// Returns `None` if no pending rows
pub fn build_record_batch(&mut self) -> Result<Option<RecordBatch>> {
if self.is_empty() {
return Ok(None);
}

let columns = (0..self.schema.fields.len())
.map(|column_idx| {
let arrays: Vec<_> = self
.batches
.iter()
.map(|(_, batch)| batch.column(column_idx).as_ref())
.collect();
Ok(interleave(&arrays, &self.indices)?)
})
.collect::<Result<Vec<_>>>()?;

self.indices.clear();

// New cursors are only created once the previous cursor for the stream
// is finished. This means all remaining rows from all but the last batch
// for each stream have been yielded to the newly created record batch
//
// We can therefore drop all but the last batch for each stream
let mut batch_idx = 0;
let mut retained = 0;
self.batches.retain(|(stream_idx, batch)| {
let stream_cursor = &mut self.cursors[*stream_idx];
let retain = stream_cursor.batch_idx == batch_idx;
batch_idx += 1;

if retain {
stream_cursor.batch_idx = retained;
retained += 1;
} else {
self.reservation.shrink(get_record_batch_memory_size(batch));
// Try with progressively fewer rows on offset overflow.
let mut end = self.indices.len();
let columns = loop {
match self.try_interleave_columns(&self.indices[..end]) {
Ok(columns) => break columns,
Err(e) if is_offset_overflow(&e) && end > 1 => {
end /= 2;
}
Err(e) => return Err(e),
}
retain
});
};

// Remove consumed indices, keeping any remaining for the next call.
self.indices.drain(..end);

// Only clean up fully-consumed batches when all indices are drained,
// because remaining indices may still reference earlier batches.
if self.indices.is_empty() {
// New cursors are only created once the previous cursor for the stream
// is finished. This means all remaining rows from all but the last batch
// for each stream have been yielded to the newly created record batch
//
// We can therefore drop all but the last batch for each stream
let mut batch_idx = 0;
let mut retained = 0;
self.batches.retain(|(stream_idx, batch)| {
let stream_cursor = &mut self.cursors[*stream_idx];
let retain = stream_cursor.batch_idx == batch_idx;
batch_idx += 1;

if retain {
stream_cursor.batch_idx = retained;
retained += 1;
} else {
self.reservation.shrink(get_record_batch_memory_size(batch));
}
retain
});
}

Ok(Some(RecordBatch::try_new(
Arc::clone(&self.schema),
columns,
)?))
}
}

/// Returns `true` if the error is an Arrow offset overflow error.
fn is_offset_overflow(e: &DataFusionError) -> bool {
    // Unwrap the DataFusion wrapper first, then inspect the boxed
    // ArrowError; every other error kind is not an overflow.
    match e {
        DataFusionError::ArrowError(inner, _) => {
            matches!(inner.as_ref(), ArrowError::OffsetOverflowError(_))
        }
        _ => false,
    }
}
6 changes: 4 additions & 2 deletions datafusion/physical-plan/src/sorts/merge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -288,9 +288,11 @@ impl<C: CursorValues> SortPreservingMergeStream<C> {
}
}

self.produced += self.in_progress.len();
let before = self.in_progress.len();
let result = self.in_progress.build_record_batch();
self.produced += before - self.in_progress.len();

return Poll::Ready(self.in_progress.build_record_batch().transpose());
return Poll::Ready(result.transpose());
}
}

Expand Down
Loading