44 changes: 41 additions & 3 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -378,19 +378,19 @@ const double test_traits<::arrow::DoubleType>::value(4.2);
template <>
struct test_traits<::arrow::StringType> {
static constexpr ParquetType::type parquet_enum = ParquetType::BYTE_ARRAY;
static std::string const value;
static const std::string value;
};

template <>
struct test_traits<::arrow::BinaryType> {
static constexpr ParquetType::type parquet_enum = ParquetType::BYTE_ARRAY;
static std::string const value;
static const std::string value;
};

template <>
struct test_traits<::arrow::FixedSizeBinaryType> {
static constexpr ParquetType::type parquet_enum = ParquetType::FIXED_LEN_BYTE_ARRAY;
static std::string const value;
static const std::string value;
};

const std::string test_traits<::arrow::StringType>::value("Test"); // NOLINT
@@ -5794,6 +5794,44 @@ TEST(TestArrowReadWrite, WriteRecordBatchNotProduceEmptyRowGroup) {
}
}

TEST(TestArrowReadWrite, FlushRowGroupByBufferedSize) {
auto pool = ::arrow::default_memory_pool();
auto sink = CreateOutputStream();
// Limit the max bytes in a row group to 10 so that each batch produces a new group.
auto writer_properties = WriterProperties::Builder().max_row_group_bytes(10)->build();
auto arrow_writer_properties = default_arrow_writer_properties();

// Prepare schema
auto schema = ::arrow::schema({::arrow::field("a", ::arrow::int64())});
std::shared_ptr<SchemaDescriptor> parquet_schema;
ASSERT_OK_NO_THROW(ToParquetSchema(schema.get(), *writer_properties,
*arrow_writer_properties, &parquet_schema));
auto schema_node = std::static_pointer_cast<GroupNode>(parquet_schema->schema_root());

auto gen = ::arrow::random::RandomArrayGenerator(/*seed=*/42);

// Create writer to write data via RecordBatch.
auto writer = ParquetFileWriter::Open(sink, schema_node, writer_properties);
std::unique_ptr<FileWriter> arrow_writer;
ASSERT_OK(FileWriter::Make(pool, std::move(writer), schema, arrow_writer_properties,
&arrow_writer));
// NewBufferedRowGroup() is not called explicitly; it will be called
// inside WriteRecordBatch().
for (int i = 0; i < 5; ++i) {
auto record_batch =
gen.BatchOf({::arrow::field("a", ::arrow::int64())}, /*length=*/1);
ASSERT_OK_NO_THROW(arrow_writer->WriteRecordBatch(*record_batch));
}
ASSERT_OK_NO_THROW(arrow_writer->Close());
ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());

auto file_metadata = arrow_writer->metadata();
EXPECT_EQ(5, file_metadata->num_row_groups());
for (int i = 0; i < 5; ++i) {
EXPECT_EQ(1, file_metadata->RowGroup(i)->num_rows());
}
}

TEST(TestArrowReadWrite, MultithreadedWrite) {
const int num_columns = 20;
const int num_rows = 1000;
35 changes: 21 additions & 14 deletions cpp/src/parquet/arrow/writer.cc
@@ -442,12 +442,8 @@ class FileWriterImpl : public FileWriter {
return Status::OK();
}

// Max number of rows allowed in a row group.
const int64_t max_row_group_length = this->properties().max_row_group_length();

// Initialize a new buffered row group writer if necessary.
if (row_group_writer_ == nullptr || !row_group_writer_->buffered() ||
row_group_writer_->num_rows() >= max_row_group_length) {
if (row_group_writer_ == nullptr || !row_group_writer_->buffered()) {
RETURN_NOT_OK(NewBufferedRowGroup());
}

@@ -480,17 +476,28 @@
return Status::OK();
};

// Max number of rows allowed in a row group.
const int64_t max_row_group_length = this->properties().max_row_group_length();
// Max number of bytes allowed in a row group.
const int64_t max_row_group_bytes = this->properties().max_row_group_bytes();

int64_t offset = 0;
while (offset < batch.num_rows()) {
const int64_t batch_size =
std::min(max_row_group_length - row_group_writer_->num_rows(),
batch.num_rows() - offset);
RETURN_NOT_OK(WriteBatch(offset, batch_size));
offset += batch_size;

// Flush current row group writer and create a new writer if it is full.
if (row_group_writer_->num_rows() >= max_row_group_length &&
offset < batch.num_rows()) {
int64_t group_rows = row_group_writer_->num_rows();
int64_t batch_size =
std::min(max_row_group_length - group_rows, batch.num_rows() - offset);
if (group_rows > 0) {
int64_t buffered_bytes = row_group_writer_->current_buffered_bytes();
double avg_row_bytes = buffered_bytes * 1.0 / group_rows;
batch_size = std::min(
batch_size,
static_cast<int64_t>((max_row_group_bytes - buffered_bytes) / avg_row_bytes));
}
if (batch_size > 0) {
RETURN_NOT_OK(WriteBatch(offset, batch_size));
offset += batch_size;
} else if (offset < batch.num_rows()) {
// Current row group is full, write remaining rows in a new group.
Member: Will it cause an infinite loop at this line if batch_size is always 0?

Contributor Author: It would cause an infinite loop only when max_row_group_bytes / avg_row_size is 0. Would it be OK to return an Invalid status from WriteXxx() in that case?

        if (batch_size == 0 && row_group_writer_->num_rows() == 0) {
          return Status::Invalid(
              "Configured max_row_group_bytes is too small to hold a single row");
        }

Member: We cannot accept an infinite loop, so perhaps we have to set the minimum batch size to 1 in this case?

Contributor Author: Setting the minimum batch size to 1 is not reasonable: when buffered_bytes > max_row_group_bytes we would still set the batch size to 1, continually appending one row to the active row group and never creating a new one. Returning an Invalid status seems more intuitive.

Member: Shouldn't we check the row group size after writing each batch? If a large per-row size leads to a batch size of 1, we just end up checking the row group size after writing every row.

Contributor Author: We do not check the row group size after writing a batch; the current write logic is:

  1. Check rows and bytes to determine the current batch_size.
  2. If batch_size > 0, write these rows to the current row group; they are guaranteed not to exceed the row group limits.
  3. If batch_size == 0 and there are still rows to write, create a new row group.
  4. Loop back to step 1.

This way we do not need to check the size after writing (guaranteed by step 1), and we never leave an empty row group at the end of the batch (guaranteed by step 3).

Member: How do you achieve step 4 above ("loop back to step 1")? This looks a bit complicated to me. How about the logic below (see the sketch after this hunk)?

For each loop iteration:

  1. Check the row group thresholds (both rows and bytes) and append a new row group if needed.
  2. Compute a new batch_size based on the different conditions and set its minimum to 1 so we never get an empty batch.
  3. Write the batch as before.

RETURN_NOT_OK(NewBufferedRowGroup());
}
}
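
A minimal sketch of the reviewer's proposed restructure (hypothetical; it reuses the names from this diff but is not the code under review):

    // Check thresholds first, then always write at least one row so the loop
    // is guaranteed to make progress.
    int64_t offset = 0;
    while (offset < batch.num_rows()) {
      // Step 1: roll over to a new row group if either limit is reached.
      if (row_group_writer_->num_rows() >= max_row_group_length ||
          row_group_writer_->current_buffered_bytes() >= max_row_group_bytes) {
        RETURN_NOT_OK(NewBufferedRowGroup());
      }
      // Step 2: bound the batch by the row limit, then shrink it to fit the
      // byte budget using the average bytes per buffered row. For example,
      // 96 MiB buffered over 1Mi rows is ~96 B/row, so a 128 MiB cap leaves
      // room for roughly (128 MiB - 96 MiB) / 96 B ~= 349,525 more rows.
      const int64_t group_rows = row_group_writer_->num_rows();
      int64_t batch_size =
          std::min(max_row_group_length - group_rows, batch.num_rows() - offset);
      if (group_rows > 0) {
        const int64_t buffered_bytes = row_group_writer_->current_buffered_bytes();
        const double avg_row_bytes =
            static_cast<double>(buffered_bytes) / group_rows;
        batch_size = std::min(
            batch_size, static_cast<int64_t>(
                            (max_row_group_bytes - buffered_bytes) / avg_row_bytes));
      }
      batch_size = std::max<int64_t>(batch_size, 1);  // never an empty batch
      // Step 3: write the batch as before.
      RETURN_NOT_OK(WriteBatch(offset, batch_size));
      offset += batch_size;
    }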
6 changes: 3 additions & 3 deletions cpp/src/parquet/arrow/writer.h
@@ -111,9 +111,9 @@ class PARQUET_EXPORT FileWriter {
/// Multiple RecordBatches can be written into the same row group
/// through this method.
///
/// WriterProperties.max_row_group_length() is respected and a new
/// row group will be created if the current row group exceeds the
/// limit.
/// WriterProperties.max_row_group_length() and WriterProperties.max_row_group_bytes()
/// are respected and a new row group will be created if the current row group exceeds
/// the limits.
///
/// Batches get flushed to the output stream once NewBufferedRowGroup()
/// or Close() is called.
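
As a usage illustration of these semantics, a minimal sketch (not part of this diff; it assumes the Result-returning parquet::arrow::FileWriter::Open overload together with the max_row_group_bytes() builder option added in this PR):

    #include "arrow/io/memory.h"
    #include "parquet/arrow/writer.h"
    #include "parquet/properties.h"

    ::arrow::Status WriteWithByteLimit(
        const std::shared_ptr<::arrow::RecordBatch>& batch) {
      ARROW_ASSIGN_OR_RAISE(auto sink, ::arrow::io::BufferOutputStream::Create());
      // Flush a row group at ~1 MiB of buffered bytes or 64Ki rows,
      // whichever limit is hit first.
      auto properties = parquet::WriterProperties::Builder()
                            .max_row_group_length(64 * 1024)
                            ->max_row_group_bytes(1024 * 1024)
                            ->build();
      ARROW_ASSIGN_OR_RAISE(
          auto writer,
          parquet::arrow::FileWriter::Open(*batch->schema(),
                                           ::arrow::default_memory_pool(), sink,
                                           properties));
      // Row groups roll over automatically inside WriteRecordBatch();
      // buffered data is flushed on Close().
      ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
      return writer->Close();
    }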
20 changes: 20 additions & 0 deletions cpp/src/parquet/file_writer.cc
@@ -68,6 +68,12 @@ int64_t RowGroupWriter::total_compressed_bytes_written() const {
return contents_->total_compressed_bytes_written();
}

int64_t RowGroupWriter::current_buffered_bytes() const {
  return contents_->total_compressed_bytes() +
         contents_->total_compressed_bytes_written() +
         contents_->estimated_buffered_value_bytes();
}

Member: The function name is a little misleading because readers may think it is the same as contents_->estimated_buffered_value_bytes().

Contributor Author: Renamed to total_buffered_bytes().

bool RowGroupWriter::buffered() const { return contents_->buffered(); }

int RowGroupWriter::current_column() { return contents_->current_column(); }
@@ -195,6 +201,20 @@ class RowGroupSerializer : public RowGroupWriter::Contents {
return total_compressed_bytes_written;
}

int64_t estimated_buffered_value_bytes() const override {
if (closed_) {
return 0;
}
int64_t estimated_buffered_value_bytes = 0;
for (size_t i = 0; i < column_writers_.size(); i++) {
if (column_writers_[i]) {
estimated_buffered_value_bytes +=
column_writers_[i]->estimated_buffered_value_bytes();
}
}
return estimated_buffered_value_bytes;
}

bool buffered() const override { return buffered_row_group_; }

void Close() override {
5 changes: 5 additions & 0 deletions cpp/src/parquet/file_writer.h
@@ -58,6 +58,8 @@ class PARQUET_EXPORT RowGroupWriter {
virtual int64_t total_compressed_bytes() const = 0;
/// \brief total compressed bytes written by the page writer
virtual int64_t total_compressed_bytes_written() const = 0;
/// \brief estimated size of the values that are not written to a page yet
virtual int64_t estimated_buffered_value_bytes() const = 0;

virtual bool buffered() const = 0;
};
@@ -99,6 +101,9 @@ class PARQUET_EXPORT RowGroupWriter {
int64_t total_compressed_bytes() const;
/// \brief total compressed bytes written by the page writer
int64_t total_compressed_bytes_written() const;
/// \brief Bytes buffered for the current row group, including compressed
/// bytes in the page writer and the uncompressed data value buffer.
int64_t current_buffered_bytes() const;

/// Returns whether the current RowGroupWriter is in the buffered mode and is created
/// by calling ParquetFileWriter::AppendBufferedRowGroup.
33 changes: 25 additions & 8 deletions cpp/src/parquet/properties.h
@@ -160,6 +160,7 @@ static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = kDefaultDataPageSize;
static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024;
static constexpr int64_t DEFAULT_MAX_ROW_GROUP_BYTES = 128 * 1024 * 1024;
Member: Is there a particular reason for this value? AFAIK some Parquet implementation (is it Parquet Rust? @alamb) writes a single row group per file by default.

I also feel like the HDFS-related reasons in the Parquet docs are completely outdated (who cares about HDFS?).

Member: I think smaller row groups are still useful when pruning is essential. https://www.firebolt.io/blog/unlocking-faster-iceberg-queries-the-writer-optimizations-you-are-missing is a good read.

Member: Right, but the value is not easy to devise. For example, if you have 10_000 columns, this will make for some very short columns.

Member: In this case, we could set a very large value as the default to keep the current behavior. Some engines are smart enough to derive a good threshold (from history or another source).

Contributor (@alamb, Jan 16, 2026):

> Is there a particular reason for this value? AFAIK some Parquet implementation (is it Parquet Rust? @alamb) writes a single row group per file by default.

The default row group size in the Rust writer is 1M rows (1024 * 1024) -- NOT bytes:
https://docs.rs/parquet/latest/parquet/file/properties/struct.WriterPropertiesBuilder.html#method.set_max_row_group_size

I looked through and didn't find any setting for max row group size in bytes.

I believe at least at some point in the past the DuckDB Parquet writer wrote a single large row group -- I am not sure whether that is still the current behavior.

Member: I think Apache Impala also produces a single row group in its Parquet writer. However, limiting row group size in bytes is still useful in some cases. For example, there is a table property for it in Iceberg: https://github.com/apache/iceberg/blob/73a26fc1f49e6749656a273b2e4d78eb9e64f19e/docs/docs/configuration.md?plain=1#L46. As iceberg-cpp depends on the Parquet writer here, it is nice to support this feature.

static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN;
Expand Down Expand Up @@ -293,6 +294,7 @@ class PARQUET_EXPORT WriterProperties {
dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
max_row_group_bytes_(DEFAULT_MAX_ROW_GROUP_BYTES),
pagesize_(kDefaultDataPageSize),
max_rows_per_page_(kDefaultMaxRowsPerPage),
version_(ParquetVersion::PARQUET_2_6),
@@ -309,6 +311,7 @@
dictionary_pagesize_limit_(properties.dictionary_pagesize_limit()),
write_batch_size_(properties.write_batch_size()),
max_row_group_length_(properties.max_row_group_length()),
max_row_group_bytes_(properties.max_row_group_bytes()),
pagesize_(properties.data_pagesize()),
max_rows_per_page_(properties.max_rows_per_page()),
version_(properties.version()),
@@ -418,6 +421,13 @@
return this;
}

/// Specify the max number of bytes to put in a single row group.
/// Default 128MB.
Builder* max_row_group_bytes(int64_t max_row_group_bytes) {
max_row_group_bytes_ = max_row_group_bytes;
return this;
}

/// Specify the data page size.
/// Default 1MB.
Builder* data_pagesize(int64_t pg_size) {
@@ -779,11 +789,12 @@ class PARQUET_EXPORT WriterProperties {

return std::shared_ptr<WriterProperties>(new WriterProperties(
pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
pagesize_, max_rows_per_page_, version_, created_by_, page_checksum_enabled_,
size_statistics_level_, std::move(file_encryption_properties_),
default_column_properties_, column_properties, data_page_version_,
store_decimal_as_integer_, std::move(sorting_columns_),
content_defined_chunking_enabled_, content_defined_chunking_options_));
max_row_group_bytes_, pagesize_, max_rows_per_page_, version_, created_by_,
page_checksum_enabled_, size_statistics_level_,
std::move(file_encryption_properties_), default_column_properties_,
column_properties, data_page_version_, store_decimal_as_integer_,
std::move(sorting_columns_), content_defined_chunking_enabled_,
content_defined_chunking_options_));
}

private:
@@ -793,6 +804,7 @@
int64_t dictionary_pagesize_limit_;
int64_t write_batch_size_;
int64_t max_row_group_length_;
int64_t max_row_group_bytes_;
int64_t pagesize_;
int64_t max_rows_per_page_;
ParquetVersion::type version_;
@@ -828,6 +840,8 @@

inline int64_t max_row_group_length() const { return max_row_group_length_; }

inline int64_t max_row_group_bytes() const { return max_row_group_bytes_; }

inline int64_t data_pagesize() const { return pagesize_; }

inline int64_t max_rows_per_page() const { return max_rows_per_page_; }
@@ -946,9 +960,10 @@
private:
explicit WriterProperties(
MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size,
int64_t max_row_group_length, int64_t pagesize, int64_t max_rows_per_page,
ParquetVersion::type version, const std::string& created_by,
bool page_write_checksum_enabled, SizeStatisticsLevel size_statistics_level,
int64_t max_row_group_length, int64_t max_row_group_bytes, int64_t pagesize,
int64_t max_rows_per_page, ParquetVersion::type version,
const std::string& created_by, bool page_write_checksum_enabled,
SizeStatisticsLevel size_statistics_level,
std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
const ColumnProperties& default_column_properties,
const std::unordered_map<std::string, ColumnProperties>& column_properties,
@@ -959,6 +974,7 @@
dictionary_pagesize_limit_(dictionary_pagesize_limit),
write_batch_size_(write_batch_size),
max_row_group_length_(max_row_group_length),
max_row_group_bytes_(max_row_group_bytes),
pagesize_(pagesize),
max_rows_per_page_(max_rows_per_page),
parquet_data_page_version_(data_page_version),
@@ -978,6 +994,7 @@
int64_t dictionary_pagesize_limit_;
int64_t write_batch_size_;
int64_t max_row_group_length_;
int64_t max_row_group_bytes_;
int64_t pagesize_;
int64_t max_rows_per_page_;
ParquetDataPageVersion parquet_data_page_version_;
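
As the default-value discussion above suggests, callers who want to keep the previous size-unbounded behavior can effectively disable the byte threshold (a sketch; using INT64_MAX as an "unlimited" sentinel is an assumption, not a documented guarantee):

    #include <limits>

    // Keep the 1Mi-row default but effectively turn off the byte-based flush.
    auto props = parquet::WriterProperties::Builder()
                     .max_row_group_bytes(std::numeric_limits<int64_t>::max())
                     ->build();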