From 7af45f9eeaf0ac221ca2383cc3829882c7ac9abc Mon Sep 17 00:00:00 2001 From: Anas Khan <83116240+anxkhn@users.noreply.github.com> Date: Mon, 29 Jun 2026 07:00:49 +0530 Subject: [PATCH] feat: support pyarrow float16 by widening to float on read/write PyArrow's float16 (halffloat) raised UnsupportedPyArrowTypeException during schema conversion because _ConvertToIceberg.primitive only handled float32/float64. Iceberg has no half-precision float, but float16 -> float32 is lossless, mirroring how int8/int16 already widen to IntegerType. Map float16 to FloatType, and widen smaller float arrays to the target type in ArrowProjectionVisitor._cast_if_needed (parallel to the integer-widening branch) so float16 columns write as float32. --- pyiceberg/io/pyarrow.py | 12 ++++++++++++ tests/io/test_pyarrow.py | 29 +++++++++++++++++++++++++++++ tests/io/test_pyarrow_visitor.py | 6 ++++++ 3 files changed, 47 insertions(+) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 6259f311e9..45a6c00513 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -1438,6 +1438,9 @@ def primitive(self, primitive: pa.DataType) -> PrimitiveType: else: # Does not exist (yet) raise TypeError(f"Unsupported integer type: {primitive}") + elif pa.types.is_float16(primitive): + # Iceberg has no half-precision float; widen to single precision (lossless) + return FloatType() elif pa.types.is_float32(primitive): return FloatType() elif pa.types.is_float64(primitive): @@ -1978,6 +1981,15 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array: target_width = target_type.bit_width if source_width < target_width: return values.cast(target_type) + elif isinstance(field.field_type, (FloatType, DoubleType)): + # Cast smaller float types to target type for cross-platform compatibility + # Only allow widening conversions (smaller bit width to larger), e.g. float16 -> float32 + # Narrowing conversions fall through to promote() handling below + if pa.types.is_floating(values.type): + source_width = values.type.bit_width + target_width = target_type.bit_width + if source_width < target_width: + return values.cast(target_type) if field.field_type != file_field.field_type: target_schema = schema_to_pyarrow( diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index 532311899d..dde3547350 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -3109,6 +3109,35 @@ def test__to_requested_schema_integer_promotion( assert result.column(0).to_pylist() == [1, 2, 3, None] +@pytest.mark.parametrize( + "arrow_type,iceberg_type,expected_arrow_type", + [ + (pa.float16(), FloatType(), pa.float32()), + (pa.float16(), DoubleType(), pa.float64()), + (pa.float32(), DoubleType(), pa.float64()), + ], +) +def test__to_requested_schema_float_promotion( + arrow_type: pa.DataType, + iceberg_type: PrimitiveType, + expected_arrow_type: pa.DataType, +) -> None: + """Test that smaller float types are cast to target Iceberg type during write.""" + requested_schema = Schema(NestedField(1, "col", iceberg_type, required=False)) + file_schema = requested_schema + + arrow_schema = pa.schema([pa.field("col", arrow_type)]) + data = pa.array([1.5, 2.25, 3.0, None], type=arrow_type) + batch = pa.RecordBatch.from_arrays([data], schema=arrow_schema) + + result = _to_requested_schema( + requested_schema, file_schema, batch, downcast_ns_timestamp_to_us=False, include_field_ids=False + ) + + assert result.schema[0].type == expected_arrow_type + assert result.column(0).to_pylist() == [1.5, 2.25, 3.0, None] + + def test_pyarrow_file_io_fs_by_scheme_cache() -> None: # It's better to set up multi-region minio servers for an integration test once `endpoint_url` argument # becomes available for `resolve_s3_region` diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py index 6727b8c768..e98d76e262 100644 --- a/tests/io/test_pyarrow_visitor.py +++ b/tests/io/test_pyarrow_visitor.py @@ -128,6 +128,12 @@ def test_pyarrow_int64_to_iceberg() -> None: assert visit(converted_iceberg_type, _ConvertToArrowSchema()) == pyarrow_type +def test_pyarrow_float16_to_iceberg() -> None: + pyarrow_type = pa.float16() + converted_iceberg_type = visit_pyarrow(pyarrow_type, _ConvertToIceberg()) + assert converted_iceberg_type == FloatType() + + def test_pyarrow_float32_to_iceberg() -> None: pyarrow_type = pa.float32() converted_iceberg_type = visit_pyarrow(pyarrow_type, _ConvertToIceberg())