From 9ff3c3ee8b558a68d5d89d4370c77f6f458c4232 Mon Sep 17 00:00:00 2001
From: hentzthename
Date: Fri, 23 Jan 2026 22:08:03 +1300
Subject: [PATCH 1/2] Add test for field ordering bug in cast_table_safe

Add regression test that demonstrates the field ordering bug where
cast_table_safe fails when source table fields are in a different order
than the target schema, even when all field names and types match.
---
 tests/test_schema_casting.py | 47 ++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/tests/test_schema_casting.py b/tests/test_schema_casting.py
index 17b00a0..0255ec2 100644
--- a/tests/test_schema_casting.py
+++ b/tests/test_schema_casting.py
@@ -456,3 +456,50 @@ def test_real_world_iceberg_scenario():
     result = cast_table_safe(table, iceberg_schema, strict=True)
     assert len(result) == 3
     assert result.schema == iceberg_schema
+
+
+def test_cast_table_safe_different_field_order():
+    """
+    Test that cast_table_safe handles tables with different field ordering.
+
+    This is a regression test for the field ordering bug where cast_table_safe
+    fails when the source table has fields in a different order than the target
+    schema, even when all field names and types match exactly.
+
+    This commonly occurs when loading JSON data where field order isn't guaranteed
+    (e.g. different API responses or extraction runs).
+    """
+    # Source table with fields in order: a, b, c
+    source_schema = pa.schema([
+        pa.field("a", pa.int64()),
+        pa.field("b", pa.string()),
+        pa.field("c", pa.float64()),
+    ])
+
+    table = pa.table(
+        {
+            "a": [1, 2, 3],
+            "b": ["x", "y", "z"],
+            "c": [1.1, 2.2, 3.3],
+        },
+        schema=source_schema
+    )
+
+    # Target schema with same fields but different order: c, b, a
+    target_schema = pa.schema([
+        pa.field("c", pa.float64()),
+        pa.field("b", pa.string()),
+        pa.field("a", pa.int64()),
+    ])
+
+    # This should succeed - all field names and types match
+    # Currently fails with: "Target schema's field names are not matching the table's field names"
+    result = cast_table_safe(table, target_schema, strict=True)
+
+    assert len(result) == 3
+    assert result.schema == target_schema
+    # Verify columns are reordered correctly
+    assert result.column_names == ["c", "b", "a"]
+    assert result["a"].to_pylist() == [1, 2, 3]
+    assert result["b"].to_pylist() == ["x", "y", "z"]
+    assert result["c"].to_pylist() == [1.1, 2.2, 3.3]

From 3437f77491f7d9eda2eac3b2ea55a3c1c9839f33 Mon Sep 17 00:00:00 2001
From: hentzthename
Date: Fri, 23 Jan 2026 22:20:37 +1300
Subject: [PATCH 2/2] Fix field ordering bug in cast_table_safe

Reorder source table columns to match target schema order before casting.
PyArrow's cast() matches fields by position, not name, so tables with
different field ordering would fail even when all field names and types
matched.
---
 src/dlt_iceberg/schema_casting.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/dlt_iceberg/schema_casting.py b/src/dlt_iceberg/schema_casting.py
index fcb5d4b..c004fe8 100644
--- a/src/dlt_iceberg/schema_casting.py
+++ b/src/dlt_iceberg/schema_casting.py
@@ -436,6 +436,11 @@ def cast_table_safe(
             f"to schema with {len(target_schema)} fields"
         )
 
+    # Reorder columns to match target schema before casting
+    # PyArrow's cast() requires fields to be in the same order
+    target_field_names = [field.name for field in target_schema]
+    table = table.select(target_field_names)
+
     try:
         casted_table = table.cast(target_schema)
         logger.info("Cast completed successfully")