22 changes: 22 additions & 0 deletions docs/source/user-guide/io/csv.rst
@@ -36,3 +36,25 @@ An alternative is to use :py:func:`~datafusion.context.SessionContext.register_csv`

ctx.register_csv("file", "file.csv")
df = ctx.table("file")

If you require additional control over how the CSV file is read, you can use
:py:class:`~datafusion.options.CsvReadOptions` to set a variety of options.

.. code-block:: python

Comment on lines +40 to +44

Contributor:
I think it would be nice to have a link to docs.rs or similar that points to all available options for CsvReadOptions.

Member (Author):
Added

from datafusion import CsvReadOptions
options = (
CsvReadOptions()
.with_has_header(True) # File contains a header row
.with_delimiter(";") # Use ; as the delimiter instead of ,
.with_comment("#") # Skip lines starting with #
.with_escape("\\") # Escape character
.with_null_regex(r"^(null|NULL|N/A)$") # Treat these as NULL
.with_truncated_rows(True) # Allow rows with fewer columns than expected
.with_file_compression_type("gzip") # Read gzipped CSV
.with_file_extension(".gz") # File extension other than .csv
)
df = ctx.read_csv("data.csv.gz", options=options)

Details for all CSV reading options can be found on the
`DataFusion documentation site <https://datafusion.apache.org/library-user-guide/custom-table-providers.html>`_.
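The same options object can also be supplied when registering a CSV table; a
minimal sketch based on the ``register_csv`` signature added in this pull
request (it assumes a local ``file.csv``):

.. code-block:: python

    from datafusion import CsvReadOptions

    options = CsvReadOptions().with_has_header(True).with_delimiter(";")
    ctx.register_csv("file", "file.csv", options=options)
    df = ctx.table("file")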
96 changes: 96 additions & 0 deletions examples/csv-read-options.py
@@ -0,0 +1,96 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""Example demonstrating CsvReadOptions usage."""

from datafusion import CsvReadOptions, SessionContext

# Create a SessionContext
ctx = SessionContext()

# Example 1: Using CsvReadOptions with default values
print("Example 1: Default CsvReadOptions")
options = CsvReadOptions()
df = ctx.read_csv("data.csv", options=options)

# Example 2: Using CsvReadOptions with custom parameters
print("\nExample 2: Custom CsvReadOptions")
options = CsvReadOptions(
has_header=True,
delimiter=",",
quote='"',
schema_infer_max_records=1000,
file_extension=".csv",
)
df = ctx.read_csv("data.csv", options=options)

# Example 3: Using the builder pattern (recommended for readability)
print("\nExample 3: Builder pattern")
options = (
CsvReadOptions()
.with_has_header(True) # noqa: FBT003
.with_delimiter("|")
.with_quote("'")
.with_schema_infer_max_records(500)
.with_truncated_rows(False) # noqa: FBT003
.with_newlines_in_values(True) # noqa: FBT003
)
df = ctx.read_csv("data.csv", options=options)

# Example 4: Advanced options
print("\nExample 4: Advanced options")
options = (
CsvReadOptions()
.with_has_header(True) # noqa: FBT003
.with_delimiter(",")
.with_comment("#") # Skip lines starting with #
.with_escape("\\") # Escape character
.with_null_regex(r"^(null|NULL|N/A)$") # Treat these as NULL
.with_truncated_rows(True) # noqa: FBT003
.with_file_compression_type("gzip") # Read gzipped CSV
.with_file_extension(".gz")
)
df = ctx.read_csv("data.csv.gz", options=options)

# Example 5: Register CSV table with options
print("\nExample 5: Register CSV table")
options = CsvReadOptions().with_has_header(True).with_delimiter(",") # noqa: FBT003
ctx.register_csv("my_table", "data.csv", options=options)
df = ctx.sql("SELECT * FROM my_table")

# Example 6: Backward compatibility (without options)
print("\nExample 6: Backward compatibility")
# Still works the old way!
df = ctx.read_csv("data.csv", has_header=True, delimiter=",")

print("\nAll examples completed!")
print("\nFor all available options, see the CsvReadOptions documentation:")
print(" - has_header: bool")
print(" - delimiter: str")
print(" - quote: str")
print(" - terminator: str | None")
print(" - escape: str | None")
print(" - comment: str | None")
print(" - newlines_in_values: bool")
print(" - schema: pa.Schema | None")
print(" - schema_infer_max_records: int")
print(" - file_extension: str")
print(" - table_partition_cols: list[tuple[str, pa.DataType]]")
print(" - file_compression_type: str")
print(" - file_sort_order: list[list[SortExpr]]")
print(" - null_regex: str | None")
print(" - truncated_rows: bool")
3 changes: 3 additions & 0 deletions python/datafusion/__init__.py
@@ -54,6 +54,7 @@
from .dataframe_formatter import configure_formatter
from .expr import Expr, WindowFrame
from .io import read_avro, read_csv, read_json, read_parquet
from .options import CsvReadOptions
from .plan import ExecutionPlan, LogicalPlan
from .record_batch import RecordBatch, RecordBatchStream
from .user_defined import (
@@ -75,6 +76,7 @@
"AggregateUDF",
"Catalog",
"Config",
"CsvReadOptions",
"DFSchema",
"DataFrame",
"DataFrameWriteOptions",
@@ -106,6 +108,7 @@
"lit",
"literal",
"object_store",
"options",
"read_avro",
"read_csv",
"read_json",
118 changes: 89 additions & 29 deletions python/datafusion/context.py
@@ -34,6 +34,11 @@
from datafusion.catalog import Catalog
from datafusion.dataframe import DataFrame
from datafusion.expr import sort_list_to_raw_sort_list
from datafusion.options import (
DEFAULT_MAX_INFER_SCHEMA,
CsvReadOptions,
_convert_table_partition_cols,
)
from datafusion.record_batch import RecordBatchStream

from ._internal import RuntimeEnvBuilder as RuntimeEnvBuilderInternal
@@ -584,7 +589,7 @@ def register_listing_table(
"""
if table_partition_cols is None:
table_partition_cols = []
table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
table_partition_cols = _convert_table_partition_cols(table_partition_cols)
self.ctx.register_listing_table(
name,
str(path),
@@ -905,7 +910,7 @@ def register_parquet(
"""
if table_partition_cols is None:
table_partition_cols = []
table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
table_partition_cols = _convert_table_partition_cols(table_partition_cols)
self.ctx.register_parquet(
name,
str(path),
@@ -924,9 +929,10 @@ def register_csv(
schema: pa.Schema | None = None,
has_header: bool = True,
delimiter: str = ",",
schema_infer_max_records: int = 1000,
schema_infer_max_records: int = DEFAULT_MAX_INFER_SCHEMA,
file_extension: str = ".csv",
file_compression_type: str | None = None,
options: CsvReadOptions | None = None,
) -> None:
"""Register a CSV file as a table.

@@ -946,18 +952,46 @@
file_extension: File extension; only files with this extension are
selected for data input.
file_compression_type: File compression type.
options: Set advanced options for CSV reading. This cannot be
combined with any of the other options in this method.
"""
path = [str(p) for p in path] if isinstance(path, list) else str(path)
path_arg = [str(p) for p in path] if isinstance(path, list) else str(path)

if options is not None and (
schema is not None
or not has_header
or delimiter != ","
or schema_infer_max_records != DEFAULT_MAX_INFER_SCHEMA
or file_extension != ".csv"
or file_compression_type is not None
):
message = (
"Combining CsvReadOptions parameter with additional options "
"is not supported. Use CsvReadOptions to set parameters."
)
warnings.warn(
message,
category=UserWarning,
stacklevel=2,
)

options = (
options
if options is not None
else CsvReadOptions(
schema=schema,
has_header=has_header,
delimiter=delimiter,
schema_infer_max_records=schema_infer_max_records,
file_extension=file_extension,
file_compression_type=file_compression_type,
)
)

self.ctx.register_csv(
name,
path,
schema,
has_header,
delimiter,
schema_infer_max_records,
file_extension,
file_compression_type,
path_arg,
options.to_inner(),
)

def register_json(
@@ -988,7 +1022,7 @@ def register_json(
"""
if table_partition_cols is None:
table_partition_cols = []
table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
table_partition_cols = _convert_table_partition_cols(table_partition_cols)
self.ctx.register_json(
name,
str(path),
@@ -1021,7 +1055,7 @@ def register_avro(
"""
if table_partition_cols is None:
table_partition_cols = []
table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
table_partition_cols = _convert_table_partition_cols(table_partition_cols)
self.ctx.register_avro(
name, str(path), schema, file_extension, table_partition_cols
)
@@ -1101,7 +1135,7 @@ def read_json(
"""
if table_partition_cols is None:
table_partition_cols = []
table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
table_partition_cols = _convert_table_partition_cols(table_partition_cols)
return DataFrame(
self.ctx.read_json(
str(path),
@@ -1119,10 +1153,11 @@ def read_csv(
schema: pa.Schema | None = None,
has_header: bool = True,
delimiter: str = ",",
schema_infer_max_records: int = 1000,
schema_infer_max_records: int = DEFAULT_MAX_INFER_SCHEMA,
file_extension: str = ".csv",
table_partition_cols: list[tuple[str, str | pa.DataType]] | None = None,
file_compression_type: str | None = None,
options: CsvReadOptions | None = None,
) -> DataFrame:
"""Read a CSV data source.

@@ -1140,26 +1175,51 @@
selected for data input.
table_partition_cols: Partition columns.
file_compression_type: File compression type.
options: Set advanced options for CSV reading. This cannot be
combined with any of the other options in this method.

Returns:
DataFrame representation of the read CSV files
"""
if table_partition_cols is None:
table_partition_cols = []
table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
path_arg = [str(p) for p in path] if isinstance(path, list) else str(path)

if options is not None and (
schema is not None
or not has_header
or delimiter != ","
or schema_infer_max_records != DEFAULT_MAX_INFER_SCHEMA
or file_extension != ".csv"
or table_partition_cols is not None
or file_compression_type is not None
):
message = (
"Combining CsvReadOptions parameter with additional options "
"is not supported. Use CsvReadOptions to set parameters."
)
warnings.warn(
message,
category=UserWarning,
stacklevel=2,
)

path = [str(p) for p in path] if isinstance(path, list) else str(path)
options = (
options
if options is not None
else CsvReadOptions(
schema=schema,
has_header=has_header,
delimiter=delimiter,
schema_infer_max_records=schema_infer_max_records,
file_extension=file_extension,
table_partition_cols=table_partition_cols,
file_compression_type=file_compression_type,
)
)

return DataFrame(
self.ctx.read_csv(
path,
schema,
has_header,
delimiter,
schema_infer_max_records,
file_extension,
table_partition_cols,
file_compression_type,
path_arg,
options.to_inner(),
)
)

@@ -1197,7 +1257,7 @@ def read_parquet(
"""
if table_partition_cols is None:
table_partition_cols = []
table_partition_cols = self._convert_table_partition_cols(table_partition_cols)
table_partition_cols = _convert_table_partition_cols(table_partition_cols)
file_sort_order = self._convert_file_sort_order(file_sort_order)
return DataFrame(
self.ctx.read_parquet(
@@ -1231,7 +1291,7 @@ def read_avro(
"""
if file_partition_cols is None:
file_partition_cols = []
file_partition_cols = self._convert_table_partition_cols(file_partition_cols)
file_partition_cols = _convert_table_partition_cols(file_partition_cols)
return DataFrame(
self.ctx.read_avro(str(path), schema, file_partition_cols, file_extension)
)
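A note on the combined-options check added to register_csv and read_csv: when
options is passed alongside any of the legacy keyword arguments, the new code
emits a UserWarning and then uses the CsvReadOptions values, so the legacy
argument is effectively ignored. A minimal sketch of that behavior, assuming a
local data.csv that is readable with a ";" delimiter:

    import warnings

    from datafusion import CsvReadOptions, SessionContext

    ctx = SessionContext()
    options = CsvReadOptions().with_delimiter(";")

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # delimiter="|" conflicts with the options object; per this PR a
        # UserWarning is emitted and the CsvReadOptions delimiter (";") wins.
        df = ctx.read_csv("data.csv", delimiter="|", options=options)

    assert any(issubclass(w.category, UserWarning) for w in caught)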