From 1cff2e4053b66e6b5609971cdda388d6839b5348 Mon Sep 17 00:00:00 2001 From: Ivan Ogasawara Date: Mon, 27 Apr 2026 18:11:07 +0000 Subject: [PATCH 1/4] add support for dataframes --- docs/roadmap.md | 29 ++++++++++++++++++++++++++++- mkdocs.yaml | 2 ++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/docs/roadmap.md b/docs/roadmap.md index e70ad54..5d3b37e 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -43,4 +43,31 @@ type for each variable and function returning. ## Implement native tensors -TBA +Native tensors now have an initial Arrow C++ backed implementation. Remaining +work should continue to make runtime-shaped tensor values usable in more +contexts, while preserving the same runtime-layout rules for every collection +type that uses that approach. + +- [ ] Expand runtime-layout annotations beyond function and extern parameters + once default values, ownership, and type checking are ready for local + declarations and expression contexts. +- [ ] Keep tensor semantics aligned with the Arrow-backed runtime rather than + adding Arx-local lowering behavior. + +## Implement Arrow-backed DataFrames + +DataFrames should be a distinct public collection abstraction for heterogeneous +named columns. The accepted design direction is documented in +[Arrow-backed DataFrames](proposals/arrow-backed-dataframes.md). + +- [ ] Add the builtin `dataframe[...]` type. +- [ ] Add the builtin `series[T]` type for typed DataFrame columns. +- [ ] Add the builtin `dataframe({...})` constructor for column-oriented + literals. +- [ ] Back DataFrame values with Arrow C++ `arrow::Table`. +- [ ] Back Series values with Arrow C++ `arrow::ChunkedArray`. +- [ ] Keep the MVP limited to fixed-width numeric and `bool` columns. +- [ ] Add string, nullable, nested, temporal, and user-defined column support + after the fixed-width MVP is stable. 
+- [ ] Keep runtime-schema `dataframe[...]` annotations limited to function and + extern parameters at first, then expand this consistently with tensors. diff --git a/mkdocs.yaml b/mkdocs.yaml index 43bdc3c..35acf9e 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -45,6 +45,8 @@ nav: - API Docs: api/ - Syntax: syntax.md - Roadmap: roadmap.md + - Proposals: + - Arrow-backed DataFrames: proposals/arrow-backed-dataframes.md - Contributing Guide: contributing.md - Sponsor: sponsor.md - Partners: partners.md From b644f90282555bbc91f88f5e1f4016d836d2a575 Mon Sep 17 00:00:00 2001 From: Ivan Ogasawara Date: Mon, 27 Apr 2026 18:11:24 +0000 Subject: [PATCH 2/4] feat: add support for dataframes --- docs/proposals/arrow-backed-dataframes.md | 310 ++++++++++++++++++++++ 1 file changed, 310 insertions(+) create mode 100644 docs/proposals/arrow-backed-dataframes.md diff --git a/docs/proposals/arrow-backed-dataframes.md b/docs/proposals/arrow-backed-dataframes.md new file mode 100644 index 0000000..2293a85 --- /dev/null +++ b/docs/proposals/arrow-backed-dataframes.md @@ -0,0 +1,310 @@ +# Arrow-backed DataFrames + +This proposal defines the first Arx DataFrame surface and its intended IRx +runtime model. It is a design document only; implementation should land in small +follow-up changes across ASTx, IRx, Arx, tests, and docs. + +## Goals + +- Add a public `dataframe[...]` type for heterogeneous named columns. +- Add a public `series[T]` type for typed DataFrame columns. +- Back DataFrames with Apache Arrow C++ table storage. +- Keep the first implementation read-only and intentionally small. +- Preserve the existing roles of `tensor`, Arrow arrays, and `buffer/view`. + +## Non-goals + +- General-purpose dictionary or record literals. +- DataFrame mutation, append, join, group-by, sort, or lazy query planning. +- Row objects and row-wise indexing. +- String, nullable, nested, temporal, or user-defined column types in the MVP. 
+- New Arx-owned AST node types or Arx-local lowering behavior. + +## Surface design + +### Builtin type + +`dataframe` is a builtin type with a static schema: + +```arx +dataframe[id: i32, score: f64, active: bool] +``` + +Rules: + +- column names are identifiers; +- column names must be unique; +- column order is preserved; +- row count is runtime metadata, not part of the type; +- schema is part of the type; +- MVP column types are fixed-width numeric types and `bool`. + +The first column type set should match the fixed-width Arrow primitive support +already shared by arrays and tensors where possible: + +- signed integers: `i8`, `i16`, `i32`, `i64` +- floating point: `f32`, `f64` +- boolean: `bool` + +Unsigned integer aliases may be added if/when the Arx surface exposes them +consistently. + +Boolean columns should be supported as Arrow boolean arrays, but they should not +be assumed to be `buffer/view` compatible because Arrow stores boolean values in +bit-packed buffers. + +### Public series type + +`series[T]` is also public: + +```arx +series[i32] +series[f64] +series[bool] +``` + +A series represents one typed DataFrame column. The runtime representation +should be Arrow `ChunkedArray`, even if the first implementation only creates +single-chunk columns. + +### Runtime-schema form + +The runtime-schema form is: + +```arx +dataframe[...] +``` + +For the first phase, runtime-schema DataFrames are only valid in function and +extern parameter annotations. This matches the current runtime-layout rule for +tensors and keeps declaration defaults and local type checking deterministic. + +The roadmap should later expand this rule consistently for every type that uses +the same runtime-layout/schema approach, including both tensors and DataFrames. + +### Builtin constructor + +`dataframe` is also a builtin function. 
The preferred constructor syntax is: + +```arx +var rows: dataframe[id: i32, score: f64] = dataframe({ + id: [1, 2, 3], + score: [0.5, 0.8, 1.0], +}) +``` + +The `{ ... }` argument is a constructor-only column map in the MVP. It should +not imply that Arx has gained general dictionary or record literal semantics. + +Constructor rules: + +- the target DataFrame type must be explicit in the MVP; +- keys must be column identifiers; +- values must be column literals; +- every declared column must be present exactly once; +- no undeclared column may be present; +- all columns must have the same length; +- each column value must match its declared type. + +Future type inference may allow the constructor to infer +`dataframe[id: i32, score: f64]` from the column map, but that is not required +for the first phase. + +### Column access + +Both static field access and string-key access should be supported: + +```arx +var scores: series[f64] = rows.score +var ids: series[i32] = rows["id"] +``` + +Rules: + +- `rows.score` is statically validated when `rows` has a known schema; +- `rows["id"]` is statically validated when the key is a string literal and the + schema is known; +- dynamic string lookup on `dataframe[...]` runtime-schema values is deferred. + +### Basic methods + +The MVP should include only simple metadata queries: + +```arx +rows.nrows() +rows.ncols() +``` + +Both should lower to Arrow table metadata queries and return integer values. + +## Arrow backing + +### DataFrame storage + +DataFrames should wrap Arrow C++ `arrow::Table`. + +`arrow::Table` is the best long-term match because it provides: + +- named columns; +- a schema; +- heterogeneous column types; +- equal row count across columns; +- immutable, shareable columnar storage; +- cheap projection and slicing semantics; +- a natural base for Arrow compute integration later. + +DataFrame values should lower to opaque table handles, not to `buffer/view`. 
A +table is heterogeneous, may contain chunked columns, and cannot be represented +as one flat layout descriptor. + +### Series storage + +Series should wrap Arrow C++ `arrow::ChunkedArray`. + +This matches table columns directly. The first implementation may construct each +series as a single chunk from one Arrow array, but the public model should not +depend on single-chunk storage. + +For later scalar indexing, a fixed-width, non-null, single-chunk series can +borrow a `buffer/view`. General chunked indexing should remain series-specific. + +### Alternatives considered + +`arrow::RecordBatch` : Good for import/export and literal construction, but too +narrow as the core DataFrame value because it is single-batch/single-chunk. + +`arrow::StructArray` : Useful for row-like interop, but it does not model a +DataFrame as directly as `arrow::Table`. + +Arrow Dataset or scanner APIs : Useful later for lazy IO and query planning, but +too high-level for the core in-memory value. + +Arrow compute / Acero : Useful later for filtering, projection, joins, group-by, +and sorting. These should build on top of the table abstraction instead of +replacing it. + +## ASTx and IRx work + +Core nodes and semantic support should be added in ASTx/IRx first. Arx should +only parse the surface syntax and emit the IRx/ASTx facade nodes. + +Suggested ASTx additions: + +- `DataFrameType` +- `SeriesType` +- `DataFrameColumn` +- `DataFrameLiteral` +- `DataFrameColumnAccess` +- `DataFrameStringColumnAccess` +- `DataFrameRowCount` +- `DataFrameColumnCount` +- `DataFrameRetain` +- `DataFrameRelease` +- `SeriesRetain` +- `SeriesRelease` + +Suggested semantic metadata: + +- `DATAFRAME_SCHEMA_EXTRA` +- `DATAFRAME_COLUMN_INDEX_EXTRA` +- `SERIES_ELEMENT_TYPE_EXTRA` +- `SERIES_NULLABLE_EXTRA` + +Each schema entry should include: + +- column name; +- column type; +- nullable flag, initially always false; +- stable column index. 
+ +## Runtime ABI sketch + +The Arrow runtime should add opaque table and series handles beside the existing +schema, array, and tensor handles: + +```c +typedef struct irx_arrow_table_handle irx_arrow_table_handle; +typedef struct irx_arrow_chunked_array_handle + irx_arrow_chunked_array_handle; +``` + +Initial C ABI: + +```c +int irx_arrow_table_new_from_arrays( + int64_t column_count, + const char** names, + irx_arrow_array_handle** arrays, + irx_arrow_table_handle** out_table); + +int64_t irx_arrow_table_num_rows( + const irx_arrow_table_handle* table); +int64_t irx_arrow_table_num_columns( + const irx_arrow_table_handle* table); + +int irx_arrow_table_column_by_name( + const irx_arrow_table_handle* table, + const char* name, + irx_arrow_chunked_array_handle** out_column); +int irx_arrow_table_column_by_index( + const irx_arrow_table_handle* table, + int32_t index, + irx_arrow_chunked_array_handle** out_column); + +int irx_arrow_table_retain(irx_arrow_table_handle* table); +void irx_arrow_table_release(irx_arrow_table_handle* table); + +int irx_arrow_chunked_array_retain( + irx_arrow_chunked_array_handle* column); +void irx_arrow_chunked_array_release( + irx_arrow_chunked_array_handle* column); +``` + +The runtime should preserve existing Arrow runtime conventions: + +- integer return codes; +- `irx_arrow_last_error()`; +- explicit retain/release; +- C ABI boundary over the Arrow C++ implementation; +- reuse of the shared Arrow C++ runtime source where practical. + +## Lowering model + +MVP lowering should support: + +1. build each constructor column as an Arrow array; +2. build an Arrow table from column names and array handles; +3. lower DataFrame values to opaque table handles; +4. lower `nrows()` and `ncols()` to table metadata calls; +5. lower `df.column` and `df["column"]` to chunked-array extraction; +6. retain/release table and series handles. + +Series indexing, filtering, projections, and Arrow compute integration should be +follow-up work. 
+ +## Parser and documentation impact + +Arx parser updates should include: + +- `dataframe[...]` type parsing; +- `series[T]` type parsing; +- runtime-schema marker parsing for `dataframe[...]`; +- constructor-only column map parsing inside `dataframe({ ... })`; +- static schema validation for duplicate, missing, and extra columns; +- field and string-key column access; +- `syntax.json` updates for the new structural form if lexer/tooling metadata + needs to describe it. + +Docs and examples should include valid Douki module docstrings in every new `.x` +file. + +## Suggested rollout + +1. Add ASTx and IRx type/schema nodes. +2. Add IRx semantic validation for DataFrame and Series nodes. +3. Extend the Arrow C++ runtime with table and chunked-array handles. +4. Add lowering for constructor, row/column counts, and column extraction. +5. Add Arx parser support for the surface syntax. +6. Add docs, examples, syntax manifest updates, and parser/runtime tests. + +Each phase should include targeted tests before expanding the surface area. 
From 5e8f5b2bda9e297baacd7dec1ff7366f0c621200 Mon Sep 17 00:00:00 2001 From: Ivan Ogasawara Date: Mon, 27 Apr 2026 19:16:59 +0000 Subject: [PATCH 3/4] feat: Add initial support for dataframes/series --- docs/index.md | 13 +- docs/irx/runtime-features.md | 13 + docs/library/built-in-types.md | 84 ++- docs/library/datatypes.md | 15 +- docs/proposals/arrow-backed-dataframes.md | 310 ----------- docs/roadmap.md | 25 +- examples/dataframe.x | 24 + mkdocs.yaml | 2 - packages/arx/src/arx/builtins.py | 4 +- packages/arx/src/arx/dataframe.py | 248 +++++++++ packages/arx/src/arx/lexer/syntax.json | 11 + packages/arx/src/arx/parser/base.py | 63 ++- packages/arx/src/arx/parser/control_flow.py | 57 +- packages/arx/src/arx/parser/core.py | 56 ++ packages/arx/src/arx/parser/declarations.py | 77 ++- packages/arx/src/arx/parser/expressions.py | 217 ++++++++ packages/arx/src/arx/parser/types.py | 75 ++- packages/arx/tests/python/test_dataframe.py | 142 +++++ packages/astx/src/astx/__init__.py | 28 + packages/astx/src/astx/dataframe.py | 517 ++++++++++++++++++ .../handlers/_expressions/__init__.py | 4 + .../handlers/_expressions/dataframes.py | 325 +++++++++++ packages/irx/src/irx/analysis/types.py | 55 ++ packages/irx/src/irx/builder/backend.py | 2 + packages/irx/src/irx/builder/core.py | 10 + .../irx/src/irx/builder/lowering/__init__.py | 2 + .../irx/src/irx/builder/lowering/dataframe.py | 472 ++++++++++++++++ .../irx/src/irx/builder/lowering/literals.py | 2 + .../irx/src/irx/builder/lowering/variables.py | 9 + .../runtime/arrow/native/irx_arrow_runtime.cc | 203 +++++++ .../runtime/arrow/native/irx_arrow_runtime.h | 22 + .../irx/builder/runtime/dataframe/__init__.py | 9 + .../irx/builder/runtime/dataframe/feature.py | 339 ++++++++++++ .../irx/src/irx/builder/runtime/registry.py | 4 + packages/irx/src/irx/builder/types.py | 4 + .../src/irx/builtins/collections/dataframe.py | 175 ++++++ packages/irx/tests/test_dataframe.py | 228 ++++++++ 37 files changed, 3473 insertions(+), 373 
deletions(-) delete mode 100644 docs/proposals/arrow-backed-dataframes.md create mode 100644 examples/dataframe.x create mode 100644 packages/arx/src/arx/dataframe.py create mode 100644 packages/arx/tests/python/test_dataframe.py create mode 100644 packages/astx/src/astx/dataframe.py create mode 100644 packages/irx/src/irx/analysis/handlers/_expressions/dataframes.py create mode 100644 packages/irx/src/irx/builder/lowering/dataframe.py create mode 100644 packages/irx/src/irx/builder/runtime/dataframe/__init__.py create mode 100644 packages/irx/src/irx/builder/runtime/dataframe/feature.py create mode 100644 packages/irx/src/irx/builtins/collections/dataframe.py create mode 100644 packages/irx/tests/test_dataframe.py diff --git a/docs/index.md b/docs/index.md index fd8f6a0..40e124c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,9 +1,9 @@ # ArxLang -Arx is a multi-purpose compiler that aims to provide native list and tensor -abstractions backed internally by IRx runtime support. It uses the power of -[LLVM](https://llvm.org/) to provide multi-architecture machine target code -generation. +Arx is a multi-purpose compiler that aims to provide native list, tensor, and +dataframe abstractions backed internally by IRx runtime support. It uses the +power of [LLVM](https://llvm.org/) to provide multi-architecture machine target +code generation. The language syntax is influenced by Python, C++, and YAML, featuring significant whitespace, static typing (planned), and a focus on data-oriented @@ -43,8 +43,9 @@ from Arx. 
- **LLVM-powered** -- compiles to native machine code via LLVM - **Python-like syntax** -- indentation-based blocks, familiar keywords -- **Lists and tensors** -- generic collections plus Arrow-backed numeric tensors - with first-class indexing and compiler-known shapes +- **Lists, tensors, and dataframes** -- generic collections, Arrow-backed + numeric tensors with compiler-known shapes, and Arrow-backed named-column + DataFrames - **Multiple output modes** -- inspect tokens, AST, LLVM IR, or compile to object files diff --git a/docs/irx/runtime-features.md b/docs/irx/runtime-features.md index 189de40..79d1167 100644 --- a/docs/irx/runtime-features.md +++ b/docs/irx/runtime-features.md @@ -46,6 +46,8 @@ when needed. - `array` Declares the builtin one-dimensional Arrow array runtime surface. - `tensor` Declares the builtin homogeneous N-dimensional Arrow tensor runtime surface. +- `dataframe` Declares the builtin heterogeneous named-column Arrow table + runtime surface. - `list` Declares the minimal dynamic-list runtime used by `ListCreate`, `ListAppend`, and lowered list indexing. 
@@ -175,6 +177,17 @@ Current initial Tensor layer alongside that substrate: storage - current tensor lowering supports fixed-width numeric element types only +Current initial DataFrame layer alongside that substrate: + +- dataframe values are created through `irx_arrow_table_*` runtime symbols +- dataframe construction stores named columns in Arrow C++ `arrow::Table` + handles +- series values are column views backed by Arrow C++ `arrow::ChunkedArray` + handles +- static column access resolves to a known column index during semantic analysis +- current dataframe lowering supports fixed-width numeric and `bool` columns + only + What IRx does not do here: - no direct LLVM struct encoding of Arrow containers diff --git a/docs/library/built-in-types.md b/docs/library/built-in-types.md index 1db41f3..aa44f51 100644 --- a/docs/library/built-in-types.md +++ b/docs/library/built-in-types.md @@ -6,27 +6,30 @@ their canonical spellings, accepted aliases, and current surface syntax. ## Overview -| Canonical type | Accepted aliases | Category | Example | Notes | -| ---------------------------- | ---------------- | ---------- | ------------------------------------------------ | ---------------------------------------- | -| `i8` | `int8` | integer | `var a: i8 = 8` | 8-bit integer | -| `i16` | `int16` | integer | `var b: i16 = 16` | 16-bit integer | -| `i32` | `int32` | integer | `var c: i32 = 32` | 32-bit integer | -| `i64` | `int64` | integer | `var d: i64 = 64` | 64-bit integer | -| `f16` | `float16` | float | `var x: f16 = 1.5` | 16-bit float | -| `f32` | `float32` | float | `var y: f32 = 3.25` | 32-bit float | -| `f64` | `float64` | float | `var z: f64 = 9.5` | 64-bit float | -| `bool` | `boolean` | boolean | `var ok: bool = true` | Uses `true` and `false` literals | -| `none` | — | unit | `fn log() -> none:` | Also the single value of the `none` type | -| `str` | `string` | text | `var s: str = "hi"` | UTF-8 string | -| `char` | — | text | `var ch: char = 'A'` | Currently 
mapped to `i8` | -| `datetime` | — | temporal | `datetime("2026-03-05T12:30:59")` | Constructor-style literal form | -| `timestamp` | — | temporal | `timestamp("2026-03-05T12:30:59Z")` | Constructor-style literal form | -| `date` | — | temporal | `var d: date` | Recognized as a built-in type name | -| `time` | — | temporal | `var t: time` | Recognized as a built-in type name | -| `list[T]` | — | collection | `var ids: list[i32] = [1, 2, 3]` | Generic collection type | -| `tensor[T, N]` | — | collection | `var ids: tensor[i32, 4] = [1, 2, 3, 4]` | Fixed-shape 1D numeric tensor | -| `tensor[T, d0, d1, ..., dN]` | — | collection | `var grid: tensor[i32, 2, 2] = [[1, 2], [3, 4]]` | Fixed-shape multidimensional tensor | -| `tensor[T, ...]` | — | collection | `fn sink(values: tensor[i32, ...]) -> none:` | Runtime-shaped tensor parameter | +| Canonical type | Accepted aliases | Category | Example | Notes | +| ---------------------------- | ---------------- | ---------- | ----------------------------------------------------- | ---------------------------------------- | +| `i8` | `int8` | integer | `var a: i8 = 8` | 8-bit integer | +| `i16` | `int16` | integer | `var b: i16 = 16` | 16-bit integer | +| `i32` | `int32` | integer | `var c: i32 = 32` | 32-bit integer | +| `i64` | `int64` | integer | `var d: i64 = 64` | 64-bit integer | +| `f16` | `float16` | float | `var x: f16 = 1.5` | 16-bit float | +| `f32` | `float32` | float | `var y: f32 = 3.25` | 32-bit float | +| `f64` | `float64` | float | `var z: f64 = 9.5` | 64-bit float | +| `bool` | `boolean` | boolean | `var ok: bool = true` | Uses `true` and `false` literals | +| `none` | — | unit | `fn log() -> none:` | Also the single value of the `none` type | +| `str` | `string` | text | `var s: str = "hi"` | UTF-8 string | +| `char` | — | text | `var ch: char = 'A'` | Currently mapped to `i8` | +| `datetime` | — | temporal | `datetime("2026-03-05T12:30:59")` | Constructor-style literal form | +| `timestamp` | — | temporal | 
`timestamp("2026-03-05T12:30:59Z")` | Constructor-style literal form | +| `date` | — | temporal | `var d: date` | Recognized as a built-in type name | +| `time` | — | temporal | `var t: time` | Recognized as a built-in type name | +| `list[T]` | — | collection | `var ids: list[i32] = [1, 2, 3]` | Generic collection type | +| `tensor[T, N]` | — | collection | `var ids: tensor[i32, 4] = [1, 2, 3, 4]` | Fixed-shape 1D numeric tensor | +| `tensor[T, d0, d1, ..., dN]` | — | collection | `var grid: tensor[i32, 2, 2] = [[1, 2], [3, 4]]` | Fixed-shape multidimensional tensor | +| `tensor[T, ...]` | — | collection | `fn sink(values: tensor[i32, ...]) -> none:` | Runtime-shaped tensor parameter | +| `dataframe[name: T, ...]` | — | collection | `var rows: dataframe[id: i32] = dataframe({id: [1]})` | Static-schema DataFrame | +| `dataframe[...]` | — | collection | `fn sink(rows: dataframe[...]) -> none:` | Runtime-schema DataFrame parameter | +| `series[T]` | — | collection | `var ids: series[i32] = rows["id"]` | Typed DataFrame column | ## Numeric Types @@ -94,7 +97,7 @@ fn time_demo() -> none: The parser also recognizes `date` and `time` as built-in type names in annotations. -## Collections and tensors +## Collections, tensors, and dataframes Arx exposes two public collection constructors: @@ -102,16 +105,19 @@ Arx exposes two public collection constructors: - `tensor[T, N]` for fixed-shape 1D numeric tensors - `tensor[T, d0, d1, ..., dN]` for fixed-shape multidimensional tensors - `tensor[T, ...]` for runtime-shaped tensor parameters +- `dataframe[name: T, ...]` for static-schema named-column DataFrames +- `dataframe[...]` for runtime-schema DataFrame parameters +- `series[T]` for typed DataFrame columns In the fixed-shape form, `...` is documentation prose for additional integer dimensions. The literal `...` marker is reserved for runtime-shaped tensor -parameters. +parameters and runtime-schema DataFrame parameters. 
The naming is intentional: Arx uses `Tensor` for homogeneous N-dimensional data, aligning with common data-science terminology and IRx's Arrow C++ backed -runtime. `Array` remains the term for one-dimensional Arrow-style data where it -is exposed, and future dataframe/table support will be separate and -heterogeneous. +runtime. `DataFrame` is the heterogeneous named-column abstraction backed by +Arrow C++ `Table`, and `Series` is the one-dimensional typed column view backed +by Arrow C++ `ChunkedArray`. ```arx fn tensor_demo() -> none: @@ -139,6 +145,32 @@ Current tensor rules in this phase: - current lowering is read-only and is focused on literal/default-initialized shaped tensors +Current DataFrame rules in this phase: + +- column types are fixed-width numeric types (`i8`, `i16`, `i32`, `i64`, `f32`, + `f64`) or `bool` +- string, nullable, nested, temporal, and user-defined columns are not part of + the MVP yet +- static-schema values use `dataframe[name: T, ...]` annotations and the + column-oriented `dataframe({...})` constructor +- constructor columns must be list literals, use declared column names, and have + equal row counts +- columns can be accessed as `rows.score` or `rows["score"]` +- `rows.nrows()` and `rows.ncols()` return row and column counts as `i64` +- `dataframe[...]` is accepted only in function and extern parameter annotations + for now; column access on runtime-schema parameters is not available yet + +```arx +fn dataframe_demo() -> i32: + var rows: dataframe[id: i32, score: f64] = dataframe({ + id: [1, 2, 3], + score: [0.5, 0.8, 1.0], + }) + var scores: series[f64] = rows.score + var ids: series[i32] = rows["id"] + return cast(rows.nrows(), i32) +``` + ## Casting Use the built-in `cast(value, type)` helper to convert values between supported diff --git a/docs/library/datatypes.md b/docs/library/datatypes.md index aabc176..ae3c7ff 100644 --- a/docs/library/datatypes.md +++ b/docs/library/datatypes.md @@ -28,8 +28,13 @@ fn add(a: i32, b: i32) 
-> i32: ```arx fn summarize(name: str, values: list[i32]) -> none: var grid: tensor[i32, 2, 2] = [[1, 2], [3, 4]] + var rows: dataframe[id: i32, score: f64] = dataframe({ + id: [1, 2], + score: [0.5, 1.0], + }) var count: i32 = 0 print(grid[0, 1]) + print(rows.nrows()) return ``` @@ -42,12 +47,20 @@ Common places where types appear: - shaped 1D tensor annotations: `tensor[i32, 4]` - multidimensional tensor annotations: `tensor[i32, 2, 2]` - runtime-shaped tensor parameters: `fn sink(x: tensor[i32, ...]) -> none:` +- static-schema DataFrame annotations: `dataframe[id: i32, score: f64]` +- runtime-schema DataFrame parameters: `fn sink(rows: dataframe[...]) -> none:` +- typed DataFrame column annotations: `series[f64]` `tensor[T, ...]` is currently parameter-only. Use fixed-shape tensor annotations for variables, fields, and return types until runtime-shaped storage and return semantics are defined. Runtime-shaped tensor parameters can be passed through, but indexed access currently requires a static-shape tensor annotation. +`dataframe[...]` follows the same current restriction: it is accepted only in +function and extern parameter annotations. Static-schema DataFrames can be +constructed with `dataframe({...})`, and their columns can be accessed with +either `rows.score` or `rows["score"]`. 
+ ## Built-in Type Reference For the catalog of built-in types, aliases, and examples, see @@ -58,5 +71,5 @@ That page covers: - numeric types and aliases - `none` as the unit type and value - string, character, and temporal types -- lists, tensors, and current limitations +- lists, tensors, dataframes, series, and current limitations - the `cast(value, type)` helper diff --git a/docs/proposals/arrow-backed-dataframes.md b/docs/proposals/arrow-backed-dataframes.md deleted file mode 100644 index 2293a85..0000000 --- a/docs/proposals/arrow-backed-dataframes.md +++ /dev/null @@ -1,310 +0,0 @@ -# Arrow-backed DataFrames - -This proposal defines the first Arx DataFrame surface and its intended IRx -runtime model. It is a design document only; implementation should land in small -follow-up changes across ASTx, IRx, Arx, tests, and docs. - -## Goals - -- Add a public `dataframe[...]` type for heterogeneous named columns. -- Add a public `series[T]` type for typed DataFrame columns. -- Back DataFrames with Apache Arrow C++ table storage. -- Keep the first implementation read-only and intentionally small. -- Preserve the existing roles of `tensor`, Arrow arrays, and `buffer/view`. - -## Non-goals - -- General-purpose dictionary or record literals. -- DataFrame mutation, append, join, group-by, sort, or lazy query planning. -- Row objects and row-wise indexing. -- String, nullable, nested, temporal, or user-defined column types in the MVP. -- New Arx-owned AST node types or Arx-local lowering behavior. - -## Surface design - -### Builtin type - -`dataframe` is a builtin type with a static schema: - -```arx -dataframe[id: i32, score: f64, active: bool] -``` - -Rules: - -- column names are identifiers; -- column names must be unique; -- column order is preserved; -- row count is runtime metadata, not part of the type; -- schema is part of the type; -- MVP column types are fixed-width numeric types and `bool`. 
- -The first column type set should match the fixed-width Arrow primitive support -already shared by arrays and tensors where possible: - -- signed integers: `i8`, `i16`, `i32`, `i64` -- floating point: `f32`, `f64` -- boolean: `bool` - -Unsigned integer aliases may be added if/when the Arx surface exposes them -consistently. - -Boolean columns should be supported as Arrow boolean arrays, but they should not -be assumed to be `buffer/view` compatible because Arrow stores boolean values in -bit-packed buffers. - -### Public series type - -`series[T]` is also public: - -```arx -series[i32] -series[f64] -series[bool] -``` - -A series represents one typed DataFrame column. The runtime representation -should be Arrow `ChunkedArray`, even if the first implementation only creates -single-chunk columns. - -### Runtime-schema form - -The runtime-schema form is: - -```arx -dataframe[...] -``` - -For the first phase, runtime-schema DataFrames are only valid in function and -extern parameter annotations. This matches the current runtime-layout rule for -tensors and keeps declaration defaults and local type checking deterministic. - -The roadmap should later expand this rule consistently for every type that uses -the same runtime-layout/schema approach, including both tensors and DataFrames. - -### Builtin constructor - -`dataframe` is also a builtin function. The preferred constructor syntax is: - -```arx -var rows: dataframe[id: i32, score: f64] = dataframe({ - id: [1, 2, 3], - score: [0.5, 0.8, 1.0], -}) -``` - -The `{ ... }` argument is a constructor-only column map in the MVP. It should -not imply that Arx has gained general dictionary or record literal semantics. 
- -Constructor rules: - -- the target DataFrame type must be explicit in the MVP; -- keys must be column identifiers; -- values must be column literals; -- every declared column must be present exactly once; -- no undeclared column may be present; -- all columns must have the same length; -- each column value must match its declared type. - -Future type inference may allow the constructor to infer -`dataframe[id: i32, score: f64]` from the column map, but that is not required -for the first phase. - -### Column access - -Both static field access and string-key access should be supported: - -```arx -var scores: series[f64] = rows.score -var ids: series[i32] = rows["id"] -``` - -Rules: - -- `rows.score` is statically validated when `rows` has a known schema; -- `rows["id"]` is statically validated when the key is a string literal and the - schema is known; -- dynamic string lookup on `dataframe[...]` runtime-schema values is deferred. - -### Basic methods - -The MVP should include only simple metadata queries: - -```arx -rows.nrows() -rows.ncols() -``` - -Both should lower to Arrow table metadata queries and return integer values. - -## Arrow backing - -### DataFrame storage - -DataFrames should wrap Arrow C++ `arrow::Table`. - -`arrow::Table` is the best long-term match because it provides: - -- named columns; -- a schema; -- heterogeneous column types; -- equal row count across columns; -- immutable, shareable columnar storage; -- cheap projection and slicing semantics; -- a natural base for Arrow compute integration later. - -DataFrame values should lower to opaque table handles, not to `buffer/view`. A -table is heterogeneous, may contain chunked columns, and cannot be represented -as one flat layout descriptor. - -### Series storage - -Series should wrap Arrow C++ `arrow::ChunkedArray`. - -This matches table columns directly. 
The first implementation may construct each -series as a single chunk from one Arrow array, but the public model should not -depend on single-chunk storage. - -For later scalar indexing, a fixed-width, non-null, single-chunk series can -borrow a `buffer/view`. General chunked indexing should remain series-specific. - -### Alternatives considered - -`arrow::RecordBatch` : Good for import/export and literal construction, but too -narrow as the core DataFrame value because it is single-batch/single-chunk. - -`arrow::StructArray` : Useful for row-like interop, but it does not model a -DataFrame as directly as `arrow::Table`. - -Arrow Dataset or scanner APIs : Useful later for lazy IO and query planning, but -too high-level for the core in-memory value. - -Arrow compute / Acero : Useful later for filtering, projection, joins, group-by, -and sorting. These should build on top of the table abstraction instead of -replacing it. - -## ASTx and IRx work - -Core nodes and semantic support should be added in ASTx/IRx first. Arx should -only parse the surface syntax and emit the IRx/ASTx facade nodes. - -Suggested ASTx additions: - -- `DataFrameType` -- `SeriesType` -- `DataFrameColumn` -- `DataFrameLiteral` -- `DataFrameColumnAccess` -- `DataFrameStringColumnAccess` -- `DataFrameRowCount` -- `DataFrameColumnCount` -- `DataFrameRetain` -- `DataFrameRelease` -- `SeriesRetain` -- `SeriesRelease` - -Suggested semantic metadata: - -- `DATAFRAME_SCHEMA_EXTRA` -- `DATAFRAME_COLUMN_INDEX_EXTRA` -- `SERIES_ELEMENT_TYPE_EXTRA` -- `SERIES_NULLABLE_EXTRA` - -Each schema entry should include: - -- column name; -- column type; -- nullable flag, initially always false; -- stable column index. 
- -## Runtime ABI sketch - -The Arrow runtime should add opaque table and series handles beside the existing -schema, array, and tensor handles: - -```c -typedef struct irx_arrow_table_handle irx_arrow_table_handle; -typedef struct irx_arrow_chunked_array_handle - irx_arrow_chunked_array_handle; -``` - -Initial C ABI: - -```c -int irx_arrow_table_new_from_arrays( - int64_t column_count, - const char** names, - irx_arrow_array_handle** arrays, - irx_arrow_table_handle** out_table); - -int64_t irx_arrow_table_num_rows( - const irx_arrow_table_handle* table); -int64_t irx_arrow_table_num_columns( - const irx_arrow_table_handle* table); - -int irx_arrow_table_column_by_name( - const irx_arrow_table_handle* table, - const char* name, - irx_arrow_chunked_array_handle** out_column); -int irx_arrow_table_column_by_index( - const irx_arrow_table_handle* table, - int32_t index, - irx_arrow_chunked_array_handle** out_column); - -int irx_arrow_table_retain(irx_arrow_table_handle* table); -void irx_arrow_table_release(irx_arrow_table_handle* table); - -int irx_arrow_chunked_array_retain( - irx_arrow_chunked_array_handle* column); -void irx_arrow_chunked_array_release( - irx_arrow_chunked_array_handle* column); -``` - -The runtime should preserve existing Arrow runtime conventions: - -- integer return codes; -- `irx_arrow_last_error()`; -- explicit retain/release; -- C ABI boundary over the Arrow C++ implementation; -- reuse of the shared Arrow C++ runtime source where practical. - -## Lowering model - -MVP lowering should support: - -1. build each constructor column as an Arrow array; -2. build an Arrow table from column names and array handles; -3. lower DataFrame values to opaque table handles; -4. lower `nrows()` and `ncols()` to table metadata calls; -5. lower `df.column` and `df["column"]` to chunked-array extraction; -6. retain/release table and series handles. - -Series indexing, filtering, projections, and Arrow compute integration should be -follow-up work. 
- -## Parser and documentation impact - -Arx parser updates should include: - -- `dataframe[...]` type parsing; -- `series[T]` type parsing; -- runtime-schema marker parsing for `dataframe[...]`; -- constructor-only column map parsing inside `dataframe({ ... })`; -- static schema validation for duplicate, missing, and extra columns; -- field and string-key column access; -- `syntax.json` updates for the new structural form if lexer/tooling metadata - needs to describe it. - -Docs and examples should include valid Douki module docstrings in every new `.x` -file. - -## Suggested rollout - -1. Add ASTx and IRx type/schema nodes. -2. Add IRx semantic validation for DataFrame and Series nodes. -3. Extend the Arrow C++ runtime with table and chunked-array handles. -4. Add lowering for constructor, row/column counts, and column extraction. -5. Add Arx parser support for the surface syntax. -6. Add docs, examples, syntax manifest updates, and parser/runtime tests. - -Each phase should include targeted tests before expanding the surface area. diff --git a/docs/roadmap.md b/docs/roadmap.md index 5d3b37e..839a8ca 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -54,20 +54,21 @@ type that uses that approach. - [ ] Keep tensor semantics aligned with the Arrow-backed runtime rather than adding Arx-local lowering behavior. -## Implement Arrow-backed DataFrames +## DataFrames and Series -DataFrames should be a distinct public collection abstraction for heterogeneous -named columns. The accepted design direction is documented in -[Arrow-backed DataFrames](proposals/arrow-backed-dataframes.md). +DataFrames are a distinct public collection abstraction for heterogeneous named +columns. Static-schema values use `dataframe[name: T, ...]`, column views use +`series[T]`, and literals are constructed with `dataframe({...})`. -- [ ] Add the builtin `dataframe[...]` type. -- [ ] Add the builtin `series[T]` type for typed DataFrame columns. 
-- [ ] Add the builtin `dataframe({...})` constructor for column-oriented +- [x] Add the builtin `dataframe[...]` type. +- [x] Add the builtin `series[T]` type for typed DataFrame columns. +- [x] Add the builtin `dataframe({...})` constructor for column-oriented literals. -- [ ] Back DataFrame values with Arrow C++ `arrow::Table`. -- [ ] Back Series values with Arrow C++ `arrow::ChunkedArray`. -- [ ] Keep the MVP limited to fixed-width numeric and `bool` columns. +- [x] Back DataFrame values with Arrow C++ `arrow::Table`. +- [x] Back Series values with Arrow C++ `arrow::ChunkedArray`. +- [x] Keep the MVP limited to fixed-width numeric and `bool` columns. - [ ] Add string, nullable, nested, temporal, and user-defined column support after the fixed-width MVP is stable. -- [ ] Keep runtime-schema `dataframe[...]` annotations limited to function and - extern parameters at first, then expand this consistently with tensors. +- [ ] Expand runtime-layout/schema annotations beyond function and extern + parameters, applying the same behavior to both `dataframe[...]` and + `tensor[T, ...]`. diff --git a/examples/dataframe.x b/examples/dataframe.x new file mode 100644 index 0000000..21b584a --- /dev/null +++ b/examples/dataframe.x @@ -0,0 +1,24 @@ +``` +title: DataFrame example +summary: Demonstrates fixed-width Arrow-backed DataFrame syntax. +``` + +fn row_count(rows: dataframe[id: i32, score: f64]) -> i32: + ``` + title: row_count + summary: Returns the number of rows in a static-schema DataFrame. + ``` + return cast(rows.nrows(), i32) + +fn main() -> i32: + ``` + title: main + summary: Builds a DataFrame and accesses columns by name and string key. 
+ ``` + var rows: dataframe[id: i32, score: f64] = dataframe({ + id: [1, 2, 3], + score: [0.5, 0.8, 1.0], + }) + var scores: series[f64] = rows.score + var ids: series[i32] = rows["id"] + return row_count(rows) diff --git a/mkdocs.yaml b/mkdocs.yaml index 35acf9e..43bdc3c 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -45,8 +45,6 @@ nav: - API Docs: api/ - Syntax: syntax.md - Roadmap: roadmap.md - - Proposals: - - Arrow-backed DataFrames: proposals/arrow-backed-dataframes.md - Contributing Guide: contributing.md - Sponsor: sponsor.md - Partners: partners.md diff --git a/packages/arx/src/arx/builtins.py b/packages/arx/src/arx/builtins.py index eadeb75..b7f49c2 100644 --- a/packages/arx/src/arx/builtins.py +++ b/packages/arx/src/arx/builtins.py @@ -19,6 +19,7 @@ _BUILTIN_RESOURCE_DIR = "builtins" BUILTIN_CAST = "cast" +BUILTIN_DATAFRAME = "dataframe" BUILTIN_PRINT = "print" BUILTIN_RANGE = "range" _GENERATORS_MODULE = f"{BUILTIN_NAMESPACE}.generators" @@ -69,6 +70,7 @@ class AmbientBuiltinBinding: __all__ = [ "BUILTIN_CAST", + "BUILTIN_DATAFRAME", "BUILTIN_NAMESPACE", "BUILTIN_PRINT", "BUILTIN_RANGE", @@ -96,7 +98,7 @@ def is_builtin(name: str) -> bool: returns: type: bool """ - return name in {BUILTIN_CAST, BUILTIN_PRINT} + return name in {BUILTIN_CAST, BUILTIN_DATAFRAME, BUILTIN_PRINT} def build_cast( diff --git a/packages/arx/src/arx/dataframe.py b/packages/arx/src/arx/dataframe.py new file mode 100644 index 0000000..d2337e4 --- /dev/null +++ b/packages/arx/src/arx/dataframe.py @@ -0,0 +1,248 @@ +""" +title: DataFrame surface helpers for Arx. +summary: >- + Adapt Arx surface dataframe syntax to IRx DataFrame nodes while keeping user- + facing schema rules local to Arx. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import cast + +import astx + +from irx.analysis.resolved_nodes import SemanticInfo +from irx.builtins.collections.dataframe import ( + DATAFRAME_SCHEMA_EXTRA, + DataFrameSchema, + dataframe_column_type_is_supported, + schema_from_type, +) + + +@dataclass(frozen=True) +class DataFrameBinding: + """ + title: Static DataFrame binding metadata. + attributes: + schema: + type: DataFrameSchema + """ + + schema: DataFrameSchema + + +def is_dataframe_type(data_type: astx.DataType | None) -> bool: + """ + title: Return whether one type is a DataFrame type. + parameters: + data_type: + type: astx.DataType | None + returns: + type: bool + """ + return isinstance(data_type, astx.DataFrameType) + + +def is_series_type(data_type: astx.DataType | None) -> bool: + """ + title: Return whether one type is a Series type. + parameters: + data_type: + type: astx.DataType | None + returns: + type: bool + """ + return isinstance(data_type, astx.SeriesType) + + +def dataframe_type( + columns: tuple[astx.DataFrameColumn, ...], +) -> astx.DataFrameType: + """ + title: Build one static-schema DataFrame surface type. + parameters: + columns: + type: tuple[astx.DataFrameColumn, Ellipsis] + returns: + type: astx.DataFrameType + """ + if not columns: + raise ValueError("dataframe types require at least one column") + seen: set[str] = set() + for column in columns: + if column.name in seen: + raise ValueError(f"duplicate dataframe column '{column.name}'") + seen.add(column.name) + if column.nullable: + raise ValueError( + "nullable dataframe columns are not supported yet" + ) + if not dataframe_column_type_is_supported(column.type_): + raise ValueError( + "dataframe columns currently support only fixed-width " + "numeric and bool types" + ) + return astx.DataFrameType(columns) + + +def runtime_dataframe_type() -> astx.DataFrameType: + """ + title: Build one runtime-schema DataFrame surface type. 
+ returns: + type: astx.DataFrameType + """ + return astx.DataFrameType() + + +def series_type(element_type: astx.DataType) -> astx.SeriesType: + """ + title: Build one Series surface type. + parameters: + element_type: + type: astx.DataType + returns: + type: astx.SeriesType + """ + if not dataframe_column_type_is_supported(element_type): + raise ValueError( + "series element types currently support only fixed-width " + "numeric and bool types" + ) + return astx.SeriesType(element_type) + + +def binding_from_type( + data_type: astx.DataType | None, +) -> DataFrameBinding | None: + """ + title: Build one static DataFrame binding from one declared type. + parameters: + data_type: + type: astx.DataType | None + returns: + type: DataFrameBinding | None + """ + if not isinstance(data_type, astx.DataFrameType): + return None + schema = schema_from_type(data_type) + if schema is None: + return None + return DataFrameBinding(schema) + + +def attach_binding(node: astx.AST, binding: DataFrameBinding) -> None: + """ + title: Attach static DataFrame metadata to one AST node. + parameters: + node: + type: astx.AST + binding: + type: DataFrameBinding + """ + info = cast(SemanticInfo | None, getattr(node, "semantic", None)) + if info is None or not isinstance(info, SemanticInfo): + info = SemanticInfo() + setattr(node, "semantic", info) + info.extras[DATAFRAME_SCHEMA_EXTRA] = binding.schema + + +def coerce_expression( + expr: astx.Expr, + target_type: astx.DataType, + *, + context: str, +) -> astx.Expr: + """ + title: Coerce one parsed expression into one declared DataFrame type. 
+ parameters: + expr: + type: astx.Expr + target_type: + type: astx.DataType + context: + type: str + returns: + type: astx.Expr + """ + del context + if not isinstance(target_type, astx.DataFrameType): + return expr + if not isinstance(expr, astx.DataFrameLiteral): + return expr + binding = binding_from_type(target_type) + if binding is None: + raise ValueError( + "dataframe literals require a static dataframe schema" + ) + coerced = astx.DataFrameLiteral( + _columns_in_schema_order(expr, binding.schema), + type_=target_type, + ) + attach_binding(coerced, binding) + return coerced + + +def column_type( + binding: DataFrameBinding, + column_name: str, +) -> astx.DataType | None: + """ + title: Return the type of one DataFrame column. + parameters: + binding: + type: DataFrameBinding + column_name: + type: str + returns: + type: astx.DataType | None + """ + column = binding.schema.column(column_name) + return None if column is None else column.type_ + + +def _columns_in_schema_order( + literal: astx.DataFrameLiteral, + schema: DataFrameSchema, +) -> tuple[astx.DataFrameLiteralColumn, ...]: + """ + title: Return literal columns ordered by schema. 
+ parameters: + literal: + type: astx.DataFrameLiteral + schema: + type: DataFrameSchema + returns: + type: tuple[astx.DataFrameLiteralColumn, Ellipsis] + """ + literal_columns = {column.name: column for column in literal.columns} + ordered: list[astx.DataFrameLiteralColumn] = [] + for schema_column in schema.columns: + column = literal_columns.get(schema_column.name) + if column is None: + raise ValueError( + f"dataframe literal is missing column '{schema_column.name}'" + ) + ordered.append(column) + schema_column_names = {column.name for column in schema.columns} + extra = sorted(set(literal_columns) - schema_column_names) + if extra: + raise ValueError( + "dataframe literal has undeclared columns: " + ", ".join(extra) + ) + return tuple(ordered) + + +__all__ = [ + "DataFrameBinding", + "attach_binding", + "binding_from_type", + "coerce_expression", + "column_type", + "dataframe_type", + "is_dataframe_type", + "is_series_type", + "runtime_dataframe_type", + "series_type", +] diff --git a/packages/arx/src/arx/lexer/syntax.json b/packages/arx/src/arx/lexer/syntax.json index 06eb2fc..3a78eab 100644 --- a/packages/arx/src/arx/lexer/syntax.json +++ b/packages/arx/src/arx/lexer/syntax.json @@ -149,6 +149,17 @@ }, "relative_imports": { "leading_dot_tokens": true + }, + "dataframe_type": { + "static_schema": "dataframe[name: type, ...]", + "runtime_schema_parameter": "dataframe[...]" + }, + "series_type": { + "form": "series[type]" + }, + "dataframe_constructor": { + "form": "dataframe({name: [literal, ...], ...})", + "column_access": ["value.name", "value[\"name\"]"] } }, diff --git a/packages/arx/src/arx/parser/base.py b/packages/arx/src/arx/parser/base.py index 0ea8a10..a297370 100644 --- a/packages/arx/src/arx/parser/base.py +++ b/packages/arx/src/arx/parser/base.py @@ -9,6 +9,7 @@ import astx +from arx.dataframe import DataFrameBinding from arx.lexer import Token, TokenList from arx.parser.state import ( ParsedAnnotation, @@ -32,6 +33,8 @@ class ParserMixinBase: 
type: set[str] tensor_scopes: type: list[dict[str, TensorBinding | None]] + dataframe_scopes: + type: list[dict[str, DataFrameBinding | None]] return_type_scopes: type: list[astx.DataType] template_type_scopes: @@ -47,6 +50,7 @@ class ParserMixinBase: list_scopes: list[set[str]] known_class_names: set[str] tensor_scopes: list[dict[str, TensorBinding | None]] + dataframe_scopes: list[dict[str, DataFrameBinding | None]] return_type_scopes: list[astx.DataType] template_type_scopes: list[dict[str, astx.DataType]] value_scopes: list[set[str]] @@ -57,6 +61,7 @@ def _push_value_scope( declared_names: tuple[str, ...] = (), declared_lists: tuple[str, ...] = (), declared_tensors: dict[str, TensorBinding | None] | None = None, + declared_dataframes: dict[str, DataFrameBinding | None] | None = None, ) -> None: """ title: Push one visible-name scope. @@ -67,10 +72,13 @@ def _push_value_scope( type: tuple[str, Ellipsis] declared_tensors: type: dict[str, TensorBinding | None] | None + declared_dataframes: + type: dict[str, DataFrameBinding | None] | None """ del declared_names del declared_lists del declared_tensors + del declared_dataframes raise NotImplementedError def _pop_value_scope(self) -> None: @@ -130,6 +138,35 @@ def _is_tensor_name(self, name: str) -> bool: del name raise NotImplementedError + def _declare_dataframe_name( + self, + name: str, + binding: DataFrameBinding | None, + ) -> None: + """ + title: Record one visible DataFrame binding in the current scope. + parameters: + name: + type: str + binding: + type: DataFrameBinding | None + """ + del name + del binding + raise NotImplementedError + + def _is_dataframe_name(self, name: str) -> bool: + """ + title: Return whether one visible name is declared as a DataFrame. + parameters: + name: + type: str + returns: + type: bool + """ + del name + raise NotImplementedError + def _declare_list_name(self, name: str) -> None: """ title: Record one visible list binding in the current scope. 
@@ -164,6 +201,21 @@ def _lookup_tensor_binding(self, name: str) -> TensorBinding | None: del name raise NotImplementedError + def _lookup_dataframe_binding( + self, + name: str, + ) -> DataFrameBinding | None: + """ + title: Look up one visible DataFrame binding by name. + parameters: + name: + type: str + returns: + type: DataFrameBinding | None + """ + del name + raise NotImplementedError + def _push_template_scope( self, template_params: tuple[astx.TemplateParam, ...] = (), @@ -353,6 +405,7 @@ def parse_block( declared_names: tuple[str, ...] = (), declared_lists: tuple[str, ...] = (), declared_tensors: dict[str, TensorBinding | None] | None = None, + declared_dataframes: dict[str, DataFrameBinding | None] | None = None, ) -> astx.Block: """ title: Parse one block of nodes. @@ -365,10 +418,18 @@ def parse_block( type: tuple[str, Ellipsis] declared_tensors: type: dict[str, TensorBinding | None] | None + declared_dataframes: + type: dict[str, DataFrameBinding | None] | None returns: type: astx.Block """ - del allow_docstring, declared_names, declared_lists, declared_tensors + del ( + allow_docstring, + declared_names, + declared_lists, + declared_tensors, + declared_dataframes, + ) raise NotImplementedError def parse_type( diff --git a/packages/arx/src/arx/parser/control_flow.py b/packages/arx/src/arx/parser/control_flow.py index 9addede..e095aec 100644 --- a/packages/arx/src/arx/parser/control_flow.py +++ b/packages/arx/src/arx/parser/control_flow.py @@ -13,6 +13,16 @@ from astx import SourceLocation +from arx.dataframe import ( + DataFrameBinding, + is_dataframe_type, +) +from arx.dataframe import ( + binding_from_type as dataframe_binding_from_type, +) +from arx.dataframe import ( + coerce_expression as coerce_dataframe_expression, +) from arx.docstrings import validate_docstring from arx.exceptions import ParserException from arx.lexer import Token, TokenKind @@ -21,9 +31,11 @@ from arx.tensor import ( TensorBinding, binding_from_type, - coerce_expression, 
is_tensor_type, ) +from arx.tensor import ( + coerce_expression as coerce_tensor_expression, +) class ControlFlowParserMixin(ParserMixinBase): @@ -63,6 +75,9 @@ def parse_block( declared_names: tuple[str, ...] = (), declared_lists: tuple[str, ...] = (), declared_tensors: dict[str, TensorBinding | None] | None = None, + declared_dataframes: ( + dict[str, DataFrameBinding | None] | None + ) = None, ) -> astx.Block: """ title: Parse a block of nodes. @@ -75,6 +90,8 @@ def parse_block( type: tuple[str, Ellipsis] declared_tensors: type: dict[str, TensorBinding | None] | None + declared_dataframes: + type: dict[str, DataFrameBinding | None] | None returns: type: astx.Block """ @@ -94,6 +111,7 @@ def parse_block( declared_names, declared_lists, declared_tensors, + declared_dataframes, ) block = astx.Block() @@ -286,11 +304,17 @@ def parse_for_count_stmt( "Tensor loop initializers require a static shape." ) declared_tensors[initializer.name] = binding + declared_dataframes: dict[str, DataFrameBinding | None] = {} + if is_dataframe_type(initializer.type_): + declared_dataframes[initializer.name] = ( + dataframe_binding_from_type(initializer.type_) + ) self._push_value_scope( (initializer.name,), declared_lists, declared_tensors, + declared_dataframes, ) try: condition = self.parse_expression() @@ -341,8 +365,13 @@ def parse_inline_var_declaration(self) -> astx.InlineVariableDeclaration: self._consume_operator("=") try: - value = coerce_expression( - cast(astx.Expr, self.parse_expression()), + raw_value = cast(astx.Expr, self.parse_expression()) + value = coerce_dataframe_expression( + coerce_tensor_expression( + raw_value, + var_type, + context=f"inline variable '{name}'", + ), var_type, context=f"inline variable '{name}'", ) @@ -384,8 +413,13 @@ def parse_var_expr(self) -> astx.VariableDeclaration: if self._is_operator("="): self._consume_operator("=") try: - value = coerce_expression( - cast(astx.Expr, self.parse_expression()), + raw_value = cast(astx.Expr, 
self.parse_expression()) + value = coerce_dataframe_expression( + coerce_tensor_expression( + raw_value, + var_type, + context=f"variable '{name}'", + ), var_type, context=f"variable '{name}'", ) @@ -416,6 +450,11 @@ def parse_var_expr(self) -> astx.VariableDeclaration: "Tensor declarations require a static shape." ) self._declare_tensor_name(name, binding) + if is_dataframe_type(var_type): + self._declare_dataframe_name( + name, + dataframe_binding_from_type(var_type), + ) if isinstance(var_type, astx.ListType): self._declare_list_name(name) return declaration @@ -479,8 +518,12 @@ def parse_return_function(self) -> astx.FunctionReturn: return_type = self._current_return_type() if return_type is not None: try: - value = coerce_expression( - cast(astx.Expr, value), + value = coerce_dataframe_expression( + coerce_tensor_expression( + cast(astx.Expr, value), + return_type, + context="return value", + ), return_type, context="return value", ) diff --git a/packages/arx/src/arx/parser/core.py b/packages/arx/src/arx/parser/core.py index 782e590..2c5d8d4 100644 --- a/packages/arx/src/arx/parser/core.py +++ b/packages/arx/src/arx/parser/core.py @@ -13,6 +13,7 @@ import astx +from arx.dataframe import DataFrameBinding from arx.docstrings import validate_docstring from arx.exceptions import ParserException from arx.lexer import Token, TokenKind, TokenList @@ -34,6 +35,8 @@ class ParserCore(ParserMixinBase): type: set[str] tensor_scopes: type: list[dict[str, TensorBinding | None]] + dataframe_scopes: + type: list[dict[str, DataFrameBinding | None]] return_type_scopes: type: list[astx.DataType] template_type_scopes: @@ -49,6 +52,7 @@ class ParserCore(ParserMixinBase): list_scopes: list[set[str]] known_class_names: set[str] tensor_scopes: list[dict[str, TensorBinding | None]] + dataframe_scopes: list[dict[str, DataFrameBinding | None]] return_type_scopes: list[astx.DataType] template_type_scopes: list[dict[str, astx.DataType]] value_scopes: list[set[str]] @@ -82,6 +86,7 @@ def 
__init__(self, tokens: TokenList = TokenList([])) -> None: self.list_scopes = [set()] self.known_class_names = set() self.tensor_scopes = [{}] + self.dataframe_scopes = [{}] self.return_type_scopes = [] self.template_type_scopes = [] self.value_scopes = [set()] @@ -95,6 +100,7 @@ def clean(self) -> None: self.list_scopes = [set()] self.known_class_names = set() self.tensor_scopes = [{}] + self.dataframe_scopes = [{}] self.return_type_scopes = [] self.template_type_scopes = [] self.value_scopes = [set()] @@ -216,6 +222,9 @@ def _push_value_scope( declared_names: tuple[str, ...] = (), declared_lists: tuple[str, ...] = (), declared_tensors: dict[str, TensorBinding | None] | None = None, + declared_dataframes: ( + dict[str, DataFrameBinding | None] | None + ) = None, ) -> None: """ title: Push one visible-name scope for expression disambiguation. @@ -226,10 +235,13 @@ def _push_value_scope( type: tuple[str, Ellipsis] declared_tensors: type: dict[str, TensorBinding | None] | None + declared_dataframes: + type: dict[str, DataFrameBinding | None] | None """ self.value_scopes.append(set(declared_names)) self.list_scopes.append(set(declared_lists)) self.tensor_scopes.append(dict(declared_tensors or {})) + self.dataframe_scopes.append(dict(declared_dataframes or {})) def _pop_value_scope(self) -> None: """ @@ -238,6 +250,7 @@ def _pop_value_scope(self) -> None: self.value_scopes.pop() self.list_scopes.pop() self.tensor_scopes.pop() + self.dataframe_scopes.pop() def _declare_value_name(self, name: str) -> None: """ @@ -288,6 +301,32 @@ def _is_tensor_name(self, name: str) -> bool: """ return any(name in scope for scope in reversed(self.tensor_scopes)) + def _declare_dataframe_name( + self, + name: str, + binding: DataFrameBinding | None, + ) -> None: + """ + title: Record one visible DataFrame binding in the current scope. 
+ parameters: + name: + type: str + binding: + type: DataFrameBinding | None + """ + self.dataframe_scopes[-1][name] = binding + + def _is_dataframe_name(self, name: str) -> bool: + """ + title: Return whether one visible name is declared as a DataFrame. + parameters: + name: + type: str + returns: + type: bool + """ + return any(name in scope for scope in reversed(self.dataframe_scopes)) + def _declare_list_name(self, name: str) -> None: """ title: Record one visible list binding in the current scope. @@ -322,6 +361,23 @@ def _lookup_tensor_binding(self, name: str) -> TensorBinding | None: return scope[name] return None + def _lookup_dataframe_binding( + self, + name: str, + ) -> DataFrameBinding | None: + """ + title: Return one visible DataFrame binding by name. + parameters: + name: + type: str + returns: + type: DataFrameBinding | None + """ + for scope in reversed(self.dataframe_scopes): + if name in scope: + return scope[name] + return None + def _push_template_scope( self, template_params: tuple[astx.TemplateParam, ...] 
= (), diff --git a/packages/arx/src/arx/parser/declarations.py b/packages/arx/src/arx/parser/declarations.py index 43b9235..edffa14 100644 --- a/packages/arx/src/arx/parser/declarations.py +++ b/packages/arx/src/arx/parser/declarations.py @@ -14,6 +14,16 @@ from astx import SourceLocation from astx.types import AnyType +from arx.dataframe import ( + DataFrameBinding, + is_dataframe_type, +) +from arx.dataframe import ( + binding_from_type as dataframe_binding_from_type, +) +from arx.dataframe import ( + coerce_expression as coerce_dataframe_expression, +) from arx.docstrings import validate_docstring from arx.exceptions import ParserException from arx.lexer import Token, TokenKind @@ -34,9 +44,11 @@ from arx.tensor import ( TensorBinding, binding_from_type, - coerce_expression, is_tensor_type, ) +from arx.tensor import ( + coerce_expression as coerce_tensor_expression, +) class DeclarationParserMixin(ParserMixinBase): @@ -44,6 +56,31 @@ class DeclarationParserMixin(ParserMixinBase): title: Declaration parser mixin. """ + def _coerce_declared_expression( + self, + expr: astx.Expr, + target_type: astx.DataType, + *, + context: str, + ) -> astx.Expr: + """ + title: Coerce one expression for declared collection target types. 
+ parameters: + expr: + type: astx.Expr + target_type: + type: astx.DataType + context: + type: str + returns: + type: astx.Expr + """ + return coerce_dataframe_expression( + coerce_tensor_expression(expr, target_type, context=context), + target_type, + context=context, + ) + def _parse_argument_default( self, arg_name: str, @@ -64,7 +101,7 @@ def _parse_argument_default( self._consume_operator("=") try: - return coerce_expression( + return self._coerce_declared_expression( cast(astx.Expr, self.parse_expression()), arg_type, context=f"default value for parameter '{arg_name}'", @@ -137,6 +174,9 @@ def parse_function( declared_tensors=self._tensor_bindings_for_arguments( proto.args.nodes ), + declared_dataframes=self._dataframe_bindings_for_arguments( + proto.args.nodes + ), ) finally: if pushed_return_type: @@ -338,7 +378,7 @@ def parse_field_decl( if self._is_operator("="): self._consume_operator("=") try: - initializer = coerce_expression( + initializer = self._coerce_declared_expression( cast(astx.Expr, self.parse_expression()), field_type, context=f"field '{name}'", @@ -429,6 +469,11 @@ def parse_method_decl( declared_tensors=self._tensor_bindings_for_arguments( prototype.args.nodes ), + declared_dataframes=( + self._dataframe_bindings_for_arguments( + prototype.args.nodes + ) + ), ) if body.nodes: raise ParserException( @@ -448,6 +493,11 @@ def parse_method_decl( declared_tensors=self._tensor_bindings_for_arguments( prototype.args.nodes ), + declared_dataframes=( + self._dataframe_bindings_for_arguments( + prototype.args.nodes + ) + ), ) elif not ( self._has_modifier(modifiers, "abstract") @@ -1077,6 +1127,27 @@ def _tensor_bindings_for_arguments( bindings[argument.name] = binding_from_type(argument.type_) return bindings + def _dataframe_bindings_for_arguments( + self, + arguments: tuple[astx.Argument, ...] | list[astx.Argument], + ) -> dict[str, DataFrameBinding | None]: + """ + title: Build one DataFrame scope map for function arguments. 
+ parameters: + arguments: + type: tuple[astx.Argument, Ellipsis] | list[astx.Argument] + returns: + type: dict[str, DataFrameBinding | None] + """ + bindings: dict[str, DataFrameBinding | None] = {} + for argument in arguments: + if not is_dataframe_type(argument.type_): + continue + bindings[argument.name] = dataframe_binding_from_type( + argument.type_ + ) + return bindings + def _list_names_for_arguments( self, arguments: tuple[astx.Argument, ...] | list[astx.Argument], diff --git a/packages/arx/src/arx/parser/expressions.py b/packages/arx/src/arx/parser/expressions.py index 272a09e..c2fe067 100644 --- a/packages/arx/src/arx/parser/expressions.py +++ b/packages/arx/src/arx/parser/expressions.py @@ -14,6 +14,12 @@ from irx.builtins.collections.tensor import TENSOR_LAYOUT_EXTRA, TensorLayout from arx import builtins +from arx.dataframe import ( + attach_binding as attach_dataframe_binding, +) +from arx.dataframe import ( + column_type as dataframe_column_type, +) from arx.exceptions import ParserException from arx.lexer import TokenKind from arx.parser.base import ParserMixinBase @@ -133,6 +139,24 @@ def parse_postfix(self) -> astx.AST: self._consume_operator(",") self._consume_operator(")") + if self._is_dataframe_expr(expr) and member_name in { + "nrows", + "ncols", + }: + if template_args is not None: + raise ParserException( + "DataFrame metadata methods do not accept " + "template arguments." + ) + if args: + raise ParserException( + f"DataFrame {member_name} expects no arguments." 
+ ) + if member_name == "nrows": + expr = astx.DataFrameRowCount(expr) + else: + expr = astx.DataFrameColumnCount(expr) + continue if ( member_name == "append" and isinstance(expr, astx.Identifier) @@ -168,6 +192,8 @@ def parse_postfix(self) -> astx.AST: class_name = self._class_name_from_expr(expr) if class_name is not None: expr = astx.StaticFieldAccess(class_name, member_name) + elif self._is_dataframe_expr(expr): + expr = self._parse_dataframe_field_access(expr, member_name) else: expr = astx.FieldAccess(expr, member_name) @@ -303,10 +329,20 @@ def parse_identifier_expr(self) -> astx.AST: binding = self._lookup_tensor_binding(id_name) if binding is not None: attach_binding(identifier, binding) + dataframe_binding = self._lookup_dataframe_binding(id_name) + if dataframe_binding is not None: + attach_dataframe_binding(identifier, dataframe_binding) return identifier self._consume_operator("(") + if id_name == builtins.BUILTIN_DATAFRAME: + if template_args is not None: + raise ParserException( + f"Builtin '{id_name}' does not accept template arguments." + ) + return self.parse_dataframe_constructor() + if id_name == builtins.BUILTIN_CAST: if template_args is not None: raise ParserException( @@ -399,6 +435,21 @@ def parse_subscript_expr(self, base: astx.AST) -> astx.AST: self._consume_operator("]") + dataframe_base = self._coerce_dataframe_base(base) + if dataframe_base is not None: + if len(indices) != 1 or not isinstance( + indices[0], + astx.LiteralString, + ): + raise ParserException( + "DataFrame string-key column access expects exactly one " + "string literal key." 
+ ) + return self._parse_dataframe_string_access( + dataframe_base, + indices[0].value, + ) + tensor_base = self._coerce_tensor_base(base) if tensor_base is None: if len(indices) != 1: @@ -415,6 +466,172 @@ def parse_subscript_expr(self, base: astx.AST) -> astx.AST: self._validate_tensor_indices(tensor_base, indices) return astx.TensorIndex(tensor_base, indices) + def parse_dataframe_constructor(self) -> astx.DataFrameLiteral: + """ + title: Parse a builtin DataFrame constructor expression. + returns: + type: astx.DataFrameLiteral + """ + self._skip_inline_indents() + self._consume_operator("{") + self._skip_inline_indents() + + columns: list[astx.DataFrameLiteralColumn] = [] + seen: set[str] = set() + if self._is_operator("}"): + raise ParserException( + "DataFrame constructor requires at least one column." + ) + + while True: + if self.tokens.cur_tok.kind != TokenKind.identifier: + raise ParserException( + "DataFrame constructor column names must be identifiers." + ) + column_name = cast(str, self.tokens.cur_tok.value) + if column_name in seen: + raise ParserException( + f"duplicate dataframe column '{column_name}'" + ) + seen.add(column_name) + self.tokens.get_next_token() + + self._consume_operator(":") + value = self.parse_expression() + if not isinstance(value, astx.LiteralList): + raise ParserException( + "DataFrame constructor column values must be list " + "literals." + ) + columns.append( + astx.DataFrameLiteralColumn( + column_name, + tuple(value.elements), + ) + ) + self._skip_inline_indents() + + if self._is_operator("}"): + break + + self._consume_operator(",") + self._skip_inline_indents() + if self._is_operator("}"): + break + + self._consume_operator("}") + self._skip_inline_indents() + self._consume_operator(")") + return astx.DataFrameLiteral(columns) + + def _skip_inline_indents(self) -> None: + """ + title: Skip indentation markers inside grouped expressions. 
+ """ + while self.tokens.cur_tok.kind == TokenKind.indent: + self.tokens.get_next_token() + + def _is_dataframe_expr(self, expr: astx.AST) -> bool: + """ + title: Return whether one expression is known as a DataFrame. + parameters: + expr: + type: astx.AST + returns: + type: bool + """ + if isinstance(expr, astx.DataFrameLiteral): + return True + if isinstance(expr, astx.Identifier): + return self._is_dataframe_name(expr.name) + return False + + def _coerce_dataframe_base(self, base: astx.AST) -> astx.AST | None: + """ + title: Return one DataFrame-aware access base when available. + parameters: + base: + type: astx.AST + returns: + type: astx.AST | None + """ + if isinstance(base, astx.DataFrameLiteral): + return base + if not isinstance(base, astx.Identifier): + return None + binding = self._lookup_dataframe_binding(base.name) + if binding is not None: + attach_dataframe_binding(base, binding) + return base + if self._is_dataframe_name(base.name): + raise ParserException( + "Runtime-schema dataframe column access is not supported yet." + ) + return None + + def _parse_dataframe_field_access( + self, + base: astx.AST, + column_name: str, + ) -> astx.DataFrameColumnAccess: + """ + title: Build and validate one DataFrame field-style column access. + parameters: + base: + type: astx.AST + column_name: + type: str + returns: + type: astx.DataFrameColumnAccess + """ + dataframe_base = self._coerce_dataframe_base(base) + if dataframe_base is None: + raise ParserException("Expected a DataFrame value.") + if isinstance(base, astx.Identifier): + binding = self._lookup_dataframe_binding(base.name) + if ( + binding is not None + and dataframe_column_type( + binding, + column_name, + ) + is None + ): + raise ParserException( + f"DataFrame has no column '{column_name}'." 
+ ) + return astx.DataFrameColumnAccess(dataframe_base, column_name) + + def _parse_dataframe_string_access( + self, + base: astx.AST, + column_name: str, + ) -> astx.DataFrameStringColumnAccess: + """ + title: Build and validate one DataFrame string-key column access. + parameters: + base: + type: astx.AST + column_name: + type: str + returns: + type: astx.DataFrameStringColumnAccess + """ + if isinstance(base, astx.Identifier): + binding = self._lookup_dataframe_binding(base.name) + if ( + binding is not None + and dataframe_column_type( + binding, + column_name, + ) + is None + ): + raise ParserException( + f"DataFrame has no column '{column_name}'." + ) + return astx.DataFrameStringColumnAccess(base, column_name) + def _coerce_tensor_base(self, base: astx.AST) -> astx.AST | None: """ title: Return one tensor-aware indexing base when available. diff --git a/packages/arx/src/arx/parser/types.py b/packages/arx/src/arx/parser/types.py index 2c794bc..3fef101 100644 --- a/packages/arx/src/arx/parser/types.py +++ b/packages/arx/src/arx/parser/types.py @@ -10,6 +10,12 @@ import astx +from arx.dataframe import ( + dataframe_type, + is_dataframe_type, + runtime_dataframe_type, + series_type, +) from arx.exceptions import ParserException from arx.lexer import TokenKind from arx.parser.base import ParserMixinBase @@ -34,9 +40,7 @@ def _consume_runtime_shape_marker(self) -> None: """ for _ in range(3): if not self._is_operator("."): - raise ParserException( - "Runtime-shaped tensor marker must be '...'." - ) + raise ParserException("Runtime-layout marker must be '...'.") self._consume_operator(".") def _ensure_runtime_layout_allowed( @@ -55,7 +59,7 @@ def _ensure_runtime_layout_allowed( if type_context.allows_runtime_layout: return raise ParserException( - f"Runtime-shaped {type_name} types using '...' are only " + f"Runtime-layout {type_name} types using '...' are only " "supported in function parameter annotations." 
) @@ -108,6 +112,15 @@ def _default_value_for_type(self, data_type: astx.DataType) -> astx.Expr: return default_value(data_type) except ValueError as err: raise ParserException(str(err)) from err + if is_dataframe_type(data_type): + raise ParserException( + "Parser: DataFrame declarations require an explicit " + "initializer." + ) + if isinstance(data_type, astx.SeriesType): + raise ParserException( + "Parser: Series declarations require an explicit initializer." + ) raise ParserException( f"Parser: No default value defined for type " @@ -223,6 +236,60 @@ def parse_type( type_ = tensor_type(elem_type, tuple(shape)) except ValueError as err: raise ParserException(str(err)) from err + elif type_name == "series": + self.tokens.get_next_token() # eat series + self._consume_operator("[") + elem_type = self.parse_type( + allow_template_vars=allow_template_vars, + allow_union=allow_union, + type_context=TypeUseContext.NESTED, + ) + if self._is_operator(","): + raise ParserException( + "Series types accept exactly one element type." + ) + self._consume_operator("]") + try: + type_ = series_type(elem_type) + except ValueError as err: + raise ParserException(str(err)) from err + elif type_name == "dataframe": + self.tokens.get_next_token() # eat dataframe + self._consume_operator("[") + if self._is_operator("."): + self._consume_runtime_shape_marker() + self._consume_operator("]") + self._ensure_runtime_layout_allowed( + "dataframe", + type_context, + ) + type_ = runtime_dataframe_type() + else: + columns: list[astx.DataFrameColumn] = [] + while True: + if self.tokens.cur_tok.kind != TokenKind.identifier: + raise ParserException( + "DataFrame column names must be identifiers." 
+ ) + column_name = cast(str, self.tokens.cur_tok.value) + self.tokens.get_next_token() + self._consume_operator(":") + column_type = self.parse_type( + allow_template_vars=allow_template_vars, + allow_union=allow_union, + type_context=TypeUseContext.NESTED, + ) + columns.append( + astx.DataFrameColumn(column_name, column_type) + ) + if not self._is_operator(","): + break + self._consume_operator(",") + self._consume_operator("]") + try: + type_ = dataframe_type(tuple(columns)) + except ValueError as err: + raise ParserException(str(err)) from err else: type_map: dict[str, astx.DataType] = { "i8": astx.Int8(), diff --git a/packages/arx/tests/python/test_dataframe.py b/packages/arx/tests/python/test_dataframe.py new file mode 100644 index 0000000..f4eb0f8 --- /dev/null +++ b/packages/arx/tests/python/test_dataframe.py @@ -0,0 +1,142 @@ +""" +title: Tests for Arx dataframe helpers and parser paths. +""" + +from __future__ import annotations + +from textwrap import dedent + +import astx +import pytest + +from arx.exceptions import ParserException +from arx.io import ArxIO +from arx.lexer import Lexer +from arx.parser import Parser + + +def _parse_module(code: str) -> astx.Module: + """ + title: Parse one Arx module snippet. + parameters: + code: + type: str + returns: + type: astx.Module + """ + ArxIO.string_to_buffer(dedent(code).lstrip()) + return Parser().parse(Lexer().lex()) + + +def test_parse_dataframe_type_constructor_and_column_access() -> None: + """ + title: Parse DataFrame types, constructor calls, and both access styles. 
+ """ + tree = _parse_module( + """ + fn main() -> i32: + var rows: dataframe[id: i32, score: f64] = dataframe({ + id: [1, 2, 3], + score: [0.5, 0.8, 1.0], + }) + var score: series[f64] = rows.score + var ids: series[i32] = rows["id"] + return cast(rows.nrows(), i32) + """ + ) + + function = tree.nodes[0] + assert isinstance(function, astx.FunctionDef) + rows = function.body.nodes[0] + assert isinstance(rows, astx.VariableDeclaration) + assert isinstance(rows.type_, astx.DataFrameType) + assert [column.name for column in rows.type_.columns or ()] == [ + "id", + "score", + ] + assert isinstance(rows.value, astx.DataFrameLiteral) + + score = function.body.nodes[1] + assert isinstance(score, astx.VariableDeclaration) + assert isinstance(score.type_, astx.SeriesType) + assert isinstance(score.value, astx.DataFrameColumnAccess) + assert score.value.column_name == "score" + + ids = function.body.nodes[2] + assert isinstance(ids, astx.VariableDeclaration) + assert isinstance(ids.type_, astx.SeriesType) + assert isinstance(ids.value, astx.DataFrameStringColumnAccess) + assert ids.value.column_name == "id" + + +def test_parse_runtime_schema_dataframe_only_for_parameters() -> None: + """ + title: Runtime-schema dataframe annotations are parameter-only for now. 
+ """ + tree = _parse_module( + """ + extern sink(rows: dataframe[...]) -> none + + fn accept(rows: dataframe[...]) -> i32: + return cast(rows.nrows(), i32) + """ + ) + + extern = tree.nodes[0] + assert isinstance(extern, astx.FunctionPrototype) + extern_type = extern.args.nodes[0].type_ + assert isinstance(extern_type, astx.DataFrameType) + assert extern_type.columns is None + + function = tree.nodes[1] + assert isinstance(function, astx.FunctionDef) + arg_type = function.prototype.args.nodes[0].type_ + assert isinstance(arg_type, astx.DataFrameType) + assert arg_type.columns is None + + with pytest.raises(ParserException, match="function parameter"): + _parse_module( + """ + fn bad() -> none: + var rows: dataframe[...] = dataframe({id: [1]}) + return none + """ + ) + + +def test_dataframe_constructor_requires_declared_columns() -> None: + """ + title: DataFrame literals are checked against the declared static schema. + """ + with pytest.raises(ParserException, match="missing column 'score'"): + _parse_module( + """ + fn bad() -> none: + var rows: dataframe[id: i32, score: f64] = dataframe({ + id: [1, 2, 3], + }) + return none + """ + ) + + +def test_dataframe_mvp_rejects_string_columns() -> None: + """ + title: MVP DataFrame and Series types reject non fixed-width columns. 
+ """ + with pytest.raises(ParserException, match="fixed-width numeric and bool"): + _parse_module( + """ + fn bad() -> none: + var rows: dataframe[name: str] = dataframe({name: ["Ada"]}) + return none + """ + ) + + with pytest.raises(ParserException, match="fixed-width numeric and bool"): + _parse_module( + """ + fn bad(value: series[str]) -> none: + return none + """ + ) diff --git a/packages/astx/src/astx/__init__.py b/packages/astx/src/astx/__init__.py index 4128b6c..5bd0ed7 100644 --- a/packages/astx/src/astx/__init__.py +++ b/packages/astx/src/astx/__init__.py @@ -128,6 +128,21 @@ Variable, VariableDeclaration, ) +from astx.dataframe import ( + DataFrameColumn, + DataFrameColumnAccess, + DataFrameColumnCount, + DataFrameLiteral, + DataFrameLiteralColumn, + DataFrameRelease, + DataFrameRetain, + DataFrameRowCount, + DataFrameStringColumnAccess, + DataFrameType, + SeriesRelease, + SeriesRetain, + SeriesType, +) from astx.exceptions import ( CatchHandlerStmt, ExceptionHandlerStmt, @@ -389,6 +404,16 @@ def get_version() -> str: "Comprehension", "ComprehensionClause", "ContinueStmt", + "DataFrameColumn", + "DataFrameColumnAccess", + "DataFrameColumnCount", + "DataFrameLiteral", + "DataFrameLiteralColumn", + "DataFrameRelease", + "DataFrameRetain", + "DataFrameRowCount", + "DataFrameStringColumnAccess", + "DataFrameType", "DataType", "DataTypeOps", "Date", @@ -504,6 +529,9 @@ def get_version() -> str: "PrintExpr", "Program", "ScopeKind", + "SeriesRelease", + "SeriesRetain", + "SeriesType", "SetComprehension", "SetType", "SignedInteger", diff --git a/packages/astx/src/astx/dataframe.py b/packages/astx/src/astx/dataframe.py new file mode 100644 index 0000000..494317b --- /dev/null +++ b/packages/astx/src/astx/dataframe.py @@ -0,0 +1,517 @@ +""" +title: ASTx DataFrame AST nodes. +summary: >- + Provide internal nodes for Arrow C++ backed DataFrame and Series runtime + values. 
+""" + +from __future__ import annotations + +from collections.abc import Sequence +from dataclasses import dataclass +from typing import cast + +import astx + +from astx.tools.typing import typechecked +from astx.types import AnyType + + +@typechecked +@dataclass(frozen=True) +class DataFrameColumn: + """ + title: Static DataFrame column schema entry. + attributes: + name: + type: str + type_: + type: astx.DataType + nullable: + type: bool + """ + + name: str + type_: astx.DataType + nullable: bool = False + + def get_struct(self, simplified: bool = False) -> dict[str, object]: + """ + title: Return the structured representation of the column schema. + parameters: + simplified: + type: bool + returns: + type: dict[str, object] + """ + return { + "name": self.name, + "type": self.type_.get_struct(simplified), + "nullable": self.nullable, + } + + +@typechecked +@dataclass(frozen=True) +class DataFrameLiteralColumn: + """ + title: One DataFrame literal column payload. + attributes: + name: + type: str + values: + type: tuple[astx.AST, Ellipsis] + """ + + name: str + values: tuple[astx.AST, ...] + + def get_struct(self, simplified: bool = False) -> dict[str, object]: + """ + title: Return the structured representation of the literal column. + parameters: + simplified: + type: bool + returns: + type: dict[str, object] + """ + return { + "name": self.name, + "values": [value.get_struct(simplified) for value in self.values], + } + + +@typechecked +class SeriesType(AnyType): + """ + title: Internal Series semantic type. + summary: >- + Represent a one-dimensional typed DataFrame column backed by Arrow + ChunkedArray storage. + attributes: + element_type: + type: astx.DataType | None + nullable: + type: bool + """ + + element_type: astx.DataType | None + nullable: bool + + def __init__( + self, + element_type: astx.DataType | None = None, + *, + nullable: bool = False, + ) -> None: + """ + title: Initialize one Series type. 
+ parameters: + element_type: + type: astx.DataType | None + nullable: + type: bool + """ + super().__init__() + self.element_type = element_type + self.nullable = nullable + + def __str__(self) -> str: + """ + title: Render the Series type. + returns: + type: str + """ + if self.element_type is None: + return "SeriesType" + return f"SeriesType[{self.element_type}]" + + +@typechecked +class DataFrameType(AnyType): + """ + title: Internal DataFrame semantic type. + summary: >- + Represent a heterogeneous named-column table backed by Arrow Table + storage. + attributes: + columns: + type: tuple[DataFrameColumn, Ellipsis] | None + """ + + columns: tuple[DataFrameColumn, ...] | None + + def __init__( + self, + columns: Sequence[DataFrameColumn] | None = None, + ) -> None: + """ + title: Initialize one DataFrame type. + parameters: + columns: + type: Sequence[DataFrameColumn] | None + """ + super().__init__() + self.columns = None if columns is None else tuple(columns) + + def __str__(self) -> str: + """ + title: Render the DataFrame type. + returns: + type: str + """ + if self.columns is None: + return "DataFrameType" + columns = ", ".join( + f"{column.name}: {column.type_}" for column in self.columns + ) + return f"DataFrameType[{columns}]" + + +@typechecked +class DataFrameLiteral(astx.base.DataType): + """ + title: Internal Arrow C++ backed DataFrame literal node. + summary: Build one Arrow C++ table from named column values. + attributes: + columns: + type: tuple[DataFrameLiteralColumn, Ellipsis] + type_: + type: DataFrameType + """ + + columns: tuple[DataFrameLiteralColumn, ...] + type_: DataFrameType + + def __init__( + self, + columns: Sequence[DataFrameLiteralColumn], + *, + type_: DataFrameType | None = None, + ) -> None: + """ + title: Initialize one DataFrame literal. 
+ parameters: + columns: + type: Sequence[DataFrameLiteralColumn] + type_: + type: DataFrameType | None + """ + super().__init__() + self.columns = tuple(columns) + self.type_ = type_ or DataFrameType() + + def get_struct(self, simplified: bool = False) -> astx.base.ReprStruct: + """ + title: Return the structured representation of the DataFrame literal. + parameters: + simplified: + type: bool + returns: + type: astx.base.ReprStruct + """ + value = { + "columns": [ + column.get_struct(simplified) for column in self.columns + ], + "type": ( + None + if self.type_.columns is None + else [ + column.get_struct(simplified) + for column in self.type_.columns + ] + ), + } + return self._prepare_struct( + "DataFrameLiteral", + cast(astx.base.ReprStruct, value), + simplified, + ) + + +@typechecked +class DataFrameColumnAccess(astx.base.DataType): + """ + title: Internal DataFrame column access by static column name. + attributes: + base: + type: astx.AST + column_name: + type: str + type_: + type: SeriesType + """ + + base: astx.AST + column_name: str + type_: SeriesType + + def __init__(self, base: astx.AST, column_name: str) -> None: + """ + title: Initialize one DataFrame column access. + parameters: + base: + type: astx.AST + column_name: + type: str + """ + super().__init__() + self.base = base + self.column_name = column_name + self.type_ = SeriesType() + + def get_struct(self, simplified: bool = False) -> astx.base.ReprStruct: + """ + title: Return the structured representation of the column access. + parameters: + simplified: + type: bool + returns: + type: astx.base.ReprStruct + """ + value = { + "base": self.base.get_struct(simplified), + "column_name": self.column_name, + } + return self._prepare_struct( + "DataFrameColumnAccess", + cast(astx.base.ReprStruct, value), + simplified, + ) + + +@typechecked +class DataFrameStringColumnAccess(DataFrameColumnAccess): + """ + title: Internal DataFrame column access by string key. 
+ attributes: + base: + type: astx.AST + column_name: + type: str + type_: + type: SeriesType + """ + + def get_struct(self, simplified: bool = False) -> astx.base.ReprStruct: + """ + title: Return the structured representation of the string access. + parameters: + simplified: + type: bool + returns: + type: astx.base.ReprStruct + """ + value = { + "base": self.base.get_struct(simplified), + "column_name": self.column_name, + } + return self._prepare_struct( + "DataFrameStringColumnAccess", + cast(astx.base.ReprStruct, value), + simplified, + ) + + +@typechecked +class DataFrameRowCount(astx.base.DataType): + """ + title: Internal DataFrame row-count query. + attributes: + base: + type: astx.AST + type_: + type: astx.Int64 + """ + + base: astx.AST + type_: astx.Int64 + + def __init__(self, base: astx.AST) -> None: + """ + title: Initialize one DataFrame row-count query. + parameters: + base: + type: astx.AST + """ + super().__init__() + self.base = base + self.type_ = astx.Int64() + + def get_struct(self, simplified: bool = False) -> astx.base.ReprStruct: + """ + title: Return the structured representation of the row-count query. + parameters: + simplified: + type: bool + returns: + type: astx.base.ReprStruct + """ + return self._prepare_struct( + "DataFrameRowCount", + self.base.get_struct(simplified), + simplified, + ) + + +@typechecked +class DataFrameColumnCount(astx.base.DataType): + """ + title: Internal DataFrame column-count query. + attributes: + base: + type: astx.AST + type_: + type: astx.Int64 + """ + + base: astx.AST + type_: astx.Int64 + + def __init__(self, base: astx.AST) -> None: + """ + title: Initialize one DataFrame column-count query. + parameters: + base: + type: astx.AST + """ + super().__init__() + self.base = base + self.type_ = astx.Int64() + + def get_struct(self, simplified: bool = False) -> astx.base.ReprStruct: + """ + title: Return the structured representation of the column-count query. 
+ parameters: + simplified: + type: bool + returns: + type: astx.base.ReprStruct + """ + return self._prepare_struct( + "DataFrameColumnCount", + self.base.get_struct(simplified), + simplified, + ) + + +@typechecked +class DataFrameRetain(astx.base.DataType): + """ + title: Internal explicit retain for DataFrame-backed storage. + attributes: + base: + type: astx.AST + type_: + type: astx.Int32 + """ + + base: astx.AST + type_: astx.Int32 + + def __init__(self, base: astx.AST) -> None: + """ + title: Initialize one DataFrame retain helper. + parameters: + base: + type: astx.AST + """ + super().__init__() + self.base = base + self.type_ = astx.Int32() + + +@typechecked +class DataFrameRelease(astx.base.DataType): + """ + title: Internal explicit release for DataFrame-backed storage. + attributes: + base: + type: astx.AST + type_: + type: astx.Int32 + """ + + base: astx.AST + type_: astx.Int32 + + def __init__(self, base: astx.AST) -> None: + """ + title: Initialize one DataFrame release helper. + parameters: + base: + type: astx.AST + """ + super().__init__() + self.base = base + self.type_ = astx.Int32() + + +@typechecked +class SeriesRetain(astx.base.DataType): + """ + title: Internal explicit retain for Series-backed storage. + attributes: + base: + type: astx.AST + type_: + type: astx.Int32 + """ + + base: astx.AST + type_: astx.Int32 + + def __init__(self, base: astx.AST) -> None: + """ + title: Initialize one Series retain helper. + parameters: + base: + type: astx.AST + """ + super().__init__() + self.base = base + self.type_ = astx.Int32() + + +@typechecked +class SeriesRelease(astx.base.DataType): + """ + title: Internal explicit release for Series-backed storage. + attributes: + base: + type: astx.AST + type_: + type: astx.Int32 + """ + + base: astx.AST + type_: astx.Int32 + + def __init__(self, base: astx.AST) -> None: + """ + title: Initialize one Series release helper. 
+ parameters: + base: + type: astx.AST + """ + super().__init__() + self.base = base + self.type_ = astx.Int32() + + +__all__ = [ + "DataFrameColumn", + "DataFrameColumnAccess", + "DataFrameColumnCount", + "DataFrameLiteral", + "DataFrameLiteralColumn", + "DataFrameRelease", + "DataFrameRetain", + "DataFrameRowCount", + "DataFrameStringColumnAccess", + "DataFrameType", + "SeriesRelease", + "SeriesRetain", + "SeriesType", +] diff --git a/packages/irx/src/irx/analysis/handlers/_expressions/__init__.py b/packages/irx/src/irx/analysis/handlers/_expressions/__init__.py index a635c2a..9ed79fc 100644 --- a/packages/irx/src/irx/analysis/handlers/_expressions/__init__.py +++ b/packages/irx/src/irx/analysis/handlers/_expressions/__init__.py @@ -13,6 +13,9 @@ from irx.analysis.handlers._expressions.classes import ( ExpressionClassVisitorMixin, ) +from irx.analysis.handlers._expressions.dataframes import ( + ExpressionDataFrameVisitorMixin, +) from irx.analysis.handlers._expressions.literals import ( ExpressionLiteralVisitorMixin, ) @@ -38,6 +41,7 @@ class ExpressionVisitorMixin( ExpressionModuleVisitorMixin, ExpressionMutationVisitorMixin, ExpressionClassVisitorMixin, + ExpressionDataFrameVisitorMixin, ExpressionOperatorVisitorMixin, ExpressionArrayVisitorMixin, ExpressionTensorBufferVisitorMixin, diff --git a/packages/irx/src/irx/analysis/handlers/_expressions/dataframes.py b/packages/irx/src/irx/analysis/handlers/_expressions/dataframes.py new file mode 100644 index 0000000..3437d8f --- /dev/null +++ b/packages/irx/src/irx/analysis/handlers/_expressions/dataframes.py @@ -0,0 +1,325 @@ +# mypy: disable-error-code=no-redef +# mypy: disable-error-code=untyped-decorator + +""" +title: Expression DataFrame visitors. +summary: >- + Handle DataFrame literals, column access, metadata queries, and lifetime + helper expressions. 
+""" + +from __future__ import annotations + +import astx + +from irx.analysis.handlers.base import ( + SemanticAnalyzerCore, + SemanticVisitorMixinBase, +) +from irx.analysis.validation import validate_assignment +from irx.builtins.collections.dataframe import ( + DATAFRAME_COLUMN_INDEX_EXTRA, + DATAFRAME_SCHEMA_EXTRA, + SERIES_ELEMENT_TYPE_EXTRA, + SERIES_NULLABLE_EXTRA, + DataFrameSchema, + dataframe_column_type_is_supported, + schema_from_type, +) +from irx.diagnostics import DiagnosticCodes +from irx.typecheck import typechecked + + +@typechecked +class ExpressionDataFrameVisitorMixin(SemanticVisitorMixinBase): + """ + title: Expression DataFrame visitors. + """ + + def _dataframe_schema( + self, + node: astx.AST, + ) -> DataFrameSchema | None: + """ + title: Return static DataFrame schema metadata for one expression. + parameters: + node: + type: astx.AST + returns: + type: DataFrameSchema | None + """ + semantic = getattr(node, "semantic", None) + extras = getattr(semantic, "extras", {}) + schema = extras.get(DATAFRAME_SCHEMA_EXTRA) + if isinstance(schema, DataFrameSchema): + return schema + + resolved_type = self._expr_type(node) + if isinstance(resolved_type, astx.DataFrameType): + return schema_from_type(resolved_type) + + return None + + def _set_dataframe_schema( + self, + node: astx.AST, + schema: DataFrameSchema | None, + ) -> None: + """ + title: Attach static DataFrame schema metadata when available. + parameters: + node: + type: astx.AST + schema: + type: DataFrameSchema | None + """ + if schema is not None: + self._semantic(node).extras[DATAFRAME_SCHEMA_EXTRA] = schema + + @SemanticAnalyzerCore.visit.dispatch + def visit(self, node: astx.DataFrameLiteral) -> None: + """ + title: Visit DataFrameLiteral nodes. 
+ parameters: + node: + type: astx.DataFrameLiteral + """ + schema = schema_from_type(node.type_) + if schema is None: + self.context.diagnostics.add( + "dataframe literals require an explicit static DataFrame type", + node=node, + code=DiagnosticCodes.SEMANTIC_TYPE_MISMATCH, + ) + else: + declared_names = {column.name for column in schema.columns} + literal_names = {column.name for column in node.columns} + missing = sorted(declared_names - literal_names) + extra = sorted(literal_names - declared_names) + if missing: + self.context.diagnostics.add( + "dataframe literal is missing columns: " + + ", ".join(missing), + node=node, + code=DiagnosticCodes.SEMANTIC_TYPE_MISMATCH, + ) + if extra: + self.context.diagnostics.add( + "dataframe literal has undeclared columns: " + + ", ".join(extra), + node=node, + code=DiagnosticCodes.SEMANTIC_TYPE_MISMATCH, + ) + for column in schema.columns: + if not dataframe_column_type_is_supported(column.type_): + self.context.diagnostics.add( + "dataframe columns currently support only " + "fixed-width numeric and bool types", + node=node, + code=DiagnosticCodes.SEMANTIC_TYPE_MISMATCH, + ) + + seen_names: set[str] = set() + row_count: int | None = None + for literal_column in node.columns: + if literal_column.name in seen_names: + self.context.diagnostics.add( + f"duplicate dataframe column '{literal_column.name}'", + node=node, + code=DiagnosticCodes.SEMANTIC_TYPE_MISMATCH, + ) + seen_names.add(literal_column.name) + + if row_count is None: + row_count = len(literal_column.values) + elif len(literal_column.values) != row_count: + self.context.diagnostics.add( + "dataframe literal columns must have the same length", + node=node, + code=DiagnosticCodes.SEMANTIC_TYPE_MISMATCH, + ) + + declared_column = ( + None if schema is None else schema.column(literal_column.name) + ) + for value in literal_column.values: + self.visit(value) + if declared_column is not None: + validate_assignment( + self.context.diagnostics, + target_name=( + 
f"dataframe column '{literal_column.name}'" + ), + target_type=declared_column.type_, + value_type=self._expr_type(value), + node=value, + ) + + self._set_dataframe_schema(node, schema) + self._set_type(node, node.type_) + + @SemanticAnalyzerCore.visit.dispatch + def visit(self, node: astx.DataFrameColumnAccess) -> None: + """ + title: Visit DataFrameColumnAccess nodes. + parameters: + node: + type: astx.DataFrameColumnAccess + """ + self.visit(node.base) + base_type = self._expr_type(node.base) + if not isinstance(base_type, astx.DataFrameType): + self.context.diagnostics.add( + "dataframe column access requires a DataFrame value", + node=node, + code=DiagnosticCodes.SEMANTIC_INVALID_FIELD_ACCESS, + ) + self._set_type(node, None) + return + + schema = self._dataframe_schema(node.base) + column = None if schema is None else schema.column(node.column_name) + if column is None: + self.context.diagnostics.add( + f"dataframe has no column '{node.column_name}'", + node=node, + code=DiagnosticCodes.SEMANTIC_INVALID_FIELD_ACCESS, + ) + self._set_type(node, None) + return + + node.type_ = astx.SeriesType( + column.type_, + nullable=column.nullable, + ) + self._semantic(node).extras[DATAFRAME_COLUMN_INDEX_EXTRA] = ( + column.index + ) + self._semantic(node).extras[SERIES_ELEMENT_TYPE_EXTRA] = column.type_ + self._semantic(node).extras[SERIES_NULLABLE_EXTRA] = column.nullable + self._set_type(node, node.type_) + + @SemanticAnalyzerCore.visit.dispatch + def visit(self, node: astx.DataFrameStringColumnAccess) -> None: + """ + title: Visit DataFrameStringColumnAccess nodes. 
+        parameters:
+          node:
+            type: astx.DataFrameStringColumnAccess
+        """
+        self.visit(
+            cast_column := astx.DataFrameColumnAccess(
+                node.base,
+                node.column_name,
+            )
+        )
+        semantic = getattr(cast_column, "semantic", None)
+        if semantic is not None:
+            self._semantic(node).extras.update(semantic.extras)
+        node.type_ = cast_column.type_
+        self._set_type(node, self._expr_type(cast_column))
+
+    @SemanticAnalyzerCore.visit.dispatch
+    def visit(self, node: astx.DataFrameRowCount) -> None:
+        """
+        title: Check the base is a DataFrame and type nrows as Int64.
+        parameters:
+          node:
+            type: astx.DataFrameRowCount
+        """
+        self.visit(node.base)
+        if not isinstance(self._expr_type(node.base), astx.DataFrameType):
+            self.context.diagnostics.add(
+                "dataframe nrows requires a DataFrame value",
+                node=node,
+                code=DiagnosticCodes.SEMANTIC_TYPE_MISMATCH,
+            )
+        self._set_type(node, astx.Int64())  # set even after a diagnostic
+
+    @SemanticAnalyzerCore.visit.dispatch
+    def visit(self, node: astx.DataFrameColumnCount) -> None:
+        """
+        title: Check the base is a DataFrame and type ncols as Int64.
+        parameters:
+          node:
+            type: astx.DataFrameColumnCount
+        """
+        self.visit(node.base)
+        if not isinstance(self._expr_type(node.base), astx.DataFrameType):
+            self.context.diagnostics.add(
+                "dataframe ncols requires a DataFrame value",
+                node=node,
+                code=DiagnosticCodes.SEMANTIC_TYPE_MISMATCH,
+            )
+        self._set_type(node, astx.Int64())  # set even after a diagnostic
+
+    @SemanticAnalyzerCore.visit.dispatch
+    def visit(self, node: astx.DataFrameRetain) -> None:
+        """
+        title: Check the base is a DataFrame; retain is typed as Int32.
+        parameters:
+          node:
+            type: astx.DataFrameRetain
+        """
+        self.visit(node.base)
+        if not isinstance(self._expr_type(node.base), astx.DataFrameType):
+            self.context.diagnostics.add(
+                "dataframe retain requires a DataFrame value",
+                node=node,
+                code=DiagnosticCodes.SEMANTIC_TYPE_MISMATCH,
+            )
+        self._set_type(node, astx.Int32())  # set even after a diagnostic
+
+    @SemanticAnalyzerCore.visit.dispatch
+    def visit(self, node: astx.DataFrameRelease) -> None:
+        """
+        title: Check the base is a DataFrame; release is typed as Int32.
+        parameters:
+          node:
+            type: astx.DataFrameRelease
+        """
+        self.visit(node.base)
+        if not isinstance(self._expr_type(node.base), astx.DataFrameType):
+            self.context.diagnostics.add(
+                "dataframe release requires a DataFrame value",
+                node=node,
+                code=DiagnosticCodes.SEMANTIC_TYPE_MISMATCH,
+            )
+        self._set_type(node, astx.Int32())  # set even after a diagnostic
+
+    @SemanticAnalyzerCore.visit.dispatch
+    def visit(self, node: astx.SeriesRetain) -> None:
+        """
+        title: Check the base is a Series; retain is typed as Int32.
+        parameters:
+          node:
+            type: astx.SeriesRetain
+        """
+        self.visit(node.base)
+        if not isinstance(self._expr_type(node.base), astx.SeriesType):
+            self.context.diagnostics.add(
+                "series retain requires a Series value",
+                node=node,
+                code=DiagnosticCodes.SEMANTIC_TYPE_MISMATCH,
+            )
+        self._set_type(node, astx.Int32())  # set even after a diagnostic
+
+    @SemanticAnalyzerCore.visit.dispatch
+    def visit(self, node: astx.SeriesRelease) -> None:
+        """
+        title: Check the base is a Series; release is typed as Int32.
+        parameters:
+          node:
+            type: astx.SeriesRelease
+        """
+        self.visit(node.base)
+        if not isinstance(self._expr_type(node.base), astx.SeriesType):
+            self.context.diagnostics.add(
+                "series release requires a Series value",
+                node=node,
+                code=DiagnosticCodes.SEMANTIC_TYPE_MISMATCH,
+            )
+        self._set_type(node, astx.Int32())  # set even after a diagnostic
+
+
+__all__ = ["ExpressionDataFrameVisitorMixin"]
diff --git a/packages/irx/src/irx/analysis/types.py b/packages/irx/src/irx/analysis/types.py
index e3a34f1..d5ad051 100644
--- a/packages/irx/src/irx/analysis/types.py
+++ b/packages/irx/src/irx/analysis/types.py
@@ -152,6 +152,26 @@ def clone_type(type_: astx.DataType) -> astx.DataType:
             else None
         )
         return astx.TensorType(element_type)
+    if isinstance(type_, astx.SeriesType):
+        element_type = (
+            clone_type(type_.element_type)
+            if type_.element_type is not None
+            else None
+        )
+        return astx.SeriesType(element_type, nullable=type_.nullable)
+    if isinstance(type_, astx.DataFrameType):
+        if type_.columns is None:
+            return astx.DataFrameType()
+        return astx.DataFrameType(
+            tuple(
+                astx.DataFrameColumn(
+                    column.name,
+                    clone_type(column.type_),
+                    nullable=column.nullable,
+                )
+                for column in type_.columns
+            )
+        )
     return type_.__class__()


@@ -230,6 +250,18 @@ def display_type_name(type_: astx.DataType | None) -> str:
         if type_.element_type is None:
             return "TensorType"
         return f"TensorType[{display_type_name(type_.element_type)}]"
+    if isinstance(type_, astx.SeriesType):
+        if type_.element_type is None:
+            return "SeriesType"
+        return f"SeriesType[{display_type_name(type_.element_type)}]"
+    if isinstance(type_, astx.DataFrameType):
+        if type_.columns is None:
+            return "DataFrameType"
+        columns = ", ".join(
+            f"{column.name}: {display_type_name(column.type_)}"
+            for column in type_.columns
+        )
+        return f"DataFrameType[{columns}]"
     return str(type_.__class__.__name__)


@@ -345,6 +377,29 @@ def same_type(lhs: astx.DataType | None, rhs: astx.DataType | None) -> bool:
         if lhs.element_type is None or rhs.element_type is None:
             return True
         return same_type(lhs.element_type, rhs.element_type)
+    if isinstance(lhs, astx.SeriesType) and isinstance(
+        rhs,
+        astx.SeriesType,
+    ):
+        if lhs.element_type is None or rhs.element_type is None:
+            return True
+        return lhs.nullable == rhs.nullable and same_type(
+            lhs.element_type, rhs.element_type
+        )
+    if isinstance(lhs, astx.DataFrameType) and isinstance(
+        rhs,
+        astx.DataFrameType,
+    ):
+        if lhs.columns is None or rhs.columns is None:
+            return True
+        if len(lhs.columns) != len(rhs.columns):
+            return False
+        return all(
+            left.name == right.name
+            and left.nullable == right.nullable
+            and same_type(left.type_, right.type_)
+            for left, right in zip(lhs.columns, rhs.columns, strict=True)
+        )
     return lhs.__class__ is rhs.__class__


diff --git a/packages/irx/src/irx/builder/backend.py b/packages/irx/src/irx/builder/backend.py
index 218983c..c6e5d07 100644
--- a/packages/irx/src/irx/builder/backend.py
+++ b/packages/irx/src/irx/builder/backend.py
@@ -23,6 +23,7 @@
     BufferVisitorMixin,
     CollectionVisitorMixin,
     ControlFlowVisitorMixin,
+    DataFrameVisitorMixin,
     FunctionVisitorMixin,
     GeneratorVisitorMixin,
     ListVisitorMixin,
@@ -51,6 +52,7 @@ class Visitor(
     GeneratorVisitorMixin,
     FunctionVisitorMixin,
     TemporalVisitorMixin,
+    DataFrameVisitorMixin,
     TensorVisitorMixin,
     ArrayVisitorMixin,
     BufferVisitorMixin,
diff --git a/packages/irx/src/irx/builder/core.py b/packages/irx/src/irx/builder/core.py
index c6eaaa6..dddcac7 100644
--- a/packages/irx/src/irx/builder/core.py
+++ b/packages/irx/src/irx/builder/core.py
@@ -830,6 +830,12 @@ def initialize(self) -> None:
             self._llvm.TENSOR_BUILDER_HANDLE_TYPE
         )
         self._llvm.ARROW_TENSOR_HANDLE_TYPE = self._llvm.TENSOR_HANDLE_TYPE
+        self._llvm.TABLE_HANDLE_TYPE = self._llvm.OPAQUE_POINTER_TYPE
+        self._llvm.CHUNKED_ARRAY_HANDLE_TYPE = self._llvm.OPAQUE_POINTER_TYPE
+        self._llvm.ARROW_TABLE_HANDLE_TYPE = self._llvm.TABLE_HANDLE_TYPE
+        self._llvm.ARROW_CHUNKED_ARRAY_HANDLE_TYPE = (
+            self._llvm.CHUNKED_ARRAY_HANDLE_TYPE
+        )
         self._llvm.TIME_TYPE = ir.LiteralStructType(
             [
                 self._llvm.INT32_TYPE,
@@ -1112,6 +1118,10 @@ def _llvm_type_for_ast_type(
             return self._llvm.BUFFER_VIEW_TYPE
         if isinstance(type_, astx.TensorType):
             return self._llvm.BUFFER_VIEW_TYPE
+        if isinstance(type_, astx.DataFrameType):
+            return self._llvm.TABLE_HANDLE_TYPE
+        if isinstance(type_, astx.SeriesType):
+            return self._llvm.CHUNKED_ARRAY_HANDLE_TYPE
         if isinstance(type_, astx.StructType):
             struct_key = type_.qualified_name
             if struct_key is None and type_.module_key is not None:
diff --git a/packages/irx/src/irx/builder/lowering/__init__.py b/packages/irx/src/irx/builder/lowering/__init__.py
index 9559eb5..1942b22 100644
--- a/packages/irx/src/irx/builder/lowering/__init__.py
+++ b/packages/irx/src/irx/builder/lowering/__init__.py
@@ -9,6 +9,7 @@
 from irx.builder.lowering.control_flow import (
     ControlFlowVisitorMixin,
 )
+from irx.builder.lowering.dataframe import DataFrameVisitorMixin
 from irx.builder.lowering.functions import FunctionVisitorMixin
 from
irx.builder.lowering.generators import GeneratorVisitorMixin
 from irx.builder.lowering.list import ListVisitorMixin
@@ -26,6 +27,7 @@
     "BufferVisitorMixin",
     "CollectionVisitorMixin",
     "ControlFlowVisitorMixin",
+    "DataFrameVisitorMixin",
     "FunctionVisitorMixin",
     "GeneratorVisitorMixin",
     "ListVisitorMixin",
diff --git a/packages/irx/src/irx/builder/lowering/dataframe.py b/packages/irx/src/irx/builder/lowering/dataframe.py
new file mode 100644
index 0000000..e0de9f4
--- /dev/null
+++ b/packages/irx/src/irx/builder/lowering/dataframe.py
@@ -0,0 +1,484 @@
+# mypy: disable-error-code=no-redef
+
+"""
+title: DataFrame visitor mixin for llvmliteir.
+"""
+
+from __future__ import annotations
+
+from typing import Any, cast
+
+import astx
+
+from llvmlite import ir
+
+from irx.analysis.types import is_float_type, is_unsigned_type
+from irx.builder.core import VisitorCore
+from irx.builder.protocols import VisitorMixinBase
+from irx.builder.runtime import safe_pop
+from irx.builder.types import is_int_type
+from irx.builtins.collections.dataframe import (
+    DATAFRAME_COLUMN_INDEX_EXTRA,
+    dataframe_type_id,
+)
+from irx.typecheck import typechecked
+
+
+@typechecked
+class DataFrameVisitorMixin(VisitorMixinBase):
+    """
+    title: Lower DataFrame and Series nodes to Arrow runtime calls.
+    """
+
+    def _append_dataframe_value(
+        self,
+        builder_handle: ir.Value,
+        value_node: astx.AST,
+        target_type: astx.DataType,
+    ) -> None:
+        """
+        title: Append one lowered scalar value to an Arrow array builder.
+        parameters:
+          builder_handle:
+            type: ir.Value
+          value_node:
+            type: astx.AST
+          target_type:
+            type: astx.DataType
+        """
+        self.visit_child(value_node)
+        value = safe_pop(self.result_stack)
+        if value is None:
+            raise Exception("dataframe column value lowering failed")
+        value = self._cast_ast_value(
+            value,
+            source_type=self._resolved_ast_type(value_node),
+            target_type=target_type,
+        )
+
+        if is_float_type(target_type):
+            append = self.require_runtime_symbol(
+                "array",
+                "irx_arrow_array_builder_append_double",
+            )
+            if value.type != self._llvm.DOUBLE_TYPE:
+                value = self._llvm.ir_builder.fpext(
+                    value,
+                    self._llvm.DOUBLE_TYPE,
+                    "dataframe_fpext",
+                )
+            self._llvm.ir_builder.call(append, [builder_handle, value])
+            return
+
+        # Non-float columns use the integer append call, which is fed an
+        # i64: unsigned and bool values zero-extend, the rest sign-extend.
+        append = self.require_runtime_symbol(
+            "array",
+            "irx_arrow_array_builder_append_int",
+        )
+        if not is_int_type(value.type):
+            raise Exception("dataframe column value must lower to a scalar")
+        if value.type.width < self._llvm.INT64_TYPE.width:
+            if is_unsigned_type(target_type) or isinstance(
+                target_type,
+                astx.Boolean,
+            ):
+                value = self._llvm.ir_builder.zext(
+                    value,
+                    self._llvm.INT64_TYPE,
+                    "dataframe_zext",
+                )
+            else:
+                value = self._llvm.ir_builder.sext(
+                    value,
+                    self._llvm.INT64_TYPE,
+                    "dataframe_sext",
+                )
+        elif value.type.width > self._llvm.INT64_TYPE.width:
+            value = self._llvm.ir_builder.trunc(
+                value,
+                self._llvm.INT64_TYPE,
+                "dataframe_trunc",
+            )
+        self._llvm.ir_builder.call(append, [builder_handle, value])
+
+    def _build_arrow_array_from_column(
+        self,
+        column_name: str,
+        column_type: astx.DataType,
+        values: tuple[astx.AST, ...],
+    ) -> ir.Value:
+        """
+        title: Build one Arrow array handle from column values.
+        parameters:
+          column_name:
+            type: str
+          column_type:
+            type: astx.DataType
+          values:
+            type: tuple[astx.AST, Ellipsis]
+        returns:
+          type: ir.Value
+        """
+        type_id = dataframe_type_id(column_type)
+        if type_id is None:
+            raise Exception("unsupported dataframe column type")
+
+        builder_new = self.require_runtime_symbol(
+            "array",
+            "irx_arrow_array_builder_new",
+        )
+        finish_builder = self.require_runtime_symbol(
+            "array",
+            "irx_arrow_array_builder_finish",
+        )
+
+        builder_slot = self._llvm.ir_builder.alloca(
+            self._llvm.ARRAY_BUILDER_HANDLE_TYPE,
+            name=f"{column_name}_array_builder_slot",
+        )
+        self._llvm.ir_builder.call(
+            builder_new,
+            [
+                ir.Constant(self._llvm.INT32_TYPE, type_id),
+                builder_slot,
+            ],
+        )
+        builder_handle = self._llvm.ir_builder.load(
+            builder_slot,
+            f"{column_name}_array_builder",
+        )
+
+        for value in values:
+            self._append_dataframe_value(builder_handle, value, column_type)
+
+        array_slot = self._llvm.ir_builder.alloca(
+            self._llvm.ARRAY_HANDLE_TYPE,
+            name=f"{column_name}_array_slot",
+        )
+        self._llvm.ir_builder.call(
+            finish_builder,
+            [builder_handle, array_slot],
+        )
+        return self._llvm.ir_builder.load(
+            array_slot,
+            f"{column_name}_array",
+        )
+
+    def _column_index(self, node: astx.DataFrameColumnAccess) -> int | None:
+        """
+        title: Return the statically resolved column index when available.
+        parameters:
+          node:
+            type: astx.DataFrameColumnAccess
+        returns:
+          type: int | None
+        """
+        semantic = getattr(node, "semantic", None)
+        extras = getattr(semantic, "extras", {})
+        index = extras.get(DATAFRAME_COLUMN_INDEX_EXTRA)
+        return index if isinstance(index, int) else None
+
+    def _lower_dataframe_column_access(
+        self,
+        node: astx.DataFrameColumnAccess,
+    ) -> None:
+        """
+        title: Lower one DataFrame column access node.
+        parameters:
+          node:
+            type: astx.DataFrameColumnAccess
+        """
+        self.visit_child(node.base)
+        table_handle = safe_pop(self.result_stack)
+        if table_handle is None:
+            raise Exception("dataframe column access requires a table")
+
+        column_slot = self._llvm.ir_builder.alloca(
+            self._llvm.CHUNKED_ARRAY_HANDLE_TYPE,
+            name="dataframe_column_slot",
+        )
+        index = self._column_index(node)
+        if index is not None:
+            column_by_index = self.require_runtime_symbol(
+                "dataframe",
+                "irx_arrow_table_column_by_index",
+            )
+            self._llvm.ir_builder.call(
+                column_by_index,
+                [
+                    table_handle,
+                    ir.Constant(self._llvm.INT32_TYPE, index),
+                    column_slot,
+                ],
+            )
+        else:
+            column_by_name = self.require_runtime_symbol(
+                "dataframe",
+                "irx_arrow_table_column_by_name",
+            )
+            name_pointer = cast(Any, self)._constant_c_string_pointer(
+                node.column_name,
+                name_hint=f"dataframe_column_{node.column_name}",
+            )
+            self._llvm.ir_builder.call(
+                column_by_name,
+                [table_handle, name_pointer, column_slot],
+            )
+
+        column_handle = self._llvm.ir_builder.load(
+            column_slot,
+            "dataframe_column",
+        )
+        self.result_stack.append(column_handle)
+
+    @VisitorCore.visit.dispatch
+    def visit(self, node: astx.DataFrameLiteral) -> None:
+        """
+        title: Lower a DataFrameLiteral into one Arrow table handle.
+        parameters:
+          node:
+            type: astx.DataFrameLiteral
+        """
+        if node.type_.columns is None:
+            raise Exception("dataframe literal lowering requires a schema")
+
+        literal_by_name = {column.name: column for column in node.columns}
+        release_array = self.require_runtime_symbol(
+            "array",
+            "irx_arrow_array_release",
+        )
+        table_new = self.require_runtime_symbol(
+            "dataframe",
+            "irx_arrow_table_new_from_arrays",
+        )
+
+        array_handles: list[ir.Value] = []
+        name_pointers: list[ir.Value] = []
+        for schema_column in node.type_.columns:
+            literal_column = literal_by_name.get(schema_column.name)
+            if literal_column is None:
+                raise Exception("dataframe literal is missing a column")
+            array_handles.append(
+                self._build_arrow_array_from_column(
+                    schema_column.name,
+                    schema_column.type_,
+                    literal_column.values,
+                )
+            )
+            name_pointers.append(
+                cast(Any, self)._constant_c_string_pointer(
+                    schema_column.name,
+                    name_hint=f"dataframe_column_{schema_column.name}",
+                )
+            )
+
+        column_count = len(array_handles)
+        names_array_type = ir.ArrayType(
+            self._llvm.ASCII_STRING_TYPE,
+            column_count,
+        )
+        arrays_array_type = ir.ArrayType(
+            self._llvm.ARRAY_HANDLE_TYPE,
+            column_count,
+        )
+        names_array = self._llvm.ir_builder.alloca(
+            names_array_type,
+            name="dataframe_names",
+        )
+        arrays_array = self._llvm.ir_builder.alloca(
+            arrays_array_type,
+            name="dataframe_arrays",
+        )
+
+        for index, (name_pointer, array_handle) in enumerate(
+            zip(name_pointers, array_handles, strict=True)
+        ):
+            indices = [
+                ir.Constant(self._llvm.INT32_TYPE, 0),
+                ir.Constant(self._llvm.INT32_TYPE, index),
+            ]
+            name_slot = self._llvm.ir_builder.gep(names_array, indices)
+            array_slot = self._llvm.ir_builder.gep(arrays_array, indices)
+            self._llvm.ir_builder.store(name_pointer, name_slot)
+            self._llvm.ir_builder.store(array_handle, array_slot)
+
+        names_ptr = self._llvm.ir_builder.gep(
+            names_array,
+            [
+                ir.Constant(self._llvm.INT32_TYPE, 0),
+                ir.Constant(self._llvm.INT32_TYPE, 0),
+            ],
+        )
+        arrays_ptr = self._llvm.ir_builder.gep(
+            arrays_array,
+            [
+                ir.Constant(self._llvm.INT32_TYPE, 0),
+                ir.Constant(self._llvm.INT32_TYPE, 0),
+            ],
+        )
+        table_slot = self._llvm.ir_builder.alloca(
+            self._llvm.TABLE_HANDLE_TYPE,
+            name="dataframe_table_slot",
+        )
+        self._llvm.ir_builder.call(
+            table_new,
+            [
+                ir.Constant(self._llvm.INT64_TYPE, column_count),
+                names_ptr,
+                arrays_ptr,
+                table_slot,
+            ],
+        )
+        table_handle = self._llvm.ir_builder.load(
+            table_slot,
+            "dataframe_table",
+        )
+
+        # The array handles were only needed to build the table; release
+        # them now that irx_arrow_table_new_from_arrays has been called.
+        for array_handle in array_handles:
+            self._llvm.ir_builder.call(release_array, [array_handle])
+
+        self.result_stack.append(table_handle)
+
+    @VisitorCore.visit.dispatch
+    def visit(self, node: astx.DataFrameColumnAccess) -> None:
+        """
+        title: Visit DataFrameColumnAccess nodes.
+        parameters:
+          node:
+            type: astx.DataFrameColumnAccess
+        """
+        self._lower_dataframe_column_access(node)
+
+    @VisitorCore.visit.dispatch
+    def visit(self, node: astx.DataFrameStringColumnAccess) -> None:
+        """
+        title: Lower DataFrameStringColumnAccess like a column access.
+        parameters:
+          node:
+            type: astx.DataFrameStringColumnAccess
+        """
+        self._lower_dataframe_column_access(node)
+
+    @VisitorCore.visit.dispatch
+    def visit(self, node: astx.DataFrameRowCount) -> None:
+        """
+        title: Lower DataFrameRowCount to irx_arrow_table_num_rows.
+        parameters:
+          node:
+            type: astx.DataFrameRowCount
+        """
+        self.visit_child(node.base)
+        table_handle = safe_pop(self.result_stack)
+        if table_handle is None:
+            raise Exception("dataframe nrows requires a table")
+        nrows = self.require_runtime_symbol(
+            "dataframe",
+            "irx_arrow_table_num_rows",
+        )
+        self.result_stack.append(
+            self._llvm.ir_builder.call(nrows, [table_handle], "nrows")
+        )
+
+    @VisitorCore.visit.dispatch
+    def visit(self, node: astx.DataFrameColumnCount) -> None:
+        """
+        title: Lower DataFrameColumnCount to irx_arrow_table_num_columns.
+        parameters:
+          node:
+            type: astx.DataFrameColumnCount
+        """
+        self.visit_child(node.base)
+        table_handle = safe_pop(self.result_stack)
+        if table_handle is None:
+            raise Exception("dataframe ncols requires a table")
+        ncols = self.require_runtime_symbol(
+            "dataframe",
+            "irx_arrow_table_num_columns",
+        )
+        self.result_stack.append(
+            self._llvm.ir_builder.call(ncols, [table_handle], "ncols")
+        )
+
+    @VisitorCore.visit.dispatch
+    def visit(self, node: astx.DataFrameRetain) -> None:
+        """
+        title: Lower DataFrameRetain to irx_arrow_table_retain.
+        parameters:
+          node:
+            type: astx.DataFrameRetain
+        """
+        self.visit_child(node.base)
+        table_handle = safe_pop(self.result_stack)
+        if table_handle is None:
+            raise Exception("dataframe retain requires a table")
+        retain = self.require_runtime_symbol(
+            "dataframe",
+            "irx_arrow_table_retain",
+        )
+        self.result_stack.append(
+            self._llvm.ir_builder.call(retain, [table_handle])
+        )
+
+    @VisitorCore.visit.dispatch
+    def visit(self, node: astx.DataFrameRelease) -> None:
+        """
+        title: Lower DataFrameRelease; pushes a constant i32 zero.
+        parameters:
+          node:
+            type: astx.DataFrameRelease
+        """
+        self.visit_child(node.base)
+        table_handle = safe_pop(self.result_stack)
+        if table_handle is None:
+            raise Exception("dataframe release requires a table")
+        release = self.require_runtime_symbol(
+            "dataframe",
+            "irx_arrow_table_release",
+        )
+        self._llvm.ir_builder.call(release, [table_handle])
+        self.result_stack.append(ir.Constant(self._llvm.INT32_TYPE, 0))
+
+    @VisitorCore.visit.dispatch
+    def visit(self, node: astx.SeriesRetain) -> None:
+        """
+        title: Lower SeriesRetain to irx_arrow_chunked_array_retain.
+        parameters:
+          node:
+            type: astx.SeriesRetain
+        """
+        self.visit_child(node.base)
+        column_handle = safe_pop(self.result_stack)
+        if column_handle is None:
+            raise Exception("series retain requires a column")
+        retain = self.require_runtime_symbol(
+            "dataframe",
+            "irx_arrow_chunked_array_retain",
+        )
+        self.result_stack.append(
+            self._llvm.ir_builder.call(retain, [column_handle])
+        )
+
+    @VisitorCore.visit.dispatch
+    def visit(self, node: astx.SeriesRelease) -> None:
+        """
+        title: Lower SeriesRelease; pushes a constant i32 zero.
+        parameters:
+          node:
+            type: astx.SeriesRelease
+        """
+        self.visit_child(node.base)
+        column_handle = safe_pop(self.result_stack)
+        if column_handle is None:
+            raise Exception("series release requires a column")
+        release = self.require_runtime_symbol(
+            "dataframe",
+            "irx_arrow_chunked_array_release",
+        )
+        self._llvm.ir_builder.call(release, [column_handle])
+        self.result_stack.append(ir.Constant(self._llvm.INT32_TYPE, 0))
+
+
+__all__ = ["DataFrameVisitorMixin"]
diff --git a/packages/irx/src/irx/builder/lowering/literals.py b/packages/irx/src/irx/builder/lowering/literals.py
index 5403249..793de49 100644
--- a/packages/irx/src/irx/builder/lowering/literals.py
+++ b/packages/irx/src/irx/builder/lowering/literals.py
@@ -123,6 +123,8 @@ def _default_runtime_initializer(
                 astx.StructType,
                 astx.BufferViewType,
                 astx.TensorType,
+                astx.DataFrameType,
+                astx.SeriesType,
                 astx.PointerType,
                 astx.OpaqueHandleType,
                 astx.BufferOwnerType,
diff --git a/packages/irx/src/irx/builder/lowering/variables.py b/packages/irx/src/irx/builder/lowering/variables.py
index 03a4248..ee386ba 100644
--- a/packages/irx/src/irx/builder/lowering/variables.py
+++ b/packages/irx/src/irx/builder/lowering/variables.py
@@ -263,6 +263,13 @@ def visit(self, node: astx.VariableDeclaration) -> None:
                 if existing_storage is not None
                 else self.create_entry_block_alloca(node.name, llvm_type)
             )
+        elif isinstance(node.type_, astx.DataFrameType | astx.SeriesType):
+            init_val = ir.Constant(llvm_type, None)
+            alloca = (
+                existing_storage
+                if existing_storage is not None
+                else self.create_entry_block_alloca(node.name, llvm_type)
+            )
         elif "float" in type_str:
             init_val = ir.Constant(self._llvm.get_data_type(type_str), 0.0)
             alloca = (
@@ -325,6 +332,8 @@ def visit(self, node: astx.InlineVariableDeclaration) -> None:
             init_val = ir.Constant(llvm_type, None)
         elif isinstance(node.type_, astx.GeneratorType):
             init_val = ir.Constant(llvm_type, None)
+        elif isinstance(node.type_, astx.DataFrameType | astx.SeriesType):
+            init_val = ir.Constant(llvm_type, None)
         elif "float" in type_str:
             init_val =
ir.Constant(self._llvm.get_data_type(type_str), 0.0)
         else:
diff --git a/packages/irx/src/irx/builder/runtime/arrow/native/irx_arrow_runtime.cc b/packages/irx/src/irx/builder/runtime/arrow/native/irx_arrow_runtime.cc
index 9dd8061..ddc324c 100644
--- a/packages/irx/src/irx/builder/runtime/arrow/native/irx_arrow_runtime.cc
+++ b/packages/irx/src/irx/builder/runtime/arrow/native/irx_arrow_runtime.cc
@@ -238,6 +238,16 @@ struct irx_arrow_tensor_handle {
   int64_t element_size_bytes = 0;
 };

+struct irx_arrow_table_handle {
+  int64_t refcount = 0;
+  std::shared_ptr<arrow::Table> table;
+};
+
+struct irx_arrow_chunked_array_handle {
+  int64_t refcount = 0;
+  std::shared_ptr<arrow::ChunkedArray> column;
+};
+
 namespace {

 void clear_error() { last_error[0] = '\0'; }
@@ -1628,6 +1638,199 @@ void irx_arrow_tensor_release(irx_arrow_tensor_handle* tensor) {
   }
 }

+int irx_arrow_table_new_from_arrays(
+    int64_t column_count,
+    const char** names,
+    irx_arrow_array_handle** arrays,
+    irx_arrow_table_handle** out_table) {
+  clear_error();
+  try {
+    if (out_table == nullptr) {
+      return set_error(EINVAL, "out_table must not be NULL");
+    }
+    *out_table = nullptr;  // stays NULL on every error return below
+    if (column_count < 0) {
+      return set_error(EINVAL, "column_count must be non-negative");
+    }
+    if (column_count > 0 && names == nullptr) {
+      return set_error(EINVAL, "names must not be NULL");
+    }
+    if (column_count > 0 && arrays == nullptr) {
+      return set_error(EINVAL, "arrays must not be NULL");
+    }
+
+    std::vector<std::shared_ptr<arrow::Field>> fields;
+    std::vector<std::shared_ptr<arrow::ChunkedArray>> columns;
+    fields.reserve(static_cast<size_t>(column_count));
+    columns.reserve(static_cast<size_t>(column_count));
+
+    int64_t row_count = -1;  // -1 until the first column fixes the length
+    for (int64_t index = 0; index < column_count; ++index) {
+      if (names[index] == nullptr) {
+        return set_error(EINVAL, "column name must not be NULL");
+      }
+      irx_arrow_array_handle* array = arrays[index];
+      if (array == nullptr || !array->array) {
+        return set_error(EINVAL, "array handle must not be NULL");
+      }
+      const int64_t length = array->array->length();
+      if (row_count < 0) {
+        row_count = length;
+      } else if (length != row_count) {
+        return set_error(EINVAL, "dataframe columns must have equal length");
+      }
+
+      fields.push_back(arrow::field(
+          std::string(names[index]),
+          array->array->type(),
+          array->nullable != 0));
+      columns.push_back(std::make_shared<arrow::ChunkedArray>(array->array));
+    }
+
+    auto schema = arrow::schema(std::move(fields));
+    auto table = arrow::Table::Make(
+        schema,
+        std::move(columns),
+        row_count < 0 ? 0 : row_count);  // no columns => zero rows
+
+    auto handle = std::make_unique<irx_arrow_table_handle>();
+    handle->refcount = kInitialRefcount;
+    handle->table = std::move(table);
+    *out_table = handle.release();
+    return kArrowOk;
+  } catch (const std::bad_alloc&) {
+    return set_error(ENOMEM, "failed to allocate Arrow table");
+  } catch (const std::exception& exc) {
+    return set_exception_error("irx_arrow_table_new_from_arrays", exc);
+  }
+}
+
+int64_t irx_arrow_table_num_rows(const irx_arrow_table_handle* table) {
+  clear_error();
+  if (table == nullptr || !table->table) {
+    set_error(EINVAL, "table must not be NULL");
+    return -1;  // failure marker; set_error was called above
+  }
+  return table->table->num_rows();
+}
+
+int64_t irx_arrow_table_num_columns(const irx_arrow_table_handle* table) {
+  clear_error();
+  if (table == nullptr || !table->table) {
+    set_error(EINVAL, "table must not be NULL");
+    return -1;  // failure marker; set_error was called above
+  }
+  return table->table->num_columns();
+}
+
+int irx_arrow_table_column_by_name(
+    const irx_arrow_table_handle* table,
+    const char* name,
+    irx_arrow_chunked_array_handle** out_column) {
+  clear_error();
+  try {
+    if (table == nullptr || !table->table) {
+      return set_error(EINVAL, "table must not be NULL");
+    }
+    if (name == nullptr) {
+      return set_error(EINVAL, "column name must not be NULL");
+    }
+    if (out_column == nullptr) {
+      return set_error(EINVAL, "out_column must not be NULL");
+    }
+    *out_column = nullptr;  // stays NULL on every error return below
+
+    std::shared_ptr<arrow::ChunkedArray> column =
+        table->table->GetColumnByName(name);
+    if (!column) {
+      return set_error(EINVAL, "table has no column named '%s'", name);
+    }
+
+    auto handle = std::make_unique<irx_arrow_chunked_array_handle>();
+    handle->refcount = kInitialRefcount;
+    handle->column = std::move(column);
+    *out_column = handle.release();
+    return kArrowOk;
+  } catch (const std::bad_alloc&) {
+    return set_error(ENOMEM, "failed to allocate Arrow chunked array handle");
+  } catch (const std::exception& exc) {
+    return set_exception_error("irx_arrow_table_column_by_name", exc);
+  }
+}
+
+int irx_arrow_table_column_by_index(
+    const irx_arrow_table_handle* table,
+    int32_t index,
+    irx_arrow_chunked_array_handle** out_column) {
+  clear_error();
+  try {
+    if (table == nullptr || !table->table) {
+      return set_error(EINVAL, "table must not be NULL");
+    }
+    if (out_column == nullptr) {
+      return set_error(EINVAL, "out_column must not be NULL");
+    }
+    *out_column = nullptr;  // stays NULL on every error return below
+    if (index < 0 || index >= table->table->num_columns()) {
+      return set_error(EINVAL, "column index is out of bounds");
+    }
+
+    auto handle = std::make_unique<irx_arrow_chunked_array_handle>();
+    handle->refcount = kInitialRefcount;
+    handle->column = table->table->column(index);
+    *out_column = handle.release();
+    return kArrowOk;
+  } catch (const std::bad_alloc&) {
+    return set_error(ENOMEM, "failed to allocate Arrow chunked array handle");
+  } catch (const std::exception& exc) {
+    return set_exception_error("irx_arrow_table_column_by_index", exc);
+  }
+}
+
+int irx_arrow_table_retain(irx_arrow_table_handle* table) {
+  clear_error();
+  if (table == nullptr) {
+    return kArrowOk;  // retaining NULL is a no-op
+  }
+  if (table->refcount <= 0) {
+    return set_error(EINVAL, "table handle is released");
+  }
+  table->refcount += 1;
+  return kArrowOk;
+}
+
+void irx_arrow_table_release(irx_arrow_table_handle* table) {
+  if (table == nullptr || table->refcount <= 0) {
+    return;  // NULL and non-positive refcounts are ignored
+  }
+  table->refcount -= 1;
+  if (table->refcount == 0) {
+    delete table;  // last reference dropped
+  }
+}
+
+int irx_arrow_chunked_array_retain(irx_arrow_chunked_array_handle* column) {
+  clear_error();
+  if (column == nullptr) {
+    return kArrowOk;  // retaining NULL is a no-op
+  }
+  if (column->refcount <= 0) {
+    return set_error(EINVAL, "chunked array handle is released");
+  }
+  column->refcount += 1;
+  return kArrowOk;
+}
+
+void irx_arrow_chunked_array_release(irx_arrow_chunked_array_handle* column) {
+  if (column == nullptr || column->refcount <= 0) {
+    return;  // NULL and non-positive refcounts are ignored
+  }
+  column->refcount -= 1;
+  if (column->refcount == 0) {
+    delete column;  // last reference dropped
+  }
+}
+
 const char* irx_arrow_last_error(void) {
   return last_error;
 }
diff --git a/packages/irx/src/irx/builder/runtime/arrow/native/irx_arrow_runtime.h b/packages/irx/src/irx/builder/runtime/arrow/native/irx_arrow_runtime.h
index 1a835ee..61d4670 100644
--- a/packages/irx/src/irx/builder/runtime/arrow/native/irx_arrow_runtime.h
+++ b/packages/irx/src/irx/builder/runtime/arrow/native/irx_arrow_runtime.h
@@ -18,6 +18,8 @@ typedef struct irx_arrow_array_builder_handle irx_arrow_array_builder_handle;
 typedef struct irx_arrow_array_handle irx_arrow_array_handle;
 typedef struct irx_arrow_tensor_builder_handle irx_arrow_tensor_builder_handle;
 typedef struct irx_arrow_tensor_handle irx_arrow_tensor_handle;
+typedef struct irx_arrow_table_handle irx_arrow_table_handle;
+typedef struct irx_arrow_chunked_array_handle irx_arrow_chunked_array_handle;

 enum irx_arrow_type_id {
   IRX_ARROW_TYPE_UNKNOWN = 0,
@@ -144,6 +146,26 @@ int irx_arrow_tensor_borrow_buffer_view(
     irx_buffer_view* out_view);
 int irx_arrow_tensor_retain(irx_arrow_tensor_handle* tensor);
 void irx_arrow_tensor_release(irx_arrow_tensor_handle* tensor);
+
+int irx_arrow_table_new_from_arrays(
+    int64_t column_count,
+    const char** names,
+    irx_arrow_array_handle** arrays,
+    irx_arrow_table_handle** out_table);
+int64_t irx_arrow_table_num_rows(const irx_arrow_table_handle* table);
+int64_t irx_arrow_table_num_columns(const irx_arrow_table_handle* table);
+int irx_arrow_table_column_by_name(
+    const irx_arrow_table_handle* table,
+    const char* name,
+    irx_arrow_chunked_array_handle** out_column);
+int irx_arrow_table_column_by_index(
+    const irx_arrow_table_handle* table,
+    int32_t index,
+    irx_arrow_chunked_array_handle** out_column);
+int
irx_arrow_table_retain(irx_arrow_table_handle* table);
+void irx_arrow_table_release(irx_arrow_table_handle* table);
+int irx_arrow_chunked_array_retain(irx_arrow_chunked_array_handle* column);
+void irx_arrow_chunked_array_release(irx_arrow_chunked_array_handle* column);
 const char* irx_arrow_last_error(void);

 #ifdef __cplusplus
diff --git a/packages/irx/src/irx/builder/runtime/dataframe/__init__.py b/packages/irx/src/irx/builder/runtime/dataframe/__init__.py
new file mode 100644
index 0000000..3013924
--- /dev/null
+++ b/packages/irx/src/irx/builder/runtime/dataframe/__init__.py
@@ -0,0 +1,9 @@
+"""
+title: Canonical dataframe runtime feature support for IRx.
+"""
+
+from irx.builder.runtime.dataframe.feature import (
+    build_dataframe_runtime_feature,
+)
+
+__all__ = ["build_dataframe_runtime_feature"]
diff --git a/packages/irx/src/irx/builder/runtime/dataframe/feature.py b/packages/irx/src/irx/builder/runtime/dataframe/feature.py
new file mode 100644
index 0000000..98da751
--- /dev/null
+++ b/packages/irx/src/irx/builder/runtime/dataframe/feature.py
@@ -0,0 +1,339 @@
+"""
+title: Builtin dataframe runtime feature declarations backed by Arrow.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from llvmlite import ir
+
+from irx.builder.runtime.arrowcpp import (
+    arrowcpp_compile_flags,
+    arrowcpp_include_dirs,
+    arrowcpp_linker_flags,
+    arrowcpp_runtime_metadata,
+)
+from irx.builder.runtime.features import (
+    ExternalSymbolSpec,
+    NativeArtifact,
+    RuntimeFeature,
+    declare_external_function,
+)
+from irx.typecheck import typechecked
+
+if TYPE_CHECKING:
+    from irx.builder.protocols import VisitorProtocol
+
+
+@typechecked
+def build_dataframe_runtime_feature() -> RuntimeFeature:
+    """
+    title: Build the "dataframe" feature over the shared Arrow C++ source.
+    returns:
+      type: RuntimeFeature
+    """
+    runtime_root = Path(__file__).resolve().parent
+    native_root = (runtime_root.parent / "arrow" / "native").resolve()
+    buffer_native_root = (runtime_root.parent / "buffer" / "native").resolve()
+    include_dirs = (
+        native_root,
+        buffer_native_root,
+        *arrowcpp_include_dirs(),
+    )
+    artifacts = [
+        NativeArtifact(
+            kind="cxx_source",
+            path=native_root / "irx_arrow_runtime.cc",
+            include_dirs=include_dirs,
+            compile_flags=arrowcpp_compile_flags(),
+        )
+    ]
+
+    return RuntimeFeature(
+        name="dataframe",
+        symbols={
+            "irx_arrow_table_new_from_arrays": ExternalSymbolSpec(
+                "irx_arrow_table_new_from_arrays",
+                _declare_table_new_from_arrays,
+            ),
+            "irx_arrow_table_num_rows": ExternalSymbolSpec(
+                "irx_arrow_table_num_rows",
+                _declare_table_num_rows,
+            ),
+            "irx_arrow_table_num_columns": ExternalSymbolSpec(
+                "irx_arrow_table_num_columns",
+                _declare_table_num_columns,
+            ),
+            "irx_arrow_table_column_by_name": ExternalSymbolSpec(
+                "irx_arrow_table_column_by_name",
+                _declare_table_column_by_name,
+            ),
+            "irx_arrow_table_column_by_index": ExternalSymbolSpec(
+                "irx_arrow_table_column_by_index",
+                _declare_table_column_by_index,
+            ),
+            "irx_arrow_table_retain": ExternalSymbolSpec(
+                "irx_arrow_table_retain",
+                _declare_table_retain,
+            ),
+            "irx_arrow_table_release": ExternalSymbolSpec(
+                "irx_arrow_table_release",
+                _declare_table_release,
+            ),
+            "irx_arrow_chunked_array_retain": ExternalSymbolSpec(
+                "irx_arrow_chunked_array_retain",
+                _declare_chunked_array_retain,
+            ),
+            "irx_arrow_chunked_array_release": ExternalSymbolSpec(
+                "irx_arrow_chunked_array_release",
+                _declare_chunked_array_release,
+            ),
+            "irx_arrow_last_error": ExternalSymbolSpec(
+                "irx_arrow_last_error",
+                _declare_last_error,
+            ),
+        },
+        artifacts=tuple(artifacts),
+        metadata={
+            "opaque_handles": {
+                "table": "irx_arrow_table_handle",
+                "chunked_array": "irx_arrow_chunked_array_handle",
+            },
+            "canonical_name": "dataframe",
+            **arrowcpp_runtime_metadata(),  # later keys win on a clash
+        },
+        linker_flags=arrowcpp_linker_flags(),
+    )
+
+
+@typechecked
+def _declare_function(
+    visitor: VisitorProtocol,
+    name: str,
+    return_type: ir.Type,
+    arg_types: list[ir.Type],
+) -> ir.Function:
+    """
+    title: Declare one external runtime function on the LLVM module.
+    parameters:
+      visitor:
+        type: VisitorProtocol
+      name:
+        type: str
+      return_type:
+        type: ir.Type
+      arg_types:
+        type: list[ir.Type]
+    returns:
+      type: ir.Function
+    """
+    fn_type = ir.FunctionType(return_type, arg_types)
+    return declare_external_function(visitor._llvm.module, name, fn_type)
+
+
+@typechecked
+def _declare_table_new_from_arrays(
+    visitor: VisitorProtocol,
+) -> ir.Function:
+    """
+    title: Declare Arrow table construction from arrays (INT32 result).
+    parameters:
+      visitor:
+        type: VisitorProtocol
+    returns:
+      type: ir.Function
+    """
+    return _declare_function(
+        visitor,
+        "irx_arrow_table_new_from_arrays",
+        visitor._llvm.INT32_TYPE,
+        [
+            visitor._llvm.INT64_TYPE,
+            visitor._llvm.ASCII_STRING_TYPE.as_pointer(),
+            visitor._llvm.ARRAY_HANDLE_TYPE.as_pointer(),
+            visitor._llvm.TABLE_HANDLE_TYPE.as_pointer(),
+        ],
+    )
+
+
+@typechecked
+def _declare_table_num_rows(visitor: VisitorProtocol) -> ir.Function:
+    """
+    title: Declare Arrow table row-count query (INT64 result).
+    parameters:
+      visitor:
+        type: VisitorProtocol
+    returns:
+      type: ir.Function
+    """
+    return _declare_function(
+        visitor,
+        "irx_arrow_table_num_rows",
+        visitor._llvm.INT64_TYPE,
+        [visitor._llvm.TABLE_HANDLE_TYPE],
+    )
+
+
+@typechecked
+def _declare_table_num_columns(visitor: VisitorProtocol) -> ir.Function:
+    """
+    title: Declare Arrow table column-count query (INT64 result).
+    parameters:
+      visitor:
+        type: VisitorProtocol
+    returns:
+      type: ir.Function
+    """
+    return _declare_function(
+        visitor,
+        "irx_arrow_table_num_columns",
+        visitor._llvm.INT64_TYPE,
+        [visitor._llvm.TABLE_HANDLE_TYPE],
+    )
+
+
+@typechecked
+def _declare_table_column_by_name(
+    visitor: VisitorProtocol,
+) -> ir.Function:
+    """
+    title: Declare Arrow table column lookup by name (INT32 result).
+    parameters:
+      visitor:
+        type: VisitorProtocol
+    returns:
+      type: ir.Function
+    """
+    return _declare_function(
+        visitor,
+        "irx_arrow_table_column_by_name",
+        visitor._llvm.INT32_TYPE,
+        [
+            visitor._llvm.TABLE_HANDLE_TYPE,
+            visitor._llvm.ASCII_STRING_TYPE,
+            visitor._llvm.CHUNKED_ARRAY_HANDLE_TYPE.as_pointer(),
+        ],
+    )
+
+
+@typechecked
+def _declare_table_column_by_index(
+    visitor: VisitorProtocol,
+) -> ir.Function:
+    """
+    title: Declare Arrow table column lookup by index (INT32 result).
+    parameters:
+      visitor:
+        type: VisitorProtocol
+    returns:
+      type: ir.Function
+    """
+    return _declare_function(
+        visitor,
+        "irx_arrow_table_column_by_index",
+        visitor._llvm.INT32_TYPE,
+        [
+            visitor._llvm.TABLE_HANDLE_TYPE,
+            visitor._llvm.INT32_TYPE,
+            visitor._llvm.CHUNKED_ARRAY_HANDLE_TYPE.as_pointer(),
+        ],
+    )
+
+
+@typechecked
+def _declare_table_retain(visitor: VisitorProtocol) -> ir.Function:
+    """
+    title: Declare Arrow table retain (INT32 result, one handle arg).
+    parameters:
+      visitor:
+        type: VisitorProtocol
+    returns:
+      type: ir.Function
+    """
+    return _declare_function(
+        visitor,
+        "irx_arrow_table_retain",
+        visitor._llvm.INT32_TYPE,
+        [visitor._llvm.TABLE_HANDLE_TYPE],
+    )
+
+
+@typechecked
+def _declare_table_release(visitor: VisitorProtocol) -> ir.Function:
+    """
+    title: Declare Arrow table release (void result, one handle arg).
+    parameters:
+      visitor:
+        type: VisitorProtocol
+    returns:
+      type: ir.Function
+    """
+    return _declare_function(
+        visitor,
+        "irx_arrow_table_release",
+        visitor._llvm.VOID_TYPE,
+        [visitor._llvm.TABLE_HANDLE_TYPE],
+    )
+
+
+@typechecked
+def _declare_chunked_array_retain(
+    visitor: VisitorProtocol,
+) -> ir.Function:
+    """
+    title: Declare Arrow chunked-array retain (INT32 result).
+    parameters:
+      visitor:
+        type: VisitorProtocol
+    returns:
+      type: ir.Function
+    """
+    return _declare_function(
+        visitor,
+        "irx_arrow_chunked_array_retain",
+        visitor._llvm.INT32_TYPE,
+        [visitor._llvm.CHUNKED_ARRAY_HANDLE_TYPE],
+    )
+
+
+@typechecked
+def _declare_chunked_array_release(
+    visitor: VisitorProtocol,
+) -> ir.Function:
+    """
+    title: Declare Arrow chunked-array release (void result).
+    parameters:
+      visitor:
+        type: VisitorProtocol
+    returns:
+      type: ir.Function
+    """
+    return _declare_function(
+        visitor,
+        "irx_arrow_chunked_array_release",
+        visitor._llvm.VOID_TYPE,
+        [visitor._llvm.CHUNKED_ARRAY_HANDLE_TYPE],
+    )
+
+
+@typechecked
+def _declare_last_error(visitor: VisitorProtocol) -> ir.Function:
+    """
+    title: Declare Arrow runtime last-error query (no arguments).
+    parameters:
+      visitor:
+        type: VisitorProtocol
+    returns:
+      type: ir.Function
+    """
+    return _declare_function(
+        visitor,
+        "irx_arrow_last_error",
+        visitor._llvm.ASCII_STRING_TYPE,
+        [],
+    )
+
+
+__all__ = ["build_dataframe_runtime_feature"]
diff --git a/packages/irx/src/irx/builder/runtime/registry.py b/packages/irx/src/irx/builder/runtime/registry.py
index cee3ba1..15efdda 100644
--- a/packages/irx/src/irx/builder/runtime/registry.py
+++ b/packages/irx/src/irx/builder/runtime/registry.py
@@ -14,6 +14,9 @@
     build_assertions_runtime_feature,
 )
 from irx.builder.runtime.buffer.feature import build_buffer_runtime_feature
+from irx.builder.runtime.dataframe.feature import (
+    build_dataframe_runtime_feature,
+)
 from irx.builder.runtime.feature_libc import build_libc_runtime_feature
 from irx.builder.runtime.feature_libm import build_libm_runtime_feature
 from irx.builder.runtime.features import NativeArtifact, RuntimeFeature
@@ -299,6 +302,7 @@ def get_default_runtime_feature_registry() -> RuntimeFeatureRegistry:
     registry.register(build_libm_runtime_feature())
     registry.register(build_buffer_runtime_feature())
     registry.register(build_array_runtime_feature())
+    registry.register(build_dataframe_runtime_feature())
registry.register(build_tensor_runtime_feature()) registry.register(build_list_runtime_feature()) return registry diff --git a/packages/irx/src/irx/builder/types.py b/packages/irx/src/irx/builder/types.py index 22edaf1..e33d7b0 100644 --- a/packages/irx/src/irx/builder/types.py +++ b/packages/irx/src/irx/builder/types.py @@ -78,6 +78,10 @@ class VariablesLLVM: TENSOR_HANDLE_TYPE: ir.types.Type ARROW_TENSOR_BUILDER_HANDLE_TYPE: ir.types.Type ARROW_TENSOR_HANDLE_TYPE: ir.types.Type + TABLE_HANDLE_TYPE: ir.types.Type + CHUNKED_ARRAY_HANDLE_TYPE: ir.types.Type + ARROW_TABLE_HANDLE_TYPE: ir.types.Type + ARROW_CHUNKED_ARRAY_HANDLE_TYPE: ir.types.Type context: ir.context.Context module: ir.module.Module diff --git a/packages/irx/src/irx/builtins/collections/dataframe.py b/packages/irx/src/irx/builtins/collections/dataframe.py new file mode 100644 index 0000000..991dd48 --- /dev/null +++ b/packages/irx/src/irx/builtins/collections/dataframe.py @@ -0,0 +1,175 @@ +""" +title: DataFrame helpers layered on the builtin Arrow runtime. +summary: >- + Define IRx's backend-neutral DataFrame metadata helpers on top of Arrow Table + and ChunkedArray storage. +""" + +from __future__ import annotations + +from dataclasses import dataclass + +import astx + +from public import public + +from irx.builtins.collections.array_primitives import ( + ARRAY_PRIMITIVE_TYPE_SPECS, +) +from irx.builtins.collections.tensor import tensor_primitive_type_name +from irx.typecheck import typechecked + +DATAFRAME_SCHEMA_EXTRA = "dataframe_schema" +DATAFRAME_COLUMN_INDEX_EXTRA = "dataframe_column_index" +SERIES_ELEMENT_TYPE_EXTRA = "series_element_type" +SERIES_NULLABLE_EXTRA = "series_nullable" + + +@public +@typechecked +@dataclass(frozen=True) +class DataFrameSchemaColumn: + """ + title: Static DataFrame schema column metadata. 
+ attributes: + name: + type: str + type_: + type: astx.DataType + nullable: + type: bool + index: + type: int + """ + + name: str + type_: astx.DataType + nullable: bool + index: int + + +@public +@typechecked +@dataclass(frozen=True) +class DataFrameSchema: + """ + title: Static DataFrame schema metadata. + attributes: + columns: + type: tuple[DataFrameSchemaColumn, Ellipsis] + """ + + columns: tuple[DataFrameSchemaColumn, ...] + + @property + def column_count(self) -> int: + """ + title: Return the number of columns in this schema. + returns: + type: int + """ + return len(self.columns) + + def column(self, name: str) -> DataFrameSchemaColumn | None: + """ + title: Return one schema column by name. + parameters: + name: + type: str + returns: + type: DataFrameSchemaColumn | None + """ + for column in self.columns: + if column.name == name: + return column + return None + + +@public +@typechecked +def schema_from_type( + type_: astx.DataFrameType, +) -> DataFrameSchema | None: + """ + title: Return static schema metadata from one DataFrame type. + parameters: + type_: + type: astx.DataFrameType + returns: + type: DataFrameSchema | None + """ + if type_.columns is None: + return None + return DataFrameSchema( + tuple( + DataFrameSchemaColumn( + name=column.name, + type_=column.type_, + nullable=column.nullable, + index=index, + ) + for index, column in enumerate(type_.columns) + ) + ) + + +@public +@typechecked +def dataframe_primitive_type_name( + type_: astx.DataType | None, +) -> str | None: + """ + title: Return the builtin primitive storage name for one column type. + parameters: + type_: + type: astx.DataType | None + returns: + type: str | None + """ + return tensor_primitive_type_name(type_) + + +@public +@typechecked +def dataframe_type_id(type_: astx.DataType | None) -> int | None: + """ + title: Return the Arrow runtime type id for one supported column type. 
+ parameters: + type_: + type: astx.DataType | None + returns: + type: int | None + """ + primitive_name = dataframe_primitive_type_name(type_) + if primitive_name is None: + return None + spec = ARRAY_PRIMITIVE_TYPE_SPECS.get(primitive_name) + return None if spec is None else spec.type_id + + +@public +@typechecked +def dataframe_column_type_is_supported(type_: astx.DataType | None) -> bool: + """ + title: Return whether one type is supported for MVP DataFrame columns. + parameters: + type_: + type: astx.DataType | None + returns: + type: bool + """ + return dataframe_type_id(type_) is not None + + +__all__ = [ + "DATAFRAME_COLUMN_INDEX_EXTRA", + "DATAFRAME_SCHEMA_EXTRA", + "SERIES_ELEMENT_TYPE_EXTRA", + "SERIES_NULLABLE_EXTRA", + "DataFrameSchema", + "DataFrameSchemaColumn", + "dataframe_column_type_is_supported", + "dataframe_primitive_type_name", + "dataframe_type_id", + "schema_from_type", +] diff --git a/packages/irx/tests/test_dataframe.py b/packages/irx/tests/test_dataframe.py new file mode 100644 index 0000000..5942c6e --- /dev/null +++ b/packages/irx/tests/test_dataframe.py @@ -0,0 +1,228 @@ +""" +title: Tests for the IRx DataFrame layer. +""" + +from __future__ import annotations + +import astx +import pytest + +from irx.analysis import SemanticError, analyze +from irx.builder import Builder + +from .conftest import assert_ir_parses, make_main_module + + +def _dataframe_type() -> astx.DataFrameType: + """ + title: Build one static DataFrame test schema. + returns: + type: astx.DataFrameType + """ + return astx.DataFrameType( + ( + astx.DataFrameColumn("id", astx.Int32()), + astx.DataFrameColumn("score", astx.Float64()), + astx.DataFrameColumn("ok", astx.Boolean()), + ) + ) + + +def _dataframe_literal( + type_: astx.DataFrameType | None = None, +) -> astx.DataFrameLiteral: + """ + title: Build one DataFrame literal aligned to the test schema. 
+ parameters: + type_: + type: astx.DataFrameType | None + returns: + type: astx.DataFrameLiteral + """ + schema = type_ or _dataframe_type() + return astx.DataFrameLiteral( + ( + astx.DataFrameLiteralColumn( + "id", + ( + astx.LiteralInt32(1), + astx.LiteralInt32(2), + astx.LiteralInt32(3), + ), + ), + astx.DataFrameLiteralColumn( + "score", + ( + astx.LiteralFloat64(0.5), + astx.LiteralFloat64(0.8), + astx.LiteralFloat64(1.0), + ), + ), + astx.DataFrameLiteralColumn( + "ok", + ( + astx.LiteralBoolean(True), + astx.LiteralBoolean(False), + astx.LiteralBoolean(True), + ), + ), + ), + type_=schema, + ) + + +def _declare_dataframe(name: str = "rows") -> astx.VariableDeclaration: + """ + title: Build one DataFrame variable declaration. + parameters: + name: + type: str + returns: + type: astx.VariableDeclaration + """ + type_ = _dataframe_type() + return astx.VariableDeclaration( + name=name, + type_=type_, + mutability=astx.MutabilityKind.mutable, + value=_dataframe_literal(type_), + ) + + +def test_dataframe_literal_get_struct_exposes_columns() -> None: + """ + title: DataFrame literal structs expose column payload and schema entries. + """ + type_ = _dataframe_type() + node = _dataframe_literal(type_) + + full = node.get_struct() + + assert "DataFrameLiteral" in full + content = full["DataFrameLiteral"]["content"] + assert [column["name"] for column in content["columns"]] == [ + "id", + "score", + "ok", + ] + assert [column["name"] for column in content["type"]] == [ + "id", + "score", + "ok", + ] + + +def test_dataframe_literal_lowers_through_arrow_table_runtime() -> None: + """ + title: DataFrame literals lower through the Arrow Table runtime feature. 
+ """ + module = make_main_module( + _declare_dataframe(), + astx.FunctionReturn(astx.DataFrameRowCount(astx.Identifier("rows"))), + return_type=astx.Int64(), + ) + + ir_text = Builder().translate(module) + + assert '@"irx_arrow_table_new_from_arrays"' in ir_text + assert '@"irx_arrow_table_num_rows"' in ir_text + assert '@"irx_arrow_array_builder_new"' in ir_text + assert_ir_parses(ir_text) + + +def test_dataframe_column_access_lowers_to_chunked_array_handle() -> None: + """ + title: Static DataFrame column access lowers to a ChunkedArray handle. + """ + module = make_main_module( + _declare_dataframe(), + astx.FunctionReturn( + astx.SeriesRelease( + astx.DataFrameColumnAccess(astx.Identifier("rows"), "score") + ) + ), + ) + + ir_text = Builder().translate(module) + + assert '@"irx_arrow_table_column_by_index"' in ir_text + assert '@"irx_arrow_chunked_array_release"' in ir_text + assert_ir_parses(ir_text) + + +def test_dataframe_string_access_resolves_static_column_index() -> None: + """ + title: String-key column access shares static schema resolution. + """ + module = make_main_module( + _declare_dataframe(), + astx.FunctionReturn( + astx.SeriesRelease( + astx.DataFrameStringColumnAccess( + astx.Identifier("rows"), + "id", + ) + ) + ), + ) + + ir_text = Builder().translate(module) + + assert '@"irx_arrow_table_column_by_index"' in ir_text + assert '@"irx_arrow_table_column_by_name"' not in ir_text + assert_ir_parses(ir_text) + + +def test_dataframe_semantic_rejects_mismatched_column_lengths() -> None: + """ + title: DataFrame literal columns must have equal row counts. 
+ """ + type_ = _dataframe_type() + bad_literal = astx.DataFrameLiteral( + ( + astx.DataFrameLiteralColumn( + "id", + (astx.LiteralInt32(1), astx.LiteralInt32(2)), + ), + astx.DataFrameLiteralColumn( + "score", + (astx.LiteralFloat64(0.5),), + ), + astx.DataFrameLiteralColumn( + "ok", + (astx.LiteralBoolean(True), astx.LiteralBoolean(False)), + ), + ), + type_=type_, + ) + module = make_main_module( + astx.VariableDeclaration( + name="rows", + type_=type_, + mutability=astx.MutabilityKind.mutable, + value=bad_literal, + ) + ) + + with pytest.raises(SemanticError, match="same length"): + analyze(module) + + +def test_dataframe_semantic_rejects_unknown_column() -> None: + """ + title: Static schema column access rejects unknown names. + """ + module = make_main_module( + _declare_dataframe(), + astx.FunctionReturn( + astx.SeriesRelease( + astx.DataFrameColumnAccess( + astx.Identifier("rows"), + "missing", + ) + ) + ), + ) + + with pytest.raises(SemanticError, match="no column 'missing'"): + analyze(module) From 7d97800414a4cde57e36880ced64a1ffecbac2bf Mon Sep 17 00:00:00 2001 From: Ivan Ogasawara Date: Mon, 27 Apr 2026 19:34:23 +0000 Subject: [PATCH 4/4] apply changes from reviewer --- docs/library/built-in-types.md | 3 + packages/arx/src/arx/parser/core.py | 57 +++++++++-- packages/arx/tests/python/test_dataframe.py | 25 +++++ .../handlers/_expressions/dataframes.py | 88 +++++++++-------- .../irx/src/irx/builder/lowering/dataframe.py | 97 ++++++++++++++++--- packages/irx/tests/test_dataframe.py | 60 +++++++++++- 6 files changed, 264 insertions(+), 66 deletions(-) diff --git a/docs/library/built-in-types.md b/docs/library/built-in-types.md index aa44f51..bc820fb 100644 --- a/docs/library/built-in-types.md +++ b/docs/library/built-in-types.md @@ -157,6 +157,9 @@ Current DataFrame rules in this phase: equal row counts - columns can be accessed as `rows.score` or `rows["score"]` - `rows.nrows()` and `rows.ncols()` return row and column counts as `i64` +- column access 
and metadata methods currently work on DataFrame identifiers and + literals whose schema is known while parsing, not on arbitrary + DataFrame-returning expressions - `dataframe[...]` is accepted only in function and extern parameter annotations for now; column access on runtime-schema parameters is not available yet diff --git a/packages/arx/src/arx/parser/core.py b/packages/arx/src/arx/parser/core.py index 2c5d8d4..0fb461f 100644 --- a/packages/arx/src/arx/parser/core.py +++ b/packages/arx/src/arx/parser/core.py @@ -299,7 +299,16 @@ def _is_tensor_name(self, name: str) -> bool: returns: type: bool """ - return any(name in scope for scope in reversed(self.tensor_scopes)) + for value_scope, tensor_scope in zip( + reversed(self.value_scopes), + reversed(self.tensor_scopes), + strict=True, + ): + if name in tensor_scope: + return True + if name in value_scope: + return False + return False def _declare_dataframe_name( self, @@ -325,7 +334,16 @@ def _is_dataframe_name(self, name: str) -> bool: returns: type: bool """ - return any(name in scope for scope in reversed(self.dataframe_scopes)) + for value_scope, dataframe_scope in zip( + reversed(self.value_scopes), + reversed(self.dataframe_scopes), + strict=True, + ): + if name in dataframe_scope: + return True + if name in value_scope: + return False + return False def _declare_list_name(self, name: str) -> None: """ @@ -345,7 +363,16 @@ def _is_list_name(self, name: str) -> bool: returns: type: bool """ - return any(name in scope for scope in reversed(self.list_scopes)) + for value_scope, list_scope in zip( + reversed(self.value_scopes), + reversed(self.list_scopes), + strict=True, + ): + if name in list_scope: + return True + if name in value_scope: + return False + return False def _lookup_tensor_binding(self, name: str) -> TensorBinding | None: """ @@ -356,9 +383,15 @@ def _lookup_tensor_binding(self, name: str) -> TensorBinding | None: returns: type: TensorBinding | None """ - for scope in 
reversed(self.tensor_scopes): - if name in scope: - return scope[name] + for value_scope, tensor_scope in zip( + reversed(self.value_scopes), + reversed(self.tensor_scopes), + strict=True, + ): + if name in tensor_scope: + return tensor_scope[name] + if name in value_scope: + return None return None def _lookup_dataframe_binding( @@ -373,9 +406,15 @@ def _lookup_dataframe_binding( returns: type: DataFrameBinding | None """ - for scope in reversed(self.dataframe_scopes): - if name in scope: - return scope[name] + for value_scope, dataframe_scope in zip( + reversed(self.value_scopes), + reversed(self.dataframe_scopes), + strict=True, + ): + if name in dataframe_scope: + return dataframe_scope[name] + if name in value_scope: + return None return None def _push_template_scope( diff --git a/packages/arx/tests/python/test_dataframe.py b/packages/arx/tests/python/test_dataframe.py index f4eb0f8..ef130e1 100644 --- a/packages/arx/tests/python/test_dataframe.py +++ b/packages/arx/tests/python/test_dataframe.py @@ -120,6 +120,31 @@ def test_dataframe_constructor_requires_declared_columns() -> None: ) +def test_dataframe_name_tracking_respects_inner_scope_shadowing() -> None: + """ + title: Inner non-DataFrame variables shadow outer DataFrame bindings. + """ + tree = _parse_module( + """ + fn main() -> i32: + var rows: dataframe[id: i32] = dataframe({id: [1]}) + if true: + var rows: i32 = 1 + return rows.nrows() + return 0 + """ + ) + + function = tree.nodes[0] + assert isinstance(function, astx.FunctionDef) + branch = function.body.nodes[1] + assert isinstance(branch, astx.IfStmt) + result = branch.then.nodes[1] + assert isinstance(result, astx.FunctionReturn) + assert isinstance(result.value, astx.MethodCall) + assert not isinstance(result.value, astx.DataFrameRowCount) + + def test_dataframe_mvp_rejects_string_columns() -> None: """ title: MVP DataFrame and Series types reject non fixed-width columns. 
diff --git a/packages/irx/src/irx/analysis/handlers/_expressions/dataframes.py b/packages/irx/src/irx/analysis/handlers/_expressions/dataframes.py index 3437d8f..020e0ad 100644 --- a/packages/irx/src/irx/analysis/handlers/_expressions/dataframes.py +++ b/packages/irx/src/irx/analysis/handlers/_expressions/dataframes.py @@ -76,6 +76,49 @@ def _set_dataframe_schema( if schema is not None: self._semantic(node).extras[DATAFRAME_SCHEMA_EXTRA] = schema + def _visit_dataframe_column_access( + self, + node: astx.DataFrameColumnAccess, + ) -> None: + """ + title: Visit shared DataFrame column access semantics. + parameters: + node: + type: astx.DataFrameColumnAccess + """ + self.visit(node.base) + base_type = self._expr_type(node.base) + if not isinstance(base_type, astx.DataFrameType): + self.context.diagnostics.add( + "dataframe column access requires a DataFrame value", + node=node, + code=DiagnosticCodes.SEMANTIC_INVALID_FIELD_ACCESS, + ) + self._set_type(node, None) + return + + schema = self._dataframe_schema(node.base) + column = None if schema is None else schema.column(node.column_name) + if column is None: + self.context.diagnostics.add( + f"dataframe has no column '{node.column_name}'", + node=node, + code=DiagnosticCodes.SEMANTIC_INVALID_FIELD_ACCESS, + ) + self._set_type(node, None) + return + + node.type_ = astx.SeriesType( + column.type_, + nullable=column.nullable, + ) + self._semantic(node).extras[DATAFRAME_COLUMN_INDEX_EXTRA] = ( + column.index + ) + self._semantic(node).extras[SERIES_ELEMENT_TYPE_EXTRA] = column.type_ + self._semantic(node).extras[SERIES_NULLABLE_EXTRA] = column.nullable + self._set_type(node, node.type_) + @SemanticAnalyzerCore.visit.dispatch def visit(self, node: astx.DataFrameLiteral) -> None: """ @@ -166,38 +209,7 @@ def visit(self, node: astx.DataFrameColumnAccess) -> None: node: type: astx.DataFrameColumnAccess """ - self.visit(node.base) - base_type = self._expr_type(node.base) - if not isinstance(base_type, astx.DataFrameType): - 
self.context.diagnostics.add( - "dataframe column access requires a DataFrame value", - node=node, - code=DiagnosticCodes.SEMANTIC_INVALID_FIELD_ACCESS, - ) - self._set_type(node, None) - return - - schema = self._dataframe_schema(node.base) - column = None if schema is None else schema.column(node.column_name) - if column is None: - self.context.diagnostics.add( - f"dataframe has no column '{node.column_name}'", - node=node, - code=DiagnosticCodes.SEMANTIC_INVALID_FIELD_ACCESS, - ) - self._set_type(node, None) - return - - node.type_ = astx.SeriesType( - column.type_, - nullable=column.nullable, - ) - self._semantic(node).extras[DATAFRAME_COLUMN_INDEX_EXTRA] = ( - column.index - ) - self._semantic(node).extras[SERIES_ELEMENT_TYPE_EXTRA] = column.type_ - self._semantic(node).extras[SERIES_NULLABLE_EXTRA] = column.nullable - self._set_type(node, node.type_) + self._visit_dataframe_column_access(node) @SemanticAnalyzerCore.visit.dispatch def visit(self, node: astx.DataFrameStringColumnAccess) -> None: @@ -207,17 +219,7 @@ def visit(self, node: astx.DataFrameStringColumnAccess) -> None: node: type: astx.DataFrameStringColumnAccess """ - self.visit( - cast_column := astx.DataFrameColumnAccess( - node.base, - node.column_name, - ) - ) - semantic = getattr(cast_column, "semantic", None) - if semantic is not None: - self._semantic(node).extras.update(semantic.extras) - node.type_ = cast_column.type_ - self._set_type(node, self._expr_type(cast_column)) + self._visit_dataframe_column_access(node) @SemanticAnalyzerCore.visit.dispatch def visit(self, node: astx.DataFrameRowCount) -> None: diff --git a/packages/irx/src/irx/builder/lowering/dataframe.py b/packages/irx/src/irx/builder/lowering/dataframe.py index e0de9f4..8780156 100644 --- a/packages/irx/src/irx/builder/lowering/dataframe.py +++ b/packages/irx/src/irx/builder/lowering/dataframe.py @@ -16,6 +16,10 @@ from irx.builder.core import VisitorCore from irx.builder.protocols import VisitorMixinBase from 
irx.builder.runtime import safe_pop +from irx.builder.runtime.assertions.feature import ( + ASSERT_FAILURE_SYMBOL_NAME, + ASSERT_RUNTIME_FEATURE_NAME, +) from irx.builder.types import is_int_type from irx.builtins.collections.dataframe import ( DATAFRAME_COLUMN_INDEX_EXTRA, @@ -30,6 +34,63 @@ class DataFrameVisitorMixin(VisitorMixinBase): title: DataFrame visitor mixin. """ + def _check_arrow_status(self, status: ir.Value, operation: str) -> None: + """ + title: Branch to the assertion runtime when an Arrow call fails. + parameters: + status: + type: ir.Value + operation: + type: str + """ + is_ok = self._llvm.ir_builder.icmp_signed( + "==", + status, + ir.Constant(self._llvm.INT32_TYPE, 0), + f"{operation}_ok", + ) + counter = getattr(self, "_dataframe_runtime_check_counter", 0) + setattr(self, "_dataframe_runtime_check_counter", counter + 1) + function = self._llvm.ir_builder.function + pass_block = function.append_basic_block( + f"dataframe_runtime_ok_{counter}" + ) + fail_block = function.append_basic_block( + f"dataframe_runtime_fail_{counter}" + ) + self._llvm.ir_builder.cbranch(is_ok, pass_block, fail_block) + + self._llvm.ir_builder.position_at_start(fail_block) + last_error = self.require_runtime_symbol( + "dataframe", + "irx_arrow_last_error", + ) + message_ptr = self._llvm.ir_builder.call( + last_error, + [], + f"{operation}_message", + ) + source_ptr = cast(Any, self)._constant_c_string_pointer( + "irx-arrow-dataframe", + name_hint="dataframe_runtime_source", + ) + fail_function = self.require_runtime_symbol( + ASSERT_RUNTIME_FEATURE_NAME, + ASSERT_FAILURE_SYMBOL_NAME, + ) + self._llvm.ir_builder.call( + fail_function, + [ + source_ptr, + ir.Constant(self._llvm.INT32_TYPE, 0), + ir.Constant(self._llvm.INT32_TYPE, 0), + message_ptr, + ], + ) + self._llvm.ir_builder.unreachable() + + self._llvm.ir_builder.position_at_start(pass_block) + def _append_dataframe_value( self, builder_handle: ir.Value, @@ -67,7 +128,11 @@ def _append_dataframe_value( 
self._llvm.DOUBLE_TYPE, "dataframe_fpext", ) - self._llvm.ir_builder.call(append, [builder_handle, value]) + status = self._llvm.ir_builder.call( + append, + [builder_handle, value], + ) + self._check_arrow_status(status, "dataframe_append_double") return append = self.require_runtime_symbol( @@ -98,7 +163,8 @@ def _append_dataframe_value( self._llvm.INT64_TYPE, "dataframe_trunc", ) - self._llvm.ir_builder.call(append, [builder_handle, value]) + status = self._llvm.ir_builder.call(append, [builder_handle, value]) + self._check_arrow_status(status, "dataframe_append_int") def _build_arrow_array_from_column( self, @@ -135,13 +201,14 @@ def _build_arrow_array_from_column( self._llvm.ARRAY_BUILDER_HANDLE_TYPE, name=f"{column_name}_array_builder_slot", ) - self._llvm.ir_builder.call( + status = self._llvm.ir_builder.call( builder_new, [ ir.Constant(self._llvm.INT32_TYPE, type_id), builder_slot, ], ) + self._check_arrow_status(status, "dataframe_array_builder_new") builder_handle = self._llvm.ir_builder.load( builder_slot, f"{column_name}_array_builder", @@ -154,10 +221,11 @@ def _build_arrow_array_from_column( self._llvm.ARRAY_HANDLE_TYPE, name=f"{column_name}_array_slot", ) - self._llvm.ir_builder.call( + status = self._llvm.ir_builder.call( finish_builder, [builder_handle, array_slot], ) + self._check_arrow_status(status, "dataframe_array_builder_finish") return self._llvm.ir_builder.load( array_slot, f"{column_name}_array", @@ -202,7 +270,7 @@ def _lower_dataframe_column_access( "dataframe", "irx_arrow_table_column_by_index", ) - self._llvm.ir_builder.call( + status = self._llvm.ir_builder.call( column_by_index, [ table_handle, @@ -210,6 +278,7 @@ def _lower_dataframe_column_access( column_slot, ], ) + self._check_arrow_status(status, "dataframe_column_by_index") else: column_by_name = self.require_runtime_symbol( "dataframe", @@ -219,10 +288,11 @@ def _lower_dataframe_column_access( node.column_name, name_hint=f"dataframe_column_{node.column_name}", ) - 
self._llvm.ir_builder.call( + status = self._llvm.ir_builder.call( column_by_name, [table_handle, name_pointer, column_slot], ) + self._check_arrow_status(status, "dataframe_column_by_name") column_handle = self._llvm.ir_builder.load( column_slot, @@ -319,7 +389,7 @@ def visit(self, node: astx.DataFrameLiteral) -> None: self._llvm.TABLE_HANDLE_TYPE, name="dataframe_table_slot", ) - self._llvm.ir_builder.call( + status = self._llvm.ir_builder.call( table_new, [ ir.Constant(self._llvm.INT64_TYPE, column_count), @@ -328,6 +398,7 @@ def visit(self, node: astx.DataFrameLiteral) -> None: table_slot, ], ) + self._check_arrow_status(status, "dataframe_table_new") table_handle = self._llvm.ir_builder.load( table_slot, "dataframe_table", @@ -412,9 +483,9 @@ def visit(self, node: astx.DataFrameRetain) -> None: "dataframe", "irx_arrow_table_retain", ) - self.result_stack.append( - self._llvm.ir_builder.call(retain, [table_handle]) - ) + status = self._llvm.ir_builder.call(retain, [table_handle]) + self._check_arrow_status(status, "dataframe_table_retain") + self.result_stack.append(status) @VisitorCore.visit.dispatch def visit(self, node: astx.DataFrameRelease) -> None: @@ -447,9 +518,9 @@ def visit(self, node: astx.SeriesRetain) -> None: "dataframe", "irx_arrow_chunked_array_retain", ) - self.result_stack.append( - self._llvm.ir_builder.call(retain, [column_handle]) - ) + status = self._llvm.ir_builder.call(retain, [column_handle]) + self._check_arrow_status(status, "dataframe_series_retain") + self.result_stack.append(status) @VisitorCore.visit.dispatch def visit(self, node: astx.SeriesRelease) -> None: diff --git a/packages/irx/tests/test_dataframe.py b/packages/irx/tests/test_dataframe.py index 5942c6e..9f59560 100644 --- a/packages/irx/tests/test_dataframe.py +++ b/packages/irx/tests/test_dataframe.py @@ -4,13 +4,17 @@ from __future__ import annotations +import shutil + import astx import pytest from irx.analysis import SemanticError, analyze from irx.builder import 
Builder -from .conftest import assert_ir_parses, make_main_module +from .conftest import assert_ir_parses, build_and_run, make_main_module + +EXPECTED_ROW_OR_COLUMN_COUNT = 3 def _dataframe_type() -> astx.DataFrameType: @@ -226,3 +230,57 @@ def test_dataframe_semantic_rejects_unknown_column() -> None: with pytest.raises(SemanticError, match="no column 'missing'"): analyze(module) + + +def test_dataframe_build_returns_row_count() -> None: + """ + title: Built DataFrame programs return Arrow Table row counts. + """ + if shutil.which("clang") is None: + pytest.skip("builder.build() currently requires clang") + + module = make_main_module( + _declare_dataframe(), + astx.FunctionReturn( + astx.Cast( + astx.DataFrameRowCount(astx.Identifier("rows")), + astx.Int32(), + ) + ), + ) + + result = build_and_run(Builder(), module) + + assert result.returncode == EXPECTED_ROW_OR_COLUMN_COUNT, ( + result.stderr or result.stdout + ) + + +def test_dataframe_build_releases_accessed_series_and_returns_ncols() -> None: + """ + title: Built DataFrame programs can acquire and release Series handles. + """ + if shutil.which("clang") is None: + pytest.skip("builder.build() currently requires clang") + + module = make_main_module( + _declare_dataframe(), + astx.SeriesRelease( + astx.DataFrameColumnAccess(astx.Identifier("rows"), "score") + ), + astx.SeriesRelease( + astx.DataFrameStringColumnAccess(astx.Identifier("rows"), "id") + ), + astx.FunctionReturn( + astx.Cast( + astx.DataFrameColumnCount(astx.Identifier("rows")), + astx.Int32(), + ) + ), + ) + + result = build_and_run(Builder(), module) + + assert result.returncode == EXPECTED_ROW_OR_COLUMN_COUNT, ( + result.stderr or result.stdout + )