22// Copyright 2026 Johnson Ogundeji
33#pragma once
44
5- // ---------------------------------------------------------------------------
6- // column_index.hpp -- ColumnIndex, OffsetIndex, and ColumnIndexBuilder
7- //
8- // Per-column-chunk structures for predicate pushdown and random page access.
9- // Written after the row groups but before the footer in the Parquet file.
10- //
11- // ColumnIndex stores per-page min/max statistics, enabling readers to skip
12- // pages that cannot contain matching data. OffsetIndex stores page locations
13- // for efficient random access into column chunks.
14- //
15- // Thrift field IDs follow the canonical parquet.thrift specification:
16- // ColumnIndex: 1=null_pages, 2=min_values, 3=max_values, 4=boundary_order, 5=null_counts
17- // OffsetIndex: 1=page_locations (list<PageLocation>)
18- // PageLocation: 1=offset, 2=compressed_page_size, 3=first_row_index
19- // ---------------------------------------------------------------------------
5+ // / @file column_index.hpp
6+ // / @brief ColumnIndex, OffsetIndex, and ColumnIndexBuilder for predicate pushdown.
7+ // /
8+ // / Per-column-chunk structures for predicate pushdown and random page access.
9+ // / Written after the row groups but before the footer in the Parquet file.
10+ // /
11+ // / ColumnIndex stores per-page min/max statistics, enabling readers to skip
12+ // / pages that cannot contain matching data. OffsetIndex stores page locations
13+ // / for efficient random access into column chunks.
14+ // /
15+ // / Thrift field IDs follow the canonical parquet.thrift specification:
16+ // / - ColumnIndex: 1=null_pages, 2=min_values, 3=max_values, 4=boundary_order, 5=null_counts
17+ // / - OffsetIndex: 1=page_locations (list of PageLocation)
18+ // / - PageLocation: 1=offset, 2=compressed_page_size, 3=first_row_index
2019
2120#include " signet/thrift/compact.hpp"
2221
2625
2726namespace signet ::forge {
2827
29- // ---------------------------------------------------------------------------
30- // PageLocation -- file offset and size for a single data page
31- // ---------------------------------------------------------------------------
28+ // / File offset and size descriptor for a single data page.
29+ // /
30+ // / Used by OffsetIndex to record the location of each page within the
31+ // / Parquet file, enabling random access into column chunks without
32+ // / sequential scanning.
3233struct PageLocation {
33- int64_t offset = 0 ; // File offset of the page
34- int32_t compressed_page_size = 0 ; // Size of the page ( compressed bytes)
35- int64_t first_row_index = 0 ; // First row in this page (relative to row group)
34+ int64_t offset = 0 ; // /< Absolute file offset of the page header.
35+ int32_t compressed_page_size = 0 ; // /< Size of the page in compressed bytes.
36+ int64_t first_row_index = 0 ; // /< First row in this page (relative to row group).
3637
38+ // / Serialize this PageLocation to a Thrift compact encoder.
39+ // / @param enc The encoder to write to.
3740 void serialize (thrift::CompactEncoder& enc) const {
3841 enc.begin_struct ();
3942
@@ -53,6 +56,8 @@ struct PageLocation {
5356 enc.end_struct ();
5457 }
5558
59+ // / Deserialize this PageLocation from a Thrift compact decoder.
60+ // / @param dec The decoder to read from.
5661 void deserialize (thrift::CompactDecoder& dec) {
5762 dec.begin_struct ();
5863 for (;;) {
@@ -69,12 +74,18 @@ struct PageLocation {
6974 }
7075};
7176
72- // ---------------------------------------------------------------------------
73- // OffsetIndex -- page locations for random access within a column chunk
74- // ---------------------------------------------------------------------------
77+ // / Page locations for random access within a column chunk.
78+ // /
79+ // / Contains a list of PageLocation entries, one per data page in the
80+ // / column chunk. Written to the Parquet file after row groups to enable
81+ // / readers to seek directly to any page.
82+ // /
83+ // / @see ColumnIndex (companion structure for predicate pushdown)
7584struct OffsetIndex {
76- std::vector<PageLocation> page_locations;
85+ std::vector<PageLocation> page_locations; // /< One entry per data page.
7786
87+ // / Serialize this OffsetIndex to a Thrift compact encoder.
88+ // / @param enc The encoder to write to.
7889 void serialize (thrift::CompactEncoder& enc) const {
7990 enc.begin_struct ();
8091
@@ -90,6 +101,8 @@ struct OffsetIndex {
90101 enc.end_struct ();
91102 }
92103
104+ // / Deserialize this OffsetIndex from a Thrift compact decoder.
105+ // / @param dec The decoder to read from.
93106 void deserialize (thrift::CompactDecoder& dec) {
94107 dec.begin_struct ();
95108 for (;;) {
@@ -111,23 +124,32 @@ struct OffsetIndex {
111124 }
112125};
113126
114- // ---------------------------------------------------------------------------
115- // ColumnIndex -- per-page min/max statistics for predicate pushdown
116- // ---------------------------------------------------------------------------
127+ // / Per-page min/max statistics for predicate pushdown.
128+ // /
129+ // / Stores binary-encoded min/max values for each data page in a column
130+ // / chunk, along with null-page flags, boundary ordering, and optional
131+ // / null counts. Readers use filter_pages() to eliminate pages whose value
132+ // / ranges do not overlap the query predicate.
133+ // /
134+ // / @see OffsetIndex (companion for page offsets)
135+ // / @see ColumnIndexBuilder (builder pattern for constructing during writes)
117136struct ColumnIndex {
118- std::vector<bool > null_pages; // true if page is all nulls
119- std::vector<std::string> min_values; // Min value per page (binary encoded)
120- std::vector<std::string> max_values; // Max value per page (binary encoded)
137+ std::vector<bool > null_pages; // /< True if the corresponding page is all nulls.
138+ std::vector<std::string> min_values; // /< Binary-encoded minimum value per page.
139+ std::vector<std::string> max_values; // /< Binary-encoded maximum value per page.
121140
141+ // / Ordering of min values across pages, used to short-circuit filtering.
122142 enum class BoundaryOrder : int32_t {
123- UNORDERED = 0 ,
124- ASCENDING = 1 ,
125- DESCENDING = 2
143+ UNORDERED = 0 , // /< Min values have no particular order.
144+ ASCENDING = 1 , // /< Min values are non-decreasing across pages.
145+ DESCENDING = 2 // /< Min values are non-increasing across pages.
126146 };
127- BoundaryOrder boundary_order = BoundaryOrder::UNORDERED;
147+ BoundaryOrder boundary_order = BoundaryOrder::UNORDERED; // /< Boundary order of min values.
128148
129- std::vector<int64_t > null_counts; // Null count per page (optional)
149+ std::vector<int64_t > null_counts; // /< Null count per page (optional).
130150
151+ // / Serialize this ColumnIndex to a Thrift compact encoder.
152+ // / @param enc The encoder to write to.
131153 void serialize (thrift::CompactEncoder& enc) const {
132154 enc.begin_struct ();
133155
@@ -173,6 +195,8 @@ struct ColumnIndex {
173195 enc.end_struct ();
174196 }
175197
198+ // / Deserialize this ColumnIndex from a Thrift compact decoder.
199+ // / @param dec The decoder to read from.
176200 void deserialize (thrift::CompactDecoder& dec) {
177201 dec.begin_struct ();
178202 for (;;) {
@@ -224,20 +248,22 @@ struct ColumnIndex {
224248 dec.end_struct ();
225249 }
226250
227- // -------------------------------------------------------------------
228- // filter_pages -- predicate pushdown filter
229- //
230- // Given a range [min_val, max_val] (binary-encoded, same encoding
231- // as min_values/max_values), returns a vector of page indices that
232- // might contain matching data. A page is excluded only if its max
233- // is strictly less than min_val or its min is strictly greater than
234- // max_val. Null pages are always excluded.
235- //
236- // Binary comparison uses lexicographic byte ordering, which is
237- // correct for unsigned integer types and strings. For signed types,
238- // the caller should ensure values use a comparison-safe binary
239- // encoding (e.g., Parquet's standard signed-magnitude encoding).
240- // -------------------------------------------------------------------
251+ // / Filter pages by a value range for predicate pushdown.
252+ // /
253+ // / Given a range [@p min_val, @p max_val] (binary-encoded, same encoding
254+ // / as min_values/max_values), returns page indices that might contain
255+ // / matching data. A page is excluded only if its max is strictly less
256+ // / than @p min_val or its min is strictly greater than @p max_val.
257+ // / All-null pages are always excluded.
258+ // /
259+ // / @note Binary comparison uses lexicographic byte ordering, which is
260+ // / correct for unsigned integer types and strings. For signed
261+ // / types, the caller should ensure values use a comparison-safe
262+ // / binary encoding.
263+ // /
264+ // / @param min_val Lower bound of the query range (binary-encoded).
265+ // / @param max_val Upper bound of the query range (binary-encoded).
266+ // / @return A vector of page indices (0-based) that may contain matches.
241267 [[nodiscard]] std::vector<size_t > filter_pages (
242268 const std::string& min_val,
243269 const std::string& max_val) const {
@@ -268,23 +294,26 @@ struct ColumnIndex {
268294 }
269295};
270296
271- // ---------------------------------------------------------------------------
272- // ColumnIndexBuilder -- accumulates per-page statistics during writing
273- //
274- // Usage:
275- // ColumnIndexBuilder builder;
276- // for (each page being written) {
277- // builder.start_page();
278- // builder.set_min(...);
279- // builder.set_max(...);
280- // builder.set_null_page(false);
281- // builder.set_null_count(0);
282- // builder.set_first_row_index(row_offset);
283- // builder.set_page_location(file_offset, compressed_size);
284- // }
285- // ColumnIndex ci = builder.build_column_index();
286- // OffsetIndex oi = builder.build_offset_index();
287- // ---------------------------------------------------------------------------
297+ // / Builder that accumulates per-page statistics during column writing.
298+ // /
299+ // / Usage:
300+ // / @code
301+ // / ColumnIndexBuilder builder;
302+ // / for (each page being written) {
303+ // / builder.start_page();
304+ // / builder.set_min(...);
305+ // / builder.set_max(...);
306+ // / builder.set_null_page(false);
307+ // / builder.set_null_count(0);
308+ // / builder.set_first_row_index(row_offset);
309+ // / builder.set_page_location(file_offset, compressed_size);
310+ // / }
311+ // / ColumnIndex ci = builder.build_column_index();
312+ // / OffsetIndex oi = builder.build_offset_index();
313+ // / @endcode
314+ // /
315+ // / @see ColumnIndex (output of build_column_index())
316+ // / @see OffsetIndex (output of build_offset_index())
288317class ColumnIndexBuilder {
289318public:
290319 // / Start a new page. Must be called before set_min/set_max etc.
@@ -293,42 +322,48 @@ class ColumnIndexBuilder {
293322 }
294323
295324 // / Record the minimum value for the current page (binary-encoded).
325+ // / @param min_val The binary-encoded minimum value.
296326 void set_min (const std::string& min_val) {
297327 if (!pages_.empty ()) {
298328 pages_.back ().min_value = min_val;
299329 }
300330 }
301331
302332 // / Record the maximum value for the current page (binary-encoded).
333+ // / @param max_val The binary-encoded maximum value.
303334 void set_max (const std::string& max_val) {
304335 if (!pages_.empty ()) {
305336 pages_.back ().max_value = max_val;
306337 }
307338 }
308339
309340 // / Mark the current page as all-nulls (or not).
341+ // / @param is_null True if the page contains only null values.
310342 void set_null_page (bool is_null) {
311343 if (!pages_.empty ()) {
312344 pages_.back ().null_page = is_null;
313345 }
314346 }
315347
316348 // / Record the null count for the current page.
349+ // / @param count Number of null values in the current page.
317350 void set_null_count (int64_t count) {
318351 if (!pages_.empty ()) {
319352 pages_.back ().null_count = count;
320353 }
321354 }
322355
323356 // / Record the first row index for the current page (relative to row group).
357+ // / @param row_index Zero-based row index of the first row in this page.
324358 void set_first_row_index (int64_t row_index) {
325359 if (!pages_.empty ()) {
326360 pages_.back ().first_row_index = row_index;
327361 }
328362 }
329363
330- // / Record the page location (file offset and compressed size) for the
331- // / current page.
364+ // / Record the page location (file offset and compressed size) for the current page.
365+ // / @param offset Absolute file offset of the page.
366+ // / @param compressed_size Page size in compressed bytes.
332367 void set_page_location (int64_t offset, int32_t compressed_size) {
333368 if (!pages_.empty ()) {
334369 pages_.back ().offset = offset;
@@ -337,6 +372,10 @@ class ColumnIndexBuilder {
337372 }
338373
339374 // / Finalize and return the ColumnIndex from accumulated page info.
375+ // /
376+ // / Automatically detects boundary order from the min_values sequence.
377+ // /
378+ // / @return A fully populated ColumnIndex ready for serialization.
340379 [[nodiscard]] ColumnIndex build_column_index () const {
341380 ColumnIndex ci;
342381 ci.null_pages .reserve (pages_.size ());
@@ -368,6 +407,7 @@ class ColumnIndexBuilder {
368407 }
369408
370409 // / Finalize and return the OffsetIndex from accumulated page info.
410+ // / @return A fully populated OffsetIndex ready for serialization.
371411 [[nodiscard]] OffsetIndex build_offset_index () const {
372412 OffsetIndex oi;
373413 oi.page_locations .reserve (pages_.size ());
@@ -392,6 +432,7 @@ class ColumnIndexBuilder {
392432 [[nodiscard]] size_t num_pages () const { return pages_.size (); }
393433
394434private:
435+ // / Accumulated per-page metadata during building.
395436 struct PageInfo {
396437 std::string min_value;
397438 std::string max_value;
@@ -404,12 +445,7 @@ class ColumnIndexBuilder {
404445
405446 std::vector<PageInfo> pages_;
406447
407- // -------------------------------------------------------------------
408- // Detect boundary order from the min_values sequence.
409- // Returns ASCENDING if all min values are non-decreasing,
410- // DESCENDING if all are non-increasing, UNORDERED otherwise.
411- // A single-page or empty sequence is ASCENDING by convention.
412- // -------------------------------------------------------------------
448+ // / Detect boundary order from the min_values sequence.
413449 [[nodiscard]] static ColumnIndex::BoundaryOrder detect_boundary_order (
414450 const std::vector<std::string>& values) {
415451
0 commit comments