Skip to content

Commit e486243

Browse files
docs: Batch 2+3 Doxygen — core support, encodings, compression (15 headers)
Add /// documentation to all public symbols in:
- Core support: memory.hpp, statistics.hpp, column_reader.hpp, column_writer.hpp, column_index.hpp, mmap_reader.hpp
- Encodings: rle.hpp, delta.hpp, dictionary.hpp, byte_stream_split.hpp
- Compression: codec.hpp, snappy.hpp, zstd.hpp, lz4.hpp, gzip.hpp

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 2f37f20 commit e486243

15 files changed

Lines changed: 1446 additions & 517 deletions

include/signet/column_index.hpp

Lines changed: 111 additions & 75 deletions
Original file line number | Diff line number | Diff line change
@@ -2,21 +2,20 @@
22
// Copyright 2026 Johnson Ogundeji
33
#pragma once
44

5-
// ---------------------------------------------------------------------------
6-
// column_index.hpp -- ColumnIndex, OffsetIndex, and ColumnIndexBuilder
7-
//
8-
// Per-column-chunk structures for predicate pushdown and random page access.
9-
// Written after the row groups but before the footer in the Parquet file.
10-
//
11-
// ColumnIndex stores per-page min/max statistics, enabling readers to skip
12-
// pages that cannot contain matching data. OffsetIndex stores page locations
13-
// for efficient random access into column chunks.
14-
//
15-
// Thrift field IDs follow the canonical parquet.thrift specification:
16-
// ColumnIndex: 1=null_pages, 2=min_values, 3=max_values, 4=boundary_order, 5=null_counts
17-
// OffsetIndex: 1=page_locations (list<PageLocation>)
18-
// PageLocation: 1=offset, 2=compressed_page_size, 3=first_row_index
19-
// ---------------------------------------------------------------------------
5+
/// @file column_index.hpp
6+
/// @brief ColumnIndex, OffsetIndex, and ColumnIndexBuilder for predicate pushdown.
7+
///
8+
/// Per-column-chunk structures for predicate pushdown and random page access.
9+
/// Written after the row groups but before the footer in the Parquet file.
10+
///
11+
/// ColumnIndex stores per-page min/max statistics, enabling readers to skip
12+
/// pages that cannot contain matching data. OffsetIndex stores page locations
13+
/// for efficient random access into column chunks.
14+
///
15+
/// Thrift field IDs follow the canonical parquet.thrift specification:
16+
/// - ColumnIndex: 1=null_pages, 2=min_values, 3=max_values, 4=boundary_order, 5=null_counts
17+
/// - OffsetIndex: 1=page_locations (list of PageLocation)
18+
/// - PageLocation: 1=offset, 2=compressed_page_size, 3=first_row_index
2019

2120
#include "signet/thrift/compact.hpp"
2221

@@ -26,14 +25,18 @@
2625

2726
namespace signet::forge {
2827

29-
// ---------------------------------------------------------------------------
30-
// PageLocation -- file offset and size for a single data page
31-
// ---------------------------------------------------------------------------
28+
/// File offset and size descriptor for a single data page.
29+
///
30+
/// Used by OffsetIndex to record the location of each page within the
31+
/// Parquet file, enabling random access into column chunks without
32+
/// sequential scanning.
3233
struct PageLocation {
33-
int64_t offset = 0; // File offset of the page
34-
int32_t compressed_page_size = 0; // Size of the page (compressed bytes)
35-
int64_t first_row_index = 0; // First row in this page (relative to row group)
34+
int64_t offset = 0; ///< Absolute file offset of the page header.
35+
int32_t compressed_page_size = 0; ///< Size of the page in compressed bytes.
36+
int64_t first_row_index = 0; ///< First row in this page (relative to row group).
3637

38+
/// Serialize this PageLocation to a Thrift compact encoder.
39+
/// @param enc The encoder to write to.
3740
void serialize(thrift::CompactEncoder& enc) const {
3841
enc.begin_struct();
3942

@@ -53,6 +56,8 @@ struct PageLocation {
5356
enc.end_struct();
5457
}
5558

59+
/// Deserialize this PageLocation from a Thrift compact decoder.
60+
/// @param dec The decoder to read from.
5661
void deserialize(thrift::CompactDecoder& dec) {
5762
dec.begin_struct();
5863
for (;;) {
@@ -69,12 +74,18 @@ struct PageLocation {
6974
}
7075
};
7176

72-
// ---------------------------------------------------------------------------
73-
// OffsetIndex -- page locations for random access within a column chunk
74-
// ---------------------------------------------------------------------------
77+
/// Page locations for random access within a column chunk.
78+
///
79+
/// Contains a list of PageLocation entries, one per data page in the
80+
/// column chunk. Written to the Parquet file after row groups to enable
81+
/// readers to seek directly to any page.
82+
///
83+
/// @see ColumnIndex (companion structure for predicate pushdown)
7584
struct OffsetIndex {
76-
std::vector<PageLocation> page_locations;
85+
std::vector<PageLocation> page_locations; ///< One entry per data page.
7786

87+
/// Serialize this OffsetIndex to a Thrift compact encoder.
88+
/// @param enc The encoder to write to.
7889
void serialize(thrift::CompactEncoder& enc) const {
7990
enc.begin_struct();
8091

@@ -90,6 +101,8 @@ struct OffsetIndex {
90101
enc.end_struct();
91102
}
92103

104+
/// Deserialize this OffsetIndex from a Thrift compact decoder.
105+
/// @param dec The decoder to read from.
93106
void deserialize(thrift::CompactDecoder& dec) {
94107
dec.begin_struct();
95108
for (;;) {
@@ -111,23 +124,32 @@ struct OffsetIndex {
111124
}
112125
};
113126

114-
// ---------------------------------------------------------------------------
115-
// ColumnIndex -- per-page min/max statistics for predicate pushdown
116-
// ---------------------------------------------------------------------------
127+
/// Per-page min/max statistics for predicate pushdown.
128+
///
129+
/// Stores binary-encoded min/max values for each data page in a column
130+
/// chunk, along with null-page flags, boundary ordering, and optional
131+
/// null counts. Readers use filter_pages() to eliminate pages whose value
132+
/// ranges do not overlap the query predicate.
133+
///
134+
/// @see OffsetIndex (companion for page offsets)
135+
/// @see ColumnIndexBuilder (builder pattern for constructing during writes)
117136
struct ColumnIndex {
118-
std::vector<bool> null_pages; // true if page is all nulls
119-
std::vector<std::string> min_values; // Min value per page (binary encoded)
120-
std::vector<std::string> max_values; // Max value per page (binary encoded)
137+
std::vector<bool> null_pages; ///< True if the corresponding page is all nulls.
138+
std::vector<std::string> min_values; ///< Binary-encoded minimum value per page.
139+
std::vector<std::string> max_values; ///< Binary-encoded maximum value per page.
121140

141+
/// Ordering of min values across pages, used to short-circuit filtering.
122142
enum class BoundaryOrder : int32_t {
123-
UNORDERED = 0,
124-
ASCENDING = 1,
125-
DESCENDING = 2
143+
UNORDERED = 0, ///< Min values have no particular order.
144+
ASCENDING = 1, ///< Min values are non-decreasing across pages.
145+
DESCENDING = 2 ///< Min values are non-increasing across pages.
126146
};
127-
BoundaryOrder boundary_order = BoundaryOrder::UNORDERED;
147+
BoundaryOrder boundary_order = BoundaryOrder::UNORDERED; ///< Boundary order of min values.
128148

129-
std::vector<int64_t> null_counts; // Null count per page (optional)
149+
std::vector<int64_t> null_counts; ///< Null count per page (optional).
130150

151+
/// Serialize this ColumnIndex to a Thrift compact encoder.
152+
/// @param enc The encoder to write to.
131153
void serialize(thrift::CompactEncoder& enc) const {
132154
enc.begin_struct();
133155

@@ -173,6 +195,8 @@ struct ColumnIndex {
173195
enc.end_struct();
174196
}
175197

198+
/// Deserialize this ColumnIndex from a Thrift compact decoder.
199+
/// @param dec The decoder to read from.
176200
void deserialize(thrift::CompactDecoder& dec) {
177201
dec.begin_struct();
178202
for (;;) {
@@ -224,20 +248,22 @@ struct ColumnIndex {
224248
dec.end_struct();
225249
}
226250

227-
// -------------------------------------------------------------------
228-
// filter_pages -- predicate pushdown filter
229-
//
230-
// Given a range [min_val, max_val] (binary-encoded, same encoding
231-
// as min_values/max_values), returns a vector of page indices that
232-
// might contain matching data. A page is excluded only if its max
233-
// is strictly less than min_val or its min is strictly greater than
234-
// max_val. Null pages are always excluded.
235-
//
236-
// Binary comparison uses lexicographic byte ordering, which is
237-
// correct for unsigned integer types and strings. For signed types,
238-
// the caller should ensure values use a comparison-safe binary
239-
// encoding (e.g., Parquet's standard signed-magnitude encoding).
240-
// -------------------------------------------------------------------
251+
/// Filter pages by a value range for predicate pushdown.
252+
///
253+
/// Given a range [@p min_val, @p max_val] (binary-encoded, same encoding
254+
/// as min_values/max_values), returns page indices that might contain
255+
/// matching data. A page is excluded only if its max is strictly less
256+
/// than @p min_val or its min is strictly greater than @p max_val.
257+
/// All-null pages are always excluded.
258+
///
259+
/// @note Binary comparison uses lexicographic byte ordering, which matches
260+
/// value order for strings and fixed-width big-endian unsigned integers.
261+
/// For other types (e.g. signed or little-endian integers), the caller
262+
/// should ensure values use a comparison-safe binary encoding.
263+
///
264+
/// @param min_val Lower bound of the query range (binary-encoded).
265+
/// @param max_val Upper bound of the query range (binary-encoded).
266+
/// @return A vector of page indices (0-based) that may contain matches.
241267
[[nodiscard]] std::vector<size_t> filter_pages(
242268
const std::string& min_val,
243269
const std::string& max_val) const {
@@ -268,23 +294,26 @@ struct ColumnIndex {
268294
}
269295
};
270296

271-
// ---------------------------------------------------------------------------
272-
// ColumnIndexBuilder -- accumulates per-page statistics during writing
273-
//
274-
// Usage:
275-
// ColumnIndexBuilder builder;
276-
// for (each page being written) {
277-
// builder.start_page();
278-
// builder.set_min(...);
279-
// builder.set_max(...);
280-
// builder.set_null_page(false);
281-
// builder.set_null_count(0);
282-
// builder.set_first_row_index(row_offset);
283-
// builder.set_page_location(file_offset, compressed_size);
284-
// }
285-
// ColumnIndex ci = builder.build_column_index();
286-
// OffsetIndex oi = builder.build_offset_index();
287-
// ---------------------------------------------------------------------------
297+
/// Builder that accumulates per-page statistics during column writing.
298+
///
299+
/// Usage:
300+
/// @code
301+
/// ColumnIndexBuilder builder;
302+
/// for (each page being written) {
303+
/// builder.start_page();
304+
/// builder.set_min(...);
305+
/// builder.set_max(...);
306+
/// builder.set_null_page(false);
307+
/// builder.set_null_count(0);
308+
/// builder.set_first_row_index(row_offset);
309+
/// builder.set_page_location(file_offset, compressed_size);
310+
/// }
311+
/// ColumnIndex ci = builder.build_column_index();
312+
/// OffsetIndex oi = builder.build_offset_index();
313+
/// @endcode
314+
///
315+
/// @see ColumnIndex (output of build_column_index())
316+
/// @see OffsetIndex (output of build_offset_index())
288317
class ColumnIndexBuilder {
289318
public:
290319
/// Start a new page. Must be called before set_min/set_max etc.; the setters are silently ignored while no page exists.
@@ -293,42 +322,48 @@ class ColumnIndexBuilder {
293322
}
294323

295324
/// Record the minimum value for the current page (binary-encoded).
325+
/// @param min_val The binary-encoded minimum value.
296326
void set_min(const std::string& min_val) {
297327
if (!pages_.empty()) {  // silently ignored when no page entry exists yet
298328
pages_.back().min_value = min_val;  // updates the last (current) entry in pages_
299329
}
300330
}
301331

302332
/// Record the maximum value for the current page (binary-encoded).
333+
/// @param max_val The binary-encoded maximum value.
303334
void set_max(const std::string& max_val) {
304335
if (!pages_.empty()) {  // silently ignored when no page entry exists yet
305336
pages_.back().max_value = max_val;  // updates the last (current) entry in pages_
306337
}
307338
}
308339

309340
/// Mark the current page as all-nulls (or not).
341+
/// @param is_null True if the page contains only null values.
310342
void set_null_page(bool is_null) {
311343
if (!pages_.empty()) {  // silently ignored when no page entry exists yet
312344
pages_.back().null_page = is_null;  // updates the last (current) entry in pages_
313345
}
314346
}
315347

316348
/// Record the null count for the current page.
349+
/// @param count Number of null values in the current page.
317350
void set_null_count(int64_t count) {
318351
if (!pages_.empty()) {  // silently ignored when no page entry exists yet
319352
pages_.back().null_count = count;  // updates the last (current) entry in pages_
320353
}
321354
}
322355

323356
/// Record the first row index for the current page (relative to row group).
357+
/// @param row_index Zero-based row index of the first row in this page.
324358
void set_first_row_index(int64_t row_index) {
325359
if (!pages_.empty()) {  // silently ignored when no page entry exists yet
326360
pages_.back().first_row_index = row_index;  // updates the last (current) entry in pages_
327361
}
328362
}
329363

330-
/// Record the page location (file offset and compressed size) for the
331-
/// current page.
364+
/// Record the page location (file offset and compressed size) for the current page.
365+
/// @param offset Absolute file offset of the page.
366+
/// @param compressed_size Page size in compressed bytes.
332367
void set_page_location(int64_t offset, int32_t compressed_size) {
333368
if (!pages_.empty()) {
334369
pages_.back().offset = offset;
@@ -337,6 +372,10 @@ class ColumnIndexBuilder {
337372
}
338373

339374
/// Finalize and return the ColumnIndex from accumulated page info.
375+
///
376+
/// Automatically detects boundary order from the min_values sequence.
377+
///
378+
/// @return A fully populated ColumnIndex ready for serialization.
340379
[[nodiscard]] ColumnIndex build_column_index() const {
341380
ColumnIndex ci;
342381
ci.null_pages.reserve(pages_.size());
@@ -368,6 +407,7 @@ class ColumnIndexBuilder {
368407
}
369408

370409
/// Finalize and return the OffsetIndex from accumulated page info.
410+
/// @return A fully populated OffsetIndex ready for serialization.
371411
[[nodiscard]] OffsetIndex build_offset_index() const {
372412
OffsetIndex oi;
373413
oi.page_locations.reserve(pages_.size());
@@ -392,6 +432,7 @@ class ColumnIndexBuilder {
392432
[[nodiscard]] size_t num_pages() const { return pages_.size(); }  // count of PageInfo entries currently held in pages_
393433

394434
private:
435+
/// Accumulated per-page metadata during building.
395436
struct PageInfo {
396437
std::string min_value;
397438
std::string max_value;
@@ -404,12 +445,7 @@ class ColumnIndexBuilder {
404445

405446
std::vector<PageInfo> pages_;
406447

407-
// -------------------------------------------------------------------
408-
// Detect boundary order from the min_values sequence.
409-
// Returns ASCENDING if all min values are non-decreasing,
410-
// DESCENDING if all are non-increasing, UNORDERED otherwise.
411-
// A single-page or empty sequence is ASCENDING by convention.
412-
// -------------------------------------------------------------------
448+
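/// Detect boundary order from the min_values sequence: ASCENDING if all values are non-decreasing (an empty or single-page sequence is ASCENDING by convention), DESCENDING if all are non-increasing, UNORDERED otherwise.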
413449
[[nodiscard]] static ColumnIndex::BoundaryOrder detect_boundary_order(
414450
const std::vector<std::string>& values) {
415451

0 commit comments

Comments
 (0)