Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
"dependencies": {
"@types/react": "^19.2.14",
"better-sqlite3": "^12.6.2",
"chrono-node": "^2.9.0",
"exceljs": "^4.4.0",
"fast-xml-parser": "^5.3.7",
"ink": "^6.8.0",
Expand All @@ -47,6 +48,7 @@
"papaparse": "^5.5.3",
"pg": "^8.18.0",
"react": "^19.2.4",
"string-width": "^8.2.0"
"string-width": "^8.2.0",
"woothee": "^1.11.1"
}
}
97 changes: 97 additions & 0 deletions src/aggregators/Ord2Bivariate.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import type { Aggregator } from "../Aggregator.ts";
import type { Record } from "../Record.ts";
import type { JsonValue } from "../types/json.ts";
import { findKey } from "../KeySpec.ts";
import { aggregatorRegistry } from "../Aggregator.ts";

// Running sums for one group: [n, sum(x), sum(y), sum(x*y), sum(x^2), sum(y^2)]
type Ord2BivState = [number, number, number, number, number, number];

/**
 * Second-order bivariate statistics aggregator.
 * Computes covariance, correlation, and linear regression parameters
 * between two fields using a single pass over running sums.
 *
 * Analogous to App::RecordStream::Aggregator::Ord2Bivariate in Perl.
 */
export class Ord2BivariateAggregator implements Aggregator<Ord2BivState | null> {
  fieldX: string;
  fieldY: string;

  /**
   * @param fieldX key spec of the independent variable (x)
   * @param fieldY key spec of the dependent variable (y)
   */
  constructor(fieldX: string, fieldY: string) {
    this.fieldX = fieldX;
    this.fieldY = fieldY;
  }

  /** The empty state is `null` until the first usable record arrives. */
  initial(): Ord2BivState | null {
    return null;
  }

  /**
   * Fold one record into the running sums.
   *
   * Records where either field resolves to `undefined` or `null` are skipped
   * and the state is returned unchanged. Values are coerced with `Number()`,
   * so a non-numeric value contributes `NaN` to the sums.
   *
   * @param state sums accumulated so far, or `null` if nothing was seen yet
   * @param record record to read `fieldX` / `fieldY` from
   * @returns a new state tuple (the input tuple is not mutated)
   */
  combine(state: Ord2BivState | null, record: Record): Ord2BivState | null {
    const vx = findKey(record.dataRef(), this.fieldX, true);
    const vy = findKey(record.dataRef(), this.fieldY, true);
    if (vx === undefined || vx === null || vy === undefined || vy === null) return state;
    const x = Number(vx);
    const y = Number(vy);
    const mapped: Ord2BivState = [1, x, y, x * y, x * x, y * y];
    if (state === null) return mapped;
    return [
      state[0] + mapped[0],
      state[1] + mapped[1],
      state[2] + mapped[2],
      state[3] + mapped[3],
      state[4] + mapped[4],
      state[5] + mapped[5],
    ];
  }

  /**
   * Turn the running sums into the final statistics object.
   *
   * @param state accumulated sums, or `null` when no record contributed
   * @returns `null` for an empty group; otherwise an object with `count`,
   *   population `covariance`, `correlation` (`null` when either field has no
   *   spread), and - only when x has spread - the regression parameters
   *   `alpha` and `beta` of `y = alpha + beta * x`.
   */
  squish(state: Ord2BivState | null): JsonValue {
    if (state === null) return null;
    const [n, sumX, sumY, sumXY, sumX2, sumY2] = state;

    const meanX = sumX / n;
    const meanY = sumY / n;

    // Covariance: E[XY] - E[X]*E[Y]
    const covariance = sumXY / n - meanX * meanY;

    // n^2-scaled covariance and variances. The very same quantities are used
    // for the guards and for the divisions, so a guard can never pass while
    // the actual divisor is zero or NaN.
    const covN = sumXY * n - sumX * sumY;
    const spreadX = sumX2 * n - sumX ** 2;
    const spreadY = sumY2 * n - sumY ** 2;

    // Each spread is mathematically >= 0; a non-positive value means the field
    // is constant (possibly with rounding error). Both must be checked
    // individually: two slightly negative spreads multiply to a positive
    // number and would otherwise yield a meaningless correlation.
    const corrDenom = spreadX > 0 && spreadY > 0 ? Math.sqrt(spreadX * spreadY) : 0;
    // Clamp so rounding error cannot report |r| marginally above 1.
    const correlation = corrDenom > 0 ? Math.max(-1, Math.min(1, covN / corrDenom)) : null;

    // Linear regression: y = alpha + beta * x (undefined when x has no spread)
    const beta = spreadX > 0 ? covN / spreadX : null;
    const alpha = beta !== null ? (sumY - beta * sumX) / n : null;

    const result: { [key: string]: JsonValue } = {
      count: n,
      covariance,
      correlation,
    };

    if (alpha !== null && beta !== null) {
      result["alpha"] = alpha;
      result["beta"] = beta;
    }

    return result;
  }
}

// Self-registration: importing this module makes the aggregator available
// under the name "ord2biv" (alias "ord2bivariate"). The factory receives the
// two field names in the order <field1>,<field2> shown in the usage text and
// passes them on as fieldX and fieldY.
aggregatorRegistry.register("ord2biv", {
create: (fieldX: string, fieldY: string) => new Ord2BivariateAggregator(fieldX, fieldY),
// presumably the accepted argument counts: exactly two fields — confirm against the registry
argCounts: [2],
shortUsage: "compute second-order bivariate statistics for two fields",
longUsage:
"Usage: ord2biv,<field1>,<field2>\n" +
" Compute covariance, correlation, and linear regression parameters\n" +
" between two fields.",
aliases: ["ord2bivariate"],
});
82 changes: 82 additions & 0 deletions src/aggregators/Ord2Univariate.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import type { Aggregator } from "../Aggregator.ts";
import type { Record } from "../Record.ts";
import type { JsonValue } from "../types/json.ts";
import { findKey } from "../KeySpec.ts";
import { aggregatorRegistry } from "../Aggregator.ts";

// Running power sums for one group: [n, sum(x), sum(x^2), sum(x^3), sum(x^4)]
type Ord2UniState = [number, number, number, number, number];

/**
 * Second-order univariate statistics aggregator.
 * Computes count, mean, variance, standard deviation, skewness, and kurtosis
 * for a single field using a single pass over running power sums.
 *
 * Analogous to App::RecordStream::Aggregator::Ord2Univariate in Perl.
 */
export class Ord2UnivariateAggregator implements Aggregator<Ord2UniState | null> {
  field: string;

  /** @param field key spec of the field to summarise */
  constructor(field: string) {
    this.field = field;
  }

  /** The empty state is `null` until the first usable record arrives. */
  initial(): Ord2UniState | null {
    return null;
  }

  /**
   * Fold one record into the running power sums.
   *
   * Records where the field resolves to `undefined` or `null` are skipped and
   * the state is returned unchanged. The value is coerced with `Number()`, so
   * a non-numeric value contributes `NaN` to the sums.
   *
   * @param state sums accumulated so far, or `null` if nothing was seen yet
   * @param record record to read the field from
   * @returns a new state tuple (the input tuple is not mutated)
   */
  combine(state: Ord2UniState | null, record: Record): Ord2UniState | null {
    const value = findKey(record.dataRef(), this.field, true);
    if (value === undefined || value === null) return state;
    const x = Number(value);
    const mapped: Ord2UniState = [1, x, x * x, x * x * x, x * x * x * x];
    if (state === null) return mapped;
    return [
      state[0] + mapped[0],
      state[1] + mapped[1],
      state[2] + mapped[2],
      state[3] + mapped[3],
      state[4] + mapped[4],
    ];
  }

  /**
   * Turn the running sums into the final statistics object.
   *
   * @param state accumulated sums, or `null` when no record contributed
   * @returns `null` for an empty group; otherwise an object with `count`,
   *   `mean`, population `variance` and `stddev`, plus `skewness` and
   *   `kurtosis` (non-excess) when the variance is strictly positive.
   */
  squish(state: Ord2UniState | null): JsonValue {
    if (state === null) return null;
    const [n, sumX, sumX2, sumX3, sumX4] = state;

    const mean = sumX / n;
    // E[X^2] - mean^2 is mathematically >= 0, but cancellation can round it
    // slightly below zero for (near-)constant data, which would make the
    // standard deviation NaN. Math.max keeps a genuine NaN (bad input) intact.
    const variance = Math.max(0, sumX2 / n - mean * mean);
    const stddev = Math.sqrt(variance);

    const result: { [key: string]: JsonValue } = {
      count: n,
      mean,
      variance,
      stddev,
    };

    // Skewness and kurtosis require variance > 0
    if (variance > 0) {
      // E[(X - mean)^3] = E[X^3] - 3*mean*E[X^2] + 2*mean^3
      const m3 = sumX3 / n - 3 * mean * (sumX2 / n) + 2 * mean * mean * mean;
      result["skewness"] = m3 / (stddev * stddev * stddev);

      // E[(X - mean)^4] = E[X^4] - 4*mean*E[X^3] + 6*mean^2*E[X^2] - 3*mean^4
      const m4 =
        sumX4 / n - 4 * mean * (sumX3 / n) + 6 * mean * mean * (sumX2 / n) - 3 * mean * mean * mean * mean;
      result["kurtosis"] = m4 / (variance * variance);
    }

    return result;
  }
}

// Self-registration: importing this module makes the aggregator available
// under the name "ord2uni" (alias "ord2univariate"). The factory receives the
// single field name and builds a fresh aggregator for it.
aggregatorRegistry.register("ord2uni", {
create: (field: string) => new Ord2UnivariateAggregator(field),
// presumably the accepted argument counts: exactly one field — confirm against the registry
argCounts: [1],
shortUsage: "compute second-order univariate statistics for a field",
longUsage:
"Usage: ord2uni,<field>\n" +
" Compute count, mean, variance, standard deviation, skewness,\n" +
" and kurtosis for the specified field.",
aliases: ["ord2univariate"],
});
2 changes: 2 additions & 0 deletions src/aggregators/registry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,7 @@ import "./FirstRecord.ts";
import "./LastRecord.ts";
import "./RecordForMaximum.ts";
import "./RecordForMinimum.ts";
import "./Ord2Univariate.ts";
import "./Ord2Bivariate.ts";

export { aggregatorRegistry, makeAggregators } from "../Aggregator.ts";
72 changes: 57 additions & 15 deletions src/clumpers/Options.ts
Original file line number Diff line number Diff line change
Expand Up @@ -132,30 +132,38 @@ export class ClumperOptions {
keyParts.push(String(val ?? ""));
}

const groupKey = keyParts.join("\x1E");

if (!this.groups) {
this.groups = new Map();
}

let cookie = this.groups.get(groupKey);
if (cookie === undefined) {
// Handle LRU eviction if keySize is set and NOT in perfect mode
if (!this.keyPerfect && this.keySize !== null && this.groups.size >= this.keySize) {
const oldestKey = this.groupOrder.shift()!;
const oldCookie = this.groups.get(oldestKey);
if (oldCookie !== undefined) {
this.callback.clumperCallbackEnd(oldCookie);
this.groups.delete(oldestKey);
// In cube mode, generate all 2^N combinations of actual values and "ALL"
const combos = this.keyCube
? this.cubeKeyValues(keySpecs, keyValues)
: [{ keyValues, keyParts }];

for (const combo of combos) {
const groupKey = combo.keyParts.join("\x1E");

let cookie = this.groups.get(groupKey);
if (cookie === undefined) {
// Handle LRU eviction if keySize is set and NOT in perfect mode
if (!this.keyPerfect && this.keySize !== null && this.groups.size >= this.keySize) {
const oldestKey = this.groupOrder.shift()!;
const oldCookie = this.groups.get(oldestKey);
if (oldCookie !== undefined) {
this.callback.clumperCallbackEnd(oldCookie);
this.groups.delete(oldestKey);
}
}

cookie = this.callback.clumperCallbackBegin(combo.keyValues);
this.groups.set(groupKey, cookie);
this.groupOrder.push(groupKey);
}

cookie = this.callback.clumperCallbackBegin(keyValues);
this.groups.set(groupKey, cookie);
this.groupOrder.push(groupKey);
this.callback.clumperCallbackPushRecord(cookie, record);
}

this.callback.clumperCallbackPushRecord(cookie, record);
return true;
}

Expand Down Expand Up @@ -183,6 +191,40 @@ export class ClumperOptions {
}
}

/**
* Generate all 2^N combinations of actual key values and "ALL" for cube mode.
*
* Each of the N key specs is either kept at its actual value or replaced by
* the literal string "ALL"; bit i of the loop mask selects the replacement
* for keySpecs[i]. Combinations are produced in ascending mask order, so the
* first entry holds only actual values and the last entry is "ALL" for every
* key.
*
* @param keySpecs key specs, in the order the key parts should appear
* @param keyValues actual values of the current record, indexed by key spec
* @returns one entry per combination: `keyValues` maps every spec to its
* value, "ALL", or null when the spec has no value; `keyParts` holds the
* stringified parts in keySpecs order (a missing value becomes "")
*
* NOTE(review): an actual value equal to the string "ALL" produces the same
* key part as the rollup marker, so the two are indistinguishable here.
* NOTE(review): `1 << n` is a 32-bit operation; with 31 or more key specs the
* loop bound wraps (negative at n = 31) and combinations are silently missing.
*/
cubeKeyValues(
keySpecs: string[],
keyValues: { [key: string]: JsonValue }
): Array<{ keyValues: { [key: string]: JsonValue }; keyParts: string[] }> {
const n = keySpecs.length;
const combos: Array<{ keyValues: { [key: string]: JsonValue }; keyParts: string[] }> = [];

// Iterate all 2^N bitmasks; mask 0 keeps every actual value
for (let mask = 0; mask < (1 << n); mask++) {
const comboValues: { [key: string]: JsonValue } = {};
const comboParts: string[] = [];

for (let i = 0; i < n; i++) {
const spec = keySpecs[i]!;
if (mask & (1 << i)) {
// Bit i set: replace this key with "ALL"
comboValues[spec] = "ALL";
comboParts.push("ALL");
} else {
// Bit i clear: use the actual value (null / "" when absent)
comboValues[spec] = keyValues[spec] ?? null;
comboParts.push(String(keyValues[spec] ?? ""));
}
}

combos.push({ keyValues: comboValues, keyParts: comboParts });
}

return combos;
}

/**
* Return the configured key size, or null when none is set.
* NOTE(review): elsewhere in this class eviction is only attempted when
* keySize is not null, so null appears to mean "no limit" — confirm.
*/
getKeySize(): number | null {
return this.keySize;
}
Expand Down
11 changes: 11 additions & 0 deletions src/deaggregators/registry.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/**
* Deaggregator registry - central place for registering and looking up
* deaggregators by name.
*
* Importing this module ensures all deaggregator implementations are registered.
*/

// Import all deaggregator implementations to trigger their self-registration
import "./Split.ts";
import "./Unarray.ts";
import "./Unhash.ts";
Loading
Loading