diff --git a/.gitignore b/.gitignore index 91babc8..9f8f1f6 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ examples/basic/basic examples/otel-test/otel-test *.exe -temp/ \ No newline at end of file +temp/ +examples/clickhouse-simulator/* \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..33907c8 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,107 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Commands + +```bash +# Test (race detector + coverage, all packages) +make test +# or directly: +go test -v -race -cover ./... + +# Run a single test +go test -v -run TestName ./... + +# Lint (golangci-lint must be installed) +make lint + +# Format +make fmt + +# Dependencies +make deps +``` + +## Architecture + +Ion is a Go observability library. The module path is `github.com/JupiterMetaLabs/ion`. + +### Public API (stable, `internal/*` has no guarantees) + +| File | Role | +|------|------| +| `ion.go` | `*Ion` struct — root observability instance; `New()`, `Child()`, `Named()`, `With()`, `Shutdown()` | +| `logger.go` | `Logger` interface | +| `logger_impl.go` | `zapLogger` struct — internal Zap wrapper, `prepareFields()`, field conversion | +| `config.go` | Type aliases into `internal/config`; `Default()`, `Development()` builders | +| `fields.go` | `Field` struct and typed constructors (`String`, `Int64`, `Err`, etc.) | +| `context.go` | Context helpers (`WithRequestID`, `extractContextZapFields`, etc.) | +| `tracer.go` | `Tracer`/`Span` interfaces; `noopTracer` | +| `attrs.go` | `Attr` type alias, OTEL status constants | +| `fields/blockchain.go` | Domain-specific field constructors (`TxHash`, `BlockHeight`, etc.) | +| `middleware/ionhttp/` | HTTP middleware for context propagation | +| `middleware/iongrpc/` | gRPC stats handler for context propagation | + +### Internal packages + +| Package | Role | +|---------|------| +| `internal/config/config.go` | All config structs (`Config`, `OTELConfig`, `FileConfig`, …), `Validate()`, `Default()`, `Development()`, `NewFileWriter()` | +| `internal/core/logger_factory.go` | `NewZapLogger()` — assembles all Zap cores (console, file, OTEL) into a `zapcore.Tee` | +| `internal/core/otel.go` | `SetupLogProvider()`, `SetupTracerProvider()`, OTLP exporter construction, endpoint/auth helpers | +| `internal/core/meter.go` | `SetupMeterProvider()`, OTLP metrics exporter construction | +| `internal/core/filter.go` | `filteringCore` — strips internal sentinel keys from log output | +| `internal/core/enforcer.go` | `levelEnforcer` — overrides a core's level check (needed for the OTEL zap bridge which defaults to Info) | +| `internal/core/constants.go` | `SentinelKey = "__ion_ctx__"`, `SystemFieldPrefix = "__ion_"` | + +### Log pipeline (critical to understand before modifying) + +``` +ion.Info(ctx, msg, fields...) + └─ zapLogger.prepareFields(ctx, fields) + ├─ toZapFields(fields) // Field → zap.Field, zero-alloc for primitives + └─ extractContextZapFields(ctx) // trace_id, span_id, request_id, user_id + + zap.Reflect(SentinelKey, ctx) // carries raw ctx into the core for OTEL bridge + └─ zap.Logger.Info(msg, zapFields...) + └─ zapcore.Tee [ + filteringCore(consoleCore) // strips SentinelKey before writing to stdout/stderr + filteringCore(fileCore) // strips SentinelKey before writing to file + filteringCore(levelEnforcer(otelzapCore)) // OTEL bridge uses SentinelKey to + ] // extract TraceID/SpanID before it's stripped +``` + +**`SentinelKey` (`"__ion_ctx__"`)** is the mechanism for passing `context.Context` through Zap's field system so the `otelzap` bridge can call `trace.SpanContextFromContext()` inside the core, while `filteringCore` prevents the raw context from leaking into console/file output. + +### Adding a new log sink (the pattern to follow for ClickHouse) + +All sinks are assembled in `internal/core/logger_factory.go` inside `NewZapLogger()`. The `cores` slice is built and passed to `zapcore.NewTee`. Steps: + +1. Add config struct fields to `internal/config/config.go` (follow `FileConfig` pattern). +2. Alias the new type in `config.go` (public package), add a fluent builder on `Config`. +3. Create `internal/core/.go` implementing or wrapping `zapcore.Core`. +4. In `NewZapLogger()`: build the core if enabled, wrap it in `NewFilteringCore(core, SentinelKey)`, append to `cores`. +5. If the sink needs graceful shutdown (connections, flushers), add it to `ZapFactoryResult` and call shutdown in `zapLogger.Shutdown()`. +6. Wire any new config-level minimum-level calculation into the `minLevel` block in `NewZapLogger()`. + +`filteringCore` **must** wrap every new core to strip the sentinel; otherwise raw `context.Context` values appear in output. + +### `*Ion` vs `Logger` vs `zapLogger` + +- `zapLogger` — concrete, unexported. Owns the `*zap.Logger`, `zap.AtomicLevel`, and `*core.LogProvider`. Implements `Logger`. +- `*Ion` — exported. Embeds `*zapLogger` (promoting all `Logger` methods). Also holds `tracerProvider` and `meterProvider`. The concrete type behind every `Logger` interface value returned by the public API. +- `Child()` returns `*Ion` directly (caller needs Tracer/Meter). `Named()`/`With()` return `Logger` (interface) — internally still `*Ion`. +- All children share the same `zap.AtomicLevel` pointer → `SetLevel()` on any instance propagates everywhere. +- Only the root `*Ion` from `New()` should be shut down. `Shutdown()` on a child tears down shared providers. + +### `Critical()` — no-exit Fatal + +`Critical()` calls `zap.Fatal()` but `New()` installs `zap.WithFatalHook(noExitHook{})`, which is a no-op hook. This emits a `FATAL`-level log entry and returns control to the caller — it never calls `os.Exit`. + +### Config inheritance + +`Tracing` and `Metrics` configs inherit `Endpoint`, `Protocol`, `Insecure`, `Username`, `Password`, `Headers`, `Timeout`, `BatchSize`, and `ExportInterval` from `OTEL` config when their own values are empty. This inheritance is applied in `ion.go` `New()` before calling the setup functions. + +## Linting + +golangci-lint v2 config is in `.golangci.yml`. Active linters: `govet`, `ineffassign`, `unused`, `nolintlint`, `staticcheck`, `errcheck`, `gosec`. Every `//nolint` directive must name the specific linter and include a reason comment. Import ordering enforced by `goimports` with local prefix `github.com/JupiterMetaLabs/ion`. diff --git a/README.md b/README.md index f4c764e..2f565e3 100644 --- a/README.md +++ b/README.md @@ -296,6 +296,7 @@ Ion uses a comprehensive configuration struct for behavior control. This maps 1: | `OTEL` | `OTELConfig` | `Enabled: false` | Configuration for remote OpenTelemetry logging. | | `Tracing` | `TracingConfig` | `Enabled: false` | Configuration for distributed tracing. | | `Metrics` | `MetricsConfig` | `Enabled: false` | Configuration for OpenTelemetry metrics. | +| `ClickHouse` | `ClickHouseConfig` | `Enabled: false` | Configuration for ClickHouse analytics log sink. | ### Console Configuration (`ion.ConsoleConfig`) @@ -348,6 +349,77 @@ Controls the OpenTelemetry **Trace** Provider. Empty fields inherit from `OTELCo | `Username` | `string` | `""` | Inherits `OTEL.Username` if empty. | | `Password` | `string` | `""` | Inherits `OTEL.Password` if empty. | +### ClickHouse Configuration (`ion.ClickHouseConfig`) + +Ion can write every log entry to a ClickHouse table in parallel with console/file/OTEL output. The sink is fully asynchronous — log calls return immediately and a background goroutine batches rows to ClickHouse. This enables fast analytical queries over log data (e.g., "error rate per validator in the last 5 minutes") that are not possible with Loki. + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `Enabled` | `bool` | `false` | Enables the ClickHouse sink. | +| `DSN` | `string` | `""` | Connection string. **Required when enabled.** `"http://user:pass@host:8123/db"` or `"clickhouse://user:pass@host:9000/db"`. | +| `Table` | `string` | `"ion_logs"` | Target table name. Must be a valid unquoted SQL identifier. | +| `Level` | `string` | `""` | Minimum level for this sink. Inherits global `Level` if empty. | +| `BatchSize` | `int` | `1000` | Max rows per flush. Larger batches → fewer TCP writes, higher latency. | +| `FlushInterval` | `Duration` | `5s` | How often the background flusher sends pending rows. | +| `ChannelBuffer` | `int` | `10000` | Async queue depth. Entries are dropped (not blocked) when full. | +| `AutoSchema` | `bool` | `false` | Run `CREATE TABLE IF NOT EXISTS` on startup. Set `true` for dev; manage DDL yourself in production. | +| `DialTimeout` | `Duration` | `10s` | Connection dial timeout. | +| `WriteTimeout` | `Duration` | `30s` | Per-flush write timeout. | +| `MaxOpenConns` | `int` | `5` | Connection pool size. | +| `MaxIdleConns` | `int` | `5` | Idle connections kept open. | +| `ConnMaxLifetime` | `Duration` | `1h` | Max connection reuse duration. | + +#### Table Schema + +Ion creates the following table when `AutoSchema = true`. You can also run the DDL manually before deployment: + +```sql +CREATE TABLE IF NOT EXISTS ion_logs +( + timestamp DateTime64(9, 'UTC'), -- nanosecond precision + level LowCardinality(String), -- dictionary-encoded, fast WHERE + service LowCardinality(String), + version LowCardinality(String), + logger String, + message String, + trace_id String, + span_id String, + request_id String, + user_id String, + caller String, + str_fields Map(String, String), -- ion.String() fields + int_fields Map(String, Int64), -- ion.Int64(), ion.Uint64() fields + flt_fields Map(String, Float64), -- ion.Float64() fields + bool_fields Map(String, UInt8), -- ion.Bool() fields (0/1) + extra String -- JSON bag for errors, structs +) +ENGINE = MergeTree() +PARTITION BY toYYYYMM(timestamp) +ORDER BY (service, level, timestamp) +TTL timestamp + INTERVAL 30 DAY DELETE +SETTINGS index_granularity = 8192; +``` + +**Schema design:** `LowCardinality` on `level`/`service`/`version` gives ~10× storage reduction for high-cardinality repeated strings. Typed map columns (`int_fields`, `flt_fields`) allow index pushdown on `WHERE int_fields['block_height'] > 19000000` without string casting. The `extra` column is a JSON escape hatch for complex Go types (errors, structs) and is not indexed. + +#### Monitoring Back-Pressure + +```go +app, _, _ := ion.New(cfg) + +// DroppedCount returns the total entries dropped due to a full channel buffer. +// A rising value means the buffer is filling faster than the flusher can drain it. +dropped := app.DroppedCount() +``` + +Tuning guide: + +| Symptom | Cause | Action | +|---------|-------|--------| +| `DroppedCount()` rising | Buffer filling faster than it is flushed | Decrease `FlushInterval` (flush more often), increase `BatchSize` (flush more rows per write), and increase `ChannelBuffer` (more room before drops occur) | +| Flush errors in stderr | ClickHouse unreachable | Fix connectivity; rows during downtime are lost | +| Memory growth | `ChannelBuffer` too large | Decrease `ChannelBuffer` | + ### Metrics Configuration (`ion.MetricsConfig`) Controls the OpenTelemetry **Metrics** Provider (OTLP Push). Empty fields inherit from `OTELConfig`. diff --git a/config.go b/config.go index 5a9fc4b..a897878 100644 --- a/config.go +++ b/config.go @@ -1,6 +1,7 @@ package ion import ( + clickhouseconfig "github.com/JupiterMetaLabs/ion/internal/clickhouse/config" "github.com/JupiterMetaLabs/ion/internal/config" ) @@ -23,6 +24,9 @@ type TracingConfig = config.TracingConfig // MetricsConfig configures OpenTelemetry metrics export. type MetricsConfig = config.MetricsConfig +// ClickHouseConfig configures the ClickHouse log sink. +type ClickHouseConfig = clickhouseconfig.Config + // Default returns a Config with sensible production defaults. func Default() Config { return config.Default() diff --git a/docs/CLICKHOUSE_DESIGN.md b/docs/CLICKHOUSE_DESIGN.md new file mode 100644 index 0000000..f67d504 --- /dev/null +++ b/docs/CLICKHOUSE_DESIGN.md @@ -0,0 +1,465 @@ +# ClickHouse Log Sink — Design & Implementation Plan + +**Date:** 2026-05-01 +**Status:** Approved for implementation +**Module:** `github.com/JupiterMetaLabs/ion` (existing module, new package `clickhouse/`) +**Author context:** Ion is a Go observability library built on Zap + OpenTelemetry. This document describes a new `zapcore.Core` implementation that writes log entries directly to ClickHouse for fast time-series analytics queries. + +--- + +## Context + +Ion's log pipeline works by assembling a list of `zapcore.Core` instances inside `internal/core/logger_factory.go` and fanning them out with `zapcore.NewTee`. Existing sinks are: console (stdout/stderr), file (lumberjack), and OTEL (via otelzap bridge). Each sink is independent — adding or removing one does not affect the others. + +ClickHouse is being added as a fourth sink. The reason: OTEL → Loki works for operational triage, but ClickHouse enables fast analytical queries over log data (e.g., "how many block validation errors per validator in the last 10 minutes?"). This is not possible efficiently with Loki. + +Logs in Ion are semi-structured: every entry has a fixed set of well-known fields (`level`, `service`, `trace_id`, etc.) plus arbitrary typed fields attached at call time via `ion.String(...)`, `ion.Int64(...)`, `ion.F(...)`. The ClickHouse schema must store both without forcing callers to change anything. + +**This package is built independently first.** It does not import or wire into `ion.New()` yet. It exposes a standard `zapcore.Core` that can be attached to any `*zap.Logger`. Once validated, connecting it to ion is a trivial change in `internal/core/logger_factory.go`. + +--- + +## Package Structure + +``` +ion/ ← module root + go.mod ← gains one new dep: clickhouse-go/v2 + clickhouse/ + config.go ← Config struct + Validate() + core.go ← Core struct + zapcore.Core interface + New/Open/Shutdown + batch.go ← batchWriter: channel, flush goroutine, drain + row.go ← logRow struct + extractRow() field extraction + schema.go ← DDL const + EnsureSchema() + core_test.go ← unit tests + integration_test.go ← integration tests (env-gated via CLICKHOUSE_TEST_DSN) +``` + +The `clickhouse/` package is within the same module, so it is allowed to import `github.com/JupiterMetaLabs/ion/internal/core` for the sentinel key constant (`SentinelKey = "__ion_ctx__"`) and `SystemFieldPrefix = "__ion_"`. + +--- + +## Architecture & Data Flow + +``` +Application goroutine +──────────────────────────────────────────────────────────────── + logger.Info(ctx, "msg", ion.String("k","v"), ion.Int64("n",42)) + │ + └─► zapLogger.zap.Info(...) + │ + └─► zapcore.Tee ─────► consoleCore (existing) + │ ─────► fileCore (existing) + │ ─────► otelCore (existing) + │ + └─────────────► clickhouse.Core.Write(entry, fields) + │ + ├─ extractRow(entry, allFields) ← O(N) field scan + │ maps zap field types → + │ StrFields / IntFields / FltFields / BoolFields / Extra + │ + └─ select { + case batchWriter.ch <- row: ← non-blocking send + default: ← buffer full + dropped.Add(1) + } + return nil ← always returns immediately + +Background goroutine (one per Core, started in Open()) +──────────────────────────────────────────────────────────────── + batchWriter.run(): + ticker := time.NewTicker(FlushInterval) + batch := []logRow{} + + for { + select { + case row := <-ch: + batch = append(batch, row) + if len(batch) >= BatchSize: + flush(batch) → batch = nil + + case <-ticker.C: + if len(batch) > 0: + flush(batch) → batch = nil + + case <-stop: + drain(ch) → append remaining to batch + flush(batch) + return + } + } + +flush(batch): + b, err := conn.PrepareBatch(ctx, "INSERT INTO ") + for each row: + b.AppendStruct(&row) + b.Send() ← one TCP write, binary column-oriented protocol + if err: print to stderr, drop batch, continue +``` + +**Key invariant:** `Write()` is always non-blocking. The only work done in the caller's goroutine is field extraction (O(N) scan) + a non-blocking channel send. ClickHouse I/O is entirely in the background goroutine. + +--- + +## Component Specifications + +### `config.go` — `Config` + +```go +type Config struct { + // Required + DSN string // "clickhouse://user:pass@host:9000/db" or HTTP variant + + // Optional with defaults + Database string // default: "default" + Table string // default: "ion_logs" + Level string // minimum level to ship to CH; default: "info" + BatchSize int // rows per flush; default: 1000 + FlushInterval time.Duration // flush cadence; default: 5s + ChannelBuffer int // async queue depth; default: 10000 + AutoSchema bool // run CREATE TABLE IF NOT EXISTS on Open(); default: true + DialTimeout time.Duration // default: 10s + MaxOpenConns int // connection pool size; default: 5 + MaxIdleConns int // default: 5 + ConnMaxLifetime time.Duration // default: 1h +} +``` + +`Validate()` returns a single combined error for all invalid fields — same pattern as `internal/config.Config.Validate()` in the existing codebase. + +`withDefaults()` is a private method that fills zero-value fields with the defaults above, called inside `New()`. + +--- + +### `core.go` — `Core` (implements `zapcore.Core`) + +```go +type Core struct { + cfg Config + conn driver.Conn // clickhouse-go/v2 native connection + level zapcore.LevelEnabler + writer *batchWriter + presetFields []zapcore.Field // accumulated by With() calls + dropped atomic.Int64 +} +``` + +**`zapcore.Core` interface — method-by-method:** + +| Method | Behaviour | +|--------|-----------| +| `Enabled(lvl zapcore.Level) bool` | Delegates to `c.level.Enabled(lvl)`. Fast path: if the level is not enabled, Zap never calls `Check` or `Write`. | +| `With(fields []zapcore.Field) zapcore.Core` | Returns a **new** `*Core` with `presetFields = append(c.presetFields, fields...)`. Never mutates the receiver. The new Core shares the same `conn`, `writer`, and `dropped` counter — they all feed the same background flusher. | +| `Check(entry zapcore.Entry, ce *zapcore.CheckedEntry) *zapcore.CheckedEntry` | Standard pattern: if `c.Enabled(entry.Level)`, call `ce.AddCore(entry, c)` and return. | +| `Write(entry zapcore.Entry, fields []zapcore.Field) error` | Merge `presetFields` + `fields`. Call `extractRow(entry, merged)`. Non-blocking send to `writer.ch`. Return `nil` always. | +| `Sync() error` | Call `writer.flushSync()` to flush the current in-memory batch immediately. Returns any ClickHouse error. | + +**Public functions on `core.go`:** + +```go +// New validates cfg, fills defaults, returns an uninitialised *Core. +// Does NOT connect to ClickHouse yet. Safe to call at program start. +func New(cfg Config) (*Core, error) + +// Open connects to ClickHouse, optionally runs DDL (if AutoSchema=true), +// starts the background batch-flush goroutine. +// ctx controls the connection and DDL timeout. +func (c *Core) Open(ctx context.Context) error + +// Shutdown signals the background goroutine to stop, drains the channel, +// flushes remaining rows, and closes the ClickHouse connection. +// Respects ctx deadline. +func (c *Core) Shutdown(ctx context.Context) error + +// DroppedCount returns the total log entries dropped due to a full buffer. +// Use this to detect back-pressure. Expose via a metric in production. +func (c *Core) DroppedCount() int64 +``` + +--- + +### `batch.go` — `batchWriter` + +```go +type batchWriter struct { + ch chan logRow + conn driver.Conn + cfg Config + wg sync.WaitGroup + stop chan struct{} + dropped *atomic.Int64 // pointer to Core.dropped +} +``` + +`run()` is the background goroutine body (described in the data flow section above). + +`flush(rows []logRow)`: +1. `conn.PrepareBatch(ctx, "INSERT INTO
")` +2. For each row: `batch.AppendStruct(&row)` +3. `batch.Send()` +4. On error: `fmt.Fprintf(os.Stderr, "[ion/clickhouse] flush error: %v\n", err)` — batch is dropped, loop continues. This is consistent with ion's failure isolation design: a backend failure never crashes or stalls the service. + +`flushSync()` — called by `Sync()`: drains what's currently in the channel into a local slice, then flushes. Used for orderly shutdown and testing. + +`Shutdown(ctx context.Context) error`: +1. Close `stop` channel → signals `run()` to exit +2. `wg.Wait()` with `ctx` deadline +3. Return ctx error if deadline exceeded + +--- + +### `row.go` — `logRow` + `extractRow()` + +```go +type logRow struct { + Timestamp time.Time + Level string + Service string + Version string + Logger string + Message string + TraceID string + SpanID string + RequestID string + UserID string + Caller string + StrFields map[string]string + IntFields map[string]int64 + FltFields map[string]float64 + BoolFields map[string]uint8 + Extra string // JSON-encoded bag for complex/unknown field types +} +``` + +`extractRow(entry zapcore.Entry, fields []zapcore.Field) logRow`: + +Iterates `fields` once. For each `zapcore.Field`: + +``` +Key == core.SentinelKey ("__ion_ctx__") → skip (internal context carrier) +strings.HasPrefix(key, core.SystemFieldPrefix) → skip (any other internal field) + +Key == "trace_id" → row.TraceID +Key == "span_id" → row.SpanID +Key == "request_id" → row.RequestID +Key == "user_id" → row.UserID +Key == "service" → row.Service (overrides entry default if caller sets it) +Key == "version" → row.Version + +zapcore.StringType → StrFields[key] +zapcore.Int64Type / Int32Type / Int16Type / Int8Type + → IntFields[key] +zapcore.Uint64Type / Uint32Type / Uint16Type / Uint8Type + → IntFields[key] (cast to int64) +zapcore.Float64Type / Float32Type → FltFields[key] +zapcore.BoolType → BoolFields[key] (0 or 1) +zapcore.ErrorType → Extra[""] = "" (string) +zapcore.ReflectType / all others → Extra[""] = json.Marshal(field.Interface) +``` + +`Extra` is always a single flat JSON object where each key is the field's key name and each value is +the serialized representation of that field. Example with two overflow fields: + +```json +{"db_error": "connection refused", "request": {"method": "GET", "path": "/api/v1"}} +``` + +`ErrorType` values are stored as plain strings (the `.Error()` string). `ReflectType` values are +stored as whatever `json.Marshal` produces for that Go value (nested objects are valid). +An empty `Extra` (no overflow fields) is stored as `""` — not `"{}"` — to save space and make +`WHERE extra != ''` a clean "has overflow fields" filter. Marshalling failures for a single field +drop that field's key from `Extra` silently; other fields in the same entry are unaffected. + +Service and Version come from the Zap logger's initial fields (set via `zap.Fields(...)` in `buildZapOptions`). These arrive as `zapcore.StringType` fields with keys `"service"` and `"version"`. The extraction logic handles them as dedicated column writes. + +--- + +### `schema.go` — DDL + `EnsureSchema()` + +```sql +CREATE TABLE IF NOT EXISTS ion_logs +( + timestamp DateTime64(9, 'UTC'), + level LowCardinality(String), + service LowCardinality(String), + version LowCardinality(String), + logger String, + message String, + trace_id String, + span_id String, + request_id String, + user_id String, + caller String, + str_fields Map(String, String), + int_fields Map(String, Int64), + flt_fields Map(String, Float64), + bool_fields Map(String, UInt8), + extra String +) +ENGINE = MergeTree() +PARTITION BY toYYYYMM(timestamp) +ORDER BY (service, level, timestamp) +TTL timestamp + INTERVAL 30 DAY DELETE +SETTINGS index_granularity = 8192; +``` + +**Schema design rationale (for future LLMs reading this):** + +- `LowCardinality(String)` on `level`, `service`, `version` — these fields repeat identically across millions of rows. ClickHouse applies dictionary encoding, reducing storage ~10x and enabling faster `WHERE level = 'error'` scans. +- `DateTime64(9, 'UTC')` — nanosecond precision matches Go's `time.Time`. Always UTC to avoid timezone ambiguity. +- `ORDER BY (service, level, timestamp)` — the primary sparse index is built on this key. This is optimised for the dominant access pattern: "give me ERROR logs for service X in the last hour." The `(service, level)` prefix eliminates rows before the timestamp range scan. +- `PARTITION BY toYYYYMM(timestamp)` — monthly partitions allow `ALTER TABLE DROP PARTITION '202501'` for cheap data expiry without scanning the whole table. Required for the TTL to work efficiently. +- `TTL timestamp + INTERVAL 30 DAY DELETE` — automatic deletion at the partition level. Teams managing their own schema can change this interval. +- `Map(String, T)` per type — preserves numeric types for `WHERE int_fields['block_height'] > 19000000` without `CAST`. A single `Map(String, String)` would require string-to-number coercion at query time and would break index pushdown. +- `extra String` — escape hatch for complex Go types (`structs`, `errors`, `interfaces`). Stored as JSON. Queryable with ClickHouse JSON extraction functions but not indexed. Acceptable because complex-type fields are rare in practice. + +`EnsureSchema(ctx context.Context, conn driver.Conn, cfg Config) error` — executes the DDL above with the configured `cfg.Table` name substituted in. Called by `Open()` when `cfg.AutoSchema == true`. + +--- + +## Error Handling Matrix + +| Scenario | Where it happens | Behaviour | +|----------|-----------------|-----------| +| Invalid DSN / config | `New()` | Returns error immediately. Application decides. | +| ClickHouse unreachable at `Open()` | `Open()` | Returns error. Application can start without CH sink. | +| ClickHouse down during flush | `batchWriter.flush()` | Prints to stderr. Drops the batch. Continues. | +| Buffer full (`ChannelBuffer` exhausted) | `Write()` | Increments `dropped` counter. Returns `nil`. Never blocks. | +| `Shutdown()` context deadline exceeded | `Shutdown()` | Returns `ctx.Err()`. Remaining buffered rows are lost. | +| `With()` on a closed Core | `With()` | Returns new Core (With does not touch the connection). Safe. | +| Malformed row (json.Marshal fails) | `extractRow()` | The failing field's key is omitted from `extra`. Other fields still written. | + +--- + +## Performance & Memory Notes + +**Memory ceiling:** +- Each `logRow` has 4 map fields. An empty `logRow` is ~400 bytes on the heap (map headers). +- A `logRow` with 5 typical fields (2 strings, 2 ints, 1 float) is ~700–900 bytes. +- Default `ChannelBuffer = 10000` → peak heap from the channel: ~7–10 MB. +- Tuning: if memory is tight, reduce `ChannelBuffer`. If drops are observed, increase it or reduce `FlushInterval`. + +**CPU cost per log entry (in caller goroutine):** +- One O(N) pass over fields (N = number of fields per log call, typically 2–8). +- Map insertions (amortised O(1)). +- `json.Marshal` only for `AnyType`/`ReflectType` fields — not called for typed primitives. +- No lock contention: `Write()` uses a channel send (lock-free in most cases with buffered channels). + +**ClickHouse insert efficiency:** +- Native binary protocol: `clickhouse-go/v2` sends an entire batch as one binary-encoded payload. For 1,000 rows this is approximately one TCP write — not 1,000 round trips. +- ClickHouse is designed for bulk columnar inserts. Single-row inserts destroy performance. The batch writer pattern is **not optional** — it's the reason this architecture works. + +**Signals to watch in production:** +| Signal | Implication | Tuning action | +|--------|-------------|---------------| +| `DroppedCount()` rising | Buffer filling faster than ClickHouse can flush | Decrease `FlushInterval` or increase `BatchSize` + `ChannelBuffer` | +| Flush errors in stderr | ClickHouse unreachable or auth failure | Fix connectivity; rows during downtime are lost | +| Memory growth | `ChannelBuffer` too large for average row size | Decrease `ChannelBuffer` | +| High CPU | `AnyType` fields being JSON-marshalled on hot path | Prefer typed constructors (`ion.String`, `ion.Int64`) over `ion.F` with structs | + +--- + +## Usage (Standalone, Before Ion Wiring) + +```go +package main + +import ( + "context" + "time" + + "github.com/JupiterMetaLabs/ion/clickhouse" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" +) + +func main() { + ctx := context.Background() + + // 1. Build the ClickHouse core + chCore, err := clickhouse.New(clickhouse.Config{ + DSN: "clickhouse://localhost:9000/default", + Table: "ion_logs", + BatchSize: 500, + FlushInterval: 3 * time.Second, + }) + if err != nil { + panic(err) + } + if err := chCore.Open(ctx); err != nil { + // ClickHouse unavailable — decide: fail hard or continue without it + panic(err) + } + defer chCore.Shutdown(ctx) + + // 2. Attach to any *zap.Logger via zapcore.NewTee + consoleCore := zapcore.NewCore( + zapcore.NewJSONEncoder(zap.NewProductionEncoderConfig()), + zapcore.Lock(os.Stdout), + zapcore.InfoLevel, + ) + logger := zap.New(zapcore.NewTee(consoleCore, chCore)) + + // 3. Log normally — both cores receive the entry + logger.Info("block validated", + zap.String("validator", "0xabc"), + zap.Int64("block_height", 19_500_000), + zap.Float64("latency_ms", 12.5), + ) + + // At shutdown: chCore.Shutdown(ctx) flushes remaining rows +} +``` + +--- + +## Future: Connecting to `ion.New()` + +When this package is validated, wiring it into ion requires only: + +**`internal/config/config.go`** — add `ClickHouseConfig` struct (follows `FileConfig` pattern). + +**`config.go`** (public package) — add type alias and `WithClickHouse(dsn string)` fluent builder. + +**`internal/core/logger_factory.go`** — inside `NewZapLogger()`, after the file core block: +```go +if cfg.ClickHouse.Enabled { + chCore, err := clickhouse.New(cfg.ClickHouse) + // ... open, handle error as warning (like OTEL) + cores = append(cores, clickhouse.NewFilteringWrapper(chCore, SentinelKey)) +} +``` + +**`ion.go`** — add shutdown of the ClickHouse core in `Ion.Shutdown()` if it holds a closeable resource (or delegate via `zapLogger.Shutdown()` → `core.LogProvider` pattern). + +No changes to `Logger` interface. No changes to `*Ion` struct public API. + +--- + +## Files to Create + +| File | Key responsibility | +|------|--------------------| +| `clickhouse/config.go` | `Config` struct, `Validate()`, `withDefaults()` | +| `clickhouse/core.go` | `Core` struct, `zapcore.Core` impl, `New()`, `Open()`, `Shutdown()`, `DroppedCount()` | +| `clickhouse/batch.go` | `batchWriter`: channel, `run()`, `flush()`, `flushSync()`, `Shutdown()` | +| `clickhouse/row.go` | `logRow` struct, `extractRow()`, field type mapping logic | +| `clickhouse/schema.go` | DDL const, `EnsureSchema()` | +| `clickhouse/core_test.go` | Unit tests for all components | +| `clickhouse/integration_test.go` | Integration tests, skipped without `CLICKHOUSE_TEST_DSN` env var | + +**Dependency to add to `go.mod`:** +``` +github.com/ClickHouse/clickhouse-go/v2 +``` + +No other new dependencies. + +--- + +## Verification Plan + +1. `go build ./clickhouse/...` — confirms the package compiles cleanly with no ion wiring yet. +2. `go test -v -race ./clickhouse/...` — unit tests pass; integration tests skip cleanly without the env var. +3. `CLICKHOUSE_TEST_DSN="clickhouse://localhost:9000" go test -v -race ./clickhouse/...` — integration tests run against a real instance and verify rows appear in the table with correct column values. +4. Manual check: write 10,000 entries, call `Shutdown()`, query `SELECT count() FROM ion_logs` → must equal 10,000. `DroppedCount()` must be 0 under normal conditions. +5. Stress test: fill channel (set `ChannelBuffer=10`), verify `DroppedCount()` increments and `Write()` never blocks the goroutine. diff --git a/docs/CLICKHOUSE_TESTING.md b/docs/CLICKHOUSE_TESTING.md new file mode 100644 index 0000000..b1e80a0 --- /dev/null +++ b/docs/CLICKHOUSE_TESTING.md @@ -0,0 +1,148 @@ +# ClickHouse Package — Testing Guide + +## Unit Tests (no ClickHouse required) + +Run the full clickhouse package test suite with race detection: + +```bash +go test ./clickhouse/... -race +``` + +Run a specific test group: + +```bash +# Row extraction logic +go test ./clickhouse/... -race -run TestExtractRow + +# Schema DDL correctness +go test ./clickhouse/... -race -run TestBuildDDL + +# Batch writer (flush on size, flush on ticker, drops, sync, shutdown) +go test ./clickhouse/... -race -run TestBatchWriter + +# Core zapcore.Core interface (Enabled, Check, Write, With, Sync, DroppedCount) +go test ./clickhouse/... -race -run TestCore + +# Config validation and defaults +go test ./clickhouse/... -race -run TestValidate +go test ./clickhouse/... -race -run TestDefaults +``` + +Run a single named test: + +```bash +go test -v -run TestCore_With_PresetFieldsPropagated ./clickhouse/... +``` + +--- + +## Integration Tests (live ClickHouse required) + +Set the DSN environment variable before running. HTTP on port 8123 is recommended: + +```bash +export CLICKHOUSE_TEST_DSN="http://default:PASSWORD@HOST:8123/default" +``` + +Then run the full suite (unit + integration): + +```bash +CLICKHOUSE_TEST_DSN="http://default:PASSWORD@HOST:8123/default" go test ./clickhouse/... -race -v +``` + +Integration tests that run with the DSN set: + +```bash +# Verify New() connects and exposes config +CLICKHOUSE_TEST_DSN="http://default:PASSWORD@HOST:8123/default" \ + go test ./clickhouse/tests/... -race -v -run TestNew_ConnectsAndExposesConfig + +# Create table and verify all 16 columns exist +CLICKHOUSE_TEST_DSN="http://default:PASSWORD@HOST:8123/default" \ + go test ./clickhouse/tests/... -race -v -run TestEnsureSchema_CreatesTable + +# Verify CREATE TABLE IF NOT EXISTS is idempotent (run twice, no error) +CLICKHOUSE_TEST_DSN="http://default:PASSWORD@HOST:8123/default" \ + go test ./clickhouse/tests/... -race -v -run TestEnsureSchema_Idempotent + +# Ping tests +CLICKHOUSE_TEST_DSN="http://default:PASSWORD@HOST:8123/default" \ + go test ./clickhouse/tests/... -race -v -run TestPing +``` + +--- + +## Create the Production Table + +Run this once to create `ion_logs` on your ClickHouse instance. Safe to re-run — uses `IF NOT EXISTS`: + +```bash +cat > /tmp/create_ion_logs.go << 'EOF' +package main + +import ( + "context" + "fmt" + "log" + + "github.com/JupiterMetaLabs/ion/clickhouse" + "github.com/JupiterMetaLabs/ion/clickhouse/config" +) + +func main() { + cfg := config.Config{DSN: "http://default:PASSWORD@HOST:8123/default"} + if err := clickhouse.RunEnsureSchema(context.Background(), cfg, "ion_logs"); err != nil { + log.Fatalf("EnsureSchema: %v", err) + } + fmt.Println("ion_logs table created (or already exists)") +} +EOF +go run /tmp/create_ion_logs.go +``` + +--- + +## Verify the Table on ClickHouse + +Ping the HTTP endpoint: + +```bash +curl "http://default:PASSWORD@HOST:8123/ping" +# Expected: Ok. +``` + +Check the table exists: + +```bash +curl "http://default:PASSWORD@HOST:8123/" \ + --data "SELECT name, engine FROM system.tables WHERE database = 'default' AND name = 'ion_logs' FORMAT Pretty" +``` + +Inspect all column definitions: + +```bash +curl "http://default:PASSWORD@HOST:8123/" \ + --data "DESCRIBE TABLE ion_logs FORMAT Pretty" +``` + +Check row count after writing logs: + +```bash +curl "http://default:PASSWORD@HOST:8123/" \ + --data "SELECT count() FROM ion_logs FORMAT Pretty" +``` + +--- + +## Test File Map + +| File | Package | What it tests | Needs ClickHouse | +|------|---------|--------------|-----------------| +| `clickhouse/row_test.go` | white-box | `extractRow()` field routing | No | +| `clickhouse/batch_test.go` | white-box | `batchWriter` flush/drop/sync/shutdown | No | +| `clickhouse/core_test.go` | white-box | `Core` zapcore interface | No | +| `clickhouse/schema_test.go` | white-box | `buildDDL`, `ensureSchema` with mock | No | +| `clickhouse/tests/config_test.go` | black-box | `Config.Validate()`, `WithDefaults()`, `New()` | Only `TestNew_ConnectsAndExposesConfig` | +| `clickhouse/tests/ping_test.go` | black-box | `Config.Ping()` error paths | No | +| `clickhouse/tests/schema_ddl_test.go` | black-box | `BuildDDL` output | No | +| `clickhouse/tests/schema_test.go` | black-box | `RunEnsureSchema` creates real table | Yes | diff --git a/go.mod b/go.mod index 111ec27..0a561ef 100644 --- a/go.mod +++ b/go.mod @@ -25,17 +25,28 @@ require ( ) require ( + github.com/ClickHouse/ch-go v0.71.0 // indirect + github.com/ClickHouse/clickhouse-go/v2 v2.45.0 // indirect + github.com/andybalholm/brotli v1.2.0 // indirect github.com/cenkalti/backoff/v5 v5.0.3 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/go-faster/city v1.0.1 // indirect + github.com/go-faster/errors v0.7.1 // indirect github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/google/uuid v1.6.0 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect + github.com/klauspost/compress v1.18.3 // indirect + github.com/paulmach/orb v0.12.0 // indirect + github.com/pierrec/lz4/v4 v4.1.25 // indirect + github.com/segmentio/asm v1.2.1 // indirect + github.com/shopspring/decimal v1.4.0 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.42.0 // indirect go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.uber.org/multierr v1.11.0 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/net v0.52.0 // indirect golang.org/x/sys v0.42.0 // indirect golang.org/x/text v0.35.0 // indirect diff --git a/go.sum b/go.sum index 33dabd0..62fb5ea 100644 --- a/go.sum +++ b/go.sum @@ -1,28 +1,73 @@ +github.com/ClickHouse/ch-go v0.71.0 h1:bUdZ/EZj/LcVHsMqaRUP2holqygrPWQKeMjc6nZoyRM= +github.com/ClickHouse/ch-go v0.71.0/go.mod h1:NwbNc+7jaqfY58dmdDUbG4Jl22vThgx1cYjBw0vtgXw= +github.com/ClickHouse/clickhouse-go/v2 v2.45.0 h1:iHt15nA4iYhfde5bDQAcLAat9BAh7B5ksPRNRa4UI7s= +github.com/ClickHouse/clickhouse-go/v2 v2.45.0/go.mod h1:giJfUVlMkcfUEPVfRpt51zZaGEx9i17gCos8gBl392c= +github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ= +github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY= github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/go-faster/city v1.0.1 h1:4WAxSZ3V2Ws4QRDrscLEDcibJY8uf41H6AhXDrNDcGw= +github.com/go-faster/city v1.0.1/go.mod h1:jKcUJId49qdW3L1qKHH/3wPeUstCVpVSXTM6vO3VcTw= +github.com/go-faster/errors v0.7.1 h1:MkJTnDoEdi9pDabt1dpWf7AA8/BaSYZqibYyhZ20AYg= +github.com/go-faster/errors v0.7.1/go.mod h1:5ySTjWFiphBs07IKuiL69nxdfd5+fzh1u7FPGZP2quo= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs= github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= +github.com/klauspost/compress v1.18.3 h1:9PJRvfbmTabkOX8moIpXPbMMbYN60bWImDDU7L+/6zw= +github.com/klauspost/compress v1.18.3/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc= +github.com/paulmach/orb v0.12.0 h1:z+zOwjmG3MyEEqzv92UN49Lg1JFYx0L9GpGKNVDKk1s= +github.com/paulmach/orb v0.12.0/go.mod h1:5mULz1xQfs3bmQm63QEJA6lNGujuRafwA5S/EnuLaLU= +github.com/paulmach/protoscan v0.2.1/go.mod h1:SpcSwydNLrxUGSDvXvO0P7g7AuhJ7lcKfDlhJCDw2gY= +github.com/pierrec/lz4/v4 v4.1.25 h1:kocOqRffaIbU5djlIBr7Wh+cx82C0vtFb0fOurZHqD0= +github.com/pierrec/lz4/v4 v4.1.25/go.mod h1:EoQMVJgeeEOMsCqCzqFm2O0cJvljX2nGZjcRIPL34O4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/segmentio/asm v1.2.1 h1:DTNbBqs57ioxAD4PrArqftgypG4/qNpXoJx8TVXxPR0= +github.com/segmentio/asm v1.2.1/go.mod h1:BqMnlJP91P8d+4ibuonYZw9mfnzI9HfxselHZr5aAcs= +github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k= +github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= +github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= +github.com/xdg-go/scram v1.1.1/go.mod h1:RaEWvsqvNKKvBPvcKeFjrG2cJqOkHTiyTpzz23ni57g= +github.com/xdg-go/stringprep v1.0.3/go.mod h1:W3f5j4i+9rC0kuIEJL0ky1VpHXQU3ocBgklLGvcBnW8= +github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d/go.mod h1:rHwXgn7JulP+udvsHwJoVG1YGAP6VLg4y9I5dyZdqmA= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.mongodb.org/mongo-driver v1.11.4/go.mod h1:PTSz5yu21bkT/wXpkS7WR5f0ddqw5quethTUn9WM+2g= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/contrib/bridges/otelzap v0.17.0 h1:oCltVHJcblcth2z9B9dRTeZIZTe2Sf9Ad9h8bcc+s8M= @@ -71,12 +116,48 @@ go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.27.1 h1:08RqriUEv8+ArZRYSTXy1LeBScaMpVSTBhCeaZYfMYc= go.uber.org/zap v1.27.1/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= google.golang.org/genproto/googleapis/api v0.0.0-20260319201613-d00831a3d3e7 h1:41r6JMbpzBMen0R/4TZeeAmGXSJC7DftGINUodzTkPI= @@ -85,9 +166,14 @@ google.golang.org/genproto/googleapis/rpc v0.0.0-20260319201613-d00831a3d3e7 h1: google.golang.org/genproto/googleapis/rpc v0.0.0-20260319201613-d00831a3d3e7/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= google.golang.org/grpc v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE= google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/clickhouse/batch.go b/internal/clickhouse/batch.go new file mode 100644 index 0000000..fb86d67 --- /dev/null +++ b/internal/clickhouse/batch.go @@ -0,0 +1,140 @@ +package clickhouse + +import ( + "fmt" + "os" + "sync" + "sync/atomic" + "time" +) + +type batchConfig struct { + BatchSize int + FlushInterval time.Duration + ChannelBuffer int + Table string +} + +type batchWriter struct { + ch chan logRow + cfg batchConfig + flushFn func([]logRow) error + stop chan struct{} + stopOnce sync.Once + syncReqs chan chan struct{} + wg sync.WaitGroup + dropped atomic.Int64 +} + +func newBatchWriter(cfg batchConfig, flushFn func([]logRow) error) *batchWriter { + return &batchWriter{ + ch: make(chan logRow, cfg.ChannelBuffer), + cfg: cfg, + flushFn: flushFn, + stop: make(chan struct{}), + syncReqs: make(chan chan struct{}, 1), + } +} + +func (bw *batchWriter) start() { + bw.wg.Add(1) + go bw.run() +} + +// send enqueues a row. Non-blocking: drops and increments counter if channel is full. +func (bw *batchWriter) send(row logRow) { + select { + case bw.ch <- row: + default: + bw.dropped.Add(1) + } +} + +func (bw *batchWriter) droppedCount() int64 { + return bw.dropped.Load() +} + +// sync drains the channel and flushes all pending rows before returning. +// Safe to call after shutdown() — returns immediately instead of deadlocking. +func (bw *batchWriter) sync() { + done := make(chan struct{}) + select { + case bw.syncReqs <- done: + // sent; wait for the goroutine to confirm flush or for stop + select { + case <-done: + case <-bw.stop: + } + case <-bw.stop: + // already stopped — nothing to flush + } +} + +func (bw *batchWriter) run() { + defer bw.wg.Done() + ticker := time.NewTicker(bw.cfg.FlushInterval) + defer ticker.Stop() + + var batch []logRow + + for { + select { + case row := <-bw.ch: + batch = append(batch, row) + if len(batch) >= bw.cfg.BatchSize { + bw.flush(batch) + batch = nil + } + + case <-ticker.C: + if len(batch) > 0 { + bw.flush(batch) + batch = nil + } + + case done := <-bw.syncReqs: + // drain everything currently in the channel before flushing + drain: + for { + select { + case row := <-bw.ch: + batch = append(batch, row) + default: + break drain + } + } + if len(batch) > 0 { + bw.flush(batch) + batch = nil + } + close(done) + + case <-bw.stop: + for { + select { + case row := <-bw.ch: + batch = append(batch, row) + default: + if len(batch) > 0 { + bw.flush(batch) + } + return + } + } + } + } +} + +func (bw *batchWriter) flush(rows []logRow) { + if err := bw.flushFn(rows); err != nil { + fmt.Fprintf(os.Stderr, "[ion/clickhouse] flush error: %v\n", err) + bw.dropped.Add(int64(len(rows))) + } +} + +// shutdown signals the run goroutine to stop and waits for it to drain and flush. +// Safe to call multiple times — only the first call closes the stop channel. +func (bw *batchWriter) shutdown() { + bw.stopOnce.Do(func() { close(bw.stop) }) + bw.wg.Wait() +} diff --git a/internal/clickhouse/batch_test.go b/internal/clickhouse/batch_test.go new file mode 100644 index 0000000..3d34f0e --- /dev/null +++ b/internal/clickhouse/batch_test.go @@ -0,0 +1,291 @@ +package clickhouse + +import ( + "errors" + "sync" + "testing" + "time" +) + +func testLogRow() logRow { + return logRow{Message: "test", Level: "info"} +} + +func TestBatchWriter_FlushesOnBatchSize(t *testing.T) { + flushed := make(chan []logRow, 1) + bw := newBatchWriter(batchConfig{ + BatchSize: 3, + FlushInterval: 10 * time.Second, + ChannelBuffer: 100, + }, func(rows []logRow) error { + flushed <- append([]logRow{}, rows...) + return nil + }) + bw.start() + defer bw.shutdown() + + for range 3 { + bw.send(testLogRow()) + } + + select { + case rows := <-flushed: + if len(rows) != 3 { + t.Errorf("got %d rows, want 3", len(rows)) + } + case <-time.After(2 * time.Second): + t.Fatal("flush not triggered after BatchSize rows") + } +} + +func TestBatchWriter_FlushesOnTicker(t *testing.T) { + flushed := make(chan []logRow, 1) + bw := newBatchWriter(batchConfig{ + BatchSize: 1000, + FlushInterval: 50 * time.Millisecond, + ChannelBuffer: 100, + }, func(rows []logRow) error { + flushed <- append([]logRow{}, rows...) + return nil + }) + bw.start() + defer bw.shutdown() + + bw.send(testLogRow()) + + select { + case rows := <-flushed: + if len(rows) != 1 { + t.Errorf("got %d rows, want 1", len(rows)) + } + case <-time.After(2 * time.Second): + t.Fatal("flush not triggered by ticker") + } +} + +func TestBatchWriter_ShutdownFlushesRemaining(t *testing.T) { + var mu sync.Mutex + var total int + bw := newBatchWriter(batchConfig{ + BatchSize: 1000, + FlushInterval: 10 * time.Second, + ChannelBuffer: 100, + }, func(rows []logRow) error { + mu.Lock() + total += len(rows) + mu.Unlock() + return nil + }) + bw.start() + + for range 5 { + bw.send(testLogRow()) + } + bw.shutdown() + + mu.Lock() + defer mu.Unlock() + if total != 5 { + t.Errorf("shutdown flushed %d rows, want 5", total) + } +} + +func TestBatchWriter_DropsWhenFull(t *testing.T) { + block := make(chan struct{}) + bw := newBatchWriter(batchConfig{ + BatchSize: 1, + FlushInterval: 10 * time.Second, + ChannelBuffer: 1, + }, func(rows []logRow) error { + <-block + return nil + }) + bw.start() + + for range 10 { + bw.send(testLogRow()) + } + + if bw.droppedCount() == 0 { + t.Error("expected dropped > 0 when channel is full") + } + + close(block) + bw.shutdown() +} + +func TestBatchWriter_FlushErrorContinues(t *testing.T) { + var mu sync.Mutex + calls := 0 + bw := newBatchWriter(batchConfig{ + BatchSize: 1, + FlushInterval: 10 * time.Second, + ChannelBuffer: 100, + }, func(rows []logRow) error { + mu.Lock() + calls++ + mu.Unlock() + return errors.New("ch down") + }) + bw.start() + defer bw.shutdown() + + bw.send(testLogRow()) + bw.send(testLogRow()) + + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + mu.Lock() + n := calls + mu.Unlock() + if n >= 2 { + break + } + time.Sleep(10 * time.Millisecond) + } + + mu.Lock() + defer mu.Unlock() + if calls < 2 { + t.Errorf("flush fn called %d times, want >= 2 (errors must not stop the writer)", calls) + } +} + +func TestBatchWriter_NoRowsNoFlush(t *testing.T) { + called := false + bw := newBatchWriter(batchConfig{ + BatchSize: 10, + FlushInterval: 50 * time.Millisecond, + ChannelBuffer: 10, + }, func(rows []logRow) error { + called = true + return nil + }) + bw.start() + time.Sleep(120 * time.Millisecond) // let ticker fire at least once + bw.shutdown() + + if called { + t.Error("flush must not be called when no rows were sent") + } +} + +func TestBatchWriter_SyncFlushesCurrentBatch(t *testing.T) { + flushed := make(chan []logRow, 1) + bw := newBatchWriter(batchConfig{ + BatchSize: 1000, // high — ticker/sync must trigger, not size + FlushInterval: 10 * time.Second, // long — sync must trigger, not ticker + ChannelBuffer: 100, + }, func(rows []logRow) error { + flushed <- append([]logRow{}, rows...) + return nil + }) + bw.start() + defer bw.shutdown() + + bw.send(testLogRow()) + bw.sync() // must flush synchronously before returning + + select { + case rows := <-flushed: + if len(rows) != 1 { + t.Errorf("sync flushed %d rows, want 1", len(rows)) + } + default: + t.Fatal("sync returned but no rows were flushed") + } +} + +func TestBatchWriter_FlushErrorIncrementsDropped(t *testing.T) { + bw := newBatchWriter(batchConfig{ + BatchSize: 3, + FlushInterval: 10 * time.Second, + ChannelBuffer: 100, + }, func(rows []logRow) error { + return errors.New("ch down") + }) + bw.start() + defer bw.shutdown() + + for range 3 { + bw.send(testLogRow()) + } + + // wait for the batch to flush (and fail) + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + if bw.droppedCount() == 3 { + break + } + time.Sleep(10 * time.Millisecond) + } + + if bw.droppedCount() != 3 { + t.Errorf("droppedCount = %d, want 3 after flush error", bw.droppedCount()) + } +} + +func TestBatchWriter_ShutdownIdempotent(t *testing.T) { + bw := newBatchWriter(batchConfig{ + BatchSize: 10, + FlushInterval: time.Second, + ChannelBuffer: 10, + }, func(rows []logRow) error { return nil }) + bw.start() + bw.shutdown() + bw.shutdown() // must not panic +} + +func TestBatchWriter_SyncAfterShutdown_DoesNotDeadlock(t *testing.T) { + bw := newBatchWriter(batchConfig{ + BatchSize: 10, + FlushInterval: time.Second, + ChannelBuffer: 10, + }, func(rows []logRow) error { return nil }) + bw.start() + bw.shutdown() + + done := make(chan struct{}) + go func() { + bw.sync() + close(done) + }() + + select { + case <-done: + case <-time.After(time.Second): + t.Fatal("sync() deadlocked after shutdown()") + } +} + +func TestBatchWriter_BatchSizeSpansMultipleFlushes(t *testing.T) { + flushed := make(chan []logRow, 10) + bw := newBatchWriter(batchConfig{ + BatchSize: 2, + FlushInterval: 10 * time.Second, + ChannelBuffer: 100, + }, func(rows []logRow) error { + flushed <- append([]logRow{}, rows...) + return nil + }) + bw.start() + defer bw.shutdown() + + for range 6 { + bw.send(testLogRow()) + } + + got := 0 + deadline := time.After(2 * time.Second) + for got < 6 { + select { + case rows := <-flushed: + if len(rows) != 2 { + t.Errorf("flush batch size: got %d, want 2", len(rows)) + } + got += len(rows) + case <-deadline: + t.Fatalf("only %d of 6 rows flushed before timeout", got) + } + } +} diff --git a/internal/clickhouse/config/builder.go b/internal/clickhouse/config/builder.go new file mode 100644 index 0000000..e7c8eb9 --- /dev/null +++ b/internal/clickhouse/config/builder.go @@ -0,0 +1,63 @@ +package config + +import "time" + +func (cfg Config) SetDSN(dsn string) Config { + cfg.DSN = dsn + return cfg +} + +func (cfg Config) SetTable(table string) Config { + cfg.Table = table + return cfg +} + +func (cfg Config) SetLevel(level string) Config { + cfg.Level = level + return cfg +} + +func (cfg Config) SetBatchSize(batchSize int) Config { + cfg.BatchSize = batchSize + return cfg +} + +func (cfg Config) SetFlushInterval(flushInterval time.Duration) Config { + cfg.FlushInterval = flushInterval + return cfg +} + +func (cfg Config) SetChannelBuffer(channelBuffer int) Config { + cfg.ChannelBuffer = channelBuffer + return cfg +} + +func (cfg Config) SetAutoSchema(autoSchema bool) Config { + cfg.AutoSchema = autoSchema + return cfg +} + +func (cfg Config) SetDialTimeout(dialTimeout time.Duration) Config { + cfg.DialTimeout = dialTimeout + return cfg +} + +func (cfg Config) SetWriteTimeout(writeTimeout time.Duration) Config { + cfg.WriteTimeout = writeTimeout + return cfg +} + +func (cfg Config) SetMaxOpenConns(maxOpenConns int) Config { + cfg.MaxOpenConns = maxOpenConns + return cfg +} + +func (cfg Config) SetMaxIdleConns(maxIdleConns int) Config { + cfg.MaxIdleConns = maxIdleConns + return cfg +} + +func (cfg Config) SetConnMaxLifetime(connMaxLifetime time.Duration) Config { + cfg.ConnMaxLifetime = connMaxLifetime + return cfg +} diff --git a/internal/clickhouse/config/config.go b/internal/clickhouse/config/config.go new file mode 100644 index 0000000..dc3f69e --- /dev/null +++ b/internal/clickhouse/config/config.go @@ -0,0 +1,162 @@ +package config + +import ( + "fmt" + "regexp" + "strings" + "time" + + clickhouse_errors "github.com/JupiterMetaLabs/ion/internal/clickhouse/config/errors" +) + +var validIdentifier = regexp.MustCompile(`^[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)?$`) + +// Config holds all configuration for the ClickHouse log sink. +// Call New(cfg) to validate and apply defaults before use. +type Config struct { + // Enabled controls whether the ClickHouse sink is active. + // Default: false + Enabled bool + + // DSN is the ClickHouse connection string. Required when Enabled. + // Example: "http://user:pass@host:8123/db", "clickhouse://user:pass@host:9000/db" + DSN string + + // Table is the ClickHouse table name for log entries. + // Default: "ion_logs" + Table string + + // Level is the minimum log level forwarded to ClickHouse. + // Valid values: "debug", "info", "warn", "error", "fatal". + // Default: "info" + Level string + + // BatchSize is the maximum number of rows per flush. + // Default: 1000 + BatchSize int + + // FlushInterval is how often the background flusher sends pending rows. + // Default: 5s + FlushInterval time.Duration + + // ChannelBuffer is the depth of the async write queue. + // When full, incoming entries are dropped and DroppedCount increments. + // Default: 10000 + ChannelBuffer int + + // AutoSchema runs CREATE TABLE IF NOT EXISTS on Open() when true. + // Default: false — production teams are expected to manage DDL themselves. + AutoSchema bool + + // DialTimeout is the ClickHouse connection timeout. + // Default: 10s + DialTimeout time.Duration + + // WriteTimeout is the per-flush ClickHouse write timeout. + // Default: 30s + WriteTimeout time.Duration + + // MaxOpenConns is the connection pool size. + // Default: 5 + MaxOpenConns int + + // MaxIdleConns is the number of idle connections kept open. + // Default: 5 + MaxIdleConns int + + // ConnMaxLifetime is how long a connection may be reused. + // Default: 1h + ConnMaxLifetime time.Duration +} + +var validLevels = map[string]bool{ + "debug": true, + "info": true, + "warn": true, + "error": true, + "fatal": true, +} + +// withDefaults returns a copy of cfg with zero-value fields filled with defaults. +// User-supplied non-zero values are never overwritten. +func (cfg Config) WithDefaults() Config { + if cfg.Table == "" { + cfg.Table = "ion_logs" + } + if cfg.Level == "" { + cfg.Level = "info" + } + if cfg.BatchSize == 0 { + cfg.BatchSize = 1000 + } + if cfg.FlushInterval == 0 { + cfg.FlushInterval = 5 * time.Second + } + if cfg.ChannelBuffer == 0 { + cfg.ChannelBuffer = 10000 + } + if cfg.DialTimeout == 0 { + cfg.DialTimeout = 10 * time.Second + } + if cfg.WriteTimeout == 0 { + cfg.WriteTimeout = 30 * time.Second + } + if cfg.MaxOpenConns == 0 { + cfg.MaxOpenConns = 5 + } + if cfg.MaxIdleConns == 0 { + cfg.MaxIdleConns = 5 + } + if cfg.ConnMaxLifetime == 0 { + cfg.ConnMaxLifetime = time.Hour + } + return cfg +} + +// validate checks the config for invalid values after defaults have been applied. +// Returns a combined error listing every violation found. +func (cfg Config) Validate() error { + var errs []string + + if cfg.DSN == "" { + errs = append(errs, clickhouse_errors.ErrInvalidDSN.Error()) + } + if cfg.Level != "" && !validLevels[strings.ToLower(cfg.Level)] { + errs = append(errs, clickhouse_errors.ErrInvalidLevel.Error()) + } + if cfg.Table != "" && !validIdentifier.MatchString(cfg.Table) { + errs = append(errs, clickhouse_errors.ErrInvalidTable.Error()) + } + if cfg.BatchSize < 0 { + errs = append(errs, clickhouse_errors.ErrBatchSizeMustNotBeNegative.Error()) + } + if cfg.ChannelBuffer < 0 { + errs = append(errs, clickhouse_errors.ErrChannelBufferMustNotBeNegative.Error()) + } + if cfg.FlushInterval < 0 { + errs = append(errs, clickhouse_errors.ErrFlushIntervalMustNotBeNegative.Error()) + } + if cfg.DialTimeout < 0 { + errs = append(errs, clickhouse_errors.ErrDialTimeoutMustNotBeNegative.Error()) + } + if cfg.WriteTimeout < 0 { + errs = append(errs, clickhouse_errors.ErrWriteTimeoutMustNotBeNegative.Error()) + } + if cfg.MaxOpenConns < 0 { + errs = append(errs, clickhouse_errors.ErrMaxOpenConnsMustNotBeNegative.Error()) + } + if cfg.MaxIdleConns < 0 { + errs = append(errs, clickhouse_errors.ErrMaxIdleConnsMustNotBeNegative.Error()) + } + if cfg.ConnMaxLifetime < 0 { + errs = append(errs, clickhouse_errors.ErrConnMaxLifetimeMustNotBeNegative.Error()) + } + if cfg.MaxOpenConns > 0 && cfg.MaxIdleConns > cfg.MaxOpenConns { + errs = append(errs, clickhouse_errors.ErrMaxIdleConnsExceedsMaxOpenConns.Error()) + } + + if len(errs) == 0 { + return nil + } + return fmt.Errorf("clickhouse config validation failed: %s", strings.Join(errs, "; ")) +} diff --git a/internal/clickhouse/config/errors/errors.go b/internal/clickhouse/config/errors/errors.go new file mode 100644 index 0000000..12e7076 --- /dev/null +++ b/internal/clickhouse/config/errors/errors.go @@ -0,0 +1,28 @@ +package errors + +import "errors" + +var ( + ErrOpenFailed = errors.New("clickhouse core: open failed") + ErrShutdownFailed = errors.New("clickhouse core: shutdown failed") + ErrWriteFailed = errors.New("clickhouse core: write failed") + ErrFlushFailed = errors.New("clickhouse core: flush failed") + ErrBufferFull = errors.New("clickhouse core: buffer full") + ErrContextCancelled = errors.New("clickhouse core: context cancelled") +) + +var ( + ErrInvalidDSN = errors.New("clickhouse config: invalid DSN") + ErrPingFailed = errors.New("clickhouse config: ping failed") + ErrInvalidLevel = errors.New("clickhouse config: invalid level") + ErrInvalidTable = errors.New("clickhouse config: invalid table name") + ErrBatchSizeMustNotBeNegative = errors.New("clickhouse config: batch size must not be negative") + ErrChannelBufferMustNotBeNegative = errors.New("clickhouse config: channel buffer must not be negative") + ErrFlushIntervalMustNotBeNegative = errors.New("clickhouse config: flush interval must not be negative") + ErrDialTimeoutMustNotBeNegative = errors.New("clickhouse config: dial timeout must not be negative") + ErrWriteTimeoutMustNotBeNegative = errors.New("clickhouse config: write timeout must not be negative") + ErrMaxOpenConnsMustNotBeNegative = errors.New("clickhouse config: max open conns must not be negative") + ErrMaxIdleConnsMustNotBeNegative = errors.New("clickhouse config: max idle conns must not be negative") + ErrConnMaxLifetimeMustNotBeNegative = errors.New("clickhouse config: conn max lifetime must not be negative") + ErrMaxIdleConnsExceedsMaxOpenConns = errors.New("clickhouse config: max idle conns must not exceed max open conns") +) \ No newline at end of file diff --git a/internal/clickhouse/config/ping.go b/internal/clickhouse/config/ping.go new file mode 100644 index 0000000..376c8fe --- /dev/null +++ b/internal/clickhouse/config/ping.go @@ -0,0 +1,36 @@ +package config + +import ( + "context" + "errors" + + chdriver "github.com/ClickHouse/clickhouse-go/v2" + clickhouse_errors "github.com/JupiterMetaLabs/ion/internal/clickhouse/config/errors" +) + +// Ping opens a temporary connection to ClickHouse using the given DSN, +// sends a ping, and closes the connection. It does not retain any state. +// +// Use this to validate a DSN at startup before calling Open(), or in +// health-check handlers to verify ClickHouse is reachable. +// +// The provided context controls the dial and ping timeout. A cancelled +// or deadline-exceeded context will propagate as an error. +func (cfg Config) Ping(ctx context.Context) error { + opts, err := chdriver.ParseDSN(cfg.DSN) + if err != nil { + return errors.New(clickhouse_errors.ErrInvalidDSN.Error() + ": " + err.Error()) + } + + conn, err := chdriver.Open(opts) + if err != nil { + return errors.New(clickhouse_errors.ErrOpenFailed.Error() + ": " + err.Error()) + } + // Open errors return above with no defer; from here conn is live and defer always runs on exit (including Ping errors). + defer conn.Close() //nolint:errcheck // best-effort close on a temp connection + + if err := conn.Ping(ctx); err != nil { + return errors.New(clickhouse_errors.ErrPingFailed.Error() + ": " + err.Error()) + } + return nil +} diff --git a/internal/clickhouse/core.go b/internal/clickhouse/core.go new file mode 100644 index 0000000..166d94d --- /dev/null +++ b/internal/clickhouse/core.go @@ -0,0 +1,190 @@ +package clickhouse + +import ( + "context" + "errors" + "fmt" + "time" + + chdriver "github.com/ClickHouse/clickhouse-go/v2" + "go.uber.org/zap/zapcore" + + "github.com/JupiterMetaLabs/ion/internal/clickhouse/config" +) + +// Core is a zapcore.Core that writes log entries to ClickHouse. +// Call New() to construct, Open() to connect, Shutdown() to close. +type Core struct { + Config config.Config + conn chdriver.Conn + level zapcore.Level + writer *batchWriter + presetFields []zapcore.Field +} + +// New validates cfg, applies defaults, pings ClickHouse, and returns a Core +// ready to be opened. Call Open() before attaching to a logger. +func New(ctx context.Context, cfg config.Config) (*Core, error) { + cfg = cfg.WithDefaults() + if err := cfg.Validate(); err != nil { + return nil, errors.New("clickhouse core: validation failed: " + err.Error()) + } + + var lvl zapcore.Level + _ = lvl.UnmarshalText([]byte(cfg.Level)) // already validated by Validate() + + pingCtx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + if err := cfg.Ping(pingCtx); err != nil { + return nil, errors.New("clickhouse core: ping failed: " + err.Error()) + } + + return &Core{Config: cfg, level: lvl}, nil +} + +// Open opens a persistent ClickHouse connection, optionally creates the table +// (when AutoSchema is true), and starts the background flush goroutine. +// Returns an error if called on an already-open Core. +func (c *Core) Open(ctx context.Context) error { + if c.conn != nil { + return errors.New("clickhouse core: already open") + } + opts, err := chdriver.ParseDSN(c.Config.DSN) + if err != nil { + return fmt.Errorf("clickhouse core: parse DSN: %w", err) + } + opts.MaxOpenConns = c.Config.MaxOpenConns + opts.MaxIdleConns = c.Config.MaxIdleConns + opts.ConnMaxLifetime = c.Config.ConnMaxLifetime + opts.DialTimeout = c.Config.DialTimeout + + conn, err := chdriver.Open(opts) + if err != nil { + return fmt.Errorf("clickhouse core: open: %w", err) + } + + if c.Config.AutoSchema { + if err := ensureSchema(ctx, conn, schemaConfig{Table: c.Config.Table}); err != nil { + _ = conn.Close() + return fmt.Errorf("clickhouse core: ensure schema: %w", err) + } + } + + c.conn = conn + c.writer = newBatchWriter(batchConfig{ + BatchSize: c.Config.BatchSize, + FlushInterval: c.Config.FlushInterval, + ChannelBuffer: c.Config.ChannelBuffer, + Table: c.Config.Table, + }, func(rows []logRow) error { + return c.flushRows(rows) + }) + c.writer.start() + return nil +} + +// Shutdown drains remaining buffered rows, flushes them, and closes the connection. +// Respects ctx deadline — returns ctx.Err() if the drain takes too long. +func (c *Core) Shutdown(ctx context.Context) error { + if c.writer != nil { + done := make(chan struct{}) + go func() { + c.writer.shutdown() + close(done) + }() + select { + case <-done: + case <-ctx.Done(): + return ctx.Err() + } + } + if c.conn != nil { + err := c.conn.Close() //nolint:wrapcheck // direct driver close + c.conn = nil + return err + } + return nil +} + +// DroppedCount returns the total log entries dropped due to a full buffer. +func (c *Core) DroppedCount() int64 { + if c.writer == nil { + return 0 + } + return c.writer.droppedCount() +} + +// --- zapcore.Core interface --- + +func (c *Core) Enabled(lvl zapcore.Level) bool { + return lvl >= c.level +} + +func (c *Core) With(fields []zapcore.Field) zapcore.Core { + clone := *c + clone.presetFields = make([]zapcore.Field, len(c.presetFields)+len(fields)) + copy(clone.presetFields, c.presetFields) + copy(clone.presetFields[len(c.presetFields):], fields) + return &clone +} + +func (c *Core) Check(entry zapcore.Entry, ce *zapcore.CheckedEntry) *zapcore.CheckedEntry { + if c.Enabled(entry.Level) { + return ce.AddCore(entry, c) + } + return ce +} + +func (c *Core) Write(entry zapcore.Entry, fields []zapcore.Field) error { + if c.writer == nil { + return nil + } + all := make([]zapcore.Field, 0, len(c.presetFields)+len(fields)) + all = append(all, c.presetFields...) + all = append(all, fields...) + c.writer.send(extractRow(entry, all)) + return nil +} + +func (c *Core) Sync() error { + if c.writer != nil { + c.writer.sync() + } + return nil +} + +// --- private --- + +func (c *Core) flushRows(rows []logRow) error { + ctx, cancel := context.WithTimeout(context.Background(), c.Config.WriteTimeout) + defer cancel() + b, err := c.conn.PrepareBatch(ctx, + fmt.Sprintf("INSERT INTO %s", c.Config.Table)) + if err != nil { + return err + } + for i := range rows { + normalizeRow(&rows[i]) + if err := b.AppendStruct(&rows[i]); err != nil { + return err + } + } + return b.Send() +} + +// normalizeRow replaces nil maps with empty maps before sending to ClickHouse. +// ClickHouse Map columns cannot be null; the driver requires non-nil maps. +func normalizeRow(row *logRow) { + if row.StrFields == nil { + row.StrFields = map[string]string{} + } + if row.IntFields == nil { + row.IntFields = map[string]int64{} + } + if row.FltFields == nil { + row.FltFields = map[string]float64{} + } + if row.BoolFields == nil { + row.BoolFields = map[string]uint8{} + } +} diff --git a/internal/clickhouse/core_test.go b/internal/clickhouse/core_test.go new file mode 100644 index 0000000..942d4cd --- /dev/null +++ b/internal/clickhouse/core_test.go @@ -0,0 +1,299 @@ +package clickhouse + +import ( + "context" + "errors" + "testing" + "time" + + chdriver "github.com/ClickHouse/clickhouse-go/v2" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + + "github.com/JupiterMetaLabs/ion/internal/clickhouse/config" +) + +// coreWithWriter builds a Core backed by a fake batchWriter — no real ClickHouse needed. +func coreWithWriter(t *testing.T, level zapcore.Level) (*Core, chan []logRow) { + t.Helper() + flushed := make(chan []logRow, 100) + bw := newBatchWriter(batchConfig{ + BatchSize: 1000, + FlushInterval: 10 * time.Second, + ChannelBuffer: 100, + }, func(rows []logRow) error { + flushed <- append([]logRow{}, rows...) + return nil + }) + bw.start() + t.Cleanup(bw.shutdown) + return &Core{level: level, writer: bw}, flushed +} + +// drainRows collects exactly n rows across potentially multiple flush batches. +func drainRows(t *testing.T, flushed <-chan []logRow, want int) []logRow { + t.Helper() + var all []logRow + deadline := time.After(2 * time.Second) + for len(all) < want { + select { + case rows := <-flushed: + all = append(all, rows...) + case <-deadline: + t.Fatalf("timed out: got %d rows, want %d", len(all), want) + } + } + return all +} + +// --- Enabled --- + +func TestCore_Enabled_InfoLevel(t *testing.T) { + c, _ := coreWithWriter(t, zapcore.InfoLevel) + cases := []struct { + lvl zapcore.Level + want bool + }{ + {zapcore.DebugLevel, false}, + {zapcore.InfoLevel, true}, + {zapcore.WarnLevel, true}, + {zapcore.ErrorLevel, true}, + } + for _, tc := range cases { + if got := c.Enabled(tc.lvl); got != tc.want { + t.Errorf("Enabled(%v) = %v, want %v", tc.lvl, got, tc.want) + } + } +} + +func TestCore_Enabled_DebugLevel(t *testing.T) { + c, _ := coreWithWriter(t, zapcore.DebugLevel) + if !c.Enabled(zapcore.DebugLevel) { + t.Error("Debug should be enabled on a debug-level core") + } +} + +// --- Check --- + +func TestCore_Check_EnabledLevelAddsCore(t *testing.T) { + c, _ := coreWithWriter(t, zapcore.InfoLevel) + entry := zapcore.Entry{Level: zapcore.InfoLevel, Message: "test"} + ce := c.Check(entry, nil) + if ce == nil { + t.Error("Check must return non-nil CheckedEntry for an enabled level") + } +} + +func TestCore_Check_DisabledLevelSkips(t *testing.T) { + c, _ := coreWithWriter(t, zapcore.InfoLevel) + entry := zapcore.Entry{Level: zapcore.DebugLevel, Message: "test"} + ce := c.Check(entry, nil) + if ce != nil { + t.Error("Check must return nil for a disabled level") + } +} + +// --- Write --- + +func TestCore_Write_ReturnsNil(t *testing.T) { + c, _ := coreWithWriter(t, zapcore.InfoLevel) + if err := c.Write(basicEntry(), nil); err != nil { + t.Errorf("Write must always return nil, got %v", err) + } +} + +func TestCore_Write_RowReachesWriter(t *testing.T) { + c, flushed := coreWithWriter(t, zapcore.InfoLevel) + entry := basicEntry() + entry.Message = "reach writer" + + _ = c.Write(entry, nil) + _ = c.Sync() + + rows := drainRows(t, flushed, 1) + if rows[0].Message != "reach writer" { + t.Errorf("Message: got %q, want %q", rows[0].Message, "reach writer") + } +} + +func TestCore_Write_LevelPreserved(t *testing.T) { + c, flushed := coreWithWriter(t, zapcore.DebugLevel) + entry := basicEntry() + entry.Level = zapcore.WarnLevel + + _ = c.Write(entry, nil) + _ = c.Sync() + + rows := drainRows(t, flushed, 1) + if rows[0].Level != "warn" { + t.Errorf("Level: got %q, want %q", rows[0].Level, "warn") + } +} + +func TestCore_Write_NilWriter_ReturnsNil(t *testing.T) { + c := &Core{level: zapcore.InfoLevel} // Open() never called + if err := c.Write(basicEntry(), nil); err != nil { + t.Errorf("Write with nil writer must return nil, got %v", err) + } +} + +// --- With --- + +func TestCore_With_DoesNotMutateReceiver(t *testing.T) { + c, _ := coreWithWriter(t, zapcore.InfoLevel) + before := len(c.presetFields) + + _ = c.With([]zapcore.Field{zap.String("k", "v")}) + + if len(c.presetFields) != before { + t.Errorf("With() mutated receiver: presetFields len %d → %d", before, len(c.presetFields)) + } +} + +func TestCore_With_PresetFieldsPropagated(t *testing.T) { + c, flushed := coreWithWriter(t, zapcore.InfoLevel) + child := c.With([]zapcore.Field{zap.String("env", "staging")}) + + _ = child.Write(basicEntry(), nil) + _ = child.Sync() + + rows := drainRows(t, flushed, 1) + if rows[0].StrFields["env"] != "staging" { + t.Errorf("StrFields[env]: got %q, want %q", rows[0].StrFields["env"], "staging") + } +} + +func TestCore_With_WriteFieldsDoNotPersist(t *testing.T) { + c, flushed := coreWithWriter(t, zapcore.InfoLevel) + + _ = c.Write(basicEntry(), []zapcore.Field{zap.String("once", "x")}) + _ = c.Sync() + drainRows(t, flushed, 1) // consume + + _ = c.Write(basicEntry(), nil) + _ = c.Sync() + rows := drainRows(t, flushed, 1) + if _, ok := rows[0].StrFields["once"]; ok { + t.Error("Write() field leaked into a subsequent Write() call") + } +} + +func TestCore_With_ChildSharesWriter(t *testing.T) { + c, flushed := coreWithWriter(t, zapcore.InfoLevel) + child := c.With([]zapcore.Field{zap.String("src", "child")}) + + _ = c.Write(basicEntry(), nil) + _ = child.Write(basicEntry(), nil) + _ = c.Sync() + + rows := drainRows(t, flushed, 2) + if len(rows) != 2 { + t.Errorf("expected 2 rows from parent+child sharing writer, got %d", len(rows)) + } +} + +// --- Sync --- + +func TestCore_Sync_FlushesBeforeTicker(t *testing.T) { + c, flushed := coreWithWriter(t, zapcore.InfoLevel) + + _ = c.Write(basicEntry(), nil) + if err := c.Sync(); err != nil { + t.Fatalf("Sync returned error: %v", err) + } + + select { + case rows := <-flushed: + if len(rows) != 1 { + t.Errorf("Sync flushed %d rows, want 1", len(rows)) + } + default: + t.Fatal("Sync returned but no rows were flushed") + } +} + +// --- Shutdown --- + +func TestCore_Shutdown_RespectsContextDeadline(t *testing.T) { + block := make(chan struct{}) + bw := newBatchWriter(batchConfig{ + BatchSize: 1, + FlushInterval: 10 * time.Second, + ChannelBuffer: 10, + }, func(rows []logRow) error { + <-block // never returns until test unblocks it + return nil + }) + bw.start() + bw.send(testLogRow()) + time.Sleep(30 * time.Millisecond) // let goroutine pick up the row and block in flush + + c := &Core{level: zapcore.InfoLevel, writer: bw} + ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) + defer cancel() + defer close(block) + + err := c.Shutdown(ctx) + if !errors.Is(err, context.DeadlineExceeded) { + t.Errorf("Shutdown: expected DeadlineExceeded, got %v", err) + } +} + +// --- DroppedCount --- + +func TestCore_Open_GuardsAgainstDoubleOpen(t *testing.T) { + // Simulate an already-open Core by assigning a non-nil conn. + // dummyConn embeds the interface so it satisfies it without implementing methods. + type dummyConn struct{ chdriver.Conn } + c := &Core{ + Config: config.Config{DSN: "clickhouse://localhost:9000/default"}.WithDefaults(), + level: zapcore.InfoLevel, + conn: dummyConn{}, + } + err := c.Open(context.Background()) + if err == nil { + t.Fatal("Open on already-open Core should return error, got nil") + } +} + +func TestCore_Shutdown_IsIdempotent(t *testing.T) { + bw := newBatchWriter(batchConfig{ + BatchSize: 1, + FlushInterval: time.Second, + ChannelBuffer: 10, + }, func([]logRow) error { return nil }) + bw.start() + c := &Core{level: zapcore.InfoLevel, writer: bw} + ctx := context.Background() + if err := c.Shutdown(ctx); err != nil { + t.Fatalf("first Shutdown: %v", err) + } + if err := c.Shutdown(ctx); err != nil { + t.Fatalf("second Shutdown: %v", err) + } +} + +func TestCore_DroppedCount_ReflectsWriterDrops(t *testing.T) { + block := make(chan struct{}) + bw := newBatchWriter(batchConfig{ + BatchSize: 1, + FlushInterval: 10 * time.Second, + ChannelBuffer: 1, + }, func(rows []logRow) error { + <-block + return nil + }) + bw.start() + c := &Core{level: zapcore.InfoLevel, writer: bw} + + for range 10 { + _ = c.Write(basicEntry(), nil) + } + + if c.DroppedCount() == 0 { + t.Error("DroppedCount should be > 0 when channel is full") + } + + close(block) + bw.shutdown() +} diff --git a/internal/clickhouse/row.go b/internal/clickhouse/row.go new file mode 100644 index 0000000..a20f135 --- /dev/null +++ b/internal/clickhouse/row.go @@ -0,0 +1,255 @@ +package clickhouse + +import ( + "encoding/json" + "fmt" + "math" + "strings" + "time" + + "go.uber.org/zap/zapcore" + + "github.com/JupiterMetaLabs/ion/internal/core" +) + +// logRow is the internal representation of a single log entry +// as it will be stored in ClickHouse. +type logRow struct { + Timestamp time.Time `ch:"timestamp"` + Level string `ch:"level"` + Service string `ch:"service"` + Version string `ch:"version"` + Logger string `ch:"logger"` + Message string `ch:"message"` + TraceID string `ch:"trace_id"` + SpanID string `ch:"span_id"` + RequestID string `ch:"request_id"` + UserID string `ch:"user_id"` + Caller string `ch:"caller"` + StrFields map[string]string `ch:"str_fields"` + IntFields map[string]int64 `ch:"int_fields"` + FltFields map[string]float64 `ch:"flt_fields"` + BoolFields map[string]uint8 `ch:"bool_fields"` + Extra string `ch:"extra"` +} + +// dedicatedKeys are field keys that map directly to top-level logRow columns +// instead of the typed map columns. +var dedicatedKeys = map[string]bool{ + "trace_id": true, + "span_id": true, + "request_id": true, + "user_id": true, + "service": true, + "version": true, +} + +// extractRow converts a zapcore Entry and its fields into a logRow. +// It runs once per log call in the caller's goroutine and must stay allocation-lean +// for fields that are typed primitives. +// +// Field routing: +// - Internal sentinel/system keys → dropped +// - Dedicated column keys → top-level row fields +// - String, Int, Uint, Float, Bool → typed map columns (lazy-allocated) +// - Duration, Stringer → StrFields (human-readable string) +// - Error, Reflect, and others → Extra (single JSON object, lazy-built) +func extractRow(entry zapcore.Entry, fields []zapcore.Field) logRow { + row := logRow{ + Timestamp: entry.Time, + Level: entry.Level.String(), + Logger: entry.LoggerName, + Message: entry.Message, + } + + if entry.Caller.Defined { + row.Caller = entry.Caller.String() + } + + if len(fields) == 0 { + return row + } + + // overflow accumulates fields that cannot be stored in typed columns. + // Lazy-allocated: nil until the first overflow field is encountered. + var overflow map[string]any + + for _, f := range fields { + // Drop internal ion infrastructure keys. + if f.Key == core.SentinelKey || strings.HasPrefix(f.Key, core.SystemFieldPrefix) { + continue + } + + // Route dedicated column keys to top-level fields. + if dedicatedKeys[f.Key] { + switch f.Key { + case "trace_id": + row.TraceID = f.String + case "span_id": + row.SpanID = f.String + case "request_id": + row.RequestID = f.String + case "user_id": + row.UserID = f.String + case "service": + row.Service = f.String + case "version": + row.Version = f.String + } + continue + } + + switch f.Type { + // --- String-like --- + case zapcore.StringType: + if row.StrFields == nil { + row.StrFields = make(map[string]string) + } + row.StrFields[f.Key] = f.String + + case zapcore.DurationType: + if row.StrFields == nil { + row.StrFields = make(map[string]string) + } + row.StrFields[f.Key] = time.Duration(f.Integer).String() + + case zapcore.StringerType: + if row.StrFields == nil { + row.StrFields = make(map[string]string) + } + if s, ok := f.Interface.(fmt.Stringer); ok { + row.StrFields[f.Key] = s.String() + } + + // --- Signed integers — value in f.Integer --- + case zapcore.Int64Type, zapcore.Int32Type, zapcore.Int16Type, zapcore.Int8Type: + if row.IntFields == nil { + row.IntFields = make(map[string]int64) + } + row.IntFields[f.Key] = f.Integer + + // --- Unsigned integers --- + // Uint64 values > math.MaxInt64 cannot round-trip through Int64 without + // becoming negative. Route them to overflow instead of silently corrupting. + case zapcore.Uint64Type: + if u := uint64(f.Integer); u > math.MaxInt64 { + if overflow == nil { + overflow = make(map[string]any) + } + overflow[f.Key] = u + } else { + if row.IntFields == nil { + row.IntFields = make(map[string]int64) + } + row.IntFields[f.Key] = f.Integer + } + + case zapcore.Uint32Type, zapcore.Uint16Type, zapcore.Uint8Type: + if row.IntFields == nil { + row.IntFields = make(map[string]int64) + } + row.IntFields[f.Key] = f.Integer + + // --- Floats — bits packed into f.Integer by zap --- + case zapcore.Float64Type: + if row.FltFields == nil { + row.FltFields = make(map[string]float64) + } + row.FltFields[f.Key] = math.Float64frombits(uint64(f.Integer)) + + case zapcore.Float32Type: + if row.FltFields == nil { + row.FltFields = make(map[string]float64) + } + row.FltFields[f.Key] = float64(math.Float32frombits(uint32(f.Integer))) + + // --- Bool — 0 or 1 in f.Integer --- + case zapcore.BoolType: + if row.BoolFields == nil { + row.BoolFields = make(map[string]uint8) + } + row.BoolFields[f.Key] = uint8(f.Integer) //nolint:gosec // Integer is always 0 or 1 for BoolType + + // --- ByteString — []byte that represents a valid UTF-8 string --- + // zap.ByteString() uses this type. JSON encoders treat it as a string, + // not as binary (base64). Route to StrFields for consistency. + case zapcore.ByteStringType: + if b, ok := f.Interface.([]byte); ok { + if row.StrFields == nil { + row.StrFields = make(map[string]string) + } + row.StrFields[f.Key] = string(b) + } + + // --- Overflow: error message as plain string --- + case zapcore.ErrorType: + if err, ok := f.Interface.(error); ok { + if overflow == nil { + overflow = make(map[string]any) + } + overflow[f.Key] = err.Error() + } + + // --- Time — stored as UnixNano in f.Integer, timezone in f.Interface --- + case zapcore.TimeType: + t := time.Unix(0, f.Integer) + if loc, ok := f.Interface.(*time.Location); ok && loc != nil { + t = t.In(loc) + } + if row.StrFields == nil { + row.StrFields = make(map[string]string) + } + row.StrFields[f.Key] = t.Format(time.RFC3339Nano) + + // --- TimeFullType — full time.Time in f.Interface --- + case zapcore.TimeFullType: + if t, ok := f.Interface.(time.Time); ok { + if row.StrFields == nil { + row.StrFields = make(map[string]string) + } + row.StrFields[f.Key] = t.Format(time.RFC3339Nano) + } + + // --- Complex numbers — json.Marshal cannot handle complex types --- + case zapcore.Complex128Type: + if c, ok := f.Interface.(complex128); ok { + if row.StrFields == nil { + row.StrFields = make(map[string]string) + } + row.StrFields[f.Key] = fmt.Sprintf("%v", c) + } + + case zapcore.Complex64Type: + if c, ok := f.Interface.(complex64); ok { + if row.StrFields == nil { + row.StrFields = make(map[string]string) + } + row.StrFields[f.Key] = fmt.Sprintf("%v", c) + } + + // --- Namespace — not representable in a flat Map schema --- + case zapcore.NamespaceType: + + // --- Skip intentional no-ops --- + case zapcore.SkipType: + + // --- Overflow: everything else marshalled as JSON value --- + default: + if f.Interface != nil { + if overflow == nil { + overflow = make(map[string]any) + } + overflow[f.Key] = f.Interface + } + } + } + + if len(overflow) > 0 { + if b, err := json.Marshal(overflow); err == nil { + row.Extra = string(b) + } + // json.Marshal failure: silently omit Extra — other fields still written. + } + + return row +} diff --git a/internal/clickhouse/row_test.go b/internal/clickhouse/row_test.go new file mode 100644 index 0000000..8f8028e --- /dev/null +++ b/internal/clickhouse/row_test.go @@ -0,0 +1,396 @@ +package clickhouse + +import ( + "errors" + "math" + "strings" + "testing" + "time" + + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + + "github.com/JupiterMetaLabs/ion/internal/core" +) + +// --- entry-level fields --- + +func TestExtractRow_EntryFields(t *testing.T) { + now := time.Now() + entry := zapcore.Entry{ + Level: zapcore.InfoLevel, + Time: now, + Message: "hello world", + LoggerName: "svc.sub", + } + + row := extractRow(entry, nil) + + if !row.Timestamp.Equal(now) { + t.Errorf("Timestamp: got %v, want %v", row.Timestamp, now) + } + if row.Level != "info" { + t.Errorf("Level: got %q, want %q", row.Level, "info") + } + if row.Message != "hello world" { + t.Errorf("Message: got %q, want %q", row.Message, "hello world") + } + if row.Logger != "svc.sub" { + t.Errorf("Logger: got %q, want %q", row.Logger, "svc.sub") + } +} + +func TestExtractRow_CallerSet(t *testing.T) { + entry := basicEntry() + entry.Caller = zapcore.EntryCaller{Defined: true, File: "main.go", Line: 42} + + row := extractRow(entry, nil) + + if row.Caller == "" { + t.Error("Caller: expected non-empty, got empty") + } +} + +func TestExtractRow_CallerNotSet(t *testing.T) { + row := extractRow(basicEntry(), nil) + if row.Caller != "" { + t.Errorf("Caller: expected empty, got %q", row.Caller) + } +} + +// --- typed primitive fields → typed map columns --- + +func TestExtractRow_StringField(t *testing.T) { + row := extractRow(basicEntry(), []zapcore.Field{ + zap.String("user", "alice"), + }) + if row.StrFields["user"] != "alice" { + t.Errorf("StrFields[user]: got %q, want %q", row.StrFields["user"], "alice") + } +} + +func TestExtractRow_IntFields(t *testing.T) { + row := extractRow(basicEntry(), []zapcore.Field{ + zap.Int64("i64", 100), + zap.Int32("i32", 200), + zap.Int16("i16", 300), + zap.Int8("i8", 127), + }) + for key, want := range map[string]int64{"i64": 100, "i32": 200, "i16": 300, "i8": 127} { + if row.IntFields[key] != want { + t.Errorf("IntFields[%s]: got %d, want %d", key, row.IntFields[key], want) + } + } +} + +func TestExtractRow_UintFields(t *testing.T) { + row := extractRow(basicEntry(), []zapcore.Field{ + zap.Uint64("u64", 9999), + zap.Uint32("u32", 8888), + zap.Uint16("u16", 7777), + zap.Uint8("u8", 255), + }) + for key, want := range map[string]int64{"u64": 9999, "u32": 8888, "u16": 7777, "u8": 255} { + if row.IntFields[key] != want { + t.Errorf("IntFields[%s]: got %d, want %d", key, row.IntFields[key], want) + } + } +} + +func TestExtractRow_FloatFields(t *testing.T) { + row := extractRow(basicEntry(), []zapcore.Field{ + zap.Float64("pi", 3.14159), + zap.Float32("e", 2.71), + }) + if diff := absf(row.FltFields["pi"] - 3.14159); diff > 1e-9 { + t.Errorf("FltFields[pi]: got %v, diff %v", row.FltFields["pi"], diff) + } + if diff := absf(row.FltFields["e"] - float64(float32(2.71))); diff > 1e-6 { + t.Errorf("FltFields[e]: got %v, diff %v", row.FltFields["e"], diff) + } +} + +func TestExtractRow_BoolField(t *testing.T) { + row := extractRow(basicEntry(), []zapcore.Field{ + zap.Bool("yes", true), + zap.Bool("no", false), + }) + if row.BoolFields["yes"] != 1 { + t.Errorf("BoolFields[yes]: got %d, want 1", row.BoolFields["yes"]) + } + if row.BoolFields["no"] != 0 { + t.Errorf("BoolFields[no]: got %d, want 0", row.BoolFields["no"]) + } +} + +func TestExtractRow_DurationGoesToStrFields(t *testing.T) { + row := extractRow(basicEntry(), []zapcore.Field{ + zap.Duration("elapsed", 250*time.Millisecond), + }) + if row.StrFields["elapsed"] != "250ms" { + t.Errorf("StrFields[elapsed]: got %q, want %q", row.StrFields["elapsed"], "250ms") + } +} + +func TestExtractRow_ByteStringGoesToStrFields(t *testing.T) { + row := extractRow(basicEntry(), []zapcore.Field{ + zap.ByteString("token", []byte("abc123")), + }) + if row.StrFields["token"] != "abc123" { + t.Errorf("StrFields[token]: got %q, want %q", row.StrFields["token"], "abc123") + } +} + +func TestExtractRow_StringerGoesToStrFields(t *testing.T) { + row := extractRow(basicEntry(), []zapcore.Field{ + zap.Stringer("addr", netIPStringer("127.0.0.1")), + }) + if row.StrFields["addr"] != "127.0.0.1" { + t.Errorf("StrFields[addr]: got %q, want %q", row.StrFields["addr"], "127.0.0.1") + } +} + +// netIPStringer is a minimal fmt.Stringer for use in Stringer test. +type netIPStringer string + +func (n netIPStringer) String() string { return string(n) } + +// --- overflow types → Extra JSON --- + +func TestExtractRow_ErrorGoesToExtra(t *testing.T) { + row := extractRow(basicEntry(), []zapcore.Field{ + zap.Error(errors.New("something went wrong")), + }) + if row.Extra == "" { + t.Fatal("Extra: expected JSON, got empty") + } + if !strings.Contains(row.Extra, "something went wrong") { + t.Errorf("Extra: %q missing error message", row.Extra) + } +} + +func TestExtractRow_ReflectGoesToExtra(t *testing.T) { + type payload struct { + Name string + Value int + } + row := extractRow(basicEntry(), []zapcore.Field{ + zap.Reflect("data", payload{Name: "ion", Value: 42}), + }) + if !strings.Contains(row.Extra, `"Name"`) { + t.Errorf("Extra: %q missing reflected struct field", row.Extra) + } + if !strings.Contains(row.Extra, "ion") { + t.Errorf("Extra: %q missing reflected value", row.Extra) + } +} + +func TestExtractRow_MultipleOverflowFields_SingleJSONObject(t *testing.T) { + type blob struct{ X int } + row := extractRow(basicEntry(), []zapcore.Field{ + zap.Error(errors.New("oops")), + zap.Reflect("meta", blob{X: 7}), + }) + + if row.Extra == "" { + t.Fatal("Extra: expected JSON object, got empty") + } + if !strings.HasPrefix(row.Extra, "{") || !strings.HasSuffix(row.Extra, "}") { + t.Errorf("Extra: expected JSON object, got %q", row.Extra) + } + if !strings.Contains(row.Extra, `"error"`) { + t.Errorf("Extra missing error key: %s", row.Extra) + } + if !strings.Contains(row.Extra, `"meta"`) { + t.Errorf("Extra missing meta key: %s", row.Extra) + } +} + +// --- internal keys filtered out --- + +func TestExtractRow_SentinelKeySkipped(t *testing.T) { + row := extractRow(basicEntry(), []zapcore.Field{ + {Key: core.SentinelKey, Type: zapcore.ReflectType, Interface: "ctx-value"}, + zap.String("keep", "me"), + }) + + if _, found := row.StrFields[core.SentinelKey]; found { + t.Error("SentinelKey must not appear in StrFields") + } + if strings.Contains(row.Extra, core.SentinelKey) { + t.Error("SentinelKey must not appear in Extra") + } + if row.StrFields["keep"] != "me" { + t.Errorf("non-sentinel field lost: got %q", row.StrFields["keep"]) + } +} + +func TestExtractRow_SystemPrefixSkipped(t *testing.T) { + row := extractRow(basicEntry(), []zapcore.Field{ + {Key: core.SystemFieldPrefix + "hidden", Type: zapcore.StringType, String: "secret"}, + zap.String("visible", "yes"), + }) + + if _, found := row.StrFields[core.SystemFieldPrefix+"hidden"]; found { + t.Error("SystemFieldPrefix key must be skipped") + } + if row.StrFields["visible"] != "yes" { + t.Errorf("visible field lost: got %q", row.StrFields["visible"]) + } +} + +// --- dedicated column routing --- + +func TestExtractRow_DedicatedColumnsRouted(t *testing.T) { + row := extractRow(basicEntry(), []zapcore.Field{ + zap.String("trace_id", "trace-abc"), + zap.String("span_id", "span-def"), + zap.String("request_id", "req-ghi"), + zap.String("user_id", "usr-jkl"), + zap.String("service", "payment-svc"), + zap.String("version", "v2.0.0"), + }) + + cases := map[string]string{ + "TraceID": row.TraceID, "SpanID": row.SpanID, + "RequestID": row.RequestID, "UserID": row.UserID, + "Service": row.Service, "Version": row.Version, + } + wants := map[string]string{ + "TraceID": "trace-abc", "SpanID": "span-def", + "RequestID": "req-ghi", "UserID": "usr-jkl", + "Service": "payment-svc", "Version": "v2.0.0", + } + for field, got := range cases { + if got != wants[field] { + t.Errorf("%s: got %q, want %q", field, got, wants[field]) + } + } +} + +func TestExtractRow_DedicatedColumnsNotInStrFields(t *testing.T) { + row := extractRow(basicEntry(), []zapcore.Field{ + zap.String("trace_id", "t"), + zap.String("service", "s"), + }) + for _, key := range []string{"trace_id", "span_id", "request_id", "user_id", "service", "version"} { + if _, found := row.StrFields[key]; found { + t.Errorf("dedicated column key %q leaked into StrFields", key) + } + } +} + +// --- lazy allocation --- + +func TestExtractRow_NoFieldsNoMapsAllocated(t *testing.T) { + row := extractRow(basicEntry(), nil) + + if row.StrFields != nil { + t.Error("StrFields must be nil when empty") + } + if row.IntFields != nil { + t.Error("IntFields must be nil when empty") + } + if row.FltFields != nil { + t.Error("FltFields must be nil when empty") + } + if row.BoolFields != nil { + t.Error("BoolFields must be nil when empty") + } + if row.Extra != "" { + t.Errorf("Extra must be empty when no overflow fields: got %q", row.Extra) + } +} + +// --- uint64 overflow --- + +func TestExtractRow_LargeUint64_GoesToExtra(t *testing.T) { + // math.MaxInt64+1 wraps to a negative int64 — silent data corruption + largeU64 := uint64(math.MaxInt64) + 1 + row := extractRow(basicEntry(), []zapcore.Field{ + {Key: "big", Type: zapcore.Uint64Type, Integer: int64(largeU64)}, + }) + if _, ok := row.IntFields["big"]; ok { + t.Error("large uint64 (> MaxInt64) must not be stored in IntFields — it would be negative") + } + if !strings.Contains(row.Extra, "big") { + t.Errorf("large uint64 must appear in Extra, got Extra=%q", row.Extra) + } +} + +func TestExtractRow_SmallUint64_StaysInIntFields(t *testing.T) { + row := extractRow(basicEntry(), []zapcore.Field{ + zap.Uint64("count", 42), + }) + if row.IntFields["count"] != 42 { + t.Errorf("IntFields[count]: got %d, want 42", row.IntFields["count"]) + } + if strings.Contains(row.Extra, "count") { + t.Error("small uint64 must not appear in Extra") + } +} + +// --- time fields --- + +func TestExtractRow_TimeGoesToStrFields(t *testing.T) { + now := time.Now().UTC().Truncate(time.Nanosecond) + row := extractRow(basicEntry(), []zapcore.Field{ + zap.Time("when", now), + }) + if row.StrFields["when"] == "" { + t.Fatal("TimeType field missing from StrFields") + } + if _, err := time.Parse(time.RFC3339Nano, row.StrFields["when"]); err != nil { + t.Errorf("StrFields[when] is not RFC3339Nano: %v (got %q)", err, row.StrFields["when"]) + } +} + +func TestExtractRow_TimeFullGoesToStrFields(t *testing.T) { + now := time.Now().UTC().Truncate(time.Nanosecond) + row := extractRow(basicEntry(), []zapcore.Field{ + {Key: "when", Type: zapcore.TimeFullType, Interface: now}, + }) + if row.StrFields["when"] == "" { + t.Fatal("TimeFullType field missing from StrFields") + } +} + +// --- complex numbers --- + +func TestExtractRow_Complex128GoesToStrFields(t *testing.T) { + row := extractRow(basicEntry(), []zapcore.Field{ + zap.Complex128("c", 3+4i), + }) + if row.StrFields["c"] == "" { + t.Fatal("Complex128 field missing from StrFields") + } + if !strings.Contains(row.StrFields["c"], "3") { + t.Errorf("StrFields[c] missing real part: got %q", row.StrFields["c"]) + } +} + +func TestExtractRow_Complex64GoesToStrFields(t *testing.T) { + row := extractRow(basicEntry(), []zapcore.Field{ + zap.Complex64("c", 1+2i), + }) + if row.StrFields["c"] == "" { + t.Fatal("Complex64 field missing from StrFields") + } +} + +// --- helpers --- + +func basicEntry() zapcore.Entry { + return zapcore.Entry{ + Level: zapcore.InfoLevel, + Time: time.Now(), + Message: "msg", + } +} + +func absf(x float64) float64 { + if x < 0 { + return -x + } + return x +} diff --git a/internal/clickhouse/schema.go b/internal/clickhouse/schema.go new file mode 100644 index 0000000..164d7ea --- /dev/null +++ b/internal/clickhouse/schema.go @@ -0,0 +1,61 @@ +package clickhouse + +import ( + "context" + _ "embed" + "fmt" + "regexp" + + chdriver "github.com/ClickHouse/clickhouse-go/v2" + + "github.com/JupiterMetaLabs/ion/internal/clickhouse/config" +) + +// validIdentifier matches unquoted ClickHouse identifiers: optional db prefix, no special chars. +var validIdentifier = regexp.MustCompile(`^[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)?$`) + +/* + Below line is used to embed the create table sql into the binary. its not a comment. +*/ + +//go:embed sql/create_table.sql +var createTableSQL string + +type schemaConfig struct { + Table string +} + +type schemaExecer interface { + Exec(ctx context.Context, query string, args ...any) error +} + +// BuildDDL returns the CREATE TABLE IF NOT EXISTS DDL for the given table name. +// Panics if table is not a valid unquoted SQL identifier (letters, digits, underscores; +// optional db.table prefix). This is intentional: an invalid identifier at startup +// is a misconfiguration bug, not a recoverable runtime error. +func BuildDDL(table string) string { + if !validIdentifier.MatchString(table) { + panic(fmt.Sprintf("clickhouse: BuildDDL called with invalid table identifier %q", table)) + } + return fmt.Sprintf(createTableSQL, table) +} + +func ensureSchema(ctx context.Context, exec schemaExecer, cfg schemaConfig) error { + return exec.Exec(ctx, BuildDDL(cfg.Table)) +} + +// RunEnsureSchema opens a temporary connection using cfg, creates the table +// for the given table name if it does not exist, then closes the connection. +// Safe to call on every startup — the DDL is idempotent (IF NOT EXISTS). +func RunEnsureSchema(ctx context.Context, cfg config.Config, table string) error { + opts, err := chdriver.ParseDSN(cfg.DSN) + if err != nil { + return fmt.Errorf("clickhouse schema: parse DSN: %w", err) + } + conn, err := chdriver.Open(opts) + if err != nil { + return fmt.Errorf("clickhouse schema: open: %w", err) + } + defer conn.Close() //nolint:errcheck // best-effort close on a temp connection + return ensureSchema(ctx, conn, schemaConfig{Table: table}) +} diff --git a/internal/clickhouse/schema_test.go b/internal/clickhouse/schema_test.go new file mode 100644 index 0000000..360669e --- /dev/null +++ b/internal/clickhouse/schema_test.go @@ -0,0 +1,42 @@ +package clickhouse + +import ( + "fmt" + "strings" + "testing" +) + +// --- identifier validation --- + +func TestBuildDDL_RejectsInvalidIdentifiers(t *testing.T) { + cases := []string{ + "", + "ion logs", + "ion-logs", + "1table", + "'; DROP TABLE ion_logs; --", + "ion_logs; DROP TABLE", + "ion_logs/*comment*/", + } + for _, name := range cases { + t.Run(fmt.Sprintf("%q", name), func(t *testing.T) { + defer func() { + if r := recover(); r == nil { + t.Errorf("BuildDDL(%q) should panic on invalid identifier, but did not", name) + } + }() + _ = BuildDDL(name) + }) + } +} + +func TestBuildDDL_AcceptsValidIdentifiers(t *testing.T) { + for _, name := range []string{"ion_logs", "_private", "MyTable", "DB123", "db.table", "my_db.my_logs"} { + t.Run(name, func(t *testing.T) { + ddl := BuildDDL(name) // must not panic + if !strings.Contains(ddl, name) { + t.Errorf("DDL missing identifier %q", name) + } + }) + } +} diff --git a/internal/clickhouse/sql/create_table.sql b/internal/clickhouse/sql/create_table.sql new file mode 100644 index 0000000..92ace9f --- /dev/null +++ b/internal/clickhouse/sql/create_table.sql @@ -0,0 +1,24 @@ +CREATE TABLE IF NOT EXISTS %s +( + timestamp DateTime64(9, 'UTC'), + level LowCardinality(String), + service LowCardinality(String), + version LowCardinality(String), + logger String, + message String, + trace_id String, + span_id String, + request_id String, + user_id String, + caller String, + str_fields Map(String, String), + int_fields Map(String, Int64), + flt_fields Map(String, Float64), + bool_fields Map(String, UInt8), + extra String +) +ENGINE = MergeTree() +PARTITION BY toYYYYMM(timestamp) +ORDER BY (service, level, timestamp) +TTL timestamp + INTERVAL 30 DAY DELETE +SETTINGS index_granularity = 8192 diff --git a/internal/clickhouse/tests/config_test.go b/internal/clickhouse/tests/config_test.go new file mode 100644 index 0000000..12f95a8 --- /dev/null +++ b/internal/clickhouse/tests/config_test.go @@ -0,0 +1,286 @@ +package tests + +import ( + "context" + "fmt" + "os" + "strings" + "testing" + "time" + + "github.com/JupiterMetaLabs/ion/internal/clickhouse" + "github.com/JupiterMetaLabs/ion/internal/clickhouse/config" + cherrors "github.com/JupiterMetaLabs/ion/internal/clickhouse/config/errors" +) + +// --- Validate() — no connection required --- + +func TestValidate_EmptyDSN(t *testing.T) { + err := config.Config{}.Validate() + if err == nil { + t.Fatal("expected error for empty DSN, got nil") + } + if !strings.Contains(err.Error(), cherrors.ErrInvalidDSN.Error()) { + t.Errorf("error %q missing sentinel %q", err, cherrors.ErrInvalidDSN) + } +} + +func TestValidate_InvalidLevel(t *testing.T) { + err := config.Config{DSN: "clickhouse://localhost:9000/default", Level: "nonsense"}.Validate() + if err == nil { + t.Fatal("expected error for invalid level, got nil") + } + if !strings.Contains(err.Error(), cherrors.ErrInvalidLevel.Error()) { + t.Errorf("error %q missing sentinel %q", err, cherrors.ErrInvalidLevel) + } +} + +func TestValidate_NegativeBatchSize(t *testing.T) { + err := config.Config{DSN: "clickhouse://localhost:9000/default", BatchSize: -1}.Validate() + if err == nil { + t.Fatal("expected error for negative BatchSize, got nil") + } + if !strings.Contains(err.Error(), cherrors.ErrBatchSizeMustNotBeNegative.Error()) { + t.Errorf("error %q missing sentinel %q", err, cherrors.ErrBatchSizeMustNotBeNegative) + } +} + +func TestValidate_NegativeChannelBuffer(t *testing.T) { + err := config.Config{DSN: "clickhouse://localhost:9000/default", ChannelBuffer: -1}.Validate() + if err == nil { + t.Fatal("expected error for negative ChannelBuffer, got nil") + } + if !strings.Contains(err.Error(), cherrors.ErrChannelBufferMustNotBeNegative.Error()) { + t.Errorf("error %q missing sentinel %q", err, cherrors.ErrChannelBufferMustNotBeNegative) + } +} + +func TestValidate_ValidConfig(t *testing.T) { + // WithDefaults() must precede Validate() — same order New() uses internally. + err := config.Config{DSN: "clickhouse://localhost:9000/default"}.WithDefaults().Validate() + if err != nil { + t.Fatalf("unexpected validation error: %v", err) + } +} + +// New() propagates validation errors before it ever pings. + +func TestNew_RejectsInvalidConfig(t *testing.T) { + cases := []struct { + name string + cfg config.Config + }{ + {"empty DSN", config.Config{}}, + {"invalid level", config.Config{DSN: "clickhouse://localhost:9000/default", Level: "bad"}}, + {"negative batch size", config.Config{DSN: "clickhouse://localhost:9000/default", BatchSize: -1}}, + {"negative channel buffer", config.Config{DSN: "clickhouse://localhost:9000/default", ChannelBuffer: -1}}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + _, err := clickhouse.New(context.Background(), tc.cfg) + if err == nil { + t.Fatal("expected error, got nil") + } + }) + } +} + +// --- WithDefaults() — no connection required --- + +func TestDefaults_AppliedOnZeroValues(t *testing.T) { + cfg := config.Config{DSN: "clickhouse://localhost:9000/default"}.WithDefaults() + + checks := []struct { + field string + got any + want any + }{ + {"Table", cfg.Table, "ion_logs"}, + {"Level", cfg.Level, "info"}, + {"BatchSize", cfg.BatchSize, 1000}, + {"FlushInterval", cfg.FlushInterval, 5 * time.Second}, + {"ChannelBuffer", cfg.ChannelBuffer, 10000}, + {"DialTimeout", cfg.DialTimeout, 10 * time.Second}, + {"WriteTimeout", cfg.WriteTimeout, 30 * time.Second}, + {"MaxOpenConns", cfg.MaxOpenConns, 5}, + {"MaxIdleConns", cfg.MaxIdleConns, 5}, + {"ConnMaxLifetime", cfg.ConnMaxLifetime, time.Hour}, + } + for _, c := range checks { + if c.got != c.want { + t.Errorf("%s: got %v, want %v", c.field, c.got, c.want) + } + } +} + +func TestDefaults_UserValuesNotOverwritten(t *testing.T) { + cfg := config.Config{ + DSN: "clickhouse://localhost:9000/default", + Table: "my_logs", + Level: "debug", + BatchSize: 500, + FlushInterval: 2 * time.Second, + ChannelBuffer: 5000, + MaxOpenConns: 10, + }.WithDefaults() + + checks := []struct { + field string + got any + want any + }{ + {"Table", cfg.Table, "my_logs"}, + {"Level", cfg.Level, "debug"}, + {"BatchSize", cfg.BatchSize, 500}, + {"FlushInterval", cfg.FlushInterval, 2 * time.Second}, + {"ChannelBuffer", cfg.ChannelBuffer, 5000}, + {"MaxOpenConns", cfg.MaxOpenConns, 10}, + } + for _, c := range checks { + if c.got != c.want { + t.Errorf("%s overwritten: got %v, want %v", c.field, c.got, c.want) + } + } +} + +// --- Integration: New() success requires a live ClickHouse --- +// Skipped automatically when CLICKHOUSE_TEST_DSN is not set. + +func TestNew_ConnectsAndExposesConfig(t *testing.T) { + dsn := os.Getenv("CLICKHOUSE_TEST_DSN") + if dsn == "" { + t.Skip("CLICKHOUSE_TEST_DSN not set — skipping integration test") + } + + core, err := clickhouse.New(context.Background(), config.Config{DSN: dsn}) + if err != nil { + t.Fatalf("New() failed: %v", err) + } + if core == nil { + t.Fatal("expected non-nil Core") + } + if core.Config.DSN != dsn { + t.Errorf("Config.DSN: got %q, want %q", core.Config.DSN, dsn) + } +} + +// --- extended Validate() coverage --- + +func TestValidate_NegativeFlushInterval(t *testing.T) { + cfg := config.Config{DSN: "clickhouse://localhost:9000/default"}.WithDefaults().SetFlushInterval(-1) + err := cfg.Validate() + if err == nil { + t.Fatal("expected error for negative FlushInterval, got nil") + } + if !strings.Contains(err.Error(), cherrors.ErrFlushIntervalMustNotBeNegative.Error()) { + t.Errorf("error %q missing sentinel %q", err, cherrors.ErrFlushIntervalMustNotBeNegative) + } +} + +func TestValidate_NegativeDialTimeout(t *testing.T) { + cfg := config.Config{DSN: "clickhouse://localhost:9000/default"}.WithDefaults().SetDialTimeout(-1) + err := cfg.Validate() + if err == nil { + t.Fatal("expected error for negative DialTimeout, got nil") + } + if !strings.Contains(err.Error(), cherrors.ErrDialTimeoutMustNotBeNegative.Error()) { + t.Errorf("error %q missing sentinel %q", err, cherrors.ErrDialTimeoutMustNotBeNegative) + } +} + +func TestValidate_NegativeWriteTimeout(t *testing.T) { + cfg := config.Config{DSN: "clickhouse://localhost:9000/default"}.WithDefaults().SetWriteTimeout(-1) + err := cfg.Validate() + if err == nil { + t.Fatal("expected error for negative WriteTimeout, got nil") + } + if !strings.Contains(err.Error(), cherrors.ErrWriteTimeoutMustNotBeNegative.Error()) { + t.Errorf("error %q missing sentinel %q", err, cherrors.ErrWriteTimeoutMustNotBeNegative) + } +} + +func TestValidate_NegativeMaxOpenConns(t *testing.T) { + cfg := config.Config{DSN: "clickhouse://localhost:9000/default"}.WithDefaults().SetMaxOpenConns(-1) + err := cfg.Validate() + if err == nil { + t.Fatal("expected error for negative MaxOpenConns, got nil") + } + if !strings.Contains(err.Error(), cherrors.ErrMaxOpenConnsMustNotBeNegative.Error()) { + t.Errorf("error %q missing sentinel %q", err, cherrors.ErrMaxOpenConnsMustNotBeNegative) + } +} + +func TestValidate_NegativeMaxIdleConns(t *testing.T) { + cfg := config.Config{DSN: "clickhouse://localhost:9000/default"}.WithDefaults().SetMaxIdleConns(-1) + err := cfg.Validate() + if err == nil { + t.Fatal("expected error for negative MaxIdleConns, got nil") + } + if !strings.Contains(err.Error(), cherrors.ErrMaxIdleConnsMustNotBeNegative.Error()) { + t.Errorf("error %q missing sentinel %q", err, cherrors.ErrMaxIdleConnsMustNotBeNegative) + } +} + +func TestValidate_MaxIdleConnsExceedsMaxOpenConns(t *testing.T) { + cfg := config.Config{DSN: "clickhouse://localhost:9000/default"}.WithDefaults().SetMaxOpenConns(3).SetMaxIdleConns(5) + err := cfg.Validate() + if err == nil { + t.Fatal("expected error when MaxIdleConns > MaxOpenConns, got nil") + } + if !strings.Contains(err.Error(), cherrors.ErrMaxIdleConnsExceedsMaxOpenConns.Error()) { + t.Errorf("error %q missing sentinel %q", err, cherrors.ErrMaxIdleConnsExceedsMaxOpenConns) + } +} + +func TestValidate_NegativeConnMaxLifetime(t *testing.T) { + cfg := config.Config{DSN: "clickhouse://localhost:9000/default"}.WithDefaults().SetConnMaxLifetime(-1) + err := cfg.Validate() + if err == nil { + t.Fatal("expected error for negative ConnMaxLifetime, got nil") + } + if !strings.Contains(err.Error(), cherrors.ErrConnMaxLifetimeMustNotBeNegative.Error()) { + t.Errorf("error %q missing sentinel %q", err, cherrors.ErrConnMaxLifetimeMustNotBeNegative) + } +} + +func TestValidate_EmptyLevelIsAccepted(t *testing.T) { + // Validate() must not error on empty Level — empty means "will use default". + // Callers who don't call WithDefaults() should not get a spurious ErrInvalidLevel. + err := config.Config{DSN: "clickhouse://localhost:9000/default", BatchSize: -1}.Validate() + if err == nil { + t.Fatal("expected error for negative BatchSize, got nil") + } + if strings.Contains(err.Error(), cherrors.ErrInvalidLevel.Error()) { + t.Errorf("Validate() with empty Level must not return ErrInvalidLevel; got: %v", err) + } + if !strings.Contains(err.Error(), cherrors.ErrBatchSizeMustNotBeNegative.Error()) { + t.Errorf("expected ErrBatchSizeMustNotBeNegative in error; got: %v", err) + } +} + +func TestValidate_WarningLevelIsAccepted(t *testing.T) { + cfg := config.Config{DSN: "clickhouse://localhost:9000/default"}.WithDefaults() + cfg.Level = "warning" + if err := cfg.Validate(); err != nil { + t.Errorf("Validate() rejected 'warning' level: %v", err) + } +} + +func TestValidate_InvalidTableName(t *testing.T) { + // Empty table is not tested here: WithDefaults() fills it with "ion_logs". + // These cases exercise names that are syntactically invalid as SQL identifiers. + cases := []string{"my-table", "my table", "1table", "'; DROP TABLE"} + for _, table := range cases { + t.Run(fmt.Sprintf("%q", table), func(t *testing.T) { + cfg := config.Config{DSN: "clickhouse://localhost:9000/default"}.WithDefaults() + cfg.Table = table + err := cfg.Validate() + if err == nil { + t.Errorf("Validate() with table %q should return error, got nil", table) + } + if !strings.Contains(err.Error(), cherrors.ErrInvalidTable.Error()) { + t.Errorf("error %q missing sentinel %q", err, cherrors.ErrInvalidTable) + } + }) + } +} diff --git a/internal/clickhouse/tests/integration_test.go b/internal/clickhouse/tests/integration_test.go new file mode 100644 index 0000000..aac0e9e --- /dev/null +++ b/internal/clickhouse/tests/integration_test.go @@ -0,0 +1,410 @@ +package tests + +import ( + "context" + "database/sql" + "encoding/json" + "fmt" + "os" + "testing" + "time" + + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + + "github.com/JupiterMetaLabs/ion/internal/clickhouse" + "github.com/JupiterMetaLabs/ion/internal/clickhouse/config" +) + +// integrationDSN returns the test DSN or skips the test. +func integrationDSN(t *testing.T) string { + t.Helper() + dsn := os.Getenv("CLICKHOUSE_TEST_DSN") + if dsn == "" { + t.Skip("CLICKHOUSE_TEST_DSN not set — skipping integration test") + } + return dsn +} + +// openCore creates a Core with AutoSchema=true against a unique test table, +// registers cleanup to drop the table, and returns the core + table name. +func openCore(t *testing.T, dsn string) (*clickhouse.Core, string) { + t.Helper() + table := fmt.Sprintf("ion_integration_%d", time.Now().UnixNano()) + + cfg := config.Config{ + DSN: dsn, + Table: table, + AutoSchema: true, + BatchSize: 100, + FlushInterval: 500 * time.Millisecond, + } + + ctx := context.Background() + core, err := clickhouse.New(ctx, cfg) + if err != nil { + t.Fatalf("New: %v", err) + } + if err := core.Open(ctx); err != nil { + t.Fatalf("Open: %v", err) + } + + t.Cleanup(func() { + shutCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + _ = core.Shutdown(shutCtx) + + db := liveConn(t) + defer db.Close() + _, _ = db.ExecContext(context.Background(), fmt.Sprintf("DROP TABLE IF EXISTS %s", table)) + }) + + return core, table +} + +// queryRows fetches all rows from the given table ordered by timestamp. +func queryRows(t *testing.T, db *sql.DB, table string) *sql.Rows { + t.Helper() + rows, err := db.QueryContext(context.Background(), + fmt.Sprintf("SELECT timestamp, level, service, version, logger, message, trace_id, span_id, request_id, user_id, caller, str_fields, int_fields, flt_fields, bool_fields, extra FROM %s ORDER BY timestamp", table)) + if err != nil { + t.Fatalf("query: %v", err) + } + return rows +} + +// --- Tests --- + +func TestIntegration_SingleRow_AllFixedColumns(t *testing.T) { + dsn := integrationDSN(t) + core, table := openCore(t, dsn) + db := liveConn(t) + defer db.Close() + + ts := time.Now().UTC().Truncate(time.Millisecond) + entry := zapcore.Entry{ + Level: zapcore.InfoLevel, + Time: ts, + Message: "hello integration", + } + fields := []zapcore.Field{ + zap.String("service", "test-svc"), + zap.String("version", "v1.2.3"), + zap.String("trace_id", "trace-abc"), + zap.String("span_id", "span-xyz"), + zap.String("request_id", "req-001"), + zap.String("user_id", "user-42"), + } + if err := core.Write(entry, fields); err != nil { + t.Fatalf("Write: %v", err) + } + core.Sync() //nolint:errcheck + + // Poll briefly for the async flush + var count int + for range 20 { + _ = db.QueryRowContext(context.Background(), + fmt.Sprintf("SELECT count() FROM %s", table)).Scan(&count) + if count > 0 { + break + } + time.Sleep(100 * time.Millisecond) + } + if count == 0 { + t.Fatal("no rows found after flush") + } + + rows := queryRows(t, db, table) + defer rows.Close() + + if !rows.Next() { + t.Fatal("expected at least one row") + } + + var ( + gotTimestamp time.Time + gotLevel, gotService, gotVersion, gotLogger, gotMessage string + gotTraceID, gotSpanID, gotRequestID, gotUserID, gotCaller string + gotStrFields map[string]string + gotIntFields map[string]int64 + gotFltFields map[string]float64 + gotBoolFields map[string]uint8 + gotExtra string + ) + if err := rows.Scan( + &gotTimestamp, &gotLevel, &gotService, &gotVersion, &gotLogger, &gotMessage, + &gotTraceID, &gotSpanID, &gotRequestID, &gotUserID, &gotCaller, + &gotStrFields, &gotIntFields, &gotFltFields, &gotBoolFields, &gotExtra, + ); err != nil { + t.Fatalf("scan: %v", err) + } + + if gotLevel != "info" { + t.Errorf("level: got %q, want %q", gotLevel, "info") + } + if gotMessage != "hello integration" { + t.Errorf("message: got %q, want %q", gotMessage, "hello integration") + } + if gotService != "test-svc" { + t.Errorf("service: got %q, want %q", gotService, "test-svc") + } + if gotVersion != "v1.2.3" { + t.Errorf("version: got %q, want %q", gotVersion, "v1.2.3") + } + if gotTraceID != "trace-abc" { + t.Errorf("trace_id: got %q, want %q", gotTraceID, "trace-abc") + } + if gotSpanID != "span-xyz" { + t.Errorf("span_id: got %q, want %q", gotSpanID, "span-xyz") + } + if gotRequestID != "req-001" { + t.Errorf("request_id: got %q, want %q", gotRequestID, "req-001") + } + if gotUserID != "user-42" { + t.Errorf("user_id: got %q, want %q", gotUserID, "user-42") + } +} + +func TestIntegration_TypedMapColumns(t *testing.T) { + dsn := integrationDSN(t) + core, table := openCore(t, dsn) + db := liveConn(t) + defer db.Close() + + entry := zapcore.Entry{Level: zapcore.WarnLevel, Time: time.Now(), Message: "typed fields"} + fields := []zapcore.Field{ + zap.String("env", "staging"), + zap.Int64("latency_us", 1234), + zap.Float64("score", 0.95), + zap.Bool("ok", true), + } + if err := core.Write(entry, fields); err != nil { + t.Fatalf("Write: %v", err) + } + core.Sync() //nolint:errcheck + + var count int + for range 20 { + _ = db.QueryRowContext(context.Background(), + fmt.Sprintf("SELECT count() FROM %s", table)).Scan(&count) + if count > 0 { + break + } + time.Sleep(100 * time.Millisecond) + } + if count == 0 { + t.Fatal("no rows after flush") + } + + rows := queryRows(t, db, table) + defer rows.Close() + if !rows.Next() { + t.Fatal("expected a row") + } + + var ( + ts, level, svc, ver, logger, msg, tid, sid, rid, uid, caller string + strF map[string]string + intF map[string]int64 + fltF map[string]float64 + boolF map[string]uint8 + extra string + ) + if err := rows.Scan(&ts, &level, &svc, &ver, &logger, &msg, &tid, &sid, &rid, &uid, &caller, + &strF, &intF, &fltF, &boolF, &extra); err != nil { + t.Fatalf("scan: %v", err) + } + + if strF["env"] != "staging" { + t.Errorf("str_fields[env]: got %q, want %q", strF["env"], "staging") + } + if intF["latency_us"] != 1234 { + t.Errorf("int_fields[latency_us]: got %d, want 1234", intF["latency_us"]) + } + if fltF["score"] != 0.95 { + t.Errorf("flt_fields[score]: got %v, want 0.95", fltF["score"]) + } + if boolF["ok"] != 1 { + t.Errorf("bool_fields[ok]: got %d, want 1", boolF["ok"]) + } +} + +func TestIntegration_ErrorFieldGoesToExtra(t *testing.T) { + dsn := integrationDSN(t) + core, table := openCore(t, dsn) + db := liveConn(t) + defer db.Close() + + entry := zapcore.Entry{Level: zapcore.ErrorLevel, Time: time.Now(), Message: "something failed"} + fields := []zapcore.Field{ + zap.NamedError("db_err", fmt.Errorf("connection refused")), + } + if err := core.Write(entry, fields); err != nil { + t.Fatalf("Write: %v", err) + } + core.Sync() //nolint:errcheck + + var count int + for range 20 { + _ = db.QueryRowContext(context.Background(), + fmt.Sprintf("SELECT count() FROM %s", table)).Scan(&count) + if count > 0 { + break + } + time.Sleep(100 * time.Millisecond) + } + if count == 0 { + t.Fatal("no rows after flush") + } + + var extra string + if err := db.QueryRowContext(context.Background(), + fmt.Sprintf("SELECT extra FROM %s LIMIT 1", table)).Scan(&extra); err != nil { + t.Fatalf("query extra: %v", err) + } + if extra == "" { + t.Fatal("extra is empty — error field was not written") + } + var parsed map[string]any + if err := json.Unmarshal([]byte(extra), &parsed); err != nil { + t.Fatalf("extra is not valid JSON: %v — value: %q", err, extra) + } + if _, ok := parsed["db_err"]; !ok { + t.Errorf("extra JSON missing key %q, got: %s", "db_err", extra) + } +} + +func TestIntegration_BatchOf100Rows(t *testing.T) { + dsn := integrationDSN(t) + core, table := openCore(t, dsn) + db := liveConn(t) + defer db.Close() + + const n = 100 + for i := range n { + entry := zapcore.Entry{ + Level: zapcore.InfoLevel, + Time: time.Now(), + Message: fmt.Sprintf("row %d", i), + } + if err := core.Write(entry, []zapcore.Field{zap.Int64("i", int64(i))}); err != nil { + t.Fatalf("Write %d: %v", i, err) + } + } + + shutCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + if err := core.Shutdown(shutCtx); err != nil { + t.Fatalf("Shutdown: %v", err) + } + + var count int + if err := db.QueryRowContext(context.Background(), + fmt.Sprintf("SELECT count() FROM %s", table)).Scan(&count); err != nil { + t.Fatalf("count query: %v", err) + } + if count != n { + t.Errorf("row count: got %d, want %d. DroppedCount=%d", count, n, core.DroppedCount()) + } + if core.DroppedCount() != 0 { + t.Errorf("DroppedCount: got %d, want 0", core.DroppedCount()) + } +} + +func TestIntegration_WithFields_PresetFieldsInEveryRow(t *testing.T) { + dsn := integrationDSN(t) + core, table := openCore(t, dsn) + db := liveConn(t) + defer db.Close() + + // With() returns a child core that stamps every row with preset fields. + child := core.With([]zapcore.Field{zap.String("service", "preset-svc")}) + + for range 3 { + entry := zapcore.Entry{Level: zapcore.InfoLevel, Time: time.Now(), Message: "msg"} + if err := child.Write(entry, nil); err != nil { + t.Fatalf("Write: %v", err) + } + } + + shutCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + if err := core.Shutdown(shutCtx); err != nil { + t.Fatalf("Shutdown: %v", err) + } + + rows, err := db.QueryContext(context.Background(), + fmt.Sprintf("SELECT service FROM %s", table)) + if err != nil { + t.Fatalf("query: %v", err) + } + defer rows.Close() + + n := 0 + for rows.Next() { + var svc string + if err := rows.Scan(&svc); err != nil { + t.Fatalf("scan: %v", err) + } + if svc != "preset-svc" { + t.Errorf("row %d service: got %q, want %q", n, svc, "preset-svc") + } + n++ + } + if n != 3 { + t.Errorf("row count: got %d, want 3", n) + } +} + +func TestIntegration_Shutdown_FlushesRemainingRows(t *testing.T) { + dsn := integrationDSN(t) + + // Tiny FlushInterval so the ticker doesn't fire before Shutdown. + cfg := config.Config{ + DSN: dsn, + Table: fmt.Sprintf("ion_integration_%d", time.Now().UnixNano()), + AutoSchema: true, + BatchSize: 1000, // large — won't flush on batch size + FlushInterval: time.Hour, // huge — ticker won't fire + } + + ctx := context.Background() + core, err := clickhouse.New(ctx, cfg) + if err != nil { + t.Fatalf("New: %v", err) + } + if err := core.Open(ctx); err != nil { + t.Fatalf("Open: %v", err) + } + t.Cleanup(func() { + db := liveConn(t) + defer db.Close() + _, _ = db.ExecContext(context.Background(), + fmt.Sprintf("DROP TABLE IF EXISTS %s", cfg.Table)) + }) + + const n = 10 + for i := range n { + entry := zapcore.Entry{Level: zapcore.InfoLevel, Time: time.Now(), Message: "pre-shutdown"} + _ = core.Write(entry, []zapcore.Field{zap.Int64("i", int64(i))}) + } + + // Shutdown must drain and flush before returning. + shutCtx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + if err := core.Shutdown(shutCtx); err != nil { + t.Fatalf("Shutdown: %v", err) + } + + db := liveConn(t) + defer db.Close() + + var count int + if err := db.QueryRowContext(context.Background(), + fmt.Sprintf("SELECT count() FROM %s", cfg.Table)).Scan(&count); err != nil { + t.Fatalf("count: %v", err) + } + if count != n { + t.Errorf("post-shutdown count: got %d, want %d", count, n) + } +} diff --git a/internal/clickhouse/tests/ping_test.go b/internal/clickhouse/tests/ping_test.go new file mode 100644 index 0000000..fba269b --- /dev/null +++ b/internal/clickhouse/tests/ping_test.go @@ -0,0 +1,38 @@ +package tests + +import ( + "context" + "testing" + "time" + + "github.com/JupiterMetaLabs/ion/internal/clickhouse/config" +) + +func TestPing_MalformedDSN(t *testing.T) { + ctx := context.Background() + err := config.Config{DSN: "://not-valid"}.Ping(ctx) + if err == nil { + t.Fatal("expected error for malformed DSN, got nil") + } +} + +func TestPing_UnreachableHost(t *testing.T) { + // Port 19999 — nothing listening. Expect a connection error, not a hang. + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + err := config.Config{DSN: "clickhouse://localhost:19999/default"}.Ping(ctx) + if err == nil { + t.Fatal("expected error for unreachable host, got nil") + } +} + +func TestPing_ContextCancelled(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() // cancel immediately + + err := config.Config{DSN: "clickhouse://localhost:9000/default"}.Ping(ctx) + if err == nil { + t.Fatal("expected error for cancelled context, got nil") + } +} diff --git a/internal/clickhouse/tests/schema_ddl_test.go b/internal/clickhouse/tests/schema_ddl_test.go new file mode 100644 index 0000000..e200b90 --- /dev/null +++ b/internal/clickhouse/tests/schema_ddl_test.go @@ -0,0 +1,109 @@ +package tests + +import ( + "strings" + "testing" + + "github.com/JupiterMetaLabs/ion/internal/clickhouse" +) + +// --- buildDDL --- + +func TestBuildDDL_ContainsAllColumns(t *testing.T) { + ddl := clickhouse.BuildDDL("ion_logs") + + required := []string{ + "timestamp", "level", "service", "version", + "logger", "message", + "trace_id", "span_id", "request_id", "user_id", + "caller", + "str_fields", "int_fields", "flt_fields", "bool_fields", + "extra", + } + for _, col := range required { + if !strings.Contains(ddl, col) { + t.Errorf("DDL missing column %q", col) + } + } +} + +func TestBuildDDL_UsesProvidedTableName(t *testing.T) { + ddl := clickhouse.BuildDDL("my_custom_table") + + if !strings.Contains(ddl, "my_custom_table") { + t.Error("DDL does not contain the provided table name") + } + if strings.Contains(ddl, "ion_logs") { + t.Error("DDL contains hardcoded table name instead of the provided one") + } +} + +func TestBuildDDL_IfNotExists(t *testing.T) { + ddl := clickhouse.BuildDDL("ion_logs") + if !strings.Contains(ddl, "IF NOT EXISTS") { + t.Error("DDL missing IF NOT EXISTS — not safe to re-run") + } +} + +func TestBuildDDL_LowCardinalityOnRepetitiveColumns(t *testing.T) { + ddl := clickhouse.BuildDDL("ion_logs") + for _, col := range []string{"level", "service", "version"} { + // The column definition must use LowCardinality, not plain String. + if !strings.Contains(ddl, "LowCardinality(String)") { + t.Errorf("DDL: %q should use LowCardinality(String)", col) + } + } +} + +func TestBuildDDL_TypedMapColumns(t *testing.T) { + ddl := clickhouse.BuildDDL("ion_logs") + + types := map[string]string{ + "str_fields": "Map(String, String)", + "int_fields": "Map(String, Int64)", + "flt_fields": "Map(String, Float64)", + "bool_fields": "Map(String, UInt8)", + } + for col, typ := range types { + if !strings.Contains(ddl, typ) { + t.Errorf("DDL: column %q missing type %q", col, typ) + } + } +} + +func TestBuildDDL_MergeTreeEngine(t *testing.T) { + ddl := clickhouse.BuildDDL("ion_logs") + if !strings.Contains(ddl, "MergeTree()") { + t.Error("DDL missing MergeTree() engine") + } +} + +func TestBuildDDL_PartitionByMonth(t *testing.T) { + ddl := clickhouse.BuildDDL("ion_logs") + if !strings.Contains(ddl, "PARTITION BY toYYYYMM(timestamp)") { + t.Error("DDL missing PARTITION BY toYYYYMM(timestamp)") + } +} + +func TestBuildDDL_OrderBy(t *testing.T) { + ddl := clickhouse.BuildDDL("ion_logs") + if !strings.Contains(ddl, "ORDER BY (service, level, timestamp)") { + t.Error("DDL missing ORDER BY (service, level, timestamp)") + } +} + +func TestBuildDDL_TTL(t *testing.T) { + ddl := clickhouse.BuildDDL("ion_logs") + if !strings.Contains(ddl, "TTL") { + t.Error("DDL missing TTL clause") + } +} + +func TestBuildDDL_NanosecondTimestamp(t *testing.T) { + ddl := clickhouse.BuildDDL("ion_logs") + if !strings.Contains(ddl, "DateTime64(9") { + t.Error("DDL: timestamp column must use DateTime64(9,...) for nanosecond precision") + } +} + + diff --git a/internal/clickhouse/tests/schema_test.go b/internal/clickhouse/tests/schema_test.go new file mode 100644 index 0000000..7f0bd4c --- /dev/null +++ b/internal/clickhouse/tests/schema_test.go @@ -0,0 +1,120 @@ +package tests + +import ( + "context" + "database/sql" + "fmt" + "os" + "strings" + "testing" + + "github.com/JupiterMetaLabs/ion/internal/clickhouse" + "github.com/JupiterMetaLabs/ion/internal/clickhouse/config" + + chdriver "github.com/ClickHouse/clickhouse-go/v2" +) + +// liveConn opens a real ClickHouse connection using CLICKHOUSE_TEST_DSN. +// Skips the test if the env var is not set. +func liveConn(t *testing.T) *sql.DB { + t.Helper() + dsn := os.Getenv("CLICKHOUSE_TEST_DSN") + if dsn == "" { + t.Skip("CLICKHOUSE_TEST_DSN not set — skipping integration test") + } + opts, err := chdriver.ParseDSN(dsn) + if err != nil { + t.Fatalf("ParseDSN: %v", err) + } + db := chdriver.OpenDB(opts) + if err := db.Ping(); err != nil { + t.Fatalf("ping: %v", err) + } + return db +} + +func TestEnsureSchema_CreatesTable(t *testing.T) { + dsn := os.Getenv("CLICKHOUSE_TEST_DSN") + if dsn == "" { + t.Skip("CLICKHOUSE_TEST_DSN not set — skipping integration test") + } + + table := "ion_test_schema" + if err := clickhouse.RunEnsureSchema(context.Background(), config.Config{DSN: dsn}, table); err != nil { + t.Fatalf("EnsureSchema: %v", err) + } + + db := liveConn(t) + defer db.Close() + + // Verify table exists and has the expected columns + rows, err := db.QueryContext(context.Background(), + fmt.Sprintf("DESCRIBE TABLE %s", table)) + if err != nil { + t.Fatalf("DESCRIBE TABLE: %v", err) + } + defer rows.Close() + + cols := map[string]string{} + for rows.Next() { + var name, typ, defaultType, defaultExpr, comment, codecExpr, ttlExpr string + if err := rows.Scan(&name, &typ, &defaultType, &defaultExpr, &comment, &codecExpr, &ttlExpr); err != nil { + t.Fatalf("scan: %v", err) + } + cols[name] = typ + } + + required := map[string]string{ + "timestamp": "DateTime64(9, 'UTC')", + "level": "LowCardinality(String)", + "service": "LowCardinality(String)", + "version": "LowCardinality(String)", + "logger": "String", + "message": "String", + "trace_id": "String", + "span_id": "String", + "request_id": "String", + "user_id": "String", + "caller": "String", + "str_fields": "Map(String, String)", + "int_fields": "Map(String, Int64)", + "flt_fields": "Map(String, Float64)", + "bool_fields": "Map(String, UInt8)", + "extra": "String", + } + + for col, wantType := range required { + gotType, ok := cols[col] + if !ok { + t.Errorf("column %q missing from table", col) + continue + } + if !strings.EqualFold(gotType, wantType) { + t.Errorf("column %q: got type %q, want %q", col, gotType, wantType) + } + } + + // Cleanup + _, _ = db.ExecContext(context.Background(), fmt.Sprintf("DROP TABLE IF EXISTS %s", table)) +} + +func TestEnsureSchema_Idempotent(t *testing.T) { + dsn := os.Getenv("CLICKHOUSE_TEST_DSN") + if dsn == "" { + t.Skip("CLICKHOUSE_TEST_DSN not set — skipping integration test") + } + + table := "ion_test_schema_idem" + cfg := config.Config{DSN: dsn} + + // Running twice must not error — CREATE TABLE IF NOT EXISTS guarantees this + for i := range 2 { + if err := clickhouse.RunEnsureSchema(context.Background(), cfg, table); err != nil { + t.Fatalf("EnsureSchema run %d: %v", i+1, err) + } + } + + db := liveConn(t) + defer db.Close() + _, _ = db.ExecContext(context.Background(), fmt.Sprintf("DROP TABLE IF EXISTS %s", table)) +} diff --git a/internal/config/config.go b/internal/config/config.go index 80c6049..9911f88 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -7,6 +7,8 @@ import ( "time" "gopkg.in/natefinch/lumberjack.v2" + + clickhouseconfig "github.com/JupiterMetaLabs/ion/internal/clickhouse/config" ) // Config holds the complete logger configuration. @@ -42,6 +44,9 @@ type Config struct { // Metrics configuration for OpenTelemetry metrics. Metrics MetricsConfig `yaml:"metrics" json:"metrics"` + + // ClickHouse log sink configuration. + ClickHouse clickhouseconfig.Config `yaml:"clickhouse" json:"clickhouse"` } // ConsoleConfig configures console (stdout/stderr) output. @@ -270,6 +275,14 @@ func Default() Config { Temporality: "cumulative", // Prometheus-compatible // Endpoint, Protocol, Auth inherited from OTEL if empty }, + ClickHouse: clickhouseconfig.Config{ + Enabled: false, + DSN: "", + Table: "ion_logs", + BatchSize: 1000, + FlushInterval: 500 * time.Millisecond, + ChannelBuffer: 50000, + }, } } @@ -335,6 +348,13 @@ func (c Config) WithTracing(endpoint string) Config { return c } +func (c Config) WithClickHouse(dsn string) Config { + c.ClickHouse.Enabled = true + c.ClickHouse.DSN = dsn + c.ClickHouse.WithDefaults() + return c +} + // WithMetrics returns a copy of the config with metrics enabled. func (c Config) WithMetrics(endpoint string) Config { c.Metrics.Enabled = true @@ -403,6 +423,11 @@ func (c Config) Validate() error { errs = append(errs, fmt.Sprintf("invalid tracing protocol %q (use: grpc, http)", c.Tracing.Protocol)) } + // Validate ClickHouse config + if c.ClickHouse.Enabled && c.ClickHouse.DSN == "" { + errs = append(errs, "clickhouse enabled but DSN is empty") + } + // Validate metrics config if c.Metrics.Enabled { if c.Metrics.Endpoint == "" && c.OTEL.Endpoint == "" { @@ -430,9 +455,13 @@ func (c Config) Validate() error { if c.OTEL.Level != "" && !validLevels[strings.ToLower(c.OTEL.Level)] { errs = append(errs, fmt.Sprintf("invalid otel level %q", c.OTEL.Level)) } + if c.ClickHouse.Level != "" && !validLevels[strings.ToLower(c.ClickHouse.Level)] { + errs = append(errs, fmt.Sprintf("invalid clickhouse level %q", c.ClickHouse.Level)) + } if len(errs) > 0 { return fmt.Errorf("config validation failed: %s", strings.Join(errs, "; ")) } return nil } + diff --git a/internal/core/logger_factory.go b/internal/core/logger_factory.go index 6406b7b..b9adafd 100644 --- a/internal/core/logger_factory.go +++ b/internal/core/logger_factory.go @@ -45,6 +45,11 @@ func NewZapLogger(cfg config.Config) (*ZapFactoryResult, error) { otelLevel = parseLevel(cfg.OTEL.Level) } + chLevel := globalLevel + if cfg.ClickHouse.Level != "" { + chLevel = parseLevel(cfg.ClickHouse.Level) + } + // Calculate the minimum level across all ENABLED sinks. // This ensures the main atomicLevel (used for SetLevel and early filtering) // allows logs to pass if ANY sink needs them. @@ -61,6 +66,9 @@ func NewZapLogger(cfg config.Config) (*ZapFactoryResult, error) { if cfg.OTEL.Enabled && otelLevel < minLevel { minLevel = otelLevel } + if cfg.ClickHouse.Enabled && chLevel < minLevel { + minLevel = chLevel + } // The atomicLevel acts as the master gatekeeper in logger_impl.go atomicLevel := zap.NewAtomicLevelAt(minLevel) diff --git a/ion.go b/ion.go index d720e9c..45c4db8 100644 --- a/ion.go +++ b/ion.go @@ -4,10 +4,14 @@ import ( "context" "fmt" "log" + "time" "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/metric/noop" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + "github.com/JupiterMetaLabs/ion/internal/clickhouse" "github.com/JupiterMetaLabs/ion/internal/core" ) @@ -57,6 +61,7 @@ type Ion struct { tracingEnabled bool meterProvider *core.MeterProvider metricsEnabled bool + chCore *clickhouse.Core } // Warning represents a non-fatal initialization issue. @@ -106,7 +111,28 @@ func New(cfg Config) (*Ion, []Warning, error) { otelProvider: zapRes.OTELProvider, } - // 2. Setup Tracing (OTEL Traces) + // 2. Setup ClickHouse sink + if cfg.ClickHouse.Enabled { + chCore, chErr := buildClickHouseCore(cfg) + if chErr != nil { + warnings = append(warnings, Warning{ + Component: "clickhouse", + Err: fmt.Errorf("failed to init clickhouse: %w (clickhouse disabled)", chErr), + }) + } else { + ion.chCore = chCore + // Inject the CH core into the zap tee without touching internal/core. + // extractRow already drops the SentinelKey, but we wrap with filteringCore + // for consistency with the other sinks. + ion.zapLogger.zap = ion.zapLogger.zap.WithOptions( + zap.WrapCore(func(existing zapcore.Core) zapcore.Core { + return zapcore.NewTee(existing, core.NewFilteringCore(chCore, core.SentinelKey)) + }), + ) + } + } + + // 3. Setup Tracing (OTEL Traces) if cfg.Tracing.Enabled { // Use Tracing endpoint or fallback to OTEL endpoint if cfg.Tracing.Endpoint == "" { @@ -153,7 +179,7 @@ func New(cfg Config) (*Ion, []Warning, error) { } } - // 3. Setup Metrics (OTEL Metrics) + // 4. Setup Metrics (OTEL Metrics) if cfg.Metrics.Enabled { // Use Metrics endpoint or fallback to OTEL endpoint if cfg.Metrics.Endpoint == "" { @@ -214,6 +240,7 @@ func (i *Ion) Named(name string) Logger { tracingEnabled: i.tracingEnabled, meterProvider: i.meterProvider, metricsEnabled: i.metricsEnabled, + chCore: i.chCore, } } @@ -233,6 +260,7 @@ func (i *Ion) With(fields ...Field) Logger { tracingEnabled: i.tracingEnabled, meterProvider: i.meterProvider, metricsEnabled: i.metricsEnabled, + chCore: i.chCore, } } @@ -264,6 +292,7 @@ func (i *Ion) Child(name string, fields ...Field) *Ion { tracingEnabled: i.tracingEnabled, meterProvider: i.meterProvider, metricsEnabled: i.metricsEnabled, + chCore: i.chCore, } } @@ -303,6 +332,18 @@ func newNoopMeter() metric.Meter { return noop.NewMeterProvider().Meter("noop") } +// DroppedCount returns the total number of log entries dropped by the ClickHouse +// sink since startup. A non-zero value means the async write buffer filled up +// faster than ClickHouse could flush — tune BatchSize, FlushInterval, or +// ChannelBuffer to reduce back-pressure. +// Returns 0 if ClickHouse is not enabled. +func (i *Ion) DroppedCount() int64 { + if i.chCore == nil { + return 0 + } + return i.chCore.DroppedCount() +} + // --- Lifecycle --- // Shutdown gracefully shuts down all observability subsystems in order: @@ -330,6 +371,12 @@ func (i *Ion) Shutdown(ctx context.Context) error { } } + if i.chCore != nil { + if err := i.chCore.Shutdown(ctx); err != nil && firstErr == nil { + firstErr = err + } + } + if i.zapLogger != nil { if err := i.zapLogger.Shutdown(ctx); err != nil && firstErr == nil { firstErr = err @@ -338,3 +385,25 @@ func (i *Ion) Shutdown(ctx context.Context) error { return firstErr } + +// buildClickHouseCore constructs and opens a ClickHouse core from ion Config. +// Returns an error (treated as warning by the caller) if the core cannot be started. +func buildClickHouseCore(cfg Config) (*clickhouse.Core, error) { + // Inherit global level when the ClickHouse-specific level is not set. + chCfg := cfg.ClickHouse + if chCfg.Level == "" { + chCfg.Level = cfg.Level + } + + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + + chCore, err := clickhouse.New(ctx, chCfg) + if err != nil { + return nil, err + } + if err := chCore.Open(ctx); err != nil { + return nil, err + } + return chCore, nil +}