From c09225129194070b214a39caa27772e00cc22183 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Fri, 27 Mar 2026 13:40:30 +0300 Subject: [PATCH 01/29] feat(fracmanager): implement fraction snapshots with wait group reference counting --- asyncsearcher/async_searcher.go | 27 +- asyncsearcher/async_searcher_test.go | 4 +- fracmanager/fracmanager.go | 19 +- fracmanager/fracmanager_for_tests.go | 2 +- fracmanager/fracmanager_test.go | 24 +- fracmanager/fracs_stats.go | 4 + fracmanager/fraction_provider.go | 26 +- fracmanager/fraction_registry.go | 431 ++++++++++---------------- fracmanager/fractions_snapshot.go | 136 ++++++++ fracmanager/lifecycle_manager.go | 99 +++--- fracmanager/lifecycle_manager_test.go | 123 ++++++-- fracmanager/partitioned_collection.go | 117 +++++++ fracmanager/proxy_frac.go | 201 ------------ fracmanager/sync_appender.go | 82 +++++ skipmaskmanager/skip_mask_manager.go | 15 +- storeapi/grpc_fetch.go | 5 +- storeapi/grpc_search.go | 16 +- util/min_heap.go | 117 +++++++ 18 files changed, 846 insertions(+), 602 deletions(-) create mode 100644 fracmanager/fractions_snapshot.go create mode 100644 fracmanager/partitioned_collection.go delete mode 100644 fracmanager/proxy_frac.go create mode 100644 fracmanager/sync_appender.go create mode 100644 util/min_heap.go diff --git a/asyncsearcher/async_searcher.go b/asyncsearcher/async_searcher.go index a0fa6368..72bbaa8a 100644 --- a/asyncsearcher/async_searcher.go +++ b/asyncsearcher/async_searcher.go @@ -74,11 +74,11 @@ type AsyncSearcherConfig struct { } type fractionAcquirer interface { - Fractions() fracmanager.List + AcquireFractions() (_ fracmanager.List, release func()) AcquireFraction(name string) (_ frac.Fraction, release func(), ok bool) } -func MustStartAsync(config AsyncSearcherConfig, mp MappingProvider, fracs fractionAcquirer) *AsyncSearcher { +func MustStartAsync(config AsyncSearcherConfig, mp MappingProvider, fracProvider fractionAcquirer) *AsyncSearcher { if config.DataDir == "" { 
logger.Fatal("can't start async searcher: DataDir is empty") } @@ -107,7 +107,7 @@ func MustStartAsync(config AsyncSearcherConfig, mp MappingProvider, fracs fracti for _, id := range notProcessedIDs { asyncSearchActiveSearches.Add(1) as.processWg.Add(1) - go as.processRequest(id, fracs) + go as.processRequest(id, fracProvider) } // set limit metrics that allow us to calculate alerts' thresholds @@ -209,7 +209,7 @@ func (i *asyncSearchInfo) Status() AsyncSearchStatus { return status } -func (as *AsyncSearcher) StartSearch(r AsyncSearchRequest, fracs fractionAcquirer) error { +func (as *AsyncSearcher) StartSearch(r AsyncSearchRequest, fracProvider fractionAcquirer) error { if as.readOnly.Load() { return fmt.Errorf("cannot start search on read-only mode") } @@ -240,14 +240,17 @@ func (as *AsyncSearcher) StartSearch(r AsyncSearchRequest, fracs fractionAcquire return fmt.Errorf("retention time should be less than %s, got %s", maxRetention, r.Retention) } - fracNames := fracs.Fractions().FilterInRange(r.Params.From, r.Params.To).Names() + fracs, release := fracProvider.AcquireFractions() + defer release() + + fracNames := fracs.FilterInRange(r.Params.From, r.Params.To).Names() if ok := as.saveSearchInfo(r, fracNames); !ok { // Request was saved previously, skip it return nil } asyncSearchActiveSearches.Add(1) as.processWg.Add(1) - go as.processRequest(r.ID, fracs) + go as.processRequest(r.ID, fracProvider) return nil } @@ -301,17 +304,17 @@ func (as *AsyncSearcher) createDataDir() { }) } -func (as *AsyncSearcher) processRequest(asyncSearchID string, fracs fractionAcquirer) { +func (as *AsyncSearcher) processRequest(asyncSearchID string, fracProvider fractionAcquirer) { defer as.processWg.Done() as.rateLimit <- struct{}{} defer func() { <-as.rateLimit }() - as.doSearch(asyncSearchID, fracs) + as.doSearch(asyncSearchID, fracProvider) asyncSearchActiveSearches.Add(-1) } -func (as *AsyncSearcher) doSearch(id string, fracs fractionAcquirer) { +func (as *AsyncSearcher) 
doSearch(id string, fracProvider fractionAcquirer) { qprPaths, err := as.findQPRs(id) if err != nil { panic(fmt.Errorf("can't find QPRs for id %q: %s", id, err)) @@ -353,7 +356,7 @@ func (as *AsyncSearcher) doSearch(id string, fracs fractionAcquirer) { if as.shouldStopSearch(id) { break } - if err := as.acquireAndProcessFrac(fracInfo, info, fracs); err != nil { + if err := as.acquireAndProcessFrac(fracInfo, info, fracProvider); err != nil { as.updateSearchInfo(id, func(info *asyncSearchInfo) { info.Error = err.Error() }) @@ -395,8 +398,8 @@ func compressQPR(qpr *seq.QPR, cb func(compressed []byte) error) error { return nil } -func (as *AsyncSearcher) acquireAndProcessFrac(fracInfo fracSearchState, searchInfo asyncSearchInfo, fracs fractionAcquirer) (err error) { - f, release, ok := fracs.AcquireFraction(fracInfo.Name) +func (as *AsyncSearcher) acquireAndProcessFrac(fracInfo fracSearchState, searchInfo asyncSearchInfo, fracProvider fractionAcquirer) (err error) { + f, release, ok := fracProvider.AcquireFraction(fracInfo.Name) if !ok { // oldest fracs may already be removed logger.Info( "async search: skip missing fraction", diff --git a/asyncsearcher/async_searcher_test.go b/asyncsearcher/async_searcher_test.go index 8ef37bc3..d6311ef5 100644 --- a/asyncsearcher/async_searcher_test.go +++ b/asyncsearcher/async_searcher_test.go @@ -49,8 +49,8 @@ func (fp fakeFractionProvider) AcquireFraction(name string) (frac.Fraction, func return nil, func() {}, false } -func (fp fakeFractionProvider) Fractions() fracmanager.List { - return fracmanager.List(fp) +func (fp fakeFractionProvider) AcquireFractions() (fracmanager.List, func()) { + return fracmanager.List(fp), func() {} } func TestAsyncSearcherMaintain(t *testing.T) { diff --git a/fracmanager/fracmanager.go b/fracmanager/fracmanager.go index 6b5b7c87..77a73c78 100644 --- a/fracmanager/fracmanager.go +++ b/fracmanager/fracmanager.go @@ -76,18 +76,19 @@ func New(ctx context.Context, cfg *Config, s3cli *s3.Client, 
skipMaskProvider sk cancel() wg.Wait() - // freeze active fraction to prevent new writes - active := lc.registry.Active() - if err := active.Finalize(); err != nil { + // finalize appender to prevent new writes + appender := lc.registry.Appender() + if err := appender.Finalize(); err != nil { logger.Fatal("shutdown fraction freezing error", zap.Error(err)) } - active.WaitWriteIdle() + appender.WaitWriteIdle() stopIdx() lc.SyncInfoCache() - sealOnShutdown(active.instance, provider, cfg.MinSealFracSize) + // Seal active fraction + sealOnShutdown(appender.Active, provider, cfg.MinSealFracSize) logger.Info("fracmanager's workers are stopped", zap.Int64("took_ms", time.Since(n).Milliseconds())) } @@ -96,11 +97,11 @@ func New(ctx context.Context, cfg *Config, s3cli *s3.Client, skipMaskProvider sk } func (fm *FracManager) AcquireFraction(name string) (frac.Fraction, func(), bool) { - return fm.lc.registry.AcquireFraction(name) + return fm.lc.registry.AcquireOneFraction(name) } -func (fm *FracManager) Fractions() List { - return fm.lc.registry.AllFractions() +func (fm *FracManager) AcquireFractions() (List, func()) { + return fm.lc.registry.AcquireAllFractions() } func (fm *FracManager) Oldest() uint64 { @@ -120,7 +121,7 @@ func (fm *FracManager) Append(ctx context.Context, docs storage.DocBlock, metas return ctx.Err() default: // Try to append data to the currently active fraction - err := fm.lc.registry.Active().Append(docs, metas) + err := fm.lc.registry.Appender().Append(docs, metas) if err != nil { logger.Info("append fail", zap.Error(err)) if err == ErrFractionNotWritable { diff --git a/fracmanager/fracmanager_for_tests.go b/fracmanager/fracmanager_for_tests.go index ab7cd851..c4ec1cad 100644 --- a/fracmanager/fracmanager_for_tests.go +++ b/fracmanager/fracmanager_for_tests.go @@ -3,7 +3,7 @@ package fracmanager import "sync" func (fm *FracManager) WaitIdleForTests() { - fm.lc.registry.Active().WaitWriteIdle() + fm.lc.registry.Appender().WaitWriteIdle() } func (fm 
*FracManager) SealForcedForTests() { diff --git a/fracmanager/fracmanager_test.go b/fracmanager/fracmanager_test.go index 663dedec..d92e13b8 100644 --- a/fracmanager/fracmanager_test.go +++ b/fracmanager/fracmanager_test.go @@ -61,30 +61,32 @@ func TestSealingOnShutdown(t *testing.T) { cfg.MinSealFracSize = 0 // to ensure that the frac will not be sealed on shutdown cfg, fm, stop := setupFracManager(t, cfg) appendDocsToFracManager(t, fm, 10) - activeName := fm.Fractions()[0].Info().Name() + + activeName := fm.lc.registry.all.fractions[0].Info().Name() + stop() // second start cfg.MinSealFracSize = 1 // to ensure that the frac will be sealed on shutdown cfg, fm, stop = setupFracManager(t, cfg) - assert.Equal(t, 1, len(fm.Fractions()), "should have one fraction") - assert.Equal(t, activeName, fm.Fractions()[0].Info().Name(), "fraction should have the same name") - _, ok := fm.Fractions()[0].(*fractionProxy).impl.(*frac.Active) + allFractions := fm.lc.registry.all.fractions + assert.Equal(t, 1, len(allFractions), "should have one fraction") + assert.Equal(t, activeName, allFractions[0].Info().Name(), "fraction should have the same name") + _, ok := allFractions[0].(*syncAppender) assert.True(t, ok, "fraction should be active") - stop() // third start _, fm, stop = setupFracManager(t, cfg) - assert.Equal(t, 2, len(fm.Fractions()), "should have 2 fraction: new active and old sealed") - _, ok = fm.Fractions()[0].(*fractionProxy).impl.(*frac.Sealed) + allFractions = fm.lc.registry.all.fractions + assert.Equal(t, 2, len(allFractions), "should have 2 fraction: new active and old sealed") + _, ok = allFractions[0].(*refCountedSealed) assert.True(t, ok, "first fraction should be sealed") - assert.Equal(t, activeName, fm.Fractions()[0].Info().Name(), "sealed fraction should have the same name") - assert.Equal(t, uint32(0), fm.Fractions()[1].Info().DocsTotal, "active fraction should be empty") - _, ok = fm.Fractions()[1].(*fractionProxy).impl.(*frac.Active) + assert.Equal(t, 
activeName, allFractions[0].Info().Name(), "sealed fraction should have the same name") + assert.Equal(t, uint32(0), allFractions[1].Info().DocsTotal, "active fraction should be empty") + _, ok = allFractions[1].(*syncAppender) assert.True(t, ok, "new fraction should be active") - stop() } diff --git a/fracmanager/fracs_stats.go b/fracmanager/fracs_stats.go index 968b8b41..c70bbd37 100644 --- a/fracmanager/fracs_stats.go +++ b/fracmanager/fracs_stats.go @@ -95,3 +95,7 @@ func (s *registryStats) SetMetrics() { s.offloading.SetMetrics(dataSizeTotal, "offloading") s.remotes.SetMetrics(dataSizeTotal, "remotes") } + +func (s registryStats) TotalSizeOnDiskLocal() uint64 { + return s.sealing.totalSizeOnDisk + s.sealed.totalSizeOnDisk +} diff --git a/fracmanager/fraction_provider.go b/fracmanager/fraction_provider.go index 66e6477b..e3a4d46b 100644 --- a/fracmanager/fraction_provider.go +++ b/fracmanager/fraction_provider.go @@ -8,14 +8,17 @@ import ( "time" "github.com/oklog/ulid/v2" + "go.uber.org/zap" "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/frac/sealed/sealing" + "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/node" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" + "github.com/ozontech/seq-db/util" ) const fileBasePattern = "seq-db-" @@ -123,8 +126,11 @@ func (fp *fractionProvider) CreateActive() *frac.Active { // Seal converts an active fraction to a sealed one // Process includes sorting, indexing, and data optimization for reading -func (fp *fractionProvider) Seal(active *frac.Active) (*frac.Sealed, error) { - src, err := frac.NewActiveSealingSource(active, fp.config.SealParams) +func (fp *fractionProvider) Seal(a *frac.Active) (*frac.Sealed, error) { + sealsTotal.Inc() + now := time.Now() + + src, err := frac.NewActiveSealingSource(a, fp.config.SealParams) if err != nil { return nil, err } @@ -133,9 +139,19 @@ func (fp 
*fractionProvider) Seal(active *frac.Active) (*frac.Sealed, error) { return nil, err } - sealedFrac := fp.NewSealedPreloaded(active.BaseFileName, preloaded) - fp.skipMaskProvider.RefreshFrac(sealedFrac) - return sealedFrac, nil + s := fp.NewSealedPreloaded(a.BaseFileName, preloaded) + fp.skipMaskProvider.RefreshFrac(s) + + sealingTime := time.Since(now) + sealsDoneSeconds.Observe(sealingTime.Seconds()) + + logger.Info( + "fraction sealed", + zap.String("fraction", filepath.Base(s.BaseFileName)), + zap.Float64("time_spent_s", util.DurationToUnit(sealingTime, "s")), + ) + + return s, nil } // Offload uploads fraction to S3 storage and returns a remote fraction diff --git a/fracmanager/fraction_registry.go b/fracmanager/fraction_registry.go index 0f79c28d..b0667c04 100644 --- a/fracmanager/fraction_registry.go +++ b/fracmanager/fraction_registry.go @@ -15,94 +15,94 @@ import ( // fractionRegistry manages fraction queues at different lifecycle stages. // Tracks fractions through different stages: active → sealing → sealed → offloading → remote -// Ensures correct state transitions while maintaining chronological order. // The entire structure is thread-safe due to internal synchronization. -// Lifecycle: created once, persists through application lifetime. 
type fractionRegistry struct { mu sync.RWMutex // main mutex for protecting registry state - // lifecycle queues (FIFO order, oldest at lower indexes) - sealing []*activeProxy // fractions being sealed (0-5 typical) - sealed []*sealedProxy // local sealed fractions (can be thousands) - offloading []*sealedProxy // fractions being offloaded (0-5 typical) - remotes []*remoteProxy // offloaded fractions (can be thousands) + sealing map[string]*syncAppender // fractions being sealed (0-5 typical) + sealed PartitionedCollection[*refCountedSealed] // local sealed fractions (can be thousands) + offloading PartitionedCollection[*refCountedSealed] // fractions being offloaded (0-5 typical) + remotes PartitionedCollection[*refCountedRemote] // offloaded fractions (can be thousands) - stats registryStats // size statistics for monitoring - oldestTotal uint64 // creation time of oldest fraction in all list including remote - oldestLocal uint64 // creation time of oldest fraction in local or offloading queues + stats registryStats // size statistics for monitoring - muAll sync.RWMutex // protects active, all, and oldestTotal fields - active *activeProxy // currently active writable fraction - all []frac.Fraction // all fractions in creation order (read-only view) - allMap map[string]frac.Fraction + muAppender sync.RWMutex + appender *syncAppender // currently active writable fraction + + muAll sync.RWMutex + all fractionsSnapshot // all fractions } // NewFractionRegistry creates and initializes a new fraction registry instance. // Populates the registry with existing active, sealed and remote fractions. -// Rebuilds the complete fractions list in chronological order. 
func NewFractionRegistry(active *frac.Active, sealed []*frac.Sealed, remotes []*frac.Remote) (*fractionRegistry, error) { if active == nil { return nil, errors.New("active fraction must be specified") } - r := fractionRegistry{ - active: &activeProxy{ - proxy: &fractionProxy{impl: active}, - instance: active, - }, + creationTime := func(f frac.Fraction) uint64 { return f.Info().CreationTime } + + lastDocTime := func(f frac.Fraction) uint64 { + aligned := f.Info().To.Time(). + Add(-time.Nanosecond). + Truncate(time.Minute). + Add(time.Minute) + return uint64(aligned.UnixMilli()) + } + + reg := fractionRegistry{ + appender: &syncAppender{refCountedActive: refCountedActive{Active: active}}, + + sealing: map[string]*syncAppender{}, + sealed: NewPartitionedCollection(func(rcs *refCountedSealed) uint64 { return creationTime(rcs) }), + offloading: NewPartitionedCollection(func(rcs *refCountedSealed) uint64 { return lastDocTime(rcs) }), + remotes: NewPartitionedCollection(func(rcr *refCountedRemote) uint64 { return lastDocTime(rcr) }), } // initialize local sealed fractions - for _, sealed := range sealed { - r.stats.sealed.Add(sealed.Info()) - r.sealed = append(r.sealed, &sealedProxy{ - proxy: &fractionProxy{impl: sealed}, - instance: sealed, - }) + for _, s := range sealed { + reg.stats.sealed.Add(s.Info()) + reg.sealed.Add(s.Info().Name(), &refCountedSealed{Sealed: s}) } // initialize remote fractions - for _, remote := range remotes { - r.stats.remotes.Add(remote.Info()) - r.remotes = append(r.remotes, &remoteProxy{ - proxy: &fractionProxy{impl: remote}, - instance: remote, - }) + for _, r := range remotes { + reg.stats.remotes.Add(r.Info()) + reg.remotes.Add(r.Info().Name(), &refCountedRemote{Remote: r}) } - r.updateOldestLocal() - r.rebuildAllFractions() + reg.rebuildSnapshot() - return &r, nil + return ®, nil } -// Active returns the currently active writable fraction. 
-func (r *fractionRegistry) Active() *activeProxy { - r.muAll.RLock() - defer r.muAll.RUnlock() - return r.active +// Appender returns the currently active writable fraction. +func (r *fractionRegistry) Appender() *syncAppender { + r.muAppender.RLock() + defer r.muAppender.RUnlock() + return r.appender } -func (r *fractionRegistry) AcquireFraction(name string) (frac.Fraction, func(), bool) { +func (r *fractionRegistry) AcquireOneFraction(name string) (frac.Fraction, func(), bool) { r.muAll.RLock() defer r.muAll.RUnlock() - f, ok := r.allMap[name] - return f, func() {}, ok + return r.all.AcquireOne(name) } -// AllFractions returns a read-only view of all fractions in creation order. -func (r *fractionRegistry) AllFractions() []frac.Fraction { +// AcquireAllFractions returns a read-only view of all fractions +func (r *fractionRegistry) AcquireAllFractions() ([]frac.Fraction, func()) { r.muAll.RLock() defer r.muAll.RUnlock() - return r.all + + return r.all.AcquireAll() } // Stats returns current size statistics of the registry. func (r *fractionRegistry) Stats() registryStats { r.mu.RLock() s := r.stats - i := r.active.instance.Info() + i := r.appender.Info() r.mu.RUnlock() s.active.Set(i) @@ -113,40 +113,57 @@ func (r *fractionRegistry) Stats() registryStats { func (r *fractionRegistry) OldestTotal() uint64 { r.muAll.RLock() defer r.muAll.RUnlock() - return r.oldestTotal + return r.all.oldestTotal } // OldestLocal returns the creation time of the oldest local fraction in the registry. 
func (r *fractionRegistry) OldestLocal() uint64 { - r.mu.RLock() - defer r.mu.RUnlock() - return r.oldestLocal + r.muAll.RLock() + defer r.muAll.RUnlock() + return r.all.oldestLocal +} + +type activeProvider interface { + CreateActive() *frac.Active +} + +func (r *fractionRegistry) setAppender(appender *syncAppender) { + r.muAppender.Lock() + defer r.muAppender.Unlock() + + r.appender = appender + + r.muAll.Lock() + defer r.muAll.Unlock() + + r.all.AddActive(appender) } // RotateIfFull completes the current active fraction and starts a new one. // Moves previous active fraction to sealing queue. -// Updates statistics and maintains chronological order. -// Should be called when creating a new fraction. -func (r *fractionRegistry) RotateIfFull(maxSize uint64, newActive func() *activeProxy) (*activeProxy, func(), error) { +// Should be called when the current active fraction reaches size limit and needs to be rotated +func (r *fractionRegistry) RotateIfFull(maxSize uint64, ap activeProvider) (*refCountedActive, func(), error) { r.mu.Lock() defer r.mu.Unlock() - if r.active.instance.Info().DocsOnDisk <= maxSize { + if r.appender.Info().DocsOnDisk <= maxSize { return nil, nil, nil } - old := r.active - r.sealing = append(r.sealing, old) - r.addActive(newActive()) + old := r.appender + + r.sealing[old.Info().Name()] = old + + r.setAppender(&syncAppender{refCountedActive: refCountedActive{Active: ap.CreateActive()}}) if err := old.Finalize(); err != nil { - return old, nil, err + return nil, nil, err } - curInfo := old.instance.Info() + curInfo := old.Info() r.stats.sealing.Add(curInfo) - r.active.Suspend(old.Suspended()) + r.appender.Suspend(old.Suspended()) wg := sync.WaitGroup{} wg.Add(1) @@ -156,7 +173,7 @@ func (r *fractionRegistry) RotateIfFull(maxSize uint64, newActive func() *active defer wg.Done() old.WaitWriteIdle() // can be long enough - finalInfo := old.instance.Info() + finalInfo := old.Info() r.mu.Lock() defer r.mu.Unlock() @@ -167,14 +184,14 @@ func (r 
*fractionRegistry) RotateIfFull(maxSize uint64, newActive func() *active r.stats.sealing.Add(finalInfo) }() - return old, wg.Wait, nil + return &old.refCountedActive, wg.Wait, nil } func (r *fractionRegistry) SuspendIfOverCapacity(maxQueue, maxSize uint64) { r.mu.Lock() defer r.mu.Unlock() - suspended := r.active.Suspended() + suspended := r.appender.Suspended() if maxQueue > 0 && r.stats.sealing.count >= int(maxQueue) { if !suspended { @@ -182,7 +199,7 @@ func (r *fractionRegistry) SuspendIfOverCapacity(maxQueue, maxSize uint64) { zap.String("reason", "sealing queue size exceeded"), zap.Uint64("limit", maxQueue), zap.Int("queue_size", r.stats.sealing.count)) - r.active.Suspend(true) + r.appender.Suspend(true) } return } @@ -195,7 +212,7 @@ func (r *fractionRegistry) SuspendIfOverCapacity(maxQueue, maxSize uint64) { zap.String("reason", "occupied space limit exceeded"), zap.Float64("queue_size_limit_gb", util.Float64ToPrec(util.SizeToUnit(maxSize, "gb"), 2)), zap.Float64("occupied_space_gb", util.Float64ToPrec(util.SizeToUnit(du, "gb"), 2))) - r.active.Suspend(true) + r.appender.Suspend(true) } return } @@ -206,66 +223,67 @@ func (r *fractionRegistry) SuspendIfOverCapacity(maxQueue, maxSize uint64) { zap.Float64("occupied_space_gb", util.Float64ToPrec(util.SizeToUnit(du, "gb"), 2)), zap.Uint64("sealing_queue_size_limit", maxQueue), zap.Int("queue_size", r.stats.sealing.count)) - r.active.Suspend(false) + r.appender.Suspend(false) } } func (r *fractionRegistry) diskUsage() uint64 { - return r.active.instance.Info().FullSize() + + return r.appender.Info().FullSize() + r.stats.sealed.totalSizeOnDisk + r.stats.sealing.totalSizeOnDisk + r.stats.offloading.totalSizeOnDisk } -// addActive sets a new active fraction and updates the complete fractions list. -func (r *fractionRegistry) addActive(a *activeProxy) { - r.muAll.Lock() - defer r.muAll.Unlock() +// EvictLocalForDelete removes oldest local fractions to free disk space. 
+// Returns evicted fractions or error if insufficient space is released. +func (r *fractionRegistry) EvictLocalForDelete(sizeLimit uint64) (evicted []*refCountedSealed, err error) { + r.mu.Lock() + defer r.mu.Unlock() - r.active = a - r.all = append(r.all, a.proxy) - r.allMap[a.instance.Info().Name()] = a.proxy -} + if evicted, err = r.evictLocal(sizeLimit); err != nil { + return nil, err + } -// trimAll removes the oldest fractions from the complete fractions list. -// Used when fractions are evicted or deleted from the system. -func (r *fractionRegistry) trimAll(count int) { - r.muAll.Lock() - defer r.muAll.Unlock() + r.rebuildSnapshot() - for _, f := range r.all[:count] { - delete(r.allMap, f.Info().Name()) - } - r.all = r.all[count:] - r.updateOldestTotal() + return evicted, nil } -// EvictLocal removes oldest local fractions to free disk space. -// If shouldOffload is true, moves fractions to offloading queue instead of deleting. +// EvictLocalForOffload removes oldest local fractions and moves them to the offloading queue. // Returns evicted fractions or error if insufficient space is released. 
-func (r *fractionRegistry) EvictLocal(shouldOffload bool, sizeLimit uint64) ([]*sealedProxy, error) { +func (r *fractionRegistry) EvictLocalForOffload(sizeLimit uint64) ([]*refCountedSealed, error) { r.mu.Lock() defer r.mu.Unlock() - var ( - count int - releasingSize uint64 - ) + evicted, err := r.evictLocal(sizeLimit) + if err != nil { + return nil, err + } + + for _, sealed := range evicted { + r.offloading.Add(sealed.Info().Name(), sealed) + r.stats.offloading.Add(sealed.Info()) + } + + return evicted, nil +} + +func (r *fractionRegistry) evictLocal(sizeLimit uint64) ([]*refCountedSealed, error) { + var releasingSize uint64 // calculate total used disk space - totalUsedSize := r.stats.sealed.totalSizeOnDisk + - r.stats.sealing.totalSizeOnDisk + - r.active.instance.Info().FullSize() + totalUsedSize := r.stats.TotalSizeOnDiskLocal() + r.appender.Info().FullSize() + + evicted := []*refCountedSealed{} - // determine how many oldest fractions need to be removed to meet size limit - for _, item := range r.sealed { - if totalUsedSize-releasingSize <= sizeLimit { - break + for r.sealed.Len() > 0 && totalUsedSize-releasingSize > sizeLimit { + for _, s := range r.sealed.GetByPartition(r.sealed.MinPartition()) { + info := s.Info() + releasingSize += info.FullSize() + r.stats.sealed.Sub(info) + r.sealed.Del(info.Name()) + evicted = append(evicted, s) } - info := item.instance.Info() - releasingSize += info.FullSize() - r.stats.sealed.Sub(info) - count++ } // check if enough space will be freed @@ -275,28 +293,13 @@ func (r *fractionRegistry) EvictLocal(shouldOffload bool, sizeLimit uint64) ([]* (totalUsedSize-releasingSize)-sizeLimit, totalUsedSize, releasingSize, sizeLimit) } - // extract fractions to evict - evicted := r.sealed[:count] - r.sealed = r.sealed[count:] - - // either offload or completely remove the fractions - if shouldOffload { - for _, item := range evicted { - r.offloading = append(r.offloading, item) - r.stats.offloading.Add(item.instance.Info()) - } - } 
else { - r.trimAll(count) // permanently remove - r.updateOldestLocal() // oldest local can be changed here - } - return evicted, nil } // EvictRemote removes oldest remote fractions based on retention policy. // Fractions older than retention period are permanently deleted. // Returns removed fractions or empty slice if nothing to remove. -func (r *fractionRegistry) EvictRemote(retention time.Duration) []*remoteProxy { +func (r *fractionRegistry) EvictRemote(retention time.Duration) []*refCountedRemote { if retention == 0 { return nil } @@ -304,28 +307,24 @@ func (r *fractionRegistry) EvictRemote(retention time.Duration) []*remoteProxy { r.mu.Lock() defer r.mu.Unlock() - count := 0 - // find fractions older than retention period - for _, item := range r.remotes { - info := item.instance.Info() - if time.Since(time.UnixMilli(int64(info.CreationTime))) <= retention { - break // stop at first fraction within retention + evicted := []*refCountedRemote{} + for r.remotes.Len() > 0 && time.Since(time.UnixMilli(int64(r.remotes.MinPartition()))) > retention { + for _, remote := range r.remotes.GetByPartition(r.remotes.MinPartition()) { + info := remote.Info() + r.stats.remotes.Sub(info) + evicted = append(evicted, remote) + r.remotes.Del(info.Name()) } - r.stats.remotes.Sub(info) - count++ } - evicted := r.remotes[:count] - r.remotes = r.remotes[count:] - r.trimAll(count) // remove from complete list + r.rebuildSnapshot() return evicted } // EvictOverflowed removes oldest fractions from offloading queue when it exceeds size limit. -// Selects fractions that haven't finished offloading yet to minimize data loss. // Used when offloading queue grows too large due to slow remote storage performance. 
-func (r *fractionRegistry) EvictOverflowed(sizeLimit uint64) []*sealedProxy { +func (r *fractionRegistry) EvictOverflowed(sizeLimit uint64) (evicted []*refCountedSealed) { if sizeLimit == 0 { return nil } @@ -338,168 +337,80 @@ func (r *fractionRegistry) EvictOverflowed(sizeLimit uint64) []*sealedProxy { return nil } - count := 0 - evicted := []*sealedProxy{} +loop: // filter fractions - for _, item := range r.offloading { - // keep items that are within limits or already offloaded - if r.stats.offloading.totalSizeOnDisk <= sizeLimit || item.remote != nil { - r.offloading[count] = item - count++ - continue + for r.offloading.Len() > 0 { + for _, s := range r.offloading.GetByPartition(r.offloading.MinPartition()) { + evicted = append(evicted, s) + r.stats.offloading.Sub(s.Info()) + r.offloading.Del(s.Info().Name()) + if r.stats.offloading.totalSizeOnDisk <= sizeLimit { + break loop + } } - evicted = append(evicted, item) - r.stats.offloading.Sub(item.instance.Info()) } - r.offloading = r.offloading[:count] - r.rebuildAllFractions() + r.rebuildSnapshot() return evicted } // PromoteToSealed moves fractions from sealing to local queue when sealing completes. -// Maintains strict ordering - younger fractions wait for older ones to seal first. 
-func (r *fractionRegistry) PromoteToSealed(active *activeProxy, sealed *frac.Sealed) { +func (r *fractionRegistry) PromoteToSealed(active *refCountedActive, sealed *frac.Sealed) { r.mu.Lock() defer r.mu.Unlock() - active.sealed = sealed + r.sealed.Add(sealed.Info().Name(), &refCountedSealed{Sealed: sealed}) + r.stats.sealed.Add(sealed.Info()) + r.stats.sealing.Sub(active.Info()) - promotedCount := 0 - // process sealing queue in order, promoting completed fractions - for _, item := range r.sealing { - if item.sealed == nil { - break // maintain order - wait for previous fractions to complete - } - promotedCount++ - r.sealed = append(r.sealed, &sealedProxy{ - proxy: item.proxy, - instance: item.sealed, - }) - r.stats.sealed.Add(item.sealed.Info()) - r.stats.sealing.Sub(item.instance.Info()) - } + delete(r.sealing, active.Info().Name()) - // remove promoted fractions from sealing queue - r.sealing = r.sealing[promotedCount:] + r.rebuildSnapshot() } // PromoteToRemote moves fractions from offloading to remote queue when offloading completes. // Special case: handles fractions that don't require offloading (remote == nil). -// Maintains strict ordering - younger fractions wait for older ones to offload. 
-func (r *fractionRegistry) PromoteToRemote(sealed *sealedProxy, remote *frac.Remote) { +func (r *fractionRegistry) PromoteToRemote(sealed *refCountedSealed, remote *frac.Remote) { r.mu.Lock() defer r.mu.Unlock() - sealed.remote = remote - - // special case: remote == nil means fraction doesn't require offloading - if remote == nil { - r.removeFromOffloading(sealed) + if remote != nil { + r.remotes.Add(remote.Info().Name(), &refCountedRemote{Remote: remote}) + r.stats.remotes.Add(remote.Info()) } - promotedCount := 0 - // process offloading queue in order, promoting completed fractions - for _, item := range r.offloading { - if item.remote == nil { - break // maintain order - wait for previous fractions to complete - } - promotedCount++ - r.remotes = append(r.remotes, &remoteProxy{ - proxy: item.proxy, - instance: item.remote, - }) - - r.stats.remotes.Add(item.remote.Info()) - r.stats.offloading.Sub(item.instance.Info()) - } - if promotedCount > 0 { - // remove promoted fractions from offloading queue - r.offloading = r.offloading[promotedCount:] - r.updateOldestLocal() - } + r.stats.offloading.Sub(sealed.Info()) + r.offloading.Del(sealed.Info().Name()) + r.rebuildSnapshot() } -// removeFromOffloading removes a specific fraction from offloading queue. -// O(n) operation that rebuilds the all fractions list. 
-func (r *fractionRegistry) removeFromOffloading(sealed *sealedProxy) { - count := 0 - // filter out the target fraction - for _, item := range r.offloading { - if sealed != item { - r.offloading[count] = item - count++ - } - } - - if count == len(r.offloading) { // not found to remove (can be removed earlier in EvictOverflowed) - return - } - - r.offloading = r.offloading[:count] - r.stats.offloading.Sub(sealed.instance.Info()) - - // oldest local can be changed here - r.updateOldestLocal() - - // rebuild complete list since we modified the middle of the queue - r.rebuildAllFractions() -} +// rebuildSnapshot reconstructs the all fractions list +func (r *fractionRegistry) rebuildSnapshot() { + capacity := r.remotes.Len() + r.offloading.Len() + r.sealed.Len() + len(r.sealing) + 1 -// rebuildAllFractions reconstructs the all fractions list in correct chronological order. -// Order: remote (oldest) → offloading → sealed → sealing → active (newest) -// Expensive O(n) operation used when direct list modification is insufficient. 
-func (r *fractionRegistry) rebuildAllFractions() { - all := make([]frac.Fraction, 0, len(r.all)) - allMap := make(map[string]frac.Fraction, len(r.all)) + // allocate extra capacity to accommodate appender rotation that may occur during snapshot lifetime + all := newFractionsSnapshot(capacity + 1) - add := func(f frac.Fraction) { - all = append(all, f) - allMap[f.Info().Name()] = f + for r := range r.remotes.All() { + all.AddRemote(r) } - // collect fractions in correct chronological order: from oldest (remote) to newest (active) - for _, remote := range r.remotes { - add(remote.proxy) + for o := range r.offloading.All() { + all.AddSealed(o) } - for _, offloaded := range r.offloading { - add(offloaded.proxy) - } - for _, sealed := range r.sealed { - add(sealed.proxy) + + for s := range r.sealed.All() { + all.AddSealed(s) } - for _, active := range r.sealing { - add(active.proxy) + + for _, a := range r.sealing { + all.AddActive(a) } - add(r.active.proxy) + all.AddActive(r.appender) r.muAll.Lock() defer r.muAll.Unlock() - r.all = all - r.allMap = allMap - r.updateOldestTotal() -} - -// updateOldestTotal recalculates the creation time of the oldest fraction. -// Called after modifications of the complete fractions list. -func (r *fractionRegistry) updateOldestTotal() { - r.oldestTotal = r.all[0].Info().CreationTime -} - -// updateOldestLocal recalculates the creation time of the oldest local fraction. -// Priority order: offloading queue → sealed queue → sealing queue → active fraction. 
-// Called after modifications -func (r *fractionRegistry) updateOldestLocal() { - if len(r.offloading) > 0 { - r.oldestLocal = r.offloading[0].proxy.Info().CreationTime - } else if len(r.sealed) > 0 { - r.oldestLocal = r.sealed[0].proxy.Info().CreationTime - } else if len(r.sealing) > 0 { - r.oldestLocal = r.sealing[0].proxy.Info().CreationTime - } else { - r.oldestLocal = r.active.proxy.Info().CreationTime - } } diff --git a/fracmanager/fractions_snapshot.go b/fracmanager/fractions_snapshot.go new file mode 100644 index 00000000..9d561f63 --- /dev/null +++ b/fracmanager/fractions_snapshot.go @@ -0,0 +1,136 @@ +package fracmanager + +import ( + "math" + "sync" + + "github.com/ozontech/seq-db/frac" +) + +// RefCounter provides reference counting capability. +type RefCounter interface { + Inc() + Dec() +} + +// fractionsSnapshot represents a point-in-time view of multiple fractions +// with associated reference counters to keep them alive. +type fractionsSnapshot struct { + counters []RefCounter // Reference counters to keep fractions alive + fractions []frac.Fraction // The actual fractions in chronological order + names map[string]int + oldestLocal uint64 + oldestTotal uint64 +} + +func newFractionsSnapshot(capacity int) fractionsSnapshot { + return fractionsSnapshot{ + counters: make([]RefCounter, 0, capacity), + fractions: make([]frac.Fraction, 0, capacity), + names: make(map[string]int, capacity), + oldestLocal: math.MaxUint64, + oldestTotal: math.MaxUint64, + } +} + +func (fs *fractionsSnapshot) AddActive(a *syncAppender) { + fs.names[a.Info().Name()] = len(fs.fractions) + + fs.counters = append(fs.counters, a) + fs.fractions = append(fs.fractions, a) + + fs.oldestLocal = min(fs.oldestLocal, a.Info().CreationTime) + fs.oldestTotal = min(fs.oldestTotal, fs.oldestLocal) +} + +func (fs *fractionsSnapshot) AddSealed(s *refCountedSealed) { + fs.names[s.Info().Name()] = len(fs.fractions) + + fs.counters = append(fs.counters, s) + fs.fractions = append(fs.fractions, 
s) + + fs.oldestLocal = min(fs.oldestLocal, s.Info().CreationTime) + fs.oldestTotal = min(fs.oldestTotal, fs.oldestLocal) +} + +func (fs *fractionsSnapshot) AddRemote(r *refCountedRemote) { + fs.names[r.Info().Name()] = len(fs.fractions) + + fs.counters = append(fs.counters, r) + fs.fractions = append(fs.fractions, r) + + fs.oldestTotal = min(fs.oldestTotal, r.Info().CreationTime) +} + +// AcquireAll returns the fractions and a release function. +// Caller must call the release function when done to decrement reference counts. +func (fs *fractionsSnapshot) AcquireAll() ([]frac.Fraction, func()) { + for _, c := range fs.counters { + c.Inc() + } + + counters := fs.counters // make copy of counters + return fs.fractions, func() { + for _, c := range counters { + c.Dec() + } + } +} + +func (fs *fractionsSnapshot) AcquireOne(name string) (frac.Fraction, func(), bool) { + i, ok := fs.names[name] + if !ok { + return nil, func() {}, false + } + + c := fs.counters[i] + f := fs.fractions[i] + + c.Inc() + return f, c.Dec, true +} + +type refCounterWg struct { + wg sync.WaitGroup +} + +func (p *refCounterWg) Inc() { p.wg.Add(1) } + +func (p *refCounterWg) Dec() { p.wg.Done() } + +// refCountedActive wraps frac.Active with reference counting. +// Destroy releases the underlying Active after all references are gone. +type refCountedActive struct { + refCounterWg + *frac.Active +} + +// Destroy waits for all references to be released and then releases the Active. +func (p *refCountedActive) Destroy() { + p.wg.Wait() + p.Release() +} + +// refCountedSealed wraps frac.Sealed with reference counting. +type refCountedSealed struct { + refCounterWg + *frac.Sealed +} + +// Destroy waits for all references to be released and then destroys the Sealed. +func (p *refCountedSealed) Destroy() { + p.wg.Wait() + p.Suicide() +} + +// refCountedRemote wraps frac.Remote with reference counting. 
+type refCountedRemote struct { + refCounterWg + *frac.Remote +} + +// Destroy waits for all references to be released and then destroys the Remote. +func (p *refCountedRemote) Destroy() { + p.wg.Wait() + p.Suicide() +} diff --git a/fracmanager/lifecycle_manager.go b/fracmanager/lifecycle_manager.go index cd1c4bd3..24025c23 100644 --- a/fracmanager/lifecycle_manager.go +++ b/fracmanager/lifecycle_manager.go @@ -2,7 +2,6 @@ package fracmanager import ( "context" - "path/filepath" "sync" "time" @@ -10,7 +9,6 @@ import ( "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/logger" - "github.com/ozontech/seq-db/util" ) // lifecycleManager manages the complete lifecycle of fractions. @@ -23,7 +21,7 @@ type lifecycleManager struct { registry *fractionRegistry // fraction state registry tasks *TaskManager // Background offloading tasks - sealingWg sync.WaitGroup + sealingWg sync.WaitGroup // todo: get rid after removing SealAll in tests } func newLifecycleManager( @@ -67,41 +65,14 @@ func (lc *lifecycleManager) SyncInfoCache() { } } -// seal converts an active fraction to sealed state. -// It freezes writes, waits for pending operations, then seals the fraction. -func (lc *lifecycleManager) seal(active *activeProxy) error { - sealsTotal.Inc() - now := time.Now() - sealed, err := lc.provider.Seal(active.instance) - if err != nil { - return err - } - sealingTime := time.Since(now) - sealsDoneSeconds.Observe(sealingTime.Seconds()) - - logger.Info( - "fraction sealed", - zap.String("fraction", filepath.Base(sealed.BaseFileName)), - zap.Float64("time_spent_s", util.DurationToUnit(sealingTime, "s")), - ) - - lc.infoCache.Add(sealed.Info()) - lc.registry.PromoteToSealed(active, sealed) - active.proxy.Redirect(sealed) - active.instance.Release() - return nil -} - // rotate checks if active fraction needs rotation based on size limit. // Creates new active fraction and starts sealing the previous one. 
func (lc *lifecycleManager) rotate(maxSize uint64, wg *sync.WaitGroup) { - activeToSeal, waitBeforeSealing, err := lc.registry.RotateIfFull(maxSize, func() *activeProxy { - return newActiveProxy(lc.provider.CreateActive()) - }) + active, waitBeforeSealing, err := lc.registry.RotateIfFull(maxSize, lc.provider) if err != nil { logger.Fatal("active fraction rotation error", zap.Error(err)) } - if activeToSeal == nil { + if active == nil { return } @@ -112,37 +83,39 @@ func (lc *lifecycleManager) rotate(maxSize uint64, wg *sync.WaitGroup) { defer lc.sealingWg.Done() waitBeforeSealing() - if err := lc.seal(activeToSeal); err != nil { + sealed, err := lc.provider.Seal(active.Active) + if err != nil { logger.Fatal("sealing error", zap.Error(err)) } + + lc.infoCache.Add(sealed.Info()) + lc.registry.PromoteToSealed(active, sealed) + active.Destroy() }() } // offloadLocal starts offloading of local fractions to remote storage. // Selects fractions based on disk space usage and retention policy. func (lc *lifecycleManager) offloadLocal(ctx context.Context, sizeLimit uint64, retryDelay time.Duration, wg *sync.WaitGroup) { - toOffload, err := lc.registry.EvictLocal(true, sizeLimit) + toOffload, err := lc.registry.EvictLocalForOffload(sizeLimit) if err != nil { logger.Fatal("error releasing old fractions:", zap.Error(err)) } - for _, sealed := range toOffload { + for _, frac := range toOffload { wg.Add(1) - _, err := lc.tasks.Run(sealed.instance.BaseFileName, ctx, func(ctx context.Context) { + _, err := lc.tasks.Run(frac.BaseFileName, ctx, func(ctx context.Context) { defer wg.Done() - remote := lc.offloadWithRetry(ctx, sealed.instance, retryDelay) + remote := lc.offloadWithRetry(ctx, frac.Sealed, retryDelay) - lc.registry.PromoteToRemote(sealed, remote) + lc.registry.PromoteToRemote(frac, remote) if remote == nil { - sealed.proxy.Redirect(emptyFraction{}) - lc.infoCache.Remove(sealed.instance.Info().Name()) - } else { - sealed.proxy.Redirect(remote) + 
lc.infoCache.Remove(frac.Info().Name()) } // free up local resources - sealed.instance.Suicide() + frac.Destroy() maintenanceTruncateTotal.Add(1) }) if err != nil { @@ -209,20 +182,19 @@ func (lc *lifecycleManager) tryOffload(ctx context.Context, sealed *frac.Sealed) // cleanRemote deletes outdated remote fractions based on retention policy. func (lc *lifecycleManager) cleanRemote(retention time.Duration, wg *sync.WaitGroup) { toDelete := lc.registry.EvictRemote(retention) - wg.Add(1) - go func() { - defer wg.Done() - for _, remote := range toDelete { - remote.proxy.Redirect(emptyFraction{}) - lc.infoCache.Remove(remote.instance.Info().Name()) - remote.instance.Suicide() - } - }() + wg.Add(len(toDelete)) + for _, remote := range toDelete { + go func() { + defer wg.Done() + lc.infoCache.Remove(remote.Info().Name()) + remote.Destroy() + }() + } } // cleanLocal deletes outdated local fractions when offloading is disabled. func (lc *lifecycleManager) cleanLocal(sizeLimit uint64, wg *sync.WaitGroup) { - toDelete, err := lc.registry.EvictLocal(false, sizeLimit) + toDelete, err := lc.registry.EvictLocalForDelete(sizeLimit) if err != nil { logger.Fatal("error releasing old fractions:", zap.Error(err)) } @@ -232,16 +204,15 @@ func (lc *lifecycleManager) cleanLocal(sizeLimit uint64, wg *sync.WaitGroup) { } } - wg.Add(1) - go func() { - defer wg.Done() - for _, sealed := range toDelete { - sealed.proxy.Redirect(emptyFraction{}) - lc.infoCache.Remove(sealed.instance.Info().Name()) - sealed.instance.Suicide() + wg.Add(len(toDelete)) + for _, frac := range toDelete { + go func() { + defer wg.Done() + lc.infoCache.Remove(frac.Info().Name()) + frac.Destroy() maintenanceTruncateTotal.Add(1) - } - }() + }() + } } // updateOldestMetric updates the prometheus metric with oldest fraction timestamp. @@ -254,13 +225,13 @@ func (lc *lifecycleManager) updateOldestMetric() { // Stops ongoing offloading tasks and cleans up both local and remote resources. 
func (lc *lifecycleManager) removeOverflowed(sizeLimit uint64, wg *sync.WaitGroup) { evicted := lc.registry.EvictOverflowed(sizeLimit) - for _, item := range evicted { + for _, sealed := range evicted { wg.Add(1) go func() { defer wg.Done() // Cancel the offloading task - this operation may take significant time // hence executed in a separate goroutine to avoid blocking - lc.tasks.Cancel(item.instance.BaseFileName) + lc.tasks.Cancel(sealed.BaseFileName) }() } } diff --git a/fracmanager/lifecycle_manager_test.go b/fracmanager/lifecycle_manager_test.go index abd180e2..cb9ab1e0 100644 --- a/fracmanager/lifecycle_manager_test.go +++ b/fracmanager/lifecycle_manager_test.go @@ -1,14 +1,20 @@ package fracmanager import ( + "math" "math/rand" "path/filepath" "sync" "testing" + "time" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/ozontech/seq-db/consts" + "github.com/ozontech/seq-db/frac/processor" + "github.com/ozontech/seq-db/parser" + "github.com/ozontech/seq-db/seq" ) func setupLifecycle(t testing.TB, cfg *Config) (*lifecycleManager, func()) { @@ -31,21 +37,18 @@ func TestFracInfoCache(t *testing.T) { lc, tearDown := setupLifecycle(t, nil) defer tearDown() - var total uint64 - fillRotateAndCheck := func(names map[string]struct{}) { - active := lc.registry.Active() - appendDocsToActive(t, active.instance, 10+rand.Intn(10)) + appender := lc.registry.Appender() + appendDocsToActive(t, appender.Active, 10+rand.Intn(10)) wg := sync.WaitGroup{} lc.rotate(0, &wg) wg.Wait() - info := active.proxy.Info() + info := appender.Info() _, ok := lc.infoCache.Get(info.Name()) assert.True(t, ok) - total += info.FullSize() names[info.Name()] = struct{}{} } @@ -53,12 +56,13 @@ func TestFracInfoCache(t *testing.T) { for range 10 { fillRotateAndCheck(first) } - halfSize := total + halfSize := lc.registry.Stats().TotalSizeOnDiskLocal() second := map[string]struct{}{} for range 10 { fillRotateAndCheck(second) } + total := 
lc.registry.Stats().TotalSizeOnDiskLocal() wg := sync.WaitGroup{} lc.cleanLocal(total-halfSize, &wg) @@ -80,18 +84,14 @@ func TestCapacityExceeded(t *testing.T) { defer tearDown() const fracsCount = 10 - var total uint64 fillAndRotate := func() { - active := lc.registry.Active() - appendDocsToActive(t, active.instance, 10+rand.Intn(10)) + appender := lc.registry.Appender() + appendDocsToActive(t, appender.Active, 10+rand.Intn(10)) wg := sync.WaitGroup{} lc.rotate(0, &wg) wg.Wait() - - info := active.proxy.Info() - total += info.FullSize() } assert.False(t, lc.flags.IsCapacityExceeded(), "expect data dir is empty") @@ -102,6 +102,8 @@ func TestCapacityExceeded(t *testing.T) { } assert.False(t, lc.flags.IsCapacityExceeded(), "there should be no deletions and the flag is false") + total := lc.registry.Stats().TotalSizeOnDiskLocal() + wg := sync.WaitGroup{} lc.cleanLocal(total, &wg) wg.Wait() @@ -121,20 +123,15 @@ func TestOldestMetrics(t *testing.T) { defer tearDown() const fracsCount = 10 - var total uint64 - fillAndRotate := func() { - active := lc.registry.Active() - appendDocsToActive(t, active.instance, 10+rand.Intn(10)) + appender := lc.registry.Appender() + appendDocsToActive(t, appender.Active, 10+rand.Intn(10)) wg := sync.WaitGroup{} lc.rotate(0, &wg) wg.Wait() - - info := active.proxy.Info() - total += info.FullSize() } - firstFracTime := lc.registry.Active().proxy.Info().CreationTime + firstFracTime := lc.registry.Appender().Info().CreationTime for range fracsCount { fillAndRotate() } @@ -143,12 +140,15 @@ func TestOldestMetrics(t *testing.T) { assert.Equal(t, firstFracTime, lc.registry.OldestTotal(), "should point to the very first fraction when all data is local") assert.Equal(t, firstFracTime, lc.registry.OldestLocal(), "should point to the first fraction when nothing is offloaded") - halfSize := total - halfwayFracTime := lc.registry.Active().proxy.Info().CreationTime + halfSize := lc.registry.Stats().TotalSizeOnDiskLocal() + + halfwayFracTime := 
lc.registry.Appender().Info().CreationTime for range fracsCount { fillAndRotate() } + total := lc.registry.Stats().TotalSizeOnDiskLocal() + wg := sync.WaitGroup{} lc.offloadLocal(t.Context(), total-halfSize, 0, &wg) wg.Wait() @@ -158,3 +158,80 @@ func TestOldestMetrics(t *testing.T) { assert.Equal(t, firstFracTime, lc.registry.OldestTotal(), "should still reference the first fraction after offload") assert.Equal(t, halfwayFracTime, lc.registry.OldestLocal(), "should point to the oldest remaining local fraction after offload") } + +func TestPendingDestroy(t *testing.T) { + lc, tearDown := setupLifecycle(t, nil) + defer tearDown() + + const ( + fracsCount = 10 + docsPerFrac = 10 + ) + // appending docs to `fracsCount` fractions where the last is active and the rest are sealed + wg := sync.WaitGroup{} + for range fracsCount - 1 { + appendDocsToActive(t, lc.registry.Appender().Active, docsPerFrac) + lc.rotate(0, &wg) + } + appendDocsToActive(t, lc.registry.Appender().Active, docsPerFrac) + + // wait sealing complete + wg.Wait() + + // take all fracs to search + fractions1, release1 := lc.registry.AcquireAllFractions() + + // delete all sealing fracs + lc.cleanLocal(lc.registry.Appender().Info().FullSize(), &wg) + + var ( + beforeRelease time.Time + afterCleanup time.Time + ) + + cleanup := sync.WaitGroup{} + cleanup.Add(1) + go func() { + // cleanup is pending, so run it in a goroutine + // waiting for cleanup to finish + defer cleanup.Done() + wg.Wait() + afterCleanup = time.Now() + }() + + queryAst, err := parser.ParseSeqQL("*", seq.Mapping{}) + require.NoError(t, err, "failed to parse query") + params := processor.SearchParams{ + AST: queryAst.Root, + From: seq.MID(0), + To: seq.MID(math.MaxUint64), + Limit: math.MaxInt32, + } + + for _, f := range fractions1 { + qpr, err := f.Search(t.Context(), params) + assert.NoError(t, err, "failed to search") + assert.Equal(t, docsPerFrac, len(qpr.IDs)) + } + + beforeRelease = time.Now() + release1() + + cleanup.Wait() + 
assert.Less(t, beforeRelease, afterCleanup, "we expect cleanup to happen after release") + + fractions2, release2 := lc.registry.AcquireAllFractions() + + assert.Len(t, fractions2, 1, "only one active fraction should remain") + singleName := fractions2[0].Info().Name() + + for _, f := range fractions1 { + if f.Info().Name() == singleName { + continue + } + assert.Panics(t, func() { + _, _ = f.Search(t.Context(), params) + }, "searching by destroyed faction is expected to trigger a panic") + } + release2() +} diff --git a/fracmanager/partitioned_collection.go b/fracmanager/partitioned_collection.go new file mode 100644 index 00000000..7f37f045 --- /dev/null +++ b/fracmanager/partitioned_collection.go @@ -0,0 +1,117 @@ +package fracmanager + +import ( + "iter" + + "github.com/ozontech/seq-db/util" +) + +// PartitionedCollection manages a collection of objects grouped into partitions by a user‑defined value. +// Each partition is identified by a uint64. +type PartitionedCollection[T any] struct { + getPartition func(T) uint64 // function to extract partition ID from object + byKey map[string]T // primary index: key -> object + byPartition map[uint64]map[string]T // partition ID -> map[key]object + minPartition *util.MinHeap[uint64] // min‑heap of partition IDs for O(1) MinPartition +} + +// NewPartitionedCollection creates a new empty PartitionedCollection. +func NewPartitionedCollection[T any](getPartition func(T) uint64) PartitionedCollection[T] { + return PartitionedCollection[T]{ + getPartition: getPartition, + byKey: make(map[string]T), + byPartition: make(map[uint64]map[string]T), + minPartition: util.NewMinHeap[uint64](), + } +} + +// Add inserts a new object into the collection. 
+func (c *PartitionedCollection[T]) Add(key string, obj T) { + if _, ok := c.byKey[key]; ok { + return + } + + partitionID := c.getPartition(obj) + if _, ok := c.byPartition[partitionID]; !ok { + c.minPartition.Push(partitionID) + c.byPartition[partitionID] = make(map[string]T) + } + c.byPartition[partitionID][key] = obj + c.byKey[key] = obj +} + +// Delete removes an object from the collection by its key. +// Does nothing if the key doesn't exist. +func (c *PartitionedCollection[T]) Del(key string) { + obj, ok := c.byKey[key] + if !ok { + return + } + + partitionID := c.getPartition(obj) + delete(c.byPartition[partitionID], key) + if len(c.byPartition[partitionID]) == 0 { + c.minPartition.Remove(partitionID) + delete(c.byPartition, partitionID) + } + delete(c.byKey, key) +} + +// MinPartition returns the smallest partition ID among all stored objects. +// Returns 0 if the collection is empty. +func (c *PartitionedCollection[T]) MinPartition() uint64 { + if val, ok := c.minPartition.Min(); ok { + return val + } + return 0 +} + +// GetByPartition returns all objects in the specified partition. +func (c *PartitionedCollection[T]) GetByPartition(partitionID uint64) []T { + partitionMap, ok := c.byPartition[partitionID] + if !ok { + return nil + } + res := make([]T, 0, len(partitionMap)) + for _, obj := range partitionMap { + res = append(res, obj) + } + return res +} + +// Get retrieves an object by its key. +// Returns the object and true if found, zero value and false otherwise. +func (c *PartitionedCollection[T]) Get(key string) (T, bool) { + obj, ok := c.byKey[key] + return obj, ok +} + +// All returns all objects in the collection. +// The order is not guaranteed. +func (c *PartitionedCollection[T]) All() iter.Seq[T] { + return func(yield func(T) bool) { + for _, obj := range c.byKey { + if !yield(obj) { + return + } + } + } +} + +// Len returns the number of objects in the collection. 
+func (c *PartitionedCollection[T]) Len() int { + return len(c.byKey) +} + +// GetAllPartitions returns a map of all partitions in the collection. +func (c *PartitionedCollection[T]) GetAllPartitions() map[uint64][]T { + result := make(map[uint64][]T, len(c.byPartition)) + for partitionID, objects := range c.byPartition { + partition := make([]T, 0, len(objects)) + for _, obj := range objects { + partition = append(partition, obj) + } + result[partitionID] = partition + } + return result +} diff --git a/fracmanager/proxy_frac.go b/fracmanager/proxy_frac.go deleted file mode 100644 index 6d4df41f..00000000 --- a/fracmanager/proxy_frac.go +++ /dev/null @@ -1,201 +0,0 @@ -package fracmanager - -import ( - "context" - "errors" - "math" - "sync" - "time" - - "go.uber.org/zap" - - "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" - "github.com/ozontech/seq-db/frac/processor" - "github.com/ozontech/seq-db/logger" - "github.com/ozontech/seq-db/metric" - "github.com/ozontech/seq-db/seq" - "github.com/ozontech/seq-db/util" -) - -var ( - _ frac.Fraction = (*fractionProxy)(nil) - _ frac.Fraction = (*emptyFraction)(nil) - - ErrFractionNotWritable = errors.New("fraction is not writable") - ErrFractionSuspended = errors.New("write operations temporarily suspended - database capacity exceeded") -) - -// fractionProxy provides thread-safe access to a fraction with atomic replacement -// Used to switch fraction implementations (active → sealed → remote) without blocking readers. -// Lifecycle: Created for each fraction, persists through state transitions. 
-type fractionProxy struct { - mu sync.RWMutex - impl frac.Fraction // Current fraction implementation -} - -func (p *fractionProxy) Redirect(f frac.Fraction) { - p.mu.Lock() - defer p.mu.Unlock() - p.impl = f -} - -func (p *fractionProxy) Info() *common.Info { - p.mu.RLock() - defer p.mu.RUnlock() - return p.impl.Info() -} - -func (p *fractionProxy) IsIntersecting(from, to seq.MID) bool { - p.mu.RLock() - defer p.mu.RUnlock() - return p.impl.IsIntersecting(from, to) -} - -func (p *fractionProxy) Contains(mid seq.MID) bool { - p.mu.RLock() - defer p.mu.RUnlock() - return p.impl.Contains(mid) -} - -func (p *fractionProxy) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { - p.mu.RLock() - defer p.mu.RUnlock() - return p.impl.Fetch(ctx, ids) -} - -func (p *fractionProxy) Search(ctx context.Context, params processor.SearchParams) (*seq.QPR, error) { - p.mu.RLock() - defer p.mu.RUnlock() - return p.impl.Search(ctx, params) -} - -func (p *fractionProxy) FindLIDs(ctx context.Context, ids []seq.ID) ([]seq.LID, error) { - return p.impl.FindLIDs(ctx, ids) -} - -// activeProxy manages an active (writable) fraction -// Tracks pending write operations and provides freeze capability. -// Lifecycle: Created when fraction becomes active, destroyed after sealing. 
-type activeProxy struct { - proxy *fractionProxy // Thread-safe fraction access - instance *frac.Active // Actual active fraction instance - sealed *frac.Sealed // Sealed version (set after sealing) - - mu sync.RWMutex // Protects readonly state - wg sync.WaitGroup // Tracks pending write operations - - finalized bool // Whether fraction is frozen for writes - suspended bool // Temporarily suspended for writes -} - -func newActiveProxy(active *frac.Active) *activeProxy { - return &activeProxy{ - proxy: &fractionProxy{impl: active}, - instance: active, - } -} - -// Append adds documents to the active fraction -func (p *activeProxy) Append(docs, meta []byte) error { - p.mu.RLock() - if p.finalized { - p.mu.RUnlock() - return ErrFractionNotWritable - } - if p.suspended { - p.mu.RUnlock() - return ErrFractionSuspended - } - p.wg.Add(1) // Important: wg.Add() inside lock to prevent race with WaitWriteIdle() - p.mu.RUnlock() - - return p.instance.Append(docs, meta, &p.wg) -} - -// WaitWriteIdle waits for all pending write operations to complete -// Used before sealing to ensure data consistency. -func (p *activeProxy) WaitWriteIdle() { - start := time.Now() - logger.Info("waiting fraction to stop write...", zap.String("name", p.instance.BaseFileName)) - p.wg.Wait() - waitTime := util.DurationToUnit(time.Since(start), "s") - logger.Info("write is stopped", - zap.String("name", p.instance.BaseFileName), - zap.Float64("time_wait_s", waitTime)) -} - -func (p *activeProxy) Suspended() bool { - p.mu.Lock() - defer p.mu.Unlock() - - return p.suspended -} - -func (p *activeProxy) Suspend(value bool) { - p.mu.Lock() - p.suspended = value - p.mu.Unlock() -} - -// Finalize marks the fraction as read-only and prevents new writes from starting after finalize. 
-func (p *activeProxy) Finalize() error { - p.mu.Lock() - defer p.mu.Unlock() - - if p.finalized { - return errors.New("fraction is already finalized") - } - p.finalized = true - - return nil -} - -// sealedProxy represents a sealed fraction that may be offloaded -// Tracks both local sealed instance and remote version if offloaded. -type sealedProxy struct { - proxy *fractionProxy // Thread-safe fraction access - instance *frac.Sealed // Local sealed fraction - remote *frac.Remote // Remote version (if offloaded) -} - -// remoteProxy represents an offloaded fraction -type remoteProxy struct { - proxy *fractionProxy // Thread-safe fraction access - instance *frac.Remote // Remote fraction instance -} - -// emptyFraction represents a missing or deleted fraction -// Returns empty results for all operations. -// Used as placeholder when fraction is removed but references still exist. -type emptyFraction struct { -} - -func (emptyFraction) Info() *common.Info { - return &common.Info{ - Path: "empty", - From: math.MaxUint64, - To: 0, - } -} - -func (emptyFraction) IsIntersecting(_, _ seq.MID) bool { - return false -} - -func (emptyFraction) Contains(mid seq.MID) bool { - return false -} - -func (emptyFraction) Fetch(ctx context.Context, ids []seq.ID) ([][]byte, error) { - return nil, nil -} - -func (emptyFraction) Search(_ context.Context, params processor.SearchParams) (*seq.QPR, error) { - metric.CountersTotal.WithLabelValues("empty_data_provider").Inc() - return &seq.QPR{Aggs: make([]seq.AggregatableSamples, len(params.AggQ))}, nil -} - -func (emptyFraction) FindLIDs(_ context.Context, _ []seq.ID) ([]seq.LID, error) { - return nil, nil -} diff --git a/fracmanager/sync_appender.go b/fracmanager/sync_appender.go new file mode 100644 index 00000000..b8d93ab4 --- /dev/null +++ b/fracmanager/sync_appender.go @@ -0,0 +1,82 @@ +package fracmanager + +import ( + "errors" + "sync" + "time" + + "go.uber.org/zap" + + "github.com/ozontech/seq-db/logger" + 
"github.com/ozontech/seq-db/util" +) + +var ( + ErrFractionNotWritable = errors.New("fraction is not writable") + ErrFractionSuspended = errors.New("write operations temporarily suspended - database capacity exceeded") +) + +type syncAppender struct { + refCountedActive // Actual active fraction instance + + mu sync.RWMutex // Protects readonly state + wg sync.WaitGroup // Tracks pending write operations + + finalized bool // Whether fraction is frozen for writes + suspended bool // Temporarily suspended for writes +} + +// Append adds documents to the active fraction +func (a *syncAppender) Append(docs, meta []byte) error { + a.mu.RLock() + if a.finalized { + a.mu.RUnlock() + return ErrFractionNotWritable + } + if a.suspended { + a.mu.RUnlock() + return ErrFractionSuspended + } + a.wg.Add(1) // Important: wg.Add() inside lock to prevent race with WaitWriteIdle() + a.mu.RUnlock() + + return a.refCountedActive.Append(docs, meta, &a.wg) +} + +func (a *syncAppender) Suspended() bool { + a.mu.Lock() + defer a.mu.Unlock() + + return a.suspended +} + +func (a *syncAppender) Suspend(value bool) { + a.mu.Lock() + a.suspended = value + a.mu.Unlock() +} + +// WaitWriteIdle waits for all pending write operations to complete +// Used before sealing to ensure data consistency. +func (a *syncAppender) WaitWriteIdle() { + start := time.Now() + logger.Info("waiting fraction to stop write...", zap.String("name", a.BaseFileName)) + a.wg.Wait() + waitTime := util.DurationToUnit(time.Since(start), "s") + logger.Info("write is stopped", + zap.String("name", a.BaseFileName), + zap.Float64("time_wait_s", waitTime)) +} + +// Finalize marks the fraction as read-only and prevents new writes from starting after finalize. 
+func (a *syncAppender) Finalize() error { + a.mu.Lock() + if a.finalized { + a.mu.Unlock() + return errors.New("fraction is already finalized") + } + a.finalized = true + a.mu.Unlock() + + return nil +} diff --git a/skipmaskmanager/skip_mask_manager.go b/skipmaskmanager/skip_mask_manager.go index 79f6680d..40f48480 100644 --- a/skipmaskmanager/skip_mask_manager.go +++ b/skipmaskmanager/skip_mask_manager.go @@ -45,7 +45,7 @@ type MappingProvider interface { } type fractionAcquirer interface { - Fractions() fracmanager.List + AcquireFractions() (fracmanager.List, func()) AcquireFraction(name string) (_ frac.Fraction, release func(), ok bool) } @@ -142,7 +142,7 @@ func New( // - Begins asynchronous processing of all skip mask queries // // This method must be called before using the manager. -func (smm *SkipMaskManager) Start(fracs fractionAcquirer) { +func (smm *SkipMaskManager) Start(fracProvider fractionAcquirer) { smm.createDataDir() err := smm.loadSkipMasks() @@ -150,7 +150,10 @@ func (smm *SkipMaskManager) Start(fracs fractionAcquirer) { logger.Fatal("failed to load previous skip masks", zap.Error(err)) } - err = smm.buildQueue(fracs.Fractions()) + fracs, release := fracProvider.AcquireFractions() + defer release() + + err = smm.buildQueue(fracs) if err != nil { logger.Fatal("failed to build skip mask manager queue", zap.Error(err)) } @@ -171,7 +174,7 @@ func (smm *SkipMaskManager) Start(fracs fractionAcquirer) { } sm.ast = ast - smm.processSkipMask(sm, fracs) + smm.processSkipMask(sm, fracProvider) } }() } @@ -439,7 +442,7 @@ func (smm *SkipMaskManager) buildQueue(fracs fracmanager.List) error { // It processes each fraction with a .queue file, running search queries in parallel // (limited by the rate limiter). Each successful search writes results to a .skipmask // file. The skip mask status is set to Done when all fractions are processed. 
-func (smm *SkipMaskManager) processSkipMask(skipMask *SkipMask, fracs fractionAcquirer) { +func (smm *SkipMaskManager) processSkipMask(skipMask *SkipMask, fracProvider fractionAcquirer) { skipMaskDes, err := os.ReadDir(skipMask.dirPath) if err != nil { panic(fmt.Errorf("BUG: reading directory must be successful: %s", err)) @@ -457,7 +460,7 @@ func (smm *SkipMaskManager) processSkipMask(skipMask *SkipMask, fracs fractionAc defer skipMask.processWg.Done() defer func() { <-smm.rateLimit }() - f, release, ok := fracs.AcquireFraction(fracNameFromFilePath(name)) + f, release, ok := fracProvider.AcquireFraction(fracNameFromFilePath(name)) if !ok { // skip missing fracs return } diff --git a/storeapi/grpc_fetch.go b/storeapi/grpc_fetch.go index d640618c..9eb14147 100644 --- a/storeapi/grpc_fetch.go +++ b/storeapi/grpc_fetch.go @@ -68,7 +68,10 @@ func (g *GrpcV1) doFetch(ctx context.Context, req *storeapi.FetchRequest, stream dp := acquireDocFieldsFilter(req.FieldsFilter) defer releaseDocFieldsFilter(dp) - docsStream := newDocsStream(ctx, ids, g.fetchData.docFetcher, g.fracManager.Fractions()) + fracs, release := g.fracManager.AcquireFractions() + defer release() + + docsStream := newDocsStream(ctx, ids, g.fetchData.docFetcher, fracs) for _, id := range ids { workTime := time.Now() doc, err := docsStream.Next() diff --git a/storeapi/grpc_search.go b/storeapi/grpc_search.go index 9eb89e73..e5eedd98 100644 --- a/storeapi/grpc_search.go +++ b/storeapi/grpc_search.go @@ -189,18 +189,13 @@ func (g *GrpcV1) doSearch( } searchTr := tr.NewChild("search iteratively") - qpr, err := g.searchData.searcher.SearchDocs( - ctx, - g.fracManager.Fractions(), - searchParams, - tr, - ) + qpr, err := g.searchDocs(ctx, searchParams, tr) searchTr.Done() + if err != nil { if code, ok := parseStoreError(err); ok { return &storeapi.SearchResponse{Code: code}, nil } - return nil, err } @@ -229,6 +224,13 @@ func (g *GrpcV1) doSearch( return buildSearchResponse(qpr), nil } +func (g *GrpcV1) 
searchDocs(ctx context.Context, sp processor.SearchParams, tr *querytracer.Tracer) (*seq.QPR, error) { + fracs, release := g.fracManager.AcquireFractions() + defer release() + + return g.searchData.searcher.SearchDocs(ctx, fracs, sp, tr) +} + func (g *GrpcV1) parseQuery(query string) (*parser.ASTNode, error) { seqql, err := parser.ParseSeqQL(query, g.mappingProvider.GetMapping()) if err != nil { diff --git a/util/min_heap.go b/util/min_heap.go new file mode 100644 index 00000000..5ec79aee --- /dev/null +++ b/util/min_heap.go @@ -0,0 +1,117 @@ +package util + +import ( + "cmp" + "container/heap" +) + +// MinHeap is a min‑heap for any comparable type. +// Maintains both a heap structure and a map for fast lookup of items. +type MinHeap[T cmp.Ordered] struct { + items []*heapItem[T] // Heap elements + indexMap map[T]*heapItem[T] // Value → item mapping for O(1) lookup +} + +// heapItem represents an element in the heap. +type heapItem[T comparable] struct { + value T // Stored value + index int // Current index in the heap +} + +// NewMinHeap creates and initializes a new MinHeap instance. +func NewMinHeap[T cmp.Ordered]() *MinHeap[T] { + h := &MinHeap[T]{ + items: make([]*heapItem[T], 0), + indexMap: make(map[T]*heapItem[T]), + } + heap.Init((*heapWrapper[T])(h)) + return h +} + +// Push adds a value to the heap if it doesn't already exist (no duplicates). +func (h *MinHeap[T]) Push(value T) { + if _, ok := h.indexMap[value]; !ok { + item := &heapItem[T]{ + value: value, + index: -1, + } + h.indexMap[value] = item + heap.Push((*heapWrapper[T])(h), item) + } +} + +// Remove deletes one occurrence of the specified value from the heap. +// Does nothing if the value doesn't exist. +func (h *MinHeap[T]) Remove(value T) { + item, ok := h.indexMap[value] + if !ok { + return + } + heap.Remove((*heapWrapper[T])(h), item.index) + delete(h.indexMap, value) +} + +// PopMin removes and returns the minimum value from the heap. +// Returns (zero value, false) if the heap is empty. 
+func (h *MinHeap[T]) PopMin() (T, bool) {
+	if len(h.items) == 0 {
+		var zero T
+		return zero, false
+	}
+	value := h.items[0].value
+	heap.Pop((*heapWrapper[T])(h))
+	delete(h.indexMap, value) // otherwise a later Push(value) is ignored and Remove(value) panics
+	return value, true
+}
+
+// Min returns the minimum value in the heap without removing it.
+// Returns (zero value, false) if the heap is empty.
+func (h *MinHeap[T]) Min() (T, bool) {
+	var zero T
+	if len(h.items) == 0 {
+		return zero, false
+	}
+	return h.items[0].value, true
+}
+
+// Len returns the current number of elements in the heap.
+func (h *MinHeap[T]) Len() int {
+	return len(h.items)
+}
+
+// heapWrapper is a defined type over MinHeap that implements heap.Interface.
+type heapWrapper[T cmp.Ordered] MinHeap[T]
+
+// Len is part of heap.Interface — returns the number of elements.
+func (hw *heapWrapper[T]) Len() int {
+	return len(hw.items)
+}
+
+// Less is part of heap.Interface — defines min‑heap order (smaller values first).
+func (hw *heapWrapper[T]) Less(i, j int) bool {
+	return hw.items[i].value < hw.items[j].value
+}
+
+// Swap is part of heap.Interface — swaps elements and updates their indices.
+func (hw *heapWrapper[T]) Swap(i, j int) {
+	hw.items[i], hw.items[j] = hw.items[j], hw.items[i]
+	hw.items[i].index = i
+	hw.items[j].index = j
+}
+
+// Push is part of heap.Interface — adds a new element to the heap.
+func (hw *heapWrapper[T]) Push(x interface{}) {
+	item := x.(*heapItem[T])
+	item.index = len(hw.items)
+	hw.items = append(hw.items, item)
+}
+
+// Pop is part of heap.Interface — removes and returns the last element.
+func (hw *heapWrapper[T]) Pop() interface{} { + old := hw.items + n := len(old) - 1 + item := old[n] + item.index = -1 + hw.items = old[0:n] + return item +} From e447d7f40a8192df5117219beb0e7f3dcfe1dd62 Mon Sep 17 00:00:00 2001 From: Evgenii Guguchkin Date: Wed, 20 May 2026 17:02:42 +0300 Subject: [PATCH 02/29] review fixes --- fracmanager/sync_appender.go | 6 ++++-- util/min_heap.go | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fracmanager/sync_appender.go b/fracmanager/sync_appender.go index b8d93ab4..76cf4ee0 100644 --- a/fracmanager/sync_appender.go +++ b/fracmanager/sync_appender.go @@ -63,9 +63,11 @@ func (a *syncAppender) WaitWriteIdle() { logger.Info("waiting fraction to stop write...", zap.String("name", a.BaseFileName)) a.wg.Wait() waitTime := util.DurationToUnit(time.Since(start), "s") - logger.Info("write is stopped", + logger.Info( + "write is stopped", zap.String("name", a.BaseFileName), - zap.Float64("time_wait_s", waitTime)) + zap.Float64("time_wait_s", waitTime), + ) } // Finalize marks the fraction as read-only and prevents new writes from starting after finalize. diff --git a/util/min_heap.go b/util/min_heap.go index 5ec79aee..70b92fcf 100644 --- a/util/min_heap.go +++ b/util/min_heap.go @@ -100,14 +100,14 @@ func (hw *heapWrapper[T]) Swap(i, j int) { } // Push is part of heap.Interface — adds a new element to the heap. -func (hw *heapWrapper[T]) Push(x interface{}) { +func (hw *heapWrapper[T]) Push(x any) { item := x.(*heapItem[T]) item.index = len(hw.items) hw.items = append(hw.items, item) } // Pop is part of heap.Interface — removes and returns the last element. 
-func (hw *heapWrapper[T]) Pop() interface{} { +func (hw *heapWrapper[T]) Pop() any { old := hw.items n := len(old) - 1 item := old[n] From 69c1e69db8f2d451bfb62a3d950a4a769eb09d09 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Thu, 21 May 2026 14:50:07 +0300 Subject: [PATCH 03/29] feat: new api for compaction --- frac/sealed.go | 1 + fracmanager/fracmanager.go | 50 +++++- fracmanager/fracmanager_for_tests.go | 2 +- fracmanager/fracmanager_test.go | 6 +- fracmanager/fracs_stats.go | 5 +- fracmanager/fraction_registry.go | 219 ++++++++++++++++++-------- fracmanager/lifecycle_manager.go | 27 ++-- fracmanager/lifecycle_manager_test.go | 44 +++--- fracmanager/sync_appender.go | 16 +- 9 files changed, 248 insertions(+), 122 deletions(-) diff --git a/frac/sealed.go b/frac/sealed.go index c18f9a62..e5f8a555 100644 --- a/frac/sealed.go +++ b/frac/sealed.go @@ -392,6 +392,7 @@ func (f *Sealed) Release() { func (f *Sealed) Suicide() { f.Release() + // Rename docs atomically first — this commits the intent to delete. 
oldPath := f.BaseFileName + consts.DocsFileSuffix newPath := f.BaseFileName + consts.DocsDelFileSuffix diff --git a/fracmanager/fracmanager.go b/fracmanager/fracmanager.go index 77a73c78..5dc808c7 100644 --- a/fracmanager/fracmanager.go +++ b/fracmanager/fracmanager.go @@ -77,11 +77,11 @@ func New(ctx context.Context, cfg *Config, s3cli *s3.Client, skipMaskProvider sk wg.Wait() // finalize appender to prevent new writes - appender := lc.registry.Appender() - if err := appender.Finalize(); err != nil { + appender := lc.registry.appender() + if err := appender.finalize(); err != nil { logger.Fatal("shutdown fraction freezing error", zap.Error(err)) } - appender.WaitWriteIdle() + appender.waitWriteIdle() stopIdx() @@ -96,16 +96,50 @@ func New(ctx context.Context, cfg *Config, s3cli *s3.Client, skipMaskProvider sk return &fm, stop, nil } +type CompactionSnapshot struct { + claimed []*refCountedSealed +} + +func (cs *CompactionSnapshot) Fractions() []*frac.Sealed { + result := make([]*frac.Sealed, len(cs.claimed)) + for i, f := range cs.claimed { + result[i] = f.Sealed + } + return result +} + +func (cs *CompactionSnapshot) Destroy() { + for _, f := range cs.claimed { + f.Destroy() + } +} + +func (fm *FracManager) SealedFractionsSnapshot() []*frac.Sealed { + return fm.lc.registry.sealedSnapshot() +} + +func (fm *FracManager) ClaimForCompaction(names []string) (*CompactionSnapshot, error) { + claimed, err := fm.lc.registry.claimForCompaction(names) + if err != nil { + return nil, err + } + return &CompactionSnapshot{claimed: claimed}, nil +} + +func (fm *FracManager) SubstituteWithSealed(produced *frac.Sealed, snapshot *CompactionSnapshot) { + fm.lc.registry.substituteWithSealed(produced, snapshot.claimed...) 
+} + func (fm *FracManager) AcquireFraction(name string) (frac.Fraction, func(), bool) { - return fm.lc.registry.AcquireOneFraction(name) + return fm.lc.registry.acquireOneFraction(name) } func (fm *FracManager) AcquireFractions() (List, func()) { - return fm.lc.registry.AcquireAllFractions() + return fm.lc.registry.acquireAllFractions() } func (fm *FracManager) Oldest() uint64 { - return fm.lc.registry.OldestTotal() + return fm.lc.registry.oldestTotal() } func (fm *FracManager) Flags() *StateManager { @@ -121,7 +155,7 @@ func (fm *FracManager) Append(ctx context.Context, docs storage.DocBlock, metas return ctx.Err() default: // Try to append data to the currently active fraction - err := fm.lc.registry.Appender().Append(docs, metas) + err := fm.lc.registry.appender().append(docs, metas) if err != nil { logger.Info("append fail", zap.Error(err)) if err == ErrFractionNotWritable { @@ -167,7 +201,7 @@ func startStatsWorker(ctx context.Context, reg *fractionRegistry, wg *sync.WaitG logger.Info("stats loop is started") // Run stats collection every 10 seconds util.RunEvery(ctx.Done(), time.Second*10, func() { - stats := reg.Stats() + stats := reg.statistics() stats.Log() // Log statistics stats.SetMetrics() // Update Prometheus metrics }) diff --git a/fracmanager/fracmanager_for_tests.go b/fracmanager/fracmanager_for_tests.go index c4ec1cad..39349289 100644 --- a/fracmanager/fracmanager_for_tests.go +++ b/fracmanager/fracmanager_for_tests.go @@ -3,7 +3,7 @@ package fracmanager import "sync" func (fm *FracManager) WaitIdleForTests() { - fm.lc.registry.Appender().WaitWriteIdle() + fm.lc.registry.appender().waitWriteIdle() } func (fm *FracManager) SealForcedForTests() { diff --git a/fracmanager/fracmanager_test.go b/fracmanager/fracmanager_test.go index d92e13b8..64c264b6 100644 --- a/fracmanager/fracmanager_test.go +++ b/fracmanager/fracmanager_test.go @@ -62,7 +62,7 @@ func TestSealingOnShutdown(t *testing.T) { cfg, fm, stop := setupFracManager(t, cfg) 
appendDocsToFracManager(t, fm, 10) - activeName := fm.lc.registry.all.fractions[0].Info().Name() + activeName := fm.lc.registry.snapshot.fractions[0].Info().Name() stop() @@ -70,7 +70,7 @@ func TestSealingOnShutdown(t *testing.T) { cfg.MinSealFracSize = 1 // to ensure that the frac will be sealed on shutdown cfg, fm, stop = setupFracManager(t, cfg) - allFractions := fm.lc.registry.all.fractions + allFractions := fm.lc.registry.snapshot.fractions assert.Equal(t, 1, len(allFractions), "should have one fraction") assert.Equal(t, activeName, allFractions[0].Info().Name(), "fraction should have the same name") _, ok := allFractions[0].(*syncAppender) @@ -80,7 +80,7 @@ func TestSealingOnShutdown(t *testing.T) { // third start _, fm, stop = setupFracManager(t, cfg) - allFractions = fm.lc.registry.all.fractions + allFractions = fm.lc.registry.snapshot.fractions assert.Equal(t, 2, len(allFractions), "should have 2 fraction: new active and old sealed") _, ok = allFractions[0].(*refCountedSealed) assert.True(t, ok, "first fraction should be sealed") diff --git a/fracmanager/fracs_stats.go b/fracmanager/fracs_stats.go index c70bbd37..ee255543 100644 --- a/fracmanager/fracs_stats.go +++ b/fracmanager/fracs_stats.go @@ -76,6 +76,7 @@ type registryStats struct { active fracsStats // Statistics for active fraction sealing fracsStats // Statistics for fractions in the sealing process sealed fracsStats // Statistics for fractions on sealed disk + compacting fracsStats // Statistics for fractions participating in compaction offloading fracsStats // Statistics for fractions in the offloading process remotes fracsStats // Statistics for fractions in remote storage } @@ -84,6 +85,7 @@ func (s *registryStats) Log() { s.active.Log("active") s.sealing.Log("sealing") s.sealed.Log("sealed") + s.compacting.Log("compacting") s.offloading.Log("offloading") s.remotes.Log("remotes") } @@ -92,10 +94,11 @@ func (s *registryStats) SetMetrics() { s.active.SetMetrics(dataSizeTotal, "active") 
s.sealing.SetMetrics(dataSizeTotal, "sealing") s.sealed.SetMetrics(dataSizeTotal, "sealed") + s.compacting.SetMetrics(dataSizeTotal, "compacting") s.offloading.SetMetrics(dataSizeTotal, "offloading") s.remotes.SetMetrics(dataSizeTotal, "remotes") } func (s registryStats) TotalSizeOnDiskLocal() uint64 { - return s.sealing.totalSizeOnDisk + s.sealed.totalSizeOnDisk + return s.sealing.totalSizeOnDisk + s.sealed.totalSizeOnDisk + s.compacting.totalSizeOnDisk } diff --git a/fracmanager/fraction_registry.go b/fracmanager/fraction_registry.go index b0667c04..0c0872c6 100644 --- a/fracmanager/fraction_registry.go +++ b/fracmanager/fraction_registry.go @@ -21,16 +21,17 @@ type fractionRegistry struct { sealing map[string]*syncAppender // fractions being sealed (0-5 typical) sealed PartitionedCollection[*refCountedSealed] // local sealed fractions (can be thousands) + compacting map[string]*refCountedSealed // fractions participating in compaction offloading PartitionedCollection[*refCountedSealed] // fractions being offloaded (0-5 typical) remotes PartitionedCollection[*refCountedRemote] // offloaded fractions (can be thousands) stats registryStats // size statistics for monitoring muAppender sync.RWMutex - appender *syncAppender // currently active writable fraction + sappender *syncAppender // currently active writable fraction - muAll sync.RWMutex - all fractionsSnapshot // all fractions + muSnapshot sync.RWMutex + snapshot fractionsSnapshot // all fractions } // NewFractionRegistry creates and initializes a new fraction registry instance. 
@@ -51,10 +52,11 @@ func NewFractionRegistry(active *frac.Active, sealed []*frac.Sealed, remotes []* } reg := fractionRegistry{ - appender: &syncAppender{refCountedActive: refCountedActive{Active: active}}, + sappender: &syncAppender{refCountedActive: refCountedActive{Active: active}}, sealing: map[string]*syncAppender{}, sealed: NewPartitionedCollection(func(rcs *refCountedSealed) uint64 { return creationTime(rcs) }), + compacting: map[string]*refCountedSealed{}, offloading: NewPartitionedCollection(func(rcs *refCountedSealed) uint64 { return lastDocTime(rcs) }), remotes: NewPartitionedCollection(func(rcr *refCountedRemote) uint64 { return lastDocTime(rcr) }), } @@ -76,51 +78,51 @@ func NewFractionRegistry(active *frac.Active, sealed []*frac.Sealed, remotes []* return ®, nil } -// Appender returns the currently active writable fraction. -func (r *fractionRegistry) Appender() *syncAppender { +// appender returns the currently active writable fraction. +func (r *fractionRegistry) appender() *syncAppender { r.muAppender.RLock() defer r.muAppender.RUnlock() - return r.appender + return r.sappender } -func (r *fractionRegistry) AcquireOneFraction(name string) (frac.Fraction, func(), bool) { - r.muAll.RLock() - defer r.muAll.RUnlock() +func (r *fractionRegistry) acquireOneFraction(name string) (frac.Fraction, func(), bool) { + r.muSnapshot.RLock() + defer r.muSnapshot.RUnlock() - return r.all.AcquireOne(name) + return r.snapshot.AcquireOne(name) } -// AcquireAllFractions returns a read-only view of all fractions -func (r *fractionRegistry) AcquireAllFractions() ([]frac.Fraction, func()) { - r.muAll.RLock() - defer r.muAll.RUnlock() +// acquireAllFractions returns a read-only view of all fractions +func (r *fractionRegistry) acquireAllFractions() ([]frac.Fraction, func()) { + r.muSnapshot.RLock() + defer r.muSnapshot.RUnlock() - return r.all.AcquireAll() + return r.snapshot.AcquireAll() } -// Stats returns current size statistics of the registry. 
-func (r *fractionRegistry) Stats() registryStats { +// statistics returns current size statistics of the registry. +func (r *fractionRegistry) statistics() registryStats { r.mu.RLock() s := r.stats - i := r.appender.Info() + i := r.sappender.Info() r.mu.RUnlock() s.active.Set(i) return s } -// OldestTotal returns the creation time of the oldest fraction in the registry. -func (r *fractionRegistry) OldestTotal() uint64 { - r.muAll.RLock() - defer r.muAll.RUnlock() - return r.all.oldestTotal +// oldestTotal returns the creation time of the oldest fraction in the registry. +func (r *fractionRegistry) oldestTotal() uint64 { + r.muSnapshot.RLock() + defer r.muSnapshot.RUnlock() + return r.snapshot.oldestTotal } -// OldestLocal returns the creation time of the oldest local fraction in the registry. -func (r *fractionRegistry) OldestLocal() uint64 { - r.muAll.RLock() - defer r.muAll.RUnlock() - return r.all.oldestLocal +// oldestLocal returns the creation time of the oldest local fraction in the registry. +func (r *fractionRegistry) oldestLocal() uint64 { + r.muSnapshot.RLock() + defer r.muSnapshot.RUnlock() + return r.snapshot.oldestLocal } type activeProvider interface { @@ -131,39 +133,39 @@ func (r *fractionRegistry) setAppender(appender *syncAppender) { r.muAppender.Lock() defer r.muAppender.Unlock() - r.appender = appender + r.sappender = appender - r.muAll.Lock() - defer r.muAll.Unlock() + r.muSnapshot.Lock() + defer r.muSnapshot.Unlock() - r.all.AddActive(appender) + r.snapshot.AddActive(appender) } -// RotateIfFull completes the current active fraction and starts a new one. +// rotateIfFull completes the current active fraction and starts a new one. // Moves previous active fraction to sealing queue. 
// Should be called when the current active fraction reaches size limit and needs to be rotated -func (r *fractionRegistry) RotateIfFull(maxSize uint64, ap activeProvider) (*refCountedActive, func(), error) { +func (r *fractionRegistry) rotateIfFull(maxSize uint64, ap activeProvider) (*refCountedActive, func(), error) { r.mu.Lock() defer r.mu.Unlock() - if r.appender.Info().DocsOnDisk <= maxSize { + if r.sappender.Info().DocsOnDisk <= maxSize { return nil, nil, nil } - old := r.appender + old := r.sappender r.sealing[old.Info().Name()] = old r.setAppender(&syncAppender{refCountedActive: refCountedActive{Active: ap.CreateActive()}}) - if err := old.Finalize(); err != nil { + if err := old.finalize(); err != nil { return nil, nil, err } curInfo := old.Info() r.stats.sealing.Add(curInfo) - r.appender.Suspend(old.Suspended()) + r.sappender.suspend(old.isSuspended()) wg := sync.WaitGroup{} wg.Add(1) @@ -172,7 +174,7 @@ func (r *fractionRegistry) RotateIfFull(maxSize uint64, ap activeProvider) (*ref go func() { defer wg.Done() - old.WaitWriteIdle() // can be long enough + old.waitWriteIdle() // can be long enough finalInfo := old.Info() r.mu.Lock() @@ -187,11 +189,11 @@ func (r *fractionRegistry) RotateIfFull(maxSize uint64, ap activeProvider) (*ref return &old.refCountedActive, wg.Wait, nil } -func (r *fractionRegistry) SuspendIfOverCapacity(maxQueue, maxSize uint64) { +func (r *fractionRegistry) suspendIfOverCapacity(maxQueue, maxSize uint64) { r.mu.Lock() defer r.mu.Unlock() - suspended := r.appender.Suspended() + suspended := r.sappender.isSuspended() if maxQueue > 0 && r.stats.sealing.count >= int(maxQueue) { if !suspended { @@ -199,7 +201,7 @@ func (r *fractionRegistry) SuspendIfOverCapacity(maxQueue, maxSize uint64) { zap.String("reason", "sealing queue size exceeded"), zap.Uint64("limit", maxQueue), zap.Int("queue_size", r.stats.sealing.count)) - r.appender.Suspend(true) + r.sappender.suspend(true) } return } @@ -212,7 +214,7 @@ func (r *fractionRegistry) 
SuspendIfOverCapacity(maxQueue, maxSize uint64) { zap.String("reason", "occupied space limit exceeded"), zap.Float64("queue_size_limit_gb", util.Float64ToPrec(util.SizeToUnit(maxSize, "gb"), 2)), zap.Float64("occupied_space_gb", util.Float64ToPrec(util.SizeToUnit(du, "gb"), 2))) - r.appender.Suspend(true) + r.sappender.suspend(true) } return } @@ -223,20 +225,21 @@ func (r *fractionRegistry) SuspendIfOverCapacity(maxQueue, maxSize uint64) { zap.Float64("occupied_space_gb", util.Float64ToPrec(util.SizeToUnit(du, "gb"), 2)), zap.Uint64("sealing_queue_size_limit", maxQueue), zap.Int("queue_size", r.stats.sealing.count)) - r.appender.Suspend(false) + r.sappender.suspend(false) } } func (r *fractionRegistry) diskUsage() uint64 { - return r.appender.Info().FullSize() + + return r.sappender.Info().FullSize() + r.stats.sealed.totalSizeOnDisk + r.stats.sealing.totalSizeOnDisk + + r.stats.compacting.totalSizeOnDisk + r.stats.offloading.totalSizeOnDisk } -// EvictLocalForDelete removes oldest local fractions to free disk space. +// evictLocalForDelete removes oldest local fractions to free disk space. // Returns evicted fractions or error if insufficient space is released. -func (r *fractionRegistry) EvictLocalForDelete(sizeLimit uint64) (evicted []*refCountedSealed, err error) { +func (r *fractionRegistry) evictLocalForDelete(sizeLimit uint64) (evicted []*refCountedSealed, err error) { r.mu.Lock() defer r.mu.Unlock() @@ -249,9 +252,9 @@ func (r *fractionRegistry) EvictLocalForDelete(sizeLimit uint64) (evicted []*ref return evicted, nil } -// EvictLocalForOffload removes oldest local fractions to moves it to offloading queue. +// evictLocalForOffload removes oldest local fractions to moves it to offloading queue. // Returns evicted fractions or error if insufficient space is released. 
-func (r *fractionRegistry) EvictLocalForOffload(sizeLimit uint64) ([]*refCountedSealed, error) { +func (r *fractionRegistry) evictLocalForOffload(sizeLimit uint64) ([]*refCountedSealed, error) { r.mu.Lock() defer r.mu.Unlock() @@ -272,16 +275,21 @@ func (r *fractionRegistry) evictLocal(sizeLimit uint64) ([]*refCountedSealed, er var releasingSize uint64 // calculate total used disk space - totalUsedSize := r.stats.TotalSizeOnDiskLocal() + r.appender.Info().FullSize() - - evicted := []*refCountedSealed{} + totalUsedSize := r.stats.TotalSizeOnDiskLocal() + r.sappender.Info().FullSize() + var evicted []*refCountedSealed for r.sealed.Len() > 0 && totalUsedSize-releasingSize > sizeLimit { for _, s := range r.sealed.GetByPartition(r.sealed.MinPartition()) { + if totalUsedSize-releasingSize <= sizeLimit { + break + } + info := s.Info() releasingSize += info.FullSize() + r.stats.sealed.Sub(info) r.sealed.Del(info.Name()) + evicted = append(evicted, s) } } @@ -296,10 +304,10 @@ func (r *fractionRegistry) evictLocal(sizeLimit uint64) ([]*refCountedSealed, er return evicted, nil } -// EvictRemote removes oldest remote fractions based on retention policy. +// evictRemote removes oldest remote fractions based on retention policy. // Fractions older than retention period are permanently deleted. // Returns removed fractions or empty slice if nothing to remove. -func (r *fractionRegistry) EvictRemote(retention time.Duration) []*refCountedRemote { +func (r *fractionRegistry) evictRemote(retention time.Duration) []*refCountedRemote { if retention == 0 { return nil } @@ -322,9 +330,9 @@ func (r *fractionRegistry) EvictRemote(retention time.Duration) []*refCountedRem return evicted } -// EvictOverflowed removes oldest fractions from offloading queue when it exceeds size limit. +// evictOverflowed removes oldest fractions from offloading queue when it exceeds size limit. // Used when offloading queue grows too large due to slow remote storage performance. 
-func (r *fractionRegistry) EvictOverflowed(sizeLimit uint64) (evicted []*refCountedSealed) { +func (r *fractionRegistry) evictOverflowed(sizeLimit uint64) (evicted []*refCountedSealed) { if sizeLimit == 0 { return nil } @@ -355,23 +363,43 @@ loop: return evicted } -// PromoteToSealed moves fractions from sealing to local queue when sealing completes. -func (r *fractionRegistry) PromoteToSealed(active *refCountedActive, sealed *frac.Sealed) { +// promoteToSealed moves fractions from sealing to local queue when sealing completes. +func (r *fractionRegistry) promoteToSealed(active *refCountedActive, sealed ...*frac.Sealed) { r.mu.Lock() defer r.mu.Unlock() - r.sealed.Add(sealed.Info().Name(), &refCountedSealed{Sealed: sealed}) - r.stats.sealed.Add(sealed.Info()) - r.stats.sealing.Sub(active.Info()) + for _, f := range sealed { + info := f.Info() + r.sealed.Add(info.Name(), &refCountedSealed{Sealed: f}) + r.stats.sealed.Add(info) + } + r.stats.sealing.Sub(active.Info()) delete(r.sealing, active.Info().Name()) r.rebuildSnapshot() } -// PromoteToRemote moves fractions from offloading to remote queue when offloading completes. +func (r *fractionRegistry) substituteWithSealed(produced *frac.Sealed, consumed ...*refCountedSealed) { + r.mu.Lock() + defer r.mu.Unlock() + + for _, f := range consumed { + info := f.Info() + r.stats.compacting.Sub(info) + delete(r.compacting, info.Name()) + } + + info := produced.Info() + r.stats.sealed.Add(info) + r.sealed.Add(info.Name(), &refCountedSealed{Sealed: produced}) + + r.rebuildSnapshot() +} + +// promoteToRemote moves fractions from offloading to remote queue when offloading completes. // Special case: handles fractions that don't require offloading (remote == nil). 
-func (r *fractionRegistry) PromoteToRemote(sealed *refCountedSealed, remote *frac.Remote) { +func (r *fractionRegistry) promoteToRemote(sealed *refCountedSealed, remote *frac.Remote) { r.mu.Lock() defer r.mu.Unlock() @@ -380,14 +408,60 @@ func (r *fractionRegistry) PromoteToRemote(sealed *refCountedSealed, remote *fra r.stats.remotes.Add(remote.Info()) } - r.stats.offloading.Sub(sealed.Info()) r.offloading.Del(sealed.Info().Name()) + r.stats.offloading.Sub(sealed.Info()) + + r.rebuildSnapshot() +} + +func (r *fractionRegistry) sealedSnapshot() []*frac.Sealed { + r.mu.RLock() + defer r.mu.RUnlock() + + result := make([]*frac.Sealed, 0, r.sealed.Len()) + for s := range r.sealed.All() { + result = append(result, s.Sealed) + } + + return result +} + +func (r *fractionRegistry) claimForCompaction(names []string) ([]*refCountedSealed, error) { + r.mu.Lock() + defer r.mu.Unlock() + + for _, name := range names { + // NOTE(dkharms): If offloading pressure is high on the oldest fractions, + // compaction may repeatedly fail to claim them and get into livelock. 
+ if _, ok := r.sealed.Get(name); !ok { + return nil, fmt.Errorf( + "fraction %q is not available for compaction", + name, + ) + } + } + + claimed := make([]*refCountedSealed, 0, len(names)) + for _, name := range names { + s, _ := r.sealed.Get(name) + + r.sealed.Del(name) + r.stats.sealed.Sub(s.Info()) + + r.compacting[name] = s + r.stats.compacting.Add(s.Info()) + + claimed = append(claimed, s) + } + r.rebuildSnapshot() + return claimed, nil } // rebuildSnapshot reconstructs the all fractions list func (r *fractionRegistry) rebuildSnapshot() { - capacity := r.remotes.Len() + r.offloading.Len() + r.sealed.Len() + len(r.sealing) + 1 + capacity := r.remotes.Len() + r.offloading.Len() + + r.sealed.Len() + len(r.compacting) + len(r.sealing) + 1 // allocate extra capacity to accommodate appender rotation that may occur during snapshot lifetime all := newFractionsSnapshot(capacity + 1) @@ -404,13 +478,18 @@ func (r *fractionRegistry) rebuildSnapshot() { all.AddSealed(s) } + for _, c := range r.compacting { + all.AddSealed(c) + } + for _, a := range r.sealing { all.AddActive(a) } - all.AddActive(r.appender) + all.AddActive(r.sappender) + + r.muSnapshot.Lock() + defer r.muSnapshot.Unlock() - r.muAll.Lock() - defer r.muAll.Unlock() - r.all = all + r.snapshot = all } diff --git a/fracmanager/lifecycle_manager.go b/fracmanager/lifecycle_manager.go index 24025c23..327d475f 100644 --- a/fracmanager/lifecycle_manager.go +++ b/fracmanager/lifecycle_manager.go @@ -2,6 +2,7 @@ package fracmanager import ( "context" + "fmt" "sync" "time" @@ -42,7 +43,7 @@ func newLifecycleManager( // Maintain performs periodic lifecycle management tasks. // It coordinates rotation, offloading, cleanup based on configuration. 
func (lc *lifecycleManager) Maintain(ctx context.Context, cfg *Config, wg *sync.WaitGroup) { - lc.registry.SuspendIfOverCapacity(cfg.SealingQueueLen, cfg.SuspendThreshold()) + lc.registry.suspendIfOverCapacity(cfg.SealingQueueLen, cfg.SuspendThreshold()) lc.rotate(cfg.FracSize, wg) if cfg.OffloadingEnabled { @@ -68,7 +69,7 @@ func (lc *lifecycleManager) SyncInfoCache() { // rotate checks if active fraction needs rotation based on size limit. // Creates new active fraction and starts sealing the previous one. func (lc *lifecycleManager) rotate(maxSize uint64, wg *sync.WaitGroup) { - active, waitBeforeSealing, err := lc.registry.RotateIfFull(maxSize, lc.provider) + active, waitBeforeSealing, err := lc.registry.rotateIfFull(maxSize, lc.provider) if err != nil { logger.Fatal("active fraction rotation error", zap.Error(err)) } @@ -89,7 +90,7 @@ func (lc *lifecycleManager) rotate(maxSize uint64, wg *sync.WaitGroup) { } lc.infoCache.Add(sealed.Info()) - lc.registry.PromoteToSealed(active, sealed) + lc.registry.promoteToSealed(active, sealed) active.Destroy() }() } @@ -97,7 +98,7 @@ func (lc *lifecycleManager) rotate(maxSize uint64, wg *sync.WaitGroup) { // offloadLocal starts offloading of local fractions to remote storage. // Selects fractions based on disk space usage and retention policy. 
func (lc *lifecycleManager) offloadLocal(ctx context.Context, sizeLimit uint64, retryDelay time.Duration, wg *sync.WaitGroup) { - toOffload, err := lc.registry.EvictLocalForOffload(sizeLimit) + toOffload, err := lc.registry.evictLocalForOffload(sizeLimit) if err != nil { logger.Fatal("error releasing old fractions:", zap.Error(err)) } @@ -108,7 +109,7 @@ func (lc *lifecycleManager) offloadLocal(ctx context.Context, sizeLimit uint64, remote := lc.offloadWithRetry(ctx, frac.Sealed, retryDelay) - lc.registry.PromoteToRemote(frac, remote) + lc.registry.promoteToRemote(frac, remote) if remote == nil { lc.infoCache.Remove(frac.Info().Name()) @@ -181,7 +182,7 @@ func (lc *lifecycleManager) tryOffload(ctx context.Context, sealed *frac.Sealed) // cleanRemote deletes outdated remote fractions based on retention policy. func (lc *lifecycleManager) cleanRemote(retention time.Duration, wg *sync.WaitGroup) { - toDelete := lc.registry.EvictRemote(retention) + toDelete := lc.registry.evictRemote(retention) wg.Add(len(toDelete)) for _, remote := range toDelete { go func() { @@ -194,10 +195,16 @@ func (lc *lifecycleManager) cleanRemote(retention time.Duration, wg *sync.WaitGr // cleanLocal deletes outdated local fractions when offloading is disabled. 
func (lc *lifecycleManager) cleanLocal(sizeLimit uint64, wg *sync.WaitGroup) { - toDelete, err := lc.registry.EvictLocalForDelete(sizeLimit) + toDelete, err := lc.registry.evictLocalForDelete(sizeLimit) if err != nil { logger.Fatal("error releasing old fractions:", zap.Error(err)) } + + fmt.Printf("len(toDelete): %v\n", len(toDelete)) + for _, f := range toDelete { + fmt.Printf("f.Info().Name(): %v\n", f.Info().Name()) + } + if len(toDelete) > 0 && !lc.flags.IsCapacityExceeded() { if err := lc.flags.setCapacityExceeded(true); err != nil { logger.Fatal("can't set capacity_exceeded flag", zap.Error(err)) @@ -217,14 +224,14 @@ func (lc *lifecycleManager) cleanLocal(sizeLimit uint64, wg *sync.WaitGroup) { // updateOldestMetric updates the prometheus metric with oldest fraction timestamp. func (lc *lifecycleManager) updateOldestMetric() { - oldestFracTime.WithLabelValues("remote").Set((time.Duration(lc.registry.OldestTotal()) * time.Millisecond).Seconds()) - oldestFracTime.WithLabelValues("local").Set((time.Duration(lc.registry.OldestLocal()) * time.Millisecond).Seconds()) + oldestFracTime.WithLabelValues("remote").Set((time.Duration(lc.registry.oldestTotal()) * time.Millisecond).Seconds()) + oldestFracTime.WithLabelValues("local").Set((time.Duration(lc.registry.oldestLocal()) * time.Millisecond).Seconds()) } // removeOverflowed removes fractions from offloading queue that exceed size limit // Stops ongoing offloading tasks and cleans up both local and remote resources. 
func (lc *lifecycleManager) removeOverflowed(sizeLimit uint64, wg *sync.WaitGroup) { - evicted := lc.registry.EvictOverflowed(sizeLimit) + evicted := lc.registry.evictOverflowed(sizeLimit) for _, sealed := range evicted { wg.Add(1) go func() { diff --git a/fracmanager/lifecycle_manager_test.go b/fracmanager/lifecycle_manager_test.go index cb9ab1e0..382a4ebf 100644 --- a/fracmanager/lifecycle_manager_test.go +++ b/fracmanager/lifecycle_manager_test.go @@ -38,7 +38,9 @@ func TestFracInfoCache(t *testing.T) { defer tearDown() fillRotateAndCheck := func(names map[string]struct{}) { - appender := lc.registry.Appender() + time.Sleep(time.Millisecond * 10) + + appender := lc.registry.appender() appendDocsToActive(t, appender.Active, 10+rand.Intn(10)) wg := sync.WaitGroup{} @@ -56,13 +58,13 @@ func TestFracInfoCache(t *testing.T) { for range 10 { fillRotateAndCheck(first) } - halfSize := lc.registry.Stats().TotalSizeOnDiskLocal() + halfSize := lc.registry.statistics().TotalSizeOnDiskLocal() second := map[string]struct{}{} for range 10 { fillRotateAndCheck(second) } - total := lc.registry.Stats().TotalSizeOnDiskLocal() + total := lc.registry.statistics().TotalSizeOnDiskLocal() wg := sync.WaitGroup{} lc.cleanLocal(total-halfSize, &wg) @@ -86,7 +88,7 @@ func TestCapacityExceeded(t *testing.T) { const fracsCount = 10 fillAndRotate := func() { - appender := lc.registry.Appender() + appender := lc.registry.appender() appendDocsToActive(t, appender.Active, 10+rand.Intn(10)) wg := sync.WaitGroup{} @@ -102,19 +104,19 @@ func TestCapacityExceeded(t *testing.T) { } assert.False(t, lc.flags.IsCapacityExceeded(), "there should be no deletions and the flag is false") - total := lc.registry.Stats().TotalSizeOnDiskLocal() + total := lc.registry.statistics().TotalSizeOnDiskLocal() wg := sync.WaitGroup{} lc.cleanLocal(total, &wg) wg.Wait() - assert.Equal(t, fracsCount, lc.registry.Stats().sealed.count, "as much as was added, so much should be") + assert.Equal(t, fracsCount, 
lc.registry.statistics().sealed.count, "as much as was added, so much should be") assert.False(t, lc.flags.IsCapacityExceeded(), "there should still be no deletions, and the flag is false") lc.cleanLocal(total-1, &wg) wg.Wait() - assert.Equal(t, fracsCount-1, lc.registry.Stats().sealed.count, "expect one less") + assert.Equal(t, fracsCount-1, lc.registry.statistics().sealed.count, "expect one less") assert.True(t, lc.flags.IsCapacityExceeded(), "the flag must be true now") } @@ -124,30 +126,30 @@ func TestOldestMetrics(t *testing.T) { const fracsCount = 10 fillAndRotate := func() { - appender := lc.registry.Appender() + appender := lc.registry.appender() appendDocsToActive(t, appender.Active, 10+rand.Intn(10)) wg := sync.WaitGroup{} lc.rotate(0, &wg) wg.Wait() } - firstFracTime := lc.registry.Appender().Info().CreationTime + firstFracTime := lc.registry.appender().Info().CreationTime for range fracsCount { fillAndRotate() } // Check state after initial rotations - assert.Equal(t, firstFracTime, lc.registry.OldestTotal(), "should point to the very first fraction when all data is local") - assert.Equal(t, firstFracTime, lc.registry.OldestLocal(), "should point to the first fraction when nothing is offloaded") + assert.Equal(t, firstFracTime, lc.registry.oldestTotal(), "should point to the very first fraction when all data is local") + assert.Equal(t, firstFracTime, lc.registry.oldestLocal(), "should point to the first fraction when nothing is offloaded") - halfSize := lc.registry.Stats().TotalSizeOnDiskLocal() + halfSize := lc.registry.statistics().TotalSizeOnDiskLocal() - halfwayFracTime := lc.registry.Appender().Info().CreationTime + halfwayFracTime := lc.registry.appender().Info().CreationTime for range fracsCount { fillAndRotate() } - total := lc.registry.Stats().TotalSizeOnDiskLocal() + total := lc.registry.statistics().TotalSizeOnDiskLocal() wg := sync.WaitGroup{} lc.offloadLocal(t.Context(), total-halfSize, 0, &wg) @@ -155,8 +157,8 @@ func TestOldestMetrics(t 
*testing.T) { // Check state after offloading assert.NotEqual(t, firstFracTime, halfwayFracTime, "expect different creation times") - assert.Equal(t, firstFracTime, lc.registry.OldestTotal(), "should still reference the first fraction after offload") - assert.Equal(t, halfwayFracTime, lc.registry.OldestLocal(), "should point to the oldest remaining local fraction after offload") + assert.Equal(t, firstFracTime, lc.registry.oldestTotal(), "should still reference the first fraction after offload") + assert.Equal(t, halfwayFracTime, lc.registry.oldestLocal(), "should point to the oldest remaining local fraction after offload") } func TestPendingDestroy(t *testing.T) { @@ -170,19 +172,19 @@ func TestPendingDestroy(t *testing.T) { // appending docs to `fracsCount` fractions where the last is active and the rest are sealed wg := sync.WaitGroup{} for range fracsCount - 1 { - appendDocsToActive(t, lc.registry.Appender().Active, docsPerFrac) + appendDocsToActive(t, lc.registry.appender().Active, docsPerFrac) lc.rotate(0, &wg) } - appendDocsToActive(t, lc.registry.Appender().Active, docsPerFrac) + appendDocsToActive(t, lc.registry.appender().Active, docsPerFrac) // wait sealing complete wg.Wait() // take all fracs to search - fractions1, release1 := lc.registry.AcquireAllFractions() + fractions1, release1 := lc.registry.acquireAllFractions() // delete all sealing fracs - lc.cleanLocal(lc.registry.Appender().Info().FullSize(), &wg) + lc.cleanLocal(lc.registry.appender().Info().FullSize(), &wg) var ( beforeRelease time.Time @@ -220,7 +222,7 @@ func TestPendingDestroy(t *testing.T) { cleanup.Wait() assert.Less(t, beforeRelease, afterCleanup, "we expect cleanup to happen after release") - fractions2, release2 := lc.registry.AcquireAllFractions() + fractions2, release2 := lc.registry.acquireAllFractions() assert.Len(t, fractions2, 1, "only one active fraction should remain") singleName := fractions2[0].Info().Name() diff --git a/fracmanager/sync_appender.go 
b/fracmanager/sync_appender.go index 76cf4ee0..1acb15a3 100644 --- a/fracmanager/sync_appender.go +++ b/fracmanager/sync_appender.go @@ -26,8 +26,8 @@ type syncAppender struct { suspended bool // Temporarily suspended for writes } -// Append adds documents to the active fraction -func (a *syncAppender) Append(docs, meta []byte) error { +// append adds documents to the active fraction +func (a *syncAppender) append(docs, meta []byte) error { a.mu.RLock() if a.finalized { a.mu.RUnlock() @@ -43,22 +43,22 @@ func (a *syncAppender) Append(docs, meta []byte) error { return a.refCountedActive.Append(docs, meta, &a.wg) } -func (a *syncAppender) Suspended() bool { +func (a *syncAppender) isSuspended() bool { a.mu.Lock() defer a.mu.Unlock() return a.suspended } -func (a *syncAppender) Suspend(value bool) { +func (a *syncAppender) suspend(value bool) { a.mu.Lock() a.suspended = value a.mu.Unlock() } -// WaitWriteIdle waits for all pending write operations to complete +// waitWriteIdle waits for all pending write operations to complete // Used before sealing to ensure data consistency. -func (a *syncAppender) WaitWriteIdle() { +func (a *syncAppender) waitWriteIdle() { start := time.Now() logger.Info("waiting fraction to stop write...", zap.String("name", a.BaseFileName)) a.wg.Wait() @@ -70,8 +70,8 @@ func (a *syncAppender) WaitWriteIdle() { ) } -// Finalize marks the fraction as read-only and prevents new writes from starting after finalize. -func (a *syncAppender) Finalize() error { +// finalize marks the fraction as read-only and prevents new writes from starting after finalize. 
+func (a *syncAppender) finalize() error { a.mu.Lock() if a.finalized { a.mu.Unlock() From dcdeec2b2c435128d0fda2d3dac8bf8a2e203432 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Fri, 22 May 2026 14:21:25 +0300 Subject: [PATCH 04/29] chore: remove garbage code --- fracmanager/lifecycle_manager.go | 6 ------ fracmanager/lifecycle_manager_test.go | 2 -- 2 files changed, 8 deletions(-) diff --git a/fracmanager/lifecycle_manager.go b/fracmanager/lifecycle_manager.go index 327d475f..e98c5871 100644 --- a/fracmanager/lifecycle_manager.go +++ b/fracmanager/lifecycle_manager.go @@ -2,7 +2,6 @@ package fracmanager import ( "context" - "fmt" "sync" "time" @@ -200,11 +199,6 @@ func (lc *lifecycleManager) cleanLocal(sizeLimit uint64, wg *sync.WaitGroup) { logger.Fatal("error releasing old fractions:", zap.Error(err)) } - fmt.Printf("len(toDelete): %v\n", len(toDelete)) - for _, f := range toDelete { - fmt.Printf("f.Info().Name(): %v\n", f.Info().Name()) - } - if len(toDelete) > 0 && !lc.flags.IsCapacityExceeded() { if err := lc.flags.setCapacityExceeded(true); err != nil { logger.Fatal("can't set capacity_exceeded flag", zap.Error(err)) diff --git a/fracmanager/lifecycle_manager_test.go b/fracmanager/lifecycle_manager_test.go index 382a4ebf..bebc2c1f 100644 --- a/fracmanager/lifecycle_manager_test.go +++ b/fracmanager/lifecycle_manager_test.go @@ -38,8 +38,6 @@ func TestFracInfoCache(t *testing.T) { defer tearDown() fillRotateAndCheck := func(names map[string]struct{}) { - time.Sleep(time.Millisecond * 10) - appender := lc.registry.appender() appendDocsToActive(t, appender.Active, 10+rand.Intn(10)) From 96ba81fd7d2a5a67981d648b430c0f76c67e01b8 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Fri, 22 May 2026 14:25:22 +0300 Subject: [PATCH 05/29] chore: remove evict local fix --- fracmanager/fraction_registry.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fracmanager/fraction_registry.go b/fracmanager/fraction_registry.go index 0c0872c6..c0112383 100644 --- 
a/fracmanager/fraction_registry.go +++ b/fracmanager/fraction_registry.go @@ -280,10 +280,6 @@ func (r *fractionRegistry) evictLocal(sizeLimit uint64) ([]*refCountedSealed, er var evicted []*refCountedSealed for r.sealed.Len() > 0 && totalUsedSize-releasingSize > sizeLimit { for _, s := range r.sealed.GetByPartition(r.sealed.MinPartition()) { - if totalUsedSize-releasingSize <= sizeLimit { - break - } - info := s.Info() releasingSize += info.FullSize() From 0978ee42be9bd6c9708833c7934238548e8fbdea Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Mon, 27 Apr 2026 10:25:09 +0300 Subject: [PATCH 06/29] refactor: introduce `blockbuilder` and `indexwriter` package --- blockbuilder/blocks_builder.go | 303 ++++++++++++++++++ .../blocks_builder_test.go | 74 ++--- frac/fraction_concurrency_test.go | 4 + frac/fraction_test.go | 4 + fracmanager/fraction_provider.go | 3 +- fracmanager/sealer_test.go | 2 +- {frac/sealed/sealing => indexwriter}/index.go | 117 ++++--- .../sealed/sealing => indexwriter}/writer.go | 2 +- {frac/sealed/sealing => sealing}/sealer.go | 137 ++++---- 9 files changed, 460 insertions(+), 186 deletions(-) create mode 100644 blockbuilder/blocks_builder.go rename {frac/sealed/sealing => blockbuilder}/blocks_builder_test.go (76%) rename {frac/sealed/sealing => indexwriter}/index.go (68%) rename {frac/sealed/sealing => indexwriter}/writer.go (99%) rename {frac/sealed/sealing => sealing}/sealer.go (58%) diff --git a/blockbuilder/blocks_builder.go b/blockbuilder/blocks_builder.go new file mode 100644 index 00000000..193b061e --- /dev/null +++ b/blockbuilder/blocks_builder.go @@ -0,0 +1,303 @@ +package blockbuilder + +import ( + "encoding/binary" + "iter" + "unsafe" + + "github.com/ozontech/seq-db/frac/sealed/lids" + "github.com/ozontech/seq-db/frac/sealed/seqids" + "github.com/ozontech/seq-db/frac/sealed/token" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/util" +) + +type ( + DocLocation = util.Pair[seq.ID, seq.DocPos] + TokenPosting = 
util.Pair[[]byte, []uint32] + TokenBlock = util.Pair[TokensSealBlock, []token.FieldTable] +) + +// TokensExt represents the token ID range contained in a block. +type TokensExt struct { + MinTID uint32 // First token ID in the block + MaxTID uint32 // Last token ID in the block +} + +// TokensSealBlock represents a sealed block containing token data with metadata. +type TokensSealBlock struct { + Ext TokensExt // Tokens block metadata for registry marking + Payload token.Block // Actual token data payload +} + +// LidsExt represents the range and continuation status of LID blocks. +type LidsExt struct { + MinTID uint32 // First token ID in the LID block + MaxTID uint32 // Last token ID in the LID block + IsContinued bool // Whether LID sequence continues in next block +} + +// LidsSealBlock represents a sealed block containing LID (Local ID) data. +type LidsSealBlock struct { + Ext LidsExt // LIDs block metadata for registry marking + Payload lids.Block // LID data payload +} + +// IdsSealBlock represents a sealed block containing various identifier types. +type IdsSealBlock struct { + MIDs seqids.BlockMIDs + RIDs seqids.BlockRIDs + Params seqids.BlockParams +} + +// BlocksBuilder constructs sealed blocks from various data sources. +type BlocksBuilder struct{} + +func (bb *BlocksBuilder) BuildTokenBlocks( + it iter.Seq2[string, iter.Seq2[TokenPosting, error]], + accumulate func([]uint32) error, blockCapacity int, +) iter.Seq2[TokenBlock, error] { + return func(yield func(TokenBlock, error) bool) { + var ( + block TokensSealBlock + blockIdx uint32 + blockSize int + ) + + var ( + currentTID uint32 + pendingTable []token.FieldTable + fieldName string + fieldEntryStartTID uint32 + ) + + emitFieldEntry := func() { + // Handle case when field does not have tokens. 
+ if fieldName == "" || fieldEntryStartTID > currentTID { + return + } + + entry := newTokenTableEntry(fieldEntryStartTID, currentTID, blockIdx, block) + pendingTable = append(pendingTable, token.FieldTable{ + Field: fieldName, + Entries: []*token.TableEntry{entry}, + }) + } + + flushBlock := func() bool { + emitFieldEntry() + block.Ext.MaxTID = currentTID + + pair := TokenBlock{First: block, Second: pendingTable} + if !yield(pair, nil) { + return false + } + + block.Payload.Payload = block.Payload.Payload[:0] + block.Payload.Offsets = block.Payload.Offsets[:0] + block.Ext.MinTID = currentTID + 1 + + blockIdx++ + blockSize = 0 + + pendingTable = pendingTable[:0] + fieldEntryStartTID = currentTID + 1 + + return true + } + + block.Ext.MinTID = 1 + for field, tokIt := range it { + emitFieldEntry() + + fieldName = field + fieldEntryStartTID = currentTID + 1 + + for pair, err := range tokIt { + if err != nil { + yield(TokenBlock{}, err) + return + } + + tok, tlids := pair.First, pair.Second + tokenSize := int(unsafe.Sizeof(uint32(0))) + len(tok) + + if blockSize > 0 && blockSize+tokenSize > blockCapacity { + if !flushBlock() { + return + } + } + + block.Payload.Offsets = append(block.Payload.Offsets, uint32(len(block.Payload.Payload))) + block.Payload.Payload = binary.LittleEndian.AppendUint32(block.Payload.Payload, uint32(len(tok))) + block.Payload.Payload = append(block.Payload.Payload, tok...) 
+ + if err := accumulate(tlids); err != nil { + yield(TokenBlock{}, err) + return + } + + currentTID++ + blockSize += tokenSize + } + } + + if blockSize > 0 { + flushBlock() + } + } +} + +func newTokenTableEntry( + entryStartTID, entryEndTID uint32, + blockIndex uint32, block TokensSealBlock, +) *token.TableEntry { + // Convert global TIDs to block-local indices + firstIndex := entryStartTID - block.Ext.MinTID + lastIndex := entryEndTID - block.Ext.MinTID + + // Extract min and max token values for the entry range + minVal := string(block.Payload.GetToken(int(firstIndex))) + maxVal := string(block.Payload.GetToken(int(lastIndex))) + + return &token.TableEntry{ + StartIndex: firstIndex, // Starting index within the block + StartTID: entryStartTID, // Starting token ID (global) + BlockIndex: blockIndex, // Reference to containing block + ValCount: lastIndex - firstIndex + 1, // Number of tokens in this entry + MinVal: minVal, // Smallest token value in range + MaxVal: maxVal, // Largest token value in range + } +} + +// SeqBlockID accumulates scalar (ID, position) pairs into sealed ID blocks. +// A new block is yielded every `blockCapacity` IDs. 
+func SeqBlockID(ids iter.Seq2[DocLocation, error], blockCapacity int) iter.Seq2[IdsSealBlock, error] { + return func(yield func(IdsSealBlock, error) bool) { + var block IdsSealBlock + + for pair, err := range ids { + if err != nil { + yield(IdsSealBlock{}, err) + return + } + + id, pos := pair.First, pair.Second + block.MIDs.Values = append(block.MIDs.Values, uint64(id.MID)) + block.RIDs.Values = append(block.RIDs.Values, uint64(id.RID)) + block.Params.Values = append(block.Params.Values, uint64(pos)) + + if len(block.MIDs.Values) == blockCapacity { + if !yield(block, nil) { + return + } + + block.MIDs.Values = block.MIDs.Values[:0] + block.RIDs.Values = block.RIDs.Values[:0] + block.Params.Values = block.Params.Values[:0] + } + } + + if len(block.MIDs.Values) > 0 { + yield(block, nil) + } + } +} + +// LidBlocksAcc accumulates LIDs into sealed LID blocks. +type LidBlocksAcc struct { + blockCapacity int + + currentTID uint32 + currentBlock LidsSealBlock + + isEndOfToken bool + isContinued bool +} + +func NewLIDBlocksAccumulator(blockCapacity int) *LidBlocksAcc { + a := &LidBlocksAcc{blockCapacity: blockCapacity} + + a.currentBlock.Ext.MinTID = 1 + a.currentBlock.Payload = lids.Block{ + LIDs: make([]uint32, 0, blockCapacity), + Offsets: []uint32{0}, + } + + return a +} + +// Add processes LIDs of one token (must be called in TID order). +// +// For each block that fills up, `onBlock` is called immediately +// before the backing arrays are reset, so `onBlock` may read the +// block data but must not retain references to it. 
+func (a *LidBlocksAcc) Add(lidsbuf []uint32, onBlock func(LidsSealBlock) error) error { + a.currentTID++ + + for _, lid := range lidsbuf { + if len(a.currentBlock.Payload.LIDs) == a.blockCapacity { + if err := onBlock(a.finalizeBlock()); err != nil { + return err + } + + a.currentBlock.Ext.MinTID = a.currentTID + a.currentBlock.Payload.LIDs = a.currentBlock.Payload.LIDs[:0] + a.currentBlock.Payload.Offsets = a.currentBlock.Payload.Offsets[:1] + } + + a.isEndOfToken = false + a.currentBlock.Ext.MaxTID = a.currentTID + a.currentBlock.Payload.LIDs = append(a.currentBlock.Payload.LIDs, lid) + } + + a.isEndOfToken = true + a.currentBlock.Payload.Offsets = append( + a.currentBlock.Payload.Offsets, + uint32(len(a.currentBlock.Payload.LIDs)), + ) + + return nil +} + +func (a *LidBlocksAcc) Flush() LidsSealBlock { + return a.finalizeBlock() +} + +func (a *LidBlocksAcc) finalizeBlock() LidsSealBlock { + if !a.isEndOfToken { + a.currentBlock.Payload.Offsets = append( + a.currentBlock.Payload.Offsets, + uint32(len(a.currentBlock.Payload.LIDs)), + ) + } + + result := a.currentBlock + result.Payload.IsLastLID = a.isEndOfToken + result.Ext.IsContinued = a.isContinued + + a.isContinued = !a.isEndOfToken + return result +} + +// CollapseOrderedFieldsTables merges FieldTables with the same field name. +// Assumes input is sorted by Field. +func CollapseOrderedFieldsTables(src []token.FieldTable) []token.FieldTable { + if len(src) == 0 { + return nil + } + + current := src[0] + var dst []token.FieldTable + for _, ft := range src[1:] { + if current.Field == ft.Field { + current.Entries = append(current.Entries, ft.Entries...) 
+ continue + } + + dst = append(dst, current) + current = ft + } + + return append(dst, current) +} diff --git a/frac/sealed/sealing/blocks_builder_test.go b/blockbuilder/blocks_builder_test.go similarity index 76% rename from frac/sealed/sealing/blocks_builder_test.go rename to blockbuilder/blocks_builder_test.go index d6bca144..34295a91 100644 --- a/frac/sealed/sealing/blocks_builder_test.go +++ b/blockbuilder/blocks_builder_test.go @@ -1,4 +1,4 @@ -package sealing +package blockbuilder import ( "iter" @@ -7,27 +7,20 @@ import ( "github.com/stretchr/testify/assert" - "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/token" "github.com/ozontech/seq-db/seq" ) -var _ Source = (*mockSource)(nil) - type mockSource struct { - info common.Info - tokens [][]byte - fields []string - fieldMaxTIDs []uint32 - ids []seq.ID - pos []seq.DocPos - tokenLIDs [][]uint32 - blocksOffsets []uint64 + tokens [][]byte + fields []string + fieldMaxTIDs []uint32 + ids []seq.ID + pos []seq.DocPos + tokenLIDs [][]uint32 } -func (m *mockSource) Info() *common.Info { return &m.info } - func (m *mockSource) TokenTriplet() iter.Seq2[string, iter.Seq2[TokenPosting, error]] { return func(yield func(string, iter.Seq2[TokenPosting, error]) bool) { start := 0 @@ -48,8 +41,7 @@ func (m *mockSource) tokensForField(start, end int) iter.Seq2[TokenPosting, erro if j < len(m.tokenLIDs) { lidsbuf = m.tokenLIDs[j] } - pair := TokenPosting{First: m.tokens[j], Second: lidsbuf} - if !yield(pair, nil) { + if !yield(TokenPosting{First: m.tokens[j], Second: lidsbuf}, nil) { return } } @@ -66,8 +58,6 @@ func (m *mockSource) ID() iter.Seq2[DocLocation, error] { } } -func (m *mockSource) BlockOffsets() []uint64 { return m.blocksOffsets } - func TestBlocksBuilder_BuildTokenBlocks(t *testing.T) { src := mockSource{ tokens: [][]byte{ @@ -145,16 +135,16 @@ func TestBlocksBuilder_BuildTokenBlocks(t *testing.T) { for pair, err := range tokenBlocks 
{ assert.NoError(t, err) block, fieldsTables := pair.First, pair.Second - assert.Equal(t, expectedSizes[blockIndex], block.payload.Len()) - for i := range block.payload.Len() { + assert.Equal(t, expectedSizes[blockIndex], block.Payload.Len()) + for i := range block.Payload.Len() { tid++ - assert.Equal(t, src.tokens[tid-1], block.payload.GetToken(i)) + assert.Equal(t, src.tokens[tid-1], block.Payload.GetToken(i)) } allFieldsTables = append(allFieldsTables, fieldsTables...) blockIndex++ } - actualTokenTable := token.TableBlock{FieldsTables: collapseOrderedFieldsTables(allFieldsTables)} + actualTokenTable := token.TableBlock{FieldsTables: CollapseOrderedFieldsTables(allFieldsTables)} assert.Equal(t, tid, len(src.tokens)) expectedTokenTable := token.TableBlock{ @@ -251,30 +241,30 @@ func TestBlocksBuilder_BuildTokenBlocks(t *testing.T) { assert.Equal(t, actualTokenTable.FieldsTables, expectedTokenTable.FieldsTables) assert.NoError(t, lidAccumulator.Finalize()) - expectedLIDBlocks := []lidsSealBlock{ + expectedLIDBlocks := []LidsSealBlock{ { - ext: lidsExt{minTID: 1, maxTID: 1, isContinued: false}, - payload: lids.Block{LIDs: []uint32{10, 20, 30}, Offsets: []uint32{0, 3}, IsLastLID: false}, + Ext: LidsExt{MinTID: 1, MaxTID: 1, IsContinued: false}, + Payload: lids.Block{LIDs: []uint32{10, 20, 30}, Offsets: []uint32{0, 3}, IsLastLID: false}, }, { - ext: lidsExt{minTID: 1, maxTID: 3, isContinued: true}, - payload: lids.Block{LIDs: []uint32{40, 2, 3}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, + Ext: LidsExt{MinTID: 1, MaxTID: 3, IsContinued: true}, + Payload: lids.Block{LIDs: []uint32{40, 2, 3}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, }, { - ext: lidsExt{minTID: 4, maxTID: 6, isContinued: false}, - payload: lids.Block{LIDs: []uint32{4, 5, 6}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, + Ext: LidsExt{MinTID: 4, MaxTID: 6, IsContinued: false}, + Payload: lids.Block{LIDs: []uint32{4, 5, 6}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, }, { - ext: 
lidsExt{minTID: 7, maxTID: 9, isContinued: false}, - payload: lids.Block{LIDs: []uint32{7, 8, 9}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, + Ext: LidsExt{MinTID: 7, MaxTID: 9, IsContinued: false}, + Payload: lids.Block{LIDs: []uint32{7, 8, 9}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, }, { - ext: lidsExt{minTID: 10, maxTID: 12, isContinued: false}, - payload: lids.Block{LIDs: []uint32{10, 11, 12}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, + Ext: LidsExt{MinTID: 10, MaxTID: 12, IsContinued: false}, + Payload: lids.Block{LIDs: []uint32{10, 11, 12}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, }, { - ext: lidsExt{minTID: 13, maxTID: 14, isContinued: false}, - payload: lids.Block{LIDs: []uint32{13, 14}, Offsets: []uint32{0, 1, 2}, IsLastLID: true}, + Ext: LidsExt{MinTID: 13, MaxTID: 14, IsContinued: false}, + Payload: lids.Block{LIDs: []uint32{13, 14}, Offsets: []uint32{0, 1, 2}, IsLastLID: true}, }, } assert.Equal(t, expectedLIDBlocks, lidBlocks) @@ -313,18 +303,18 @@ func TestBlocksBuilder_IDsBlocks(t *testing.T) { i := 0 ids := []seq.ID{} pos := []seq.DocPos{} - for block, err := range seqBlockID(src.ID(), 3) { + for block, err := range SeqBlockID(src.ID(), 3) { assert.NoError(t, err) - assert.Equal(t, expectedSizes[i], len(block.mids.Values)) - assert.Equal(t, expectedSizes[i], len(block.rids.Values)) - assert.Equal(t, expectedSizes[i], len(block.params.Values)) + assert.Equal(t, expectedSizes[i], len(block.MIDs.Values)) + assert.Equal(t, expectedSizes[i], len(block.RIDs.Values)) + assert.Equal(t, expectedSizes[i], len(block.Params.Values)) i++ j := 0 - for _, mid := range block.mids.Values { - ids = append(ids, seq.ID{MID: seq.MID(mid), RID: seq.RID(block.rids.Values[j])}) - pos = append(pos, seq.DocPos(block.params.Values[j])) + for _, mid := range block.MIDs.Values { + ids = append(ids, seq.ID{MID: seq.MID(mid), RID: seq.RID(block.RIDs.Values[j])}) + pos = append(pos, seq.DocPos(block.Params.Values[j])) j++ } } diff --git 
a/frac/fraction_concurrency_test.go b/frac/fraction_concurrency_test.go index 27f5d971..639a44c2 100644 --- a/frac/fraction_concurrency_test.go +++ b/frac/fraction_concurrency_test.go @@ -16,9 +16,13 @@ import ( "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/processor" + "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/sealing" + "github.com/ozontech/seq-db/frac/sealed/seqids" + "github.com/ozontech/seq-db/frac/sealed/token" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/parser" + "github.com/ozontech/seq-db/sealing" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" testcommon "github.com/ozontech/seq-db/tests/common" diff --git a/frac/fraction_test.go b/frac/fraction_test.go index 8757c0db..dcc534d9 100644 --- a/frac/fraction_test.go +++ b/frac/fraction_test.go @@ -22,10 +22,14 @@ import ( "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/processor" + "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/sealing" + "github.com/ozontech/seq-db/frac/sealed/seqids" + "github.com/ozontech/seq-db/frac/sealed/token" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/node" "github.com/ozontech/seq-db/parser" + "github.com/ozontech/seq-db/sealing" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" diff --git a/fracmanager/fraction_provider.go b/fracmanager/fraction_provider.go index e3a4d46b..d254a1ad 100644 --- a/fracmanager/fraction_provider.go +++ b/fracmanager/fraction_provider.go @@ -13,9 +13,8 @@ import ( "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed" - "github.com/ozontech/seq-db/frac/sealed/sealing" - "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/node" + 
"github.com/ozontech/seq-db/sealing" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" "github.com/ozontech/seq-db/util" diff --git a/fracmanager/sealer_test.go b/fracmanager/sealer_test.go index f85c3f8f..51c16b6b 100644 --- a/fracmanager/sealer_test.go +++ b/fracmanager/sealer_test.go @@ -19,8 +19,8 @@ import ( "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed" - "github.com/ozontech/seq-db/frac/sealed/sealing" "github.com/ozontech/seq-db/indexer" + "github.com/ozontech/seq-db/sealing" "github.com/ozontech/seq-db/seq" testscommon "github.com/ozontech/seq-db/tests/common" ) diff --git a/frac/sealed/sealing/index.go b/indexwriter/index.go similarity index 68% rename from frac/sealed/sealing/index.go rename to indexwriter/index.go index 5c23842a..1060f76d 100644 --- a/frac/sealed/sealing/index.go +++ b/indexwriter/index.go @@ -1,8 +1,10 @@ -package sealing +package indexwriter import ( "io" + "iter" + "github.com/ozontech/seq-db/blockbuilder" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed" @@ -14,6 +16,25 @@ import ( "github.com/ozontech/seq-db/zstd" ) +// Source defines the data required to write all index files for a fraction. +type Source interface { + // Info returns metadata describing this source. + Info() *common.Info + + // ID returns an iterator over stored document identifiers paired with + // their positions, in descending [seq.ID] order. + ID() iter.Seq2[blockbuilder.DocLocation, error] + + // BlockOffsets returns byte offsets to each document block + // within this source's `.docs` file. + BlockOffsets() []uint64 + + // TokenTriplet iterates over fields in lexicographic order. + // For each field, it yields tokens (lexicographically sorted) + // paired with the local document ID list for that token. 
+ TokenTriplet() iter.Seq2[string, iter.Seq2[blockbuilder.TokenPosting, error]] +} + // indexBlock is one compressed (or not) block with its registry metadata. type indexBlock struct { codec storage.Codec @@ -27,7 +48,7 @@ func (i indexBlock) Bin(pos int64) (storage.IndexBlockHeader, []byte) { return storage.NewIndexBlockHeader(pos, i.ext1, i.ext2, uint32(len(i.payload)), i.rawLen, i.codec), i.payload } -type IndexSealer struct { +type IndexWriter struct { params common.SealParams buf1 []byte @@ -38,28 +59,28 @@ type IndexSealer struct { tokenTable token.Table } -func NewIndexSealer(params common.SealParams) *IndexSealer { - return &IndexSealer{ +func New(params common.SealParams) *IndexWriter { + return &IndexWriter{ params: params, buf1: make([]byte, 0, consts.RegularBlockSize), buf2: make([]byte, 0, consts.RegularBlockSize), } } -func (s *IndexSealer) LIDsTable() lids.Table { +func (s *IndexWriter) LIDsTable() lids.Table { return s.lidsTable } -func (s *IndexSealer) TokenTable() token.Table { +func (s *IndexWriter) TokenTable() token.Table { return s.tokenTable } -func (s *IndexSealer) IDsTable() seqids.Table { +func (s *IndexWriter) IDsTable() seqids.Table { return s.idsTable } // WriteOffsetsFile writes the .offsets file containing a single BlockOffsets block. 
-func (s *IndexSealer) WriteOffsetsFile(ws io.WriteSeeker, src Source) error { +func (s *IndexWriter) WriteOffsetsFile(ws io.WriteSeeker, src Source) error { w, err := newWriter(ws) if err != nil { return err @@ -78,14 +99,14 @@ func (s *IndexSealer) WriteOffsetsFile(ws io.WriteSeeker, src Source) error { return w.finalize() } -func (s *IndexSealer) WriteIDFile(ws io.WriteSeeker, src Source) error { +func (s *IndexWriter) WriteIDFile(ws io.WriteSeeker, src Source) error { w, err := newWriter(ws) if err != nil { return err } defer w.release() - for block, err := range seqBlockID(src.ID(), consts.IDsPerBlock) { + for block, err := range blockbuilder.SeqBlockID(src.ID(), consts.IDsPerBlock) { if err != nil { return err } @@ -106,7 +127,7 @@ func (s *IndexSealer) WriteIDFile(ws io.WriteSeeker, src Source) error { return w.finalize() } -func (s *IndexSealer) WriteTokenTriplet(tws, lws io.WriteSeeker, src Source) error { +func (s *IndexWriter) WriteTokenTriplet(tws, lws io.WriteSeeker, src Source) error { tw, err := newWriter(tws) if err != nil { return err @@ -120,7 +141,7 @@ func (s *IndexSealer) WriteTokenTriplet(tws, lws io.WriteSeeker, src Source) err defer lw.release() var ( - bb blocksBuilder + bb blockbuilder.BlocksBuilder allFieldsTables []token.FieldTable ) @@ -158,7 +179,7 @@ func (s *IndexSealer) finalizeLIDFile(w *writer, lidAccumulator *lidAccumulator) return w.finalize() } -func (s *IndexSealer) finalizeTokenFile(w *writer, allFieldsTables []token.FieldTable) error { +func (s *IndexWriter) finalizeTokenFile(w *writer, allFieldsTables []token.FieldTable) error { // Emit section separator. if err := w.writeEmptyBlock(); err != nil { return err @@ -178,33 +199,11 @@ func (s *IndexSealer) WriteInfoFile(ws io.Writer, src Source) error { return err } -// collapseOrderedFieldsTables merges FieldTables with the same field name. -// Assumes input is sorted by Field. 
-func collapseOrderedFieldsTables(src []token.FieldTable) []token.FieldTable { - if len(src) == 0 { - return nil - } - - current := src[0] - var dst []token.FieldTable - for _, ft := range src[1:] { - if current.Field == ft.Field { - current.Entries = append(current.Entries, ft.Entries...) - continue - } - - dst = append(dst, current) - current = ft - } - - return append(dst, current) -} - func newIndexBlock(raw []byte) indexBlock { return indexBlock{codec: storage.CodecNo, rawLen: uint32(len(raw)), payload: raw} } -func (s *IndexSealer) newIndexBlockZSTD(raw []byte, level int) indexBlock { +func (s *IndexWriter) newIndexBlockZSTD(raw []byte, level int) indexBlock { s.buf2 = zstd.CompressLevel(raw, s.buf2[:0], level) if len(s.buf2) < len(raw) { return indexBlock{codec: storage.CodecZSTD, rawLen: uint32(len(raw)), payload: s.buf2} @@ -213,22 +212,22 @@ func (s *IndexSealer) newIndexBlockZSTD(raw []byte, level int) indexBlock { } // packInfoBlock packs fraction information into an index block. -func (s *IndexSealer) packInfoBlock(block sealed.BlockInfo) indexBlock { +func (s *IndexWriter) packInfoBlock(block sealed.BlockInfo) indexBlock { s.buf1 = block.Pack(s.buf1[:0]) return newIndexBlock(s.buf1) // Info block is typically small, no compression } // packTokenBlock packs token data into a compressed index block. -func (s *IndexSealer) packTokenBlock(block tokensSealBlock) indexBlock { - s.buf1 = block.payload.Pack(s.buf1[:0]) // Pack token data +func (s *IndexWriter) packTokenBlock(block blockbuilder.TokensSealBlock) indexBlock { + s.buf1 = block.Payload.Pack(s.buf1[:0]) // Pack token data b := s.newIndexBlockZSTD(s.buf1, s.params.TokenListZstdLevel) // Store TID range in extended metadata - b.ext1 = uint64(block.ext.maxTID)<<32 | uint64(block.ext.minTID) + b.ext1 = uint64(block.Ext.MaxTID)<<32 | uint64(block.Ext.MinTID) return b } // packTokenTableBlock packs the token table into a compressed index block. 
-func (s *IndexSealer) packTokenTableBlock(tokenTableBlock token.TableBlock) indexBlock { +func (s *IndexWriter) packTokenTableBlock(tokenTableBlock token.TableBlock) indexBlock { s.tokenTable = token.TableFromBlocks([]token.TableBlock{tokenTableBlock}) // Store for PreloadedData // Packing block @@ -237,7 +236,7 @@ func (s *IndexSealer) packTokenTableBlock(tokenTableBlock token.TableBlock) inde } // packBlocksOffsetsBlock packs document block offsets into a compressed index block. -func (s *IndexSealer) packBlocksOffsetsBlock(block sealed.BlockOffsets) indexBlock { +func (s *IndexWriter) packBlocksOffsetsBlock(block sealed.BlockOffsets) indexBlock { // Update IDs table for PreloadedData s.idsTable.IDsTotal = block.IDsTotal // Total number of IDs s.idsTable.IDBlocksTotal = uint32(len(block.Offsets)) // Number of ID blocks @@ -249,19 +248,19 @@ func (s *IndexSealer) packBlocksOffsetsBlock(block sealed.BlockOffsets) indexBlo } // packMIDsBlock packs MIDs into a compressed index block. -func (s *IndexSealer) packMIDsBlock(block idsSealBlock) indexBlock { +func (s *IndexWriter) packMIDsBlock(block blockbuilder.IdsSealBlock) indexBlock { // Get the last ID in the block (smallest due to descending order) - last := len(block.mids.Values) - 1 + last := len(block.MIDs.Values) - 1 minID := seq.ID{ - MID: seq.MID(block.mids.Values[last]), - RID: seq.RID(block.rids.Values[last]), + MID: seq.MID(block.MIDs.Values[last]), + RID: seq.RID(block.RIDs.Values[last]), } s.idsTable.MinBlockIDs = append(s.idsTable.MinBlockIDs, minID) // Store for PreloadedData // Packing block - s.buf1 = block.mids.Pack(s.buf1[:0]) + s.buf1 = block.MIDs.Pack(s.buf1[:0]) b := s.newIndexBlockZSTD(s.buf1, s.params.IDsZstdLevel) // Store min MID and RID in extended metadata @@ -272,38 +271,38 @@ func (s *IndexSealer) packMIDsBlock(block idsSealBlock) indexBlock { } // packRIDsBlock packs RIDs into a compressed index block. 
-func (s *IndexSealer) packRIDsBlock(block idsSealBlock) indexBlock { - s.buf1 = block.rids.Pack(s.buf1[:0]) +func (s *IndexWriter) packRIDsBlock(block blockbuilder.IdsSealBlock) indexBlock { + s.buf1 = block.RIDs.Pack(s.buf1[:0]) b := s.newIndexBlockZSTD(s.buf1, s.params.IDsZstdLevel) return b } // packPosBlock packs document positions into a compressed index block. -func (s *IndexSealer) packPosBlock(block idsSealBlock) indexBlock { - s.buf1 = block.params.Pack(s.buf1[:0]) +func (s *IndexWriter) packPosBlock(block blockbuilder.IdsSealBlock) indexBlock { + s.buf1 = block.Params.Pack(s.buf1[:0]) b := s.newIndexBlockZSTD(s.buf1, s.params.IDsZstdLevel) return b } // packLIDsBlock packs Local IDs (LIDs) into a compressed index block. // Also updates LIDs table for preloaded data access. -func (s *IndexSealer) packLIDsBlock(block lidsSealBlock) indexBlock { +func (s *IndexWriter) packLIDsBlock(block blockbuilder.LidsSealBlock) indexBlock { var ext1 uint64 - if block.ext.isContinued { // todo: Legacy continuation flag + if block.Ext.IsContinued { // todo: Legacy continuation flag ext1 = 1 - block.ext.minTID++ // Adjust for legacy format + block.Ext.MinTID++ // Adjust for legacy format } // Update LIDs table for PreloadedData - s.lidsTable.MinTIDs = append(s.lidsTable.MinTIDs, block.ext.minTID) - s.lidsTable.MaxTIDs = append(s.lidsTable.MaxTIDs, block.ext.maxTID) - s.lidsTable.IsContinued = append(s.lidsTable.IsContinued, block.ext.isContinued) + s.lidsTable.MinTIDs = append(s.lidsTable.MinTIDs, block.Ext.MinTID) + s.lidsTable.MaxTIDs = append(s.lidsTable.MaxTIDs, block.Ext.MaxTID) + s.lidsTable.IsContinued = append(s.lidsTable.IsContinued, block.Ext.IsContinued) // Packing block - s.buf1 = block.payload.Pack(s.buf1[:0]) + s.buf1 = block.Payload.Pack(s.buf1[:0]) b := s.newIndexBlockZSTD(s.buf1, s.params.LIDsZstdLevel) b.ext1 = ext1 // Legacy continuation flag - b.ext2 = uint64(block.ext.maxTID)<<32 | uint64(block.ext.minTID) // TID range + b.ext2 = 
uint64(block.Ext.MaxTID)<<32 | uint64(block.Ext.MinTID) // TID range return b } diff --git a/frac/sealed/sealing/writer.go b/indexwriter/writer.go similarity index 99% rename from frac/sealed/sealing/writer.go rename to indexwriter/writer.go index 1a147e4e..1fb9909d 100644 --- a/frac/sealed/sealing/writer.go +++ b/indexwriter/writer.go @@ -1,4 +1,4 @@ -package sealing +package indexwriter import ( "bytes" diff --git a/frac/sealed/sealing/sealer.go b/sealing/sealer.go similarity index 58% rename from frac/sealed/sealing/sealer.go rename to sealing/sealer.go index 57863d82..d3af4baf 100644 --- a/frac/sealed/sealing/sealer.go +++ b/sealing/sealer.go @@ -2,40 +2,67 @@ package sealing import ( "errors" - "iter" "os" "path/filepath" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed" - "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/indexwriter" "github.com/ozontech/seq-db/util" ) -type ( - DocLocation = util.Pair[seq.ID, seq.DocPos] - TokenPosting = util.Pair[[]byte, []uint32] -) +// Source defines the contract for data sources that can be sealed. +// Provides access to all necessary data components for index creation. +type Source = indexwriter.Source + +func syncAndClose(f *os.File) error { + if err := f.Sync(); err != nil { + f.Close() + return err + } + return f.Close() +} + +func createAndWrite(tmpPath, finalPath string, write func(*os.File) error) error { + f, err := os.Create(tmpPath) + if err != nil { + return err + } + + if err := errors.Join(write(f), syncAndClose(f)); err != nil { + return err + } + + return os.Rename(tmpPath, finalPath) +} -// Source interface defines the contract for data sources that can be sealed. -// Provides access to all necessary data components for index creation -type Source interface { - // Info returns metadata describing this source. 
- Info() *common.Info +func createAndWriteBoth( + tmpPath1, finalPath1, + tmpPath2, finalPath2 string, + write func(*os.File, *os.File) error, +) error { + f1, err := os.Create(tmpPath1) + if err != nil { + return err + } + + f2, err := os.Create(tmpPath2) + if err != nil { + f1.Close() + return err + } - // ID returns an iterator over stored document identifiers paired with - // their positions, in descending [seq.ID] order. - ID() iter.Seq2[DocLocation, error] + writeErr := write(f1, f2) + if err := errors.Join(writeErr, syncAndClose(f1), syncAndClose(f2)); err != nil { + return err + } - // BlockOffsets returns byte offsets to each document block - // within this source's `.docs` file. - BlockOffsets() []uint64 + if err := os.Rename(tmpPath1, finalPath1); err != nil { + return err + } - // TokenTriplet iterates over fields in lexicographic order. - // For each field, it yields tokens (lexicographically sorted) - // paired with the local document ID list for that token. - TokenTriplet() iter.Seq2[string, iter.Seq2[TokenPosting, error]] + return os.Rename(tmpPath2, finalPath2) } // Seal writes five index files (.info, .token, .offsets, .id, .lid) for the fraction @@ -47,12 +74,12 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { return nil, errors.New("sealing of an empty active fraction is not supported") } - sealer := NewIndexSealer(params) + writer := indexwriter.New(params) if err := createAndWrite( info.Path+consts.OffsetsTmpFileSuffix, info.Path+consts.OffsetsFileSuffix, - func(f *os.File) error { return sealer.WriteOffsetsFile(f, src) }, + func(f *os.File) error { return writer.WriteOffsetsFile(f, src) }, ); err != nil { return nil, err } @@ -60,7 +87,7 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { if err := createAndWrite( info.Path+consts.IDTmpFileSuffix, info.Path+consts.IDFileSuffix, - func(f *os.File) error { return sealer.WriteIDFile(f, src) }, + func(f *os.File) error { return 
writer.WriteIDFile(f, src) }, ); err != nil { return nil, err } @@ -68,7 +95,7 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { if err := createAndWriteBoth( info.Path+consts.TokenTmpFileSuffix, info.Path+consts.TokenFileSuffix, info.Path+consts.LIDTmpFileSuffix, info.Path+consts.LIDFileSuffix, - func(tokenF, lidF *os.File) error { return sealer.WriteTokenTriplet(tokenF, lidF, src) }, + func(tokenF, lidF *os.File) error { return writer.WriteTokenTriplet(tokenF, lidF, src) }, ); err != nil { return nil, err } @@ -76,7 +103,7 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { if err := createAndWrite( info.Path+consts.InfoTmpFileSuffix, info.Path+consts.InfoFileSuffix, - func(f *os.File) error { return sealer.WriteInfoFile(f, src) }, + func(f *os.File) error { return writer.WriteInfoFile(f, src) }, ); err != nil { return nil, err } @@ -100,13 +127,13 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { } info.IndexOnDisk = totalSize - lidsTable := sealer.LIDsTable() + lidsTable := writer.LIDsTable() preloaded := &sealed.PreloadedData{ Info: info, - TokenTable: sealer.TokenTable(), + TokenTable: writer.TokenTable(), BlocksData: sealed.BlocksData{ - IDsTable: sealer.IDsTable(), + IDsTable: writer.IDsTable(), LIDsTable: &lidsTable, BlocksOffsets: src.BlockOffsets(), }, @@ -114,55 +141,3 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { return preloaded, nil } - -func syncAndClose(f *os.File) error { - if err := f.Sync(); err != nil { - f.Close() - return err - } - return f.Close() -} - -func createAndWrite( - tmp, final string, - write func(*os.File) error, -) error { - f, err := os.Create(tmp) - if err != nil { - return err - } - - if err := errors.Join(write(f), syncAndClose(f)); err != nil { - return err - } - - return os.Rename(tmp, final) -} - -func createAndWriteBoth( - tmpa, finala, - tmpb, finalb string, - write func(*os.File, 
*os.File) error, -) error { - a, err := os.Create(tmpa) - if err != nil { - return err - } - - b, err := os.Create(tmpb) - if err != nil { - a.Close() - return err - } - - writeErr := write(a, b) - if err := errors.Join(writeErr, syncAndClose(a), syncAndClose(b)); err != nil { - return err - } - - if err := os.Rename(tmpa, finala); err != nil { - return err - } - - return os.Rename(tmpb, finalb) -} From 3abf863fdb404e0be950cc97b706625f7f075d10 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Mon, 27 Apr 2026 10:26:45 +0300 Subject: [PATCH 07/29] refactor: filename similar to package name --- blockbuilder/{blocks_builder.go => block_builder.go} | 0 blockbuilder/{blocks_builder_test.go => block_builder_test.go} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename blockbuilder/{blocks_builder.go => block_builder.go} (100%) rename blockbuilder/{blocks_builder_test.go => block_builder_test.go} (100%) diff --git a/blockbuilder/blocks_builder.go b/blockbuilder/block_builder.go similarity index 100% rename from blockbuilder/blocks_builder.go rename to blockbuilder/block_builder.go diff --git a/blockbuilder/blocks_builder_test.go b/blockbuilder/block_builder_test.go similarity index 100% rename from blockbuilder/blocks_builder_test.go rename to blockbuilder/block_builder_test.go From 6bf88e3b3a13a882df5ab590f05a0abbafaf67b6 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Mon, 27 Apr 2026 10:30:58 +0300 Subject: [PATCH 08/29] refactor: remove `BlockBuilder` type --- blockbuilder/block_builder.go | 5 +---- indexwriter/index.go | 3 +-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/blockbuilder/block_builder.go b/blockbuilder/block_builder.go index 193b061e..262a2c77 100644 --- a/blockbuilder/block_builder.go +++ b/blockbuilder/block_builder.go @@ -50,10 +50,7 @@ type IdsSealBlock struct { Params seqids.BlockParams } -// BlocksBuilder constructs sealed blocks from various data sources. 
-type BlocksBuilder struct{} - -func (bb *BlocksBuilder) BuildTokenBlocks( +func BuildTokenBlocks( it iter.Seq2[string, iter.Seq2[TokenPosting, error]], accumulate func([]uint32) error, blockCapacity int, ) iter.Seq2[TokenBlock, error] { diff --git a/indexwriter/index.go b/indexwriter/index.go index 1060f76d..286b620c 100644 --- a/indexwriter/index.go +++ b/indexwriter/index.go @@ -141,7 +141,6 @@ func (s *IndexWriter) WriteTokenTriplet(tws, lws io.WriteSeeker, src Source) err defer lw.release() var ( - bb blockbuilder.BlocksBuilder allFieldsTables []token.FieldTable ) @@ -152,7 +151,7 @@ func (s *IndexWriter) WriteTokenTriplet(tws, lws io.WriteSeeker, src Source) err }, ) - for pair, err := range bb.BuildTokenBlocks(src.TokenTriplet(), lidAccumulator.Add, consts.RegularBlockSize) { + for pair, err := range blockbuilder.BuildTokenBlocks(src.TokenTriplet(), accumulate, consts.RegularBlockSize) { if err != nil { return err } From b9f77b85aeb937ea62222f7bc4f894339936b24f Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Mon, 27 Apr 2026 10:33:20 +0300 Subject: [PATCH 09/29] refactor: move unexported functions --- sealing/sealer.go | 98 +++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/sealing/sealer.go b/sealing/sealer.go index d3af4baf..edd263fe 100644 --- a/sealing/sealer.go +++ b/sealing/sealer.go @@ -16,55 +16,6 @@ import ( // Provides access to all necessary data components for index creation. 
type Source = indexwriter.Source -func syncAndClose(f *os.File) error { - if err := f.Sync(); err != nil { - f.Close() - return err - } - return f.Close() -} - -func createAndWrite(tmpPath, finalPath string, write func(*os.File) error) error { - f, err := os.Create(tmpPath) - if err != nil { - return err - } - - if err := errors.Join(write(f), syncAndClose(f)); err != nil { - return err - } - - return os.Rename(tmpPath, finalPath) -} - -func createAndWriteBoth( - tmpPath1, finalPath1, - tmpPath2, finalPath2 string, - write func(*os.File, *os.File) error, -) error { - f1, err := os.Create(tmpPath1) - if err != nil { - return err - } - - f2, err := os.Create(tmpPath2) - if err != nil { - f1.Close() - return err - } - - writeErr := write(f1, f2) - if err := errors.Join(writeErr, syncAndClose(f1), syncAndClose(f2)); err != nil { - return err - } - - if err := os.Rename(tmpPath1, finalPath1); err != nil { - return err - } - - return os.Rename(tmpPath2, finalPath2) -} - // Seal writes five index files (.info, .token, .offsets, .id, .lid) for the fraction // and returns PreloadedData for fast initialization of the sealed fraction. 
func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { @@ -141,3 +92,52 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { return preloaded, nil } + +func syncAndClose(f *os.File) error { + if err := f.Sync(); err != nil { + f.Close() + return err + } + return f.Close() +} + +func createAndWrite(tmp, final string, write func(*os.File) error) error { + f, err := os.Create(tmp) + if err != nil { + return err + } + + if err := errors.Join(write(f), syncAndClose(f)); err != nil { + return err + } + + return os.Rename(tmp, final) +} + +func createAndWriteBoth( + atmp, afinal, + btmp, bfinal string, + write func(*os.File, *os.File) error, +) error { + a, err := os.Create(atmp) + if err != nil { + return err + } + + b, err := os.Create(btmp) + if err != nil { + a.Close() + return err + } + + writeErr := write(a, b) + if err := errors.Join(writeErr, syncAndClose(a), syncAndClose(b)); err != nil { + return err + } + + if err := os.Rename(atmp, afinal); err != nil { + return err + } + + return os.Rename(btmp, bfinal) +} From bb339f67c9043414cde906820fe19d3de64369a3 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Thu, 7 May 2026 11:06:53 +0300 Subject: [PATCH 10/29] chore: fix rebase conflicts --- blockbuilder/block_builder.go | 82 +------- blockbuilder/block_builder_test.go | 19 +- blockbuilder/lid_accumulator.go | 85 ++++++++ frac/fraction_concurrency_test.go | 4 - frac/fraction_test.go | 4 - frac/sealed/sealing/blocks_builder.go | 285 -------------------------- indexwriter/index.go | 29 +-- sealing/sealer.go | 17 +- 8 files changed, 120 insertions(+), 405 deletions(-) create mode 100644 blockbuilder/lid_accumulator.go delete mode 100644 frac/sealed/sealing/blocks_builder.go diff --git a/blockbuilder/block_builder.go b/blockbuilder/block_builder.go index 262a2c77..8103dedf 100644 --- a/blockbuilder/block_builder.go +++ b/blockbuilder/block_builder.go @@ -50,7 +50,7 @@ type IdsSealBlock struct { Params 
seqids.BlockParams } -func BuildTokenBlocks( +func TokenBlocks( it iter.Seq2[string, iter.Seq2[TokenPosting, error]], accumulate func([]uint32) error, blockCapacity int, ) iter.Seq2[TokenBlock, error] { @@ -167,9 +167,9 @@ func newTokenTableEntry( } } -// SeqBlockID accumulates scalar (ID, position) pairs into sealed ID blocks. +// IDBlock accumulates scalar (ID, position) pairs into sealed ID blocks. // A new block is yielded every `blockCapacity` IDs. -func SeqBlockID(ids iter.Seq2[DocLocation, error], blockCapacity int) iter.Seq2[IdsSealBlock, error] { +func IDBlock(ids iter.Seq2[DocLocation, error], blockCapacity int) iter.Seq2[IdsSealBlock, error] { return func(yield func(IdsSealBlock, error) bool) { var block IdsSealBlock @@ -201,82 +201,6 @@ func SeqBlockID(ids iter.Seq2[DocLocation, error], blockCapacity int) iter.Seq2[ } } -// LidBlocksAcc accumulates LIDs into sealed LID blocks. -type LidBlocksAcc struct { - blockCapacity int - - currentTID uint32 - currentBlock LidsSealBlock - - isEndOfToken bool - isContinued bool -} - -func NewLIDBlocksAccumulator(blockCapacity int) *LidBlocksAcc { - a := &LidBlocksAcc{blockCapacity: blockCapacity} - - a.currentBlock.Ext.MinTID = 1 - a.currentBlock.Payload = lids.Block{ - LIDs: make([]uint32, 0, blockCapacity), - Offsets: []uint32{0}, - } - - return a -} - -// Add processes LIDs of one token (must be called in TID order). -// -// For each block that fills up, `onBlock` is called immediately -// before the backing arrays are reset, so `onBlock` may read the -// block data but must not retain references to it. 
-func (a *LidBlocksAcc) Add(lidsbuf []uint32, onBlock func(LidsSealBlock) error) error { - a.currentTID++ - - for _, lid := range lidsbuf { - if len(a.currentBlock.Payload.LIDs) == a.blockCapacity { - if err := onBlock(a.finalizeBlock()); err != nil { - return err - } - - a.currentBlock.Ext.MinTID = a.currentTID - a.currentBlock.Payload.LIDs = a.currentBlock.Payload.LIDs[:0] - a.currentBlock.Payload.Offsets = a.currentBlock.Payload.Offsets[:1] - } - - a.isEndOfToken = false - a.currentBlock.Ext.MaxTID = a.currentTID - a.currentBlock.Payload.LIDs = append(a.currentBlock.Payload.LIDs, lid) - } - - a.isEndOfToken = true - a.currentBlock.Payload.Offsets = append( - a.currentBlock.Payload.Offsets, - uint32(len(a.currentBlock.Payload.LIDs)), - ) - - return nil -} - -func (a *LidBlocksAcc) Flush() LidsSealBlock { - return a.finalizeBlock() -} - -func (a *LidBlocksAcc) finalizeBlock() LidsSealBlock { - if !a.isEndOfToken { - a.currentBlock.Payload.Offsets = append( - a.currentBlock.Payload.Offsets, - uint32(len(a.currentBlock.Payload.LIDs)), - ) - } - - result := a.currentBlock - result.Payload.IsLastLID = a.isEndOfToken - result.Ext.IsContinued = a.isContinued - - a.isContinued = !a.isEndOfToken - return result -} - // CollapseOrderedFieldsTables merges FieldTables with the same field name. // Assumes input is sorted by Field. 
func CollapseOrderedFieldsTables(src []token.FieldTable) []token.FieldTable { diff --git a/blockbuilder/block_builder_test.go b/blockbuilder/block_builder_test.go index 34295a91..95fe7698 100644 --- a/blockbuilder/block_builder_test.go +++ b/blockbuilder/block_builder_test.go @@ -104,23 +104,20 @@ func TestBlocksBuilder_BuildTokenBlocks(t *testing.T) { const blockSize = 24 const lidBlockCap = 3 - var lidBlocks []lidsSealBlock - lidAccumulator := newLIDAccumulator( + var lidBlocks []LidsSealBlock + lidAccumulator := NewLIDAccumulator( lidBlockCap, - func(block lidsSealBlock) error { - block.payload.LIDs = slices.Clone(block.payload.LIDs) - block.payload.Offsets = slices.Clone(block.payload.Offsets) + func(block LidsSealBlock) error { + block.Payload.LIDs = slices.Clone(block.Payload.LIDs) + block.Payload.Offsets = slices.Clone(block.Payload.Offsets) lidBlocks = append(lidBlocks, block) return nil }, ) - var bb blocksBuilder - tokenBlocks := bb.BuildTokenBlocks( + tokenBlocks := TokenBlocks( src.TokenTriplet(), - func(lids []uint32) error { - return lidAccumulator.Add(lids) - }, + lidAccumulator.Add, blockSize, ) @@ -303,7 +300,7 @@ func TestBlocksBuilder_IDsBlocks(t *testing.T) { i := 0 ids := []seq.ID{} pos := []seq.DocPos{} - for block, err := range SeqBlockID(src.ID(), 3) { + for block, err := range IDBlock(src.ID(), 3) { assert.NoError(t, err) assert.Equal(t, expectedSizes[i], len(block.MIDs.Values)) diff --git a/blockbuilder/lid_accumulator.go b/blockbuilder/lid_accumulator.go new file mode 100644 index 00000000..ef81a970 --- /dev/null +++ b/blockbuilder/lid_accumulator.go @@ -0,0 +1,85 @@ +package blockbuilder + +import "github.com/ozontech/seq-db/frac/sealed/lids" + +type LIDAccumulator struct { + blockCapacity int + onBlock func(LidsSealBlock) error + + currentTID uint32 + currentBlock LidsSealBlock + + isEndOfToken bool + isContinued bool +} + +func NewLIDAccumulator( + blockCapacity int, + onBlock func(LidsSealBlock) error, +) *LIDAccumulator { + a := 
&LIDAccumulator{ + blockCapacity: blockCapacity, + onBlock: onBlock, + } + + a.currentBlock.Ext.MinTID = 1 + a.currentBlock.Payload = lids.Block{ + LIDs: make([]uint32, 0, blockCapacity), + Offsets: []uint32{0}, + } + + return a +} + +// Add processes LIDs of one token (must be called in TID order). +// +// For each block that fills up, `onBlock` is called immediately +// before the backing arrays are reset, so `onBlock` may read the +// block data but must not retain references to it. +func (a *LIDAccumulator) Add(lidsbuf []uint32) error { + a.currentTID++ + + for _, lid := range lidsbuf { + if len(a.currentBlock.Payload.LIDs) == a.blockCapacity { + if err := a.onBlock(a.finalizeBlock()); err != nil { + return err + } + + a.currentBlock.Ext.MinTID = a.currentTID + a.currentBlock.Payload.LIDs = a.currentBlock.Payload.LIDs[:0] + a.currentBlock.Payload.Offsets = a.currentBlock.Payload.Offsets[:1] + } + + a.isEndOfToken = false + a.currentBlock.Ext.MaxTID = a.currentTID + a.currentBlock.Payload.LIDs = append(a.currentBlock.Payload.LIDs, lid) + } + + a.isEndOfToken = true + a.currentBlock.Payload.Offsets = append( + a.currentBlock.Payload.Offsets, + uint32(len(a.currentBlock.Payload.LIDs)), + ) + + return nil +} + +func (a *LIDAccumulator) Finalize() error { + return a.onBlock(a.finalizeBlock()) +} + +func (a *LIDAccumulator) finalizeBlock() LidsSealBlock { + if !a.isEndOfToken { + a.currentBlock.Payload.Offsets = append( + a.currentBlock.Payload.Offsets, + uint32(len(a.currentBlock.Payload.LIDs)), + ) + } + + result := a.currentBlock + result.Payload.IsLastLID = a.isEndOfToken + result.Ext.IsContinued = a.isContinued + + a.isContinued = !a.isEndOfToken + return result +} diff --git a/frac/fraction_concurrency_test.go b/frac/fraction_concurrency_test.go index 639a44c2..b37cf2a3 100644 --- a/frac/fraction_concurrency_test.go +++ b/frac/fraction_concurrency_test.go @@ -16,10 +16,6 @@ import ( "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/frac/common" 
"github.com/ozontech/seq-db/frac/processor" - "github.com/ozontech/seq-db/frac/sealed/lids" - "github.com/ozontech/seq-db/frac/sealed/sealing" - "github.com/ozontech/seq-db/frac/sealed/seqids" - "github.com/ozontech/seq-db/frac/sealed/token" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/parser" "github.com/ozontech/seq-db/sealing" diff --git a/frac/fraction_test.go b/frac/fraction_test.go index dcc534d9..0fee4795 100644 --- a/frac/fraction_test.go +++ b/frac/fraction_test.go @@ -22,10 +22,6 @@ import ( "github.com/ozontech/seq-db/cache" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/processor" - "github.com/ozontech/seq-db/frac/sealed/lids" - "github.com/ozontech/seq-db/frac/sealed/sealing" - "github.com/ozontech/seq-db/frac/sealed/seqids" - "github.com/ozontech/seq-db/frac/sealed/token" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/node" "github.com/ozontech/seq-db/parser" diff --git a/frac/sealed/sealing/blocks_builder.go b/frac/sealed/sealing/blocks_builder.go deleted file mode 100644 index fc069cbf..00000000 --- a/frac/sealed/sealing/blocks_builder.go +++ /dev/null @@ -1,285 +0,0 @@ -package sealing - -import ( - "encoding/binary" - "iter" - "unsafe" - - "github.com/ozontech/seq-db/frac/sealed/lids" - "github.com/ozontech/seq-db/frac/sealed/seqids" - "github.com/ozontech/seq-db/frac/sealed/token" - "github.com/ozontech/seq-db/util" -) - -type ( - TokenBlock = util.Pair[tokensSealBlock, []token.FieldTable] -) - -// tokensExt represents the token ID range contained in a block. -type tokensExt struct { - minTID uint32 // First token ID in the block - maxTID uint32 // Last token ID in the block -} - -// tokensSealBlock represents a sealed block containing token data with metadata. -type tokensSealBlock struct { - ext tokensExt // Tokens block metadata for registry marking - payload token.Block // Actual token data payload -} - -// lidsExt represents the range and continuation status of LID blocks. 
-type lidsExt struct { - minTID uint32 // First token ID in the LID block - maxTID uint32 // Last token ID in the LID block - isContinued bool // Whether LID sequence continues in next block -} - -// lidsSealBlock represents a sealed block containing LID (Local ID) data. -type lidsSealBlock struct { - ext lidsExt // LIDs block metadata for registry marking - payload lids.Block // LID data payload -} - -// idsSealBlock represents a sealed block containing various identifier types. -type idsSealBlock struct { - mids seqids.BlockMIDs - rids seqids.BlockRIDs - params seqids.BlockParams -} - -// blocksBuilder constructs sealed blocks from various data sources. -// Provides error tracking and consistency validation during block construction. -type blocksBuilder struct{} - -func (bb *blocksBuilder) BuildTokenBlocks( - it iter.Seq2[string, iter.Seq2[TokenPosting, error]], - accumulate func([]uint32) error, blockCapacity int, -) iter.Seq2[TokenBlock, error] { - return func(yield func(TokenBlock, error) bool) { - var ( - block tokensSealBlock - blockIdx uint32 - blockSize int - ) - - var ( - currentTID uint32 - pendingTable []token.FieldTable - fieldName string - fieldEntryStartTID uint32 - ) - - emitFieldEntry := func() { - // Handle case when field does not have tokens. 
- if fieldName == "" || fieldEntryStartTID > currentTID { - return - } - - entry := newTokenTableEntry(fieldEntryStartTID, currentTID, blockIdx, block) - pendingTable = append(pendingTable, token.FieldTable{ - Field: fieldName, - Entries: []*token.TableEntry{entry}, - }) - } - - flushBlock := func() bool { - emitFieldEntry() - block.ext.maxTID = currentTID - - pair := TokenBlock{First: block, Second: pendingTable} - if !yield(pair, nil) { - return false - } - - block.payload.Payload = block.payload.Payload[:0] - block.payload.Offsets = block.payload.Offsets[:0] - block.ext.minTID = currentTID + 1 - - blockIdx++ - blockSize = 0 - - pendingTable = pendingTable[:0] - fieldEntryStartTID = currentTID + 1 - - return true - } - - block.ext.minTID = 1 - for field, tokenIterator := range it { - emitFieldEntry() - - fieldName = field - fieldEntryStartTID = currentTID + 1 - - for pair, err := range tokenIterator { - if err != nil { - yield(TokenBlock{}, err) - return - } - - tok, tlids := pair.First, pair.Second - tokenSize := int(unsafe.Sizeof(uint32(0))) + len(tok) - - if blockSize > 0 && blockSize+tokenSize > blockCapacity { - if !flushBlock() { - return - } - } - - block.payload.Offsets = append(block.payload.Offsets, uint32(len(block.payload.Payload))) - block.payload.Payload = binary.LittleEndian.AppendUint32(block.payload.Payload, uint32(len(tok))) - block.payload.Payload = append(block.payload.Payload, tok...) 
- - if err := accumulate(tlids); err != nil { - yield(TokenBlock{}, err) - return - } - - currentTID++ - blockSize += tokenSize - } - } - - if blockSize > 0 { - flushBlock() - } - } -} - -func newTokenTableEntry( - entryStartTID, entryEndTID uint32, - blockIndex uint32, block tokensSealBlock, -) *token.TableEntry { - // Convert global TIDs to block-local indices - firstIndex := entryStartTID - block.ext.minTID - lastIndex := entryEndTID - block.ext.minTID - - // Extract min and max token values for the entry range - minVal := string(block.payload.GetToken(int(firstIndex))) - maxVal := string(block.payload.GetToken(int(lastIndex))) - - return &token.TableEntry{ - StartIndex: firstIndex, // Starting index within the block - StartTID: entryStartTID, // Starting token ID (global) - BlockIndex: blockIndex, // Reference to containing block - ValCount: lastIndex - firstIndex + 1, // Number of tokens in this entry - MinVal: minVal, // Smallest token value in range - MaxVal: maxVal, // Largest token value in range - } -} - -// seqBlockID accumulates scalar (ID, position) pairs into sealed ID blocks. -// A new block is yielded every `blockCapacity` IDs. 
-func seqBlockID(ids iter.Seq2[DocLocation, error], blockCapacity int) iter.Seq2[idsSealBlock, error] { - return func(yield func(idsSealBlock, error) bool) { - var block idsSealBlock - - for pair, err := range ids { - if err != nil { - yield(idsSealBlock{}, err) - return - } - - id, pos := pair.First, pair.Second - block.mids.Values = append(block.mids.Values, uint64(id.MID)) - block.rids.Values = append(block.rids.Values, uint64(id.RID)) - block.params.Values = append(block.params.Values, uint64(pos)) - - if len(block.mids.Values) == blockCapacity { - if !yield(block, nil) { - return - } - - block.mids.Values = block.mids.Values[:0] - block.rids.Values = block.rids.Values[:0] - block.params.Values = block.params.Values[:0] - } - } - - if len(block.mids.Values) > 0 { - yield(block, nil) - } - } -} - -type lidAccumulator struct { - blockCapacity int - onBlock func(lidsSealBlock) error - - currentTID uint32 - currentBlock lidsSealBlock - - isEndOfToken bool - isContinued bool -} - -func newLIDAccumulator( - blockCapacity int, - onBlock func(lidsSealBlock) error, -) *lidAccumulator { - a := &lidAccumulator{ - blockCapacity: blockCapacity, - onBlock: onBlock, - } - - a.currentBlock.ext.minTID = 1 - a.currentBlock.payload = lids.Block{ - LIDs: make([]uint32, 0, blockCapacity), - Offsets: []uint32{0}, - } - - return a -} - -// Add processes LIDs of one token (must be called in TID order). -// -// For each block that fills up, `onBlock` is called immediately -// before the backing arrays are reset, so `onBlock` may read the -// block data but must not retain references to it. 
-func (a *lidAccumulator) Add(lidsbuf []uint32) error { - a.currentTID++ - - for _, lid := range lidsbuf { - if len(a.currentBlock.payload.LIDs) == a.blockCapacity { - if err := a.onBlock(a.finalizeBlock()); err != nil { - return err - } - - a.currentBlock.ext.minTID = a.currentTID - a.currentBlock.payload.LIDs = a.currentBlock.payload.LIDs[:0] - a.currentBlock.payload.Offsets = a.currentBlock.payload.Offsets[:1] - } - - a.isEndOfToken = false - a.currentBlock.ext.maxTID = a.currentTID - a.currentBlock.payload.LIDs = append(a.currentBlock.payload.LIDs, lid) - } - - a.isEndOfToken = true - a.currentBlock.payload.Offsets = append( - a.currentBlock.payload.Offsets, - uint32(len(a.currentBlock.payload.LIDs)), - ) - - return nil -} - -func (a *lidAccumulator) Finalize() error { - return a.onBlock(a.finalizeBlock()) -} - -func (a *lidAccumulator) finalizeBlock() lidsSealBlock { - if !a.isEndOfToken { - a.currentBlock.payload.Offsets = append( - a.currentBlock.payload.Offsets, - uint32(len(a.currentBlock.payload.LIDs)), - ) - } - - result := a.currentBlock - result.payload.IsLastLID = a.isEndOfToken - result.ext.isContinued = a.isContinued - - a.isContinued = !a.isEndOfToken - return result -} diff --git a/indexwriter/index.go b/indexwriter/index.go index 286b620c..87fec07a 100644 --- a/indexwriter/index.go +++ b/indexwriter/index.go @@ -13,9 +13,15 @@ import ( "github.com/ozontech/seq-db/frac/sealed/token" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/util" "github.com/ozontech/seq-db/zstd" ) +type ( + DocLocation = util.Pair[seq.ID, seq.DocPos] + TokenPosting = util.Pair[[]byte, []uint32] +) + // Source defines the data required to write all index files for a fraction. type Source interface { // Info returns metadata describing this source. @@ -23,7 +29,7 @@ type Source interface { // ID returns an iterator over stored document identifiers paired with // their positions, in descending [seq.ID] order. 
- ID() iter.Seq2[blockbuilder.DocLocation, error] + ID() iter.Seq2[DocLocation, error] // BlockOffsets returns byte offsets to each document block // within this source's `.docs` file. @@ -32,7 +38,7 @@ type Source interface { // TokenTriplet iterates over fields in lexicographic order. // For each field, it yields tokens (lexicographically sorted) // paired with the local document ID list for that token. - TokenTriplet() iter.Seq2[string, iter.Seq2[blockbuilder.TokenPosting, error]] + TokenTriplet() iter.Seq2[string, iter.Seq2[TokenPosting, error]] } // indexBlock is one compressed (or not) block with its registry metadata. @@ -106,7 +112,7 @@ func (s *IndexWriter) WriteIDFile(ws io.WriteSeeker, src Source) error { } defer w.release() - for block, err := range blockbuilder.SeqBlockID(src.ID(), consts.IDsPerBlock) { + for block, err := range blockbuilder.IDBlock(src.ID(), consts.IDsPerBlock) { if err != nil { return err } @@ -140,18 +146,15 @@ func (s *IndexWriter) WriteTokenTriplet(tws, lws io.WriteSeeker, src Source) err } defer lw.release() - var ( - allFieldsTables []token.FieldTable - ) - - lidAccumulator := newLIDAccumulator( + lidAccumulator := blockbuilder.NewLIDAccumulator( consts.LIDBlockCap, - func(block lidsSealBlock) error { + func(block blockbuilder.LidsSealBlock) error { return lw.writeBlock(blockTypeLID, s.packLIDsBlock(block)) }, ) - for pair, err := range blockbuilder.BuildTokenBlocks(src.TokenTriplet(), accumulate, consts.RegularBlockSize) { + var allFieldsTables []token.FieldTable + for pair, err := range blockbuilder.TokenBlocks(src.TokenTriplet(), lidAccumulator.Add, consts.RegularBlockSize) { if err != nil { return err } @@ -170,7 +173,7 @@ func (s *IndexWriter) WriteTokenTriplet(tws, lws io.WriteSeeker, src Source) err return s.finalizeTokenFile(tw, allFieldsTables) } -func (s *IndexSealer) finalizeLIDFile(w *writer, lidAccumulator *lidAccumulator) error { +func (s *IndexWriter) finalizeLIDFile(w *writer, lidAccumulator 
*blockbuilder.LIDAccumulator) error { if err := lidAccumulator.Finalize(); err != nil { return err } @@ -184,7 +187,7 @@ func (s *IndexWriter) finalizeTokenFile(w *writer, allFieldsTables []token.Field return err } - tokenTableBlock := token.TableBlock{FieldsTables: collapseOrderedFieldsTables(allFieldsTables)} + tokenTableBlock := token.TableBlock{FieldsTables: blockbuilder.CollapseOrderedFieldsTables(allFieldsTables)} if err := w.writeBlock(blockTypeTokenTable, s.packTokenTableBlock(tokenTableBlock)); err != nil { return err } @@ -192,7 +195,7 @@ func (s *IndexWriter) finalizeTokenFile(w *writer, allFieldsTables []token.Field return w.finalize() } -func (s *IndexSealer) WriteInfoFile(ws io.Writer, src Source) error { +func (s *IndexWriter) WriteInfoFile(ws io.Writer, src Source) error { block := sealed.BlockInfo{Info: src.Info()} _, err := ws.Write(s.packInfoBlock(block).payload) return err diff --git a/sealing/sealer.go b/sealing/sealer.go index edd263fe..0c21ffc4 100644 --- a/sealing/sealer.go +++ b/sealing/sealer.go @@ -25,12 +25,11 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { return nil, errors.New("sealing of an empty active fraction is not supported") } - writer := indexwriter.New(params) - + w := indexwriter.New(params) if err := createAndWrite( info.Path+consts.OffsetsTmpFileSuffix, info.Path+consts.OffsetsFileSuffix, - func(f *os.File) error { return writer.WriteOffsetsFile(f, src) }, + func(f *os.File) error { return w.WriteOffsetsFile(f, src) }, ); err != nil { return nil, err } @@ -38,7 +37,7 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { if err := createAndWrite( info.Path+consts.IDTmpFileSuffix, info.Path+consts.IDFileSuffix, - func(f *os.File) error { return writer.WriteIDFile(f, src) }, + func(f *os.File) error { return w.WriteIDFile(f, src) }, ); err != nil { return nil, err } @@ -46,7 +45,7 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) 
{ if err := createAndWriteBoth( info.Path+consts.TokenTmpFileSuffix, info.Path+consts.TokenFileSuffix, info.Path+consts.LIDTmpFileSuffix, info.Path+consts.LIDFileSuffix, - func(tokenF, lidF *os.File) error { return writer.WriteTokenTriplet(tokenF, lidF, src) }, + func(tokenF, lidF *os.File) error { return w.WriteTokenTriplet(tokenF, lidF, src) }, ); err != nil { return nil, err } @@ -54,7 +53,7 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { if err := createAndWrite( info.Path+consts.InfoTmpFileSuffix, info.Path+consts.InfoFileSuffix, - func(f *os.File) error { return writer.WriteInfoFile(f, src) }, + func(f *os.File) error { return w.WriteInfoFile(f, src) }, ); err != nil { return nil, err } @@ -78,13 +77,13 @@ func Seal(src Source, params common.SealParams) (*sealed.PreloadedData, error) { } info.IndexOnDisk = totalSize - lidsTable := writer.LIDsTable() + lidsTable := w.LIDsTable() preloaded := &sealed.PreloadedData{ Info: info, - TokenTable: writer.TokenTable(), + TokenTable: w.TokenTable(), BlocksData: sealed.BlocksData{ - IDsTable: writer.IDsTable(), + IDsTable: w.IDsTable(), LIDsTable: &lidsTable, BlocksOffsets: src.BlockOffsets(), }, From 7484ac0930c051c8735b736411190c379520c77e Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Thu, 7 May 2026 11:35:18 +0300 Subject: [PATCH 11/29] refactor: merge `indexwriter` and `blockbuilder` --- blockbuilder/block_builder.go | 224 ------------------ blockbuilder/lid_accumulator.go | 85 ------- indexwriter/blocks.go | 219 +++++++++++++++++ .../blocks_test.go | 64 ++--- indexwriter/index.go | 53 ++--- indexwriter/lid_accumulator.go | 85 +++++++ 6 files changed, 362 insertions(+), 368 deletions(-) delete mode 100644 blockbuilder/block_builder.go delete mode 100644 blockbuilder/lid_accumulator.go create mode 100644 indexwriter/blocks.go rename blockbuilder/block_builder_test.go => indexwriter/blocks_test.go (77%) create mode 100644 indexwriter/lid_accumulator.go diff --git 
a/blockbuilder/block_builder.go b/blockbuilder/block_builder.go deleted file mode 100644 index 8103dedf..00000000 --- a/blockbuilder/block_builder.go +++ /dev/null @@ -1,224 +0,0 @@ -package blockbuilder - -import ( - "encoding/binary" - "iter" - "unsafe" - - "github.com/ozontech/seq-db/frac/sealed/lids" - "github.com/ozontech/seq-db/frac/sealed/seqids" - "github.com/ozontech/seq-db/frac/sealed/token" - "github.com/ozontech/seq-db/seq" - "github.com/ozontech/seq-db/util" -) - -type ( - DocLocation = util.Pair[seq.ID, seq.DocPos] - TokenPosting = util.Pair[[]byte, []uint32] - TokenBlock = util.Pair[TokensSealBlock, []token.FieldTable] -) - -// TokensExt represents the token ID range contained in a block. -type TokensExt struct { - MinTID uint32 // First token ID in the block - MaxTID uint32 // Last token ID in the block -} - -// TokensSealBlock represents a sealed block containing token data with metadata. -type TokensSealBlock struct { - Ext TokensExt // Tokens block metadata for registry marking - Payload token.Block // Actual token data payload -} - -// LidsExt represents the range and continuation status of LID blocks. -type LidsExt struct { - MinTID uint32 // First token ID in the LID block - MaxTID uint32 // Last token ID in the LID block - IsContinued bool // Whether LID sequence continues in next block -} - -// LidsSealBlock represents a sealed block containing LID (Local ID) data. -type LidsSealBlock struct { - Ext LidsExt // LIDs block metadata for registry marking - Payload lids.Block // LID data payload -} - -// IdsSealBlock represents a sealed block containing various identifier types. 
-type IdsSealBlock struct { - MIDs seqids.BlockMIDs - RIDs seqids.BlockRIDs - Params seqids.BlockParams -} - -func TokenBlocks( - it iter.Seq2[string, iter.Seq2[TokenPosting, error]], - accumulate func([]uint32) error, blockCapacity int, -) iter.Seq2[TokenBlock, error] { - return func(yield func(TokenBlock, error) bool) { - var ( - block TokensSealBlock - blockIdx uint32 - blockSize int - ) - - var ( - currentTID uint32 - pendingTable []token.FieldTable - fieldName string - fieldEntryStartTID uint32 - ) - - emitFieldEntry := func() { - // Handle case when field does not have tokens. - if fieldName == "" || fieldEntryStartTID > currentTID { - return - } - - entry := newTokenTableEntry(fieldEntryStartTID, currentTID, blockIdx, block) - pendingTable = append(pendingTable, token.FieldTable{ - Field: fieldName, - Entries: []*token.TableEntry{entry}, - }) - } - - flushBlock := func() bool { - emitFieldEntry() - block.Ext.MaxTID = currentTID - - pair := TokenBlock{First: block, Second: pendingTable} - if !yield(pair, nil) { - return false - } - - block.Payload.Payload = block.Payload.Payload[:0] - block.Payload.Offsets = block.Payload.Offsets[:0] - block.Ext.MinTID = currentTID + 1 - - blockIdx++ - blockSize = 0 - - pendingTable = pendingTable[:0] - fieldEntryStartTID = currentTID + 1 - - return true - } - - block.Ext.MinTID = 1 - for field, tokIt := range it { - emitFieldEntry() - - fieldName = field - fieldEntryStartTID = currentTID + 1 - - for pair, err := range tokIt { - if err != nil { - yield(TokenBlock{}, err) - return - } - - tok, tlids := pair.First, pair.Second - tokenSize := int(unsafe.Sizeof(uint32(0))) + len(tok) - - if blockSize > 0 && blockSize+tokenSize > blockCapacity { - if !flushBlock() { - return - } - } - - block.Payload.Offsets = append(block.Payload.Offsets, uint32(len(block.Payload.Payload))) - block.Payload.Payload = binary.LittleEndian.AppendUint32(block.Payload.Payload, uint32(len(tok))) - block.Payload.Payload = append(block.Payload.Payload, 
tok...) - - if err := accumulate(tlids); err != nil { - yield(TokenBlock{}, err) - return - } - - currentTID++ - blockSize += tokenSize - } - } - - if blockSize > 0 { - flushBlock() - } - } -} - -func newTokenTableEntry( - entryStartTID, entryEndTID uint32, - blockIndex uint32, block TokensSealBlock, -) *token.TableEntry { - // Convert global TIDs to block-local indices - firstIndex := entryStartTID - block.Ext.MinTID - lastIndex := entryEndTID - block.Ext.MinTID - - // Extract min and max token values for the entry range - minVal := string(block.Payload.GetToken(int(firstIndex))) - maxVal := string(block.Payload.GetToken(int(lastIndex))) - - return &token.TableEntry{ - StartIndex: firstIndex, // Starting index within the block - StartTID: entryStartTID, // Starting token ID (global) - BlockIndex: blockIndex, // Reference to containing block - ValCount: lastIndex - firstIndex + 1, // Number of tokens in this entry - MinVal: minVal, // Smallest token value in range - MaxVal: maxVal, // Largest token value in range - } -} - -// IDBlock accumulates scalar (ID, position) pairs into sealed ID blocks. -// A new block is yielded every `blockCapacity` IDs. 
-func IDBlock(ids iter.Seq2[DocLocation, error], blockCapacity int) iter.Seq2[IdsSealBlock, error] { - return func(yield func(IdsSealBlock, error) bool) { - var block IdsSealBlock - - for pair, err := range ids { - if err != nil { - yield(IdsSealBlock{}, err) - return - } - - id, pos := pair.First, pair.Second - block.MIDs.Values = append(block.MIDs.Values, uint64(id.MID)) - block.RIDs.Values = append(block.RIDs.Values, uint64(id.RID)) - block.Params.Values = append(block.Params.Values, uint64(pos)) - - if len(block.MIDs.Values) == blockCapacity { - if !yield(block, nil) { - return - } - - block.MIDs.Values = block.MIDs.Values[:0] - block.RIDs.Values = block.RIDs.Values[:0] - block.Params.Values = block.Params.Values[:0] - } - } - - if len(block.MIDs.Values) > 0 { - yield(block, nil) - } - } -} - -// CollapseOrderedFieldsTables merges FieldTables with the same field name. -// Assumes input is sorted by Field. -func CollapseOrderedFieldsTables(src []token.FieldTable) []token.FieldTable { - if len(src) == 0 { - return nil - } - - current := src[0] - var dst []token.FieldTable - for _, ft := range src[1:] { - if current.Field == ft.Field { - current.Entries = append(current.Entries, ft.Entries...) 
- continue - } - - dst = append(dst, current) - current = ft - } - - return append(dst, current) -} diff --git a/blockbuilder/lid_accumulator.go b/blockbuilder/lid_accumulator.go deleted file mode 100644 index ef81a970..00000000 --- a/blockbuilder/lid_accumulator.go +++ /dev/null @@ -1,85 +0,0 @@ -package blockbuilder - -import "github.com/ozontech/seq-db/frac/sealed/lids" - -type LIDAccumulator struct { - blockCapacity int - onBlock func(LidsSealBlock) error - - currentTID uint32 - currentBlock LidsSealBlock - - isEndOfToken bool - isContinued bool -} - -func NewLIDAccumulator( - blockCapacity int, - onBlock func(LidsSealBlock) error, -) *LIDAccumulator { - a := &LIDAccumulator{ - blockCapacity: blockCapacity, - onBlock: onBlock, - } - - a.currentBlock.Ext.MinTID = 1 - a.currentBlock.Payload = lids.Block{ - LIDs: make([]uint32, 0, blockCapacity), - Offsets: []uint32{0}, - } - - return a -} - -// Add processes LIDs of one token (must be called in TID order). -// -// For each block that fills up, `onBlock` is called immediately -// before the backing arrays are reset, so `onBlock` may read the -// block data but must not retain references to it. 
-func (a *LIDAccumulator) Add(lidsbuf []uint32) error { - a.currentTID++ - - for _, lid := range lidsbuf { - if len(a.currentBlock.Payload.LIDs) == a.blockCapacity { - if err := a.onBlock(a.finalizeBlock()); err != nil { - return err - } - - a.currentBlock.Ext.MinTID = a.currentTID - a.currentBlock.Payload.LIDs = a.currentBlock.Payload.LIDs[:0] - a.currentBlock.Payload.Offsets = a.currentBlock.Payload.Offsets[:1] - } - - a.isEndOfToken = false - a.currentBlock.Ext.MaxTID = a.currentTID - a.currentBlock.Payload.LIDs = append(a.currentBlock.Payload.LIDs, lid) - } - - a.isEndOfToken = true - a.currentBlock.Payload.Offsets = append( - a.currentBlock.Payload.Offsets, - uint32(len(a.currentBlock.Payload.LIDs)), - ) - - return nil -} - -func (a *LIDAccumulator) Finalize() error { - return a.onBlock(a.finalizeBlock()) -} - -func (a *LIDAccumulator) finalizeBlock() LidsSealBlock { - if !a.isEndOfToken { - a.currentBlock.Payload.Offsets = append( - a.currentBlock.Payload.Offsets, - uint32(len(a.currentBlock.Payload.LIDs)), - ) - } - - result := a.currentBlock - result.Payload.IsLastLID = a.isEndOfToken - result.Ext.IsContinued = a.isContinued - - a.isContinued = !a.isEndOfToken - return result -} diff --git a/indexwriter/blocks.go b/indexwriter/blocks.go new file mode 100644 index 00000000..3064491b --- /dev/null +++ b/indexwriter/blocks.go @@ -0,0 +1,219 @@ +package indexwriter + +import ( + "encoding/binary" + "iter" + "unsafe" + + "github.com/ozontech/seq-db/frac/sealed/lids" + "github.com/ozontech/seq-db/frac/sealed/seqids" + "github.com/ozontech/seq-db/frac/sealed/token" + "github.com/ozontech/seq-db/util" +) + +type tokenFieldBlock = util.Pair[unpackedTokenBlock, []token.FieldTable] + +// tokenExt represents the token ID range contained in a block. +type tokenExt struct { + minTID uint32 // First token ID in the block + maxTID uint32 // Last token ID in the block +} + +// unpackedTokenBlock represents a sealed block containing token data with metadata. 
+type unpackedTokenBlock struct { + ext tokenExt // Tokens block metadata for registry marking + payload token.Block // Actual token data payload +} + +// lidExt represents the range and continuation status of LID blocks. +type lidExt struct { + minTID uint32 // First token ID in the LID block + maxTID uint32 // Last token ID in the LID block + isContinued bool // Whether LID sequence continues in next block +} + +// unpackedLIDBlock represents a sealed block containing LID (Local ID) data. +type unpackedLIDBlock struct { + ext lidExt // LIDs block metadata for registry marking + payload lids.Block // LID data payload +} + +// unpackedIDBlock represents a sealed block containing various identifier types. +type unpackedIDBlock struct { + mids seqids.BlockMIDs + rids seqids.BlockRIDs + params seqids.BlockParams +} + +func tokenBlock( + it iter.Seq2[string, iter.Seq2[TokenPosting, error]], + accumulate func([]uint32) error, blockCapacity int, +) iter.Seq2[tokenFieldBlock, error] { + return func(yield func(tokenFieldBlock, error) bool) { + var ( + block unpackedTokenBlock + blockIdx uint32 + blockSize int + ) + + var ( + currentTID uint32 + pendingTable []token.FieldTable + fieldName string + fieldEntryStartTID uint32 + ) + + emitFieldEntry := func() { + // Handle case when field does not have tokens. 
+ if fieldName == "" || fieldEntryStartTID > currentTID { + return + } + + entry := newTokenTableEntry(fieldEntryStartTID, currentTID, blockIdx, block) + pendingTable = append(pendingTable, token.FieldTable{ + Field: fieldName, + Entries: []*token.TableEntry{entry}, + }) + } + + flushBlock := func() bool { + emitFieldEntry() + block.ext.maxTID = currentTID + + pair := tokenFieldBlock{First: block, Second: pendingTable} + if !yield(pair, nil) { + return false + } + + block.payload.Payload = block.payload.Payload[:0] + block.payload.Offsets = block.payload.Offsets[:0] + block.ext.minTID = currentTID + 1 + + blockIdx++ + blockSize = 0 + + pendingTable = pendingTable[:0] + fieldEntryStartTID = currentTID + 1 + + return true + } + + block.ext.minTID = 1 + for field, tokIt := range it { + emitFieldEntry() + + fieldName = field + fieldEntryStartTID = currentTID + 1 + + for pair, err := range tokIt { + if err != nil { + yield(tokenFieldBlock{}, err) + return + } + + tok, tlids := pair.First, pair.Second + tokenSize := int(unsafe.Sizeof(uint32(0))) + len(tok) + + if blockSize > 0 && blockSize+tokenSize > blockCapacity { + if !flushBlock() { + return + } + } + + block.payload.Offsets = append(block.payload.Offsets, uint32(len(block.payload.Payload))) + block.payload.Payload = binary.LittleEndian.AppendUint32(block.payload.Payload, uint32(len(tok))) + block.payload.Payload = append(block.payload.Payload, tok...) 
+ + if err := accumulate(tlids); err != nil { + yield(tokenFieldBlock{}, err) + return + } + + currentTID++ + blockSize += tokenSize + } + } + + if blockSize > 0 { + flushBlock() + } + } +} + +func newTokenTableEntry( + entryStartTID, entryEndTID uint32, + blockIndex uint32, block unpackedTokenBlock, +) *token.TableEntry { + // Convert global TIDs to block-local indices + firstIndex := entryStartTID - block.ext.minTID + lastIndex := entryEndTID - block.ext.minTID + + // Extract min and max token values for the entry range + minVal := string(block.payload.GetToken(int(firstIndex))) + maxVal := string(block.payload.GetToken(int(lastIndex))) + + return &token.TableEntry{ + StartIndex: firstIndex, // Starting index within the block + StartTID: entryStartTID, // Starting token ID (global) + BlockIndex: blockIndex, // Reference to containing block + ValCount: lastIndex - firstIndex + 1, // Number of tokens in this entry + MinVal: minVal, // Smallest token value in range + MaxVal: maxVal, // Largest token value in range + } +} + +// idBlock accumulates scalar (ID, position) pairs into sealed ID blocks. +// A new block is yielded every `blockCapacity` IDs. 
+func idBlock(ids iter.Seq2[DocLocation, error], blockCapacity int) iter.Seq2[unpackedIDBlock, error] { + return func(yield func(unpackedIDBlock, error) bool) { + var block unpackedIDBlock + + for pair, err := range ids { + if err != nil { + yield(unpackedIDBlock{}, err) + return + } + + id, pos := pair.First, pair.Second + block.mids.Values = append(block.mids.Values, uint64(id.MID)) + block.rids.Values = append(block.rids.Values, uint64(id.RID)) + block.params.Values = append(block.params.Values, uint64(pos)) + + if len(block.mids.Values) == blockCapacity { + if !yield(block, nil) { + return + } + + block.mids.Values = block.mids.Values[:0] + block.rids.Values = block.rids.Values[:0] + block.params.Values = block.params.Values[:0] + } + } + + if len(block.mids.Values) > 0 { + yield(block, nil) + } + } +} + +// collapseOrderedFieldsTables merges FieldTables with the same field name. +// Assumes input is sorted by Field. +func collapseOrderedFieldsTables(src []token.FieldTable) []token.FieldTable { + if len(src) == 0 { + return nil + } + + current := src[0] + var dst []token.FieldTable + for _, ft := range src[1:] { + if current.Field == ft.Field { + current.Entries = append(current.Entries, ft.Entries...) 
+ continue + } + + dst = append(dst, current) + current = ft + } + + return append(dst, current) +} diff --git a/blockbuilder/block_builder_test.go b/indexwriter/blocks_test.go similarity index 77% rename from blockbuilder/block_builder_test.go rename to indexwriter/blocks_test.go index 95fe7698..8a3951d7 100644 --- a/blockbuilder/block_builder_test.go +++ b/indexwriter/blocks_test.go @@ -1,4 +1,4 @@ -package blockbuilder +package indexwriter import ( "iter" @@ -104,18 +104,18 @@ func TestBlocksBuilder_BuildTokenBlocks(t *testing.T) { const blockSize = 24 const lidBlockCap = 3 - var lidBlocks []LidsSealBlock - lidAccumulator := NewLIDAccumulator( + var lidBlocks []unpackedLIDBlock + lidAccumulator := newLIDAccumulator( lidBlockCap, - func(block LidsSealBlock) error { - block.Payload.LIDs = slices.Clone(block.Payload.LIDs) - block.Payload.Offsets = slices.Clone(block.Payload.Offsets) + func(block unpackedLIDBlock) error { + block.payload.LIDs = slices.Clone(block.payload.LIDs) + block.payload.Offsets = slices.Clone(block.payload.Offsets) lidBlocks = append(lidBlocks, block) return nil }, ) - tokenBlocks := TokenBlocks( + tokenBlocksIter := tokenBlock( src.TokenTriplet(), lidAccumulator.Add, blockSize, @@ -129,19 +129,19 @@ func TestBlocksBuilder_BuildTokenBlocks(t *testing.T) { blockIndex := 0 allFieldsTables := []token.FieldTable{} - for pair, err := range tokenBlocks { + for pair, err := range tokenBlocksIter { assert.NoError(t, err) block, fieldsTables := pair.First, pair.Second - assert.Equal(t, expectedSizes[blockIndex], block.Payload.Len()) - for i := range block.Payload.Len() { + assert.Equal(t, expectedSizes[blockIndex], block.payload.Len()) + for i := range block.payload.Len() { tid++ - assert.Equal(t, src.tokens[tid-1], block.Payload.GetToken(i)) + assert.Equal(t, src.tokens[tid-1], block.payload.GetToken(i)) } allFieldsTables = append(allFieldsTables, fieldsTables...) 
blockIndex++ } - actualTokenTable := token.TableBlock{FieldsTables: CollapseOrderedFieldsTables(allFieldsTables)} + actualTokenTable := token.TableBlock{FieldsTables: collapseOrderedFieldsTables(allFieldsTables)} assert.Equal(t, tid, len(src.tokens)) expectedTokenTable := token.TableBlock{ @@ -238,30 +238,30 @@ func TestBlocksBuilder_BuildTokenBlocks(t *testing.T) { assert.Equal(t, actualTokenTable.FieldsTables, expectedTokenTable.FieldsTables) assert.NoError(t, lidAccumulator.Finalize()) - expectedLIDBlocks := []LidsSealBlock{ + expectedLIDBlocks := []unpackedLIDBlock{ { - Ext: LidsExt{MinTID: 1, MaxTID: 1, IsContinued: false}, - Payload: lids.Block{LIDs: []uint32{10, 20, 30}, Offsets: []uint32{0, 3}, IsLastLID: false}, + ext: lidExt{minTID: 1, maxTID: 1, isContinued: false}, + payload: lids.Block{LIDs: []uint32{10, 20, 30}, Offsets: []uint32{0, 3}, IsLastLID: false}, }, { - Ext: LidsExt{MinTID: 1, MaxTID: 3, IsContinued: true}, - Payload: lids.Block{LIDs: []uint32{40, 2, 3}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, + ext: lidExt{minTID: 1, maxTID: 3, isContinued: true}, + payload: lids.Block{LIDs: []uint32{40, 2, 3}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, }, { - Ext: LidsExt{MinTID: 4, MaxTID: 6, IsContinued: false}, - Payload: lids.Block{LIDs: []uint32{4, 5, 6}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, + ext: lidExt{minTID: 4, maxTID: 6, isContinued: false}, + payload: lids.Block{LIDs: []uint32{4, 5, 6}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, }, { - Ext: LidsExt{MinTID: 7, MaxTID: 9, IsContinued: false}, - Payload: lids.Block{LIDs: []uint32{7, 8, 9}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, + ext: lidExt{minTID: 7, maxTID: 9, isContinued: false}, + payload: lids.Block{LIDs: []uint32{7, 8, 9}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, }, { - Ext: LidsExt{MinTID: 10, MaxTID: 12, IsContinued: false}, - Payload: lids.Block{LIDs: []uint32{10, 11, 12}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, + ext: 
lidExt{minTID: 10, maxTID: 12, isContinued: false}, + payload: lids.Block{LIDs: []uint32{10, 11, 12}, Offsets: []uint32{0, 1, 2, 3}, IsLastLID: true}, }, { - Ext: LidsExt{MinTID: 13, MaxTID: 14, IsContinued: false}, - Payload: lids.Block{LIDs: []uint32{13, 14}, Offsets: []uint32{0, 1, 2}, IsLastLID: true}, + ext: lidExt{minTID: 13, maxTID: 14, isContinued: false}, + payload: lids.Block{LIDs: []uint32{13, 14}, Offsets: []uint32{0, 1, 2}, IsLastLID: true}, }, } assert.Equal(t, expectedLIDBlocks, lidBlocks) @@ -300,18 +300,18 @@ func TestBlocksBuilder_IDsBlocks(t *testing.T) { i := 0 ids := []seq.ID{} pos := []seq.DocPos{} - for block, err := range IDBlock(src.ID(), 3) { + for block, err := range idBlock(src.ID(), 3) { assert.NoError(t, err) - assert.Equal(t, expectedSizes[i], len(block.MIDs.Values)) - assert.Equal(t, expectedSizes[i], len(block.RIDs.Values)) - assert.Equal(t, expectedSizes[i], len(block.Params.Values)) + assert.Equal(t, expectedSizes[i], len(block.mids.Values)) + assert.Equal(t, expectedSizes[i], len(block.rids.Values)) + assert.Equal(t, expectedSizes[i], len(block.params.Values)) i++ j := 0 - for _, mid := range block.MIDs.Values { - ids = append(ids, seq.ID{MID: seq.MID(mid), RID: seq.RID(block.RIDs.Values[j])}) - pos = append(pos, seq.DocPos(block.Params.Values[j])) + for _, mid := range block.mids.Values { + ids = append(ids, seq.ID{MID: seq.MID(mid), RID: seq.RID(block.rids.Values[j])}) + pos = append(pos, seq.DocPos(block.params.Values[j])) j++ } } diff --git a/indexwriter/index.go b/indexwriter/index.go index 87fec07a..12fedafe 100644 --- a/indexwriter/index.go +++ b/indexwriter/index.go @@ -4,7 +4,6 @@ import ( "io" "iter" - "github.com/ozontech/seq-db/blockbuilder" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed" @@ -112,7 +111,7 @@ func (s *IndexWriter) WriteIDFile(ws io.WriteSeeker, src Source) error { } defer w.release() - for block, err := range 
blockbuilder.IDBlock(src.ID(), consts.IDsPerBlock) { + for block, err := range idBlock(src.ID(), consts.IDsPerBlock) { if err != nil { return err } @@ -146,15 +145,15 @@ func (s *IndexWriter) WriteTokenTriplet(tws, lws io.WriteSeeker, src Source) err } defer lw.release() - lidAccumulator := blockbuilder.NewLIDAccumulator( + lidAccumulator := newLIDAccumulator( consts.LIDBlockCap, - func(block blockbuilder.LidsSealBlock) error { + func(block unpackedLIDBlock) error { return lw.writeBlock(blockTypeLID, s.packLIDsBlock(block)) }, ) var allFieldsTables []token.FieldTable - for pair, err := range blockbuilder.TokenBlocks(src.TokenTriplet(), lidAccumulator.Add, consts.RegularBlockSize) { + for pair, err := range tokenBlock(src.TokenTriplet(), lidAccumulator.Add, consts.RegularBlockSize) { if err != nil { return err } @@ -173,7 +172,7 @@ func (s *IndexWriter) WriteTokenTriplet(tws, lws io.WriteSeeker, src Source) err return s.finalizeTokenFile(tw, allFieldsTables) } -func (s *IndexWriter) finalizeLIDFile(w *writer, lidAccumulator *blockbuilder.LIDAccumulator) error { +func (s *IndexWriter) finalizeLIDFile(w *writer, lidAccumulator *lidAccumulator) error { if err := lidAccumulator.Finalize(); err != nil { return err } @@ -187,7 +186,7 @@ func (s *IndexWriter) finalizeTokenFile(w *writer, allFieldsTables []token.Field return err } - tokenTableBlock := token.TableBlock{FieldsTables: blockbuilder.CollapseOrderedFieldsTables(allFieldsTables)} + tokenTableBlock := token.TableBlock{FieldsTables: collapseOrderedFieldsTables(allFieldsTables)} if err := w.writeBlock(blockTypeTokenTable, s.packTokenTableBlock(tokenTableBlock)); err != nil { return err } @@ -220,11 +219,11 @@ func (s *IndexWriter) packInfoBlock(block sealed.BlockInfo) indexBlock { } // packTokenBlock packs token data into a compressed index block. 
-func (s *IndexWriter) packTokenBlock(block blockbuilder.TokensSealBlock) indexBlock { - s.buf1 = block.Payload.Pack(s.buf1[:0]) // Pack token data +func (s *IndexWriter) packTokenBlock(block unpackedTokenBlock) indexBlock { + s.buf1 = block.payload.Pack(s.buf1[:0]) // Pack token data b := s.newIndexBlockZSTD(s.buf1, s.params.TokenListZstdLevel) // Store TID range in extended metadata - b.ext1 = uint64(block.Ext.MaxTID)<<32 | uint64(block.Ext.MinTID) + b.ext1 = uint64(block.ext.maxTID)<<32 | uint64(block.ext.minTID) return b } @@ -250,19 +249,19 @@ func (s *IndexWriter) packBlocksOffsetsBlock(block sealed.BlockOffsets) indexBlo } // packMIDsBlock packs MIDs into a compressed index block. -func (s *IndexWriter) packMIDsBlock(block blockbuilder.IdsSealBlock) indexBlock { +func (s *IndexWriter) packMIDsBlock(block unpackedIDBlock) indexBlock { // Get the last ID in the block (smallest due to descending order) - last := len(block.MIDs.Values) - 1 + last := len(block.mids.Values) - 1 minID := seq.ID{ - MID: seq.MID(block.MIDs.Values[last]), - RID: seq.RID(block.RIDs.Values[last]), + MID: seq.MID(block.mids.Values[last]), + RID: seq.RID(block.rids.Values[last]), } s.idsTable.MinBlockIDs = append(s.idsTable.MinBlockIDs, minID) // Store for PreloadedData // Packing block - s.buf1 = block.MIDs.Pack(s.buf1[:0]) + s.buf1 = block.mids.Pack(s.buf1[:0]) b := s.newIndexBlockZSTD(s.buf1, s.params.IDsZstdLevel) // Store min MID and RID in extended metadata @@ -273,38 +272,38 @@ func (s *IndexWriter) packMIDsBlock(block blockbuilder.IdsSealBlock) indexBlock } // packRIDsBlock packs RIDs into a compressed index block. -func (s *IndexWriter) packRIDsBlock(block blockbuilder.IdsSealBlock) indexBlock { - s.buf1 = block.RIDs.Pack(s.buf1[:0]) +func (s *IndexWriter) packRIDsBlock(block unpackedIDBlock) indexBlock { + s.buf1 = block.rids.Pack(s.buf1[:0]) b := s.newIndexBlockZSTD(s.buf1, s.params.IDsZstdLevel) return b } // packPosBlock packs document positions into a compressed index block. 
-func (s *IndexWriter) packPosBlock(block blockbuilder.IdsSealBlock) indexBlock { - s.buf1 = block.Params.Pack(s.buf1[:0]) +func (s *IndexWriter) packPosBlock(block unpackedIDBlock) indexBlock { + s.buf1 = block.params.Pack(s.buf1[:0]) b := s.newIndexBlockZSTD(s.buf1, s.params.IDsZstdLevel) return b } // packLIDsBlock packs Local IDs (LIDs) into a compressed index block. // Also updates LIDs table for preloaded data access. -func (s *IndexWriter) packLIDsBlock(block blockbuilder.LidsSealBlock) indexBlock { +func (s *IndexWriter) packLIDsBlock(block unpackedLIDBlock) indexBlock { var ext1 uint64 - if block.Ext.IsContinued { // todo: Legacy continuation flag + if block.ext.isContinued { // todo: Legacy continuation flag ext1 = 1 - block.Ext.MinTID++ // Adjust for legacy format + block.ext.minTID++ // Adjust for legacy format } // Update LIDs table for PreloadedData - s.lidsTable.MinTIDs = append(s.lidsTable.MinTIDs, block.Ext.MinTID) - s.lidsTable.MaxTIDs = append(s.lidsTable.MaxTIDs, block.Ext.MaxTID) - s.lidsTable.IsContinued = append(s.lidsTable.IsContinued, block.Ext.IsContinued) + s.lidsTable.MinTIDs = append(s.lidsTable.MinTIDs, block.ext.minTID) + s.lidsTable.MaxTIDs = append(s.lidsTable.MaxTIDs, block.ext.maxTID) + s.lidsTable.IsContinued = append(s.lidsTable.IsContinued, block.ext.isContinued) // Packing block - s.buf1 = block.Payload.Pack(s.buf1[:0]) + s.buf1 = block.payload.Pack(s.buf1[:0]) b := s.newIndexBlockZSTD(s.buf1, s.params.LIDsZstdLevel) b.ext1 = ext1 // Legacy continuation flag - b.ext2 = uint64(block.Ext.MaxTID)<<32 | uint64(block.Ext.MinTID) // TID range + b.ext2 = uint64(block.ext.maxTID)<<32 | uint64(block.ext.minTID) // TID range return b } diff --git a/indexwriter/lid_accumulator.go b/indexwriter/lid_accumulator.go new file mode 100644 index 00000000..f3b3740a --- /dev/null +++ b/indexwriter/lid_accumulator.go @@ -0,0 +1,85 @@ +package indexwriter + +import "github.com/ozontech/seq-db/frac/sealed/lids" + +type lidAccumulator struct { + 
blockCapacity int + onBlock func(unpackedLIDBlock) error + + currentTID uint32 + currentBlock unpackedLIDBlock + + isEndOfToken bool + isContinued bool +} + +func newLIDAccumulator( + blockCapacity int, + onBlock func(unpackedLIDBlock) error, +) *lidAccumulator { + a := &lidAccumulator{ + blockCapacity: blockCapacity, + onBlock: onBlock, + } + + a.currentBlock.ext.minTID = 1 + a.currentBlock.payload = lids.Block{ + LIDs: make([]uint32, 0, blockCapacity), + Offsets: []uint32{0}, + } + + return a +} + +// Add processes LIDs of one token (must be called in TID order). +// +// For each block that fills up, `onBlock` is called immediately +// before the backing arrays are reset, so `onBlock` may read the +// block data but must not retain references to it. +func (a *lidAccumulator) Add(lidsbuf []uint32) error { + a.currentTID++ + + for _, lid := range lidsbuf { + if len(a.currentBlock.payload.LIDs) == a.blockCapacity { + if err := a.onBlock(a.finalizeBlock()); err != nil { + return err + } + + a.currentBlock.ext.minTID = a.currentTID + a.currentBlock.payload.LIDs = a.currentBlock.payload.LIDs[:0] + a.currentBlock.payload.Offsets = a.currentBlock.payload.Offsets[:1] + } + + a.isEndOfToken = false + a.currentBlock.ext.maxTID = a.currentTID + a.currentBlock.payload.LIDs = append(a.currentBlock.payload.LIDs, lid) + } + + a.isEndOfToken = true + a.currentBlock.payload.Offsets = append( + a.currentBlock.payload.Offsets, + uint32(len(a.currentBlock.payload.LIDs)), + ) + + return nil +} + +func (a *lidAccumulator) Finalize() error { + return a.onBlock(a.finalizeBlock()) +} + +func (a *lidAccumulator) finalizeBlock() unpackedLIDBlock { + if !a.isEndOfToken { + a.currentBlock.payload.Offsets = append( + a.currentBlock.payload.Offsets, + uint32(len(a.currentBlock.payload.LIDs)), + ) + } + + result := a.currentBlock + result.payload.IsLastLID = a.isEndOfToken + result.ext.isContinued = a.isContinued + + a.isContinued = !a.isEndOfToken + return result +} From 
e0dfb1351dcaa6184a62fd215f80e7d12f32d40b Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Mon, 25 May 2026 12:34:01 +0300 Subject: [PATCH 12/29] chore: fix rebase conflicts In this commit I've fixed the rebase issues when rebasing onto 0-fm-compaction-api --- fracmanager/fraction_provider.go | 1 + 1 file changed, 1 insertion(+) diff --git a/fracmanager/fraction_provider.go b/fracmanager/fraction_provider.go index d254a1ad..a3609b85 100644 --- a/fracmanager/fraction_provider.go +++ b/fracmanager/fraction_provider.go @@ -13,6 +13,7 @@ import ( "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed" + "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/node" "github.com/ozontech/seq-db/sealing" "github.com/ozontech/seq-db/storage" From 8bc6dff1ce4d5579f10c361a07e4cf05335f012b Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Mon, 25 May 2026 12:43:05 +0300 Subject: [PATCH 13/29] chore: do not export some methods In this commit I've unexported methods for several entities: - `lidAccumulator` -- it is accessible outside of `indexwriter`; - `indexBlock` -- the same reason; --- indexwriter/blocks_test.go | 4 ++-- indexwriter/index.go | 6 +++--- indexwriter/lid_accumulator.go | 6 +++--- indexwriter/writer.go | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/indexwriter/blocks_test.go b/indexwriter/blocks_test.go index 8a3951d7..bb513130 100644 --- a/indexwriter/blocks_test.go +++ b/indexwriter/blocks_test.go @@ -117,7 +117,7 @@ func TestBlocksBuilder_BuildTokenBlocks(t *testing.T) { tokenBlocksIter := tokenBlock( src.TokenTriplet(), - lidAccumulator.Add, + lidAccumulator.add, blockSize, ) @@ -236,7 +236,7 @@ func TestBlocksBuilder_BuildTokenBlocks(t *testing.T) { }, } assert.Equal(t, actualTokenTable.FieldsTables, expectedTokenTable.FieldsTables) - assert.NoError(t, lidAccumulator.Finalize()) + assert.NoError(t, lidAccumulator.finalize()) expectedLIDBlocks := 
[]unpackedLIDBlock{ { diff --git a/indexwriter/index.go b/indexwriter/index.go index 12fedafe..0e51e5cb 100644 --- a/indexwriter/index.go +++ b/indexwriter/index.go @@ -49,7 +49,7 @@ type indexBlock struct { ext2 uint64 } -func (i indexBlock) Bin(pos int64) (storage.IndexBlockHeader, []byte) { +func (i indexBlock) bin(pos int64) (storage.IndexBlockHeader, []byte) { return storage.NewIndexBlockHeader(pos, i.ext1, i.ext2, uint32(len(i.payload)), i.rawLen, i.codec), i.payload } @@ -153,7 +153,7 @@ func (s *IndexWriter) WriteTokenTriplet(tws, lws io.WriteSeeker, src Source) err ) var allFieldsTables []token.FieldTable - for pair, err := range tokenBlock(src.TokenTriplet(), lidAccumulator.Add, consts.RegularBlockSize) { + for pair, err := range tokenBlock(src.TokenTriplet(), lidAccumulator.add, consts.RegularBlockSize) { if err != nil { return err } @@ -173,7 +173,7 @@ func (s *IndexWriter) WriteTokenTriplet(tws, lws io.WriteSeeker, src Source) err } func (s *IndexWriter) finalizeLIDFile(w *writer, lidAccumulator *lidAccumulator) error { - if err := lidAccumulator.Finalize(); err != nil { + if err := lidAccumulator.finalize(); err != nil { return err } diff --git a/indexwriter/lid_accumulator.go b/indexwriter/lid_accumulator.go index f3b3740a..311311ef 100644 --- a/indexwriter/lid_accumulator.go +++ b/indexwriter/lid_accumulator.go @@ -31,12 +31,12 @@ func newLIDAccumulator( return a } -// Add processes LIDs of one token (must be called in TID order). +// add processes LIDs of one token (must be called in TID order). // // For each block that fills up, `onBlock` is called immediately // before the backing arrays are reset, so `onBlock` may read the // block data but must not retain references to it. 
-func (a *lidAccumulator) Add(lidsbuf []uint32) error { +func (a *lidAccumulator) add(lidsbuf []uint32) error { a.currentTID++ for _, lid := range lidsbuf { @@ -64,7 +64,7 @@ func (a *lidAccumulator) Add(lidsbuf []uint32) error { return nil } -func (a *lidAccumulator) Finalize() error { +func (a *lidAccumulator) finalize() error { return a.onBlock(a.finalizeBlock()) } diff --git a/indexwriter/writer.go b/indexwriter/writer.go index 1fb9909d..7746c1db 100644 --- a/indexwriter/writer.go +++ b/indexwriter/writer.go @@ -73,7 +73,7 @@ func newWriter(ws io.WriteSeeker) (*writer, error) { } func (w *writer) writeBlock(btype string, block indexBlock) error { - header, payload := block.Bin(int64(w.pos)) + header, payload := block.bin(int64(w.pos)) if _, err := w.wpayload.Write(payload); err != nil { return err } @@ -92,7 +92,7 @@ func (w *writer) writeBlock(btype string, block indexBlock) error { } func (w *writer) writeEmptyBlock() error { - header, _ := indexBlock{}.Bin(int64(w.pos)) + header, _ := indexBlock{}.bin(int64(w.pos)) w.wheader.Write(header) return nil } From c2c3fce46b6c45cb748ed9b37f906dbd90544834 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Thu, 9 Apr 2026 13:58:25 +0300 Subject: [PATCH 14/29] feat: k-way fraction merge --- compaction/heap.go | 1 + compaction/merge.go | 131 ++++++++++++++++++ compaction/merge_source.go | 223 +++++++++++++++++++++++++++++++ compaction/merge_source_test.go | 230 ++++++++++++++++++++++++++++++++ consts/consts.go | 1 + frac/sealed_source.go | 160 ++++++++++++++++++++++ 6 files changed, 746 insertions(+) create mode 100644 compaction/heap.go create mode 100644 compaction/merge.go create mode 100644 compaction/merge_source.go create mode 100644 compaction/merge_source_test.go create mode 100644 frac/sealed_source.go diff --git a/compaction/heap.go b/compaction/heap.go new file mode 100644 index 00000000..d1d3cde1 --- /dev/null +++ b/compaction/heap.go @@ -0,0 +1 @@ +package compaction diff --git a/compaction/merge.go 
b/compaction/merge.go new file mode 100644 index 00000000..23d60002 --- /dev/null +++ b/compaction/merge.go @@ -0,0 +1,131 @@ +package compaction + +import ( + "errors" + "fmt" + "os" + + "github.com/ozontech/seq-db/consts" + "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/indexwriter" +) + +func Merge(filename string, srcs ...Source) error { + mergeDocs(filename, srcs...) + + src := NewMergeSource(filename, srcs) + + // FIXME(dkharms): [common.SealParams] must be passed into [Merge] function. + writer := indexwriter.New(common.SealParams{ + IDsZstdLevel: 3, + LIDsZstdLevel: 3, + TokenListZstdLevel: 3, + DocsPositionsZstdLevel: 3, + TokenTableZstdLevel: 3, + DocBlocksZstdLevel: 3, + DocBlockSize: 3, + }) + + if err := createAndWrite( + filename+consts.OffsetsTmpFileSuffix, + filename+consts.OffsetsFileSuffix, + func(f *os.File) error { return writer.WriteOffsetsFile(f, src) }, + ); err != nil { + return err + } + + if err := createAndWrite( + filename+consts.IDTmpFileSuffix, + filename+consts.IDFileSuffix, + func(f *os.File) error { return writer.WriteIDFile(f, src) }, + ); err != nil { + return err + } + + if err := createAndWriteBoth( + filename+consts.TokenTmpFileSuffix, + filename+consts.TokenFileSuffix, + filename+consts.LIDTmpFileSuffix, + filename+consts.LIDFileSuffix, + func(tf, lf *os.File) error { return writer.WriteTokenTriplet(tf, lf, src) }, + ); err != nil { + return err + } + + if err := createAndWrite( + filename+consts.InfoTmpFileSuffix, + filename+consts.InfoFileSuffix, + func(f *os.File) error { return writer.WriteInfoFile(f, src) }, + ); err != nil { + return err + } + + return nil +} + +func syncAndClose(f *os.File) error { + if err := f.Sync(); err != nil { + f.Close() + return err + } + return f.Close() +} + +func createAndWrite(tmpPath, finalPath string, write func(*os.File) error) error { + f, err := os.Create(tmpPath) + if err != nil { + return err + } + + if err := errors.Join(write(f), syncAndClose(f)); err != nil 
{ + return err + } + + return os.Rename(tmpPath, finalPath) +} + +func createAndWriteBoth( + tmpPath1, finalPath1, + tmpPath2, finalPath2 string, + write func(*os.File, *os.File) error, +) error { + f1, err := os.Create(tmpPath1) + if err != nil { + return err + } + + f2, err := os.Create(tmpPath2) + if err != nil { + f1.Close() + return err + } + + writeErr := write(f1, f2) + if err := errors.Join(writeErr, syncAndClose(f1), syncAndClose(f2)); err != nil { + return err + } + + if err := os.Rename(tmpPath1, finalPath1); err != nil { + return err + } + + return os.Rename(tmpPath2, finalPath2) +} + +// FIXME(dkharms): Create buffered writer for file. +func mergeDocs(filename string, srcs ...Source) error { + return createAndWrite( + filename+consts.DocsTmpFileSuffix, + filename+consts.DocsFileSuffix, + func(f *os.File) error { + for _, src := range srcs { + for block := range src.DocBlock() { + if _, err := f.Write(block); err != nil { + return err + } + } + } + return nil + }, + ) +} diff --git a/compaction/merge_source.go b/compaction/merge_source.go new file mode 100644 index 00000000..663aac3a --- /dev/null +++ b/compaction/merge_source.go @@ -0,0 +1,223 @@ +package compaction + +import ( + "cmp" + "iter" + "slices" + "strings" + + "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/indexwriter" + "github.com/ozontech/seq-db/seq" +) + +type Source interface { + indexwriter.Source + DocBlock() iter.Seq[[]byte] +} + +type MergeSource struct { + filename string + + // sources is a slice of [sealing.Source] + // which provide view into underlying fractions. + sources []Source + + // docblockcount is populated during [MergeSource.BlockOffsets] call. + // This slice is used for changing block indexes in [seq.DocPos]. + docblockcount []int + + // lidmapping describes the transformation of lids + // after k-merge of several fractions. + // + // i-th index of lidmapping correponds to i-th fraction. 
+ // j-th index of i-th lidmapping corresponds to rename of i-th lid. + lidmapping [][]uint32 +} + +func NewMergeSource(filename string, sources []Source) *MergeSource { + lidmapping := make([][]uint32, len(sources)) + for i, src := range sources { + lidmapping[i] = make([]uint32, src.Info().DocsTotal+1) + } + return &MergeSource{sources: sources, lidmapping: lidmapping} +} + +// FIXME(dkharms): now this is just a placeholder. +// And info can be caculated after all merges. +func (s *MergeSource) Info() *common.Info { + var ( + docsOnDisk uint64 + indexOnDisk uint64 + ) + + for i := range s.sources { + docsOnDisk += s.sources[i].Info().DocsOnDisk + indexOnDisk += s.sources[i].Info().IndexOnDisk + } + + return common.NewInfo(s.filename, docsOnDisk, 0) +} + +func (s *MergeSource) BlockOffsets() []uint64 { + var ( + docsSize uint64 + offsets []uint64 + ) + + s.docblockcount = append(s.docblockcount, 0) + for i := 0; i < len(s.sources); i++ { + for _, offset := range s.sources[i].BlockOffsets() { + offsets = append(offsets, uint64(offset)+docsSize) + } + docsSize += s.sources[i].Info().DocsOnDisk + s.docblockcount = append(s.docblockcount, len(offsets)) + } + + return offsets +} + +func (s *MergeSource) ID() iter.Seq2[seq.ID, seq.DocPos] { + // FIXME(dkharms): For now, I will use stupid-simple linear scan for k-way merge. + // + // Its time complexity O(k*n) so it's not efficient enough if we compare it + // against time complexity of min-heap (which is O(n*log(k))) + // or another great data structure -- tournament tree -- which is O(n * log(k)) as well. + // + // However, tournament tree performs less comparisons than min-heap + // and it is around log(k) vs 2*log(k). + + type entry struct { + id seq.ID + docpos seq.DocPos + + sourceIdx int + oldlid uint32 + } + + var ids []entry + for i := 0; i < len(s.sources); i++ { + var lid uint32 + for id, docpos := range s.sources[i].ID() { + // Skip system [seq.ID]. 
+ if id == seq.SystemID { + lid += 1 + continue + } + + blockIdx, offset := docpos.Unpack() + docpos = seq.PackDocPos(uint32(s.docblockcount[i]+int(blockIdx)), offset) + ids = append(ids, entry{id, docpos, i, lid}) + + lid += 1 + } + } + + slices.SortFunc(ids, func(x, y entry) int { + if x.id.MID == y.id.MID { + return -cmp.Compare(x.id.RID, y.id.RID) + } + return -cmp.Compare(x.id.MID, y.id.MID) + }) + + for i, entry := range ids { + s.lidmapping[entry.sourceIdx][entry.oldlid] = uint32(i + 1) + } + + return func(yield func(seq.ID, seq.DocPos) bool) { + // Emit system id since we skipped all such ids previously. + if !yield(seq.SystemID, seq.SystemDocPos) { + return + } + + for _, v := range ids { + if !yield(v.id, v.docpos) { + return + } + } + } +} + +type key struct { + field string + token string +} + +type value struct { + idx int + lids []uint32 +} + +func (s *MergeSource) TokenTriplet() iter.Seq2[string, iter.Seq2[[]byte, []uint32]] { + // TODO(dkharms): Use heap or other more efficient data structure. + // For now, I'll just dump everything into one array. 
+ + values := make(map[key][]value) + for i := 0; i < len(s.sources); i++ { + for field, tokIter := range s.sources[i].TokenTriplet() { + for tok, lids := range tokIter { + k := key{field, string(tok)} + values[k] = append(values[k], value{i, slices.Clone(lids)}) + } + } + } + + var keys []key + for k := range values { + keys = append(keys, k) + } + + slices.SortFunc(keys, func(x, y key) int { + if x.field != y.field { + return strings.Compare(x.field, y.field) + } + return strings.Compare(x.token, y.token) + }) + + return func(yield func(string, iter.Seq2[[]byte, []uint32]) bool) { + var previous string + for _, k := range keys { + if k.field == previous { + continue + } + + if !yield(k.field, s.tokensForField(k.field, keys, values)) { + return + } + + previous = k.field + } + } +} + +func (s *MergeSource) tokensForField( + field string, keys []key, values map[key][]value, +) iter.Seq2[[]byte, []uint32] { + var filtered []key + for _, k := range keys { + if k.field == field { + filtered = append(filtered, k) + } + } + + return func(yield func([]byte, []uint32) bool) { + for _, k := range filtered { + var buf []uint32 + + for _, v := range values[k] { + for _, lid := range v.lids { + buf = append(buf, s.lidmapping[v.idx][lid]) + } + } + + slices.Sort(buf) + if !yield([]byte(k.token), buf) { + return + } + } + } +} + +func (s *MergeSource) LastError() error { + return nil +} diff --git a/compaction/merge_source_test.go b/compaction/merge_source_test.go new file mode 100644 index 00000000..c01b15a2 --- /dev/null +++ b/compaction/merge_source_test.go @@ -0,0 +1,230 @@ +package compaction + +import ( + "iter" + "slices" + "testing" + + "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/seq" + "github.com/stretchr/testify/require" +) + +// mockSealingSource is a test implementation of sealing.Source. 
+// +// IDs must be provided in descending order (MID DESC, RID DESC); the mock +// automatically prepends the system ID when iterating, matching the contract +// expected by MergeSource.ID(). +// +// Fields maps field name → token value → list of 1-based LIDs. +// Fields and tokens are yielded in sorted order. +type mockSealingSource struct { + ids []seq.ID + pos []seq.DocPos + blocks []uint64 + // docsOnDisk is the total compressed size of the .docs file, + // used by MergeSource to adjust block offsets across sources. + docsOnDisk uint64 + // fields maps field → token → lids (1-based). + fields map[string]map[string][]uint32 +} + +func (m *mockSealingSource) Info() *common.Info { + return &common.Info{ + DocsTotal: uint32(len(m.ids)), + DocsOnDisk: m.docsOnDisk, + } +} + +func (m *mockSealingSource) BlockOffsets() []uint64 { + return m.blocks +} + +func (m *mockSealingSource) ID() iter.Seq2[seq.ID, seq.DocPos] { + return func(yield func(seq.ID, seq.DocPos) bool) { + if !yield(seq.SystemID, seq.SystemDocPos) { + return + } + for i, id := range m.ids { + if !yield(id, m.pos[i]) { + return + } + } + } +} + +func (m *mockSealingSource) TokenTriplet() iter.Seq2[string, iter.Seq2[[]byte, []uint32]] { + fieldNames := make([]string, 0, len(m.fields)) + for f := range m.fields { + fieldNames = append(fieldNames, f) + } + slices.Sort(fieldNames) + + return func(yield func(string, iter.Seq2[[]byte, []uint32]) bool) { + for _, field := range fieldNames { + tokens := make([]string, 0, len(m.fields[field])) + for t := range m.fields[field] { + tokens = append(tokens, t) + } + slices.Sort(tokens) + + if !yield(field, func(yield func([]byte, []uint32) bool) { + for _, tok := range tokens { + if !yield([]byte(tok), m.fields[field][tok]) { + return + } + } + }) { + return + } + } + } +} + +func (m *mockSealingSource) DocBlock() iter.Seq[[]byte] { + return func(yield func([]byte) bool) { + if !yield(nil) { + return + } + } +} + +func (m *mockSealingSource) LastError() error { + 
return nil +} + +func TestMergeSource(t *testing.T) { + first := &mockSealingSource{ + ids: []seq.ID{ + {MID: 3}, + {MID: 2}, + {MID: 1}, + }, + + pos: []seq.DocPos{ + seq.PackDocPos(0, 0), + seq.PackDocPos(0, 1024), + seq.PackDocPos(0, 2048), + }, + + fields: map[string]map[string][]uint32{ + "level": { + "error": {1, 3}, + "info": {2, 3}, + }, + }, + + blocks: []uint64{0}, + docsOnDisk: 1024, + } + + second := &mockSealingSource{ + ids: []seq.ID{ + {MID: 6}, + {MID: 5}, + }, + + pos: []seq.DocPos{ + seq.PackDocPos(0, 0), + seq.PackDocPos(0, 2048), + }, + + fields: map[string]map[string][]uint32{ + "level": { + "debug": {1}, + "info": {2}, + }, + }, + + blocks: []uint64{0}, + docsOnDisk: 2048, + } + + source := NewMergeSource("inmemory", []Source{first, second}) + + { + // Validate correctness of [storage.DocBlock] calculation. + offsets := source.BlockOffsets() + require.Equal(t, []uint64{0, 1024}, offsets) + } + + { + var ( + ids []seq.ID + docpos []seq.DocPos + ) + + for id, dp := range source.ID() { + ids = append(ids, id) + docpos = append(docpos, dp) + } + + require.Equal(t, + []seq.ID{ + seq.SystemID, + // seq.ID from the second source + {MID: 6}, + {MID: 5}, + // seq.ID from the first source + {MID: 3}, + {MID: 2}, + {MID: 1}, + }, + ids, + ) + + require.Equal(t, + []seq.DocPos{ + seq.SystemDocPos, + // seq.DocPos from the second source + seq.PackDocPos(1, 0), seq.PackDocPos(1, 2048), + // seq.DocPos from the first source + seq.PackDocPos(0, 0), seq.PackDocPos(0, 1024), seq.PackDocPos(0, 2048), + }, + docpos, + ) + } + + { + var ( + fields []string + tokens [][]byte + lids [][]uint32 + ) + + for field, fieldIt := range source.TokenTriplet() { + fields = append(fields, field) + + for token, lidsbuf := range fieldIt { + tokens = append(tokens, token) + lids = append(lids, slices.Clone(lidsbuf)) + } + } + + // Both sources have the same and the only field + require.Equal(t, []string{"level"}, fields) + + // Ensure tokens are sorted in ascending order + 
require.Equal(t, + [][]byte{[]byte("debug"), []byte("error"), []byte("info")}, + tokens, + ) + + // Ensure correctness of lids remapping + // ----------------- + // seq.MID 6 5 3 2 1 + // seq.LID 1 2 3 4 5 + // ----------------- + require.Equal(t, + [][]uint32{ + // Sequence of [seq.LID] for token `debug` + {1}, + // Sequence of [seq.LID] for token `error` + {3, 5}, + // Sequence of [seq.LID] for token `info` + {2, 4, 5}, + }, + lids, + ) + } +} diff --git a/consts/consts.go b/consts/consts.go index 80aabbe3..f9f45037 100644 --- a/consts/consts.go +++ b/consts/consts.go @@ -56,6 +56,7 @@ const ( WalFileSuffix = ".wal" DocsFileSuffix = ".docs" + DocsTmpFileSuffix = "._docs" DocsDelFileSuffix = ".docs.del" SdocsFileSuffix = ".sdocs" diff --git a/frac/sealed_source.go b/frac/sealed_source.go new file mode 100644 index 00000000..633855a4 --- /dev/null +++ b/frac/sealed_source.go @@ -0,0 +1,160 @@ +package frac + +import ( + "iter" + "slices" + + "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac/sealed/lids" + "github.com/ozontech/seq-db/frac/sealed/seqids" + "github.com/ozontech/seq-db/frac/sealed/token" + "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/storage" +) + +// SealedSource implements [indexwriter.Source] for a sealed fraction. +// Used as input to [compaction.MergeSource] when compacting multiple fractions. 
+type SealedSource struct { + f *Sealed + + idsProvider *seqids.Provider + lidsLoader *lids.Loader + + tokenBlockLoader *token.BlockLoader + tokenTableLoader *token.TableLoader + + lastErr error +} + +func NewSealedSource(f *Sealed) *SealedSource { + f.load() + return &SealedSource{ + f: f, + idsProvider: seqids.NewProvider( + &f.idReader, + f.indexCache.MIDs, + f.indexCache.RIDs, + f.indexCache.Params, + &f.blocksData.IDsTable, + f.info.BinaryDataVer, + ), + lidsLoader: lids.NewLoader(&f.lidReader, f.indexCache.LIDs), + tokenBlockLoader: token.NewBlockLoader(f.BaseFileName, &f.tokenReader, f.indexCache.Tokens), + tokenTableLoader: token.NewTableLoader(f.BaseFileName, &f.tokenReader, f.indexCache.TokenTable), + } +} + +func (s *SealedSource) Info() *common.Info { + return s.f.info +} + +func (s *SealedSource) BlockOffsets() []uint64 { + return s.f.blocksData.BlocksOffsets +} + +func (s *SealedSource) ID() iter.Seq2[seq.ID, seq.DocPos] { + return func(yield func(seq.ID, seq.DocPos) bool) { + for lid := uint32(0); lid < s.f.blocksData.IDsTable.IDsTotal; lid++ { + mid, err := s.idsProvider.MID(seq.LID(lid)) + if err != nil { + s.lastErr = err + return + } + + rid, err := s.idsProvider.RID(seq.LID(lid)) + if err != nil { + s.lastErr = err + return + } + + pos, err := s.idsProvider.DocPos(seq.LID(lid)) + if err != nil { + s.lastErr = err + return + } + + if !yield(seq.ID{MID: mid, RID: rid}, pos) { + return + } + } + } +} + +func (s *SealedSource) TokenTriplet() iter.Seq2[string, iter.Seq2[[]byte, []uint32]] { + tokenTable := s.tokenTableLoader.Load() + + fields := make([]string, 0, len(tokenTable)) + for field := range tokenTable { + fields = append(fields, field) + } + + slices.Sort(fields) + return func(yield func(string, iter.Seq2[[]byte, []uint32]) bool) { + for _, field := range fields { + if !yield(field, s.tokensForField(field)) { + return + } + } + } +} + +func (s *SealedSource) tokensForField(field string) iter.Seq2[[]byte, []uint32] { + lidsTable := 
s.f.blocksData.LIDsTable + tokenTable := s.tokenTableLoader.Load() + + var lidsbuf []uint32 + return func(yield func([]byte, []uint32) bool) { + for _, entry := range tokenTable[field].Entries { + block := s.tokenBlockLoader.Load(entry.BlockIndex) + + for tid := entry.StartTID; tid < entry.StartTID+entry.ValCount; tid++ { + lidsbuf = lidsbuf[:0] + + tokenVal := block.GetToken(entry.GetIndexInTokensBlock(tid)) + firstBlock := lidsTable.GetFirstBlockIndexForTID(tid) + lastBlock := lidsTable.GetLastBlockIndexForTID(tid) + + for bi := firstBlock; bi <= lastBlock; bi++ { + lidBlock, err := s.lidsLoader.GetLIDsBlock(bi) + if err != nil { + s.lastErr = err + return + } + + chunkIdx := lidsTable.GetChunkIndex(bi, tid) + lidsbuf = append(lidsbuf, lidBlock.LIDs[lidBlock.Offsets[chunkIdx]:lidBlock.Offsets[chunkIdx+1]]...) + } + + if !yield(tokenVal, lidsbuf) { + return + } + } + } + } +} + +func (s *SealedSource) DocBlock() iter.Seq[[]byte] { + return func(yield func([]byte) bool) { + // We do not want to cache payload of DocBlock because + // it will just pollute cache and cause unnecessary evictions. + r := storage.NewDocBlocksReader(s.f.readLimiter, s.f.docsFile) + + for _, offset := range s.f.blocksData.BlocksOffsets { + // Read DocBlock payload (including its header) but do not decompress it. + // Caller of [SealedSource.DocBlock] will decide whether it requires decompressed data. 
+ payload, _, err := r.ReadDocBlock(int64(offset)) + if err != nil { + s.lastErr = err + return + } + + if !yield(payload) { + return + } + } + } +} + +func (s *SealedSource) LastError() error { + return s.lastErr +} From eac4423600579374c78901173505b876672c8472 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Fri, 10 Apr 2026 13:43:43 +0300 Subject: [PATCH 15/29] feat: calculate information correctly --- compaction/merge.go | 48 ++++++++++++++++++++++++++------- compaction/merge_source.go | 44 +++++++++++++++++++++--------- compaction/merge_source_test.go | 36 ++++++++++++++++++++----- 3 files changed, 101 insertions(+), 27 deletions(-) diff --git a/compaction/merge.go b/compaction/merge.go index 23d60002..600e929f 100644 --- a/compaction/merge.go +++ b/compaction/merge.go @@ -2,17 +2,15 @@ package compaction import ( "errors" - "fmt" "os" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/indexwriter" ) -func Merge(filename string, srcs ...Source) error { - mergeDocs(filename, srcs...) - +func Merge(filename string, srcs ...Source) (*sealed.PreloadedData, error) { src := NewMergeSource(filename, srcs) // FIXME(dkharms): [common.SealParams] must be passed into [Merge] function. 
@@ -31,7 +29,7 @@ func Merge(filename string, srcs ...Source) error { filename+consts.OffsetsFileSuffix, func(f *os.File) error { return writer.WriteOffsetsFile(f, src) }, ); err != nil { - return err + return nil, err } if err := createAndWrite( @@ -39,7 +37,7 @@ func Merge(filename string, srcs ...Source) error { filename+consts.IDFileSuffix, func(f *os.File) error { return writer.WriteIDFile(f, src) }, ); err != nil { - return err + return nil, err } if err := createAndWriteBoth( @@ -49,7 +47,7 @@ func Merge(filename string, srcs ...Source) error { filename+consts.LIDFileSuffix, func(tf, lf *os.File) error { return writer.WriteTokenTriplet(tf, lf, src) }, ); err != nil { - return err + return nil, err } if err := createAndWrite( @@ -57,10 +55,42 @@ func Merge(filename string, srcs ...Source) error { filename+consts.InfoFileSuffix, func(f *os.File) error { return writer.WriteInfoFile(f, src) }, ); err != nil { - return err + return nil, err + } + + if err := mergeDocs(filename, srcs...); err != nil { + return nil, err + } + + info := src.Info() + info.IndexOnDisk = 0 + + for _, suffix := range []string{ + consts.InfoFileSuffix, + consts.TokenFileSuffix, + consts.OffsetsFileSuffix, + consts.IDFileSuffix, + consts.LIDFileSuffix, + } { + st, err := os.Stat(info.Path + suffix) + if err != nil { + return nil, err + } + info.IndexOnDisk += uint64(st.Size()) + } + + lidsTable := writer.LIDsTable() + preloaded := &sealed.PreloadedData{ + Info: info, + TokenTable: writer.TokenTable(), + BlocksData: sealed.BlocksData{ + LIDsTable: &lidsTable, + IDsTable: writer.IDsTable(), + BlocksOffsets: src.BlockOffsets(), + }, } - return nil + return preloaded, nil } func syncAndClose(f *os.File) error { diff --git a/compaction/merge_source.go b/compaction/merge_source.go index 663aac3a..f8ed6796 100644 --- a/compaction/merge_source.go +++ b/compaction/merge_source.go @@ -3,6 +3,7 @@ package compaction import ( "cmp" "iter" + "math" "slices" "strings" @@ -18,6 +19,7 @@ type Source 
interface { type MergeSource struct { filename string + info *common.Info // sources is a slice of [sealing.Source] // which provide view into underlying fractions. @@ -33,6 +35,8 @@ type MergeSource struct { // i-th index of lidmapping correponds to i-th fraction. // j-th index of i-th lidmapping corresponds to rename of i-th lid. lidmapping [][]uint32 + + from, to seq.MID } func NewMergeSource(filename string, sources []Source) *MergeSource { @@ -40,23 +44,36 @@ func NewMergeSource(filename string, sources []Source) *MergeSource { for i, src := range sources { lidmapping[i] = make([]uint32, src.Info().DocsTotal+1) } - return &MergeSource{sources: sources, lidmapping: lidmapping} + + info := common.NewInfo(filename, 0, 0) + info.SealingTime = info.CreationTime + + return &MergeSource{ + info: info, + filename: filename, + + sources: sources, + lidmapping: lidmapping, + + from: math.MaxUint64, to: 0, + } } -// FIXME(dkharms): now this is just a placeholder. -// And info can be caculated after all merges. func (s *MergeSource) Info() *common.Info { - var ( - docsOnDisk uint64 - indexOnDisk uint64 - ) - for i := range s.sources { - docsOnDisk += s.sources[i].Info().DocsOnDisk - indexOnDisk += s.sources[i].Info().IndexOnDisk + sinfo := s.sources[i].Info() + + s.info.DocsRaw += sinfo.DocsRaw + s.info.DocsTotal += sinfo.DocsTotal + s.info.DocsOnDisk += sinfo.DocsOnDisk + + // NOTE(dkharms): [IndexOnDisk] is calculated later. } - return common.NewInfo(s.filename, docsOnDisk, 0) + s.info.From = s.from + s.info.To = s.to + + return s.info } func (s *MergeSource) BlockOffsets() []uint64 { @@ -82,7 +99,7 @@ func (s *MergeSource) ID() iter.Seq2[seq.ID, seq.DocPos] { // // Its time complexity O(k*n) so it's not efficient enough if we compare it // against time complexity of min-heap (which is O(n*log(k))) - // or another great data structure -- tournament tree -- which is O(n * log(k)) as well. 
+ // or another great data structure -- tournament tree -- which is O(n*log(k)) as well. // // However, tournament tree performs less comparisons than min-heap // and it is around log(k) vs 2*log(k). @@ -110,6 +127,9 @@ func (s *MergeSource) ID() iter.Seq2[seq.ID, seq.DocPos] { ids = append(ids, entry{id, docpos, i, lid}) lid += 1 + + s.from = min(s.from, id.MID) + s.to = max(s.to, id.MID) } } diff --git a/compaction/merge_source_test.go b/compaction/merge_source_test.go index c01b15a2..df471c8f 100644 --- a/compaction/merge_source_test.go +++ b/compaction/merge_source_test.go @@ -1,6 +1,7 @@ package compaction import ( + "cmp" "iter" "slices" "testing" @@ -31,8 +32,17 @@ type mockSealingSource struct { func (m *mockSealingSource) Info() *common.Info { return &common.Info{ + DocsRaw: m.docsOnDisk, DocsTotal: uint32(len(m.ids)), DocsOnDisk: m.docsOnDisk, + + From: slices.MinFunc(m.ids, func(x, y seq.ID) int { + return cmp.Compare(x.MID, y.MID) + }).MID, + + To: slices.MaxFunc(m.ids, func(x, y seq.ID) int { + return cmp.Compare(x.MID, y.MID) + }).MID, } } @@ -142,13 +152,13 @@ func TestMergeSource(t *testing.T) { source := NewMergeSource("inmemory", []Source{first, second}) - { + t.Run("offsets", func(t *testing.T) { // Validate correctness of [storage.DocBlock] calculation. offsets := source.BlockOffsets() require.Equal(t, []uint64{0, 1024}, offsets) - } + }) - { + t.Run("ids", func(t *testing.T) { var ( ids []seq.ID docpos []seq.DocPos @@ -183,9 +193,9 @@ func TestMergeSource(t *testing.T) { }, docpos, ) - } + }) - { + t.Run("tokens-lids", func(t *testing.T) { var ( fields []string tokens [][]byte @@ -226,5 +236,19 @@ func TestMergeSource(t *testing.T) { }, lids, ) - } + }) + + t.Run("info", func(t *testing.T) { + merged := source.Info() + finfo, sinfo := first.Info(), second.Info() + + // Validate correctness of fraction time-range. 
+ require.Equal(t, merged.From, min(finfo.From, sinfo.From)) + require.Equal(t, merged.To, max(finfo.To, sinfo.To)) + + // Validate correctness of total documents of merged fractions. + require.Equal(t, merged.DocsTotal, finfo.DocsTotal+sinfo.DocsTotal) + require.Equal(t, merged.DocsOnDisk, finfo.DocsOnDisk+sinfo.DocsOnDisk) + require.Equal(t, merged.DocsRaw, finfo.DocsRaw+sinfo.DocsRaw) + }) } From 465bc2070be43e5d6b200b05d44781b6a86aaa9b Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Mon, 13 Apr 2026 17:45:47 +0300 Subject: [PATCH 16/29] feat: use linear scan for k-way merge --- compaction/heap.go | 1 - compaction/merge.go | 36 ++-- compaction/merge_source.go | 331 +++++++++++++++++++++++--------- compaction/merge_source_test.go | 114 +++++++++-- frac/sealed_source.go | 8 +- seq/seq.go | 10 +- 6 files changed, 369 insertions(+), 131 deletions(-) delete mode 100644 compaction/heap.go diff --git a/compaction/heap.go b/compaction/heap.go deleted file mode 100644 index d1d3cde1..00000000 --- a/compaction/heap.go +++ /dev/null @@ -1 +0,0 @@ -package compaction diff --git a/compaction/merge.go b/compaction/merge.go index 600e929f..1ff9b5a6 100644 --- a/compaction/merge.go +++ b/compaction/merge.go @@ -4,26 +4,21 @@ import ( "errors" "os" + "github.com/alecthomas/units" + "go.uber.org/zap" + + "github.com/ozontech/seq-db/bytespool" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/indexwriter" + "github.com/ozontech/seq-db/logger" ) -func Merge(filename string, srcs ...Source) (*sealed.PreloadedData, error) { +func Merge(filename string, params common.SealParams, srcs ...Source) (*sealed.PreloadedData, error) { + writer := indexwriter.New(params) src := NewMergeSource(filename, srcs) - // FIXME(dkharms): [common.SealParams] must be passed into [Merge] function. 
- writer := indexwriter.New(common.SealParams{ - IDsZstdLevel: 3, - LIDsZstdLevel: 3, - TokenListZstdLevel: 3, - DocsPositionsZstdLevel: 3, - TokenTableZstdLevel: 3, - DocBlocksZstdLevel: 3, - DocBlockSize: 3, - }) - if err := createAndWrite( filename+consts.OffsetsTmpFileSuffix, filename+consts.OffsetsFileSuffix, @@ -142,19 +137,32 @@ func createAndWriteBoth( return os.Rename(tmpPath2, finalPath2) } -// FIXME(dkharms): Create buffered writer for file. func mergeDocs(filename string, srcs ...Source) error { return createAndWrite( filename+consts.DocsTmpFileSuffix, filename+consts.DocsFileSuffix, func(f *os.File) error { + w := bytespool.AcquireWriterSize(f, int(units.MiB)) + + defer func() { + if err := w.Flush(); err != nil { + logger.Error( + "cannot flush compacted .docs file", + zap.Error(err), + zap.String("fraction", filename), + ) + } + bytespool.ReleaseWriter(w) + }() + for _, src := range srcs { for block := range src.DocBlock() { - if _, err := f.Write(block); err != nil { + if _, err := w.Write(block); err != nil { return err } } } + return nil }, ) diff --git a/compaction/merge_source.go b/compaction/merge_source.go index f8ed6796..19768c93 100644 --- a/compaction/merge_source.go +++ b/compaction/merge_source.go @@ -1,11 +1,10 @@ package compaction import ( - "cmp" + "bytes" "iter" "math" "slices" - "strings" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/indexwriter" @@ -25,22 +24,23 @@ type MergeSource struct { // which provide view into underlying fractions. sources []Source - // docblockcount is populated during [MergeSource.BlockOffsets] call. + // docBlockCount is populated during [MergeSource.BlockOffsets] call. // This slice is used for changing block indexes in [seq.DocPos]. - docblockcount []int + docBlockCount []int - // lidmapping describes the transformation of lids + // lidMapping describes the transformation of lids // after k-merge of several fractions. // - // i-th index of lidmapping correponds to i-th fraction. 
- // j-th index of i-th lidmapping corresponds to rename of i-th lid. - lidmapping [][]uint32 + // i-th index of [lidMapping] corresponds to i-th fraction. + // j-th index of i-th [lidMapping] corresponds to rename of j-th lid. + lidMapping [][]uint32 from, to seq.MID } func NewMergeSource(filename string, sources []Source) *MergeSource { lidmapping := make([][]uint32, len(sources)) + for i, src := range sources { lidmapping[i] = make([]uint32, src.Info().DocsTotal+1) } @@ -53,7 +53,7 @@ func NewMergeSource(filename string, sources []Source) *MergeSource { filename: filename, sources: sources, - lidmapping: lidmapping, + lidMapping: lidmapping, from: math.MaxUint64, to: 0, } @@ -82,20 +82,21 @@ func (s *MergeSource) BlockOffsets() []uint64 { offsets []uint64 ) - s.docblockcount = append(s.docblockcount, 0) + // Initially s.docBlockCount + s.docBlockCount = append(s.docBlockCount, 0) for i := 0; i < len(s.sources); i++ { for _, offset := range s.sources[i].BlockOffsets() { offsets = append(offsets, uint64(offset)+docsSize) } docsSize += s.sources[i].Info().DocsOnDisk - s.docblockcount = append(s.docblockcount, len(offsets)) + s.docBlockCount = append(s.docBlockCount, len(offsets)) } return offsets } func (s *MergeSource) ID() iter.Seq2[seq.ID, seq.DocPos] { - // FIXME(dkharms): For now, I will use stupid-simple linear scan for k-way merge. + // TODO(dkharms): For now, I will use stupid-simple linear scan for k-way merge. // // Its time complexity O(k*n) so it's not efficient enough if we compare it // against time complexity of min-heap (which is O(n*log(k))) @@ -104,136 +105,284 @@ func (s *MergeSource) ID() iter.Seq2[seq.ID, seq.DocPos] { // However, tournament tree performs less comparisons than min-heap // and it is around log(k) vs 2*log(k). 
- type entry struct { + type cursor struct { + next func() (seq.ID, seq.DocPos, bool) + stop func() + id seq.ID - docpos seq.DocPos + docPos seq.DocPos + lidOld uint32 - sourceIdx int - oldlid uint32 + ok bool } - var ids []entry - for i := 0; i < len(s.sources); i++ { - var lid uint32 - for id, docpos := range s.sources[i].ID() { - // Skip system [seq.ID]. - if id == seq.SystemID { - lid += 1 - continue - } - - blockIdx, offset := docpos.Unpack() - docpos = seq.PackDocPos(uint32(s.docblockcount[i]+int(blockIdx)), offset) - ids = append(ids, entry{id, docpos, i, lid}) + return func(yield func(seq.ID, seq.DocPos) bool) { + var cursors []cursor - lid += 1 + for i := range s.sources { + src := s.sources[i] + next, stop := iter.Pull2(src.ID()) - s.from = min(s.from, id.MID) - s.to = max(s.to, id.MID) - } - } + // Skip [seq.SystemID] and [seq.SystemDocPos]. + _, _, _ = next() - slices.SortFunc(ids, func(x, y entry) int { - if x.id.MID == y.id.MID { - return -cmp.Compare(x.id.RID, y.id.RID) + id, docpos, ok := next() + cursors = append(cursors, cursor{ + next: next, stop: stop, + id: id, docPos: docpos, lidOld: 1, + ok: ok, + }) } - return -cmp.Compare(x.id.MID, y.id.MID) - }) - for i, entry := range ids { - s.lidmapping[entry.sourceIdx][entry.oldlid] = uint32(i + 1) - } + defer func() { + for _, c := range cursors { + c.stop() + } + }() - return func(yield func(seq.ID, seq.DocPos) bool) { - // Emit system id since we skipped all such ids previously. + lid := uint32(1) + // We've previosly dropped [seq.SystemID] from + // iterators however we do have to emit one such id. if !yield(seq.SystemID, seq.SystemDocPos) { return } - for _, v := range ids { - if !yield(v.id, v.docpos) { + for { + var ( + id seq.ID = seq.MinID + idx int = -1 + ) + + for i, c := range cursors { + // We exhausted i-th cursor so there is nothing to pull. + if !c.ok { + continue + } + + if seq.Less(id, c.id) { + id = c.id + idx = i + } + } + + // All pull-iterators are exhausted. 
+ // Close all iterators and return. + if idx == -1 { + break + } + + c := cursors[idx] + minid, mindocpos, oldlid := c.id, c.docPos, c.lidOld + + blockIdx, offset := mindocpos.Unpack() + mindocpos = seq.PackDocPos(uint32(s.docBlockCount[idx]+int(blockIdx)), offset) + + if !yield(minid, mindocpos) { return } + + // Rename lid from picked cursor to the new value. + s.lidMapping[idx][oldlid] = lid + + c.id, c.docPos, c.ok = c.next() + c.lidOld += 1 + + s.from = min(s.from, minid.MID) + s.to = max(s.to, minid.MID) + + lid += 1 + cursors[idx] = c } } } -type key struct { - field string - token string -} +func (s *MergeSource) TokenTriplet() iter.Seq2[string, iter.Seq2[[]byte, []uint32]] { + // TODO(dkharms): For now, I will use stupid-simple linear scan for k-way merge. + // + // Its time complexity O(k*n) so it's not efficient enough if we compare it + // against time complexity of min-heap (which is O(n*log(k))) + // or another great data structure -- tournament tree -- which is O(n*log(k)) as well. + // + // However, tournament tree performs less comparisons than min-heap + // and it is around log(k) vs 2*log(k). -type value struct { - idx int - lids []uint32 -} + type cursor struct { + next func() (string, iter.Seq2[[]byte, []uint32], bool) + stop func() -func (s *MergeSource) TokenTriplet() iter.Seq2[string, iter.Seq2[[]byte, []uint32]] { - // TODO(dkharms): Use heap or other more efficient data structure. - // For now, I'll just dump everything into one array. 
+ field string + tokIt iter.Seq2[[]byte, []uint32] - values := make(map[key][]value) - for i := 0; i < len(s.sources); i++ { - for field, tokIter := range s.sources[i].TokenTriplet() { - for tok, lids := range tokIter { - k := key{field, string(tok)} - values[k] = append(values[k], value{i, slices.Clone(lids)}) + ok bool + } + + minimal := func(cursors []cursor) (string, bool) { + var ( + set bool + field string + ) + + for _, c := range cursors { + if !c.ok { + continue + } + + if !set { + field = c.field + set = true + continue } + + field = min(field, c.field) } - } - var keys []key - for k := range values { - keys = append(keys, k) + return field, set } - slices.SortFunc(keys, func(x, y key) int { - if x.field != y.field { - return strings.Compare(x.field, y.field) + return func(yield func(string, iter.Seq2[[]byte, []uint32]) bool) { + var cursors []cursor + + for i := range s.sources { + src := s.sources[i] + + next, stop := iter.Pull2(src.TokenTriplet()) + field, tokIt, has := next() + + cursors = append(cursors, cursor{ + next: next, stop: stop, + field: field, tokIt: tokIt, + ok: has, + }) } - return strings.Compare(x.token, y.token) - }) - return func(yield func(string, iter.Seq2[[]byte, []uint32]) bool) { - var previous string - for _, k := range keys { - if k.field == previous { - continue + defer func() { + for _, c := range cursors { + c.stop() + } + }() + + for { + field, ok := minimal(cursors) + if !ok { + break } - if !yield(k.field, s.tokensForField(k.field, keys, values)) { + var ( + idxs []int + iters []iter.Seq2[[]byte, []uint32] + ) + + for i, c := range cursors { + if !c.ok || c.field != field { + continue + } + + idxs = append(idxs, i) + iters = append(iters, c.tokIt) + } + + if !yield(field, s.tokensForField(idxs, iters)) { return } - previous = k.field + // Advance all cursors that were on this field. 
+ for _, idx := range idxs { + c := cursors[idx] + c.field, c.tokIt, c.ok = c.next() + cursors[idx] = c + } } } } func (s *MergeSource) tokensForField( - field string, keys []key, values map[key][]value, + idxs []int, iters []iter.Seq2[[]byte, []uint32], ) iter.Seq2[[]byte, []uint32] { - var filtered []key - for _, k := range keys { - if k.field == field { - filtered = append(filtered, k) + type cursor struct { + next func() ([]byte, []uint32, bool) + stop func() + + idx int + token []byte + lids []uint32 + + ok bool + } + + minimal := func(cursors []cursor) ([]byte, bool) { + var ( + set bool + token []byte + ) + + for _, c := range cursors { + if !c.ok { + continue + } + + if !set { + token = c.token + set = true + continue + } + + if bytes.Compare(c.token, token) < 0 { + token = c.token + } } + + return token, set } + // NB: This buffer will be reused across + // all calls within current field. + var lidRenamed []uint32 + return func(yield func([]byte, []uint32) bool) { - for _, k := range filtered { - var buf []uint32 + var cursors []cursor + + for i := range iters { + next, stop := iter.Pull2(iters[i]) + token, lids, ok := next() + cursors = append(cursors, cursor{ + next: next, stop: stop, + idx: idxs[i], token: token, lids: lids, + ok: ok, + }) + } + + defer func() { + for _, c := range cursors { + c.stop() + } + }() + + for { + token, ok := minimal(cursors) + if !ok { + break + } + + // Collect and remap lids from all cursors at this token, then advance them. 
+ for i, c := range cursors { + if !c.ok || !bytes.Equal(c.token, token) { + continue + } - for _, v := range values[k] { - for _, lid := range v.lids { - buf = append(buf, s.lidmapping[v.idx][lid]) + for _, lid := range c.lids { + lidRenamed = append(lidRenamed, s.lidMapping[c.idx][lid]) } + + c.token, c.lids, c.ok = c.next() + cursors[i] = c } - slices.Sort(buf) - if !yield([]byte(k.token), buf) { + slices.Sort(lidRenamed) + if !yield(token, lidRenamed) { return } + + lidRenamed = lidRenamed[:0] } } } diff --git a/compaction/merge_source_test.go b/compaction/merge_source_test.go index df471c8f..5a045503 100644 --- a/compaction/merge_source_test.go +++ b/compaction/merge_source_test.go @@ -2,32 +2,24 @@ package compaction import ( "cmp" + "fmt" "iter" + "math/rand" "slices" "testing" + "github.com/stretchr/testify/require" + "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/seq" - "github.com/stretchr/testify/require" ) -// mockSealingSource is a test implementation of sealing.Source. -// -// IDs must be provided in descending order (MID DESC, RID DESC); the mock -// automatically prepends the system ID when iterating, matching the contract -// expected by MergeSource.ID(). -// -// Fields maps field name → token value → list of 1-based LIDs. -// Fields and tokens are yielded in sorted order. type mockSealingSource struct { - ids []seq.ID - pos []seq.DocPos - blocks []uint64 - // docsOnDisk is the total compressed size of the .docs file, - // used by MergeSource to adjust block offsets across sources. + ids []seq.ID + pos []seq.DocPos + blocks []uint64 docsOnDisk uint64 - // fields maps field → token → lids (1-based). 
- fields map[string]map[string][]uint32 + fields map[string]map[string][]uint32 } func (m *mockSealingSource) Info() *common.Info { @@ -222,8 +214,9 @@ func TestMergeSource(t *testing.T) { // Ensure correctness of lids remapping // ----------------- - // seq.MID 6 5 3 2 1 - // seq.LID 1 2 3 4 5 + // seq.MID 6 5 | 3 2 1 + // seq.LID (old) 1 2 | 1 2 3 + // seq.LID (new) 1 2 | 3 4 5 // ----------------- require.Equal(t, [][]uint32{ @@ -252,3 +245,88 @@ func TestMergeSource(t *testing.T) { require.Equal(t, merged.DocsRaw, finfo.DocsRaw+sinfo.DocsRaw) }) } + +func BenchmarkMergeSource(b *testing.B) { + const ( + numSources = 4 + docsPerSource = 512_000 + + // Total pairs of (field, token) will be + // [numFields] * [numTokens]. + numFields = 512 + numTokens = 16384 + ) + + rng := rand.New(rand.NewSource(42)) + + fieldNames := make([]string, numFields) + for i := range fieldNames { + fieldNames[i] = fmt.Sprintf("field-%d", i) + } + + tokenNames := make([]string, numTokens) + for i := range tokenNames { + tokenNames[i] = fmt.Sprintf("token-%d", i) + } + + makeSource := func(midOffset seq.MID) Source { + ids := make([]seq.ID, docsPerSource) + pos := make([]seq.DocPos, docsPerSource) + + for j := range ids { + // IDs must be in descending MID order within each source. + ids[j] = seq.ID{MID: midOffset + seq.MID(docsPerSource-j)} + pos[j] = seq.PackDocPos(0, uint64(j*64)) + } + + // Assign each lid to a random (field, token) pair from the vocabulary + // so that total lids per source equals [docsPerSource]. 
+ fields := make(map[string]map[string][]uint32) + for lid := uint32(1); lid <= uint32(docsPerSource); lid++ { + field := fieldNames[rng.Intn(numFields)] + token := tokenNames[rng.Intn(numTokens)] + + if fields[field] == nil { + fields[field] = make(map[string][]uint32) + } + + fields[field][token] = append(fields[field][token], lid) + } + + for _, tokens := range fields { + for tok, lids := range tokens { + slices.Sort(lids) + tokens[tok] = lids + } + } + + return &mockSealingSource{ + ids: ids, + pos: pos, + blocks: []uint64{0}, + docsOnDisk: docsPerSource * 64, + fields: fields, + } + } + + sources := make([]Source, numSources) + for i := range sources { + sources[i] = makeSource(seq.MID(i * docsPerSource)) + } + + b.ResetTimer() + b.ReportAllocs() + + for b.Loop() { + ms := NewMergeSource("bench", sources) + + ms.BlockOffsets() + for range ms.ID() { + } + + for _, tokIt := range ms.TokenTriplet() { + for range tokIt { + } + } + } +} diff --git a/frac/sealed_source.go b/frac/sealed_source.go index 633855a4..29a3d31b 100644 --- a/frac/sealed_source.go +++ b/frac/sealed_source.go @@ -102,13 +102,13 @@ func (s *SealedSource) tokensForField(field string) iter.Seq2[[]byte, []uint32] lidsTable := s.f.blocksData.LIDsTable tokenTable := s.tokenTableLoader.Load() - var lidsbuf []uint32 + var lidsBuf []uint32 return func(yield func([]byte, []uint32) bool) { for _, entry := range tokenTable[field].Entries { block := s.tokenBlockLoader.Load(entry.BlockIndex) for tid := entry.StartTID; tid < entry.StartTID+entry.ValCount; tid++ { - lidsbuf = lidsbuf[:0] + lidsBuf = lidsBuf[:0] tokenVal := block.GetToken(entry.GetIndexInTokensBlock(tid)) firstBlock := lidsTable.GetFirstBlockIndexForTID(tid) @@ -122,10 +122,10 @@ func (s *SealedSource) tokensForField(field string) iter.Seq2[[]byte, []uint32] } chunkIdx := lidsTable.GetChunkIndex(bi, tid) - lidsbuf = append(lidsbuf, lidBlock.LIDs[lidBlock.Offsets[chunkIdx]:lidBlock.Offsets[chunkIdx+1]]...) 
+ lidsBuf = append(lidsBuf, lidBlock.LIDs[lidBlock.Offsets[chunkIdx]:lidBlock.Offsets[chunkIdx+1]]...) } - if !yield(tokenVal, lidsbuf) { + if !yield(tokenVal, lidsBuf) { return } } diff --git a/seq/seq.go b/seq/seq.go index adae4265..d3557a16 100644 --- a/seq/seq.go +++ b/seq/seq.go @@ -11,9 +11,13 @@ import ( ) var ( - SystemMID MID = math.MaxUint64 - SystemRID RID = math.MaxUint64 - SystemID ID = ID{SystemMID, SystemRID} + SystemMID MID = math.MaxUint64 + SystemRID RID = math.MaxUint64 + + SystemID ID = ID{SystemMID, SystemRID} + MinID ID = ID{0, 0} + MaxID ID = SystemID + SystemDocPos DocPos = DocPos(0) ) From 9944b81d9b2769c9593a3c87cba802c1cdeb7d99 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Mon, 27 Apr 2026 14:14:42 +0300 Subject: [PATCH 17/29] fix: calculate offsets and info once --- compaction/merge.go | 90 +++++++-------- compaction/merge_source.go | 191 +++++++++++++++++++------------- compaction/merge_source_test.go | 36 +++--- frac/sealed_source.go | 45 ++++---- 4 files changed, 198 insertions(+), 164 deletions(-) diff --git a/compaction/merge.go b/compaction/merge.go index 1ff9b5a6..b21cd9c3 100644 --- a/compaction/merge.go +++ b/compaction/merge.go @@ -4,15 +4,10 @@ import ( "errors" "os" - "github.com/alecthomas/units" - "go.uber.org/zap" - - "github.com/ozontech/seq-db/bytespool" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/indexwriter" - "github.com/ozontech/seq-db/logger" ) func Merge(filename string, params common.SealParams, srcs ...Source) (*sealed.PreloadedData, error) { @@ -88,6 +83,33 @@ func Merge(filename string, params common.SealParams, srcs ...Source) (*sealed.P return preloaded, nil } +func mergeDocs(filename string, srcs ...Source) error { + return createAndWrite( + filename+consts.DocsTmpFileSuffix, + filename+consts.DocsFileSuffix, + func(f *os.File) error { + var docsSize uint64 + + for _, src := range srcs { + for loc, 
err := range src.DocBlock() { + if err != nil { + return err + } + + payload, offset := loc.First, loc.Second + if _, err := f.WriteAt(payload, int64(offset+docsSize)); err != nil { + return err + } + } + + docsSize += src.Info().DocsOnDisk + } + + return nil + }, + ) +} + func syncAndClose(f *os.File) error { if err := f.Sync(); err != nil { f.Close() @@ -96,8 +118,11 @@ func syncAndClose(f *os.File) error { return f.Close() } -func createAndWrite(tmpPath, finalPath string, write func(*os.File) error) error { - f, err := os.Create(tmpPath) +func createAndWrite( + tmp, final string, + write func(*os.File) error, +) error { + f, err := os.Create(tmp) if err != nil { return err } @@ -106,64 +131,33 @@ func createAndWrite(tmpPath, finalPath string, write func(*os.File) error) error return err } - return os.Rename(tmpPath, finalPath) + return os.Rename(tmp, final) } func createAndWriteBoth( - tmpPath1, finalPath1, - tmpPath2, finalPath2 string, + atmp, afinal, + btmp, bfinal string, write func(*os.File, *os.File) error, ) error { - f1, err := os.Create(tmpPath1) + a, err := os.Create(atmp) if err != nil { return err } - f2, err := os.Create(tmpPath2) + b, err := os.Create(btmp) if err != nil { - f1.Close() + a.Close() return err } - writeErr := write(f1, f2) - if err := errors.Join(writeErr, syncAndClose(f1), syncAndClose(f2)); err != nil { + writeErr := write(a, b) + if err := errors.Join(writeErr, syncAndClose(a), syncAndClose(b)); err != nil { return err } - if err := os.Rename(tmpPath1, finalPath1); err != nil { + if err := os.Rename(atmp, afinal); err != nil { return err } - return os.Rename(tmpPath2, finalPath2) -} - -func mergeDocs(filename string, srcs ...Source) error { - return createAndWrite( - filename+consts.DocsTmpFileSuffix, - filename+consts.DocsFileSuffix, - func(f *os.File) error { - w := bytespool.AcquireWriterSize(f, int(units.MiB)) - - defer func() { - if err := w.Flush(); err != nil { - logger.Error( - "cannot flush compacted .docs file", - 
zap.Error(err), - zap.String("fraction", filename), - ) - } - bytespool.ReleaseWriter(w) - }() - - for _, src := range srcs { - for block := range src.DocBlock() { - if _, err := w.Write(block); err != nil { - return err - } - } - } - - return nil - }, - ) + return os.Rename(btmp, bfinal) } diff --git a/compaction/merge_source.go b/compaction/merge_source.go index 19768c93..d9f16428 100644 --- a/compaction/merge_source.go +++ b/compaction/merge_source.go @@ -5,25 +5,39 @@ import ( "iter" "math" "slices" + "sync" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/indexwriter" "github.com/ozontech/seq-db/seq" + "github.com/ozontech/seq-db/util" +) + +type ( + Document = util.Pair[seq.ID, []byte] + DocBlockLocation = util.Pair[[]byte, uint64] + TokenPosting = util.Pair[[]byte, []uint32] + DocLocation = util.Pair[seq.ID, seq.DocPos] + IndexedDocBlock = util.Pair[[]byte, []seq.DocPos] ) type Source interface { indexwriter.Source - DocBlock() iter.Seq[[]byte] + DocBlock() iter.Seq2[DocBlockLocation, error] } type MergeSource struct { filename string + info *common.Info + infoOnce sync.Once // sources is a slice of [sealing.Source] // which provide view into underlying fractions. sources []Source + offsets []uint64 + offsetsOnce sync.Once // docBlockCount is populated during [MergeSource.BlockOffsets] call. // This slice is used for changing block indexes in [seq.DocPos]. docBlockCount []int @@ -60,42 +74,47 @@ func NewMergeSource(filename string, sources []Source) *MergeSource { } func (s *MergeSource) Info() *common.Info { - for i := range s.sources { - sinfo := s.sources[i].Info() + s.infoOnce.Do(func() { + for i := range s.sources { + sinfo := s.sources[i].Info() - s.info.DocsRaw += sinfo.DocsRaw - s.info.DocsTotal += sinfo.DocsTotal - s.info.DocsOnDisk += sinfo.DocsOnDisk + s.info.DocsRaw += sinfo.DocsRaw + s.info.DocsTotal += sinfo.DocsTotal + s.info.DocsOnDisk += sinfo.DocsOnDisk - // NOTE(dkharms): [IndexOnDisk] is calculated later. 
- } + // NOTE(dkharms): [IndexOnDisk] is calculated later. + } - s.info.From = s.from - s.info.To = s.to + s.info.From = s.from + s.info.To = s.to + }) return s.info } func (s *MergeSource) BlockOffsets() []uint64 { - var ( - docsSize uint64 - offsets []uint64 - ) - - // Initially s.docBlockCount - s.docBlockCount = append(s.docBlockCount, 0) - for i := 0; i < len(s.sources); i++ { - for _, offset := range s.sources[i].BlockOffsets() { - offsets = append(offsets, uint64(offset)+docsSize) + s.offsetsOnce.Do(func() { + var ( + docsSize uint64 + offsets []uint64 + ) + + s.docBlockCount = append(s.docBlockCount, 0) + for i := 0; i < len(s.sources); i++ { + for _, offset := range s.sources[i].BlockOffsets() { + offsets = append(offsets, uint64(offset)+docsSize) + } + docsSize += s.sources[i].Info().DocsOnDisk + s.docBlockCount = append(s.docBlockCount, len(offsets)) } - docsSize += s.sources[i].Info().DocsOnDisk - s.docBlockCount = append(s.docBlockCount, len(offsets)) - } - return offsets + s.offsets = offsets + }) + + return s.offsets } -func (s *MergeSource) ID() iter.Seq2[seq.ID, seq.DocPos] { +func (s *MergeSource) ID() iter.Seq2[DocLocation, error] { // TODO(dkharms): For now, I will use stupid-simple linear scan for k-way merge. // // Its time complexity O(k*n) so it's not efficient enough if we compare it @@ -106,19 +125,24 @@ func (s *MergeSource) ID() iter.Seq2[seq.ID, seq.DocPos] { // and it is around log(k) vs 2*log(k). 
type cursor struct { - next func() (seq.ID, seq.DocPos, bool) + next func() (DocLocation, error, bool) stop func() - id seq.ID - docPos seq.DocPos + loc DocLocation lidOld uint32 ok bool } - return func(yield func(seq.ID, seq.DocPos) bool) { + return func(yield func(DocLocation, error) bool) { var cursors []cursor + defer func() { + for _, c := range cursors { + c.stop() + } + }() + for i := range s.sources { src := s.sources[i] next, stop := iter.Pull2(src.ID()) @@ -126,24 +150,23 @@ func (s *MergeSource) ID() iter.Seq2[seq.ID, seq.DocPos] { // Skip [seq.SystemID] and [seq.SystemDocPos]. _, _, _ = next() - id, docpos, ok := next() + loc, err, ok := next() cursors = append(cursors, cursor{ next: next, stop: stop, - id: id, docPos: docpos, lidOld: 1, - ok: ok, + loc: loc, lidOld: 1, + ok: ok && err == nil, }) - } - defer func() { - for _, c := range cursors { - c.stop() + if err != nil { + yield(DocLocation{}, err) + return } - }() + } lid := uint32(1) // We've previosly dropped [seq.SystemID] from // iterators however we do have to emit one such id. - if !yield(seq.SystemID, seq.SystemDocPos) { + if !yield(DocLocation{First: seq.SystemID, Second: seq.SystemDocPos}, nil) { return } @@ -159,8 +182,8 @@ func (s *MergeSource) ID() iter.Seq2[seq.ID, seq.DocPos] { continue } - if seq.Less(id, c.id) { - id = c.id + if seq.Less(id, c.loc.First) { + id = c.loc.First idx = i } } @@ -172,21 +195,28 @@ func (s *MergeSource) ID() iter.Seq2[seq.ID, seq.DocPos] { } c := cursors[idx] - minid, mindocpos, oldlid := c.id, c.docPos, c.lidOld + minid, oldlid := c.loc.First, c.lidOld - blockIdx, offset := mindocpos.Unpack() - mindocpos = seq.PackDocPos(uint32(s.docBlockCount[idx]+int(blockIdx)), offset) + blockIdx, offset := c.loc.Second.Unpack() + mindocpos := seq.PackDocPos(uint32(s.docBlockCount[idx]+int(blockIdx)), offset) - if !yield(minid, mindocpos) { + if !yield(DocLocation{First: minid, Second: mindocpos}, nil) { return } // Rename lid from picked cursor to the new value. 
s.lidMapping[idx][oldlid] = lid - c.id, c.docPos, c.ok = c.next() + var err error + c.loc, err, c.ok = c.next() c.lidOld += 1 + if err != nil { + cursors[idx] = c + yield(DocLocation{}, err) + return + } + s.from = min(s.from, minid.MID) s.to = max(s.to, minid.MID) @@ -196,7 +226,7 @@ func (s *MergeSource) ID() iter.Seq2[seq.ID, seq.DocPos] { } } -func (s *MergeSource) TokenTriplet() iter.Seq2[string, iter.Seq2[[]byte, []uint32]] { +func (s *MergeSource) TokenTriplet() iter.Seq2[string, iter.Seq2[TokenPosting, error]] { // TODO(dkharms): For now, I will use stupid-simple linear scan for k-way merge. // // Its time complexity O(k*n) so it's not efficient enough if we compare it @@ -207,11 +237,11 @@ func (s *MergeSource) TokenTriplet() iter.Seq2[string, iter.Seq2[[]byte, []uint3 // and it is around log(k) vs 2*log(k). type cursor struct { - next func() (string, iter.Seq2[[]byte, []uint32], bool) + next func() (string, iter.Seq2[TokenPosting, error], bool) stop func() field string - tokIt iter.Seq2[[]byte, []uint32] + tokIt iter.Seq2[TokenPosting, error] ok bool } @@ -239,7 +269,7 @@ func (s *MergeSource) TokenTriplet() iter.Seq2[string, iter.Seq2[[]byte, []uint3 return field, set } - return func(yield func(string, iter.Seq2[[]byte, []uint32]) bool) { + return func(yield func(string, iter.Seq2[TokenPosting, error]) bool) { var cursors []cursor for i := range s.sources { @@ -269,7 +299,7 @@ func (s *MergeSource) TokenTriplet() iter.Seq2[string, iter.Seq2[[]byte, []uint3 var ( idxs []int - iters []iter.Seq2[[]byte, []uint32] + iters []iter.Seq2[TokenPosting, error] ) for i, c := range cursors { @@ -296,15 +326,14 @@ func (s *MergeSource) TokenTriplet() iter.Seq2[string, iter.Seq2[[]byte, []uint3 } func (s *MergeSource) tokensForField( - idxs []int, iters []iter.Seq2[[]byte, []uint32], -) iter.Seq2[[]byte, []uint32] { + idxs []int, iters []iter.Seq2[TokenPosting, error], +) iter.Seq2[TokenPosting, error] { type cursor struct { - next func() ([]byte, []uint32, bool) + 
next func() (TokenPosting, error, bool) stop func() - idx int - token []byte - lids []uint32 + idx int + posting TokenPosting ok bool } @@ -321,13 +350,13 @@ func (s *MergeSource) tokensForField( } if !set { - token = c.token + token = c.posting.First set = true continue } - if bytes.Compare(c.token, token) < 0 { - token = c.token + if bytes.Compare(c.posting.First, token) < 0 { + token = c.posting.First } } @@ -338,24 +367,30 @@ func (s *MergeSource) tokensForField( // all calls within current field. var lidRenamed []uint32 - return func(yield func([]byte, []uint32) bool) { + return func(yield func(TokenPosting, error) bool) { var cursors []cursor + defer func() { + for _, c := range cursors { + c.stop() + } + }() + for i := range iters { next, stop := iter.Pull2(iters[i]) - token, lids, ok := next() + posting, err, ok := next() + cursors = append(cursors, cursor{ next: next, stop: stop, - idx: idxs[i], token: token, lids: lids, - ok: ok, + idx: idxs[i], posting: posting, + ok: ok && err == nil, }) - } - defer func() { - for _, c := range cursors { - c.stop() + if err != nil { + yield(TokenPosting{}, err) + return } - }() + } for { token, ok := minimal(cursors) @@ -365,20 +400,28 @@ func (s *MergeSource) tokensForField( // Collect and remap lids from all cursors at this token, then advance them. 
for i, c := range cursors { - if !c.ok || !bytes.Equal(c.token, token) { + if !c.ok || !bytes.Equal(c.posting.First, token) { continue } - for _, lid := range c.lids { + for _, lid := range c.posting.Second { lidRenamed = append(lidRenamed, s.lidMapping[c.idx][lid]) } - c.token, c.lids, c.ok = c.next() + var err error + c.posting, err, c.ok = c.next() + + if err != nil { + cursors[i] = c + yield(TokenPosting{}, err) + return + } + cursors[i] = c } slices.Sort(lidRenamed) - if !yield(token, lidRenamed) { + if !yield(TokenPosting{First: token, Second: lidRenamed}, nil) { return } @@ -386,7 +429,3 @@ func (s *MergeSource) tokensForField( } } } - -func (s *MergeSource) LastError() error { - return nil -} diff --git a/compaction/merge_source_test.go b/compaction/merge_source_test.go index 5a045503..fdedac98 100644 --- a/compaction/merge_source_test.go +++ b/compaction/merge_source_test.go @@ -42,27 +42,27 @@ func (m *mockSealingSource) BlockOffsets() []uint64 { return m.blocks } -func (m *mockSealingSource) ID() iter.Seq2[seq.ID, seq.DocPos] { - return func(yield func(seq.ID, seq.DocPos) bool) { - if !yield(seq.SystemID, seq.SystemDocPos) { +func (m *mockSealingSource) ID() iter.Seq2[DocLocation, error] { + return func(yield func(DocLocation, error) bool) { + if !yield(DocLocation{First: seq.SystemID, Second: seq.SystemDocPos}, nil) { return } for i, id := range m.ids { - if !yield(id, m.pos[i]) { + if !yield(DocLocation{First: id, Second: m.pos[i]}, nil) { return } } } } -func (m *mockSealingSource) TokenTriplet() iter.Seq2[string, iter.Seq2[[]byte, []uint32]] { +func (m *mockSealingSource) TokenTriplet() iter.Seq2[string, iter.Seq2[TokenPosting, error]] { fieldNames := make([]string, 0, len(m.fields)) for f := range m.fields { fieldNames = append(fieldNames, f) } slices.Sort(fieldNames) - return func(yield func(string, iter.Seq2[[]byte, []uint32]) bool) { + return func(yield func(string, iter.Seq2[TokenPosting, error]) bool) { for _, field := range fieldNames { tokens 
:= make([]string, 0, len(m.fields[field])) for t := range m.fields[field] { @@ -70,9 +70,9 @@ func (m *mockSealingSource) TokenTriplet() iter.Seq2[string, iter.Seq2[[]byte, [ } slices.Sort(tokens) - if !yield(field, func(yield func([]byte, []uint32) bool) { + if !yield(field, func(yield func(TokenPosting, error) bool) { for _, tok := range tokens { - if !yield([]byte(tok), m.fields[field][tok]) { + if !yield(TokenPosting{First: []byte(tok), Second: m.fields[field][tok]}, nil) { return } } @@ -83,9 +83,9 @@ func (m *mockSealingSource) TokenTriplet() iter.Seq2[string, iter.Seq2[[]byte, [ } } -func (m *mockSealingSource) DocBlock() iter.Seq[[]byte] { - return func(yield func([]byte) bool) { - if !yield(nil) { +func (m *mockSealingSource) DocBlock() iter.Seq2[DocBlockLocation, error] { + return func(yield func(DocBlockLocation, error) bool) { + if !yield(DocBlockLocation{}, nil) { return } } @@ -156,9 +156,10 @@ func TestMergeSource(t *testing.T) { docpos []seq.DocPos ) - for id, dp := range source.ID() { - ids = append(ids, id) - docpos = append(docpos, dp) + for loc, err := range source.ID() { + require.NoError(t, err) + ids = append(ids, loc.First) + docpos = append(docpos, loc.Second) } require.Equal(t, @@ -197,9 +198,10 @@ func TestMergeSource(t *testing.T) { for field, fieldIt := range source.TokenTriplet() { fields = append(fields, field) - for token, lidsbuf := range fieldIt { - tokens = append(tokens, token) - lids = append(lids, slices.Clone(lidsbuf)) + for posting, err := range fieldIt { + require.NoError(t, err) + tokens = append(tokens, posting.First) + lids = append(lids, slices.Clone(posting.Second)) } } diff --git a/frac/sealed_source.go b/frac/sealed_source.go index 29a3d31b..f6eebf07 100644 --- a/frac/sealed_source.go +++ b/frac/sealed_source.go @@ -4,14 +4,18 @@ import ( "iter" "slices" + "github.com/ozontech/seq-db/blockbuilder" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed/lids" 
"github.com/ozontech/seq-db/frac/sealed/seqids" "github.com/ozontech/seq-db/frac/sealed/token" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" + "github.com/ozontech/seq-db/util" ) +type DocBlockLocation = util.Pair[[]byte, uint64] + // SealedSource implements [indexwriter.Source] for a sealed fraction. // Used as input to [compaction.MergeSource] when compacting multiple fractions. type SealedSource struct { @@ -22,8 +26,6 @@ type SealedSource struct { tokenBlockLoader *token.BlockLoader tokenTableLoader *token.TableLoader - - lastErr error } func NewSealedSource(f *Sealed) *SealedSource { @@ -52,35 +54,35 @@ func (s *SealedSource) BlockOffsets() []uint64 { return s.f.blocksData.BlocksOffsets } -func (s *SealedSource) ID() iter.Seq2[seq.ID, seq.DocPos] { - return func(yield func(seq.ID, seq.DocPos) bool) { +func (s *SealedSource) ID() iter.Seq2[blockbuilder.DocLocation, error] { + return func(yield func(blockbuilder.DocLocation, error) bool) { for lid := uint32(0); lid < s.f.blocksData.IDsTable.IDsTotal; lid++ { mid, err := s.idsProvider.MID(seq.LID(lid)) if err != nil { - s.lastErr = err + yield(blockbuilder.DocLocation{}, err) return } rid, err := s.idsProvider.RID(seq.LID(lid)) if err != nil { - s.lastErr = err + yield(blockbuilder.DocLocation{}, err) return } pos, err := s.idsProvider.DocPos(seq.LID(lid)) if err != nil { - s.lastErr = err + yield(blockbuilder.DocLocation{}, err) return } - if !yield(seq.ID{MID: mid, RID: rid}, pos) { + if !yield(blockbuilder.DocLocation{First: seq.ID{MID: mid, RID: rid}, Second: pos}, nil) { return } } } } -func (s *SealedSource) TokenTriplet() iter.Seq2[string, iter.Seq2[[]byte, []uint32]] { +func (s *SealedSource) TokenTriplet() iter.Seq2[string, iter.Seq2[blockbuilder.TokenPosting, error]] { tokenTable := s.tokenTableLoader.Load() fields := make([]string, 0, len(tokenTable)) @@ -89,21 +91,21 @@ func (s *SealedSource) TokenTriplet() iter.Seq2[string, iter.Seq2[[]byte, []uint } slices.Sort(fields) - 
return func(yield func(string, iter.Seq2[[]byte, []uint32]) bool) { + return func(yield func(string, iter.Seq2[blockbuilder.TokenPosting, error]) bool) { for _, field := range fields { - if !yield(field, s.tokensForField(field)) { + if !yield(field, s.postingsForField(field)) { return } } } } -func (s *SealedSource) tokensForField(field string) iter.Seq2[[]byte, []uint32] { +func (s *SealedSource) postingsForField(field string) iter.Seq2[blockbuilder.TokenPosting, error] { lidsTable := s.f.blocksData.LIDsTable tokenTable := s.tokenTableLoader.Load() var lidsBuf []uint32 - return func(yield func([]byte, []uint32) bool) { + return func(yield func(blockbuilder.TokenPosting, error) bool) { for _, entry := range tokenTable[field].Entries { block := s.tokenBlockLoader.Load(entry.BlockIndex) @@ -117,7 +119,7 @@ func (s *SealedSource) tokensForField(field string) iter.Seq2[[]byte, []uint32] for bi := firstBlock; bi <= lastBlock; bi++ { lidBlock, err := s.lidsLoader.GetLIDsBlock(bi) if err != nil { - s.lastErr = err + yield(blockbuilder.TokenPosting{}, err) return } @@ -125,7 +127,7 @@ func (s *SealedSource) tokensForField(field string) iter.Seq2[[]byte, []uint32] lidsBuf = append(lidsBuf, lidBlock.LIDs[lidBlock.Offsets[chunkIdx]:lidBlock.Offsets[chunkIdx+1]]...) } - if !yield(tokenVal, lidsBuf) { + if !yield(blockbuilder.TokenPosting{First: tokenVal, Second: lidsBuf}, nil) { return } } @@ -133,8 +135,8 @@ func (s *SealedSource) tokensForField(field string) iter.Seq2[[]byte, []uint32] } } -func (s *SealedSource) DocBlock() iter.Seq[[]byte] { - return func(yield func([]byte) bool) { +func (s *SealedSource) DocBlock() iter.Seq2[DocBlockLocation, error] { + return func(yield func(DocBlockLocation, error) bool) { // We do not want to cache payload of DocBlock because // it will just pollute cache and cause unnecessary evictions. 
r := storage.NewDocBlocksReader(s.f.readLimiter, s.f.docsFile) @@ -144,17 +146,14 @@ func (s *SealedSource) DocBlock() iter.Seq[[]byte] { // Caller of [SealedSource.DocBlock] will decide whether it requires decompressed data. payload, _, err := r.ReadDocBlock(int64(offset)) if err != nil { - s.lastErr = err + yield(DocBlockLocation{}, err) return } - if !yield(payload) { + loc := DocBlockLocation{First: payload, Second: offset} + if !yield(loc, nil) { return } } } } - -func (s *SealedSource) LastError() error { - return s.lastErr -} From 21a854bfc58438e95dc074da4baeaa10c1ee8264 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Mon, 27 Apr 2026 14:48:09 +0300 Subject: [PATCH 18/29] feat: build distribution for compacted fraction --- compaction/merge_source.go | 43 ++++++++++++++++++++------------- compaction/merge_source_test.go | 5 ++++ frac/common/info.go | 7 ++++++ 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/compaction/merge_source.go b/compaction/merge_source.go index d9f16428..d73671fa 100644 --- a/compaction/merge_source.go +++ b/compaction/merge_source.go @@ -3,7 +3,6 @@ package compaction import ( "bytes" "iter" - "math" "slices" "sync" @@ -48,8 +47,6 @@ type MergeSource struct { // i-th index of [lidMapping] correponds to i-th fraction. // j-th index of i-th [lidMapping] corresponds to rename of j-th lid. 
lidMapping [][]uint32 - - from, to seq.MID } func NewMergeSource(filename string, sources []Source) *MergeSource { @@ -59,18 +56,34 @@ func NewMergeSource(filename string, sources []Source) *MergeSource { lidmapping[i] = make([]uint32, src.Info().DocsTotal+1) } - info := common.NewInfo(filename, 0, 0) - info.SealingTime = info.CreationTime - - return &MergeSource{ - info: info, - filename: filename, - + s := &MergeSource{ + filename: filename, sources: sources, lidMapping: lidmapping, + } + + s.info = s.prepareInfo() + return s +} + +func (s *MergeSource) prepareInfo() *common.Info { + info := common.NewInfo(s.filename, 0, 0) - from: math.MaxUint64, to: 0, + var ( + from seq.MID = seq.MaxID.MID + to seq.MID = seq.MinID.MID + ) + + for _, src := range s.sources { + from = min(from, src.Info().From) + to = max(to, src.Info().To) } + + info.From, info.To = from, to + info.SealingTime = info.CreationTime + + info.InitEmptyDistribution() + return info } func (s *MergeSource) Info() *common.Info { @@ -84,9 +97,6 @@ func (s *MergeSource) Info() *common.Info { // NOTE(dkharms): [IndexOnDisk] is calculated later. 
} - - s.info.From = s.from - s.info.To = s.to }) return s.info @@ -195,7 +205,9 @@ func (s *MergeSource) ID() iter.Seq2[DocLocation, error] { } c := cursors[idx] + minid, oldlid := c.loc.First, c.lidOld + s.info.AddMID(uint64(minid.MID)) blockIdx, offset := c.loc.Second.Unpack() mindocpos := seq.PackDocPos(uint32(s.docBlockCount[idx]+int(blockIdx)), offset) @@ -217,9 +229,6 @@ func (s *MergeSource) ID() iter.Seq2[DocLocation, error] { return } - s.from = min(s.from, minid.MID) - s.to = max(s.to, minid.MID) - lid += 1 cursors[idx] = c } diff --git a/compaction/merge_source_test.go b/compaction/merge_source_test.go index fdedac98..5a8962d3 100644 --- a/compaction/merge_source_test.go +++ b/compaction/merge_source_test.go @@ -245,6 +245,11 @@ func TestMergeSource(t *testing.T) { require.Equal(t, merged.DocsTotal, finfo.DocsTotal+sinfo.DocsTotal) require.Equal(t, merged.DocsOnDisk, finfo.DocsOnDisk+sinfo.DocsOnDisk) require.Equal(t, merged.DocsRaw, finfo.DocsRaw+sinfo.DocsRaw) + + // Validate correctness of distribution. 
+ require.NotNil(t, merged.Distribution) + require.True(t, merged.IsIntersecting(finfo.From, finfo.To)) + require.True(t, merged.IsIntersecting(sinfo.From, sinfo.To)) }) } diff --git a/frac/common/info.go b/frac/common/info.go index 20e7f7c2..d714f660 100644 --- a/frac/common/info.go +++ b/frac/common/info.go @@ -82,6 +82,13 @@ func (s *Info) BuildDistribution(mids []uint64) { } } +func (s *Info) AddMID(mid uint64) { + if s.Distribution == nil { + return + } + s.Distribution.Add(seq.MID(mid)) +} + func (s *Info) InitEmptyDistribution() bool { from := s.From.Time() creationTime := time.UnixMilli(int64(s.CreationTime)) From d3e1c9efb7901519dd01a9cf81beb36a29f4a081 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Tue, 28 Apr 2026 09:34:32 +0300 Subject: [PATCH 19/29] refactor: consistent naming --- compaction/merge.go | 1 - compaction/merge_source.go | 31 ++++++++------ compaction/merge_source_test.go | 75 +++++++++++++++++++-------------- frac/sealed_source.go | 30 ++++++------- 4 files changed, 77 insertions(+), 60 deletions(-) diff --git a/compaction/merge.go b/compaction/merge.go index b21cd9c3..928b3044 100644 --- a/compaction/merge.go +++ b/compaction/merge.go @@ -89,7 +89,6 @@ func mergeDocs(filename string, srcs ...Source) error { filename+consts.DocsFileSuffix, func(f *os.File) error { var docsSize uint64 - for _, src := range srcs { for loc, err := range src.DocBlock() { if err != nil { diff --git a/compaction/merge_source.go b/compaction/merge_source.go index d73671fa..f2e49da7 100644 --- a/compaction/merge_source.go +++ b/compaction/merge_source.go @@ -28,15 +28,16 @@ type Source interface { type MergeSource struct { filename string - info *common.Info - infoOnce sync.Once - // sources is a slice of [sealing.Source] // which provide view into underlying fractions. sources []Source + info *common.Info + infoOnce sync.Once + offsets []uint64 offsetsOnce sync.Once + // docBlockCount is populated during [MergeSource.BlockOffsets] call. 
// This slice is used for changing block indexes in [seq.DocPos]. docBlockCount []int @@ -50,16 +51,20 @@ type MergeSource struct { } func NewMergeSource(filename string, sources []Source) *MergeSource { - lidmapping := make([][]uint32, len(sources)) + lidMapping := make([][]uint32, len(sources)) for i, src := range sources { - lidmapping[i] = make([]uint32, src.Info().DocsTotal+1) + lidMapping[i] = make( + []uint32, + // Increment for [seq.SystemID]. + src.Info().DocsTotal+1, + ) } s := &MergeSource{ filename: filename, sources: sources, - lidMapping: lidmapping, + lidMapping: lidMapping, } s.info = s.prepareInfo() @@ -206,18 +211,18 @@ func (s *MergeSource) ID() iter.Seq2[DocLocation, error] { c := cursors[idx] - minid, oldlid := c.loc.First, c.lidOld - s.info.AddMID(uint64(minid.MID)) + minID, lidOld := c.loc.First, c.lidOld + s.info.AddMID(uint64(minID.MID)) blockIdx, offset := c.loc.Second.Unpack() - mindocpos := seq.PackDocPos(uint32(s.docBlockCount[idx]+int(blockIdx)), offset) + minDocPos := seq.PackDocPos(uint32(s.docBlockCount[idx]+int(blockIdx)), offset) - if !yield(DocLocation{First: minid, Second: mindocpos}, nil) { + if !yield(DocLocation{First: minID, Second: minDocPos}, nil) { return } // Rename lid from picked cursor to the new value. 
- s.lidMapping[idx][oldlid] = lid + s.lidMapping[idx][lidOld] = lid var err error c.loc, err, c.ok = c.next() @@ -320,7 +325,7 @@ func (s *MergeSource) TokenTriplet() iter.Seq2[string, iter.Seq2[TokenPosting, e iters = append(iters, c.tokIt) } - if !yield(field, s.tokensForField(idxs, iters)) { + if !yield(field, s.postingsForField(idxs, iters)) { return } @@ -334,7 +339,7 @@ func (s *MergeSource) TokenTriplet() iter.Seq2[string, iter.Seq2[TokenPosting, e } } -func (s *MergeSource) tokensForField( +func (s *MergeSource) postingsForField( idxs []int, iters []iter.Seq2[TokenPosting, error], ) iter.Seq2[TokenPosting, error] { type cursor struct { diff --git a/compaction/merge_source_test.go b/compaction/merge_source_test.go index 5a8962d3..12ba093c 100644 --- a/compaction/merge_source_test.go +++ b/compaction/merge_source_test.go @@ -44,11 +44,14 @@ func (m *mockSealingSource) BlockOffsets() []uint64 { func (m *mockSealingSource) ID() iter.Seq2[DocLocation, error] { return func(yield func(DocLocation, error) bool) { - if !yield(DocLocation{First: seq.SystemID, Second: seq.SystemDocPos}, nil) { + docloc := DocLocation{First: seq.SystemID, Second: seq.SystemDocPos} + if !yield(docloc, nil) { return } + for i, id := range m.ids { - if !yield(DocLocation{First: id, Second: m.pos[i]}, nil) { + docloc = DocLocation{First: id, Second: m.pos[i]} + if !yield(docloc, nil) { return } } @@ -56,27 +59,36 @@ func (m *mockSealingSource) ID() iter.Seq2[DocLocation, error] { } func (m *mockSealingSource) TokenTriplet() iter.Seq2[string, iter.Seq2[TokenPosting, error]] { - fieldNames := make([]string, 0, len(m.fields)) + fields := make([]string, 0, len(m.fields)) for f := range m.fields { - fieldNames = append(fieldNames, f) + fields = append(fields, f) } - slices.Sort(fieldNames) + slices.Sort(fields) return func(yield func(string, iter.Seq2[TokenPosting, error]) bool) { - for _, field := range fieldNames { - tokens := make([]string, 0, len(m.fields[field])) - for t := range 
m.fields[field] { - tokens = append(tokens, t) + for _, field := range fields { + if !yield(field, m.postingsForField(field)) { + return + } + } + } +} + +func (m *mockSealingSource) postingsForField(field string) iter.Seq2[TokenPosting, error] { + return func(yield func(TokenPosting, error) bool) { + tokens := make([]string, 0, len(m.fields[field])) + for t := range m.fields[field] { + tokens = append(tokens, t) + } + + slices.Sort(tokens) + for _, tok := range tokens { + posting := TokenPosting{ + First: []byte(tok), + Second: m.fields[field][tok], } - slices.Sort(tokens) - - if !yield(field, func(yield func(TokenPosting, error) bool) { - for _, tok := range tokens { - if !yield(TokenPosting{First: []byte(tok), Second: m.fields[field][tok]}, nil) { - return - } - } - }) { + + if !yield(posting, nil) { return } } @@ -165,10 +177,10 @@ func TestMergeSource(t *testing.T) { require.Equal(t, []seq.ID{ seq.SystemID, - // seq.ID from the second source + // [seq.ID] from the second source. {MID: 6}, {MID: 5}, - // seq.ID from the first source + // [seq.ID] from the first source. {MID: 3}, {MID: 2}, {MID: 1}, @@ -179,9 +191,9 @@ func TestMergeSource(t *testing.T) { require.Equal(t, []seq.DocPos{ seq.SystemDocPos, - // seq.DocPos from the second source + // [seq.DocPos] from the second source. seq.PackDocPos(1, 0), seq.PackDocPos(1, 2048), - // seq.DocPos from the first source + // [seq.DocPos] from the first source. seq.PackDocPos(0, 0), seq.PackDocPos(0, 1024), seq.PackDocPos(0, 2048), }, docpos, @@ -205,28 +217,28 @@ func TestMergeSource(t *testing.T) { } } - // Both sources have the same and the only field + // Both sources have the same and the only field. require.Equal(t, []string{"level"}, fields) - // Ensure tokens are sorted in ascending order + // Ensure tokens are sorted in ascending order. 
require.Equal(t, [][]byte{[]byte("debug"), []byte("error"), []byte("info")}, tokens, ) - // Ensure correctness of lids remapping - // ----------------- + // Ensure correctness of lids remapping: + // ------------------------- // seq.MID 6 5 | 3 2 1 // seq.LID (old) 1 2 | 1 2 3 // seq.LID (new) 1 2 | 3 4 5 - // ----------------- + // ------------------------- require.Equal(t, [][]uint32{ - // Sequence of [seq.LID] for token `debug` + // Sequence of [seq.LID] for token `debug`. {1}, - // Sequence of [seq.LID] for token `error` + // Sequence of [seq.LID] for token `error`. {3, 5}, - // Sequence of [seq.LID] for token `info` + // Sequence of [seq.LID] for token `info`. {2, 4, 5}, }, lids, @@ -250,6 +262,7 @@ func TestMergeSource(t *testing.T) { require.NotNil(t, merged.Distribution) require.True(t, merged.IsIntersecting(finfo.From, finfo.To)) require.True(t, merged.IsIntersecting(sinfo.From, sinfo.To)) + require.True(t, merged.IsIntersecting(min(finfo.From, sinfo.From), max(finfo.To, sinfo.To))) }) } @@ -258,7 +271,7 @@ func BenchmarkMergeSource(b *testing.B) { numSources = 4 docsPerSource = 512_000 - // Total pairs of (field, token) will be + // Total count of pairs of (field, token) will be // [numFields] * [numTokens]. 
numFields = 512 numTokens = 16384 diff --git a/frac/sealed_source.go b/frac/sealed_source.go index f6eebf07..3cafcccb 100644 --- a/frac/sealed_source.go +++ b/frac/sealed_source.go @@ -4,11 +4,11 @@ import ( "iter" "slices" - "github.com/ozontech/seq-db/blockbuilder" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed/lids" "github.com/ozontech/seq-db/frac/sealed/seqids" "github.com/ozontech/seq-db/frac/sealed/token" + "github.com/ozontech/seq-db/indexwriter" "github.com/ozontech/seq-db/seq" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/util" @@ -29,7 +29,7 @@ type SealedSource struct { } func NewSealedSource(f *Sealed) *SealedSource { - f.load() + f.init(true) return &SealedSource{ f: f, idsProvider: seqids.NewProvider( @@ -42,7 +42,7 @@ func NewSealedSource(f *Sealed) *SealedSource { ), lidsLoader: lids.NewLoader(&f.lidReader, f.indexCache.LIDs), tokenBlockLoader: token.NewBlockLoader(f.BaseFileName, &f.tokenReader, f.indexCache.Tokens), - tokenTableLoader: token.NewTableLoader(f.BaseFileName, &f.tokenReader, f.indexCache.TokenTable), + tokenTableLoader: token.NewTableLoader(f.BaseFileName, f.IsLegacy, &f.tokenReader, f.indexCache.TokenTable), } } @@ -54,35 +54,35 @@ func (s *SealedSource) BlockOffsets() []uint64 { return s.f.blocksData.BlocksOffsets } -func (s *SealedSource) ID() iter.Seq2[blockbuilder.DocLocation, error] { - return func(yield func(blockbuilder.DocLocation, error) bool) { +func (s *SealedSource) ID() iter.Seq2[indexwriter.DocLocation, error] { + return func(yield func(indexwriter.DocLocation, error) bool) { for lid := uint32(0); lid < s.f.blocksData.IDsTable.IDsTotal; lid++ { mid, err := s.idsProvider.MID(seq.LID(lid)) if err != nil { - yield(blockbuilder.DocLocation{}, err) + yield(indexwriter.DocLocation{}, err) return } rid, err := s.idsProvider.RID(seq.LID(lid)) if err != nil { - yield(blockbuilder.DocLocation{}, err) + yield(indexwriter.DocLocation{}, err) return } pos, err := 
s.idsProvider.DocPos(seq.LID(lid)) if err != nil { - yield(blockbuilder.DocLocation{}, err) + yield(indexwriter.DocLocation{}, err) return } - if !yield(blockbuilder.DocLocation{First: seq.ID{MID: mid, RID: rid}, Second: pos}, nil) { + if !yield(indexwriter.DocLocation{First: seq.ID{MID: mid, RID: rid}, Second: pos}, nil) { return } } } } -func (s *SealedSource) TokenTriplet() iter.Seq2[string, iter.Seq2[blockbuilder.TokenPosting, error]] { +func (s *SealedSource) TokenTriplet() iter.Seq2[string, iter.Seq2[indexwriter.TokenPosting, error]] { tokenTable := s.tokenTableLoader.Load() fields := make([]string, 0, len(tokenTable)) @@ -91,7 +91,7 @@ func (s *SealedSource) TokenTriplet() iter.Seq2[string, iter.Seq2[blockbuilder.T } slices.Sort(fields) - return func(yield func(string, iter.Seq2[blockbuilder.TokenPosting, error]) bool) { + return func(yield func(string, iter.Seq2[indexwriter.TokenPosting, error]) bool) { for _, field := range fields { if !yield(field, s.postingsForField(field)) { return @@ -100,12 +100,12 @@ func (s *SealedSource) TokenTriplet() iter.Seq2[string, iter.Seq2[blockbuilder.T } } -func (s *SealedSource) postingsForField(field string) iter.Seq2[blockbuilder.TokenPosting, error] { +func (s *SealedSource) postingsForField(field string) iter.Seq2[indexwriter.TokenPosting, error] { lidsTable := s.f.blocksData.LIDsTable tokenTable := s.tokenTableLoader.Load() var lidsBuf []uint32 - return func(yield func(blockbuilder.TokenPosting, error) bool) { + return func(yield func(indexwriter.TokenPosting, error) bool) { for _, entry := range tokenTable[field].Entries { block := s.tokenBlockLoader.Load(entry.BlockIndex) @@ -119,7 +119,7 @@ func (s *SealedSource) postingsForField(field string) iter.Seq2[blockbuilder.Tok for bi := firstBlock; bi <= lastBlock; bi++ { lidBlock, err := s.lidsLoader.GetLIDsBlock(bi) if err != nil { - yield(blockbuilder.TokenPosting{}, err) + yield(indexwriter.TokenPosting{}, err) return } @@ -127,7 +127,7 @@ func (s *SealedSource) 
postingsForField(field string) iter.Seq2[blockbuilder.Tok lidsBuf = append(lidsBuf, lidBlock.LIDs[lidBlock.Offsets[chunkIdx]:lidBlock.Offsets[chunkIdx+1]]...) } - if !yield(blockbuilder.TokenPosting{First: tokenVal, Second: lidsBuf}, nil) { + if !yield(indexwriter.TokenPosting{First: tokenVal, Second: lidsBuf}, nil) { return } } From 1cb65435a3b60699f17a823163d4bbd434f93148 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Mon, 25 May 2026 15:18:37 +0300 Subject: [PATCH 20/29] feat: implement `stcs` --- compaction/stcs.go | 108 ++++++++++++++++++++++++++++++++++++++++ compaction/stcs_test.go | 97 ++++++++++++++++++++++++++++++++++++ 2 files changed, 205 insertions(+) create mode 100644 compaction/stcs.go create mode 100644 compaction/stcs_test.go diff --git a/compaction/stcs.go b/compaction/stcs.go new file mode 100644 index 00000000..54d9843f --- /dev/null +++ b/compaction/stcs.go @@ -0,0 +1,108 @@ +package compaction + +import ( + "cmp" + "slices" + + "github.com/ozontech/seq-db/frac" +) + +type strategySTCS struct { + // To trigger compaction of bucket there must be + // at least [mergeTrigger] fractions. + mergeTrigger int + + // At most this many fractions are compacted from a single bucket + // per compaction iteration. + mergeFanIn int + mergeFanOutSize uint64 + + // Fraction size must be within [bucketLowerbound, bucketUpperbound] * avg(bucket) + // to be considered part of the bucket. + bucketLowerbound float64 + bucketUpperbound float64 +} + +func (s strategySTCS) Pick(candidates []frac.Fraction) []frac.Fraction { + if len(candidates) < s.mergeTrigger { + return nil + } + + sorted := slices.Clone(candidates) + slices.SortFunc(sorted, func(a, b frac.Fraction) int { + return cmp.Compare(a.Info().IndexOnDisk, b.Info().IndexOnDisk) + }) + + buckets := s.group(sorted) + // We are interested in buckets with the most amount of fractions. + // Usually, these are the lowest tiers where all freshly sealed fractions end up. 
+ slices.SortFunc(buckets, func(x, y []frac.Fraction) int { + return -cmp.Compare(len(x), len(y)) + }) + + for _, bucket := range buckets { + if len(bucket) < s.mergeTrigger { + continue + } + + fracs := bucket[:min(len(bucket), s.mergeFanIn)] + if picked := s.takeUntilSize(fracs); len(picked) > 0 { + return picked + } + } + + return nil +} + +func (s strategySTCS) group(sorted []frac.Fraction) [][]frac.Fraction { + var ( + sum uint64 + current []frac.Fraction + buckets [][]frac.Fraction + ) + + for _, f := range sorted { + size := f.Info().IndexOnDisk + + if len(current) == 0 { + current = append(current, f) + sum = size + continue + } + + avg := float64(sum) / float64(len(current)) + fsize := float64(size) + + lower := avg * s.bucketLowerbound + upper := avg * s.bucketUpperbound + + if lower <= fsize && fsize <= upper { + current = append(current, f) + sum += size + continue + } + + buckets = append(buckets, current) + current = []frac.Fraction{f} + sum = size + } + + if len(current) > 0 { + buckets = append(buckets, current) + } + + return buckets +} + +func (s strategySTCS) takeUntilSize(fracs []frac.Fraction) []frac.Fraction { + var picked uint64 + + for i := range fracs { + picked += fracs[i].Info().IndexOnDisk + if picked >= s.mergeFanOutSize { + return fracs[:i] + } + } + + return fracs +} diff --git a/compaction/stcs_test.go b/compaction/stcs_test.go new file mode 100644 index 00000000..b846855a --- /dev/null +++ b/compaction/stcs_test.go @@ -0,0 +1,97 @@ +package compaction + +import ( + "context" + "math" + "testing" + + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac/processor" + "github.com/ozontech/seq-db/seq" + "github.com/stretchr/testify/require" +) + +type mockFraction struct { + indexOnDisk uint64 +} + +func (m *mockFraction) Info() *common.Info { + return &common.Info{IndexOnDisk: m.indexOnDisk} +} + +func (m *mockFraction) IsIntersecting(seq.MID, seq.MID) bool { + return false +} + 
+func (m *mockFraction) Contains(seq.MID) bool { + return false +} + +func (m *mockFraction) Fetch(context.Context, []seq.ID) ([][]byte, error) { + return nil, nil +} + +func (m *mockFraction) Search(context.Context, processor.SearchParams) (*seq.QPR, error) { + return nil, nil +} + +func (m *mockFraction) FindLIDs(context.Context, []seq.ID) ([]seq.LID, error) { + return nil, nil +} + +func makeFracs(sizes ...uint64) []frac.Fraction { + out := make([]frac.Fraction, len(sizes)) + for i, s := range sizes { + out[i] = &mockFraction{indexOnDisk: s} + } + return out +} + +func TestSTCS_Pick(t *testing.T) { + s := strategySTCS{ + mergeTrigger: 4, + mergeFanIn: 32, + mergeFanOutSize: math.MaxUint64, + bucketLowerbound: 0.5, + bucketUpperbound: 1.5, + } + + t.Run("not-enough-candidates", func(t *testing.T) { + for n := range s.mergeTrigger { + require.Nil(t, s.Pick(makeFracs(make([]uint64, n)...))) + } + }) + + t.Run("requirement-not-met", func(t *testing.T) { + // Each fraction size is 10x the previous. + // They land in different buckets and no bucket with [mergeTrigger] fractions exists. + require.Nil(t, s.Pick(makeFracs(100, 1000, 10000, 100000))) + }) + + t.Run("one-bucket", func(t *testing.T) { + require.Len(t, s.Pick(makeFracs(1000, 1000, 1000, 1000)), 4) + }) + + t.Run("largest-bucket", func(t *testing.T) { + fracs := s.Pick(makeFracs( + 1000, 1000, + 100000, 100000, 100000, 100000, 100000, // Will take this bucket. 
+ )) + + require.Len(t, fracs, 5) + for _, f := range fracs { + require.Equal(t, uint64(100000), f.Info().IndexOnDisk) + } + }) + + t.Run("cap-at-fan-in", func(t *testing.T) { + sizes := make([]uint64, s.mergeFanIn+10) + + for i := range sizes { + sizes[i] = 5000 + } + + require.Len(t, s.Pick(makeFracs(sizes...)), s.mergeFanIn) + }) +} From 3118a7d06296bd5f18d6c2d25e4a7d6f25b1d762 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Wed, 27 May 2026 12:12:46 +0300 Subject: [PATCH 21/29] feat: add `FractionName` method for `FracManager` --- fracmanager/fracmanager.go | 14 ++++++++++++-- fracmanager/fraction_provider.go | 7 ++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/fracmanager/fracmanager.go b/fracmanager/fracmanager.go index 5dc808c7..9ac3c642 100644 --- a/fracmanager/fracmanager.go +++ b/fracmanager/fracmanager.go @@ -11,6 +11,7 @@ import ( "github.com/ozontech/seq-db/config" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/storage" "github.com/ozontech/seq-db/storage/s3" @@ -114,6 +115,12 @@ func (cs *CompactionSnapshot) Destroy() { } } +func (fm *FracManager) FractionName() string { + filePath := fileBasePattern + fm.lc.provider.nextFractionID() + baseFilePath := filepath.Join(fm.lc.provider.config.DataDir, filePath) + return baseFilePath +} + func (fm *FracManager) SealedFractionsSnapshot() []*frac.Sealed { return fm.lc.registry.sealedSnapshot() } @@ -126,8 +133,11 @@ func (fm *FracManager) ClaimForCompaction(names []string) (*CompactionSnapshot, return &CompactionSnapshot{claimed: claimed}, nil } -func (fm *FracManager) SubstituteWithSealed(produced *frac.Sealed, snapshot *CompactionSnapshot) { - fm.lc.registry.substituteWithSealed(produced, snapshot.claimed...) 
+func (fm *FracManager) SubstituteWithSealed(produced *sealed.PreloadedData, snapshot *CompactionSnapshot) { + fm.lc.registry.substituteWithSealed( + fm.lc.provider.NewSealedPreloaded(produced.Info.Path, produced), + snapshot.claimed..., + ) } func (fm *FracManager) AcquireFraction(name string) (frac.Fraction, func(), bool) { diff --git a/fracmanager/fraction_provider.go b/fracmanager/fraction_provider.go index a3609b85..556eb2f1 100644 --- a/fracmanager/fraction_provider.go +++ b/fracmanager/fraction_provider.go @@ -5,6 +5,7 @@ import ( "io" "math/rand" "path/filepath" + "sync" "time" "github.com/oklog/ulid/v2" @@ -37,8 +38,10 @@ type fractionProvider struct { cacheProvider *CacheMaintainer // Cache provider for data access optimization activeIndexer *frac.ActiveIndexer // Indexer for active fractions readLimiter *storage.ReadLimiter // Read rate limiter - ulidEntropy io.Reader // Entropy source for ULID generation skipMaskProvider skipMaskProvider + + mu sync.Mutex + ulidEntropy io.Reader // Entropy source for ULID generation } func newFractionProvider( @@ -113,6 +116,8 @@ func (fp *fractionProvider) NewRemote(ctx context.Context, name string, cachedIn // IMPORTANT: This method is not thread-safe. 
When used in concurrent environments, // external synchronization must be provided to avoid ID collisions func (fp *fractionProvider) nextFractionID() string { + fp.mu.Lock() + defer fp.mu.Unlock() return ulid.MustNew(ulid.Timestamp(time.Now()), fp.ulidEntropy).String() } From e0628ddc45f6f1ed4af6947fe8d28875d9a33673 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Wed, 27 May 2026 12:13:49 +0300 Subject: [PATCH 22/29] feat: first iteration on `planner` --- compaction/planner.go | 244 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 244 insertions(+) create mode 100644 compaction/planner.go diff --git a/compaction/planner.go b/compaction/planner.go new file mode 100644 index 00000000..73b146ce --- /dev/null +++ b/compaction/planner.go @@ -0,0 +1,244 @@ +package compaction + +import ( + "context" + "maps" + "slices" + "sync" + "time" + + "go.uber.org/zap" + + "github.com/alecthomas/units" + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/sealed" + "github.com/ozontech/seq-db/fracmanager" + "github.com/ozontech/seq-db/logger" +) + +const ( + // TODO(dkharms): Move this options to config. + compactionTick = time.Second + compactionWindow = 24 * time.Hour +) + +type task struct { + bin time.Time + filename string + snapshot *fracmanager.CompactionSnapshot + onComplete func(result, error) +} + +type result struct { + filename string + consumed *fracmanager.CompactionSnapshot + produced *sealed.PreloadedData +} + +type planner struct { + wg sync.WaitGroup + ctx context.Context + done chan struct{} + + fm *fracmanager.FracManager + tasks chan task + + mu sync.RWMutex + // inflight tracks active compactions for each time-bin. + // We cannot have concurrent compactions within one time-bin for correctness purposes. 
+ inflight map[time.Time]struct{} + + stats map[time.Time]int +} + +func NewPlanner(ctx context.Context, fm *fracmanager.FracManager) *planner { + p := planner{ + ctx: ctx, + done: make(chan struct{}), + + fm: fm, + + tasks: make(chan task), + + inflight: make(map[time.Time]struct{}), + stats: make(map[time.Time]int), + } + + p.init() + return &p +} + +func (p *planner) init() { + p.wg.Go(func() { + t := time.NewTicker(compactionTick) + + for { + select { + case <-p.ctx.Done(): + close(p.tasks) + return + + case <-p.done: + close(p.tasks) + return + + case <-t.C: + task, ok := p.pick() + if !ok { + continue + } + + select { + case p.tasks <- task: + case <-time.NewTimer(time.Second).C: + // If all executor workers are busy for some long period + // we want to drop the task because it might contain stale decision. + } + } + } + }) +} + +func (p *planner) close() { + close(p.done) +} + +func (p *planner) pick() (task, bool) { + names := func(fracs []frac.Fraction) []string { + fnames := make([]string, len(fracs)) + for i := range fracs { + fnames[i] = fracs[i].Info().Name() + } + return fnames + } + + snapshot := p.fm.SealedFractionsSnapshot() + bins := p.distribute(compactionWindow, snapshot) + times := p.prioritize(bins) + + p.mu.Lock() + defer p.mu.Unlock() + + // NOTE(dkharms): This lock guards [inflight] map. + // Maybe I can find another way to signal from worker that time-bin is free? + + for _, t := range times { + if _, ok := p.inflight[t]; ok { + // There is on-going compaction within this time-bin. + continue + } + + // TODO(dkharms): Move this options to config. + picked := strategySTCS{ + mergeTrigger: 4, + mergeFanIn: 32, + mergeFanOutSize: 128 * uint64(units.MiB), + bucketLowerbound: 0.5, + bucketUpperbound: 1.5, + }.Pick(bins[t].fracs) + + if len(picked) == 0 { + // No candidates were found. 
+ continue + } + + csnapshot, err := p.fm.ClaimForCompaction(names(picked)) + if err != nil { + continue + } + + p.inflight[t] = struct{}{} + + return task{ + bin: t, + + filename: p.fm.FractionName(), + snapshot: csnapshot, + + onComplete: func(r result, err error) { + p.mu.Lock() + defer p.mu.Unlock() + delete(p.inflight, t) + + if err != nil { + logger.Error( + "failed to compact fractions", + zap.Error(err), + zap.Any("snapshot", csnapshot), + ) + return + } + + if r.produced == nil { + logger.Info( + "compaction did not produce fraction", + zap.Any("snapshot", csnapshot), + ) + return + } + + // TODO(dkharms): Is it fine to substitute and delete? + // We need somehow substitute and delete atomically. + p.fm.SubstituteWithSealed(r.produced, csnapshot) + csnapshot.Destroy() + }, + }, true + } + + return task{}, false +} + +type timestampBin struct { + t time.Time + fracs []frac.Fraction +} + +func (p *planner) distribute(window time.Duration, fracs []*frac.Sealed) map[time.Time]timestampBin { + bins := make(map[time.Time]timestampBin) + + for _, f := range fracs { + from, to := f.Info().From.Time(), f.Info().To.Time() + + // Do not handle fractions which have + // too wide date-range. + if to.Sub(from) > window { + continue + } + + bin := from.Truncate(window) + tb := bins[bin] + + tb.t = bin + tb.fracs = append(tb.fracs, f) + + bins[bin] = tb + } + + return bins +} + +func (p *planner) prioritize(bins map[time.Time]timestampBin) []time.Time { + // NOTE(dkharms): What other strategies we can use here? + // (*) Prioritize by change rate; + // (*) Prioritize by amount of fractions; + + ordered := slices.Collect(maps.Keys(bins)) + + // Order timestamp-bins by the change-rate. + // We will prioritize bins with higher change rate. 
+ slices.SortFunc(ordered, func(x, y time.Time) int { + xold, xnew := p.stats[x], len(bins[x].fracs) + yold, ynew := p.stats[y], len(bins[y].fracs) + xchange, ychange := xnew-xold, ynew-yold + + p.stats[x], p.stats[y] = xnew, ynew + + if xchange == ychange { + return -x.Compare(y) + } + + return -(xchange - ychange) + }) + + return ordered +} From 688cb8cc827b4e432a9a09e7ca5857640c4f9226 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Wed, 27 May 2026 12:14:07 +0300 Subject: [PATCH 23/29] feat: first iteration on `Executor` --- compaction/executor.go | 89 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 compaction/executor.go diff --git a/compaction/executor.go b/compaction/executor.go new file mode 100644 index 00000000..faabbc38 --- /dev/null +++ b/compaction/executor.go @@ -0,0 +1,89 @@ +package compaction + +import ( + "sync" + + "go.uber.org/zap" + + "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/logger" +) + +type Executor struct { + workers int + wg sync.WaitGroup + p *planner +} + +// FIXME(dkharms): I need to pass here [common.SealParams]. 
+func NewExecutor(workers int, p *planner) *Executor { + e := Executor{workers: workers, p: p} + e.init() + return &e +} + +func (e *Executor) Close() { + e.p.close() + e.wg.Wait() +} + +func (e *Executor) init() { + for range e.workers { + e.wg.Go(func() { + for t := range e.p.tasks { + logger.Info( + "got new compaction task", + zap.Time("bin", t.bin), + zap.Any("snapshot", t.snapshot), + ) + t.onComplete(e.compact(t)) + } + }) + } +} + +func (e *Executor) compact(t task) (result, error) { + var ( + names []string + srcs []Source + ) + + for _, f := range t.snapshot.Fractions() { + names = append(names, f.Info().Name()) + srcs = append(srcs, frac.NewSealedSource(f)) + } + + logger.Info( + "compacting fractions", + zap.Strings("names", names), + ) + + preloaded, err := Merge(t.filename, common.SealParams{}, srcs...) + return result{filename: t.filename, consumed: t.snapshot, produced: preloaded}, err +} + +type noopexecutor struct{} + +func (noopexecutor) Compact(t task) (result, error) { + var ( + sum int + cnt int + names []string + ) + + for _, f := range t.snapshot.Fractions() { + cnt += 1 + sum += int(f.Info().IndexOnDisk) + names = append(names, f.Info().Name()) + } + + logger.Info( + "picked fractions", + zap.Any("names", names), + zap.Int("size", sum), + zap.Int("count", cnt), + ) + + return result{}, nil +} From 14001ce495deb7e72527d67cb7629597a58cdcb0 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Wed, 27 May 2026 12:14:26 +0300 Subject: [PATCH 24/29] feat: add compaction executor startup --- storeapi/store.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/storeapi/store.go b/storeapi/store.go index dd53079e..c3ee0aca 100644 --- a/storeapi/store.go +++ b/storeapi/store.go @@ -8,6 +8,7 @@ import ( "go.uber.org/atomic" + "github.com/ozontech/seq-db/compaction" "github.com/ozontech/seq-db/consts" "github.com/ozontech/seq-db/fracmanager" "github.com/ozontech/seq-db/logger" @@ -31,6 +32,7 @@ type Store struct { fracManagerStop 
func() SkipMaskManager *skipmaskmanager.SkipMaskManager + Executor *compaction.Executor isStopped atomic.Bool } @@ -66,12 +68,14 @@ func NewStore( } skipMaskManager := skipmaskmanager.New(ctx, c.SkipMaskManagerConfig, skipMaskParams, mappingProvider) - fracManager, stop, err := fracmanager.New(ctx, &c.FracManager, s3cli, skipMaskManager) if err != nil { return nil, fmt.Errorf("loading fractions error: %w", err) } + planner := compaction.NewPlanner(ctx, fracManager) + executor := compaction.NewExecutor(10, planner) + skipMaskManager.Start(fracManager) return &Store{ @@ -82,6 +86,7 @@ func NewStore( FracManager: fracManager, fracManagerStop: stop, SkipMaskManager: skipMaskManager, + Executor: executor, isStopped: atomic.Bool{}, }, nil } @@ -107,6 +112,7 @@ func (s *Store) Stop() { s.grpcServer.Stop(ctx) s.fracManagerStop() s.SkipMaskManager.Stop() + s.Executor.Close() logger.Info("store stopped") } From b7ae92a97252c6419aea35dcf9217b41ea5e1c6b Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Wed, 27 May 2026 12:27:22 +0300 Subject: [PATCH 25/29] refactor: use local `fraction` interface --- compaction/planner.go | 9 +++++++-- compaction/stcs.go | 18 ++++++++---------- compaction/stcs_test.go | 31 ++++--------------------------- 3 files changed, 19 insertions(+), 39 deletions(-) diff --git a/compaction/planner.go b/compaction/planner.go index 73b146ce..c3744608 100644 --- a/compaction/planner.go +++ b/compaction/planner.go @@ -11,11 +11,16 @@ import ( "github.com/alecthomas/units" "github.com/ozontech/seq-db/frac" + "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/fracmanager" "github.com/ozontech/seq-db/logger" ) +type fraction interface { + Info() *common.Info +} + const ( // TODO(dkharms): Move this options to config. 
compactionTick = time.Second @@ -104,7 +109,7 @@ func (p *planner) close() { } func (p *planner) pick() (task, bool) { - names := func(fracs []frac.Fraction) []string { + names := func(fracs []fraction) []string { fnames := make([]string, len(fracs)) for i := range fracs { fnames[i] = fracs[i].Info().Name() @@ -190,7 +195,7 @@ func (p *planner) pick() (task, bool) { type timestampBin struct { t time.Time - fracs []frac.Fraction + fracs []fraction } func (p *planner) distribute(window time.Duration, fracs []*frac.Sealed) map[time.Time]timestampBin { diff --git a/compaction/stcs.go b/compaction/stcs.go index 54d9843f..192951b8 100644 --- a/compaction/stcs.go +++ b/compaction/stcs.go @@ -3,8 +3,6 @@ package compaction import ( "cmp" "slices" - - "github.com/ozontech/seq-db/frac" ) type strategySTCS struct { @@ -23,20 +21,20 @@ type strategySTCS struct { bucketUpperbound float64 } -func (s strategySTCS) Pick(candidates []frac.Fraction) []frac.Fraction { +func (s strategySTCS) Pick(candidates []fraction) []fraction { if len(candidates) < s.mergeTrigger { return nil } sorted := slices.Clone(candidates) - slices.SortFunc(sorted, func(a, b frac.Fraction) int { + slices.SortFunc(sorted, func(a, b fraction) int { return cmp.Compare(a.Info().IndexOnDisk, b.Info().IndexOnDisk) }) buckets := s.group(sorted) // We are interested in buckets with the most amount of fractions. // Usually, these are the lowest tiers where all freshly sealed fractions end up. 
- slices.SortFunc(buckets, func(x, y []frac.Fraction) int { + slices.SortFunc(buckets, func(x, y []fraction) int { return -cmp.Compare(len(x), len(y)) }) @@ -54,11 +52,11 @@ func (s strategySTCS) Pick(candidates []frac.Fraction) []frac.Fraction { return nil } -func (s strategySTCS) group(sorted []frac.Fraction) [][]frac.Fraction { +func (s strategySTCS) group(sorted []fraction) [][]fraction { var ( sum uint64 - current []frac.Fraction - buckets [][]frac.Fraction + current []fraction + buckets [][]fraction ) for _, f := range sorted { @@ -83,7 +81,7 @@ func (s strategySTCS) group(sorted []frac.Fraction) [][]frac.Fraction { } buckets = append(buckets, current) - current = []frac.Fraction{f} + current = []fraction{f} sum = size } @@ -94,7 +92,7 @@ func (s strategySTCS) group(sorted []frac.Fraction) [][]frac.Fraction { return buckets } -func (s strategySTCS) takeUntilSize(fracs []frac.Fraction) []frac.Fraction { +func (s strategySTCS) takeUntilSize(fracs []fraction) []fraction { var picked uint64 for i := range fracs { diff --git a/compaction/stcs_test.go b/compaction/stcs_test.go index b846855a..3e79f6bc 100644 --- a/compaction/stcs_test.go +++ b/compaction/stcs_test.go @@ -1,15 +1,12 @@ package compaction import ( - "context" "math" "testing" - "github.com/ozontech/seq-db/frac" - "github.com/ozontech/seq-db/frac/common" - "github.com/ozontech/seq-db/frac/processor" - "github.com/ozontech/seq-db/seq" "github.com/stretchr/testify/require" + + "github.com/ozontech/seq-db/frac/common" ) type mockFraction struct { @@ -20,28 +17,8 @@ func (m *mockFraction) Info() *common.Info { return &common.Info{IndexOnDisk: m.indexOnDisk} } -func (m *mockFraction) IsIntersecting(seq.MID, seq.MID) bool { - return false -} - -func (m *mockFraction) Contains(seq.MID) bool { - return false -} - -func (m *mockFraction) Fetch(context.Context, []seq.ID) ([][]byte, error) { - return nil, nil -} - -func (m *mockFraction) Search(context.Context, processor.SearchParams) (*seq.QPR, error) { - 
return nil, nil -} - -func (m *mockFraction) FindLIDs(context.Context, []seq.ID) ([]seq.LID, error) { - return nil, nil -} - -func makeFracs(sizes ...uint64) []frac.Fraction { - out := make([]frac.Fraction, len(sizes)) +func makeFracs(sizes ...uint64) []fraction { + out := make([]fraction, len(sizes)) for i, s := range sizes { out[i] = &mockFraction{indexOnDisk: s} } From ec7be15f8ea7430e2cae09d144a4291abcaa4d52 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Wed, 27 May 2026 15:04:16 +0300 Subject: [PATCH 26/29] refactor: move to `frac_test` package --- compaction/executor.go | 30 +---- compaction/metrics.go | 1 + compaction/planner.go | 27 ++--- compaction/stcs_test.go | 2 +- frac/active_indexer_test.go | 9 +- frac/fraction_concurrency_test.go | 21 ++-- frac/fraction_test.go | 193 ++++++++++++++++++++++++------ frac/index_cache.go | 2 +- 8 files changed, 194 insertions(+), 91 deletions(-) create mode 100644 compaction/metrics.go diff --git a/compaction/executor.go b/compaction/executor.go index faabbc38..d3b1a55c 100644 --- a/compaction/executor.go +++ b/compaction/executor.go @@ -7,6 +7,7 @@ import ( "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/common" + "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/logger" ) @@ -43,7 +44,7 @@ func (e *Executor) init() { } } -func (e *Executor) compact(t task) (result, error) { +func (e *Executor) compact(t task) (*sealed.PreloadedData, error) { var ( names []string srcs []Source @@ -60,30 +61,5 @@ func (e *Executor) compact(t task) (result, error) { ) preloaded, err := Merge(t.filename, common.SealParams{}, srcs...) 
- return result{filename: t.filename, consumed: t.snapshot, produced: preloaded}, err -} - -type noopexecutor struct{} - -func (noopexecutor) Compact(t task) (result, error) { - var ( - sum int - cnt int - names []string - ) - - for _, f := range t.snapshot.Fractions() { - cnt += 1 - sum += int(f.Info().IndexOnDisk) - names = append(names, f.Info().Name()) - } - - logger.Info( - "picked fractions", - zap.Any("names", names), - zap.Int("size", sum), - zap.Int("count", cnt), - ) - - return result{}, nil + return preloaded, err } diff --git a/compaction/metrics.go b/compaction/metrics.go new file mode 100644 index 00000000..d1d3cde1 --- /dev/null +++ b/compaction/metrics.go @@ -0,0 +1 @@ +package compaction diff --git a/compaction/planner.go b/compaction/planner.go index c3744608..3d46e282 100644 --- a/compaction/planner.go +++ b/compaction/planner.go @@ -10,7 +10,6 @@ import ( "go.uber.org/zap" "github.com/alecthomas/units" - "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/sealed" "github.com/ozontech/seq-db/fracmanager" @@ -31,13 +30,7 @@ type task struct { bin time.Time filename string snapshot *fracmanager.CompactionSnapshot - onComplete func(result, error) -} - -type result struct { - filename string - consumed *fracmanager.CompactionSnapshot - produced *sealed.PreloadedData + onComplete func(*sealed.PreloadedData, error) } type planner struct { @@ -96,7 +89,7 @@ func (p *planner) init() { select { case p.tasks <- task: case <-time.NewTimer(time.Second).C: - // If all executor workers are busy for some long period + // If all executor workers are busy for some long period of time, // we want to drop the task because it might contain stale decision. 
} } @@ -117,7 +110,13 @@ func (p *planner) pick() (task, bool) { return fnames } - snapshot := p.fm.SealedFractionsSnapshot() + fractions := p.fm.SealedFractionsSnapshot() + snapshot := make([]fraction, len(fractions)) + + for i := range fractions { + snapshot[i] = fractions[i] + } + bins := p.distribute(compactionWindow, snapshot) times := p.prioritize(bins) @@ -160,7 +159,7 @@ func (p *planner) pick() (task, bool) { filename: p.fm.FractionName(), snapshot: csnapshot, - onComplete: func(r result, err error) { + onComplete: func(s *sealed.PreloadedData, err error) { p.mu.Lock() defer p.mu.Unlock() delete(p.inflight, t) @@ -174,7 +173,7 @@ func (p *planner) pick() (task, bool) { return } - if r.produced == nil { + if s == nil { logger.Info( "compaction did not produce fraction", zap.Any("snapshot", csnapshot), @@ -184,7 +183,7 @@ func (p *planner) pick() (task, bool) { // TODO(dkharms): Is it fine to substitute and delete? // We need somehow substitute and delete atomically. - p.fm.SubstituteWithSealed(r.produced, csnapshot) + p.fm.SubstituteWithSealed(s, csnapshot) csnapshot.Destroy() }, }, true @@ -198,7 +197,7 @@ type timestampBin struct { fracs []fraction } -func (p *planner) distribute(window time.Duration, fracs []*frac.Sealed) map[time.Time]timestampBin { +func (p *planner) distribute(window time.Duration, fracs []fraction) map[time.Time]timestampBin { bins := make(map[time.Time]timestampBin) for _, f := range fracs { diff --git a/compaction/stcs_test.go b/compaction/stcs_test.go index 3e79f6bc..5729c971 100644 --- a/compaction/stcs_test.go +++ b/compaction/stcs_test.go @@ -41,7 +41,7 @@ func TestSTCS_Pick(t *testing.T) { }) t.Run("requirement-not-met", func(t *testing.T) { - // Each fraction size is 10x the previous. + // Each Fraction size is 10x the previous. // They land in different buckets and no bucket with [mergeTrigger] fractions exists. 
require.Nil(t, s.Pick(makeFracs(100, 1000, 10000, 100000))) }) diff --git a/frac/active_indexer_test.go b/frac/active_indexer_test.go index a1200a7c..812b2763 100644 --- a/frac/active_indexer_test.go +++ b/frac/active_indexer_test.go @@ -1,4 +1,4 @@ -package frac +package frac_test import ( "bytes" @@ -12,6 +12,7 @@ import ( "go.uber.org/zap/zapcore" "github.com/ozontech/seq-db/cache" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/indexer" "github.com/ozontech/seq-db/logger" "github.com/ozontech/seq-db/metric/stopwatch" @@ -76,20 +77,20 @@ func getTestProcessor() *indexer.Processor { func BenchmarkIndexer(b *testing.B) { logger.SetLevel(zapcore.FatalLevel) - idx, stop := NewActiveIndexer(8, 8) + idx, stop := frac.NewActiveIndexer(8, 8) defer stop() allLogs, err := readFileAllAtOnce(filepath.Join(common.TestDataDir, "k8s.logs")) readers := splitLogsToBulks(allLogs, 1000) assert.NoError(b, err) - active := NewActive( + active := frac.NewActive( filepath.Join(b.TempDir(), "test"), idx, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), - &Config{}, + &frac.Config{}, testSkipMaskProvider{}, ) diff --git a/frac/fraction_concurrency_test.go b/frac/fraction_concurrency_test.go index b37cf2a3..addf376d 100644 --- a/frac/fraction_concurrency_test.go +++ b/frac/fraction_concurrency_test.go @@ -1,4 +1,4 @@ -package frac +package frac_test import ( "fmt" @@ -14,6 +14,7 @@ import ( "golang.org/x/sync/errgroup" "github.com/ozontech/seq-db/cache" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/indexer" @@ -39,16 +40,16 @@ func TestConcurrentAppendAndQuery(t *testing.T) { fracPath := filepath.Join(tmpDir, "test_fraction") defer testcommon.RemoveDir(fracPath) - activeIndexer, stop := NewActiveIndexer(numIndexWorkers, 1000) + activeIndexer, stop := frac.NewActiveIndexer(numIndexWorkers, 1000) defer stop() - active := 
NewActive( + active := frac.NewActive( fracPath, activeIndexer, storage.NewReadLimiter(numReaders/2, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), - &Config{}, + &frac.Config{}, testSkipMaskProvider{}, ) @@ -154,7 +155,7 @@ const ( kafka = "kafka" ) -func readTest(t *testing.T, fraction Fraction, numReaders, numQueries int, docs []*testDoc, fromTime, toTime time.Time, mapping seq.Mapping) { +func readTest(t *testing.T, fraction frac.Fraction, numReaders, numQueries int, docs []*testDoc, fromTime, toTime time.Time, mapping seq.Mapping) { readersGroup, ctx := errgroup.WithContext(t.Context()) type queryFilter func(doc *testDoc) bool @@ -332,7 +333,7 @@ func generatesMessages(numMessages, bulkSize int) ([]*testDoc, [][]string, time. return docs, bulks, fromTime, toTime } -func seal(active *Active) (*Sealed, error) { +func seal(active *frac.Active) (*frac.Sealed, error) { sealParams := common.SealParams{ IDsZstdLevel: 1, LIDsZstdLevel: 1, @@ -342,7 +343,7 @@ func seal(active *Active) (*Sealed, error) { DocBlocksZstdLevel: 1, DocBlockSize: 128 * int(units.KiB), } - activeSealingSource, err := NewActiveSealingSource(active, sealParams) + activeSealingSource, err := frac.NewActiveSealingSource(active, sealParams) if err != nil { return nil, err } @@ -351,13 +352,13 @@ func seal(active *Active) (*Sealed, error) { return nil, err } - sealed := NewSealedPreloaded( + sealed := frac.NewSealedPreloaded( active.BaseFileName, preloaded, storage.NewReadLimiter(1, nil), - newIndexCache(), + frac.NewIndexCache(), cache.NewCache[[]byte](nil, nil), - &Config{}, + &frac.Config{}, testSkipMaskProvider{}, ) diff --git a/frac/fraction_test.go b/frac/fraction_test.go index 0fee4795..b4bbe08a 100644 --- a/frac/fraction_test.go +++ b/frac/fraction_test.go @@ -1,7 +1,9 @@ -package frac +package frac_test import ( "context" + cryptorand "crypto/rand" + "encoding/hex" "fmt" "math" "math/rand/v2" @@ -20,6 +22,8 @@ import ( "github.com/stretchr/testify/suite" 
"github.com/ozontech/seq-db/cache" + "github.com/ozontech/seq-db/compaction" + "github.com/ozontech/seq-db/frac" "github.com/ozontech/seq-db/frac/common" "github.com/ozontech/seq-db/frac/processor" "github.com/ozontech/seq-db/indexer" @@ -42,20 +46,20 @@ func (testSkipMaskProvider) RemoveFrac(_ string) {} type FractionTestSuite struct { suite.Suite tmpDir string - config *Config + config *frac.Config mapping seq.Mapping tokenizers map[seq.TokenizerType]tokenizer.Tokenizer - activeIndexer *ActiveIndexer + activeIndexer *frac.ActiveIndexer stopIndexer func() sealParams common.SealParams - fraction Fraction + fraction frac.Fraction insertDocuments func(docs ...[]string) } func (s *FractionTestSuite) SetupSuiteCommon() { - s.activeIndexer, s.stopIndexer = NewActiveIndexer(4, 10) + s.activeIndexer, s.stopIndexer = frac.NewActiveIndexer(4, 10) } func (s *FractionTestSuite) TearDownSuiteCommon() { @@ -63,7 +67,7 @@ func (s *FractionTestSuite) TearDownSuiteCommon() { } func (s *FractionTestSuite) SetupTestCommon() { - s.config = &Config{} + s.config = &frac.Config{} s.tokenizers = map[seq.TokenizerType]tokenizer.Tokenizer{ seq.TokenizerTypeKeyword: tokenizer.NewKeywordTokenizer(20, false, true), seq.TokenizerTypeText: tokenizer.NewTextTokenizer(20, false, true, 100), @@ -110,6 +114,12 @@ func (s *FractionTestSuite) TearDownTestCommon() { s.NoError(err, "Failed to remove tmp dir") } +func randomHex(n int) string { + b := make([]byte, (n+1)/2) + cryptorand.Read(b) + return hex.EncodeToString(b)[:n] +} + func (s *FractionTestSuite) TestSearchKeyword() { docs := []string{ /*0*/ `{"timestamp":"2000-01-01T13:00:25Z","service":"service_a","message":"first message some text","trace_id":"abcdef","source":"prod01","level":"1"}`, @@ -1789,7 +1799,7 @@ func (s *FractionTestSuite) TestMIDDistribution() { s.insertDocuments(docs) - _, ok := s.fraction.(*Active) + _, ok := s.fraction.(*frac.Active) if ok { s.Require().Nil(s.fraction.Info().Distribution, "active fraction has MID 
distribution") return @@ -1828,15 +1838,15 @@ func (s *FractionTestSuite) TestFractionInfo() { s.Require().Equal(seq.MID(946731654000000000), info.To, "to doesn't match") switch s.fraction.(type) { - case *Active: + case *frac.Active: s.Require().True(info.MetaOnDisk >= uint64(250) && info.MetaOnDisk <= uint64(400), "meta on disk doesn't match. actual value: %d", info.MetaOnDisk) s.Require().Equal(uint64(0), info.IndexOnDisk, "index on disk doesn't match") - case *Sealed: + case *frac.Sealed: s.Require().Equal(uint64(0), info.MetaOnDisk, "meta on disk doesn't match. actual value") s.Require().True(info.IndexOnDisk > uint64(1300) && info.IndexOnDisk < uint64(1400), "index on disk doesn't match. actual value: %d", info.IndexOnDisk) - case *Remote: + case *frac.Remote: s.Require().Equal(uint64(0), info.MetaOnDisk, "meta on disk doesn't match. actual value") s.Require().True(info.IndexOnDisk > uint64(1300) && info.IndexOnDisk < uint64(1400), "index on disk doesn't match. actual value: %d", info.IndexOnDisk) @@ -2035,9 +2045,10 @@ func (s *FractionTestSuite) AssertHist( } } -func (s *FractionTestSuite) newActive(bulks ...[]string) *Active { - baseName := filepath.Join(s.tmpDir, "test_fraction") - active := NewActive( +func (s *FractionTestSuite) newActive(bulks ...[]string) *frac.Active { + baseName := filepath.Join(s.tmpDir, randomHex(12)) + + active := frac.NewActive( baseName, s.activeIndexer, storage.NewReadLimiter(1, nil), @@ -2081,20 +2092,20 @@ func (s *FractionTestSuite) newActive(bulks ...[]string) *Active { return active } -func (s *FractionTestSuite) newSealed(bulks ...[]string) *Sealed { +func (s *FractionTestSuite) newSealed(bulks ...[]string) *frac.Sealed { active := s.newActive(bulks...) 
- activeSealingSource, err := NewActiveSealingSource(active, s.sealParams) + activeSealingSource, err := frac.NewActiveSealingSource(active, s.sealParams) s.Require().NoError(err, "Sealing source creation failed") preloaded, err := sealing.Seal(activeSealingSource, s.sealParams) s.Require().NoError(err, "Sealing failed") - sealed := NewSealedPreloaded( + sealed := frac.NewSealedPreloaded( active.BaseFileName, preloaded, storage.NewReadLimiter(1, nil), - newIndexCache(), + frac.NewIndexCache(), cache.NewCache[[]byte](nil, nil), s.config, testSkipMaskProvider{}, @@ -2127,7 +2138,7 @@ func (s *ActiveFractionTestSuite) SetupTest() { } func (s *ActiveFractionTestSuite) TearDownTest() { - if active, ok := s.fraction.(*Active); ok { + if active, ok := s.fraction.(*frac.Active); ok { active.Release() } else { s.Require().Nil(s.fraction, "fraction is not of Active type") @@ -2145,7 +2156,7 @@ ActiveReplayedFractionTestSuite run tests for active fraction which was replayed */ type ActiveReplayedFractionTestSuite struct { FractionTestSuite - originalFrac *Active + originalFrac *frac.Active } func (s *ActiveReplayedFractionTestSuite) SetupSuite() { @@ -2166,26 +2177,29 @@ func (s *ActiveReplayedFractionTestSuite) SetupTest() { } } -func (s *ActiveReplayedFractionTestSuite) Replay(frac *Active) Fraction { - fracFileName := frac.BaseFileName - s.originalFrac = frac - replayedFrac := NewActive( +func (s *ActiveReplayedFractionTestSuite) Replay(f *frac.Active) frac.Fraction { + s.originalFrac = f + fracFileName := f.BaseFileName + + replayedFrac := frac.NewActive( fracFileName, s.activeIndexer, storage.NewReadLimiter(1, nil), cache.NewCache[[]byte](nil, nil), cache.NewCache[[]byte](nil, nil), - &Config{}, + &frac.Config{}, testSkipMaskProvider{}, ) + err := replayedFrac.Replay(context.Background()) s.Require().NoError(err, "replay failed") + return replayedFrac } func (s *ActiveReplayedFractionTestSuite) TearDownTest() { s.originalFrac.Release() - if active, ok := 
s.fraction.(*Active); ok { + if active, ok := s.fraction.(*frac.Active); ok { active.Release() } else { s.Require().Nil(s.fraction, "fraction is not of Active type") @@ -2220,7 +2234,7 @@ func (s *SealedFractionTestSuite) SetupTest() { } func (s *SealedFractionTestSuite) TearDownTest() { - if sealed, ok := s.fraction.(*Sealed); ok { + if sealed, ok := s.fraction.(*frac.Sealed); ok { sealed.Release() } else { s.Require().Nil(s.fraction, "fraction is not of Sealed type") @@ -2256,7 +2270,7 @@ func (s *SealedLoadedFractionTestSuite) SetupTest() { } func (s *SealedLoadedFractionTestSuite) TearDownTest() { - if sealed, ok := s.fraction.(*Sealed); ok { + if sealed, ok := s.fraction.(*frac.Sealed); ok { sealed.Release() } else { s.Require().Nil(s.fraction, "fraction is not of Sealed type") @@ -2268,14 +2282,14 @@ func (s *SealedLoadedFractionTestSuite) TearDownSuite() { s.TearDownSuiteCommon() } -func (s *SealedLoadedFractionTestSuite) newSealedLoaded(bulks ...[]string) *Sealed { +func (s *SealedLoadedFractionTestSuite) newSealedLoaded(bulks ...[]string) *frac.Sealed { sealed := s.newSealed(bulks...) 
sealed.Release() - sealed = NewSealed( + sealed = frac.NewSealed( sealed.BaseFileName, storage.NewReadLimiter(1, nil), - newIndexCache(), + frac.NewIndexCache(), cache.NewCache[[]byte](nil, nil), nil, s.config, @@ -2332,13 +2346,13 @@ func (s *RemoteFractionTestSuite) SetupTest() { s.Require().NoError(err, "offload failed") s.Require().True(offloaded, "didn't offload frac") - remoteFrac := NewRemote( + remoteFrac := frac.NewRemote( context.Background(), sealed.BaseFileName, storage.NewReadLimiter(1, nil), - newIndexCache(), + frac.NewIndexCache(), cache.NewCache[[]byte](nil, nil), - sealed.info, + sealed.Info(), s.config, s3cli, testSkipMaskProvider{}, @@ -2350,7 +2364,7 @@ func (s *RemoteFractionTestSuite) SetupTest() { } func (s *RemoteFractionTestSuite) TearDownTest() { - if remote, ok := s.fraction.(*Remote); ok { + if remote, ok := s.fraction.(*frac.Remote); ok { remote.Suicide() } else { s.Require().Nil(s.fraction, "fraction is not of Remote type") @@ -2364,6 +2378,113 @@ func (s *RemoteFractionTestSuite) TearDownSuite() { s.s3server.Close() } +type CompactedFractionTestSuite struct { + FractionTestSuite +} + +func (s *CompactedFractionTestSuite) SetupSuite() { + s.SetupSuiteCommon() +} + +func (s *CompactedFractionTestSuite) SetupTest() { + s.SetupTestCommon() + + s.insertDocuments = func(bulks ...[]string) { + if s.fraction != nil { + s.Require().Fail("can insert docs only once") + } + s.fraction = s.newCompacted(bulks...) + } +} + +func (s *CompactedFractionTestSuite) TearDownTest() { + if sealed, ok := s.fraction.(*frac.Sealed); ok { + sealed.Release() + } else { + s.Require().Nil(s.fraction, "fraction is not of Sealed type") + } + s.TearDownTestCommon() +} + +func (s *CompactedFractionTestSuite) TearDownSuite() { + s.TearDownSuiteCommon() +} + +// newCompacted flattens all bulks into one doc list, re-chunks it into bulks of max(len(docs)/32, 1) docs, +// seals each chunk as a separate fraction, and folds them into one fraction with repeated compaction.Merge calls.
+func (s *CompactedFractionTestSuite) newCompacted(bulks ...[]string) *frac.Sealed { + // Flatten all documents because we are going to reorganize it. + var docs []string + for _, b := range bulks { + docs = append(docs, b...) + } + + var ( + reorganized [][]string + bulkSize = max(len(docs)/32, 1) + ) + + for i := 0; i < len(docs); i += bulkSize { + reorganized = append( + reorganized, + docs[i:min(i+bulkSize, len(docs))], + ) + } + + merged := s.newSealed(reorganized[0]) + for i, bulk := range reorganized[1:] { + current := s.newSealed(bulk) + + mergedBase := filepath.Join( + s.tmpDir, + fmt.Sprintf("merged-%d", i), + ) + + preloaded, err := compaction.Merge( + mergedBase, s.sealParams, + frac.NewSealedSource(merged), + frac.NewSealedSource(current), + ) + + s.Require().NoError(err) + merged = frac.NewSealedPreloaded( + mergedBase, + preloaded, + storage.NewReadLimiter(1, nil), + frac.NewIndexCache(), + cache.NewCache[[]byte](nil, nil), + s.config, + testSkipMaskProvider{}, + ) + } + + return merged +} + +// TestFractionInfo overrides the base test because DocsOnDisk is larger in a +// merged fraction (sum of two source docs files) and MIDsDistribution is not +// populated by compaction.Merge. 
+func (s *CompactedFractionTestSuite) TestFractionInfo() { + docs := []string{ + `{"timestamp":"2000-01-01T13:00:25Z","service":"service_a","message":"first message some text", "container":"gateway"}`, + `{"timestamp":"2000-01-01T13:00:32Z","service":"service_b","message":"second message other text", "container":"kube-proxy"}`, + `{"timestamp":"2000-01-01T13:00:43Z","service":"service_c","message":"third message other text", "container":"gateway"}`, + `{"timestamp":"2000-01-01T13:00:53Z","service":"service_a","message":"fourth message some text", "container":"kube-proxy"}`, + `{"timestamp":"2000-01-01T13:00:54Z","service":"service_c","message":"apple","container":"kube-scheduler"}`, + } + + s.insertDocuments(docs) + + info := s.fraction.Info() + + s.Require().Equal(uint32(5), info.DocsTotal, "doc total doesn't match") + s.Require().Equal(uint64(583), info.DocsRaw, "doc raw doesn't match") + s.Require().Equal(seq.MID(946731625000000000), info.From, "from doesn't match") + s.Require().Equal(seq.MID(946731654000000000), info.To, "to doesn't match") + s.Require().Equal(uint64(0), info.MetaOnDisk, "meta on disk doesn't match") + s.Require().True(info.IndexOnDisk > 0, "index on disk should be non-zero") +} + func TestActiveFractionTestSuite(t *testing.T) { suite.Run(t, new(ActiveFractionTestSuite)) } @@ -2383,3 +2504,7 @@ func TestSealedLoadedFractionTestSuite(t *testing.T) { func TestRemoteFractionTestSuite(t *testing.T) { suite.Run(t, new(RemoteFractionTestSuite)) } + +func TestCompactedFractionTestSuite(t *testing.T) { + suite.Run(t, new(CompactedFractionTestSuite)) +} diff --git a/frac/index_cache.go b/frac/index_cache.go index 043e8c5c..f270f209 100644 --- a/frac/index_cache.go +++ b/frac/index_cache.go @@ -7,7 +7,7 @@ import ( "github.com/ozontech/seq-db/frac/sealed/token" ) -func newIndexCache() *IndexCache { +func NewIndexCache() *IndexCache { return &IndexCache{ LegacyRegistry: cache.NewCache[[]byte](nil, nil), From 65b228e19d85397cc9e49772998aff27ccf8f988 Mon 
Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Thu, 7 May 2026 15:01:10 +0300 Subject: [PATCH 27/29] feat: rotate fraction based on in-memory size --- Makefile | 2 +- frac/active.go | 9 ++++++ frac/active_docs_positions.go | 11 ++++++++ frac/active_ids.go | 7 +++++ frac/active_lids.go | 12 ++++++++ frac/active_lids_map.go | 11 ++++++++ ...ive_sealing_source.go => active_source.go} | 28 +++++++++---------- frac/active_token_list.go | 27 ++++++++++++++++++ fracmanager/fraction_registry.go | 2 +- 9 files changed, 93 insertions(+), 16 deletions(-) rename frac/{active_sealing_source.go => active_source.go} (91%) diff --git a/Makefile b/Makefile index 9a90289f..1d8c034d 100644 --- a/Makefile +++ b/Makefile @@ -56,7 +56,7 @@ test-deps: .PHONY: test test: test-deps - LOG_LEVEL=ERROR go test ./... -count 1 + LOG_LEVEL=ERROR go test ./... -count 1 -v .bin-deps: export GOBIN := $(LOCAL_BIN) .bin-deps: diff --git a/frac/active.go b/frac/active.go index 75d04e16..f8e89c99 100644 --- a/frac/active.go +++ b/frac/active.go @@ -446,6 +446,15 @@ func (f *Active) createDataProvider(ctx context.Context) *activeDataProvider { } } +func (f *Active) MemSize() int { + return f.MIDs.Size() + + f.RIDs.Size() + + f.DocBlocks.Size() + + f.DocsPositions.Size() + + f.IDsToLIDs.Size() + + f.TokenList.Size() +} + func (f *Active) Info() *common.Info { f.infoMu.RLock() defer f.infoMu.RUnlock() diff --git a/frac/active_docs_positions.go b/frac/active_docs_positions.go index f058091f..618ca910 100644 --- a/frac/active_docs_positions.go +++ b/frac/active_docs_positions.go @@ -2,6 +2,7 @@ package frac import ( "sync" + "unsafe" "github.com/ozontech/seq-db/seq" ) @@ -32,6 +33,16 @@ func (dp *DocsPositions) GetSync(id seq.ID) seq.DocPos { return dp.Get(id) } +func (dp *DocsPositions) Size() int { + dp.mu.RLock() + defer dp.mu.RUnlock() + + const entrySize = int(unsafe.Sizeof(seq.ID{})) + + int(unsafe.Sizeof(seq.DocPos(0))) + + return len(dp.idToPos) * entrySize +} + // SetMultiple returns a slice of 
added ids func (dp *DocsPositions) SetMultiple(ids []seq.ID, pos []seq.DocPos) []seq.ID { dp.mu.Lock() diff --git a/frac/active_ids.go b/frac/active_ids.go index 1195c8fa..cab87083 100644 --- a/frac/active_ids.go +++ b/frac/active_ids.go @@ -2,6 +2,7 @@ package frac import ( "sync" + "unsafe" ) type UInt64s struct { @@ -47,3 +48,9 @@ func (l *UInt64s) Append(val uint64) uint32 { return l.append(val) } + +func (l *UInt64s) Size() int { + l.mu.RLock() + defer l.mu.RUnlock() + return len(l.vals) * int(unsafe.Sizeof(int64(0))) +} diff --git a/frac/active_lids.go b/frac/active_lids.go index 236136ef..41970d45 100644 --- a/frac/active_lids.go +++ b/frac/active_lids.go @@ -135,6 +135,18 @@ func mergeSorted(right, left []uint32, mids, rids []uint64) []uint32 { return result } +func (tl *TokenLIDs) Size() int { + tl.sortedMu.Lock() + sortedLen := len(tl.sorted) + tl.sortedMu.Unlock() + + tl.queueMu.Lock() + queueLen := len(tl.queue) + tl.queueMu.Unlock() + + return (sortedLen + queueLen) * 4 +} + func (tl *TokenLIDs) PutLIDsInQueue(lids []uint32) int { tl.queueMu.Lock() defer tl.queueMu.Unlock() diff --git a/frac/active_lids_map.go b/frac/active_lids_map.go index bae64854..418fb82c 100644 --- a/frac/active_lids_map.go +++ b/frac/active_lids_map.go @@ -2,6 +2,7 @@ package frac import ( "sync" + "unsafe" "github.com/ozontech/seq-db/seq" ) @@ -27,6 +28,16 @@ func (al *ActiveLIDs) Get(id seq.ID) (seq.LID, bool) { return val, ok } +func (al *ActiveLIDs) Size() int { + al.mu.RLock() + defer al.mu.RUnlock() + + const entrySize = int(unsafe.Sizeof(seq.ID{})) + + int(unsafe.Sizeof(seq.LID(0))) + + return len(al.idToLid) * entrySize +} + func (al *ActiveLIDs) SetMultiple(ids []seq.ID, lids []uint32) { al.mu.Lock() defer al.mu.Unlock() diff --git a/frac/active_sealing_source.go b/frac/active_source.go similarity index 91% rename from frac/active_sealing_source.go rename to frac/active_source.go index e7c451e2..af7084b0 100644 --- a/frac/active_sealing_source.go +++ 
b/frac/active_source.go @@ -30,7 +30,7 @@ type ( IndexedDocBlock = util.Pair[[]byte, []seq.DocPos] ) -type ActiveSealingSource struct { +type ActiveSource struct { params common.SealParams // Sealing parameters info *common.Info // fraction Info @@ -55,13 +55,13 @@ type ActiveSealingSource struct { docsReader *storage.DocsReader // Document storage reader } -func NewActiveSealingSource(active *Active, params common.SealParams) (*ActiveSealingSource, error) { +func NewActiveSealingSource(active *Active, params common.SealParams) (*ActiveSource, error) { info := *active.info // copy sortedLIDs := active.GetAllDocuments() fields, fieldTIDs := sortFields(active.TokenList) - src := ActiveSealingSource{ + src := ActiveSource{ params: params, info: &info, @@ -116,7 +116,7 @@ func sortFields(tl *TokenList) ([]string, [][]uint32) { return fields, fieldTIDs } -func (src *ActiveSealingSource) ID() iter.Seq2[DocLocation, error] { +func (src *ActiveSource) ID() iter.Seq2[DocLocation, error] { return func(yield func(DocLocation, error) bool) { mids := src.mids.vals rids := src.rids.vals @@ -155,11 +155,11 @@ func (src *ActiveSealingSource) ID() iter.Seq2[DocLocation, error] { } } -func (src *ActiveSealingSource) BlockOffsets() []uint64 { +func (src *ActiveSource) BlockOffsets() []uint64 { return src.blocksOffsets } -func (src *ActiveSealingSource) prepareInfo() { +func (src *ActiveSource) prepareInfo() { src.info.MetaOnDisk = 0 src.info.SealingTime = uint64(src.created.UnixMilli()) mids := src.mids.vals @@ -170,17 +170,17 @@ func (src *ActiveSealingSource) prepareInfo() { src.info.BuildDistribution(mids) } -func (src *ActiveSealingSource) prepareLids() { +func (src *ActiveSource) prepareLids() { for _, tl := range src.lids[1:] { tl.GetLIDs(src.mids, src.rids) } } -func (src *ActiveSealingSource) Info() *common.Info { +func (src *ActiveSource) Info() *common.Info { return src.info } -func (src *ActiveSealingSource) TokenTriplet() iter.Seq2[string, iter.Seq2[TokenPosting, error]] 
{ +func (src *ActiveSource) TokenTriplet() iter.Seq2[string, iter.Seq2[TokenPosting, error]] { return func(yield func(string, iter.Seq2[TokenPosting, error]) bool) { for idx, field := range src.fields { if !yield(field, src.postingsForField(field, idx)) { @@ -190,7 +190,7 @@ func (src *ActiveSealingSource) TokenTriplet() iter.Seq2[string, iter.Seq2[Token } } -func (src *ActiveSealingSource) postingsForField(field string, idx int) iter.Seq2[TokenPosting, error] { +func (src *ActiveSource) postingsForField(field string, idx int) iter.Seq2[TokenPosting, error] { var lidsbuf []uint32 return func(yield func(TokenPosting, error) bool) { for _, tid := range src.fieldTIDs[idx] { @@ -221,7 +221,7 @@ func makeInverser(sortedLIDs []uint32) []uint32 { // Docs returns an iterator for documents with their IDs. // Handles duplicate IDs (for nested indexes). -func (src *ActiveSealingSource) Docs() iter.Seq2[Document, error] { +func (src *ActiveSource) Docs() iter.Seq2[Document, error] { return func(yield func(Document, error) bool) { var ( curdoc []byte @@ -256,7 +256,7 @@ func (src *ActiveSealingSource) Docs() iter.Seq2[Document, error] { } // doc reads a document from storage by its position. -func (src *ActiveSealingSource) doc(pos seq.DocPos) ([]byte, error) { +func (src *ActiveSource) doc(pos seq.DocPos) ([]byte, error) { blockIndex, docOffset := pos.Unpack() blockOffset := src.blocksOffsets[blockIndex] @@ -277,7 +277,7 @@ func (src *ActiveSealingSource) doc(pos seq.DocPos) ([]byte, error) { // SortDocs sorts documents and writes them in compressed form to disk. // Creates a temporary file that is then renamed to the final one. -func (src *ActiveSealingSource) SortDocs() error { +func (src *ActiveSource) SortDocs() error { start := time.Now() logger.Info("sorting docs...") @@ -346,7 +346,7 @@ func (src *ActiveSealingSource) SortDocs() error { // writeDocs compresses and writes document blocks, calculating new offsets // and collecting document positions. 
-func (src *ActiveSealingSource) writeDocs(blocks iter.Seq2[IndexedDocBlock, error], w io.Writer) ([]uint64, []seq.DocPos, error) { +func (src *ActiveSource) writeDocs(blocks iter.Seq2[IndexedDocBlock, error], w io.Writer) ([]uint64, []seq.DocPos, error) { offset := 0 buf := make([]byte, 0) blocksOffsets := make([]uint64, 0) diff --git a/frac/active_token_list.go b/frac/active_token_list.go index adf94ffd..77c85d02 100644 --- a/frac/active_token_list.go +++ b/frac/active_token_list.go @@ -5,6 +5,7 @@ import ( "fmt" "hash/crc32" "sync" + "unsafe" "github.com/ozontech/seq-db/seq" @@ -91,6 +92,32 @@ func NewActiveTokenList(workers int) *TokenList { return tl } + +func (tl *TokenList) Size() int { + size := 0 + + tl.tidMu.RLock() + for _, val := range tl.tidToVal { + size += len(val) + } + + for _, lids := range tl.tidToLIDs { + if lids != nil { + size += lids.Size() + } + } + tl.tidMu.RUnlock() + + tl.fieldsMu.RLock() + for field, tids := range tl.FieldTIDs { + size += len(field) + + len(tids)*int(unsafe.Sizeof(uint32(0))) + } + tl.fieldsMu.RUnlock() + + return size +} + func (tl *TokenList) Stop() { for _, c := range tl.chList { close(c) diff --git a/fracmanager/fraction_registry.go b/fracmanager/fraction_registry.go index c0112383..be77be20 100644 --- a/fracmanager/fraction_registry.go +++ b/fracmanager/fraction_registry.go @@ -148,7 +148,7 @@ func (r *fractionRegistry) rotateIfFull(maxSize uint64, ap activeProvider) (*ref r.mu.Lock() defer r.mu.Unlock() - if r.sappender.Info().DocsOnDisk <= maxSize { + if uint64(r.sappender.MemSize()) <= maxSize { return nil, nil, nil } From d33fd8099f27b5a2fe61b555f87f9c2861aae5b9 Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Wed, 27 May 2026 18:09:45 +0300 Subject: [PATCH 28/29] chore: frac-size=1MiB --- .seqbench/comparison.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.seqbench/comparison.env b/.seqbench/comparison.env index ee12dfcb..a43f6254 100644 --- a/.seqbench/comparison.env +++ 
b/.seqbench/comparison.env @@ -1,6 +1,6 @@ GOGC=100 -SEQDB_STORAGE_FRAC_SIZE=16MiB +SEQDB_STORAGE_FRAC_SIZE=1MiB SEQDB_STORAGE_TOTAL_SIZE=10GiB SEQDB_LIMITS_QUERY_RATE=1024 From fdde91dc4616c5173cc769e3d3d66159b06c83ee Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Wed, 27 May 2026 18:41:35 +0300 Subject: [PATCH 29/29] chore: group by creation time --- compaction/planner.go | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/compaction/planner.go b/compaction/planner.go index 3d46e282..0d41ad7f 100644 --- a/compaction/planner.go +++ b/compaction/planner.go @@ -23,7 +23,7 @@ type fraction interface { const ( // TODO(dkharms): Move this options to config. compactionTick = time.Second - compactionWindow = 24 * time.Hour + compactionWindow = time.Minute ) type task struct { @@ -201,15 +201,8 @@ func (p *planner) distribute(window time.Duration, fracs []fraction) map[time.Ti bins := make(map[time.Time]timestampBin) for _, f := range fracs { - from, to := f.Info().From.Time(), f.Info().To.Time() - - // Do not handle fractions which have - // too wide date-range. - if to.Sub(from) > window { - continue - } - - bin := from.Truncate(window) + ct := time.UnixMilli(int64(f.Info().CreationTime)) + bin := ct.Truncate(window) tb := bins[bin] tb.t = bin