Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
77 commits
Select commit Hold shift + click to select a range
c9dc5d3
feat: implement filter in cdc
vikaxsh Jan 14, 2026
b3195f8
feat: add log for flatten record count
vikaxsh Jan 15, 2026
9cdb42a
fix: github workflows
vikaxsh Jan 15, 2026
06be111
Merge branch 'staging' of https://github.com/datazip-inc/olake into f…
vikaxsh Jan 15, 2026
7be9043
fix: lint issue
vikaxsh Jan 15, 2026
485cf52
fix: lint issue
vikaxsh Jan 15, 2026
4bc38e9
feat: filter only for cdc and backfill
vikaxsh Jan 16, 2026
adb97a5
Merge branch 'staging' of https://github.com/datazip-inc/olake into f…
vikaxsh Jan 16, 2026
1dab2d1
fix: integration test for incremental
vikaxsh Jan 16, 2026
69dee47
Merge branch 'staging' of https://github.com/datazip-inc/olake into f…
vikaxsh Jan 19, 2026
4d681ac
feat: add filter in full load for specific drivers
vikaxsh Jan 19, 2026
ab0848f
feat: add unit test for filter
vikaxsh Jan 19, 2026
d12ec1b
fix: add mongo integration test
vikaxsh Jan 20, 2026
b163ac6
fix: skip filtering of delete records
vikaxsh Jan 20, 2026
855eae0
fix: add mysql, postgres, oracle integration test
vikaxsh Jan 20, 2026
4ec7731
fix: integration test
vikaxsh Jan 20, 2026
4a974c4
Merge branch 'staging' of https://github.com/datazip-inc/olake into f…
vikaxsh Jan 20, 2026
94f9d89
fix: integration test
vikaxsh Jan 20, 2026
0f1f35c
fix: integration test for oracle
vikaxsh Jan 20, 2026
7da5ccb
fix: integration test for oracle
vikaxsh Jan 20, 2026
cfacf76
fix: integration test for oracle
vikaxsh Jan 20, 2026
57514bc
chore: cleanup
vikaxsh Jan 21, 2026
6d7ed0b
Merge branch 'staging' of https://github.com/datazip-inc/olake into f…
vikaxsh Jan 21, 2026
5af80db
fix: lint issue
vikaxsh Jan 21, 2026
91ba0f2
Merge branch 'staging' into feat/filter-in-cdc&incremental
vaibhav-datazip Jan 23, 2026
ea4939e
Merge branch 'staging' of https://github.com/datazip-inc/olake into f…
vikaxsh Jan 27, 2026
52e3ff0
chore: resolve review comments
vikaxsh Jan 27, 2026
98da6ab
chore: improve comments
vikaxsh Jan 27, 2026
a8e88d3
fix: oracle integration test
vikaxsh Jan 27, 2026
3c497bd
Merge branch 'staging' into feat/filter-in-cdc&incremental
vikaxsh Jan 27, 2026
df4cf0f
Merge branch 'staging' into feat/filter-in-cdc&incremental
vaibhav-datazip Jan 28, 2026
4a3cba1
Merge branch 'staging' into feat/filter-in-cdc&incremental
vishalm0509 Jan 29, 2026
6e200d2
chore: add unit test for GetFilter
vikaxsh Jan 29, 2026
26d8311
Merge branch 'feat/filter-in-cdc&incremental' of https://github.com/d…
vikaxsh Jan 29, 2026
680b680
fix: db2 integration test
vikaxsh Jan 29, 2026
cab6da9
Merge branch 'staging' into feat/filter-in-cdc&incremental
vikaxsh Jan 29, 2026
dd771cf
Merge branch 'staging' into feat/filter-in-cdc&incremental
vishalm0509 Feb 3, 2026
db5da83
Merge branch 'staging' into feat/filter-in-cdc&incremental
vishalm0509 Feb 3, 2026
fb6e239
chore: add test for delete op in record
vikaxsh Feb 4, 2026
af45b34
chore: add apply filter bool
vikaxsh Feb 4, 2026
a550bef
Merge branch 'staging' of https://github.com/datazip-inc/olake into f…
vikaxsh Feb 11, 2026
8c9c426
fix: merge staging
vikaxsh Feb 11, 2026
675c390
fix: rm debug log
vikaxsh Feb 11, 2026
6d84188
Merge branch 'staging' of https://github.com/datazip-inc/olake into f…
vikaxsh Feb 16, 2026
496e61e
feat: added an todo
vikaxsh Feb 16, 2026
fe91a6e
feat: maintain backward compatibility
vikaxsh Feb 16, 2026
bf7ff02
chore: move filter in typeutils pkg
vikaxsh Feb 20, 2026
909b773
chore: keep legacy filter code in mongo backfill
vikaxsh Feb 22, 2026
8d5b92d
Merge branch 'staging' of https://github.com/datazip-inc/olake into f…
vikaxsh Feb 22, 2026
498c136
fix: unit test for filter
vikaxsh Feb 22, 2026
2cc51da
fix: add filter value parsing in filter
vikaxsh Feb 23, 2026
db18902
fix: add filter unit tests again
vikaxsh Feb 25, 2026
8b16aa0
fix: lint issue
vikaxsh Feb 25, 2026
da7cb54
chore: change name to filter config
vikaxsh Feb 25, 2026
f84574f
fix: mongo backfill filter
vikaxsh Feb 25, 2026
c9d1eb3
fix: mongo incr integration test
vikaxsh Feb 25, 2026
d6b39cf
chore: clear destination in change of filter
vikaxsh Feb 26, 2026
bae5499
fix: unit test
vikaxsh Feb 26, 2026
ca51f9d
fix: delete integration test
vikaxsh Feb 26, 2026
fe3795b
fix: delete integration test for parquet
vikaxsh Feb 26, 2026
53992f4
fix: delete integration test for parquet
vikaxsh Feb 26, 2026
2ff69f8
Merge branch 'staging' into feat/filter-in-cdc&incremental
vaibhav-datazip Feb 27, 2026
88e764a
Merge branch 'staging' into feat/filter-in-cdc&incremental
vikaxsh Mar 4, 2026
0db60b1
fix: reject invalid timestamp filter values instead of silently using…
vikaxsh Mar 10, 2026
fc6b34b
Merge branch 'feat/filter-in-cdc&incremental' of https://github.com/d…
vikaxsh Mar 10, 2026
5b9391d
Merge branch 'staging' of https://github.com/datazip-inc/olake into f…
vikaxsh Mar 12, 2026
1e4ac95
chore: merge staging
vikaxsh Mar 12, 2026
4c6926d
fix: integration test
vikaxsh Mar 12, 2026
0c59ec9
fix: add db2 integration test
vikaxsh Mar 13, 2026
a885bb3
Merge branch 'staging' of https://github.com/datazip-inc/olake into f…
vikaxsh Mar 16, 2026
5ca6f0b
fix: disable new filter in case of normalization false
vikaxsh Mar 16, 2026
4b9ccbf
chore: add filter condition validation
vikaxsh Mar 16, 2026
c3813b5
Merge branch 'staging' into feat/filter-in-cdc&incremental
vikaxsh Mar 16, 2026
bcf630d
fix: remove race condition
vikaxsh Mar 16, 2026
b724818
Merge branch 'feat/filter-in-cdc&incremental' of https://github.com/d…
vikaxsh Mar 16, 2026
905e042
Merge branch 'staging' into feat/filter-in-cdc&incremental
shubham19may Mar 18, 2026
107a1c8
Merge branch 'staging' into feat/filter-in-cdc&incremental
hash-data Mar 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/integration-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
integration-tests:
environment: integration_tests
runs-on: 32gb-runner
timeout-minutes: 30
timeout-minutes: 45
steps:
- name: Checkout code
uses: actions/checkout@v3
Expand Down
2 changes: 2 additions & 0 deletions constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ const (
MSSQL DriverType = "mssql"
)

// Drivers where filters are applied in memory after full refresh data is read.
var FullRefreshPostReadFilterDrivers = []DriverType{S3, Kafka}

// Relational (SQL-based) database driver types.
var RelationalDrivers = []DriverType{Postgres, MySQL, Oracle, DB2, MSSQL}

// NOTE(review): presumably the drivers whose CDC is run in parallel — confirm against the CDC path that reads this list.
var ParallelCDCDrivers = []DriverType{MongoDB, MSSQL}
Expand Down
14 changes: 13 additions & 1 deletion destination/iceberg/iceberg.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,19 @@ func (i *Iceberg) FlattenAndCleanData(ctx context.Context, records []types.RawRe
return false, nil, nil, fmt.Errorf("failed to extract schema from records: %s", err)
}

return schemaDifference, records, recordsSchema, err
filter, isLegacy, err := i.stream.GetFilter()
if err != nil {
return false, nil, nil, fmt.Errorf("failed to parse stream filter: %s", err)
}

if i.options.ApplyFilter {
records, err = typeutils.FilterRecords(ctx, records, filter, isLegacy, recordsSchema)
if err != nil {
return false, nil, nil, fmt.Errorf("failed to filter records: %s", err)
}
}

return schemaDifference, records, recordsSchema, nil
Comment thread
vaibhav-datazip marked this conversation as resolved.
}

// compares with global schema and update schema in destination accordingly
Expand Down
17 changes: 15 additions & 2 deletions destination/parquet/parquet.go
Original file line number Diff line number Diff line change
Expand Up @@ -403,9 +403,22 @@ func (p *Parquet) FlattenAndCleanData(ctx context.Context, records []types.RawRe
}
}

return schemaChange, records, p.schema, utils.Concurrent(ctx, records, runtime.GOMAXPROCS(0)*16, func(_ context.Context, record types.RawRecord, _ int) error {
if err := utils.Concurrent(ctx, records, runtime.GOMAXPROCS(0)*16, func(_ context.Context, record types.RawRecord, _ int) error {
return typeutils.ReformatRecord(p.schema, record.Data)
})
}); err != nil {
return false, nil, nil, fmt.Errorf("failed to reformat records: %s", err)
}
filter, isLegacy, err := p.stream.GetFilter()
if err != nil {
return false, nil, nil, fmt.Errorf("failed to parse stream filter: %s", err)
}
if p.options.ApplyFilter {
records, err = typeutils.FilterRecords(ctx, records, filter, isLegacy, p.schema)
if err != nil {
return false, nil, nil, fmt.Errorf("failed to filter records: %s", err)
}
}
return schemaChange, records, p.schema, nil
}

// EvolveSchema updates the schema based on changes. Need to pass olakeTimestamp to get the correct partition path based on record ingestion time.
Expand Down
20 changes: 14 additions & 6 deletions destination/writers.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ type (
WriterOption func(Writer) error

Options struct {
Identifier string
Number int64
Backfill bool
ThreadID string
Identifier string
Number int64
Backfill bool
ThreadID string
ApplyFilter bool
}

ThreadOptions func(opt *Options)
Expand All @@ -34,6 +35,7 @@ type (
Stats struct {
TotalRecordsToSync atomic.Int64 // total record that are required to sync
ReadCount atomic.Int64 // records that got read
RecordsFiltered atomic.Int64 // records that got filtered
Comment thread
vaibhav-datazip marked this conversation as resolved.
ThreadCount atomic.Int64 // total number of writer threads
}

Expand Down Expand Up @@ -83,6 +85,11 @@ func WithThreadID(threadID string) ThreadOptions {
opt.ThreadID = threadID
}
}
// WithApplyFilter returns a ThreadOptions that sets whether the writer
// thread should apply the stream's record filter while flushing data.
func WithApplyFilter(enabled bool) ThreadOptions {
	return func(o *Options) {
		o.ApplyFilter = enabled
	}
}
Comment thread
vishalm0509 marked this conversation as resolved.

func NewWriterPool(ctx context.Context, config *types.WriterConfig, syncStreams []string, batchSize int64) (*WriterPool, error) {
newfunc, found := RegisteredWriters[config.Type]
Expand All @@ -105,6 +112,7 @@ func NewWriterPool(ctx context.Context, config *types.WriterConfig, syncStreams
TotalRecordsToSync: atomic.Int64{},
ThreadCount: atomic.Int64{},
ReadCount: atomic.Int64{},
RecordsFiltered: atomic.Int64{},
},
config: config.WriterConfig,
init: newfunc,
Expand Down Expand Up @@ -230,12 +238,12 @@ func (wt *WriterThread) flush(ctx context.Context, buf []types.RawRecord) (err e
// create flush context
flushCtx, cancel := context.WithCancel(ctx)
defer cancel()

recordsCountBeforeFiltering := len(buf)
evolution, buf, threadSchema, err := wt.writer.FlattenAndCleanData(flushCtx, buf)
if err != nil {
return fmt.Errorf("failed to flatten and clean data: %s", err)
}

wt.stats.RecordsFiltered.Add(int64(recordsCountBeforeFiltering - len(buf)))
Comment thread
vishalm0509 marked this conversation as resolved.
// TODO: after flattening record type raw_record not make sense
if evolution {
wt.streamArtifact.mu.Lock()
Expand Down
2 changes: 1 addition & 1 deletion drivers/abstract/backfill.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ func (a *AbstractDriver) Backfill(mainCtx context.Context, backfilledStreams cha
defer backfillCtxCancel()

threadID := generateThreadID(stream.ID(), fmt.Sprintf("min[%v]-max[%v]", chunk.Min, chunk.Max))
inserter, prevMetadataState, err := pool.NewWriter(backfillCtx, stream, destination.WithBackfill(true), destination.WithThreadID(threadID))
inserter, prevMetadataState, err := pool.NewWriter(backfillCtx, stream, destination.WithBackfill(true), destination.WithThreadID(threadID), destination.WithApplyFilter(slices.Contains(constants.FullRefreshPostReadFilterDrivers, constants.DriverType(a.driver.Type()))))
if err != nil {
return fmt.Errorf("failed to create new writer thread: %s", err)
}
Expand Down
2 changes: 1 addition & 1 deletion drivers/abstract/cdc.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ func (a *AbstractDriver) streamChanges(mainCtx context.Context, pool *destinatio

for _, stream := range streams {
threadID := generateThreadID(stream.ID(), "")
w, writerMeta, createErr := pool.NewWriter(cdcCtx, stream, destination.WithThreadID(threadID))
w, writerMeta, createErr := pool.NewWriter(cdcCtx, stream, destination.WithThreadID(threadID), destination.WithApplyFilter(true))
if createErr != nil {
return fmt.Errorf("failed to create CDC writer for stream %s: %s", stream.ID(), createErr)
}
Expand Down
2 changes: 1 addition & 1 deletion drivers/abstract/incremental.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ func (a *AbstractDriver) Incremental(mainCtx context.Context, pool *destination.
defer incrementalCtxCancel()

threadID := generateThreadID(stream.ID(), fmt.Sprintf("%v_%v", maxPrimaryCursorValue, maxSecondaryCursorValue))
inserter, prevMetadataState, err := pool.NewWriter(incrementalCtx, stream, destination.WithThreadID(threadID))
inserter, prevMetadataState, err := pool.NewWriter(incrementalCtx, stream, destination.WithThreadID(threadID), destination.WithApplyFilter(true))
if err != nil {
return fmt.Errorf("failed to create new writer thread: %s", err)
}
Expand Down
15 changes: 15 additions & 0 deletions drivers/db2/internal/db2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,21 @@ func TestDB2Integration(t *testing.T) {
DestinationDB: "db2_testdb_db2inst1",
CursorField: "COL_CURSOR:COL_TIMESTAMP",
PartitionRegex: "/{id, identity}",
FilterConfig: `{
"logical_operator": "And",
"conditions": [
{
"column": "COL_DOUBLE",
"operator": "<",
"value": 239834.89
},
{
"column": "COL_TIMESTAMP",
"operator": ">=",
"value": "2022-07-01T15:30:00.000+00:00"
}
]
}`,
}
testConfig.TestIntegration(t)
}
45 changes: 45 additions & 0 deletions drivers/db2/internal/db2_test_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,30 @@ func ExecuteQuery(ctx context.Context, t *testing.T, streams []string, operation
VARGRAPHIC('vargraphic_val'),
TRUE
)`, integrationTestTable)
_, err = db.ExecContext(ctx, query)
require.NoError(t, err, "Failed to execute %s operation", operation)
// insert a filtered row — timestamp is before the filter threshold, so it won't be synced
filteredQuery := fmt.Sprintf(`
INSERT INTO %s (
col_cursor, col_bigint, col_char, col_character,
col_varchar, col_date, col_decimal,
col_double, col_real, col_int, col_smallint,
col_clob, col_blob, col_timestamp, col_time,
col_graphic, col_vargraphic, col_bool
) VALUES (
-1, 111111111111111, 'x', 'filtered',
'filtered_val', DATE('2022-06-15'), 50.123,
50.123, 50.0, 0, 0,
CLOB('filtered text'), BLOB(X'00'),
TIMESTAMP('2022-06-15-10.00.00.000000'),
TIME('10.00.00'),
GRAPHIC('filtered'),
VARGRAPHIC('filtered'),
FALSE
)`, integrationTestTable)
_, err = db.ExecContext(ctx, filteredQuery)
require.NoError(t, err, "Failed to insert filtered test data row")
return

case "update":
query = fmt.Sprintf(`
Expand Down Expand Up @@ -149,6 +173,27 @@ func insertTestData(t *testing.T, ctx context.Context, db *sqlx.DB, tableName st
_, err := db.ExecContext(ctx, query)
require.NoError(t, err, "Failed to insert test data")
}
// insert a filtered row — timestamp is before the filter threshold, so it won't be synced
filteredQuery := fmt.Sprintf(`
INSERT INTO %s (
col_cursor, col_bigint, col_char, col_character,
col_varchar, col_date, col_decimal,
col_double, col_real, col_int, col_smallint,
col_clob, col_blob, col_timestamp, col_time,
col_graphic, col_vargraphic, col_bool
) VALUES (
-1, 111111111111111, 'x', 'filtered',
'filtered_val', DATE('2021-06-15'), 500234.123,
500234.123, 500234.0, 0, 0,
CLOB('filtered text'), BLOB(X'00'),
TIMESTAMP('2021-06-15-10.00.00.000000'),
TIME('10.00.00'),
GRAPHIC('filtered'),
VARGRAPHIC('filtered'),
FALSE
)`, tableName)
_, err := db.ExecContext(ctx, filteredQuery)
require.NoError(t, err, "Failed to insert filtered test data row")
}

var ExpectedDB2Data = map[string]interface{}{
Expand Down
101 changes: 69 additions & 32 deletions drivers/mongodb/internal/backfill.go
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ func generateMinObjectID(t time.Time) string {
return objectID.Hex()
}

func buildMongoCondition(cond types.Condition) bson.D {
func buildMongoCondition(isLegacy bool, cond interface{}) bson.D {
opMap := map[string]string{
">": "$gt",
">=": "$gte",
Expand All @@ -398,36 +398,75 @@ func buildMongoCondition(cond types.Condition) bson.D {
"=": "$eq",
"!=": "$ne",
}
//TODO: take val as any type
value := func(field, val string) interface{} {
// Handle unquoted null
if val == "null" {
return nil
}
c := cond.(types.FilterCondition)
// legacy filter condition
if isLegacy {
value := func(field, val string) interface{} {
if val == "null" {
return nil
}

if strings.HasPrefix(val, "\"") && strings.HasSuffix(val, "\"") {
val = val[1 : len(val)-1]
}
if field == "_id" && len(val) == 24 {
if oid, err := primitive.ObjectIDFromHex(val); err == nil {
return oid
if strings.HasPrefix(val, "\"") && strings.HasSuffix(val, "\"") {
val = val[1 : len(val)-1]
}

if field == "_id" && len(val) == 24 {
if oid, err := primitive.ObjectIDFromHex(val); err == nil {
return oid
}
}

if strings.EqualFold(val, "true") || strings.EqualFold(val, "false") {
return strings.EqualFold(val, "true")
}

if timeVal, err := typeutils.ReformatDate(val, false); err == nil {
return timeVal
}

if intVal, err := typeutils.ReformatInt64(val); err == nil {
return intVal
}

if floatVal, err := typeutils.ReformatFloat64(val); err == nil {
return floatVal
}

return val
}(c.Column, c.Value.(string))

return bson.D{
{Key: c.Column, Value: bson.D{
{Key: opMap[c.Operator], Value: value},
}},
}
if strings.ToLower(val) == "true" || strings.ToLower(val) == "false" {
return strings.ToLower(val) == "true"
}
if timeVal, err := typeutils.ReformatDate(val, false); err == nil {
return timeVal
}
if intVal, err := typeutils.ReformatInt64(val); err == nil {
return intVal
}
if floatVal, err := typeutils.ReformatFloat64(val); err == nil {
return floatVal
}

var value interface{}
if v, ok := c.Value.(string); ok {
//For string values, attempt type conversion based on field characteristics
//This handles cases like timestamp strings, ObjectIDs etc.
if c.Column == "_id" && len(v) == 24 {
if oid, err := primitive.ObjectIDFromHex(v); err == nil {
value = oid
} else {
value = v
}
} else if timeVal, err := typeutils.ReformatDate(v, false); err == nil {
value = timeVal
} else {
value = c.Value
}
return val
}(cond.Column, cond.Value)
return bson.D{{Key: cond.Column, Value: bson.D{{Key: opMap[cond.Operator], Value: value}}}}
} else {
// already typed (nil, bool, int, float, etc.)
value = c.Value
}

return bson.D{
{Key: c.Column, Value: bson.D{
{Key: opMap[c.Operator], Value: value},
}},
}
}

// buildFilter generates a BSON document for MongoDB by combining threshold conditions with user-defined filter conditions
Expand All @@ -437,7 +476,7 @@ func (m *Mongo) buildFilter(stream types.StreamInterface) (bson.D, error) {
return nil, fmt.Errorf("failed to create threshold filter: %s", err)
}

filter, err := stream.GetFilter()
filter, isLegacy, err := stream.GetFilter()
if err != nil {
return nil, fmt.Errorf("failed to parse stream filter: %s", err)
}
Expand All @@ -451,11 +490,9 @@ func (m *Mongo) buildFilter(stream types.StreamInterface) (bson.D, error) {
case len(filter.Conditions) == 0:
return utils.Ternary(len(allConditions) == 0, bson.D{}, bson.D{{Key: "$and", Value: allConditions}}).(bson.D), nil
case len(filter.Conditions) == 1:
allConditions = append(allConditions, buildMongoCondition(filter.Conditions[0]))
allConditions = append(allConditions, buildMongoCondition(isLegacy, filter.Conditions[0]))
case len(filter.Conditions) == 2:
allConditions = append(allConditions, bson.D{{Key: "$" + filter.LogicalOperator, Value: bson.A{buildMongoCondition(filter.Conditions[0]), buildMongoCondition(filter.Conditions[1])}}})
default:
return nil, fmt.Errorf("multiple conditions are not supported in filter")
allConditions = append(allConditions, bson.D{{Key: "$" + filter.LogicalOperator, Value: bson.A{buildMongoCondition(isLegacy, filter.Conditions[0]), buildMongoCondition(isLegacy, filter.Conditions[1])}}})
}

return bson.D{{Key: "$and", Value: allConditions}}, nil
Expand Down
Loading
Loading