From 714d9ea7cc0aff7aa3256e1afb629e0a069312b6 Mon Sep 17 00:00:00 2001 From: PK3NZO Date: Sat, 25 Apr 2026 16:03:24 +0330 Subject: [PATCH 1/3] Improve runtime stability and diagnostics --- .gitignore | 4 +- README.md | 85 ++++- client_config.json.example | 18 +- cmd/client/main.go | 65 +++- cmd/purge/main.go | 61 ++++ cmd/server/main.go | 30 +- internal/app/backend.go | 45 +++ internal/config/config.go | 114 +++++++ internal/health/server.go | 47 +++ internal/httpclient/client.go | 8 +- internal/storage/google.go | 303 +++++++++++------ internal/storage/multi.go | 320 +++++++++++++++++ internal/transport/conn.go | 4 +- internal/transport/engine.go | 472 ++++++++++++++++++++++++-- internal/transport/session.go | 49 ++- scripts/build_release.sh | 1 + scripts/collect_client_diagnostics.sh | 148 ++++++++ scripts/flowdriver-server.service | 16 + server_config.json.example | 23 +- 19 files changed, 1633 insertions(+), 180 deletions(-) create mode 100644 cmd/purge/main.go create mode 100644 internal/app/backend.go create mode 100644 internal/health/server.go create mode 100644 internal/storage/multi.go create mode 100755 scripts/collect_client_diagnostics.sh create mode 100644 scripts/flowdriver-server.service diff --git a/.gitignore b/.gitignore index 37b56d4..fd6b113 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,6 @@ bin/ *.exe *.json *.log -*.token \ No newline at end of file +*.token +diagnostics/ +diagnostics-smoke/ diff --git a/README.md b/README.md index cb827fc..a307734 100644 --- a/README.md +++ b/README.md @@ -100,11 +100,68 @@ Create your `config.json` based on the provided examples: { "storage_type": "google", "google_folder_id": "YOUR_FOLDER_ID", - "refresh_rate_ms": 100, - "flush_rate_ms": 300 + "performance_profile": "balanced", + "refresh_rate_ms": 200, + "flush_rate_ms": 300, + "idle_poll_max_ms": 2000, + "idle_poll_step_ms": 500, + "session_idle_timeout_sec": 25, + "cleanup_file_max_age_sec": 60, + "storage_retry_max": 3, + "storage_retry_base_ms": 
300, + "storage_op_timeout_sec": 45, + "max_payload_bytes": 786432, + "max_active_sessions": 0, + "session_wait_timeout_sec": 15, + "backpressure_bytes": 4194304, + "immediate_flush": false, + "metrics_log_sec": 30, + "health_listen_addr": "127.0.0.1:18080", + "google_lanes": [] } ``` +### Runtime Tuning / تنظیمات اجرا + +`performance_profile` can be set to: +- `fast`: lower startup latency, higher Google API usage. +- `balanced`: recommended default for normal browsing and downloads. +- `quota-saver`: lower API usage, higher startup latency. + +You can override any profile value directly: +- `refresh_rate_ms`: how often each side polls for incoming files while active. +- `flush_rate_ms`: how often buffered data is uploaded. +- `idle_poll_max_ms`: server-side maximum polling delay while idle. Lower values reduce first-load delay. +- `session_idle_timeout_sec`: inactive connection timeout. +- `storage_retry_max` and `storage_retry_base_ms`: retry policy for transient Google API failures. +- `storage_op_timeout_sec`: fail-fast timeout for individual Google Drive operations. +- `max_payload_bytes`: maximum per-session payload size written into one transport file. +- `max_active_sessions`: client-side cap for concurrent SOCKS sessions. +- `session_wait_timeout_sec`: how long new SOCKS sessions wait for capacity. +- `backpressure_bytes`: per-session buffer limit before application writes wait. +- `immediate_flush`: uploads new data promptly instead of waiting for the next flush tick. Leave this off for browser/video/download workloads because Google Drive performs better with batched files. +- `metrics_log_sec`: periodic operational metrics log interval. +- `health_listen_addr`: optional local HTTP endpoint for `/healthz` and `/metrics`. 
+ +For higher throughput or resilience, configure multiple Google Drive lanes on both client and server: + +```json +{ + "google_lanes": [ + { + "credentials_path": "credentials.json", + "google_folder_id": "LANE_1_FOLDER_ID" + }, + { + "credentials_path": "credentials-lane2.json", + "google_folder_id": "LANE_2_FOLDER_ID" + } + ] +} +``` + +Each lane can use a separate Google account or folder. Uploads are distributed across healthy lanes, and a lane with transient failures is temporarily avoided. + ### 3. Run / اجرا **Server:** @@ -117,6 +174,30 @@ Create your `config.json` based on the provided examples: ./bin/client -c client_config.json -gc credentials.json ``` +### Run Server as a systemd Service / اجرای سرور به صورت سرویس + +On an Ubuntu VPS, copy `scripts/flowdriver-server.service` to systemd after placing the server files in `/home/ubuntu/flowdriver`: + +```bash +sudo cp scripts/flowdriver-server.service /etc/systemd/system/flowdriver-server.service +sudo systemctl daemon-reload +sudo systemctl enable --now flowdriver-server +sudo systemctl status flowdriver-server +``` + +Logs: + +```bash +journalctl -u flowdriver-server -f +``` + +Health and metrics: + +```bash +curl http://127.0.0.1:18080/healthz +curl http://127.0.0.1:18080/metrics +``` + --- ## Usage & Authentication / نحوه استفاده و احراز هویت diff --git a/client_config.json.example b/client_config.json.example index 32b7476..06656d5 100644 --- a/client_config.json.example +++ b/client_config.json.example @@ -1,12 +1,28 @@ { "listen_addr": "127.0.0.1:1080", "storage_type": "google", + "performance_profile": "balanced", "refresh_rate_ms": 200, "flush_rate_ms": 300, + "idle_poll_max_ms": 2000, + "idle_poll_step_ms": 500, + "session_idle_timeout_sec": 25, + "cleanup_file_max_age_sec": 60, + "storage_retry_max": 3, + "storage_retry_base_ms": 300, + "storage_op_timeout_sec": 45, + "max_payload_bytes": 786432, + "max_active_sessions": 0, + "session_wait_timeout_sec": 15, + "backpressure_bytes": 4194304, + 
"immediate_flush": false, + "metrics_log_sec": 30, + "health_listen_addr": "127.0.0.1:18081", "transport": { "TargetIP": "216.239.38.120:443", "SNI": "google.com", "HostHeader": "www.googleapis.com", "InsecureSkipVerify": false - } + }, + "google_lanes": [] } diff --git a/cmd/client/main.go b/cmd/client/main.go index 7b43fda..b95d4db 100644 --- a/cmd/client/main.go +++ b/cmd/client/main.go @@ -12,10 +12,11 @@ import ( "os" "os/signal" "syscall" + "time" + "github.com/NullLatency/flow-driver/internal/app" "github.com/NullLatency/flow-driver/internal/config" - "github.com/NullLatency/flow-driver/internal/httpclient" - "github.com/NullLatency/flow-driver/internal/storage" + "github.com/NullLatency/flow-driver/internal/health" "github.com/NullLatency/flow-driver/internal/transport" "github.com/things-go/go-socks5" "github.com/things-go/go-socks5/statute" @@ -48,23 +49,18 @@ func main() { if err != nil { log.Fatalf("Failed to load config: %v", err) } + appCfg.ApplyProfile() - var backend storage.Backend - if appCfg.StorageType == "google" { - customHttpClient := httpclient.NewCustomClient(appCfg.Transport) - backend = storage.NewGoogleBackend(customHttpClient, gcPath, appCfg.GoogleFolderID) - } else { - backend, err = storage.NewLocalBackend(appCfg.LocalDir) - if err != nil { - log.Fatalf("Failed to init local storage: %v", err) - } + backend, err := app.BuildBackend(appCfg, gcPath) + if err != nil { + log.Fatalf("Failed to init storage: %v", err) } if err := backend.Login(ctx); err != nil { log.Fatalf("Backend login failed: %v", err) } // AUTOMATION: If folder ID is missing, find or create it - if appCfg.StorageType == "google" && appCfg.GoogleFolderID == "" { + if appCfg.StorageType == "google" && len(appCfg.GoogleLanes) == 0 && appCfg.GoogleFolderID == "" { log.Println("Zero-Config: Searching for existing Google Drive folder 'Flow-Data'...") folderID, err := backend.FindFolder(ctx, "Flow-Data") if err != nil { @@ -100,7 +96,17 @@ func main() { if appCfg.FlushRateMs > 
0 { engine.SetFlushRate(appCfg.FlushRateMs) } + engine.SetIdlePollMax(appCfg.IdlePollMaxMs) + engine.SetIdlePollStep(appCfg.IdlePollStepMs) + engine.SetSessionIdleTimeout(appCfg.SessionIdleTimeoutSec) + engine.SetCleanupFileMaxAge(appCfg.CleanupFileMaxAgeSec) + engine.SetMaxPayloadBytes(appCfg.MaxPayloadBytes) + engine.SetBackpressureBytes(appCfg.BackpressureBytes) + engine.SetStorageOpTimeout(appCfg.StorageOpTimeoutSec) + engine.SetImmediateFlush(appCfg.ImmediateFlush) + engine.SetMetricsLogInterval(appCfg.MetricsLogSec) engine.Start(ctx) + health.Start(ctx, appCfg.HealthListenAddr, engine) listenAddr := appCfg.ListenAddr if listenAddr == "" { @@ -110,6 +116,10 @@ func main() { // Create the library SOCKS5 server wrapping our custom Google Drive Engine tunnel server := socks5.NewServer( socks5.WithDial(func(dc context.Context, network, addr string) (net.Conn, error) { + if err := waitForSessionCapacity(dc, engine, appCfg.MaxActiveSessions, appCfg.SessionWaitTimeoutSec); err != nil { + return nil, err + } + sessionID := generateSessionID() // Intelligently parse the address string to warn users if their browser is natively leaking DNS @@ -158,3 +168,34 @@ func main() { log.Println("Shutting down client...") cancel() } + +func waitForSessionCapacity(ctx context.Context, engine *transport.Engine, maxActive, timeoutSec int) error { + if maxActive <= 0 { + return nil + } + if engine.ActiveSessionCount() < maxActive { + return nil + } + + waitTimeout := 10 * time.Second + if timeoutSec > 0 { + waitTimeout = time.Duration(timeoutSec) * time.Second + } + timer := time.NewTimer(waitTimeout) + defer timer.Stop() + ticker := time.NewTicker(100 * time.Millisecond) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-timer.C: + return fmt.Errorf("too many active sessions: %d/%d", engine.ActiveSessionCount(), maxActive) + case <-ticker.C: + if engine.ActiveSessionCount() < maxActive { + return nil + } + } + } +} diff --git 
a/cmd/purge/main.go b/cmd/purge/main.go new file mode 100644 index 0000000..bfa9e69 --- /dev/null +++ b/cmd/purge/main.go @@ -0,0 +1,61 @@ +package main + +import ( + "context" + "flag" + "log" + "sync" + + "github.com/NullLatency/flow-driver/internal/app" + "github.com/NullLatency/flow-driver/internal/config" +) + +func main() { + var configPath, gcPath string + flag.StringVar(&configPath, "c", "config.json", "Path to config file") + flag.StringVar(&gcPath, "gc", "credentials.json", "Path to Google credentials JSON") + flag.Parse() + + ctx := context.Background() + appCfg, err := config.Load(configPath) + if err != nil { + log.Fatalf("Failed to load config: %v", err) + } + appCfg.ApplyProfile() + + backend, err := app.BuildBackend(appCfg, gcPath) + if err != nil { + log.Fatalf("Failed to init storage: %v", err) + } + if err := backend.Login(ctx); err != nil { + log.Fatalf("Backend login failed: %v", err) + } + + prefixes := []string{"req-", "res-"} + total := 0 + for _, prefix := range prefixes { + files, err := backend.ListQuery(ctx, prefix) + if err != nil { + log.Fatalf("Failed to list %s files: %v", prefix, err) + } + log.Printf("Purging %d files with prefix %s", len(files), prefix) + + var wg sync.WaitGroup + sem := make(chan struct{}, 8) + for _, file := range files { + wg.Add(1) + sem <- struct{}{} + go func(file string) { + defer wg.Done() + defer func() { <-sem }() + if err := backend.Delete(ctx, file); err != nil { + log.Printf("delete failed %s: %v", file, err) + return + } + }(file) + } + wg.Wait() + total += len(files) + } + log.Printf("Purge finished. 
Requested delete for %d files.", total) +} diff --git a/cmd/server/main.go b/cmd/server/main.go index 8754174..96b131d 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -10,9 +10,9 @@ import ( "os/signal" "syscall" + "github.com/NullLatency/flow-driver/internal/app" "github.com/NullLatency/flow-driver/internal/config" - "github.com/NullLatency/flow-driver/internal/httpclient" - "github.com/NullLatency/flow-driver/internal/storage" + "github.com/NullLatency/flow-driver/internal/health" "github.com/NullLatency/flow-driver/internal/transport" ) @@ -30,23 +30,18 @@ func main() { if err != nil { log.Fatalf("Failed to load config: %v", err) } + appCfg.ApplyProfile() - var backend storage.Backend - if appCfg.StorageType == "google" { - customHttpClient := httpclient.NewCustomClient(appCfg.Transport) - backend = storage.NewGoogleBackend(customHttpClient, gcPath, appCfg.GoogleFolderID) - } else { - backend, err = storage.NewLocalBackend(appCfg.LocalDir) - if err != nil { - log.Fatalf("Failed to init local storage: %v", err) - } + backend, err := app.BuildBackend(appCfg, gcPath) + if err != nil { + log.Fatalf("Failed to init storage: %v", err) } if err := backend.Login(ctx); err != nil { log.Fatalf("Backend login failed: %v", err) } // AUTOMATION: If folder ID is missing, find or create it - if appCfg.StorageType == "google" && appCfg.GoogleFolderID == "" { + if appCfg.StorageType == "google" && len(appCfg.GoogleLanes) == 0 && appCfg.GoogleFolderID == "" { log.Println("Zero-Config: Searching for existing Google Drive folder 'Flow-Data'...") folderID, err := backend.FindFolder(ctx, "Flow-Data") if err != nil { @@ -78,6 +73,15 @@ func main() { if appCfg.FlushRateMs > 0 { engine.SetFlushRate(appCfg.FlushRateMs) } + engine.SetIdlePollMax(appCfg.IdlePollMaxMs) + engine.SetIdlePollStep(appCfg.IdlePollStepMs) + engine.SetSessionIdleTimeout(appCfg.SessionIdleTimeoutSec) + engine.SetCleanupFileMaxAge(appCfg.CleanupFileMaxAgeSec) + 
engine.SetMaxPayloadBytes(appCfg.MaxPayloadBytes) + engine.SetBackpressureBytes(appCfg.BackpressureBytes) + engine.SetStorageOpTimeout(appCfg.StorageOpTimeoutSec) + engine.SetImmediateFlush(appCfg.ImmediateFlush) + engine.SetMetricsLogInterval(appCfg.MetricsLogSec) // Called by polling loop when a new incoming session file is found engine.OnNewSession = func(sessionID, targetAddr string, session *transport.Session) { @@ -86,6 +90,7 @@ func main() { } engine.Start(ctx) + health.Start(ctx, appCfg.HealthListenAddr, engine) sigCh := make(chan os.Signal, 1) signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) @@ -114,6 +119,7 @@ func handleServerConn(sessionID, targetAddr string, session *transport.Session, n, err := conn.Read(buf) if n > 0 { session.EnqueueTx(buf[:n]) + engine.RequestFlush() } if err != nil { errCh <- err diff --git a/internal/app/backend.go b/internal/app/backend.go new file mode 100644 index 0000000..6a56441 --- /dev/null +++ b/internal/app/backend.go @@ -0,0 +1,45 @@ +package app + +import ( + "fmt" + + "github.com/NullLatency/flow-driver/internal/config" + "github.com/NullLatency/flow-driver/internal/httpclient" + "github.com/NullLatency/flow-driver/internal/storage" +) + +func BuildBackend(appCfg *config.AppConfig, gcPath string) (storage.Backend, error) { + if appCfg.StorageType != "google" { + return storage.NewLocalBackend(appCfg.LocalDir) + } + + if len(appCfg.GoogleLanes) == 0 { + customHTTPClient := httpclient.NewCustomClient(appCfg.Transport) + googleBackend := storage.NewGoogleBackend(customHTTPClient, gcPath, appCfg.GoogleFolderID) + googleBackend.SetRetryPolicy(appCfg.StorageRetryMax, appCfg.StorageRetryBaseMs) + return googleBackend, nil + } + + backends := make([]storage.Backend, 0, len(appCfg.GoogleLanes)) + for idx, lane := range appCfg.GoogleLanes { + laneCredentials := lane.CredentialsPath + if laneCredentials == "" { + laneCredentials = gcPath + } + laneFolderID := lane.GoogleFolderID + if laneFolderID == "" { + laneFolderID = 
appCfg.GoogleFolderID + } + if laneFolderID == "" { + return nil, fmt.Errorf("google_lanes[%d] is missing google_folder_id", idx) + } + laneTransport := lane.Transport + if laneTransport.TargetIP == "" && laneTransport.SNI == "" && laneTransport.HostHeader == "" { + laneTransport = appCfg.Transport + } + googleBackend := storage.NewGoogleBackend(httpclient.NewCustomClient(laneTransport), laneCredentials, laneFolderID) + googleBackend.SetRetryPolicy(appCfg.StorageRetryMax, appCfg.StorageRetryBaseMs) + backends = append(backends, googleBackend) + } + return storage.NewMultiBackend(backends) +} diff --git a/internal/config/config.go b/internal/config/config.go index ddcab8b..6660201 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -10,6 +10,10 @@ import ( // AppConfig defines the application-level overarching configuration. type AppConfig struct { + // PerformanceProfile applies conservative runtime defaults. + // Valid values: "fast", "balanced", "quota-saver". + PerformanceProfile string `json:"performance_profile,omitempty"` + // ListenAddr is the SOCKS5 listening address for the client. E.g., "127.0.0.1:1080" ListenAddr string `json:"listen_addr,omitempty"` @@ -25,16 +29,126 @@ type AppConfig struct { // GoogleFolderID is the Drive Folder ID when StorageType is "google". GoogleFolderID string `json:"google_folder_id,omitempty"` + // GoogleLanes optionally configures multiple Google Drive lanes. + GoogleLanes []GoogleLaneConfig `json:"google_lanes,omitempty"` + // RefreshRateMs is the polling (RX) interval in milliseconds for the engine. RefreshRateMs int `json:"refresh_rate_ms,omitempty"` // FlushRateMs is the gathering (TX) interval in milliseconds for the engine. FlushRateMs int `json:"flush_rate_ms,omitempty"` + // IdlePollMaxMs caps the server-side adaptive polling delay when no sessions are active. + IdlePollMaxMs int `json:"idle_poll_max_ms,omitempty"` + + // IdlePollStepMs is the backoff step for server-side adaptive polling. 
+ IdlePollStepMs int `json:"idle_poll_step_ms,omitempty"` + + // SessionIdleTimeoutSec closes inactive sessions after this many seconds. + SessionIdleTimeoutSec int `json:"session_idle_timeout_sec,omitempty"` + + // CleanupFileMaxAgeSec deletes stale transport files after this many seconds. + CleanupFileMaxAgeSec int `json:"cleanup_file_max_age_sec,omitempty"` + + // StorageRetryMax is the maximum number of retry attempts for transient storage errors. + StorageRetryMax int `json:"storage_retry_max,omitempty"` + + // StorageRetryBaseMs is the initial retry backoff delay in milliseconds. + StorageRetryBaseMs int `json:"storage_retry_base_ms,omitempty"` + + // StorageOpTimeoutSec caps each storage operation before it is treated as failed. + StorageOpTimeoutSec int `json:"storage_op_timeout_sec,omitempty"` + + // MaxPayloadBytes caps each session payload written into a single transport file. + MaxPayloadBytes int `json:"max_payload_bytes,omitempty"` + + // MaxActiveSessions caps concurrent SOCKS sessions on the client side. + MaxActiveSessions int `json:"max_active_sessions,omitempty"` + + // SessionWaitTimeoutSec is how long a new SOCKS connection waits for capacity. + SessionWaitTimeoutSec int `json:"session_wait_timeout_sec,omitempty"` + + // BackpressureBytes blocks application writes when a session buffer grows past this size. + BackpressureBytes int `json:"backpressure_bytes,omitempty"` + + // ImmediateFlush uploads new data promptly instead of waiting for the next flush tick. + // Keep this disabled for browser workloads; Google Drive performs better with batching. + ImmediateFlush bool `json:"immediate_flush,omitempty"` + + // MetricsLogSec logs throughput and storage operation counters every N seconds. + MetricsLogSec int `json:"metrics_log_sec,omitempty"` + + // HealthListenAddr exposes local health and metrics HTTP endpoints when set. + HealthListenAddr string `json:"health_listen_addr,omitempty"` + // Transport configures the dpi-evasion layer. 
Transport httpclient.TransportConfig `json:"transport,omitempty"` } +type GoogleLaneConfig struct { + CredentialsPath string `json:"credentials_path,omitempty"` + GoogleFolderID string `json:"google_folder_id,omitempty"` + Transport httpclient.TransportConfig `json:"transport,omitempty"` +} + +// ApplyProfile fills unset tuning fields from a named profile while preserving +// explicitly configured values. +func (c *AppConfig) ApplyProfile() { + switch c.PerformanceProfile { + case "fast": + setDefault(&c.RefreshRateMs, 100) + setDefault(&c.FlushRateMs, 100) + setDefault(&c.IdlePollMaxMs, 1000) + setDefault(&c.IdlePollStepMs, 200) + setDefault(&c.SessionIdleTimeoutSec, 60) + setDefault(&c.CleanupFileMaxAgeSec, 45) + setDefault(&c.StorageRetryMax, 4) + setDefault(&c.StorageRetryBaseMs, 200) + setDefault(&c.StorageOpTimeoutSec, 45) + setDefault(&c.MaxPayloadBytes, 512*1024) + setDefault(&c.MaxActiveSessions, 24) + setDefault(&c.SessionWaitTimeoutSec, 12) + setDefault(&c.BackpressureBytes, 4*1024*1024) + setDefault(&c.MetricsLogSec, 30) + case "quota-saver": + setDefault(&c.RefreshRateMs, 750) + setDefault(&c.FlushRateMs, 500) + setDefault(&c.IdlePollMaxMs, 5000) + setDefault(&c.IdlePollStepMs, 750) + setDefault(&c.SessionIdleTimeoutSec, 45) + setDefault(&c.CleanupFileMaxAgeSec, 30) + setDefault(&c.StorageRetryMax, 3) + setDefault(&c.StorageRetryBaseMs, 500) + setDefault(&c.StorageOpTimeoutSec, 45) + setDefault(&c.MaxPayloadBytes, 1024*1024) + setDefault(&c.MaxActiveSessions, 16) + setDefault(&c.SessionWaitTimeoutSec, 10) + setDefault(&c.BackpressureBytes, 4*1024*1024) + setDefault(&c.MetricsLogSec, 60) + default: + setDefault(&c.RefreshRateMs, 200) + setDefault(&c.FlushRateMs, 300) + setDefault(&c.IdlePollMaxMs, 2000) + setDefault(&c.IdlePollStepMs, 500) + setDefault(&c.SessionIdleTimeoutSec, 60) + setDefault(&c.CleanupFileMaxAgeSec, 30) + setDefault(&c.StorageRetryMax, 3) + setDefault(&c.StorageRetryBaseMs, 300) + setDefault(&c.StorageOpTimeoutSec, 45) + 
setDefault(&c.MaxPayloadBytes, 768*1024) + setDefault(&c.MaxActiveSessions, 24) + setDefault(&c.SessionWaitTimeoutSec, 12) + setDefault(&c.BackpressureBytes, 4*1024*1024) + setDefault(&c.MetricsLogSec, 30) + } +} + +func setDefault(target *int, value int) { + if *target <= 0 { + *target = value + } +} + // Save writes the config back to a JSON file. func (c *AppConfig) Save(path string) error { b, err := json.MarshalIndent(c, "", " ") diff --git a/internal/health/server.go b/internal/health/server.go new file mode 100644 index 0000000..dbab03e --- /dev/null +++ b/internal/health/server.go @@ -0,0 +1,47 @@ +package health + +import ( + "context" + "encoding/json" + "log" + "net/http" + "time" + + "github.com/NullLatency/flow-driver/internal/transport" +) + +func Start(ctx context.Context, addr string, engine *transport.Engine) { + if addr == "" { + return + } + + mux := http.NewServeMux() + mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(map[string]string{"status": "ok"}) + }) + mux.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(engine.Snapshot()) + }) + + server := &http.Server{ + Addr: addr, + Handler: mux, + ReadHeaderTimeout: 5 * time.Second, + } + + go func() { + <-ctx.Done() + shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _ = server.Shutdown(shutdownCtx) + }() + + go func() { + log.Printf("Health endpoint listening on %s", addr) + if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed { + log.Printf("health endpoint failed: %v", err) + } + }() +} diff --git a/internal/httpclient/client.go b/internal/httpclient/client.go index 582ef55..78a5aef 100644 --- a/internal/httpclient/client.go +++ b/internal/httpclient/client.go @@ -44,7 +44,7 @@ func (t *hostRewriteTransport) 
RoundTrip(req *http.Request) (*http.Response, err // and manipulate TLS/HTTP headers as specified in the config. func NewCustomClient(cfg TransportConfig) *http.Client { dialer := &net.Dialer{ - Timeout: 30 * time.Second, + Timeout: 15 * time.Second, KeepAlive: 30 * time.Second, } @@ -62,8 +62,10 @@ func NewCustomClient(cfg TransportConfig) *http.Client { }, ForceAttemptHTTP2: true, MaxIdleConns: 100, + MaxIdleConnsPerHost: 20, + ResponseHeaderTimeout: 30 * time.Second, IdleConnTimeout: 90 * time.Second, - TLSHandshakeTimeout: 10 * time.Second, + TLSHandshakeTimeout: 15 * time.Second, ExpectContinueTimeout: 1 * time.Second, } @@ -77,6 +79,6 @@ func NewCustomClient(cfg TransportConfig) *http.Client { return &http.Client{ Transport: rt, - Timeout: 60 * time.Second, + Timeout: 45 * time.Second, } } diff --git a/internal/storage/google.go b/internal/storage/google.go index 3a02794..cd4066c 100644 --- a/internal/storage/google.go +++ b/internal/storage/google.go @@ -51,15 +51,30 @@ type GoogleBackend struct { // map from filename to Google Drive file ID fileIDs map[string]string fileIdsMu sync.RWMutex + + retryMax int + retryBaseMs int } // NewGoogleBackend creates a new GoogleBackend. func NewGoogleBackend(client *http.Client, saPath, folderID string) *GoogleBackend { return &GoogleBackend{ - httpClient: client, - saPath: saPath, - folderID: folderID, - fileIDs: make(map[string]string), + httpClient: client, + saPath: saPath, + folderID: folderID, + fileIDs: make(map[string]string), + retryMax: 3, + retryBaseMs: 300, + } +} + +// SetRetryPolicy configures retries for transient Google API failures. 
+func (b *GoogleBackend) SetRetryPolicy(maxAttempts, baseDelayMs int) { + if maxAttempts > 0 { + b.retryMax = maxAttempts + } + if baseDelayMs > 0 { + b.retryBaseMs = baseDelayMs } } @@ -171,13 +186,15 @@ func (b *GoogleBackend) refreshAccessToken(ctx context.Context) error { } func (b *GoogleBackend) executeTokenRequest(ctx context.Context, v url.Values) error { - req, err := http.NewRequestWithContext(ctx, "POST", b.tokenURI, strings.NewReader(v.Encode())) - if err != nil { - return err - } - req.Header.Set("Content-Type", "application/x-www-form-urlencoded") - - resp, err := b.httpClient.Do(req) + body := v.Encode() + resp, err := b.doRetry(ctx, "token", func() (*http.Response, error) { + req, err := http.NewRequestWithContext(ctx, "POST", b.tokenURI, strings.NewReader(body)) + if err != nil { + return nil, err + } + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + return b.httpClient.Do(req) + }) if err != nil { return fmt.Errorf("token request failed: %w", err) } @@ -223,40 +240,51 @@ func (b *GoogleBackend) Upload(ctx context.Context, filename string, data io.Rea return err } - pr, pw := io.Pipe() - metaWriter := multipart.NewWriter(pw) - - go func() { - defer pw.Close() - defer metaWriter.Close() + payload, err := io.ReadAll(data) + if err != nil { + return err + } + resp, err := b.doRetry(ctx, "upload", func() (*http.Response, error) { + var body bytes.Buffer + metaWriter := multipart.NewWriter(&body) - // Part 1: Metadata h := make(textproto.MIMEHeader) h.Set("Content-Type", "application/json; charset=UTF-8") - part1, _ := metaWriter.CreatePart(h) + part1, err := metaWriter.CreatePart(h) + if err != nil { + return nil, err + } meta := map[string]interface{}{ "name": filename, } if b.folderID != "" { meta["parents"] = []string{b.folderID} } - json.NewEncoder(part1).Encode(meta) + if err := json.NewEncoder(part1).Encode(meta); err != nil { + return nil, err + } - // Part 2: Content h = make(textproto.MIMEHeader) h.Set("Content-Type", 
"application/octet-stream") - part2, _ := metaWriter.CreatePart(h) - io.Copy(part2, data) - }() - - req, err := http.NewRequestWithContext(ctx, "POST", "https://www.googleapis.com/upload/drive/v3/files?uploadType=multipart", pr) - if err != nil { - return err - } - req.Header.Set("Authorization", "Bearer "+tok) - req.Header.Set("Content-Type", metaWriter.FormDataContentType()) + part2, err := metaWriter.CreatePart(h) + if err != nil { + return nil, err + } + if _, err := part2.Write(payload); err != nil { + return nil, err + } + if err := metaWriter.Close(); err != nil { + return nil, err + } - resp, err := b.httpClient.Do(req) + req, err := http.NewRequestWithContext(ctx, "POST", "https://www.googleapis.com/upload/drive/v3/files?uploadType=multipart", bytes.NewReader(body.Bytes())) + if err != nil { + return nil, err + } + req.Header.Set("Authorization", "Bearer "+tok) + req.Header.Set("Content-Type", metaWriter.FormDataContentType()) + return b.httpClient.Do(req) + }) if err != nil { return err } @@ -275,59 +303,75 @@ func (b *GoogleBackend) ListQuery(ctx context.Context, prefix string) ([]string, return nil, err } - q := fmt.Sprintf("name contains '%s'", prefix) + q := fmt.Sprintf("name contains '%s' and trashed = false", escapeDriveQuery(prefix)) if b.folderID != "" { q += fmt.Sprintf(" and '%s' in parents", b.folderID) } - u, _ := url.Parse("https://www.googleapis.com/drive/v3/files") - v := u.Query() - v.Set("q", q) - v.Set("fields", "files(id, name)") - u.RawQuery = v.Encode() - - req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil) - if err != nil { - return nil, err - } - req.Header.Set("Authorization", "Bearer "+tok) + var names []string + pageToken := "" + for { + u, _ := url.Parse("https://www.googleapis.com/drive/v3/files") + v := u.Query() + v.Set("q", q) + v.Set("fields", "nextPageToken, files(id, name)") + v.Set("pageSize", "1000") + v.Set("spaces", "drive") + if pageToken != "" { + v.Set("pageToken", pageToken) + } + u.RawQuery = 
v.Encode() - resp, err := b.httpClient.Do(req) - if err != nil { - return nil, err - } - defer resp.Body.Close() + resp, err := b.doRetry(ctx, "list", func() (*http.Response, error) { + req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil) + if err != nil { + return nil, err + } + req.Header.Set("Authorization", "Bearer "+tok) + return b.httpClient.Do(req) + }) + if err != nil { + return nil, err + } - if resp.StatusCode != http.StatusOK { - body, _ := io.ReadAll(resp.Body) - return nil, fmt.Errorf("list returned %d: %s", resp.StatusCode, string(body)) - } + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + return nil, fmt.Errorf("list returned %d: %s", resp.StatusCode, string(body)) + } - var resData struct { - Files []struct { - ID string `json:"id"` - Name string `json:"name"` - } `json:"files"` - } - if err := json.NewDecoder(resp.Body).Decode(&resData); err != nil { - return nil, err - } + var resData struct { + NextPageToken string `json:"nextPageToken"` + Files []struct { + ID string `json:"id"` + Name string `json:"name"` + } `json:"files"` + } + if err := json.NewDecoder(resp.Body).Decode(&resData); err != nil { + resp.Body.Close() + return nil, err + } + resp.Body.Close() - b.fileIdsMu.Lock() - // SAFETY: Prevent fileIDs map from infinite growth - if len(b.fileIDs) > 2000 { - b.fileIDs = make(map[string]string) - } + b.fileIdsMu.Lock() + // SAFETY: Prevent fileIDs map from infinite growth + if len(b.fileIDs) > 5000 { + b.fileIDs = make(map[string]string) + } + for _, f := range resData.Files { + // Only collect exact prefix matches client-side just in case + if strings.HasPrefix(f.Name, prefix) { + b.fileIDs[f.Name] = f.ID + names = append(names, f.Name) + } + } + b.fileIdsMu.Unlock() - var names []string - for _, f := range resData.Files { - // Only collect exact prefix matches client-side just in case - if strings.HasPrefix(f.Name, prefix) { - b.fileIDs[f.Name] = f.ID - names = append(names, 
f.Name) + if resData.NextPageToken == "" { + break } + pageToken = resData.NextPageToken } - b.fileIdsMu.Unlock() return names, nil } @@ -346,13 +390,14 @@ func (b *GoogleBackend) Download(ctx context.Context, filename string) (io.ReadC return nil, err } - req, err := http.NewRequestWithContext(ctx, "GET", "https://www.googleapis.com/drive/v3/files/"+fileID+"?alt=media", nil) - if err != nil { - return nil, err - } - req.Header.Set("Authorization", "Bearer "+tok) - - resp, err := b.httpClient.Do(req) + resp, err := b.doRetry(ctx, "download", func() (*http.Response, error) { + req, err := http.NewRequestWithContext(ctx, "GET", "https://www.googleapis.com/drive/v3/files/"+fileID+"?alt=media", nil) + if err != nil { + return nil, err + } + req.Header.Set("Authorization", "Bearer "+tok) + return b.httpClient.Do(req) + }) if err != nil { return nil, err } @@ -380,13 +425,14 @@ func (b *GoogleBackend) Delete(ctx context.Context, filename string) error { return err } - req, err := http.NewRequestWithContext(ctx, "DELETE", "https://www.googleapis.com/drive/v3/files/"+fileID, nil) - if err != nil { - return err - } - req.Header.Set("Authorization", "Bearer "+tok) - - resp, err := b.httpClient.Do(req) + resp, err := b.doRetry(ctx, "delete", func() (*http.Response, error) { + req, err := http.NewRequestWithContext(ctx, "DELETE", "https://www.googleapis.com/drive/v3/files/"+fileID, nil) + if err != nil { + return nil, err + } + req.Header.Set("Authorization", "Bearer "+tok) + return b.httpClient.Do(req) + }) if err != nil { return err } @@ -416,14 +462,15 @@ func (b *GoogleBackend) CreateFolder(ctx context.Context, name string) (string, } body, _ := json.Marshal(meta) - req, err := http.NewRequestWithContext(ctx, "POST", "https://www.googleapis.com/drive/v3/files", bytes.NewReader(body)) - if err != nil { - return "", err - } - req.Header.Set("Authorization", "Bearer "+tok) - req.Header.Set("Content-Type", "application/json") - - resp, err := b.httpClient.Do(req) + resp, err 
:= b.doRetry(ctx, "create folder", func() (*http.Response, error) { + req, err := http.NewRequestWithContext(ctx, "POST", "https://www.googleapis.com/drive/v3/files", bytes.NewReader(body)) + if err != nil { + return nil, err + } + req.Header.Set("Authorization", "Bearer "+tok) + req.Header.Set("Content-Type", "application/json") + return b.httpClient.Do(req) + }) if err != nil { return "", err } @@ -458,13 +505,14 @@ func (b *GoogleBackend) FindFolder(ctx context.Context, name string) (string, er v.Set("fields", "files(id, name)") u.RawQuery = v.Encode() - req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil) - if err != nil { - return "", err - } - req.Header.Set("Authorization", "Bearer "+tok) - - resp, err := b.httpClient.Do(req) + resp, err := b.doRetry(ctx, "find folder", func() (*http.Response, error) { + req, err := http.NewRequestWithContext(ctx, "GET", u.String(), nil) + if err != nil { + return nil, err + } + req.Header.Set("Authorization", "Bearer "+tok) + return b.httpClient.Do(req) + }) if err != nil { return "", err } @@ -491,3 +539,52 @@ func (b *GoogleBackend) FindFolder(ctx context.Context, name string) (string, er } return "", nil } + +func (b *GoogleBackend) doRetry(ctx context.Context, operation string, fn func() (*http.Response, error)) (*http.Response, error) { + attempts := b.retryMax + if attempts <= 0 { + attempts = 1 + } + baseDelay := time.Duration(b.retryBaseMs) * time.Millisecond + if baseDelay <= 0 { + baseDelay = 300 * time.Millisecond + } + + var lastErr error + for attempt := 1; attempt <= attempts; attempt++ { + resp, err := fn() + if err == nil && resp != nil && !shouldRetryStatus(resp.StatusCode) { + return resp, nil + } + + if resp != nil { + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + resp.Body.Close() + if err == nil { + lastErr = fmt.Errorf("%s returned %d: %s", operation, resp.StatusCode, strings.TrimSpace(string(body))) + } + } + if err != nil { + lastErr = err + } + if attempt == attempts { + 
break + } + + delay := baseDelay << (attempt - 1) + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(delay): + } + } + return nil, lastErr +} + +func shouldRetryStatus(status int) bool { + return status == http.StatusTooManyRequests || status == http.StatusRequestTimeout || status >= 500 +} + +func escapeDriveQuery(value string) string { + return strings.ReplaceAll(value, "'", "\\'") +} diff --git a/internal/storage/multi.go b/internal/storage/multi.go new file mode 100644 index 0000000..7c4a84e --- /dev/null +++ b/internal/storage/multi.go @@ -0,0 +1,320 @@ +package storage + +import ( + "bytes" + "context" + "fmt" + "hash/fnv" + "io" + "sync" + "time" +) + +type MultiBackend struct { + lanes []*storageLane + + fileLane map[string]int + fileLaneMu sync.RWMutex +} + +type storageLane struct { + name string + backend Backend + unhealthyUntil time.Time + mu sync.RWMutex +} + +func NewMultiBackend(backends []Backend) (*MultiBackend, error) { + if len(backends) == 0 { + return nil, fmt.Errorf("multi backend requires at least one lane") + } + + lanes := make([]*storageLane, 0, len(backends)) + for i, backend := range backends { + lanes = append(lanes, &storageLane{ + name: fmt.Sprintf("lane-%d", i), + backend: backend, + }) + } + return &MultiBackend{ + lanes: lanes, + fileLane: make(map[string]int), + }, nil +} + +func (b *MultiBackend) Login(ctx context.Context) error { + var firstErr error + for _, lane := range b.lanes { + if err := lane.backend.Login(ctx); err != nil { + lane.markUnhealthy() + if firstErr == nil { + firstErr = fmt.Errorf("%s login failed: %w", lane.name, err) + } + continue + } + lane.markHealthy() + } + if b.hasHealthyLane() { + return nil + } + return firstErr +} + +func (b *MultiBackend) Upload(ctx context.Context, filename string, data io.Reader) error { + payload, err := io.ReadAll(data) + if err != nil { + return err + } + + start := b.pickLane(filename) + var firstErr error + for offset := 0; offset < len(b.lanes); 
offset++ { + idx := (start + offset) % len(b.lanes) + lane := b.lanes[idx] + if !lane.healthy() && offset < len(b.lanes)-1 { + continue + } + if err := lane.backend.Upload(ctx, filename, bytesReader(payload)); err != nil { + lane.markUnhealthy() + if firstErr == nil { + firstErr = fmt.Errorf("%s upload failed: %w", lane.name, err) + } + continue + } + lane.markHealthy() + b.setFileLane(filename, idx) + return nil + } + return firstErr +} + +func (b *MultiBackend) ListQuery(ctx context.Context, prefix string) ([]string, error) { + type result struct { + idx int + names []string + err error + } + + ch := make(chan result, len(b.lanes)) + launched := 0 + allowUnhealthy := !b.hasHealthyLane() + for idx, lane := range b.lanes { + if !allowUnhealthy && !lane.healthy() { + continue + } + launched++ + go func(idx int, lane *storageLane) { + names, err := lane.backend.ListQuery(ctx, prefix) + ch <- result{idx: idx, names: names, err: err} + }(idx, lane) + } + + var names []string + var firstErr error + for range launched { + res := <-ch + lane := b.lanes[res.idx] + if res.err != nil { + lane.markUnhealthy() + if firstErr == nil { + firstErr = fmt.Errorf("%s list failed: %w", lane.name, res.err) + } + continue + } + lane.markHealthy() + for _, name := range res.names { + b.setFileLane(name, res.idx) + names = append(names, name) + } + } + if len(names) == 0 && firstErr != nil && !b.hasHealthyLane() { + return nil, firstErr + } + return names, nil +} + +func (b *MultiBackend) Download(ctx context.Context, filename string) (io.ReadCloser, error) { + return b.readFromMappedLane(ctx, filename, func(lane Backend) (io.ReadCloser, error) { + return lane.Download(ctx, filename) + }) +} + +func (b *MultiBackend) Delete(ctx context.Context, filename string) error { + mappedIdx, ok := b.getFileLane(filename) + if ok { + if err := b.lanes[mappedIdx].backend.Delete(ctx, filename); err == nil { + b.deleteFileLane(filename) + return nil + } + b.lanes[mappedIdx].markUnhealthy() + } + + var 
firstErr error + for idx, lane := range b.lanes { + if ok && idx == mappedIdx { + continue + } + if err := lane.backend.Delete(ctx, filename); err != nil { + lane.markUnhealthy() + if firstErr == nil { + firstErr = fmt.Errorf("%s delete failed: %w", lane.name, err) + } + continue + } + lane.markHealthy() + b.deleteFileLane(filename) + return nil + } + return firstErr +} + +func (b *MultiBackend) CreateFolder(ctx context.Context, name string) (string, error) { + var firstID string + var firstErr error + for _, lane := range b.lanes { + id, err := lane.backend.CreateFolder(ctx, name) + if err != nil { + lane.markUnhealthy() + if firstErr == nil { + firstErr = fmt.Errorf("%s create folder failed: %w", lane.name, err) + } + continue + } + lane.markHealthy() + if firstID == "" { + firstID = id + } + } + if firstID != "" { + return firstID, nil + } + return "", firstErr +} + +func (b *MultiBackend) FindFolder(ctx context.Context, name string) (string, error) { + var firstID string + var firstErr error + for _, lane := range b.lanes { + id, err := lane.backend.FindFolder(ctx, name) + if err != nil { + lane.markUnhealthy() + if firstErr == nil { + firstErr = fmt.Errorf("%s find folder failed: %w", lane.name, err) + } + continue + } + lane.markHealthy() + if firstID == "" { + firstID = id + } + } + if firstID != "" || firstErr == nil { + return firstID, nil + } + return "", firstErr +} + +func (b *MultiBackend) readFromMappedLane(ctx context.Context, filename string, fn func(Backend) (io.ReadCloser, error)) (io.ReadCloser, error) { + if idx, ok := b.getFileLane(filename); ok { + rc, err := fn(b.lanes[idx].backend) + if err == nil { + b.lanes[idx].markHealthy() + return rc, nil + } + b.lanes[idx].markUnhealthy() + } + + var firstErr error + for idx, lane := range b.lanes { + rc, err := fn(lane.backend) + if err != nil { + lane.markUnhealthy() + if firstErr == nil { + firstErr = fmt.Errorf("%s read failed: %w", lane.name, err) + } + continue + } + lane.markHealthy() + 
b.setFileLane(filename, idx) + return rc, nil + } + return nil, firstErr +} + +func (b *MultiBackend) pickLane(filename string) int { + healthy := b.healthyLaneIndexes() + if len(healthy) == 0 { + return int(hashString(filename) % uint32(len(b.lanes))) + } + return healthy[int(hashString(filename)%uint32(len(healthy)))] +} + +func (b *MultiBackend) healthyLaneIndexes() []int { + var healthy []int + for idx, lane := range b.lanes { + if lane.healthy() { + healthy = append(healthy, idx) + } + } + return healthy +} + +func (b *MultiBackend) hasHealthyLane() bool { + for _, lane := range b.lanes { + if lane.healthy() { + return true + } + } + return false +} + +func (b *MultiBackend) setFileLane(filename string, idx int) { + b.fileLaneMu.Lock() + if len(b.fileLane) > 10000 { + b.fileLane = make(map[string]int) + } + b.fileLane[filename] = idx + b.fileLaneMu.Unlock() +} + +func (b *MultiBackend) getFileLane(filename string) (int, bool) { + b.fileLaneMu.RLock() + idx, ok := b.fileLane[filename] + b.fileLaneMu.RUnlock() + return idx, ok +} + +func (b *MultiBackend) deleteFileLane(filename string) { + b.fileLaneMu.Lock() + delete(b.fileLane, filename) + b.fileLaneMu.Unlock() +} + +func (l *storageLane) healthy() bool { + l.mu.RLock() + unhealthyUntil := l.unhealthyUntil + l.mu.RUnlock() + return time.Now().After(unhealthyUntil) +} + +func (l *storageLane) markUnhealthy() { + l.mu.Lock() + l.unhealthyUntil = time.Now().Add(30 * time.Second) + l.mu.Unlock() +} + +func (l *storageLane) markHealthy() { + l.mu.Lock() + l.unhealthyUntil = time.Time{} + l.mu.Unlock() +} + +func hashString(value string) uint32 { + h := fnv.New32a() + _, _ = h.Write([]byte(value)) + return h.Sum32() +} + +func bytesReader(payload []byte) io.Reader { + return bytes.NewReader(payload) +} diff --git a/internal/transport/conn.go b/internal/transport/conn.go index 844f6b3..39df5b5 100644 --- a/internal/transport/conn.go +++ b/internal/transport/conn.go @@ -65,6 +65,7 @@ func (v *VirtualConn) Read(b 
[]byte) (n int, err error) { func (v *VirtualConn) Write(b []byte) (n int, err error) { if len(b) > 0 { v.session.EnqueueTx(b) + v.engine.RequestFlush() } return len(b), nil } @@ -74,7 +75,8 @@ func (v *VirtualConn) Close() error { v.session.closed = true v.session.txCond.Broadcast() // Wake up any writers blocked on backpressure v.session.mu.Unlock() - + v.engine.RequestFlush() + // A closed connection no longer accepts writes efficiently // Next periodic engine flush will securely remove context return nil diff --git a/internal/transport/engine.go b/internal/transport/engine.go index 28ce69f..6c3b0f8 100644 --- a/internal/transport/engine.go +++ b/internal/transport/engine.go @@ -8,6 +8,7 @@ import ( "strconv" "strings" "sync" + "sync/atomic" "time" "github.com/NullLatency/flow-driver/internal/storage" @@ -28,8 +29,17 @@ type Engine struct { closedSessions map[string]time.Time closedSessionsMu sync.Mutex - pollTicker time.Duration - flushTicker time.Duration + pollTicker time.Duration + flushTicker time.Duration + idlePollMax time.Duration + idlePollStep time.Duration + sessionIdleTimeout time.Duration + cleanupFileMaxAge time.Duration + maxPayloadBytes int + backpressureBytes int + storageOpTimeout time.Duration + immediateFlush bool + metricsLogInterval time.Duration // Server mode handler: called when a new session is discovered OnNewSession func(sessionID, targetAddr string, s *Session) @@ -40,6 +50,62 @@ type Engine struct { // Track processed files to avoid duplicates processed map[string]bool processedMu sync.Mutex + + flushTrigger chan struct{} + metrics engineMetrics +} + +type engineMetrics struct { + uploads uint64 + downloads uint64 + deletes uint64 + listCalls uint64 + uploadBytes uint64 + downloadBytes uint64 + uploadErrors uint64 + downloadErrors uint64 + listErrors uint64 + deleteErrors uint64 + uploadLatencyMs uint64 + downloadLatencyMs uint64 + listLatencyMs uint64 + deleteLatencyMs uint64 + maxUploadLatencyMs uint64 + maxDownloadLatencyMs 
uint64 + maxListLatencyMs uint64 + maxDeleteLatencyMs uint64 + fileAgeMs uint64 + maxFileAgeMs uint64 + firstResponses uint64 + firstResponseMs uint64 + maxFirstResponseMs uint64 +} + +type MetricsSnapshot struct { + ActiveSessions int `json:"active_sessions"` + Uploads uint64 `json:"uploads"` + Downloads uint64 `json:"downloads"` + Deletes uint64 `json:"deletes"` + ListCalls uint64 `json:"list_calls"` + UploadBytes uint64 `json:"upload_bytes"` + DownloadBytes uint64 `json:"download_bytes"` + UploadErrors uint64 `json:"upload_errors"` + DownloadErrors uint64 `json:"download_errors"` + ListErrors uint64 `json:"list_errors"` + DeleteErrors uint64 `json:"delete_errors"` + AvgUploadLatencyMs float64 `json:"avg_upload_latency_ms"` + AvgDownloadLatencyMs float64 `json:"avg_download_latency_ms"` + AvgListLatencyMs float64 `json:"avg_list_latency_ms"` + AvgDeleteLatencyMs float64 `json:"avg_delete_latency_ms"` + MaxUploadLatencyMs uint64 `json:"max_upload_latency_ms"` + MaxDownloadLatencyMs uint64 `json:"max_download_latency_ms"` + MaxListLatencyMs uint64 `json:"max_list_latency_ms"` + MaxDeleteLatencyMs uint64 `json:"max_delete_latency_ms"` + AvgFileAgeMs float64 `json:"avg_file_age_ms"` + MaxFileAgeMs uint64 `json:"max_file_age_ms"` + FirstResponses uint64 `json:"first_responses"` + AvgFirstResponseMs float64 `json:"avg_first_response_ms"` + MaxFirstResponseMs uint64 `json:"max_first_response_ms"` } func NewEngine(backend storage.Backend, isClient bool, clientID string) *Engine { @@ -50,8 +116,18 @@ func NewEngine(backend storage.Backend, isClient bool, clientID string) *Engine closedSessions: make(map[string]time.Time), processed: make(map[string]bool), // Default intervals: Poll (RX) fast for responsiveness, Flush (TX) slower for gathering - pollTicker: 500 * time.Millisecond, - flushTicker: 300 * time.Millisecond, + pollTicker: 500 * time.Millisecond, + flushTicker: 300 * time.Millisecond, + idlePollMax: 5 * time.Second, + idlePollStep: 500 * time.Millisecond, + 
sessionIdleTimeout: 10 * time.Second, + cleanupFileMaxAge: 10 * time.Second, + maxPayloadBytes: 768 * 1024, + backpressureBytes: 2 * 1024 * 1024, + storageOpTimeout: 18 * time.Second, + immediateFlush: false, + metricsLogInterval: 30 * time.Second, + flushTrigger: make(chan struct{}, 1), } if isClient { e.myDir = DirReq @@ -87,10 +163,69 @@ func (e *Engine) SetFlushRate(ms int) { } } +func (e *Engine) SetIdlePollMax(ms int) { + if ms > 0 { + e.idlePollMax = time.Duration(ms) * time.Millisecond + } +} + +func (e *Engine) SetIdlePollStep(ms int) { + if ms > 0 { + e.idlePollStep = time.Duration(ms) * time.Millisecond + } +} + +func (e *Engine) SetSessionIdleTimeout(seconds int) { + if seconds > 0 { + e.sessionIdleTimeout = time.Duration(seconds) * time.Second + } +} + +func (e *Engine) SetCleanupFileMaxAge(seconds int) { + if seconds > 0 { + e.cleanupFileMaxAge = time.Duration(seconds) * time.Second + } +} + +func (e *Engine) SetMaxPayloadBytes(bytes int) { + if bytes > 0 { + e.maxPayloadBytes = bytes + } +} + +func (e *Engine) SetBackpressureBytes(bytes int) { + if bytes > 0 { + e.backpressureBytes = bytes + } +} + +func (e *Engine) SetStorageOpTimeout(seconds int) { + if seconds > 0 { + e.storageOpTimeout = time.Duration(seconds) * time.Second + } +} + +func (e *Engine) SetImmediateFlush(enabled bool) { + e.immediateFlush = enabled +} + +func (e *Engine) ActiveSessionCount() int { + e.sessionMu.RLock() + defer e.sessionMu.RUnlock() + return len(e.sessions) +} + +func (e *Engine) SetMetricsLogInterval(seconds int) { + if seconds > 0 { + e.metricsLogInterval = time.Duration(seconds) * time.Second + } +} + func (e *Engine) Start(ctx context.Context) { go e.flushLoop(ctx) go e.pollLoop(ctx) go e.cleanupLoop(ctx) // Delete files older than 10s + go e.metricsLoop(ctx) } func (e *Engine) GetSession(id string) *Session { @@ -100,10 +235,56 @@ func (e *Engine) GetSession(id string) *Session { } func (e *Engine) AddSession(s *Session) { + 
s.SetBackpressureBytes(e.backpressureBytes) e.sessionMu.Lock() defer e.sessionMu.Unlock() e.sessions[s.ID] = s log.Printf("Engine.AddSession: Added session %s (Total now: %d)", s.ID, len(e.sessions)) + e.RequestFlush() +} + +func (e *Engine) RequestFlush() { + if !e.immediateFlush { + return + } + select { + case e.flushTrigger <- struct{}{}: + default: + } +} + +func (e *Engine) Snapshot() MetricsSnapshot { + current := e.snapshotMetrics() + e.sessionMu.RLock() + activeSessions := len(e.sessions) + e.sessionMu.RUnlock() + + return MetricsSnapshot{ + ActiveSessions: activeSessions, + Uploads: current.uploads, + Downloads: current.downloads, + Deletes: current.deletes, + ListCalls: current.listCalls, + UploadBytes: current.uploadBytes, + DownloadBytes: current.downloadBytes, + UploadErrors: current.uploadErrors, + DownloadErrors: current.downloadErrors, + ListErrors: current.listErrors, + DeleteErrors: current.deleteErrors, + AvgUploadLatencyMs: averageMs(current.uploadLatencyMs, current.uploads), + AvgDownloadLatencyMs: averageMs(current.downloadLatencyMs, current.downloads), + AvgListLatencyMs: averageMs(current.listLatencyMs, current.listCalls), + AvgDeleteLatencyMs: averageMs(current.deleteLatencyMs, current.deletes), + MaxUploadLatencyMs: current.maxUploadLatencyMs, + MaxDownloadLatencyMs: current.maxDownloadLatencyMs, + MaxListLatencyMs: current.maxListLatencyMs, + MaxDeleteLatencyMs: current.maxDeleteLatencyMs, + AvgFileAgeMs: averageMs(current.fileAgeMs, current.downloads), + MaxFileAgeMs: current.maxFileAgeMs, + FirstResponses: current.firstResponses, + AvgFirstResponseMs: averageMs(current.firstResponseMs, current.firstResponses), + MaxFirstResponseMs: current.maxFirstResponseMs, + } } func (e *Engine) flushLoop(ctx context.Context) { @@ -116,6 +297,8 @@ func (e *Engine) flushLoop(ctx context.Context) { return case <-ticker.C: e.flushAll(ctx) + case <-e.flushTrigger: + e.flushAll(ctx) } } } @@ -130,12 +313,13 @@ func (e *Engine) flushAll(ctx 
context.Context) { muxes := make(map[string][]Envelope) var closedSessionIDs []string + needsFollowupFlush := false for _, s := range sessions { s.mu.Lock() // Idle Timeout check - if time.Since(s.lastActivity) > 10*time.Second { + if time.Since(s.lastActivity) > e.sessionIdleTimeout { s.closed = true } @@ -147,19 +331,26 @@ func (e *Engine) flushAll(ctx context.Context) { } payload := s.txBuf - s.txBuf = nil + if e.maxPayloadBytes > 0 && len(payload) > e.maxPayloadBytes { + payload = append([]byte(nil), payload[:e.maxPayloadBytes]...) + s.txBuf = append([]byte(nil), s.txBuf[e.maxPayloadBytes:]...) + needsFollowupFlush = true + } else { + s.txBuf = nil + } s.txCond.Broadcast() // Release any blocked writers + closePacket := s.closed && len(s.txBuf) == 0 env := Envelope{ SessionID: s.ID, Seq: s.txSeq, Payload: payload, - Close: s.closed, + Close: closePacket, TargetAddr: s.TargetAddr, } s.txSeq++ - if s.closed { + if closePacket { closedSessionIDs = append(closedSessionIDs, s.ID) } @@ -183,9 +374,10 @@ func (e *Engine) flushAll(ctx context.Context) { fnameCID = "unknown" } filename := fmt.Sprintf("%s-%s-mux-%d.bin", e.myDir, fnameCID, time.Now().UnixNano()) + payloadBytes := muxPayloadBytes(mux) // Upload asynchronously with backpressure/limit - go func(fname string, m []Envelope) { + go func(fname string, m []Envelope, bytes int) { e.sem <- struct{}{} // Acquire defer func() { <-e.sem }() // Release @@ -200,20 +392,33 @@ func (e *Engine) flushAll(ctx context.Context) { } }() - if err := e.backend.Upload(ctx, fname, pr); err != nil { + start := time.Now() + opCtx, cancel := e.storageContext(ctx) + err := e.backend.Upload(opCtx, fname, pr) + cancel() + if err != nil { + atomic.AddUint64(&e.metrics.uploadErrors, 1) log.Printf("upload error %s: %v", fname, err) + return } - }(filename, mux) + latencyMs := uint64(time.Since(start).Milliseconds()) + atomic.AddUint64(&e.metrics.uploads, 1) + atomic.AddUint64(&e.metrics.uploadBytes, uint64(bytes)) + 
atomic.AddUint64(&e.metrics.uploadLatencyMs, latencyMs) + atomicMaxUint64(&e.metrics.maxUploadLatencyMs, latencyMs) + }(filename, mux, payloadBytes) } for _, id := range closedSessionIDs { e.RemoveSession(id) } + if needsFollowupFlush { + e.RequestFlush() + } } func (e *Engine) pollLoop(ctx context.Context) { currentPollInterval := e.pollTicker - maxPollInterval := 5 * time.Second timer := time.NewTimer(currentPollInterval) defer timer.Stop() @@ -245,12 +450,20 @@ func (e *Engine) pollLoop(ctx context.Context) { // Server polls for ALL client requests prefix += "" } - files, err := e.backend.ListQuery(ctx, prefix) + listStart := time.Now() + opCtx, cancel := e.storageContext(ctx) + files, err := e.backend.ListQuery(opCtx, prefix) + cancel() if err != nil { + atomic.AddUint64(&e.metrics.listErrors, 1) log.Printf("poll list error: %v", err) timer.Reset(currentPollInterval) continue } + listLatencyMs := uint64(time.Since(listStart).Milliseconds()) + atomic.AddUint64(&e.metrics.listCalls, 1) + atomic.AddUint64(&e.metrics.listLatencyMs, listLatencyMs) + atomicMaxUint64(&e.metrics.maxListLatencyMs, listLatencyMs) if len(files) == 0 { if e.myDir == DirRes { // SERVER OPTIMIZATION @@ -260,9 +473,9 @@ func (e *Engine) pollLoop(ctx context.Context) { if activeSessions == 0 { // Increase polling delay step-by-step to save API calls - currentPollInterval += 500 * time.Millisecond - if currentPollInterval > maxPollInterval { - currentPollInterval = maxPollInterval + currentPollInterval += e.idlePollStep + if currentPollInterval > e.idlePollMax { + currentPollInterval = e.idlePollMax } } else { // A session is currently active, so loop fast! 
@@ -286,8 +499,8 @@ func (e *Engine) pollLoop(ctx context.Context) { tsStr := parts[len(parts)-1] tsStr = strings.TrimSuffix(tsStr, ".bin") ts, _ := strconv.ParseInt(tsStr, 10, 64) - if ts > 0 && time.Since(time.Unix(0, ts)) > 5*time.Minute { - e.backend.Delete(ctx, f) // Silent cleanup + if ts > 0 && time.Since(time.Unix(0, ts)) > e.cleanupFileMaxAge { + e.deleteAsync(ctx, f) // Silent cleanup continue } } @@ -311,15 +524,24 @@ func (e *Engine) pollLoop(ctx context.Context) { defer func() { <-e.sem }() // Release // log.Printf("Engine.pollLoop: Downloading %s", fname) - rc, err := e.backend.Download(ctx, fname) + downloadStart := time.Now() + opCtx, cancel := e.storageContext(ctx) + rc, err := e.backend.Download(opCtx, fname) if err != nil { + cancel() + atomic.AddUint64(&e.metrics.downloadErrors, 1) log.Printf("download error %s: %v", fname, err) e.processedMu.Lock() delete(e.processed, fname) // failed to download, retry next poll e.processedMu.Unlock() return } - defer rc.Close() + defer func() { + rc.Close() + cancel() + }() + downloadLatencyMs := uint64(time.Since(downloadStart).Milliseconds()) + fileAgeMs := fileAgeMilliseconds(fname) // Extract ClientID from filename for server-side session initialization var fileClientID string @@ -330,6 +552,7 @@ func (e *Engine) pollLoop(ctx context.Context) { // STREAMING DECODE count := 0 + payloadBytes := 0 for { var env Envelope if err := env.Decode(rc); err != nil { @@ -339,6 +562,7 @@ func (e *Engine) pollLoop(ctx context.Context) { break } count++ + payloadBytes += len(env.Payload) // Process envelope immediately e.closedSessionsMu.Lock() @@ -353,6 +577,7 @@ func (e *Engine) pollLoop(ctx context.Context) { if !exists && e.myDir == DirRes && e.OnNewSession != nil { s = NewSession(env.SessionID) s.ClientID = fileClientID + s.TargetAddr = env.TargetAddr e.sessions[env.SessionID] = s e.sessionMu.Unlock() log.Printf("Engine: Triggering new session %s for Client %s", env.SessionID, fileClientID) @@ -362,11 +587,22 @@ 
func (e *Engine) pollLoop(ctx context.Context) { } if s != nil { + if len(env.Payload) > 0 { + e.recordFirstResponse(s) + } s.ProcessRx(&env) } } - e.backend.Delete(ctx, fname) + atomic.AddUint64(&e.metrics.downloads, 1) + atomic.AddUint64(&e.metrics.downloadBytes, uint64(payloadBytes)) + atomic.AddUint64(&e.metrics.downloadLatencyMs, downloadLatencyMs) + atomicMaxUint64(&e.metrics.maxDownloadLatencyMs, downloadLatencyMs) + if fileAgeMs > 0 { + atomic.AddUint64(&e.metrics.fileAgeMs, fileAgeMs) + atomicMaxUint64(&e.metrics.maxFileAgeMs, fileAgeMs) + } + e.deleteAsync(ctx, fname) }(f) } @@ -392,6 +628,53 @@ func (e *Engine) RemoveSession(id string) { e.closedSessionsMu.Unlock() } +func (e *Engine) deleteAsync(ctx context.Context, filename string) { + go func() { + e.sem <- struct{}{} + defer func() { <-e.sem }() + + start := time.Now() + opCtx, cancel := e.storageContext(ctx) + err := e.backend.Delete(opCtx, filename) + cancel() + if err != nil { + atomic.AddUint64(&e.metrics.deleteErrors, 1) + return + } + latencyMs := uint64(time.Since(start).Milliseconds()) + atomic.AddUint64(&e.metrics.deletes, 1) + atomic.AddUint64(&e.metrics.deleteLatencyMs, latencyMs) + atomicMaxUint64(&e.metrics.maxDeleteLatencyMs, latencyMs) + }() +} + +func (e *Engine) storageContext(ctx context.Context) (context.Context, context.CancelFunc) { + if e.storageOpTimeout <= 0 { + return context.WithCancel(ctx) + } + return context.WithTimeout(ctx, e.storageOpTimeout) +} + +func (e *Engine) recordFirstResponse(s *Session) { + s.mu.Lock() + if s.firstResponseLogged { + s.mu.Unlock() + return + } + s.firstResponseLogged = true + targetAddr := s.TargetAddr + elapsed := time.Since(s.createdAt) + s.mu.Unlock() + + latencyMs := uint64(elapsed.Milliseconds()) + atomic.AddUint64(&e.metrics.firstResponses, 1) + atomic.AddUint64(&e.metrics.firstResponseMs, latencyMs) + atomicMaxUint64(&e.metrics.maxFirstResponseMs, latencyMs) + if latencyMs > 2000 { + log.Printf("session first response slow: id=%s 
target=%s first_response_ms=%d", s.ID, targetAddr, latencyMs) + } +} + func (e *Engine) cleanupLoop(ctx context.Context) { ticker := time.NewTicker(5 * time.Second) defer ticker.Stop() @@ -427,7 +710,18 @@ func (e *Engine) cleanupLoop(ctx context.Context) { } } - files, _ := e.backend.ListQuery(ctx, string(e.myDir)+"-") + listStart := time.Now() + opCtx, cancel := e.storageContext(ctx) + files, err := e.backend.ListQuery(opCtx, string(e.myDir)+"-") + cancel() + if err != nil { + atomic.AddUint64(&e.metrics.listErrors, 1) + continue + } + listLatencyMs := uint64(time.Since(listStart).Milliseconds()) + atomic.AddUint64(&e.metrics.listCalls, 1) + atomic.AddUint64(&e.metrics.listLatencyMs, listLatencyMs) + atomicMaxUint64(&e.metrics.maxListLatencyMs, listLatencyMs) for _, f := range files { parts := strings.Split(f, "-") // Formats: @@ -440,8 +734,8 @@ func (e *Engine) cleanupLoop(ctx context.Context) { ts, err := strconv.ParseInt(tsStr, 10, 64) if err == nil { t := time.Unix(0, ts) - if time.Since(t) > 10*time.Second { - e.backend.Delete(ctx, f) + if time.Since(t) > e.cleanupFileMaxAge { + e.deleteAsync(ctx, f) } } } @@ -449,3 +743,133 @@ func (e *Engine) cleanupLoop(ctx context.Context) { } } } + +func (e *Engine) metricsLoop(ctx context.Context) { + if e.metricsLogInterval <= 0 { + return + } + + ticker := time.NewTicker(e.metricsLogInterval) + defer ticker.Stop() + + var last engineMetrics + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + current := e.snapshotMetrics() + e.sessionMu.RLock() + activeSessions := len(e.sessions) + e.sessionMu.RUnlock() + + log.Printf( + "metrics: active=%d uploads=%d/%s up_avg_ms=%.0f downloads=%d/%s down_avg_ms=%.0f lists=%d list_avg_ms=%.0f deletes=%d file_age_avg_ms=%.0f first_resp_avg_ms=%.0f errors[u=%d d=%d l=%d del=%d]", + activeSessions, + current.uploads-last.uploads, + formatBytes(current.uploadBytes-last.uploadBytes), + averageMs(current.uploadLatencyMs-last.uploadLatencyMs, 
current.uploads-last.uploads), + current.downloads-last.downloads, + formatBytes(current.downloadBytes-last.downloadBytes), + averageMs(current.downloadLatencyMs-last.downloadLatencyMs, current.downloads-last.downloads), + current.listCalls-last.listCalls, + averageMs(current.listLatencyMs-last.listLatencyMs, current.listCalls-last.listCalls), + current.deletes-last.deletes, + averageMs(current.fileAgeMs-last.fileAgeMs, current.downloads-last.downloads), + averageMs(current.firstResponseMs-last.firstResponseMs, current.firstResponses-last.firstResponses), + current.uploadErrors-last.uploadErrors, + current.downloadErrors-last.downloadErrors, + current.listErrors-last.listErrors, + current.deleteErrors-last.deleteErrors, + ) + last = current + } + } +} + +func (e *Engine) snapshotMetrics() engineMetrics { + return engineMetrics{ + uploads: atomic.LoadUint64(&e.metrics.uploads), + downloads: atomic.LoadUint64(&e.metrics.downloads), + deletes: atomic.LoadUint64(&e.metrics.deletes), + listCalls: atomic.LoadUint64(&e.metrics.listCalls), + uploadBytes: atomic.LoadUint64(&e.metrics.uploadBytes), + downloadBytes: atomic.LoadUint64(&e.metrics.downloadBytes), + uploadErrors: atomic.LoadUint64(&e.metrics.uploadErrors), + downloadErrors: atomic.LoadUint64(&e.metrics.downloadErrors), + listErrors: atomic.LoadUint64(&e.metrics.listErrors), + deleteErrors: atomic.LoadUint64(&e.metrics.deleteErrors), + uploadLatencyMs: atomic.LoadUint64(&e.metrics.uploadLatencyMs), + downloadLatencyMs: atomic.LoadUint64(&e.metrics.downloadLatencyMs), + listLatencyMs: atomic.LoadUint64(&e.metrics.listLatencyMs), + deleteLatencyMs: atomic.LoadUint64(&e.metrics.deleteLatencyMs), + maxUploadLatencyMs: atomic.LoadUint64(&e.metrics.maxUploadLatencyMs), + maxDownloadLatencyMs: atomic.LoadUint64(&e.metrics.maxDownloadLatencyMs), + maxListLatencyMs: atomic.LoadUint64(&e.metrics.maxListLatencyMs), + maxDeleteLatencyMs: atomic.LoadUint64(&e.metrics.maxDeleteLatencyMs), + fileAgeMs: 
atomic.LoadUint64(&e.metrics.fileAgeMs), + maxFileAgeMs: atomic.LoadUint64(&e.metrics.maxFileAgeMs), + firstResponses: atomic.LoadUint64(&e.metrics.firstResponses), + firstResponseMs: atomic.LoadUint64(&e.metrics.firstResponseMs), + maxFirstResponseMs: atomic.LoadUint64(&e.metrics.maxFirstResponseMs), + } +} + +func muxPayloadBytes(mux []Envelope) int { + var total int + for _, env := range mux { + total += len(env.Payload) + } + return total +} + +func formatBytes(n uint64) string { + const unit = 1024 + if n < unit { + return fmt.Sprintf("%dB", n) + } + value := float64(n) + for _, suffix := range []string{"KB", "MB", "GB"} { + value /= unit + if value < unit { + return fmt.Sprintf("%.1f%s", value, suffix) + } + } + return fmt.Sprintf("%.1fTB", value/unit) +} + +func averageMs(total, count uint64) float64 { + if count == 0 { + return 0 + } + return float64(total) / float64(count) +} + +func atomicMaxUint64(target *uint64, value uint64) { + for { + current := atomic.LoadUint64(target) + if value <= current { + return + } + if atomic.CompareAndSwapUint64(target, current, value) { + return + } + } +} + +func fileAgeMilliseconds(filename string) uint64 { + parts := strings.Split(filename, "-") + if len(parts) < 3 { + return 0 + } + tsStr := strings.TrimSuffix(parts[len(parts)-1], ".bin") + ts, err := strconv.ParseInt(tsStr, 10, 64) + if err != nil || ts <= 0 { + return 0 + } + age := time.Since(time.Unix(0, ts)) + if age < 0 { + return 0 + } + return uint64(age.Milliseconds()) +} diff --git a/internal/transport/session.go b/internal/transport/session.go index a2ae032..192a6e8 100644 --- a/internal/transport/session.go +++ b/internal/transport/session.go @@ -15,20 +15,23 @@ const ( // Session represents an active proxy connection mapped to files. 
type Session struct { - ID string - mu sync.Mutex - txBuf []byte - txSeq uint64 - rxSeq uint64 - rxQueue map[uint64]*Envelope - lastActivity time.Time - closed bool - rxClosed bool // Safely tracks if RxChan was successfully closed - TargetAddr string - ClientID string + ID string + mu sync.Mutex + txBuf []byte + txSeq uint64 + rxSeq uint64 + rxQueue map[uint64]*Envelope + createdAt time.Time + lastActivity time.Time + firstResponseLogged bool + closed bool + rxClosed bool // Safely tracks if RxChan was successfully closed + TargetAddr string + ClientID string // Backpressure: blocked when txBuf is too large - txCond *sync.Cond + txCond *sync.Cond + backpressureBytes int // App channel for receiving data downloaded from remote RxChan chan []byte @@ -36,22 +39,32 @@ type Session struct { func NewSession(id string) *Session { s := &Session{ - ID: id, - rxQueue: make(map[uint64]*Envelope), - lastActivity: time.Now(), - RxChan: make(chan []byte, 1024), + ID: id, + rxQueue: make(map[uint64]*Envelope), + createdAt: time.Now(), + lastActivity: time.Now(), + RxChan: make(chan []byte, 1024), + backpressureBytes: 2 * 1024 * 1024, } s.txCond = sync.NewCond(&s.mu) return s } +func (s *Session) SetBackpressureBytes(bytes int) { + s.mu.Lock() + if bytes > 0 { + s.backpressureBytes = bytes + } + s.mu.Unlock() +} + func (s *Session) EnqueueTx(data []byte) { s.mu.Lock() defer s.mu.Unlock() - // BACKPRESSURE: Block if txBuf is larger than 2MB + // BACKPRESSURE: Block if txBuf is larger than the configured limit. 
// This prevents memory explosion when uploading through the proxy - for len(s.txBuf) > 2*1024*1024 && !s.closed { + for len(s.txBuf) > s.backpressureBytes && !s.closed { s.txCond.Wait() } diff --git a/scripts/build_release.sh b/scripts/build_release.sh index 85fd71c..8d58b77 100644 --- a/scripts/build_release.sh +++ b/scripts/build_release.sh @@ -41,6 +41,7 @@ for platform in "${platforms[@]}"; do # Copy Example Configs and README cp client_config.json.example "$OUTPUT_PATH/" cp server_config.json.example "$OUTPUT_PATH/" + cp scripts/flowdriver-server.service "$OUTPUT_PATH/" cp README.md "$OUTPUT_PATH/" # Zip it up diff --git a/scripts/collect_client_diagnostics.sh b/scripts/collect_client_diagnostics.sh new file mode 100755 index 0000000..727c674 --- /dev/null +++ b/scripts/collect_client_diagnostics.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash +set -u + +OUT_ROOT="${1:-diagnostics}" +STAMP="$(date +%Y%m%d-%H%M%S)" +OUT_DIR="${OUT_ROOT}/flowdriver-client-${STAMP}" +ARCHIVE="" +CLIENT_LOG="${FLOWDRIVER_CLIENT_LOG:-client.log}" +CLIENT_METRICS_URL="${FLOWDRIVER_CLIENT_METRICS_URL:-http://127.0.0.1:18081/metrics}" +CLIENT_HEALTH_URL="${FLOWDRIVER_CLIENT_HEALTH_URL:-http://127.0.0.1:18081/healthz}" +SOCKS_PROXY="${FLOWDRIVER_SOCKS_PROXY:-socks5h://127.0.0.1:1080}" +CURL_MAX_TIME="${FLOWDRIVER_CURL_MAX_TIME:-60}" +RUN_CURLS="${FLOWDRIVER_RUN_CURLS:-0}" +TAIL_LINES="${FLOWDRIVER_TAIL_LINES:-1200}" +SAMPLE_SECONDS="${FLOWDRIVER_SAMPLE_SECONDS:-30}" +SAMPLE_INTERVAL="${FLOWDRIVER_SAMPLE_INTERVAL:-5}" +SERVER_SSH="${FLOWDRIVER_SERVER_SSH:-}" +SSH_PROXY_HOSTPORT="${FLOWDRIVER_SSH_PROXY_HOSTPORT:-127.0.0.1:1080}" + +mkdir -p "$OUT_DIR" + +finalize() { + if [ -n "$ARCHIVE" ] && [ -f "$ARCHIVE" ]; then + return + fi + + if command -v zip >/dev/null 2>&1; then + ARCHIVE="${OUT_DIR}.zip" + (cd "$OUT_ROOT" && zip -qr "$(basename "$ARCHIVE")" "$(basename "$OUT_DIR")") + else + ARCHIVE="${OUT_DIR}.tar.gz" + tar -czf "$ARCHIVE" -C "$OUT_ROOT" "$(basename "$OUT_DIR")" + fi + + echo 
"Diagnostics written to: $OUT_DIR" + echo "Archive: $ARCHIVE" +} +trap finalize EXIT +trap 'finalize; exit 130' INT TERM + +run_capture() { + local name="$1" + shift + echo "Collecting ${name}..." + { + echo "\$ $*" + "$@" + } > "${OUT_DIR}/${name}" 2>&1 +} + +run_server_ssh_capture() { + local name="$1" + local remote_cmd="$2" + local out="${OUT_DIR}/${name}" + local ssh_opts=(-o BatchMode=yes -o ConnectTimeout=5 -o ServerAliveInterval=5 -o ServerAliveCountMax=1) + local ssh_proxy_opts=(-o BatchMode=yes -o ConnectTimeout=20 -o ServerAliveInterval=5 -o ServerAliveCountMax=1 -o "ProxyCommand=nc -x ${SSH_PROXY_HOSTPORT} -X 5 %h %p") + + { + echo "Collecting ${name}..." + echo "\$ ssh ${SERVER_SSH} ${remote_cmd}" + echo "--- direct ssh attempt ---" + if ssh "${ssh_opts[@]}" "$SERVER_SSH" "$remote_cmd"; then + return 0 # not exit: this brace group runs in the current shell, so exit would end the whole script + fi + + echo "--- direct ssh failed; trying through SOCKS5 proxy ${SSH_PROXY_HOSTPORT} ---" + echo "\$ ssh -o ProxyCommand='nc -x ${SSH_PROXY_HOSTPORT} -X 5 %h %p' ${SERVER_SSH} ${remote_cmd}" + ssh "${ssh_proxy_opts[@]}" "$SERVER_SSH" "$remote_cmd" || true + } > "$out" 2>&1 +} + +write_note() { + cat > "${OUT_DIR}/README.txt" < "${OUT_DIR}/client_tail_initial.log" +else + echo "Client log not found: $CLIENT_LOG" > "${OUT_DIR}/client_tail_initial.log" +fi + +if [ -f client_config.json ]; then + echo "Collecting client_config.redacted.json..." + sed -E \ + -e 's/("client_secret"[[:space:]]*:[[:space:]]*")[^"]+/\1REDACTED/g' \ + -e 's/("refresh_token"[[:space:]]*:[[:space:]]*")[^"]+/\1REDACTED/g' \ + -e 's/("client_id"[[:space:]]*:[[:space:]]*")[^"]+/\1REDACTED/g' \ + client_config.json > "${OUT_DIR}/client_config.redacted.json" +fi + +echo "Sampling client metrics for ${SAMPLE_SECONDS}s..." 
+END=$((SECONDS + SAMPLE_SECONDS)) +while [ "$SECONDS" -lt "$END" ]; do + METRICS="$(curl -sS --max-time 5 "$CLIENT_METRICS_URL" 2>> "${OUT_DIR}/client_metrics_errors.log" || true)" + if [ -z "$METRICS" ]; then + METRICS="null" + fi + printf '{"ts":"%s","metrics":%s}\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$METRICS" >> "${OUT_DIR}/client_metrics_samples.jsonl" + sleep "$SAMPLE_INTERVAL" +done + +if [ "$RUN_CURLS" = "1" ]; then + echo "Running curl benchmarks; each one can take up to ${CURL_MAX_TIME}s..." + run_capture curl_google.txt curl -L --max-time "$CURL_MAX_TIME" -x "$SOCKS_PROXY" https://www.google.com -o /dev/null -w 'time_total=%{time_total} starttransfer=%{time_starttransfer} size=%{size_download} speed=%{speed_download}\n' + run_capture curl_facebook.txt curl -L --max-time "$CURL_MAX_TIME" -x "$SOCKS_PROXY" https://www.facebook.com -o /dev/null -w 'time_total=%{time_total} starttransfer=%{time_starttransfer} size=%{size_download} speed=%{speed_download}\n' + run_capture curl_youtube.txt curl -L --max-time "$CURL_MAX_TIME" -x "$SOCKS_PROXY" https://youtube.com -o /dev/null -w 'time_total=%{time_total} starttransfer=%{time_starttransfer} size=%{size_download} speed=%{speed_download}\n' +else + echo "Skipping curl benchmarks. Set FLOWDRIVER_RUN_CURLS=1 to enable them." +fi + +run_capture metrics_final.json curl -sS --max-time 5 "$CLIENT_METRICS_URL" + +if [ -f "$CLIENT_LOG" ]; then + echo "Collecting client_tail.log..." 
+ tail -n "$TAIL_LINES" "$CLIENT_LOG" > "${OUT_DIR}/client_tail.log" +else + echo "Client log not found: $CLIENT_LOG" > "${OUT_DIR}/client_tail.log" +fi + +if [ -n "$SERVER_SSH" ]; then + run_server_ssh_capture server_metrics.txt 'curl -sS --max-time 5 http://127.0.0.1:18080/metrics' + run_server_ssh_capture server_tail.log 'tail -n 1200 ~/flowdriver/server.log 2>/dev/null || true' +fi diff --git a/scripts/flowdriver-server.service b/scripts/flowdriver-server.service new file mode 100644 index 0000000..c300546 --- /dev/null +++ b/scripts/flowdriver-server.service @@ -0,0 +1,16 @@ +[Unit] +Description=Flow Driver server +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=ubuntu +WorkingDirectory=/home/ubuntu/flowdriver +ExecStart=/home/ubuntu/flowdriver/server -c /home/ubuntu/flowdriver/server_config.json -gc /home/ubuntu/flowdriver/credentials.json +Restart=always +RestartSec=5 +NoNewPrivileges=true + +[Install] +WantedBy=multi-user.target diff --git a/server_config.json.example b/server_config.json.example index bc1101f..64c6690 100644 --- a/server_config.json.example +++ b/server_config.json.example @@ -1,5 +1,22 @@ { "storage_type": "google", - "refresh_rate_ms": 100, - "flush_rate_ms": 300 -} \ No newline at end of file + "performance_profile": "balanced", + "google_folder_id": "YOUR_FOLDER_ID", + "refresh_rate_ms": 200, + "flush_rate_ms": 300, + "idle_poll_max_ms": 2000, + "idle_poll_step_ms": 500, + "session_idle_timeout_sec": 60, + "cleanup_file_max_age_sec": 60, + "storage_retry_max": 3, + "storage_retry_base_ms": 300, + "storage_op_timeout_sec": 45, + "max_payload_bytes": 786432, + "max_active_sessions": 20, + "session_wait_timeout_sec": 15, + "backpressure_bytes": 4194304, + "immediate_flush": false, + "metrics_log_sec": 30, + "health_listen_addr": "127.0.0.1:18080", + "google_lanes": [] +} From e711524a24f4e5e16a7e1f35108b4e0846ad06d0 Mon Sep 17 00:00:00 2001 From: PK3NZO Date: Sat, 25 Apr 2026 20:50:03 +0330 
Subject: [PATCH 2/3] Stabilize runtime transport and document setup --- README.md | 347 ++++++++++------ client_config.json.example | 3 + cmd/client/main.go | 4 + cmd/server/main.go | 11 +- internal/config/config.go | 19 + internal/httpclient/client.go | 8 +- internal/transport/conn.go | 11 +- internal/transport/engine.go | 562 ++++++++++++++++++-------- internal/transport/session.go | 10 +- scripts/collect_client_diagnostics.sh | 23 +- server_config.json.example | 3 + 11 files changed, 683 insertions(+), 318 deletions(-) diff --git a/README.md b/README.md index a307734..3ccd95e 100644 --- a/README.md +++ b/README.md @@ -30,116 +30,274 @@ Flow Driver works by treating a cloud storage folder as a data queue: ## Setup & Installation / نصب و راه‌اندازی +This section is a complete client-to-server setup guide for the Google Drive backend. + ### Prerequisites / پیش‌نیازها -- **Go** (1.25 or higher) -- **Google Drive API Credentials**: You need a `credentials.json` (OAuth2) file. -- **Shared Folder (Auto)**: If you leave `google_folder_id` empty, the tool will automatically create a folder named **"Flow-Data"** and save its ID to your config! - -### 1. Obtain Credentials / دریافت فایل اعتبارنامه -To get your `credentials.json`, follow the instructions on the [Google Drive API Go Quickstart](https://developers.google.com/workspace/drive/api/quickstart/go) or follow these steps: - -برای دریافت فایل `credentials.json` می‌توانید طبق دستورالعمل‌های موجود در [شروع سریع Google Drive API برای Go](https://developers.google.com/workspace/drive/api/quickstart/go) عمل کنید یا مراحل زیر را انجام دهید: - -**English:** -1. **Enable the API**: Go to the [Google Cloud Console](https://console.cloud.google.com/), create a project, and enable the **Google Drive API**. -2. **Configure Consent Screen**: Go to "APIs & Services" > "OAuth consent screen." Fill in the app name and user support email (Branding). -3. 
**Create Credentials**: Go to "Credentials" > "Create Credentials" > **OAuth client ID**. Select **Desktop App** as the application type. -4. **Download JSON**: Download the client secret file and rename it to `credentials.json`. -5. **Publish App (Optional but Recommended)**: If your app status is "Testing," your token will expire every 7 days. Go to the OAuth consent screen and click "Publish App" to make the authorization permanent for your account. - -**فارسی:** -1. **فعال‌سازی API**: به [کنسول گوگل کلاود](https://console.cloud.google.com/) بروید، یک پروژه بسازید و **Google Drive API** را فعال کنید. -2. **تنظیم صفحه رضایت**: به بخش "APIs & Services" > "OAuth consent screen" بروید. نام برنامه و ایمیل پشتیبانی را وارد کنید (بخش Branding). -3. **ساخت اعتبارنامه**: به بخش "Credentials" > "Create Credentials" > **OAuth client ID** بروید. نوع برنامه را **Desktop App** انتخاب کنید. -4. **دانلود فایل**: فایل کلاینت سکرت را دانلود کرده و نام آن را به `credentials.json` تغییر دهید. -5. **انتشار برنامه (پیشنهادی)**: اگر وضعیت برنامه روی "Testing" باشد، توکن شما هر ۷ روز منقضی می‌شود. در صفحه OAuth consent screen بر روی "Publish App" کلیک کنید تا دسترسی برای اکانت شما دائمی شود. - -### 2. Build Binaries / ساخت فایل‌های اجرایی + +- Go 1.25 or newer on the build machine. +- A Linux upstream server or VPS. The examples below assume Ubuntu and user `ubuntu`. +- A Google Cloud project with the Google Drive API enabled. +- An OAuth Desktop App credential downloaded as `credentials.json`. + +Do not commit `credentials.json`, `credentials.json.token`, real config files, or diagnostics. They are ignored by `.gitignore`. + +### 1. Create Google OAuth Credentials + +1. Open Google Cloud Console and create or select a project. +2. Enable **Google Drive API**. +3. Go to **APIs & Services -> OAuth consent screen** and configure the app name, support email, and developer contact email. +4. If the app is in **Testing**, add the Google account you will use under **Test users**. 
Otherwise Google may return `Error 403: access_denied`. +5. Go to **APIs & Services -> Credentials -> Create Credentials -> OAuth client ID**. +6. Select **Desktop app**. +7. Download the JSON file and save it in the repository root as `credentials.json`. + +If you want long-lived tokens for your own account, publish the OAuth app after the consent screen is complete. Apps left in Testing can require re-authorization. + +### 2. Build Binaries + +Build the local client for your machine: + ```bash go build -o bin/client ./cmd/client -go build -o bin/server ./cmd/server ``` -### 2. Configuration / پیکربندی +Build the Linux server binary from macOS or another build host: -Create your `config.json` based on the provided examples: +```bash +GOOS=linux GOARCH=amd64 go build -o bin/server-linux-amd64 ./cmd/server +go build -o bin/purge ./cmd/purge +``` + +### 3. Create Config Files + +Start from the examples: + +```bash +cp client_config.json.example client_config.json +cp server_config.json.example server_config.json +``` + +Recommended balanced client config: -**Client Side (`client_config.json`):** ```json { "listen_addr": "127.0.0.1:1080", "storage_type": "google", - "google_folder_id": "YOUR_FOLDER_ID", - "refresh_rate_ms": 100, + "performance_profile": "balanced", + "refresh_rate_ms": 200, "flush_rate_ms": 300, + "idle_poll_max_ms": 2000, + "idle_poll_step_ms": 500, + "session_idle_timeout_sec": 25, + "cleanup_file_max_age_sec": 60, + "startup_stale_max_age_sec": 20, + "storage_retry_max": 3, + "storage_retry_base_ms": 300, + "storage_op_timeout_sec": 45, + "max_payload_bytes": 786432, + "max_active_sessions": 0, + "session_wait_timeout_sec": 15, + "backpressure_bytes": 4194304, + "immediate_flush": false, + "cold_start_burst_ms": 10000, + "cold_start_poll_ms": 100, + "metrics_log_sec": 30, + "health_listen_addr": "127.0.0.1:18081", "transport": { "TargetIP": "216.239.38.120:443", "SNI": "google.com", - "HostHeader": "www.googleapis.com" - } + "HostHeader": 
"www.googleapis.com", + "InsecureSkipVerify": false + }, + "google_lanes": [] } ``` ---- -## Performance & Quotas / عملکرد و سهمیه‌ها +Recommended balanced server config: -### English -**Important**: Google Drive has strict API rate limits (quotas). -- Using very low values (e.g., `refresh_rate_ms: 100`) will consume your API quota very quickly. -- To avoid connections being limited or blocked, it is recommended to keep these values above **100ms** at all times. -- For heavy usage or multiple concurrent users, you should set these to **200ms or higher**. - -### فارسی -**نکته مهم**: گوگل درایو محدودیت‌های سفت‌وسختی برای تعداد درخواست‌های API (Quota) دارد. -- استفاده از مقادیر بسیار پایین (مثلاً `100ms`) باعث می‌شود سهمیه API شما به سرعت تمام شود. -- برای جلوگیری از محدود شدن یا قطع شدن اتصال، توصیه می‌شود این مقادیر همیشه بالای **100ms** باشند. -- برای استفاده‌های سنگین یا زمانی که چندین کاربر به صورت هم‌زمان متصل هستند، بهتر است این مقادیر را روی **200ms یا بالاتر** تنظیم کنید. - -**Server Side (`server_config.json`):** ```json { "storage_type": "google", - "google_folder_id": "YOUR_FOLDER_ID", "performance_profile": "balanced", "refresh_rate_ms": 200, "flush_rate_ms": 300, "idle_poll_max_ms": 2000, "idle_poll_step_ms": 500, - "session_idle_timeout_sec": 25, + "session_idle_timeout_sec": 60, "cleanup_file_max_age_sec": 60, + "startup_stale_max_age_sec": 20, "storage_retry_max": 3, "storage_retry_base_ms": 300, "storage_op_timeout_sec": 45, "max_payload_bytes": 786432, - "max_active_sessions": 0, + "max_active_sessions": 20, "session_wait_timeout_sec": 15, "backpressure_bytes": 4194304, "immediate_flush": false, + "cold_start_burst_ms": 10000, + "cold_start_poll_ms": 100, "metrics_log_sec": 30, "health_listen_addr": "127.0.0.1:18080", "google_lanes": [] } ``` +Leave `google_folder_id` empty for the first local client run. The client will find or create a Google Drive folder named `Flow-Data` and write the folder ID back to `client_config.json`. 
Copy that same folder ID into `server_config.json`. + +### 4. First Local OAuth Login + +Run the client once locally: + +```bash +./bin/client -c client_config.json -gc credentials.json +``` + +The client prints a Google OAuth URL. Open it in a browser, approve access, then copy the full redirected `http://localhost/?code=...` URL back into the terminal. It is fine if the browser page itself cannot connect. + +After success, the client writes: + +```text +credentials.json.token +``` + +Keep both `credentials.json` and `credentials.json.token` private. + +### 5. Deploy the Server + +Create the remote directory and copy the server files: + +```bash +ssh ubuntu@YOUR_SERVER_IP 'mkdir -p ~/flowdriver' +scp bin/server-linux-amd64 server_config.json credentials.json credentials.json.token ubuntu@YOUR_SERVER_IP:~/flowdriver/ +``` + +On the server: + +```bash +ssh ubuntu@YOUR_SERVER_IP +cd ~/flowdriver +mv server-linux-amd64 server +chmod +x server +./server -c server_config.json -gc credentials.json +``` + +The server is running correctly when it prints `Starting Flow Server...` and keeps waiting. It only logs more when client traffic arrives. + +### 6. 
Run the Server with systemd + +For an always-on Ubuntu service: + +```bash +scp scripts/flowdriver-server.service ubuntu@YOUR_SERVER_IP:/tmp/ +ssh ubuntu@YOUR_SERVER_IP +sudo mv /tmp/flowdriver-server.service /etc/systemd/system/flowdriver-server.service +sudo systemctl daemon-reload +sudo systemctl enable --now flowdriver-server +sudo systemctl status flowdriver-server --no-pager +``` + +Useful service commands: + +```bash +sudo systemctl restart flowdriver-server +sudo systemctl stop flowdriver-server +sudo systemctl status flowdriver-server --no-pager +journalctl -u flowdriver-server -f +curl http://127.0.0.1:18080/metrics +``` + +To update the server binary later: + +```bash +GOOS=linux GOARCH=amd64 go build -o bin/server-linux-amd64 ./cmd/server +scp bin/server-linux-amd64 server_config.json ubuntu@YOUR_SERVER_IP:~/flowdriver/ +ssh ubuntu@YOUR_SERVER_IP 'cd ~/flowdriver && mv server-linux-amd64 server && chmod +x server && sudo systemctl restart flowdriver-server' +``` + +### 7. Run the Client + +Run the client locally and keep it open: + +```bash +./bin/client -c client_config.json -gc credentials.json +``` + +For logs that are easier to inspect: + +```bash +./bin/client -c client_config.json -gc credentials.json 2>&1 | tee client.log +``` + +The local SOCKS5 proxy listens on: + +```text +127.0.0.1:1080 +``` + +Test with curl: + +```bash +curl -L -x socks5h://127.0.0.1:1080 https://www.google.com -o /dev/null -w 'time_total=%{time_total} starttransfer=%{time_starttransfer} size=%{size_download}\n' +curl -L -x socks5h://127.0.0.1:1080 https://www.facebook.com -o /dev/null -w 'time_total=%{time_total} starttransfer=%{time_starttransfer} size=%{size_download}\n' +``` + +Configure your browser to use SOCKS5 `127.0.0.1:1080`. For browser workloads, expect initial page load latency; once streams start, throughput should be steadier. + +### 8. 
Diagnostics + +Client health and metrics: + +```bash +curl http://127.0.0.1:18081/healthz +curl http://127.0.0.1:18081/metrics +``` + +Collect a 5-minute client diagnostic bundle: + +```bash +FLOWDRIVER_SERVER_SSH=ubuntu@YOUR_SERVER_IP \ +FLOWDRIVER_SAMPLE_SECONDS=300 \ +FLOWDRIVER_SAMPLE_INTERVAL=5 \ +FLOWDRIVER_TAIL_LINES=3000 \ +./scripts/collect_client_diagnostics.sh +``` + +The script writes a timestamped directory and archive under `diagnostics/`. It does not collect `credentials.json` or `.token` files. + +Key metrics to watch: + +- `upload_errors`, `download_errors`, `list_errors`, `delete_errors`: should stay at or near zero. +- `avg_first_upload_ms`: local queue-to-Drive upload latency. +- `avg_first_server_seen_ms`: how long it takes the server to see a new client request file. +- `avg_first_response_ms`: time until the client receives first response bytes. +- `poll_files_stale`: old transport leftovers ignored and deleted. +- `max_file_age_ms`: large values can indicate backlog or old files. + ### Runtime Tuning / تنظیمات اجرا `performance_profile` can be set to: + - `fast`: lower startup latency, higher Google API usage. - `balanced`: recommended default for normal browsing and downloads. - `quota-saver`: lower API usage, higher startup latency. -You can override any profile value directly: +Important options: + - `refresh_rate_ms`: how often each side polls for incoming files while active. - `flush_rate_ms`: how often buffered data is uploaded. -- `idle_poll_max_ms`: server-side maximum polling delay while idle. Lower values reduce first-load delay. +- `idle_poll_max_ms`: server-side maximum polling delay while idle. - `session_idle_timeout_sec`: inactive connection timeout. +- `cleanup_file_max_age_sec`: deletes stale transport files. +- `startup_stale_max_age_sec`: ignores and deletes old transport leftovers before they can pollute cold starts. - `storage_retry_max` and `storage_retry_base_ms`: retry policy for transient Google API failures. 
- `storage_op_timeout_sec`: fail-fast timeout for individual Google Drive operations. - `max_payload_bytes`: maximum per-session payload size written into one transport file. -- `max_active_sessions`: client-side cap for concurrent SOCKS sessions. +- `max_active_sessions`: cap for concurrent sessions. `0` means unlimited. - `session_wait_timeout_sec`: how long new SOCKS sessions wait for capacity. - `backpressure_bytes`: per-session buffer limit before application writes wait. -- `immediate_flush`: uploads new data promptly instead of waiting for the next flush tick. Leave this off for browser/video/download workloads because Google Drive performs better with batched files. +- `immediate_flush`: uploads new data promptly instead of waiting for the next flush tick. Keep this disabled for browser/video/download workloads because Google Drive generally performs better with batched files. +- `cold_start_burst_ms`: temporary fast-polling window after a new session starts. +- `cold_start_poll_ms`: polling interval used during the cold-start burst. - `metrics_log_sec`: periodic operational metrics log interval. - `health_listen_addr`: optional local HTTP endpoint for `/healthz` and `/metrics`. @@ -162,78 +320,11 @@ For higher throughput or resilience, configure multiple Google Drive lanes on bo Each lane can use a separate Google account or folder. Uploads are distributed across healthy lanes, and a lane with transient failures is temporarily avoided. -### 3. 
Run / اجرا - -**Server:** -```bash -./bin/server -c server_config.json -gc credentials.json -``` - -**Client:** -```bash -./bin/client -c client_config.json -gc credentials.json -``` - -### Run Server as a systemd Service / اجرای سرور به صورت سرویس - -On an Ubuntu VPS, copy `scripts/flowdriver-server.service` to systemd after placing the server files in `/home/ubuntu/flowdriver`: - -```bash -sudo cp scripts/flowdriver-server.service /etc/systemd/system/flowdriver-server.service -sudo systemctl daemon-reload -sudo systemctl enable --now flowdriver-server -sudo systemctl status flowdriver-server -``` - -Logs: - -```bash -journalctl -u flowdriver-server -f -``` - -Health and metrics: - -```bash -curl http://127.0.0.1:18080/healthz -curl http://127.0.0.1:18080/metrics -``` - ---- +### Troubleshooting -## Usage & Authentication / نحوه استفاده و احراز هویت - -### 1. First-Time Authentication / احراز هویت اولیه -The project uses OAuth2 "3-legged" flow. You only need to do this once on your local machine: - -**English:** -1. Run the client: `./bin/client -c client_config.json -gc credentials.json` -2. A link will appear in your terminal. **Copy and open it** in your web browser. -3. Log in to your Google account and grant permissions. -4. You will be redirected to an address starting with `http://localhost` (it's okay if the page doesn't load). -5. **Copy the entire URL** from your browser's address bar and paste it back into your terminal. -6. The program will create a `.token` file next to your `credentials.json`. Authorization is now complete. - -**فارسی:** -1. کلاینت را اجرا کنید: `./bin/client -c client_config.json -gc credentials.json` -2. یک لینک در ترمینال ظاهر می‌شود. آن را کپی کرده و در مرورگر خود باز کنید. -3. وارد اکانت گوگل خود شوید و دسترسی‌های لازم را تایید کنید. -4. شما به آدرسی که با `http://localhost` شروع می‌شود هدایت می‌شوید (اشکالی ندارد اگر صفحه باز نشود). -5. **کل آدرس URL** را از نوار آدرس مرورگر کپی کرده و در ترمینال پیست کنید. -6. 
برنامه یک فایل با پسوند `.token` در کنار `credentials.json` شما می‌سازد. احراز هویت تمام شد. - -### 2. Deploying to Server / استقرار در سرور -Once you have the `.token` file, you don't need to log in again. - -**English:** -To run the server on a remote upstream machine: -1. Copy `credentials.json` **AND** the `.token` file to the server. -2. **Crucial**: Make sure your `server_config.json` has the **SAME** `google_folder_id` that the client just created and saved in your local config. -3. Run: `./bin/server -c server_config.json -gc credentials.json` -4. The server will automatically use the existing token and start immediately. - -**فارسی:** -پس از دریافت فایل `.token` دیگر نیازی به لاگین مجدد نیست. برای اجرای سرور در یک ماشین دور (Upstream): -1. فایل `credentials.json` **و** فایل `.token` ساخته شده را به سرور منتقل کنید. -2. **خیلی مهم**: مطمئن شوید که در فایل `server_config.json` مقدار `google_folder_id` دقیقاً همان مقداری باشد که کلاینت به طور خودکار ساخته و در فایل کانفیگ شما ذخیره کرده است. -3. اجرا کنید: `./bin/server -c server_config.json -gc credentials.json` -4. سرور به صورت خودکار از توکن موجود استفاده کرده و بلافاصله شروع به کار می‌کند. +- `Error 403: access_denied`: add your Google account as an OAuth test user or publish the app. +- Browser redirects to `localhost` and fails to load: this is expected. Copy the full URL from the address bar back into the client prompt. +- Server appears stuck after `Starting Flow Server...`: that is normal while idle. Generate client traffic and watch `journalctl -u flowdriver-server -f`. +- `scp: stat local ... no such file`: run commands from the repository root, or use full paths. +- Client metrics show many `context deadline exceeded` errors: reduce browser load, restart both sides, and check Google Drive/API reachability. +- Large `poll_files_stale` or `max_file_age_ms`: old request/response files are being cleaned up; restart both sides and retest after the folder drains. 
diff --git a/client_config.json.example b/client_config.json.example index 06656d5..138edb6 100644 --- a/client_config.json.example +++ b/client_config.json.example @@ -8,6 +8,7 @@ "idle_poll_step_ms": 500, "session_idle_timeout_sec": 25, "cleanup_file_max_age_sec": 60, + "startup_stale_max_age_sec": 20, "storage_retry_max": 3, "storage_retry_base_ms": 300, "storage_op_timeout_sec": 45, @@ -16,6 +17,8 @@ "session_wait_timeout_sec": 15, "backpressure_bytes": 4194304, "immediate_flush": false, + "cold_start_burst_ms": 10000, + "cold_start_poll_ms": 100, "metrics_log_sec": 30, "health_listen_addr": "127.0.0.1:18081", "transport": { diff --git a/cmd/client/main.go b/cmd/client/main.go index b95d4db..e206339 100644 --- a/cmd/client/main.go +++ b/cmd/client/main.go @@ -100,10 +100,12 @@ func main() { engine.SetIdlePollStep(appCfg.IdlePollStepMs) engine.SetSessionIdleTimeout(appCfg.SessionIdleTimeoutSec) engine.SetCleanupFileMaxAge(appCfg.CleanupFileMaxAgeSec) + engine.SetStartupStaleMaxAge(appCfg.StartupStaleMaxAgeSec) engine.SetMaxPayloadBytes(appCfg.MaxPayloadBytes) engine.SetBackpressureBytes(appCfg.BackpressureBytes) engine.SetStorageOpTimeout(appCfg.StorageOpTimeoutSec) engine.SetImmediateFlush(appCfg.ImmediateFlush) + engine.SetColdStartBurst(appCfg.ColdStartBurstMs, appCfg.ColdStartPollMs) engine.SetMetricsLogInterval(appCfg.MetricsLogSec) engine.Start(ctx) health.Start(ctx, appCfg.HealthListenAddr, engine) @@ -140,6 +142,8 @@ func main() { // Instantly ping a blank payload so the remote end opens the actual TCP destination session.EnqueueTx(nil) + engine.TriggerWarmPoll() + engine.ForceFlush() return transport.NewVirtualConn(session, engine), nil }), diff --git a/cmd/server/main.go b/cmd/server/main.go index 96b131d..19318ac 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -77,10 +77,12 @@ func main() { engine.SetIdlePollStep(appCfg.IdlePollStepMs) engine.SetSessionIdleTimeout(appCfg.SessionIdleTimeoutSec) 
engine.SetCleanupFileMaxAge(appCfg.CleanupFileMaxAgeSec) + engine.SetStartupStaleMaxAge(appCfg.StartupStaleMaxAgeSec) engine.SetMaxPayloadBytes(appCfg.MaxPayloadBytes) engine.SetBackpressureBytes(appCfg.BackpressureBytes) engine.SetStorageOpTimeout(appCfg.StorageOpTimeoutSec) engine.SetImmediateFlush(appCfg.ImmediateFlush) + engine.SetColdStartBurst(appCfg.ColdStartBurstMs, appCfg.ColdStartPollMs) engine.SetMetricsLogInterval(appCfg.MetricsLogSec) // Called by polling loop when a new incoming session file is found @@ -118,8 +120,13 @@ func handleServerConn(sessionID, targetAddr string, session *transport.Session, for { n, err := conn.Read(buf) if n > 0 { - session.EnqueueTx(buf[:n]) - engine.RequestFlush() + firstPacket := session.EnqueueTx(buf[:n]) + if firstPacket { + engine.TriggerWarmPoll() + engine.ForceFlush() + } else { + engine.RequestFlush() + } } if err != nil { errCh <- err diff --git a/internal/config/config.go b/internal/config/config.go index 6660201..ff2d42c 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -50,6 +50,10 @@ type AppConfig struct { // CleanupFileMaxAgeSec deletes stale transport files after this many seconds. CleanupFileMaxAgeSec int `json:"cleanup_file_max_age_sec,omitempty"` + // StartupStaleMaxAgeSec ignores and deletes transport files older than this many seconds. + // It prevents leftovers from a previous run from polluting cold-start measurements. + StartupStaleMaxAgeSec int `json:"startup_stale_max_age_sec,omitempty"` + // StorageRetryMax is the maximum number of retry attempts for transient storage errors. StorageRetryMax int `json:"storage_retry_max,omitempty"` @@ -75,6 +79,12 @@ type AppConfig struct { // Keep this disabled for browser workloads; Google Drive performs better with batching. ImmediateFlush bool `json:"immediate_flush,omitempty"` + // ColdStartBurstMs keeps polling temporarily fast when a new session starts. 
+ ColdStartBurstMs int `json:"cold_start_burst_ms,omitempty"` + + // ColdStartPollMs is the temporary polling interval used during the cold-start burst. + ColdStartPollMs int `json:"cold_start_poll_ms,omitempty"` + // MetricsLogSec logs throughput and storage operation counters every N seconds. MetricsLogSec int `json:"metrics_log_sec,omitempty"` @@ -102,6 +112,7 @@ func (c *AppConfig) ApplyProfile() { setDefault(&c.IdlePollStepMs, 200) setDefault(&c.SessionIdleTimeoutSec, 60) setDefault(&c.CleanupFileMaxAgeSec, 45) + setDefault(&c.StartupStaleMaxAgeSec, 20) setDefault(&c.StorageRetryMax, 4) setDefault(&c.StorageRetryBaseMs, 200) setDefault(&c.StorageOpTimeoutSec, 45) @@ -109,6 +120,8 @@ func (c *AppConfig) ApplyProfile() { setDefault(&c.MaxActiveSessions, 24) setDefault(&c.SessionWaitTimeoutSec, 12) setDefault(&c.BackpressureBytes, 4*1024*1024) + setDefault(&c.ColdStartBurstMs, 15000) + setDefault(&c.ColdStartPollMs, 75) setDefault(&c.MetricsLogSec, 30) case "quota-saver": setDefault(&c.RefreshRateMs, 750) @@ -117,6 +130,7 @@ func (c *AppConfig) ApplyProfile() { setDefault(&c.IdlePollStepMs, 750) setDefault(&c.SessionIdleTimeoutSec, 45) setDefault(&c.CleanupFileMaxAgeSec, 30) + setDefault(&c.StartupStaleMaxAgeSec, 20) setDefault(&c.StorageRetryMax, 3) setDefault(&c.StorageRetryBaseMs, 500) setDefault(&c.StorageOpTimeoutSec, 45) @@ -124,6 +138,8 @@ func (c *AppConfig) ApplyProfile() { setDefault(&c.MaxActiveSessions, 16) setDefault(&c.SessionWaitTimeoutSec, 10) setDefault(&c.BackpressureBytes, 4*1024*1024) + setDefault(&c.ColdStartBurstMs, 5000) + setDefault(&c.ColdStartPollMs, 250) setDefault(&c.MetricsLogSec, 60) default: setDefault(&c.RefreshRateMs, 200) @@ -132,6 +148,7 @@ func (c *AppConfig) ApplyProfile() { setDefault(&c.IdlePollStepMs, 500) setDefault(&c.SessionIdleTimeoutSec, 60) setDefault(&c.CleanupFileMaxAgeSec, 30) + setDefault(&c.StartupStaleMaxAgeSec, 20) setDefault(&c.StorageRetryMax, 3) setDefault(&c.StorageRetryBaseMs, 300) 
setDefault(&c.StorageOpTimeoutSec, 45) @@ -139,6 +156,8 @@ func (c *AppConfig) ApplyProfile() { setDefault(&c.MaxActiveSessions, 24) setDefault(&c.SessionWaitTimeoutSec, 12) setDefault(&c.BackpressureBytes, 4*1024*1024) + setDefault(&c.ColdStartBurstMs, 10000) + setDefault(&c.ColdStartPollMs, 100) setDefault(&c.MetricsLogSec, 30) } } diff --git a/internal/httpclient/client.go b/internal/httpclient/client.go index 78a5aef..82f6e14 100644 --- a/internal/httpclient/client.go +++ b/internal/httpclient/client.go @@ -44,7 +44,7 @@ func (t *hostRewriteTransport) RoundTrip(req *http.Request) (*http.Response, err // and manipulate TLS/HTTP headers as specified in the config. func NewCustomClient(cfg TransportConfig) *http.Client { dialer := &net.Dialer{ - Timeout: 15 * time.Second, + Timeout: 8 * time.Second, KeepAlive: 30 * time.Second, } @@ -63,9 +63,9 @@ func NewCustomClient(cfg TransportConfig) *http.Client { ForceAttemptHTTP2: true, MaxIdleConns: 100, MaxIdleConnsPerHost: 20, - ResponseHeaderTimeout: 30 * time.Second, + ResponseHeaderTimeout: 10 * time.Second, IdleConnTimeout: 90 * time.Second, - TLSHandshakeTimeout: 15 * time.Second, + TLSHandshakeTimeout: 8 * time.Second, ExpectContinueTimeout: 1 * time.Second, } @@ -79,6 +79,6 @@ func NewCustomClient(cfg TransportConfig) *http.Client { return &http.Client{ Transport: rt, - Timeout: 45 * time.Second, + Timeout: 15 * time.Second, } } diff --git a/internal/transport/conn.go b/internal/transport/conn.go index 39df5b5..ce0c469 100644 --- a/internal/transport/conn.go +++ b/internal/transport/conn.go @@ -64,8 +64,13 @@ func (v *VirtualConn) Read(b []byte) (n int, err error) { func (v *VirtualConn) Write(b []byte) (n int, err error) { if len(b) > 0 { - v.session.EnqueueTx(b) - v.engine.RequestFlush() + firstPacket := v.session.EnqueueTx(b) + if firstPacket { + v.engine.TriggerWarmPoll() + v.engine.ForceFlush() + } else { + v.engine.RequestFlush() + } } return len(b), nil } @@ -75,7 +80,7 @@ func (v *VirtualConn) Close() 
error { v.session.closed = true v.session.txCond.Broadcast() // Wake up any writers blocked on backpressure v.session.mu.Unlock() - v.engine.RequestFlush() + v.engine.ForceFlush() // A closed connection no longer accepts writes efficiently // Next periodic engine flush will securely remove context diff --git a/internal/transport/engine.go b/internal/transport/engine.go index 6c3b0f8..e4e5b1f 100644 --- a/internal/transport/engine.go +++ b/internal/transport/engine.go @@ -35,10 +35,14 @@ type Engine struct { idlePollStep time.Duration sessionIdleTimeout time.Duration cleanupFileMaxAge time.Duration + startupStaleMaxAge time.Duration maxPayloadBytes int backpressureBytes int storageOpTimeout time.Duration immediateFlush bool + coldStartBurst time.Duration + coldStartPoll time.Duration + forceFlushMinGap time.Duration metricsLogInterval time.Duration // Server mode handler: called when a new session is discovered @@ -52,6 +56,9 @@ type Engine struct { processedMu sync.Mutex flushTrigger chan struct{} + pollTrigger chan struct{} + warmUntilNs int64 + lastForceNs int64 metrics engineMetrics } @@ -76,9 +83,19 @@ type engineMetrics struct { maxDeleteLatencyMs uint64 fileAgeMs uint64 maxFileAgeMs uint64 + pollFilesFound uint64 + pollFilesProcessed uint64 + pollFilesStale uint64 + maxPollBatchFiles uint64 firstResponses uint64 firstResponseMs uint64 maxFirstResponseMs uint64 + firstUploads uint64 + firstUploadMs uint64 + maxFirstUploadMs uint64 + firstServerSeens uint64 + firstServerSeenMs uint64 + maxFirstServerSeenMs uint64 } type MetricsSnapshot struct { @@ -103,9 +120,19 @@ type MetricsSnapshot struct { MaxDeleteLatencyMs uint64 `json:"max_delete_latency_ms"` AvgFileAgeMs float64 `json:"avg_file_age_ms"` MaxFileAgeMs uint64 `json:"max_file_age_ms"` + PollFilesFound uint64 `json:"poll_files_found"` + PollFilesProcessed uint64 `json:"poll_files_processed"` + PollFilesStale uint64 `json:"poll_files_stale"` + MaxPollBatchFiles uint64 `json:"max_poll_batch_files"` 
FirstResponses uint64 `json:"first_responses"` AvgFirstResponseMs float64 `json:"avg_first_response_ms"` MaxFirstResponseMs uint64 `json:"max_first_response_ms"` + FirstUploads uint64 `json:"first_uploads"` + AvgFirstUploadMs float64 `json:"avg_first_upload_ms"` + MaxFirstUploadMs uint64 `json:"max_first_upload_ms"` + FirstServerSeens uint64 `json:"first_server_seens"` + AvgFirstServerSeenMs float64 `json:"avg_first_server_seen_ms"` + MaxFirstServerSeenMs uint64 `json:"max_first_server_seen_ms"` } func NewEngine(backend storage.Backend, isClient bool, clientID string) *Engine { @@ -122,12 +149,17 @@ func NewEngine(backend storage.Backend, isClient bool, clientID string) *Engine idlePollStep: 500 * time.Millisecond, sessionIdleTimeout: 10 * time.Second, cleanupFileMaxAge: 10 * time.Second, + startupStaleMaxAge: 20 * time.Second, maxPayloadBytes: 768 * 1024, backpressureBytes: 2 * 1024 * 1024, storageOpTimeout: 18 * time.Second, immediateFlush: false, + coldStartBurst: 10 * time.Second, + coldStartPoll: 100 * time.Millisecond, + forceFlushMinGap: 100 * time.Millisecond, metricsLogInterval: 30 * time.Second, flushTrigger: make(chan struct{}, 1), + pollTrigger: make(chan struct{}, 1), } if isClient { e.myDir = DirReq @@ -187,6 +219,12 @@ func (e *Engine) SetCleanupFileMaxAge(seconds int) { } } +func (e *Engine) SetStartupStaleMaxAge(seconds int) { + if seconds > 0 { + e.startupStaleMaxAge = time.Duration(seconds) * time.Second + } +} + func (e *Engine) SetMaxPayloadBytes(bytes int) { if bytes > 0 { e.maxPayloadBytes = bytes @@ -209,6 +247,15 @@ func (e *Engine) SetImmediateFlush(enabled bool) { e.immediateFlush = enabled } +func (e *Engine) SetColdStartBurst(burstMs, pollMs int) { + if burstMs > 0 { + e.coldStartBurst = time.Duration(burstMs) * time.Millisecond + } + if pollMs > 0 { + e.coldStartPoll = time.Duration(pollMs) * time.Millisecond + } +} + func (e *Engine) ActiveSessionCount() int { e.sessionMu.RLock() defer e.sessionMu.RUnlock() @@ -222,6 +269,7 @@ func (e 
*Engine) SetMetricsLogInterval(seconds int) { } func (e *Engine) Start(ctx context.Context) { + e.TriggerWarmPoll() go e.flushLoop(ctx) go e.pollLoop(ctx) go e.cleanupLoop(ctx) // Delete files older than 10s @@ -237,22 +285,61 @@ func (e *Engine) GetSession(id string) *Session { func (e *Engine) AddSession(s *Session) { s.SetBackpressureBytes(e.backpressureBytes) e.sessionMu.Lock() - defer e.sessionMu.Unlock() e.sessions[s.ID] = s - log.Printf("Engine.AddSession: Added session %s (Total now: %d)", s.ID, len(e.sessions)) - e.RequestFlush() + total := len(e.sessions) + e.sessionMu.Unlock() + log.Printf("Engine.AddSession: Added session %s (Total now: %d)", s.ID, total) + e.TriggerWarmPoll() } func (e *Engine) RequestFlush() { if !e.immediateFlush { return } + e.ForceFlush() +} + +func (e *Engine) ForceFlush() { + if e.forceFlushMinGap > 0 { + now := time.Now().UnixNano() + last := atomic.LoadInt64(&e.lastForceNs) + if last > 0 && time.Duration(now-last) < e.forceFlushMinGap { + return + } + if !atomic.CompareAndSwapInt64(&e.lastForceNs, last, now) { + return + } + } select { case e.flushTrigger <- struct{}{}: default: } } +func (e *Engine) TriggerWarmPoll() { + if e.coldStartBurst <= 0 { + return + } + until := time.Now().Add(e.coldStartBurst).UnixNano() + for { + current := atomic.LoadInt64(&e.warmUntilNs) + if until <= current { + break + } + if atomic.CompareAndSwapInt64(&e.warmUntilNs, current, until) { + break + } + } + select { + case e.pollTrigger <- struct{}{}: + default: + } +} + +func (e *Engine) coldStartActive() bool { + return time.Now().UnixNano() < atomic.LoadInt64(&e.warmUntilNs) +} + func (e *Engine) Snapshot() MetricsSnapshot { current := e.snapshotMetrics() e.sessionMu.RLock() @@ -281,9 +368,19 @@ func (e *Engine) Snapshot() MetricsSnapshot { MaxDeleteLatencyMs: current.maxDeleteLatencyMs, AvgFileAgeMs: averageMs(current.fileAgeMs, current.downloads), MaxFileAgeMs: current.maxFileAgeMs, + PollFilesFound: current.pollFilesFound, + 
PollFilesProcessed: current.pollFilesProcessed, + PollFilesStale: current.pollFilesStale, + MaxPollBatchFiles: current.maxPollBatchFiles, FirstResponses: current.firstResponses, AvgFirstResponseMs: averageMs(current.firstResponseMs, current.firstResponses), MaxFirstResponseMs: current.maxFirstResponseMs, + FirstUploads: current.firstUploads, + AvgFirstUploadMs: averageMs(current.firstUploadMs, current.firstUploads), + MaxFirstUploadMs: current.maxFirstUploadMs, + FirstServerSeens: current.firstServerSeens, + AvgFirstServerSeenMs: averageMs(current.firstServerSeenMs, current.firstServerSeens), + MaxFirstServerSeenMs: current.maxFirstServerSeenMs, } } @@ -375,9 +472,10 @@ func (e *Engine) flushAll(ctx context.Context) { } filename := fmt.Sprintf("%s-%s-mux-%d.bin", e.myDir, fnameCID, time.Now().UnixNano()) payloadBytes := muxPayloadBytes(mux) + firstUploadSessionIDs := firstUploadCandidates(mux) // Upload asynchronously with backpressure/limit - go func(fname string, m []Envelope, bytes int) { + go func(fname string, m []Envelope, bytes int, firstIDs []string) { e.sem <- struct{}{} // Acquire defer func() { <-e.sem }() // Release @@ -406,14 +504,17 @@ func (e *Engine) flushAll(ctx context.Context) { atomic.AddUint64(&e.metrics.uploadBytes, uint64(bytes)) atomic.AddUint64(&e.metrics.uploadLatencyMs, latencyMs) atomicMaxUint64(&e.metrics.maxUploadLatencyMs, latencyMs) - }(filename, mux, payloadBytes) + for _, sessionID := range firstIDs { + e.recordFirstUpload(sessionID) + } + }(filename, mux, payloadBytes, firstUploadSessionIDs) } for _, id := range closedSessionIDs { e.RemoveSession(id) } if needsFollowupFlush { - e.RequestFlush() + e.ForceFlush() } } @@ -426,195 +527,187 @@ func (e *Engine) pollLoop(ctx context.Context) { select { case <-ctx.Done(): return - case <-timer.C: - pollAgain: - // ZERO-TRAFFIC CLIENT OPTIMIZATION: - // SOCKS5 only initiates from the Client. If the Client has 0 active sessions, - // it mathematically never needs to poll Google Drive! 
Go entirely to sleep! - if e.myDir == DirReq { - e.sessionMu.RLock() - count := len(e.sessions) - e.sessionMu.RUnlock() - if count == 0 { - timer.Reset(currentPollInterval) - continue + case <-e.pollTrigger: + if !timer.Stop() { + select { + case <-timer.C: + default: } } + case <-timer.C: + } - // Fetch multiplexed files - prefix := string(e.peerDir) + "-" - if e.myDir == DirReq { - // Client only polls for its own responses - prefix += e.id + "-mux-" - } else { - // Server polls for ALL client requests - prefix += "" - } - listStart := time.Now() - opCtx, cancel := e.storageContext(ctx) - files, err := e.backend.ListQuery(opCtx, prefix) - cancel() - if err != nil { - atomic.AddUint64(&e.metrics.listErrors, 1) - log.Printf("poll list error: %v", err) - timer.Reset(currentPollInterval) + pollAgain: + if e.myDir == DirReq { + count := e.ActiveSessionCount() + if count == 0 { + timer.Reset(e.effectivePollInterval(currentPollInterval, count)) continue } - listLatencyMs := uint64(time.Since(listStart).Milliseconds()) - atomic.AddUint64(&e.metrics.listCalls, 1) - atomic.AddUint64(&e.metrics.listLatencyMs, listLatencyMs) - atomicMaxUint64(&e.metrics.maxListLatencyMs, listLatencyMs) + } - if len(files) == 0 { - if e.myDir == DirRes { // SERVER OPTIMIZATION - e.sessionMu.RLock() - activeSessions := len(e.sessions) - e.sessionMu.RUnlock() - - if activeSessions == 0 { - // Increase polling delay step-by-step to save API calls - currentPollInterval += e.idlePollStep - if currentPollInterval > e.idlePollMax { - currentPollInterval = e.idlePollMax - } - } else { - // A session is currently active, so loop fast! 
- currentPollInterval = e.pollTicker + // Fetch multiplexed files + prefix := string(e.peerDir) + "-" + if e.myDir == DirReq { + // Client only polls for its own responses + prefix += e.id + "-mux-" + } else { + // Server polls for ALL client requests + prefix += "" + } + listStart := time.Now() + opCtx, cancel := e.storageContext(ctx) + files, err := e.backend.ListQuery(opCtx, prefix) + cancel() + if err != nil { + atomic.AddUint64(&e.metrics.listErrors, 1) + log.Printf("poll list error: %v", err) + timer.Reset(e.effectivePollInterval(currentPollInterval, e.ActiveSessionCount())) + continue + } + listLatencyMs := uint64(time.Since(listStart).Milliseconds()) + atomic.AddUint64(&e.metrics.listCalls, 1) + atomic.AddUint64(&e.metrics.listLatencyMs, listLatencyMs) + atomicMaxUint64(&e.metrics.maxListLatencyMs, listLatencyMs) + atomic.AddUint64(&e.metrics.pollFilesFound, uint64(len(files))) + atomicMaxUint64(&e.metrics.maxPollBatchFiles, uint64(len(files))) + + if len(files) == 0 { + activeSessions := e.ActiveSessionCount() + if e.myDir == DirRes { + if activeSessions == 0 && !e.coldStartActive() { + currentPollInterval += e.idlePollStep + if currentPollInterval > e.idlePollMax { + currentPollInterval = e.idlePollMax } + } else { + currentPollInterval = e.pollTicker } - // Client optimization doesn't change intervals, but needs its timer reset - timer.Reset(currentPollInterval) - continue } + timer.Reset(e.effectivePollInterval(currentPollInterval, activeSessions)) + continue + } - // We found data! Reset polling back to maximum speed - currentPollInterval = e.pollTicker + currentPollInterval = e.pollTicker - // We found files! 
Let's download them in parallel to boost speed massively - var wg sync.WaitGroup - for _, f := range files { - // STARTUP OPTIMIZATION: Ignore files older than 5 minutes to avoid memory spikes on restart - parts := strings.Split(f, "-") - if len(parts) >= 3 { - tsStr := parts[len(parts)-1] - tsStr = strings.TrimSuffix(tsStr, ".bin") - ts, _ := strconv.ParseInt(tsStr, 10, 64) - if ts > 0 && time.Since(time.Unix(0, ts)) > e.cleanupFileMaxAge { - e.deleteAsync(ctx, f) // Silent cleanup - continue - } + var wg sync.WaitGroup + for _, f := range files { + fileAge := fileAgeDuration(f) + if e.shouldDropStaleFile(fileAge) { + atomic.AddUint64(&e.metrics.pollFilesStale, 1) + if fileAge > 0 { + atomicMaxUint64(&e.metrics.maxFileAgeMs, uint64(fileAge.Milliseconds())) } + log.Printf("stale transport file ignored: file=%s age_ms=%d", f, fileAge.Milliseconds()) + e.deleteAsync(ctx, f) + continue + } - e.processedMu.Lock() - already := e.processed[f] - if !already { - e.processed[f] = true - } - e.processedMu.Unlock() + e.processedMu.Lock() + already := e.processed[f] + if !already { + e.processed[f] = true + } + e.processedMu.Unlock() - if already { - continue - } + if already { + continue + } - wg.Add(1) - go func(fname string) { - defer wg.Done() - - e.sem <- struct{}{} // Acquire - defer func() { <-e.sem }() // Release - - // log.Printf("Engine.pollLoop: Downloading %s", fname) - downloadStart := time.Now() - opCtx, cancel := e.storageContext(ctx) - rc, err := e.backend.Download(opCtx, fname) - if err != nil { - cancel() - atomic.AddUint64(&e.metrics.downloadErrors, 1) - log.Printf("download error %s: %v", fname, err) - e.processedMu.Lock() - delete(e.processed, fname) // failed to download, retry next poll - e.processedMu.Unlock() - return - } - defer func() { - rc.Close() - cancel() - }() - downloadLatencyMs := uint64(time.Since(downloadStart).Milliseconds()) - fileAgeMs := fileAgeMilliseconds(fname) - - // Extract ClientID from filename for server-side session 
initialization - var fileClientID string - parts := strings.Split(fname, "-") - if len(parts) >= 4 && parts[2] == "mux" { - fileClientID = parts[1] - } + wg.Add(1) + atomic.AddUint64(&e.metrics.pollFilesProcessed, 1) + go e.downloadAndProcess(ctx, f, &wg) + } - // STREAMING DECODE - count := 0 - payloadBytes := 0 - for { - var env Envelope - if err := env.Decode(rc); err != nil { - if err != io.EOF && err != io.ErrUnexpectedEOF { - log.Printf("mux decode error %s: %v", fname, err) - } - break - } - count++ - payloadBytes += len(env.Payload) - - // Process envelope immediately - e.closedSessionsMu.Lock() - if _, exists := e.closedSessions[env.SessionID]; exists { - e.closedSessionsMu.Unlock() - continue - } - e.closedSessionsMu.Unlock() - - e.sessionMu.Lock() - s, exists := e.sessions[env.SessionID] - if !exists && e.myDir == DirRes && e.OnNewSession != nil { - s = NewSession(env.SessionID) - s.ClientID = fileClientID - s.TargetAddr = env.TargetAddr - e.sessions[env.SessionID] = s - e.sessionMu.Unlock() - log.Printf("Engine: Triggering new session %s for Client %s", env.SessionID, fileClientID) - e.OnNewSession(env.SessionID, env.TargetAddr, s) - } else { - e.sessionMu.Unlock() - } + wg.Wait() + time.Sleep(e.effectivePollInterval(100*time.Millisecond, e.ActiveSessionCount())) + goto pollAgain + } +} - if s != nil { - if len(env.Payload) > 0 { - e.recordFirstResponse(s) - } - s.ProcessRx(&env) - } - } +func (e *Engine) downloadAndProcess(ctx context.Context, fname string, wg *sync.WaitGroup) { + defer wg.Done() - atomic.AddUint64(&e.metrics.downloads, 1) - atomic.AddUint64(&e.metrics.downloadBytes, uint64(payloadBytes)) - atomic.AddUint64(&e.metrics.downloadLatencyMs, downloadLatencyMs) - atomicMaxUint64(&e.metrics.maxDownloadLatencyMs, downloadLatencyMs) - if fileAgeMs > 0 { - atomic.AddUint64(&e.metrics.fileAgeMs, fileAgeMs) - atomicMaxUint64(&e.metrics.maxFileAgeMs, fileAgeMs) - } - e.deleteAsync(ctx, fname) - }(f) + e.sem <- struct{}{} + defer func() { <-e.sem 
}() + + downloadStart := time.Now() + opCtx, cancel := e.storageContext(ctx) + rc, err := e.backend.Download(opCtx, fname) + if err != nil { + cancel() + atomic.AddUint64(&e.metrics.downloadErrors, 1) + log.Printf("download error %s: %v", fname, err) + e.processedMu.Lock() + delete(e.processed, fname) + e.processedMu.Unlock() + return + } + defer func() { + rc.Close() + cancel() + }() + + downloadLatencyMs := uint64(time.Since(downloadStart).Milliseconds()) + fileAgeMs := fileAgeMilliseconds(fname) + + var fileClientID string + parts := strings.Split(fname, "-") + if len(parts) >= 4 && parts[2] == "mux" { + fileClientID = parts[1] + } + + payloadBytes := 0 + for { + var env Envelope + if err := env.Decode(rc); err != nil { + if err != io.EOF && err != io.ErrUnexpectedEOF { + log.Printf("mux decode error %s: %v", fname, err) } + break + } + payloadBytes += len(env.Payload) - // Wait for parallel batch to finish - wg.Wait() + e.closedSessionsMu.Lock() + if _, exists := e.closedSessions[env.SessionID]; exists { + e.closedSessionsMu.Unlock() + continue + } + e.closedSessionsMu.Unlock() + + e.sessionMu.Lock() + s, exists := e.sessions[env.SessionID] + if !exists && e.myDir == DirRes && e.OnNewSession != nil { + s = NewSession(env.SessionID) + s.ClientID = fileClientID + s.TargetAddr = env.TargetAddr + e.sessions[env.SessionID] = s + e.sessionMu.Unlock() + e.recordFirstServerSeen(s, fileAgeMs) + log.Printf("Engine: Triggering new session %s for Client %s", env.SessionID, fileClientID) + e.TriggerWarmPoll() + e.OnNewSession(env.SessionID, env.TargetAddr, s) + } else { + e.sessionMu.Unlock() + } - // Adaptive Polling: Because we just received data, the connection is active. - // Instead of jumping back to the select, immediately poll again after a tiny 100ms break to drain queues. 
- time.Sleep(100 * time.Millisecond) - goto pollAgain + if s != nil { + if len(env.Payload) > 0 { + e.recordFirstResponse(s) + } + s.ProcessRx(&env) } } + + atomic.AddUint64(&e.metrics.downloads, 1) + atomic.AddUint64(&e.metrics.downloadBytes, uint64(payloadBytes)) + atomic.AddUint64(&e.metrics.downloadLatencyMs, downloadLatencyMs) + atomicMaxUint64(&e.metrics.maxDownloadLatencyMs, downloadLatencyMs) + if fileAgeMs > 0 { + atomic.AddUint64(&e.metrics.fileAgeMs, fileAgeMs) + atomicMaxUint64(&e.metrics.maxFileAgeMs, fileAgeMs) + } + e.deleteAsync(ctx, fname) } func (e *Engine) RemoveSession(id string) { @@ -675,6 +768,60 @@ func (e *Engine) recordFirstResponse(s *Session) { } } +func (e *Engine) recordFirstUpload(sessionID string) { + e.sessionMu.RLock() + s := e.sessions[sessionID] + e.sessionMu.RUnlock() + if s == nil { + return + } + + s.mu.Lock() + if s.firstUploadLogged { + s.mu.Unlock() + return + } + s.firstUploadLogged = true + start := s.createdAt + if !s.firstTxQueuedAt.IsZero() { + start = s.firstTxQueuedAt + } + targetAddr := s.TargetAddr + elapsed := time.Since(start) + s.mu.Unlock() + + latencyMs := uint64(elapsed.Milliseconds()) + atomic.AddUint64(&e.metrics.firstUploads, 1) + atomic.AddUint64(&e.metrics.firstUploadMs, latencyMs) + atomicMaxUint64(&e.metrics.maxFirstUploadMs, latencyMs) + if latencyMs > 1000 { + log.Printf("session first upload slow: id=%s target=%s first_upload_ms=%d", sessionID, targetAddr, latencyMs) + } +} + +func (e *Engine) recordFirstServerSeen(s *Session, fileAgeMs uint64) { + if fileAgeMs == 0 { + return + } + + s.mu.Lock() + if s.serverSeenLogged { + s.mu.Unlock() + return + } + s.serverSeenLogged = true + targetAddr := s.TargetAddr + sessionID := s.ID + s.mu.Unlock() + + atomic.AddUint64(&e.metrics.firstServerSeens, 1) + atomic.AddUint64(&e.metrics.firstServerSeenMs, fileAgeMs) + atomicMaxUint64(&e.metrics.maxFirstServerSeenMs, fileAgeMs) + if fileAgeMs > 2000 { + log.Printf("session first server seen slow: id=%s target=%s 
first_server_seen_ms=%d", sessionID, targetAddr, fileAgeMs) + } +} + func (e *Engine) cleanupLoop(ctx context.Context) { ticker := time.NewTicker(5 * time.Second) defer ticker.Stop() @@ -734,7 +881,12 @@ func (e *Engine) cleanupLoop(ctx context.Context) { ts, err := strconv.ParseInt(tsStr, 10, 64) if err == nil { t := time.Unix(0, ts) - if time.Since(t) > e.cleanupFileMaxAge { + age := time.Since(t) + if e.shouldDropStaleFile(age) { + atomic.AddUint64(&e.metrics.pollFilesStale, 1) + if age > 0 { + atomicMaxUint64(&e.metrics.maxFileAgeMs, uint64(age.Milliseconds())) + } e.deleteAsync(ctx, f) } } @@ -764,7 +916,7 @@ func (e *Engine) metricsLoop(ctx context.Context) { e.sessionMu.RUnlock() log.Printf( - "metrics: active=%d uploads=%d/%s up_avg_ms=%.0f downloads=%d/%s down_avg_ms=%.0f lists=%d list_avg_ms=%.0f deletes=%d file_age_avg_ms=%.0f first_resp_avg_ms=%.0f errors[u=%d d=%d l=%d del=%d]", + "metrics: active=%d uploads=%d/%s up_avg_ms=%.0f downloads=%d/%s down_avg_ms=%.0f lists=%d list_avg_ms=%.0f poll_files[f=%d p=%d stale=%d max_batch=%d] deletes=%d file_age_avg_ms=%.0f max_file_age_ms=%d first_upload_avg_ms=%.0f first_seen_avg_ms=%.0f first_resp_avg_ms=%.0f errors[u=%d d=%d l=%d del=%d]", activeSessions, current.uploads-last.uploads, formatBytes(current.uploadBytes-last.uploadBytes), @@ -774,8 +926,15 @@ func (e *Engine) metricsLoop(ctx context.Context) { averageMs(current.downloadLatencyMs-last.downloadLatencyMs, current.downloads-last.downloads), current.listCalls-last.listCalls, averageMs(current.listLatencyMs-last.listLatencyMs, current.listCalls-last.listCalls), + current.pollFilesFound-last.pollFilesFound, + current.pollFilesProcessed-last.pollFilesProcessed, + current.pollFilesStale-last.pollFilesStale, + current.maxPollBatchFiles, current.deletes-last.deletes, averageMs(current.fileAgeMs-last.fileAgeMs, current.downloads-last.downloads), + current.maxFileAgeMs, + averageMs(current.firstUploadMs-last.firstUploadMs, 
current.firstUploads-last.firstUploads), + averageMs(current.firstServerSeenMs-last.firstServerSeenMs, current.firstServerSeens-last.firstServerSeens), averageMs(current.firstResponseMs-last.firstResponseMs, current.firstResponses-last.firstResponses), current.uploadErrors-last.uploadErrors, current.downloadErrors-last.downloadErrors, @@ -809,9 +968,19 @@ func (e *Engine) snapshotMetrics() engineMetrics { maxDeleteLatencyMs: atomic.LoadUint64(&e.metrics.maxDeleteLatencyMs), fileAgeMs: atomic.LoadUint64(&e.metrics.fileAgeMs), maxFileAgeMs: atomic.LoadUint64(&e.metrics.maxFileAgeMs), + pollFilesFound: atomic.LoadUint64(&e.metrics.pollFilesFound), + pollFilesProcessed: atomic.LoadUint64(&e.metrics.pollFilesProcessed), + pollFilesStale: atomic.LoadUint64(&e.metrics.pollFilesStale), + maxPollBatchFiles: atomic.LoadUint64(&e.metrics.maxPollBatchFiles), firstResponses: atomic.LoadUint64(&e.metrics.firstResponses), firstResponseMs: atomic.LoadUint64(&e.metrics.firstResponseMs), maxFirstResponseMs: atomic.LoadUint64(&e.metrics.maxFirstResponseMs), + firstUploads: atomic.LoadUint64(&e.metrics.firstUploads), + firstUploadMs: atomic.LoadUint64(&e.metrics.firstUploadMs), + maxFirstUploadMs: atomic.LoadUint64(&e.metrics.maxFirstUploadMs), + firstServerSeens: atomic.LoadUint64(&e.metrics.firstServerSeens), + firstServerSeenMs: atomic.LoadUint64(&e.metrics.firstServerSeenMs), + maxFirstServerSeenMs: atomic.LoadUint64(&e.metrics.maxFirstServerSeenMs), } } @@ -823,6 +992,30 @@ func muxPayloadBytes(mux []Envelope) int { return total } +func firstUploadCandidates(mux []Envelope) []string { + sessionIDs := make([]string, 0, len(mux)) + seen := make(map[string]bool, len(mux)) + for _, env := range mux { + if env.Seq != 0 || seen[env.SessionID] { + continue + } + seen[env.SessionID] = true + sessionIDs = append(sessionIDs, env.SessionID) + } + return sessionIDs +} + +func (e *Engine) effectivePollInterval(base time.Duration, activeSessions int) time.Duration { + interval := base + if 
interval <= 0 { + interval = e.pollTicker + } + if e.coldStartActive() && e.coldStartPoll > 0 && e.coldStartPoll < interval { + interval = e.coldStartPoll + } + return interval +} + func formatBytes(n uint64) string { const unit = 1024 if n < unit { @@ -857,12 +1050,23 @@ func atomicMaxUint64(target *uint64, value uint64) { } } -func fileAgeMilliseconds(filename string) uint64 { +func (e *Engine) shouldDropStaleFile(age time.Duration) bool { + if age <= 0 { + return false + } + if e.startupStaleMaxAge > 0 && age > e.startupStaleMaxAge { + return true + } + return e.cleanupFileMaxAge > 0 && age > e.cleanupFileMaxAge +} + +func fileAgeDuration(filename string) time.Duration { parts := strings.Split(filename, "-") if len(parts) < 3 { return 0 } tsStr := strings.TrimSuffix(parts[len(parts)-1], ".bin") + tsStr = strings.TrimSuffix(tsStr, ".json") ts, err := strconv.ParseInt(tsStr, 10, 64) if err != nil || ts <= 0 { return 0 @@ -871,5 +1075,13 @@ func fileAgeMilliseconds(filename string) uint64 { if age < 0 { return 0 } + return age +} + +func fileAgeMilliseconds(filename string) uint64 { + age := fileAgeDuration(filename) + if age <= 0 { + return 0 + } return uint64(age.Milliseconds()) } diff --git a/internal/transport/session.go b/internal/transport/session.go index 192a6e8..e2f3749 100644 --- a/internal/transport/session.go +++ b/internal/transport/session.go @@ -22,8 +22,11 @@ type Session struct { rxSeq uint64 rxQueue map[uint64]*Envelope createdAt time.Time + firstTxQueuedAt time.Time lastActivity time.Time firstResponseLogged bool + firstUploadLogged bool + serverSeenLogged bool closed bool rxClosed bool // Safely tracks if RxChan was successfully closed TargetAddr string @@ -58,7 +61,7 @@ func (s *Session) SetBackpressureBytes(bytes int) { s.mu.Unlock() } -func (s *Session) EnqueueTx(data []byte) { +func (s *Session) EnqueueTx(data []byte) bool { s.mu.Lock() defer s.mu.Unlock() @@ -68,8 +71,13 @@ func (s *Session) EnqueueTx(data []byte) { s.txCond.Wait() } + 
firstPacket := s.txSeq == 0 && len(s.txBuf) == 0 + if firstPacket && s.firstTxQueuedAt.IsZero() { + s.firstTxQueuedAt = time.Now() + } s.txBuf = append(s.txBuf, data...) s.lastActivity = time.Now() + return firstPacket } func (s *Session) ClearTx() { diff --git a/scripts/collect_client_diagnostics.sh b/scripts/collect_client_diagnostics.sh index 727c674..0e45c95 100755 --- a/scripts/collect_client_diagnostics.sh +++ b/scripts/collect_client_diagnostics.sh @@ -48,24 +48,35 @@ run_capture() { } > "${OUT_DIR}/${name}" 2>&1 } +run_raw_capture() { + local name="$1" + shift + echo "Collecting ${name}..." + "$@" > "${OUT_DIR}/${name}" 2>&1 || true +} + run_server_ssh_capture() { local name="$1" local remote_cmd="$2" local out="${OUT_DIR}/${name}" local ssh_opts=(-o BatchMode=yes -o ConnectTimeout=5 -o ServerAliveInterval=5 -o ServerAliveCountMax=1) local ssh_proxy_opts=(-o BatchMode=yes -o ConnectTimeout=20 -o ServerAliveInterval=5 -o ServerAliveCountMax=1 -o "ProxyCommand=nc -x ${SSH_PROXY_HOSTPORT} -X 5 %h %p") + local timeout_cmd=() + if command -v timeout >/dev/null 2>&1; then + timeout_cmd=(timeout 30) + fi { echo "Collecting ${name}..." 
echo "\$ ssh ${SERVER_SSH} ${remote_cmd}" echo "--- direct ssh attempt ---" - if ssh "${ssh_opts[@]}" "$SERVER_SSH" "$remote_cmd"; then + if "${timeout_cmd[@]}" ssh "${ssh_opts[@]}" "$SERVER_SSH" "$remote_cmd"; then exit 0 fi echo "--- direct ssh failed; trying through SOCKS5 proxy ${SSH_PROXY_HOSTPORT} ---" echo "\$ ssh -o ProxyCommand='nc -x ${SSH_PROXY_HOSTPORT} -X 5 %h %p' ${SERVER_SSH} ${remote_cmd}" - ssh "${ssh_proxy_opts[@]}" "$SERVER_SSH" "$remote_cmd" || true + "${timeout_cmd[@]}" ssh "${ssh_proxy_opts[@]}" "$SERVER_SSH" "$remote_cmd" || true } > "$out" 2>&1 } @@ -95,7 +106,7 @@ run_capture go_version.txt go version run_capture process.txt ps aux run_capture ports.txt lsof -nP -iTCP:1080 -iTCP:18081 -sTCP:LISTEN run_capture health.txt curl -sS --max-time 5 "$CLIENT_HEALTH_URL" -run_capture metrics_initial.json curl -sS --max-time 5 "$CLIENT_METRICS_URL" +run_raw_capture metrics_initial.json curl -sS --max-time 5 "$CLIENT_METRICS_URL" if [ -f "$CLIENT_LOG" ]; then echo "Collecting client_tail_initial.log..." @@ -121,8 +132,10 @@ while [ "$SECONDS" -lt "$END" ]; do METRICS="null" fi printf '{"ts":"%s","metrics":%s}\n' "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "$METRICS" >> "${OUT_DIR}/client_metrics_samples.jsonl" + printf '.' sleep "$SAMPLE_INTERVAL" done +printf '\n' if [ "$RUN_CURLS" = "1" ]; then echo "Running curl benchmarks; each one can take up to ${CURL_MAX_TIME}s..." @@ -133,7 +146,7 @@ else echo "Skipping curl benchmarks. Set FLOWDRIVER_RUN_CURLS=1 to enable them." fi -run_capture metrics_final.json curl -sS --max-time 5 "$CLIENT_METRICS_URL" +run_raw_capture metrics_final.json curl -sS --max-time 5 "$CLIENT_METRICS_URL" if [ -f "$CLIENT_LOG" ]; then echo "Collecting client_tail.log..." 
@@ -144,5 +157,5 @@ fi if [ -n "$SERVER_SSH" ]; then run_server_ssh_capture server_metrics.txt 'curl -sS --max-time 5 http://127.0.0.1:18080/metrics' - run_server_ssh_capture server_tail.log 'tail -n 1200 ~/flowdriver/server.log 2>/dev/null || true' + run_server_ssh_capture server_tail.log 'journalctl -u flowdriver-server -n 1200 --no-pager 2>/dev/null || tail -n 1200 ~/flowdriver/server.log 2>/dev/null || true' fi diff --git a/server_config.json.example b/server_config.json.example index 64c6690..639887a 100644 --- a/server_config.json.example +++ b/server_config.json.example @@ -8,6 +8,7 @@ "idle_poll_step_ms": 500, "session_idle_timeout_sec": 60, "cleanup_file_max_age_sec": 60, + "startup_stale_max_age_sec": 20, "storage_retry_max": 3, "storage_retry_base_ms": 300, "storage_op_timeout_sec": 45, @@ -16,6 +17,8 @@ "session_wait_timeout_sec": 15, "backpressure_bytes": 4194304, "immediate_flush": false, + "cold_start_burst_ms": 10000, + "cold_start_poll_ms": 100, "metrics_log_sec": 30, "health_listen_addr": "127.0.0.1:18080", "google_lanes": [] From 1fe92f79a7e04c6523bc34910a37c2ac2580fdcc Mon Sep 17 00:00:00 2001 From: PK3NZO Date: Sat, 25 Apr 2026 22:21:19 +0330 Subject: [PATCH 3/3] Add target-aware runtime controls and server dial tuning --- README.md | 9 ++ client_config.json.example | 3 + cmd/client/main.go | 134 ++++++++++++++++- cmd/client/target_policy_test.go | 54 +++++++ cmd/server/main.go | 10 +- internal/config/config.go | 14 ++ internal/transport/engine.go | 243 ++++++++++++++++++++++++++----- internal/transport/session.go | 2 + 8 files changed, 424 insertions(+), 45 deletions(-) create mode 100644 cmd/client/target_policy_test.go diff --git a/README.md b/README.md index 3ccd95e..9eeb7dd 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,9 @@ Recommended balanced client config: "storage_op_timeout_sec": 45, "max_payload_bytes": 786432, "max_active_sessions": 0, + "target_metrics_top_n": 10, + "blocked_targets": [], + "low_priority_targets": [], 
"session_wait_timeout_sec": 15, "backpressure_bytes": 4194304, "immediate_flush": false, @@ -270,6 +273,7 @@ Key metrics to watch: - `avg_first_upload_ms`: local queue-to-Drive upload latency. - `avg_first_server_seen_ms`: how long it takes the server to see a new client request file. - `avg_first_response_ms`: time until the client receives first response bytes. +- `top_targets`: busiest destination hosts with session counts and first-response timing. - `poll_files_stale`: old transport leftovers ignored and deleted. - `max_file_age_ms`: large values can indicate backlog or old files. @@ -293,6 +297,9 @@ Important options: - `storage_op_timeout_sec`: fail-fast timeout for individual Google Drive operations. - `max_payload_bytes`: maximum per-session payload size written into one transport file. - `max_active_sessions`: cap for concurrent sessions. `0` means unlimited. +- `target_metrics_top_n`: number of destination hosts exposed under `/metrics` as `top_targets`. +- `blocked_targets`: optional host globs to reject before tunneling, for example `["*.doubleclick.net"]`. +- `low_priority_targets`: optional host globs to tunnel without extending the cold-start burst, for noisy browser background services. - `session_wait_timeout_sec`: how long new SOCKS sessions wait for capacity. - `backpressure_bytes`: per-session buffer limit before application writes wait. - `immediate_flush`: uploads new data promptly instead of waiting for the next flush tick. Keep this disabled for browser/video/download workloads because Google Drive generally performs better with batched files. @@ -301,6 +308,8 @@ Important options: - `metrics_log_sec`: periodic operational metrics log interval. - `health_listen_addr`: optional local HTTP endpoint for `/healthz` and `/metrics`. +Target patterns are lowercase host globs with optional ports. Examples: `*.doubleclick.net`, `mtalk.google.com:*`, `*:5228`. 
Prefer `low_priority_targets` before `blocked_targets`; blocking can break page assets, while low priority only prevents background connections from consuming cold-start acceleration. + For higher throughput or resilience, configure multiple Google Drive lanes on both client and server: ```json diff --git a/client_config.json.example b/client_config.json.example index 138edb6..c9880a2 100644 --- a/client_config.json.example +++ b/client_config.json.example @@ -14,6 +14,9 @@ "storage_op_timeout_sec": 45, "max_payload_bytes": 786432, "max_active_sessions": 0, + "target_metrics_top_n": 10, + "blocked_targets": [], + "low_priority_targets": [], "session_wait_timeout_sec": 15, "backpressure_bytes": 4194304, "immediate_flush": false, diff --git a/cmd/client/main.go b/cmd/client/main.go index e206339..5be6819 100644 --- a/cmd/client/main.go +++ b/cmd/client/main.go @@ -11,6 +11,8 @@ import ( "net" "os" "os/signal" + "path" + "strings" "syscall" "time" @@ -107,6 +109,7 @@ func main() { engine.SetImmediateFlush(appCfg.ImmediateFlush) engine.SetColdStartBurst(appCfg.ColdStartBurstMs, appCfg.ColdStartPollMs) engine.SetMetricsLogInterval(appCfg.MetricsLogSec) + engine.SetTargetMetricsTopN(appCfg.TargetMetricsTopN) engine.Start(ctx) health.Start(ctx, appCfg.HealthListenAddr, engine) @@ -114,10 +117,19 @@ func main() { if listenAddr == "" { listenAddr = "127.0.0.1:1080" } + policy := newTargetPolicy(appCfg.BlockedTargets, appCfg.LowPriorityTargets) // Create the library SOCKS5 server wrapping our custom Google Drive Engine tunnel server := socks5.NewServer( socks5.WithDial(func(dc context.Context, network, addr string) (net.Conn, error) { + host, port, hasHostPort := splitTarget(addr) + if policy.blocked(host, port, addr) { + engine.RecordBlockedTarget(addr) + log.Printf("Blocked target by policy: %s", addr) + return nil, fmt.Errorf("target blocked by policy: %s", addr) + } + lowPriority := policy.lowPriority(host, port, addr) + if err := waitForSessionCapacity(dc, engine, 
appCfg.MaxActiveSessions, appCfg.SessionWaitTimeoutSec); err != nil { return nil, err } @@ -125,25 +137,32 @@ func main() { sessionID := generateSessionID() // Intelligently parse the address string to warn users if their browser is natively leaking DNS - host, port, err := net.SplitHostPort(addr) - if err == nil { + priorityLabel := "" + if lowPriority { + priorityLabel = " LOW-PRIORITY" + } + if hasHostPort { if net.ParseIP(host) != nil { - log.Printf("New covert session %s targeting RAW IP %s:%s (Warning: Local DNS Leak?)", sessionID, host, port) + log.Printf("New covert session %s%s targeting RAW IP %s:%s (Warning: Local DNS Leak?)", sessionID, priorityLabel, host, port) } else { - log.Printf("New covert session %s targeting SECURE DOMAIN %s:%s", sessionID, host, port) + log.Printf("New covert session %s%s targeting SECURE DOMAIN %s:%s", sessionID, priorityLabel, host, port) } } else { - log.Printf("New covert session %s targeting %s", sessionID, addr) + log.Printf("New covert session %s%s targeting %s", sessionID, priorityLabel, addr) } session := transport.NewSession(sessionID) session.TargetAddr = addr + session.TargetHost = host + session.LowPriority = lowPriority engine.AddSession(session) // Instantly ping a blank payload so the remote end opens the actual TCP destination session.EnqueueTx(nil) - engine.TriggerWarmPoll() - engine.ForceFlush() + if !lowPriority { + engine.TriggerWarmPoll() + engine.ForceFlush() + } return transport.NewVirtualConn(session, engine), nil }), @@ -173,6 +192,107 @@ func main() { cancel() } +type targetPolicy struct { + blockedTargets []targetPattern + lowPriorityTargets []targetPattern +} + +type targetPattern struct { + raw string + host string + port string +} + +func newTargetPolicy(blocked, lowPriority []string) targetPolicy { + return targetPolicy{ + blockedTargets: parseTargetPatterns(blocked), + lowPriorityTargets: parseTargetPatterns(lowPriority), + } +} + +func parseTargetPatterns(values []string) []targetPattern { + 
patterns := make([]targetPattern, 0, len(values)) + for _, value := range values { + raw := strings.TrimSpace(strings.ToLower(value)) + if raw == "" { + continue + } + host, port, ok := splitPattern(raw) + if !ok { + host = raw + } + patterns = append(patterns, targetPattern{raw: raw, host: host, port: port}) + } + return patterns +} + +func splitPattern(value string) (host, port string, ok bool) { + if strings.HasPrefix(value, "[") { + host, port, err := net.SplitHostPort(value) + if err == nil { + return strings.Trim(host, "[]"), port, true + } + } + lastColon := strings.LastIndex(value, ":") + if lastColon <= 0 { + return "", "", false + } + host = value[:lastColon] + port = value[lastColon+1:] + if host == "" || port == "" { + return "", "", false + } + return strings.Trim(host, "[]"), port, true +} + +func (p targetPolicy) blocked(host, port, raw string) bool { + return targetPatternsMatch(p.blockedTargets, host, port, raw) +} + +func (p targetPolicy) lowPriority(host, port, raw string) bool { + return targetPatternsMatch(p.lowPriorityTargets, host, port, raw) +} + +func targetPatternsMatch(patterns []targetPattern, host, port, raw string) bool { + host = strings.ToLower(strings.Trim(host, "[]")) + port = strings.ToLower(port) + raw = strings.ToLower(raw) + for _, pattern := range patterns { + if pattern.match(host, port, raw) { + return true + } + } + return false +} + +func (p targetPattern) match(host, port, raw string) bool { + if p.port != "" && p.port != "*" && p.port != port { + return false + } + + patternHost := p.host + if patternHost == "" { + patternHost = p.raw + } + if matched, err := path.Match(patternHost, host); err == nil && matched { + return true + } + if p.port == "" { + if matched, err := path.Match(p.raw, raw); err == nil && matched { + return true + } + } + return patternHost == host || (p.port == "" && p.raw == raw) +} + +func splitTarget(addr string) (host, port string, ok bool) { + host, port, err := net.SplitHostPort(addr) + if err 
!= nil { + return strings.ToLower(strings.Trim(addr, "[]")), "", false + } + return strings.ToLower(strings.Trim(host, "[]")), strings.ToLower(port), true +} + func waitForSessionCapacity(ctx context.Context, engine *transport.Engine, maxActive, timeoutSec int) error { if maxActive <= 0 { return nil diff --git a/cmd/client/target_policy_test.go b/cmd/client/target_policy_test.go new file mode 100644 index 0000000..29680ca --- /dev/null +++ b/cmd/client/target_policy_test.go @@ -0,0 +1,54 @@ +package main + +import "testing" + +func TestTargetPolicyMatchesHostAndPortPatterns(t *testing.T) { + policy := newTargetPolicy( + []string{"*.doubleclick.net", "*:5228"}, + []string{"mtalk.google.com:*", "optimizationguide-pa.googleapis.com"}, + ) + + tests := []struct { + name string + addr string + blocked bool + lowPriority bool + }{ + { + name: "blocked host glob", + addr: "googleads.g.doubleclick.net:443", + blocked: true, + }, + { + name: "blocked port glob", + addr: "push.example.com:5228", + blocked: true, + }, + { + name: "low priority host with any port", + addr: "mtalk.google.com:443", + lowPriority: true, + }, + { + name: "low priority host without explicit port pattern", + addr: "optimizationguide-pa.googleapis.com:443", + lowPriority: true, + }, + { + name: "unmatched target", + addr: "www.youtube.com:443", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + host, port, _ := splitTarget(tt.addr) + if got := policy.blocked(host, port, tt.addr); got != tt.blocked { + t.Fatalf("blocked=%v, want %v", got, tt.blocked) + } + if got := policy.lowPriority(host, port, tt.addr); got != tt.lowPriority { + t.Fatalf("lowPriority=%v, want %v", got, tt.lowPriority) + } + }) + } +} diff --git a/cmd/server/main.go b/cmd/server/main.go index 19318ac..c6e0a98 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -9,6 +9,7 @@ import ( "os" "os/signal" "syscall" + "time" "github.com/NullLatency/flow-driver/internal/app" 
"github.com/NullLatency/flow-driver/internal/config" @@ -84,6 +85,7 @@ func main() { engine.SetImmediateFlush(appCfg.ImmediateFlush) engine.SetColdStartBurst(appCfg.ColdStartBurstMs, appCfg.ColdStartPollMs) engine.SetMetricsLogInterval(appCfg.MetricsLogSec) + engine.SetTargetMetricsTopN(appCfg.TargetMetricsTopN) // Called by polling loop when a new incoming session file is found engine.OnNewSession = func(sessionID, targetAddr string, session *transport.Session) { @@ -104,7 +106,11 @@ func main() { func handleServerConn(sessionID, targetAddr string, session *transport.Session, engine *transport.Engine) { defer engine.RemoveSession(sessionID) - conn, err := net.Dial("tcp", targetAddr) + dialer := net.Dialer{ + Timeout: 10 * time.Second, + KeepAlive: 30 * time.Second, + } + conn, err := dialer.Dial("tcp", targetAddr) if err != nil { log.Printf("Dial error to %s: %v", targetAddr, err) // Send back a close packet? Just closing the session will notify client @@ -116,7 +122,7 @@ func handleServerConn(sessionID, targetAddr string, session *transport.Session, // Conn -> Tx (Res) go func() { - buf := make([]byte, 4096) + buf := make([]byte, 32*1024) for { n, err := conn.Read(buf) if n > 0 { diff --git a/internal/config/config.go b/internal/config/config.go index ff2d42c..300cdba 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -69,6 +69,17 @@ type AppConfig struct { // MaxActiveSessions caps concurrent SOCKS sessions on the client side. MaxActiveSessions int `json:"max_active_sessions,omitempty"` + // BlockedTargets rejects matching SOCKS targets before creating a tunnel session. + // Patterns are host globs with optional ports, such as "*.doubleclick.net" or "mtalk.google.com:*". + BlockedTargets []string `json:"blocked_targets,omitempty"` + + // LowPriorityTargets still tunnels matching targets, but skips cold-start acceleration for them. + // This keeps browser background noise from extending the fast-poll window for foreground pages. 
+ LowPriorityTargets []string `json:"low_priority_targets,omitempty"` + + // TargetMetricsTopN controls how many high-traffic targets are exposed in metrics snapshots. + TargetMetricsTopN int `json:"target_metrics_top_n,omitempty"` + // SessionWaitTimeoutSec is how long a new SOCKS connection waits for capacity. SessionWaitTimeoutSec int `json:"session_wait_timeout_sec,omitempty"` @@ -118,6 +129,7 @@ func (c *AppConfig) ApplyProfile() { setDefault(&c.StorageOpTimeoutSec, 45) setDefault(&c.MaxPayloadBytes, 512*1024) setDefault(&c.MaxActiveSessions, 24) + setDefault(&c.TargetMetricsTopN, 10) setDefault(&c.SessionWaitTimeoutSec, 12) setDefault(&c.BackpressureBytes, 4*1024*1024) setDefault(&c.ColdStartBurstMs, 15000) @@ -136,6 +148,7 @@ func (c *AppConfig) ApplyProfile() { setDefault(&c.StorageOpTimeoutSec, 45) setDefault(&c.MaxPayloadBytes, 1024*1024) setDefault(&c.MaxActiveSessions, 16) + setDefault(&c.TargetMetricsTopN, 10) setDefault(&c.SessionWaitTimeoutSec, 10) setDefault(&c.BackpressureBytes, 4*1024*1024) setDefault(&c.ColdStartBurstMs, 5000) @@ -154,6 +167,7 @@ func (c *AppConfig) ApplyProfile() { setDefault(&c.StorageOpTimeoutSec, 45) setDefault(&c.MaxPayloadBytes, 768*1024) setDefault(&c.MaxActiveSessions, 24) + setDefault(&c.TargetMetricsTopN, 10) setDefault(&c.SessionWaitTimeoutSec, 12) setDefault(&c.BackpressureBytes, 4*1024*1024) setDefault(&c.ColdStartBurstMs, 10000) diff --git a/internal/transport/engine.go b/internal/transport/engine.go index e4e5b1f..98249f4 100644 --- a/internal/transport/engine.go +++ b/internal/transport/engine.go @@ -5,6 +5,8 @@ import ( "fmt" "io" "log" + "net" + "sort" "strconv" "strings" "sync" @@ -60,6 +62,9 @@ type Engine struct { warmUntilNs int64 lastForceNs int64 metrics engineMetrics + targetMu sync.Mutex + targetStats map[string]*targetMetrics + targetTopN int } type engineMetrics struct { @@ -98,41 +103,64 @@ type engineMetrics struct { maxFirstServerSeenMs uint64 } +type targetMetrics struct { + target string + 
sessions uint64 + lowPriority uint64 + blocked uint64 + firstUploads uint64 + firstUploadMs uint64 + firstResponses uint64 + firstResponseMs uint64 +} + type MetricsSnapshot struct { - ActiveSessions int `json:"active_sessions"` - Uploads uint64 `json:"uploads"` - Downloads uint64 `json:"downloads"` - Deletes uint64 `json:"deletes"` - ListCalls uint64 `json:"list_calls"` - UploadBytes uint64 `json:"upload_bytes"` - DownloadBytes uint64 `json:"download_bytes"` - UploadErrors uint64 `json:"upload_errors"` - DownloadErrors uint64 `json:"download_errors"` - ListErrors uint64 `json:"list_errors"` - DeleteErrors uint64 `json:"delete_errors"` - AvgUploadLatencyMs float64 `json:"avg_upload_latency_ms"` - AvgDownloadLatencyMs float64 `json:"avg_download_latency_ms"` - AvgListLatencyMs float64 `json:"avg_list_latency_ms"` - AvgDeleteLatencyMs float64 `json:"avg_delete_latency_ms"` - MaxUploadLatencyMs uint64 `json:"max_upload_latency_ms"` - MaxDownloadLatencyMs uint64 `json:"max_download_latency_ms"` - MaxListLatencyMs uint64 `json:"max_list_latency_ms"` - MaxDeleteLatencyMs uint64 `json:"max_delete_latency_ms"` - AvgFileAgeMs float64 `json:"avg_file_age_ms"` - MaxFileAgeMs uint64 `json:"max_file_age_ms"` - PollFilesFound uint64 `json:"poll_files_found"` - PollFilesProcessed uint64 `json:"poll_files_processed"` - PollFilesStale uint64 `json:"poll_files_stale"` - MaxPollBatchFiles uint64 `json:"max_poll_batch_files"` - FirstResponses uint64 `json:"first_responses"` - AvgFirstResponseMs float64 `json:"avg_first_response_ms"` - MaxFirstResponseMs uint64 `json:"max_first_response_ms"` - FirstUploads uint64 `json:"first_uploads"` - AvgFirstUploadMs float64 `json:"avg_first_upload_ms"` - MaxFirstUploadMs uint64 `json:"max_first_upload_ms"` - FirstServerSeens uint64 `json:"first_server_seens"` - AvgFirstServerSeenMs float64 `json:"avg_first_server_seen_ms"` - MaxFirstServerSeenMs uint64 `json:"max_first_server_seen_ms"` + ActiveSessions int `json:"active_sessions"` + Uploads uint64 
`json:"uploads"` + Downloads uint64 `json:"downloads"` + Deletes uint64 `json:"deletes"` + ListCalls uint64 `json:"list_calls"` + UploadBytes uint64 `json:"upload_bytes"` + DownloadBytes uint64 `json:"download_bytes"` + UploadErrors uint64 `json:"upload_errors"` + DownloadErrors uint64 `json:"download_errors"` + ListErrors uint64 `json:"list_errors"` + DeleteErrors uint64 `json:"delete_errors"` + AvgUploadLatencyMs float64 `json:"avg_upload_latency_ms"` + AvgDownloadLatencyMs float64 `json:"avg_download_latency_ms"` + AvgListLatencyMs float64 `json:"avg_list_latency_ms"` + AvgDeleteLatencyMs float64 `json:"avg_delete_latency_ms"` + MaxUploadLatencyMs uint64 `json:"max_upload_latency_ms"` + MaxDownloadLatencyMs uint64 `json:"max_download_latency_ms"` + MaxListLatencyMs uint64 `json:"max_list_latency_ms"` + MaxDeleteLatencyMs uint64 `json:"max_delete_latency_ms"` + AvgFileAgeMs float64 `json:"avg_file_age_ms"` + MaxFileAgeMs uint64 `json:"max_file_age_ms"` + PollFilesFound uint64 `json:"poll_files_found"` + PollFilesProcessed uint64 `json:"poll_files_processed"` + PollFilesStale uint64 `json:"poll_files_stale"` + MaxPollBatchFiles uint64 `json:"max_poll_batch_files"` + FirstResponses uint64 `json:"first_responses"` + AvgFirstResponseMs float64 `json:"avg_first_response_ms"` + MaxFirstResponseMs uint64 `json:"max_first_response_ms"` + FirstUploads uint64 `json:"first_uploads"` + AvgFirstUploadMs float64 `json:"avg_first_upload_ms"` + MaxFirstUploadMs uint64 `json:"max_first_upload_ms"` + FirstServerSeens uint64 `json:"first_server_seens"` + AvgFirstServerSeenMs float64 `json:"avg_first_server_seen_ms"` + MaxFirstServerSeenMs uint64 `json:"max_first_server_seen_ms"` + TopTargets []TargetMetricsSnapshot `json:"top_targets,omitempty"` +} + +type TargetMetricsSnapshot struct { + Target string `json:"target"` + Sessions uint64 `json:"sessions"` + LowPriority uint64 `json:"low_priority,omitempty"` + Blocked uint64 `json:"blocked,omitempty"` + FirstUploads uint64 
`json:"first_uploads,omitempty"` + AvgFirstUploadMs float64 `json:"avg_first_upload_ms,omitempty"` + FirstResponses uint64 `json:"first_responses,omitempty"` + AvgFirstResponseMs float64 `json:"avg_first_response_ms,omitempty"` } func NewEngine(backend storage.Backend, isClient bool, clientID string) *Engine { @@ -160,6 +188,8 @@ func NewEngine(backend storage.Backend, isClient bool, clientID string) *Engine metricsLogInterval: 30 * time.Second, flushTrigger: make(chan struct{}, 1), pollTrigger: make(chan struct{}, 1), + targetStats: make(map[string]*targetMetrics), + targetTopN: 10, } if isClient { e.myDir = DirReq @@ -268,6 +298,12 @@ func (e *Engine) SetMetricsLogInterval(seconds int) { } } +func (e *Engine) SetTargetMetricsTopN(n int) { + if n >= 0 { + e.targetTopN = n + } +} + func (e *Engine) Start(ctx context.Context) { e.TriggerWarmPoll() go e.flushLoop(ctx) @@ -284,12 +320,25 @@ func (e *Engine) GetSession(id string) *Session { func (e *Engine) AddSession(s *Session) { s.SetBackpressureBytes(e.backpressureBytes) + if s.TargetHost == "" { + s.TargetHost = targetKey(s.TargetAddr) + } e.sessionMu.Lock() e.sessions[s.ID] = s total := len(e.sessions) e.sessionMu.Unlock() + e.recordTargetSession(s.TargetHost, s.LowPriority) log.Printf("Engine.AddSession: Added session %s (Total now: %d)", s.ID, total) - e.TriggerWarmPoll() + if !s.LowPriority { + e.TriggerWarmPoll() + } +} + +func (e *Engine) RecordBlockedTarget(targetAddr string) { + e.targetMu.Lock() + stats := e.targetStatsLocked(targetKey(targetAddr)) + stats.blocked++ + e.targetMu.Unlock() } func (e *Engine) RequestFlush() { @@ -381,6 +430,7 @@ func (e *Engine) Snapshot() MetricsSnapshot { FirstServerSeens: current.firstServerSeens, AvgFirstServerSeenMs: averageMs(current.firstServerSeenMs, current.firstServerSeens), MaxFirstServerSeenMs: current.maxFirstServerSeenMs, + TopTargets: e.snapshotTopTargets(), } } @@ -681,6 +731,7 @@ func (e *Engine) downloadAndProcess(ctx context.Context, fname string, wg 
*sync. s = NewSession(env.SessionID) s.ClientID = fileClientID s.TargetAddr = env.TargetAddr + s.TargetHost = targetKey(env.TargetAddr) e.sessions[env.SessionID] = s e.sessionMu.Unlock() e.recordFirstServerSeen(s, fileAgeMs) @@ -763,6 +814,7 @@ func (e *Engine) recordFirstResponse(s *Session) { atomic.AddUint64(&e.metrics.firstResponses, 1) atomic.AddUint64(&e.metrics.firstResponseMs, latencyMs) atomicMaxUint64(&e.metrics.maxFirstResponseMs, latencyMs) + e.recordTargetFirstResponse(targetAddr, latencyMs) if latencyMs > 2000 { log.Printf("session first response slow: id=%s target=%s first_response_ms=%d", s.ID, targetAddr, latencyMs) } @@ -794,11 +846,50 @@ func (e *Engine) recordFirstUpload(sessionID string) { atomic.AddUint64(&e.metrics.firstUploads, 1) atomic.AddUint64(&e.metrics.firstUploadMs, latencyMs) atomicMaxUint64(&e.metrics.maxFirstUploadMs, latencyMs) + e.recordTargetFirstUpload(targetAddr, latencyMs) if latencyMs > 1000 { log.Printf("session first upload slow: id=%s target=%s first_upload_ms=%d", sessionID, targetAddr, latencyMs) } } +func (e *Engine) recordTargetSession(target string, lowPriority bool) { + e.targetMu.Lock() + stats := e.targetStatsLocked(target) + stats.sessions++ + if lowPriority { + stats.lowPriority++ + } + e.targetMu.Unlock() +} + +func (e *Engine) recordTargetFirstUpload(target string, latencyMs uint64) { + e.targetMu.Lock() + stats := e.targetStatsLocked(targetKey(target)) + stats.firstUploads++ + stats.firstUploadMs += latencyMs + e.targetMu.Unlock() +} + +func (e *Engine) recordTargetFirstResponse(target string, latencyMs uint64) { + e.targetMu.Lock() + stats := e.targetStatsLocked(targetKey(target)) + stats.firstResponses++ + stats.firstResponseMs += latencyMs + e.targetMu.Unlock() +} + +func (e *Engine) targetStatsLocked(target string) *targetMetrics { + if target == "" { + target = "unknown" + } + stats := e.targetStats[target] + if stats == nil { + stats = &targetMetrics{target: target} + e.targetStats[target] = stats + } + 
return stats +} + func (e *Engine) recordFirstServerSeen(s *Session, fileAgeMs uint64) { if fileAgeMs == 0 { return @@ -914,9 +1005,10 @@ func (e *Engine) metricsLoop(ctx context.Context) { e.sessionMu.RLock() activeSessions := len(e.sessions) e.sessionMu.RUnlock() + targets := e.compactTargetLog(5) log.Printf( - "metrics: active=%d uploads=%d/%s up_avg_ms=%.0f downloads=%d/%s down_avg_ms=%.0f lists=%d list_avg_ms=%.0f poll_files[f=%d p=%d stale=%d max_batch=%d] deletes=%d file_age_avg_ms=%.0f max_file_age_ms=%d first_upload_avg_ms=%.0f first_seen_avg_ms=%.0f first_resp_avg_ms=%.0f errors[u=%d d=%d l=%d del=%d]", + "metrics: active=%d uploads=%d/%s up_avg_ms=%.0f downloads=%d/%s down_avg_ms=%.0f lists=%d list_avg_ms=%.0f poll_files[f=%d p=%d stale=%d max_batch=%d] deletes=%d file_age_avg_ms=%.0f max_file_age_ms=%d first_upload_avg_ms=%.0f first_seen_avg_ms=%.0f first_resp_avg_ms=%.0f targets=[%s] errors[u=%d d=%d l=%d del=%d]", activeSessions, current.uploads-last.uploads, formatBytes(current.uploadBytes-last.uploadBytes), @@ -936,6 +1028,7 @@ func (e *Engine) metricsLoop(ctx context.Context) { averageMs(current.firstUploadMs-last.firstUploadMs, current.firstUploads-last.firstUploads), averageMs(current.firstServerSeenMs-last.firstServerSeenMs, current.firstServerSeens-last.firstServerSeens), averageMs(current.firstResponseMs-last.firstResponseMs, current.firstResponses-last.firstResponses), + targets, current.uploadErrors-last.uploadErrors, current.downloadErrors-last.downloadErrors, current.listErrors-last.listErrors, @@ -984,6 +1077,84 @@ func (e *Engine) snapshotMetrics() engineMetrics { } } +func (e *Engine) snapshotTopTargets() []TargetMetricsSnapshot { + if e.targetTopN == 0 { + return nil + } + + e.targetMu.Lock() + targets := make([]TargetMetricsSnapshot, 0, len(e.targetStats)) + for _, stats := range e.targetStats { + targets = append(targets, TargetMetricsSnapshot{ + Target: stats.target, + Sessions: stats.sessions, + LowPriority: stats.lowPriority, + 
Blocked: stats.blocked, + FirstUploads: stats.firstUploads, + AvgFirstUploadMs: averageMs(stats.firstUploadMs, stats.firstUploads), + FirstResponses: stats.firstResponses, + AvgFirstResponseMs: averageMs(stats.firstResponseMs, stats.firstResponses), + }) + } + e.targetMu.Unlock() + + sort.Slice(targets, func(i, j int) bool { + left := targets[i].Sessions + targets[i].Blocked + right := targets[j].Sessions + targets[j].Blocked + if left == right { + return targets[i].Target < targets[j].Target + } + return left > right + }) + if e.targetTopN > 0 && len(targets) > e.targetTopN { + targets = targets[:e.targetTopN] + } + return targets +} + +func (e *Engine) compactTargetLog(limit int) string { + if limit <= 0 { + return "" + } + targets := e.snapshotTopTargets() + if len(targets) == 0 { + return "" + } + if len(targets) > limit { + targets = targets[:limit] + } + parts := make([]string, 0, len(targets)) + for _, t := range targets { + label := fmt.Sprintf("%s:%d", t.Target, t.Sessions) + if t.LowPriority > 0 { + label += fmt.Sprintf("/lp%d", t.LowPriority) + } + if t.Blocked > 0 { + label += fmt.Sprintf("/blk%d", t.Blocked) + } + if t.AvgFirstResponseMs > 0 { + label += fmt.Sprintf("/fr%.0fms", t.AvgFirstResponseMs) + } + parts = append(parts, label) + } + return strings.Join(parts, ",") +} + +func targetKey(targetAddr string) string { + targetAddr = strings.TrimSpace(strings.ToLower(targetAddr)) + if targetAddr == "" { + return "unknown" + } + host, _, err := net.SplitHostPort(targetAddr) + if err == nil { + host = strings.Trim(host, "[]") + if host != "" { + return host + } + } + return strings.Trim(targetAddr, "[]") +} + func muxPayloadBytes(mux []Envelope) int { var total int for _, env := range mux { diff --git a/internal/transport/session.go b/internal/transport/session.go index e2f3749..d26e0fb 100644 --- a/internal/transport/session.go +++ b/internal/transport/session.go @@ -30,7 +30,9 @@ type Session struct { closed bool rxClosed bool // Safely tracks if 
RxChan was successfully closed TargetAddr string + TargetHost string ClientID string + LowPriority bool // Backpressure: blocked when txBuf is too large txCond *sync.Cond