Skip to content

Commit 07fdbd3

Browse files
Author committed
feat: Phase 1 modernization — AV1/HEVC presets, Gemini LLM, Silero VAD v6
Export presets: added av1_1080p (SVT-AV1 CRF 28), av1_4k (SVT-AV1 CRF 30), hevc_1080p (libx265 CRF 23). Fixed preset int-to-str bug for numeric preset values. AV1 offers 40% smaller files than H.264. LLM module: added Google Gemini provider (_query_gemini via generativelanguage.googleapis.com). Updated default Ollama model from llama3 to llama3.2. OpenAI uses max_completion_tokens (new API). Provider allowlists updated across 6 route files to include gemini. Silero VAD: upgraded to v6 (from assumed v4). Added ONNX inference path — uses onnxruntime when available for faster CPU inference without GPU memory overhead. Falls back to PyTorch if onnxruntime not installed. Verified: DeepFilterNet3 imports current, Kokoro version pin updated to >=0.9, Depth Anything V2 docstring corrected to current HF org.
1 parent 34b0599 commit 07fdbd3

File tree

10 files changed

+108
-19
lines changed

10 files changed

+108
-19
lines changed

opencut/core/depth_effects.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
- Depth map export for compositing
88
99
Requires: pip install torch torchvision transformers
10-
Models: LiheYoung/depth-anything-v2-small (or base/large)
10+
Models: depth-anything/Depth-Anything-V2-Small-hf (or Base/Large)
1111
1212
References:
1313
- Depth Anything V2: https://github.com/DepthAnything/Depth-Anything-V2

opencut/core/export_presets.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,37 @@
195195
"pix_fmt": "yuv420p",
196196
"ext": ".webm",
197197
},
198+
# === AV1 / HEVC ===
199+
"av1_1080p": {
200+
"label": "AV1 1080p",
201+
"description": "AV1 codec — 40% smaller files than H.264 at same quality",
202+
"category": "web",
203+
"width": 1920, "height": 1080,
204+
"codec": "libsvtav1", "crf": 28, "preset": 6,
205+
"audio_codec": "libopus", "audio_bitrate": "128k",
206+
"pix_fmt": "yuv420p10le",
207+
"ext": ".mp4",
208+
},
209+
"av1_4k": {
210+
"label": "AV1 4K",
211+
"description": "AV1 4K — excellent quality-to-size ratio for archival",
212+
"category": "web",
213+
"width": 3840, "height": 2160,
214+
"codec": "libsvtav1", "crf": 30, "preset": 4,
215+
"audio_codec": "libopus", "audio_bitrate": "192k",
216+
"pix_fmt": "yuv420p10le",
217+
"ext": ".mp4",
218+
},
219+
"hevc_1080p": {
220+
"label": "HEVC 1080p",
221+
"description": "H.265/HEVC — 30% smaller than H.264, wide device support",
222+
"category": "web",
223+
"width": 1920, "height": 1080,
224+
"codec": "libx265", "crf": 23, "preset": "medium",
225+
"audio_codec": "aac", "audio_bitrate": "192k",
226+
"pix_fmt": "yuv420p",
227+
"ext": ".mp4",
228+
},
198229
# === GIF ===
199230
"gif_high": {
200231
"label": "GIF (High Quality)",
@@ -302,8 +333,8 @@ def export_with_preset(
302333

303334
if preset.get("crf") is not None:
304335
cmd += ["-crf", str(preset["crf"])]
305-
if preset.get("preset"):
306-
cmd += ["-preset", preset["preset"]]
336+
if preset.get("preset") is not None:
337+
cmd += ["-preset", str(preset["preset"])]
307338
if preset.get("profile"):
308339
if codec == "prores_ks":
309340
cmd += ["-profile:v", preset["profile"]]

opencut/core/llm.py

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
OpenCut LLM Abstraction Module
33
44
Shared LLM interface for highlight extraction, video summarization,
5-
and the shorts pipeline. Supports Ollama (local), OpenAI, and Anthropic.
5+
and the shorts pipeline. Supports Ollama (local), OpenAI, Anthropic, and Google Gemini.
66
77
Uses stdlib urllib.request — zero pip dependencies.
88
"""
@@ -22,8 +22,8 @@
2222
@dataclass
2323
class LLMConfig:
2424
"""Configuration for LLM provider."""
25-
provider: str = "ollama" # "ollama", "openai", "anthropic"
26-
model: str = "llama3"
25+
provider: str = "ollama" # "ollama", "openai", "anthropic", "gemini"
26+
model: str = "llama3.2"
2727
api_key: str = ""
2828
base_url: str = "http://localhost:11434"
2929
temperature: float = 0.3
@@ -123,7 +123,7 @@ def _query_openai(prompt, system_prompt, config):
123123
"model": config.model,
124124
"messages": messages,
125125
"temperature": config.temperature,
126-
"max_tokens": config.max_tokens,
126+
"max_completion_tokens": config.max_tokens,
127127
}
128128

129129
data = _http_json(url, data=body, headers=headers, timeout=60)
@@ -189,13 +189,56 @@ def _query_anthropic(prompt, system_prompt, config):
189189
)
190190

191191

192+
def _query_gemini(prompt, system_prompt, config):
193+
"""Query Google Gemini API."""
194+
if not config.api_key:
195+
raise RuntimeError("Gemini API key is required. Set config.api_key.")
196+
197+
url = (
198+
f"https://generativelanguage.googleapis.com/v1beta/models/"
199+
f"{config.model}:generateContent?key={config.api_key}"
200+
)
201+
202+
body = {
203+
"contents": [{"parts": [{"text": prompt}]}],
204+
"generationConfig": {
205+
"temperature": config.temperature,
206+
"maxOutputTokens": config.max_tokens,
207+
},
208+
}
209+
if system_prompt:
210+
body["systemInstruction"] = {"parts": [{"text": system_prompt}]}
211+
212+
data = _http_json(url, data=body, timeout=60)
213+
214+
text = ""
215+
tokens = 0
216+
try:
217+
text = data["candidates"][0]["content"]["parts"][0]["text"]
218+
except (KeyError, IndexError):
219+
text = json.dumps(data)
220+
try:
221+
usage = data.get("usageMetadata", {})
222+
tokens = usage.get("promptTokenCount", 0) + usage.get("candidatesTokenCount", 0)
223+
except Exception:
224+
pass
225+
226+
return LLMResponse(
227+
text=text,
228+
provider="gemini",
229+
model=config.model,
230+
tokens_used=tokens,
231+
)
232+
233+
192234
# ---------------------------------------------------------------------------
193235
# Public API
194236
# ---------------------------------------------------------------------------
195237
_PROVIDERS = {
196238
"ollama": _query_ollama,
197239
"openai": _query_openai,
198240
"anthropic": _query_anthropic,
241+
"gemini": _query_gemini,
199242
}
200243

201244

@@ -219,7 +262,7 @@ def query_llm(prompt, config=None, system_prompt="", on_progress=None):
219262
handler = _PROVIDERS.get(provider)
220263
if handler is None:
221264
return LLMResponse(
222-
text=f"Unknown LLM provider: '{provider}'. Use 'ollama', 'openai', or 'anthropic'.",
265+
text=f"Unknown LLM provider: '{provider}'. Use 'ollama', 'openai', 'anthropic', or 'gemini'.",
223266
provider=provider,
224267
model=config.model,
225268
)
@@ -279,6 +322,13 @@ def check_llm_reachable(config=None):
279322
else:
280323
result["error"] = "Anthropic API key not configured"
281324

325+
elif provider == "gemini":
326+
if config.api_key:
327+
result["available"] = True
328+
result["models"] = [config.model]
329+
else:
330+
result["error"] = "Gemini API key not configured"
331+
282332
else:
283333
result["error"] = f"Unknown provider: {provider}"
284334

opencut/core/silence.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,10 +166,18 @@ def detect_silences_vad(
166166
"Silero VAD requires PyTorch. Install with: pip install torch"
167167
)
168168

169-
# Load Silero VAD model (cached after first load)
169+
# Load Silero VAD v6 model (cached after first load)
170+
# ONNX mode avoids GPU memory overhead for inference
171+
use_onnx = True
172+
try:
173+
import onnxruntime # noqa: F401
174+
except ImportError:
175+
use_onnx = False
176+
170177
model, utils = torch.hub.load(
171178
repo_or_dir="snakers4/silero-vad",
172179
model="silero_vad",
180+
onnx=use_onnx,
173181
force_reload=False,
174182
trust_repo=True,
175183
)

opencut/core/voice_gen.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -257,8 +257,8 @@ def kokoro_generate(
257257
voice: Kokoro voice preset (af_heart, af_bella, am_adam, am_michael, etc.)
258258
speed: Playback speed multiplier.
259259
"""
260-
if not ensure_package("kokoro", "kokoro>=0.3", on_progress):
261-
raise RuntimeError("Failed to install kokoro. Install manually: pip install kokoro>=0.3")
260+
if not ensure_package("kokoro", "kokoro>=0.9", on_progress):
261+
raise RuntimeError("Failed to install kokoro. Install manually: pip install kokoro>=0.9")
262262
import kokoro
263263

264264
if output_path is None:

opencut/routes/captions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1010,7 +1010,7 @@ def transcript_summarize(job_id, filepath, data):
10101010

10111011
# LLM config from request
10121012
llm_provider = data.get("llm_provider", "ollama")
1013-
if llm_provider not in ("ollama", "openai", "anthropic"):
1013+
if llm_provider not in ("ollama", "openai", "anthropic", "gemini"):
10141014
llm_provider = "ollama"
10151015
llm_model = data.get("llm_model", "")
10161016
llm_api_key = data.get("llm_api_key", "")
@@ -1072,7 +1072,7 @@ def captions_chapters(job_id, filepath, data):
10721072

10731073
segments = data.get("segments", None)
10741074
llm_provider = data.get("llm_provider", "ollama")
1075-
if llm_provider not in ("ollama", "openai", "anthropic"):
1075+
if llm_provider not in ("ollama", "openai", "anthropic", "gemini"):
10761076
llm_provider = "ollama"
10771077
llm_model = data.get("llm_model", "llama3")
10781078
api_key = data.get("api_key", "")

opencut/routes/nlp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def nlp_command():
3636
command = data.get("command", "").strip()
3737
file_path = data.get("filepath", data.get("file", "")).strip()
3838
llm_provider = data.get("llm_provider", "ollama").strip()
39-
if llm_provider not in ("ollama", "openai", "anthropic"):
39+
if llm_provider not in ("ollama", "openai", "anthropic", "gemini"):
4040
llm_provider = "ollama"
4141
llm_model = data.get("llm_model", "llama3").strip()
4242
api_key = data.get("api_key", "").strip()

opencut/routes/system.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1411,7 +1411,7 @@ def llm_test():
14111411
"""Test LLM connectivity with a simple prompt."""
14121412
data = request.get_json(force=True)
14131413

1414-
_VALID_LLM_PROVIDERS = {"ollama", "openai", "anthropic"}
1414+
_VALID_LLM_PROVIDERS = {"ollama", "openai", "anthropic", "gemini"}
14151415
provider = data.get("provider", "ollama").strip().lower()
14161416
if provider not in _VALID_LLM_PROVIDERS:
14171417
return jsonify({"success": False, "error": f"Invalid provider: {provider}. Must be one of: {', '.join(sorted(_VALID_LLM_PROVIDERS))}"}), 400
@@ -1628,8 +1628,8 @@ def chat_message():
16281628

16291629
# Build LLM config from settings or request
16301630
provider = data.get("llm_provider", "ollama")
1631-
if provider not in ("ollama", "openai", "anthropic"):
1632-
return jsonify({"error": "Invalid provider", "code": "INVALID_INPUT", "suggestion": "Use ollama, openai, or anthropic"}), 400
1631+
if provider not in ("ollama", "openai", "anthropic", "gemini"):
1632+
return jsonify({"error": "Invalid provider", "code": "INVALID_INPUT", "suggestion": "Use ollama, openai, anthropic, or gemini"}), 400
16331633
model = data.get("llm_model", "")
16341634
api_key = data.get("llm_api_key", "")
16351635

opencut/routes/video_editing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ def video_highlights(job_id, filepath, data):
313313
max_duration = safe_float(data.get("max_duration", 60.0), 60.0, min_val=10.0, max_val=600.0)
314314
transcript = data.get("transcript", None)
315315
llm_provider = data.get("llm_provider", "ollama")
316-
if llm_provider not in ("ollama", "openai", "anthropic"):
316+
if llm_provider not in ("ollama", "openai", "anthropic", "gemini"):
317317
llm_provider = "ollama"
318318
llm_model = data.get("llm_model", "")
319319
llm_api_key = data.get("llm_api_key", "")

opencut/routes/video_specialty.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def _on_progress(pct, msg=""):
171171
_update_job(job_id, progress=pct, message=msg)
172172

173173
_shorts_provider = data.get("llm_provider", "ollama")
174-
if _shorts_provider not in ("ollama", "openai", "anthropic"):
174+
if _shorts_provider not in ("ollama", "openai", "anthropic", "gemini"):
175175
_shorts_provider = "ollama"
176176
llm_config = LLMConfig(
177177
provider=_shorts_provider,

0 commit comments

Comments (0)