Skip to content

Commit 07fdbd3

Browse files
Author committed
feat: Phase 1 modernization — AV1/HEVC presets, Gemini LLM, Silero VAD v6
Export presets: added av1_1080p (SVT-AV1 CRF 28), av1_4k (SVT-AV1 CRF 30), hevc_1080p (libx265 CRF 23). Fixed preset int-to-str bug for numeric preset values. AV1 offers 40% smaller files than H.264. LLM module: added Google Gemini provider (_query_gemini via generativelanguage.googleapis.com). Updated default Ollama model from llama3 to llama3.2. OpenAI uses max_completion_tokens (new API). Provider allowlists updated across 6 route files to include gemini. Silero VAD: upgraded to v6 (from assumed v4). Added ONNX inference path — uses onnxruntime when available for faster CPU inference without GPU memory overhead. Falls back to PyTorch if onnxruntime not installed. Verified: DeepFilterNet3 imports current, Kokoro version pin updated to >=0.9, Depth Anything V2 docstring corrected to current HF org.
1 parent 34b0599 commit 07fdbd3

File tree

10 files changed

+108
-19
lines changed

10 files changed

+108
-19
lines changed

opencut/core/depth_effects.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
- Depth map export for compositing
88
99
Requires: pip install torch torchvision transformers
10-
Models: LiheYoung/depth-anything-v2-small (or base/large)
10+
Models: depth-anything/Depth-Anything-V2-Small-hf (or Base/Large)
1111
1212
References:
1313
- Depth Anything V2: https://github.com/DepthAnything/Depth-Anything-V2

opencut/core/export_presets.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,37 @@
195195
"pix_fmt": "yuv420p",
196196
"ext": ".webm",
197197
},
198+
# === AV1 / HEVC ===
199+
"av1_1080p": {
200+
"label": "AV1 1080p",
201+
"description": "AV1 codec — 40% smaller files than H.264 at same quality",
202+
"category": "web",
203+
"width": 1920, "height": 1080,
204+
"codec": "libsvtav1", "crf": 28, "preset": 6,
205+
"audio_codec": "libopus", "audio_bitrate": "128k",
206+
"pix_fmt": "yuv420p10le",
207+
"ext": ".mp4",
208+
},
209+
"av1_4k": {
210+
"label": "AV1 4K",
211+
"description": "AV1 4K — excellent quality-to-size ratio for archival",
212+
"category": "web",
213+
"width": 3840, "height": 2160,
214+
"codec": "libsvtav1", "crf": 30, "preset": 4,
215+
"audio_codec": "libopus", "audio_bitrate": "192k",
216+
"pix_fmt": "yuv420p10le",
217+
"ext": ".mp4",
218+
},
219+
"hevc_1080p": {
220+
"label": "HEVC 1080p",
221+
"description": "H.265/HEVC — 30% smaller than H.264, wide device support",
222+
"category": "web",
223+
"width": 1920, "height": 1080,
224+
"codec": "libx265", "crf": 23, "preset": "medium",
225+
"audio_codec": "aac", "audio_bitrate": "192k",
226+
"pix_fmt": "yuv420p",
227+
"ext": ".mp4",
228+
},
198229
# === GIF ===
199230
"gif_high": {
200231
"label": "GIF (High Quality)",
@@ -302,8 +333,8 @@ def export_with_preset(
302333

303334
if preset.get("crf") is not None:
304335
cmd += ["-crf", str(preset["crf"])]
305-
if preset.get("preset"):
306-
cmd += ["-preset", preset["preset"]]
336+
if preset.get("preset") is not None:
337+
cmd += ["-preset", str(preset["preset"])]
307338
if preset.get("profile"):
308339
if codec == "prores_ks":
309340
cmd += ["-profile:v", preset["profile"]]

opencut/core/llm.py

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
OpenCut LLM Abstraction Module
33
44
Shared LLM interface for highlight extraction, video summarization,
5-
and the shorts pipeline. Supports Ollama (local), OpenAI, and Anthropic.
5+
and the shorts pipeline. Supports Ollama (local), OpenAI, Anthropic, and Google Gemini.
66
77
Uses stdlib urllib.request — zero pip dependencies.
88
"""
@@ -22,8 +22,8 @@
2222
@dataclass
2323
class LLMConfig:
2424
"""Configuration for LLM provider."""
25-
provider: str = "ollama" # "ollama", "openai", "anthropic"
26-
model: str = "llama3"
25+
provider: str = "ollama" # "ollama", "openai", "anthropic", "gemini"
26+
model: str = "llama3.2"
2727
api_key: str = ""
2828
base_url: str = "http://localhost:11434"
2929
temperature: float = 0.3
@@ -123,7 +123,7 @@ def _query_openai(prompt, system_prompt, config):
123123
"model": config.model,
124124
"messages": messages,
125125
"temperature": config.temperature,
126-
"max_tokens": config.max_tokens,
126+
"max_completion_tokens": config.max_tokens,
127127
}
128128

129129
data = _http_json(url, data=body, headers=headers, timeout=60)
@@ -189,13 +189,56 @@ def _query_anthropic(prompt, system_prompt, config):
189189
)
190190

191191

192+
def _query_gemini(prompt, system_prompt, config):
193+
"""Query Google Gemini API."""
194+
if not config.api_key:
195+
raise RuntimeError("Gemini API key is required. Set config.api_key.")
196+
197+
url = (
198+
f"https://generativelanguage.googleapis.com/v1beta/models/"
199+
f"{config.model}:generateContent?key={config.api_key}"
200+
)
201+
202+
body = {
203+
"contents": [{"parts": [{"text": prompt}]}],
204+
"generationConfig": {
205+
"temperature": config.temperature,
206+
"maxOutputTokens": config.max_tokens,
207+
},
208+
}
209+
if system_prompt:
210+
body["systemInstruction"] = {"parts": [{"text": system_prompt}]}
211+
212+
data = _http_json(url, data=body, timeout=60)
213+
214+
text = ""
215+
tokens = 0
216+
try:
217+
text = data["candidates"][0]["content"]["parts"][0]["text"]
218+
except (KeyError, IndexError):
219+
text = json.dumps(data)
220+
try:
221+
usage = data.get("usageMetadata", {})
222+
tokens = usage.get("promptTokenCount", 0) + usage.get("candidatesTokenCount", 0)
223+
except Exception:
224+
pass
225+
226+
return LLMResponse(
227+
text=text,
228+
provider="gemini",
229+
model=config.model,
230+
tokens_used=tokens,
231+
)
232+
233+
192234
# ---------------------------------------------------------------------------
193235
# Public API
194236
# ---------------------------------------------------------------------------
195237
_PROVIDERS = {
196238
"ollama": _query_ollama,
197239
"openai": _query_openai,
198240
"anthropic": _query_anthropic,
241+
"gemini": _query_gemini,
199242
}
200243

201244

@@ -219,7 +262,7 @@ def query_llm(prompt, config=None, system_prompt="", on_progress=None):
219262
handler = _PROVIDERS.get(provider)
220263
if handler is None:
221264
return LLMResponse(
222-
text=f"Unknown LLM provider: '{provider}'. Use 'ollama', 'openai', or 'anthropic'.",
265+
text=f"Unknown LLM provider: '{provider}'. Use 'ollama', 'openai', 'anthropic', or 'gemini'.",
223266
provider=provider,
224267
model=config.model,
225268
)
@@ -279,6 +322,13 @@ def check_llm_reachable(config=None):
279322
else:
280323
result["error"] = "Anthropic API key not configured"
281324

325+
elif provider == "gemini":
326+
if config.api_key:
327+
result["available"] = True
328+
result["models"] = [config.model]
329+
else:
330+
result["error"] = "Gemini API key not configured"
331+
282332
else:
283333
result["error"] = f"Unknown provider: {provider}"
284334

opencut/core/silence.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,10 +166,18 @@ def detect_silences_vad(
166166
"Silero VAD requires PyTorch. Install with: pip install torch"
167167
)
168168

169-
# Load Silero VAD model (cached after first load)
169+
# Load Silero VAD v6 model (cached after first load)
170+
# ONNX mode avoids GPU memory overhead for inference
171+
use_onnx = True
172+
try:
173+
import onnxruntime # noqa: F401
174+
except ImportError:
175+
use_onnx = False
176+
170177
model, utils = torch.hub.load(
171178
repo_or_dir="snakers4/silero-vad",
172179
model="silero_vad",
180+
onnx=use_onnx,
173181
force_reload=False,
174182
trust_repo=True,
175183
)

opencut/core/voice_gen.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -257,8 +257,8 @@ def kokoro_generate(
257257
voice: Kokoro voice preset (af_heart, af_bella, am_adam, am_michael, etc.)
258258
speed: Playback speed multiplier.
259259
"""
260-
if not ensure_package("kokoro", "kokoro>=0.3", on_progress):
261-
raise RuntimeError("Failed to install kokoro. Install manually: pip install kokoro>=0.3")
260+
if not ensure_package("kokoro", "kokoro>=0.9", on_progress):
261+
raise RuntimeError("Failed to install kokoro. Install manually: pip install kokoro>=0.9")
262262
import kokoro
263263

264264
if output_path is None:

opencut/routes/captions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1010,7 +1010,7 @@ def transcript_summarize(job_id, filepath, data):
10101010

10111011
# LLM config from request
10121012
llm_provider = data.get("llm_provider", "ollama")
1013-
if llm_provider not in ("ollama", "openai", "anthropic"):
1013+
if llm_provider not in ("ollama", "openai", "anthropic", "gemini"):
10141014
llm_provider = "ollama"
10151015
llm_model = data.get("llm_model", "")
10161016
llm_api_key = data.get("llm_api_key", "")
@@ -1072,7 +1072,7 @@ def captions_chapters(job_id, filepath, data):
10721072

10731073
segments = data.get("segments", None)
10741074
llm_provider = data.get("llm_provider", "ollama")
1075-
if llm_provider not in ("ollama", "openai", "anthropic"):
1075+
if llm_provider not in ("ollama", "openai", "anthropic", "gemini"):
10761076
llm_provider = "ollama"
10771077
llm_model = data.get("llm_model", "llama3")
10781078
api_key = data.get("api_key", "")

opencut/routes/nlp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def nlp_command():
3636
command = data.get("command", "").strip()
3737
file_path = data.get("filepath", data.get("file", "")).strip()
3838
llm_provider = data.get("llm_provider", "ollama").strip()
39-
if llm_provider not in ("ollama", "openai", "anthropic"):
39+
if llm_provider not in ("ollama", "openai", "anthropic", "gemini"):
4040
llm_provider = "ollama"
4141
llm_model = data.get("llm_model", "llama3").strip()
4242
api_key = data.get("api_key", "").strip()

opencut/routes/system.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1411,7 +1411,7 @@ def llm_test():
14111411
"""Test LLM connectivity with a simple prompt."""
14121412
data = request.get_json(force=True)
14131413

1414-
_VALID_LLM_PROVIDERS = {"ollama", "openai", "anthropic"}
1414+
_VALID_LLM_PROVIDERS = {"ollama", "openai", "anthropic", "gemini"}
14151415
provider = data.get("provider", "ollama").strip().lower()
14161416
if provider not in _VALID_LLM_PROVIDERS:
14171417
return jsonify({"success": False, "error": f"Invalid provider: {provider}. Must be one of: {', '.join(sorted(_VALID_LLM_PROVIDERS))}"}), 400
@@ -1628,8 +1628,8 @@ def chat_message():
16281628

16291629
# Build LLM config from settings or request
16301630
provider = data.get("llm_provider", "ollama")
1631-
if provider not in ("ollama", "openai", "anthropic"):
1632-
return jsonify({"error": "Invalid provider", "code": "INVALID_INPUT", "suggestion": "Use ollama, openai, or anthropic"}), 400
1631+
if provider not in ("ollama", "openai", "anthropic", "gemini"):
1632+
return jsonify({"error": "Invalid provider", "code": "INVALID_INPUT", "suggestion": "Use ollama, openai, anthropic, or gemini"}), 400
16331633
model = data.get("llm_model", "")
16341634
api_key = data.get("llm_api_key", "")
16351635

opencut/routes/video_editing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ def video_highlights(job_id, filepath, data):
313313
max_duration = safe_float(data.get("max_duration", 60.0), 60.0, min_val=10.0, max_val=600.0)
314314
transcript = data.get("transcript", None)
315315
llm_provider = data.get("llm_provider", "ollama")
316-
if llm_provider not in ("ollama", "openai", "anthropic"):
316+
if llm_provider not in ("ollama", "openai", "anthropic", "gemini"):
317317
llm_provider = "ollama"
318318
llm_model = data.get("llm_model", "")
319319
llm_api_key = data.get("llm_api_key", "")

opencut/routes/video_specialty.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def _on_progress(pct, msg=""):
171171
_update_job(job_id, progress=pct, message=msg)
172172

173173
_shorts_provider = data.get("llm_provider", "ollama")
174-
if _shorts_provider not in ("ollama", "openai", "anthropic"):
174+
if _shorts_provider not in ("ollama", "openai", "anthropic", "gemini"):
175175
_shorts_provider = "ollama"
176176
llm_config = LLMConfig(
177177
provider=_shorts_provider,

0 commit comments

Comments (0)