From a91365f0363c2cc61148c4194b169b0b27bae49e Mon Sep 17 00:00:00 2001
From: Cincinnati Podcast Studio <hello@cincinnatipodcaststudio.com>
Date: Sat, 16 May 2026 11:08:36 -0400
Subject: [PATCH] rsv-ben: fix fps detection for CFR reference clips

repairRsvBen() read the video frame duration only from Track::times_,
but getSampleTimes() collapses a constant-rate stts into
constant_duration_ and leaves times_ empty. Every Sony camera records
CFR, so times_ was empty and video_duration_per_sample kept its 1000
default. For a 23.976 fps clip (timescale 24000, 1001 ticks per frame)
fps was therefore computed as 24000/1000 = 24 instead of 24000/1001,
making the per-GOP audio chunk 96000 bytes instead of 96096.

That ~96-byte/GOP error appended PCM to each GOP's last video frame
(one corrupt frame per GOP, i.e. judder recurring roughly every 0.5s)
and truncated audio by the same amount (~4.5s short over 75 min) for
NTSC-fractional footage (23.976/29.97/59.94).

Fix: prefer constant_duration_ whenever it is set (!= -1) and fall
back to times_[0] otherwise. Also compute audio_samples_per_chunk with
round-to-nearest integer arithmetic so float truncation can't drop a
sample at other rates.

Verified on a 58GB Sony FX30 XAVC S .RSV: audio/video durations now
match exactly and the file decodes with zero per-GOP errors.
---
 src/rsv.cpp | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/rsv.cpp b/src/rsv.cpp
index 2840326..be44e06 100644
--- a/src/rsv.cpp
+++ b/src/rsv.cpp
@@ -103,7 +103,14 @@ void Mp4::repairRsvBen(const string& filename) {
 	if (video_track_idx >= 0) {
 		Track& video_track = tracks_[video_track_idx];
 		video_timescale = video_track.timescale_;
-		if (!video_track.times_.empty()) {
+		// CFR reference clips (all Sony cameras) collapse stts into
+		// constant_duration_ and leave times_ empty. Must read it, otherwise
+		// video_duration_per_sample keeps its 1000 default -> fps wrongly 24
+		// instead of 23.976 (24000/1001) -> audio chunk size off by ~96
+		// bytes/GOP -> 1 corrupt video frame/GOP + short audio.
+		if (video_track.constant_duration_ != -1) {
+			video_duration_per_sample = video_track.constant_duration_;
+		} else if (!video_track.times_.empty()) {
 			video_duration_per_sample = video_track.times_[0];
 		}
 		logg(V, "video timescale: ", video_timescale, ", duration per sample: ", video_duration_per_sample, "\n");
@@ -216,7 +223,12 @@ void Mp4::repairRsvBen(const string& filename) {
 	// Audio samples per chunk = GOP duration * audio_sample_rate
 	double fps = (double)video_timescale / video_duration_per_sample;
 	double gop_duration_sec = (double)frames_per_gop / fps;
-	int audio_samples_per_chunk = (int)(gop_duration_sec * audio_sample_rate);
+	// Exact integer rational: samples = frames_per_gop * sample_rate *
+	// dur_per_sample / timescale (rounded). Avoids float truncation that
+	// would drop a sample and re-introduce the boundary error.
+	int audio_samples_per_chunk = (int)(
+		((long long)frames_per_gop * audio_sample_rate * video_duration_per_sample
+		 + video_timescale / 2) / video_timescale);
 	int audio_chunk_size_per_track = audio_samples_per_chunk * audio_sample_size;
 	// Total audio size in RSV is for all tracks combined
 	int total_audio_chunk_size = audio_chunk_size_per_track * max(1, num_audio_tracks);