diff --git a/configure b/configure index d77a55b653c..a445f7fb0e5 100755 --- a/configure +++ b/configure @@ -3310,6 +3310,11 @@ scale_npp_filter_deps="ffnvcodec libnpp" scale2ref_npp_filter_deps="ffnvcodec libnpp" scale_cuda_filter_deps="ffnvcodec" scale_cuda_filter_deps_any="cuda_nvcc cuda_llvm" +showinfo_cuda_filter_deps="ffnvcodec" +lut3d_cuda_filter_deps="ffnvcodec" +lut3d_cuda_filter_deps_any="cuda_nvcc cuda_llvm" +tonemap_cuda_filter_deps="ffnvcodec" +tonemap_cuda_filter_deps_any="cuda_nvcc cuda_llvm" thumbnail_cuda_filter_deps="ffnvcodec" thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm" transpose_npp_filter_deps="ffnvcodec libnpp" diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h index 733d551fa41..06474f1b3c8 100644 --- a/fftools/ffmpeg.h +++ b/fftools/ffmpeg.h @@ -176,6 +176,7 @@ typedef struct OptionsContext { SpecifierOptList hwaccel_output_formats; SpecifierOptList autorotate; SpecifierOptList apply_cropping; + SpecifierOptList force_cfr; /* output options */ StreamMap *stream_maps; @@ -276,6 +277,9 @@ typedef struct InputFilterOptions { * accurate */ AVRational framerate; + /* convert input stream to CFR at this framerate before inserting additional filters */ + AVRational force_cfr; + unsigned crop_top; unsigned crop_bottom; unsigned crop_left; @@ -451,6 +455,9 @@ typedef struct InputStream { /* framerate forced with -r */ AVRational framerate; + + /* convert input stream to CFR at this framerate before inserting additional filters */ + AVRational force_cfr; #if FFMPEG_OPT_TOP int top_field_first; #endif diff --git a/fftools/ffmpeg_demux.c b/fftools/ffmpeg_demux.c index 13aef15eaba..9c500b79369 100644 --- a/fftools/ffmpeg_demux.c +++ b/fftools/ffmpeg_demux.c @@ -1032,6 +1032,10 @@ int ist_filter_add(InputStream *ist, InputFilter *ifilter, int is_simple, (opts->crop_top | opts->crop_bottom | opts->crop_left | opts->crop_right)) opts->flags |= IFILTER_FLAG_CROP; } + if (ist->force_cfr.num > 0 && ist->force_cfr.den > 0) { + opts->force_cfr = 
ist->force_cfr; + opts->flags |= IFILTER_FLAG_CFR; + } } else if (ist->par->codec_type == AVMEDIA_TYPE_SUBTITLE) { /* Compute the size of the canvas for the subtitles stream. If the subtitles codecpar has set a size, use it. Otherwise use the @@ -1241,7 +1245,7 @@ static int ist_add(const OptionsContext *o, Demuxer *d, AVStream *st, AVDictiona AVCodecParameters *par = st->codecpar; DemuxStream *ds; InputStream *ist; - const char *framerate = NULL, *hwaccel_device = NULL; + const char *framerate = NULL, *hwaccel_device = NULL, *forcecfr = NULL; const char *hwaccel = NULL; const char *apply_cropping = NULL; const char *hwaccel_output_format = NULL; @@ -1437,6 +1441,15 @@ static int ist_add(const OptionsContext *o, Demuxer *d, AVStream *st, AVDictiona } } + opt_match_per_stream_str(ist, &o->force_cfr, ic, st, &forcecfr); + if (forcecfr) { + ret = av_parse_video_rate(&ist->force_cfr, forcecfr); + if (ret < 0) { + av_log(ist, AV_LOG_ERROR, "Error parsing framerate %s.\n", forcecfr); + return ret; + } + } + #if FFMPEG_OPT_TOP ist->top_field_first = -1; opt_match_per_stream_int(ist, &o->top_field_first, ic, st, &ist->top_field_first); diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c index 38c7676a7e0..62dd23ebe41 100644 --- a/fftools/ffmpeg_filter.c +++ b/fftools/ffmpeg_filter.c @@ -1781,6 +1781,15 @@ static int configure_input_video_filter(FilterGraph *fg, AVFilterGraph *graph, ifp->displaymatrix_applied = 1; } + if (ifp->opts.force_cfr.num > 0 && ifp->opts.force_cfr.den > 0) { + char force_cfr_buf[64]; + snprintf(force_cfr_buf, sizeof(force_cfr_buf), "%d/%d", + ifp->opts.force_cfr.num, ifp->opts.force_cfr.den); + ret = insert_filter(&last_filter, &pad_idx, "fps", force_cfr_buf); + if (ret < 0) + return ret; + } + snprintf(name, sizeof(name), "trim_in_%s", ifp->opts.name); ret = insert_trim(ifp->opts.trim_start_us, ifp->opts.trim_end_us, &last_filter, &pad_idx, name); diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c index f639a1cf0aa..ce5aabcb590 
100644 --- a/fftools/ffmpeg_opt.c +++ b/fftools/ffmpeg_opt.c @@ -2033,5 +2033,9 @@ const OptionDef options[] = { "set video sync method globally; deprecated, use -fps_mode", "" }, #endif + { "force_cfr", OPT_TYPE_STRING, OPT_VIDEO | OPT_EXPERT | OPT_PERSTREAM | OPT_INPUT, + { .off = OFFSET(force_cfr) }, + "set frame rate (Hz value, fraction or abbreviation)", "force_cfr" }, + { NULL, }, }; diff --git a/libavcodec/libsvtav1.c b/libavcodec/libsvtav1.c index 79b28eb4df5..79f5506c1cd 100644 --- a/libavcodec/libsvtav1.c +++ b/libavcodec/libsvtav1.c @@ -435,7 +435,7 @@ static av_cold int eb_enc_init(AVCodecContext *avctx) svt_enc->eos_flag = EOS_NOT_REACHED; - svt_ret = svt_av1_enc_init_handle(&svt_enc->svt_handle, svt_enc, &svt_enc->enc_params); + svt_ret = svt_av1_enc_init_handle(&svt_enc->svt_handle, &svt_enc->enc_params); if (svt_ret != EB_ErrorNone) { return svt_print_error(avctx, svt_ret, "Error initializing encoder handle"); } diff --git a/libavfilter/Makefile b/libavfilter/Makefile index 91487afb218..bf38be983c8 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -33,6 +33,7 @@ OBJS-$(CONFIG_SCENE_SAD) += scene_sad.o OBJS-$(CONFIG_DNN) += dnn_filter_common.o include $(SRC_PATH)/libavfilter/dnn/Makefile + # audio filters OBJS-$(CONFIG_AAP_FILTER) += af_aap.o OBJS-$(CONFIG_ABENCH_FILTER) += f_bench.o @@ -245,7 +246,8 @@ OBJS-$(CONFIG_COLORMATRIX_FILTER) += vf_colormatrix.o OBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o colorspacedsp.o OBJS-$(CONFIG_COLORSPACE_CUDA_FILTER) += vf_colorspace_cuda.o \ vf_colorspace_cuda.ptx.o \ - cuda/load_helper.o + cuda/load_helper.o \ + cuda/cuda_async_queue.o OBJS-$(CONFIG_COLORTEMPERATURE_FILTER) += vf_colortemperature.o OBJS-$(CONFIG_CONVOLUTION_FILTER) += vf_convolution.o OBJS-$(CONFIG_CONVOLUTION_OPENCL_FILTER) += vf_convolution_opencl.o opencl.o \ @@ -375,6 +377,8 @@ OBJS-$(CONFIG_LUT1D_FILTER) += vf_lut3d.o OBJS-$(CONFIG_LUT_FILTER) += vf_lut.o OBJS-$(CONFIG_LUT2_FILTER) += vf_lut2.o framesync.o 
OBJS-$(CONFIG_LUT3D_FILTER) += vf_lut3d.o framesync.o +OBJS-$(CONFIG_LUT3D_CUDA_FILTER) += vf_lut3d_cuda.o \ + vf_lut3d_cuda.ptx.o cuda/load_helper.o OBJS-$(CONFIG_LUTRGB_FILTER) += vf_lut.o OBJS-$(CONFIG_LUTYUV_FILTER) += vf_lut.o OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += vf_maskedclamp.o framesync.o @@ -486,6 +490,7 @@ OBJS-$(CONFIG_SHARPEN_NPP_FILTER) += vf_sharpen_npp.o OBJS-$(CONFIG_SHARPNESS_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o OBJS-$(CONFIG_SHEAR_FILTER) += vf_shear.o OBJS-$(CONFIG_SHOWINFO_FILTER) += vf_showinfo.o +OBJS-$(CONFIG_SHOWINFO_CUDA_FILTER) += vf_showinfo_cuda.o OBJS-$(CONFIG_SHOWPALETTE_FILTER) += vf_showpalette.o OBJS-$(CONFIG_SHUFFLEFRAMES_FILTER) += vf_shuffleframes.o OBJS-$(CONFIG_SHUFFLEPIXELS_FILTER) += vf_shufflepixels.o @@ -524,6 +529,9 @@ OBJS-$(CONFIG_TMEDIAN_FILTER) += vf_xmedian.o framesync.o OBJS-$(CONFIG_TMIDEQUALIZER_FILTER) += vf_tmidequalizer.o OBJS-$(CONFIG_TMIX_FILTER) += vf_mix.o framesync.o OBJS-$(CONFIG_TONEMAP_FILTER) += vf_tonemap.o +OBJS-$(CONFIG_TONEMAP_CUDA_FILTER) += vf_tonemap_cuda.o \ + vf_tonemap_cuda.ptx.o cuda/load_helper.o \ + cuda/cuda_async_queue.o OBJS-$(CONFIG_TONEMAP_OPENCL_FILTER) += vf_tonemap_opencl.o opencl.o \ opencl/tonemap.o opencl/colorspace_common.o OBJS-$(CONFIG_TONEMAP_VAAPI_FILTER) += vf_tonemap_vaapi.o vaapi_vpp.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 9819f0f95b7..35c303c7442 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -351,6 +351,7 @@ extern const AVFilter ff_vf_lut; extern const AVFilter ff_vf_lut1d; extern const AVFilter ff_vf_lut2; extern const AVFilter ff_vf_lut3d; +extern const AVFilter ff_vf_lut3d_cuda; extern const AVFilter ff_vf_lutrgb; extern const AVFilter ff_vf_lutyuv; extern const AVFilter ff_vf_maskedclamp; @@ -458,6 +459,7 @@ extern const AVFilter ff_vf_sharpen_npp; extern const AVFilter ff_vf_sharpness_vaapi; extern const AVFilter ff_vf_shear; extern const AVFilter ff_vf_showinfo; +extern const AVFilter 
ff_vf_showinfo_cuda; extern const AVFilter ff_vf_showpalette; extern const AVFilter ff_vf_shuffleframes; extern const AVFilter ff_vf_shufflepixels; @@ -494,6 +496,7 @@ extern const AVFilter ff_vf_tmedian; extern const AVFilter ff_vf_tmidequalizer; extern const AVFilter ff_vf_tmix; extern const AVFilter ff_vf_tonemap; +extern const AVFilter ff_vf_tonemap_cuda; extern const AVFilter ff_vf_tonemap_opencl; extern const AVFilter ff_vf_tonemap_vaapi; extern const AVFilter ff_vf_tpad; diff --git a/libavfilter/cuda/cuda_async_queue.c b/libavfilter/cuda/cuda_async_queue.c new file mode 100644 index 00000000000..c7aedf8c3bf --- /dev/null +++ b/libavfilter/cuda/cuda_async_queue.c @@ -0,0 +1,363 @@ +/* + * CUDA Async Queue - Implementation for asynchronous CUDA frame processing + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include + +#include "cuda_async_queue.h" +#include "libavutil/avassert.h" +#include "libavutil/log.h" +#include "libavutil/error.h" +#include "libavutil/hwcontext_cuda_internal.h" + +#define CHECK_CU(x) do { \ + CUresult res = x; \ + if (res != CUDA_SUCCESS) { \ + av_log(NULL, AV_LOG_ERROR, "CUDA error: %d at %s:%d\n", res, __FILE__, __LINE__); \ + return AVERROR_EXTERNAL; \ + } \ +} while(0) + +static int64_t get_time_us(void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000000LL + tv.tv_usec; +} + +int ff_cuda_async_queue_init(CudaAsyncQueue * queue, + AVCUDADeviceContext * hwctx, int queue_size, + int num_streams, void *filter_ctx, + int (*process_frame)(void *, AVFrame *, + AVFrame *, CUstream)) +{ + + if(!queue || !hwctx || !filter_ctx || !process_frame) + return AVERROR(EINVAL); + + if (queue_size < 1 || queue_size > MAX_FRAME_QUEUE_SIZE) + return AVERROR(EINVAL); + + if (num_streams < 1 || num_streams > MAX_CUDA_STREAMS) + return AVERROR(EINVAL); + + memset(queue, 0, sizeof(*queue)); + + queue->hwctx = hwctx; + queue->cuda_ctx = hwctx->cuda_ctx; + queue->queue_size = queue_size; + queue->num_streams = num_streams; + queue->filter_ctx = filter_ctx; + queue->process_frame = process_frame; + + CudaFunctions *cu = queue->hwctx->internal->cuda_dl; + + CHECK_CU(cu->cuCtxPushCurrent(queue->cuda_ctx)); + + for (int i = 0; i < num_streams; i++) { + CHECK_CU(cu-> + cuStreamCreate(&queue->streams[i], + CU_STREAM_NON_BLOCKING)); + } + + for (int i = 0; i < queue_size; i++) { + CHECK_CU(cu-> + cuEventCreate(&queue->frames[i].event_start, + CU_EVENT_DISABLE_TIMING)); + CHECK_CU(cu-> + cuEventCreate(&queue->frames[i].event_done, + CU_EVENT_DISABLE_TIMING)); + queue->frames[i].in_use = 0; + 
queue->frames[i].stream_idx = -1; + } + + CHECK_CU(cu->cuCtxPopCurrent(NULL)); + + av_log(NULL, AV_LOG_DEBUG, + "Async queue initialized: depth=%d streams=%d\n", queue_size, + num_streams); + + return 0; +} + +void ff_cuda_async_queue_uninit(CudaAsyncQueue * queue) +{ + if (!queue || !queue->cuda_ctx) + return; + + CudaFunctions *cu = queue->hwctx->internal->cuda_dl; + + cu->cuCtxPushCurrent(queue->cuda_ctx); + + // First, synchronize all streams to ensure all operations complete + for (int i = 0; i < queue->num_streams; i++) { + if (queue->streams[i]) { + cu->cuStreamSynchronize(queue->streams[i]); + } + } + + ff_cuda_async_queue_flush(queue); + + for (int i = 0; i < queue->num_streams; i++) { + if (queue->streams[i]) { + cu->cuStreamDestroy(queue->streams[i]); + } + } + + for (int i = 0; i < queue->queue_size; i++) { + // Synchronize events before destroying to prevent memory leaks + if (queue->frames[i].event_start) { + cu->cuEventSynchronize(queue->frames[i].event_start); + cu->cuEventDestroy(queue->frames[i].event_start); + } + if (queue->frames[i].event_done) { + cu->cuEventSynchronize(queue->frames[i].event_done); + cu->cuEventDestroy(queue->frames[i].event_done); + } + + // Ensure frames are properly unreferenced before freeing + if (queue->frames[i].input_frame) { + // For input frames, unreference all frame data including hardware buffers + av_frame_unref(queue->frames[i].input_frame); + av_frame_free(&queue->frames[i].input_frame); + } + if (queue->frames[i].output_frame) { + // For output frames, ensure hardware frame buffers are unreferenced + // This is critical for CUDA frames allocated via av_hwframe_get_buffer() + av_frame_unref(queue->frames[i].output_frame); + av_frame_free(&queue->frames[i].output_frame); + } + } + + cu->cuCtxPopCurrent(NULL); + + memset(queue, 0, sizeof(*queue)); +} + +int ff_cuda_async_queue_submit(CudaAsyncQueue * queue, AVFrame * in_frame) +{ + CudaAsyncFrame *async_frame; + AVFrame *out_frame; + int ret; + int stream_idx; + 
+ if (!queue || !in_frame) + return AVERROR(EINVAL); + + if (ff_cuda_async_queue_is_full(queue)) + return AVERROR(EAGAIN); + + CudaFunctions *cu = queue->hwctx->internal->cuda_dl; + + CHECK_CU(cu->cuCtxPushCurrent(queue->cuda_ctx)); + + async_frame = &queue->frames[queue->write_idx]; + av_assert0(!async_frame->in_use); + + async_frame->input_frame = av_frame_clone(in_frame); + if (!async_frame->input_frame) { + ret = AVERROR(ENOMEM); + goto fail; + } + + out_frame = av_frame_alloc(); + if (!out_frame) { + ret = AVERROR(ENOMEM); + goto fail; + } + + // Frame is already clean from av_frame_alloc(), no need to unref + + stream_idx = queue->submission_counter % queue->num_streams; + async_frame->stream_idx = stream_idx; + async_frame->submission_order = queue->submission_counter++; + async_frame->submit_time_us = get_time_us(); + + CHECK_CU(cu-> + cuEventRecord(async_frame->event_start, + queue->streams[stream_idx])); + + av_log(NULL, AV_LOG_DEBUG, + "[CUDA_ASYNC] Frame %" PRId64 + " submitted to stream %d (queue slot %d) at time %" PRId64 "\n", + async_frame->submission_order, stream_idx, queue->write_idx, + async_frame->submit_time_us); + + ret = + queue->process_frame(queue->filter_ctx, out_frame, + async_frame->input_frame, + queue->streams[stream_idx]); + if (ret < 0) { + av_frame_free(&out_frame); + goto fail; + } + // Don't synchronize here - let the kernel run asynchronously + // The synchronization will happen in ff_cuda_async_queue_receive + + async_frame->output_frame = out_frame; + + CHECK_CU(cu-> + cuEventRecord(async_frame->event_done, + queue->streams[stream_idx])); + + av_log(NULL, AV_LOG_DEBUG, + "[CUDA_ASYNC] Frame %" PRId64 + " processing initiated on stream %d\n", + async_frame->submission_order, stream_idx); + + async_frame->in_use = 1; + queue->write_idx = (queue->write_idx + 1) % queue->queue_size; + queue->frames_in_queue++; + + // Log current queue status to show parallel activity + av_log(NULL, AV_LOG_DEBUG, + "[CUDA_ASYNC] Queue status: %d/%d 
frames active\n", + queue->frames_in_queue, queue->queue_size); + + CHECK_CU(cu->cuCtxPopCurrent(NULL)); + + return 0; + + fail: + av_frame_free(&async_frame->input_frame); + cu->cuCtxPopCurrent(NULL); + return ret; +} + +int ff_cuda_async_queue_receive(CudaAsyncQueue * queue, + AVFrame ** out_frame) +{ + CudaAsyncFrame *async_frame; + CUresult cu_res; + int ret = 0; + + if (!queue || !out_frame) + return AVERROR(EINVAL); + + *out_frame = NULL; + + if (ff_cuda_async_queue_is_empty(queue)) + return AVERROR(EAGAIN); + + CudaFunctions *cu = queue->hwctx->internal->cuda_dl; + + CHECK_CU(cu->cuCtxPushCurrent(queue->cuda_ctx)); + + async_frame = &queue->frames[queue->read_idx]; + av_assert0(async_frame->in_use); + + cu_res = cu->cuEventQuery(async_frame->event_done); + if (cu_res == CUDA_ERROR_NOT_READY) { + ret = AVERROR(EAGAIN); + goto done; + } else if (cu_res != CUDA_SUCCESS) { + av_log(NULL, AV_LOG_ERROR, "CUDA error querying event: %d\n", + cu_res); + ret = AVERROR_EXTERNAL; + goto done; + } + + CHECK_CU(cu->cuEventSynchronize(async_frame->event_done)); + + async_frame->complete_time_us = get_time_us(); + int64_t processing_time = + async_frame->complete_time_us - async_frame->submit_time_us; + + av_log(NULL, AV_LOG_DEBUG, + "[CUDA_ASYNC] Frame %" PRId64 + " completed on stream %d (queue slot %d) at time %" PRId64 + " - processing took %" PRId64 "us\n", + async_frame->submission_order, async_frame->stream_idx, + queue->read_idx, async_frame->complete_time_us, + processing_time); + + *out_frame = async_frame->output_frame; + async_frame->output_frame = NULL; + + av_frame_free(&async_frame->input_frame); + async_frame->in_use = 0; + async_frame->stream_idx = -1; + + queue->read_idx = (queue->read_idx + 1) % queue->queue_size; + queue->frames_in_queue--; + + done: + CHECK_CU(cu->cuCtxPopCurrent(NULL)); + return ret; +} + +int ff_cuda_async_queue_flush(CudaAsyncQueue * queue) +{ + AVFrame *frame; + int ret; + + if (!queue) + return AVERROR(EINVAL); + + CudaFunctions 
*cu = queue->hwctx->internal->cuda_dl; + + CHECK_CU(cu->cuCtxPushCurrent(queue->cuda_ctx)); + + for (int i = 0; i < queue->num_streams; i++) { + if (queue->streams[i]) { + CHECK_CU(cu->cuStreamSynchronize(queue->streams[i])); + } + } + + while (!ff_cuda_async_queue_is_empty(queue)) { + ret = ff_cuda_async_queue_receive(queue, &frame); + if (ret == 0 && frame) { + av_frame_free(&frame); + } else if (ret < 0 && ret != AVERROR(EAGAIN)) { + // Force cleanup of remaining frames + for (int i = 0; i < queue->queue_size; i++) { + if (queue->frames[i].in_use) { + // Properly unreference frames before freeing + if (queue->frames[i].input_frame) { + av_frame_unref(queue->frames[i].input_frame); + av_frame_free(&queue->frames[i].input_frame); + } + if (queue->frames[i].output_frame) { + av_frame_unref(queue->frames[i].output_frame); + av_frame_free(&queue->frames[i].output_frame); + } + queue->frames[i].in_use = 0; + } + } + queue->frames_in_queue = 0; + queue->read_idx = queue->write_idx = 0; + break; + } + } + + CHECK_CU(cu->cuCtxPopCurrent(NULL)); + + return 0; +} + +int ff_cuda_async_queue_is_full(CudaAsyncQueue * queue) +{ + return queue->frames_in_queue >= queue->queue_size; +} + +int ff_cuda_async_queue_is_empty(CudaAsyncQueue * queue) +{ + return queue->frames_in_queue == 0; +} diff --git a/libavfilter/cuda/cuda_async_queue.h b/libavfilter/cuda/cuda_async_queue.h new file mode 100644 index 00000000000..67bbed4e462 --- /dev/null +++ b/libavfilter/cuda/cuda_async_queue.h @@ -0,0 +1,82 @@ +/* + * CUDA Async Queue - Header file for asynchronous CUDA frame processing + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFILTER_CUDA_ASYNC_QUEUE_H +#define AVFILTER_CUDA_ASYNC_QUEUE_H + +#include "libavutil/frame.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "../avfilter.h" + +#define MAX_FRAME_QUEUE_SIZE 8 +#define MAX_CUDA_STREAMS 4 + +typedef struct CudaAsyncFrame { + AVFrame *input_frame; + AVFrame *output_frame; + CUevent event_start; + CUevent event_done; + int in_use; + int stream_idx; + int64_t submission_order; + int64_t submit_time_us; + int64_t complete_time_us; +} CudaAsyncFrame; + +typedef struct CudaAsyncQueue { + CudaAsyncFrame frames[MAX_FRAME_QUEUE_SIZE]; + CUstream streams[MAX_CUDA_STREAMS]; + + int queue_size; + int num_streams; + + int write_idx; + int read_idx; + int frames_in_queue; + int64_t submission_counter; + + CUcontext cuda_ctx; + AVCUDADeviceContext *hwctx; + + void *filter_ctx; + int (*process_frame)(void *filter_ctx, AVFrame * out, AVFrame * in, + CUstream stream); +} CudaAsyncQueue; + +int ff_cuda_async_queue_init(CudaAsyncQueue * queue, + AVCUDADeviceContext * hwctx, int queue_size, + int num_streams, void *filter_ctx, + int (*process_frame)(void *, AVFrame *, + AVFrame *, CUstream)); + +void ff_cuda_async_queue_uninit(CudaAsyncQueue * queue); + +int ff_cuda_async_queue_submit(CudaAsyncQueue * queue, AVFrame * in_frame); + +int ff_cuda_async_queue_receive(CudaAsyncQueue * queue, + AVFrame ** out_frame); + +int ff_cuda_async_queue_flush(CudaAsyncQueue * queue); + +int ff_cuda_async_queue_is_full(CudaAsyncQueue * queue); + +int 
ff_cuda_async_queue_is_empty(CudaAsyncQueue * queue); + +#endif /* AVFILTER_CUDA_ASYNC_QUEUE_H */ diff --git a/libavfilter/cuda/vector_helpers.cuh b/libavfilter/cuda/vector_helpers.cuh index 67332ef0305..334a29b2dc3 100644 --- a/libavfilter/cuda/vector_helpers.cuh +++ b/libavfilter/cuda/vector_helpers.cuh @@ -23,6 +23,8 @@ #ifndef AVFILTER_CUDA_VECTORHELPERS_H #define AVFILTER_CUDA_VECTORHELPERS_H +#include + typedef unsigned char uchar; typedef unsigned short ushort; @@ -35,7 +37,9 @@ template<> struct vector_helper { typedef float2 ftype; typedef int2 it template<> struct vector_helper { typedef float4 ftype; typedef int4 itype; }; template<> struct vector_helper { typedef float ftype; typedef int itype; }; template<> struct vector_helper { typedef float2 ftype; typedef int2 itype; }; +template<> struct vector_helper { typedef float3 ftype; typedef int3 itype; }; template<> struct vector_helper { typedef float4 ftype; typedef int4 itype; }; +template<> struct vector_helper { typedef float3 ftype; typedef int3 itype; }; #define floatT typename vector_helper::ftype #define intT typename vector_helper::itype @@ -77,6 +81,20 @@ OPERATORS4(uchar4) OPERATORS4(ushort4) OPERATORS4(float4) +#define OPERATORS3(T) \ + template inline __device__ T operator+(const T &a, const V &b) { return make_ ## T (a.x + b.x, a.y + b.y, a.z + b.z); } \ + template inline __device__ T operator-(const T &a, const V &b) { return make_ ## T (a.x - b.x, a.y - b.y, a.z - b.z); } \ + template inline __device__ T operator*(const T &a, V b) { return make_ ## T (a.x * b, a.y * b, a.z * b); } \ + template inline __device__ T operator/(const T &a, V b) { return make_ ## T (a.x / b, a.y / b, a.z / b); } \ + template inline __device__ T &operator+=(T &a, const V &b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } \ + template inline __device__ void vec_set(T &a, const V &b) { a.x = b.x; a.y = b.y; a.z = b.z; } \ + template inline __device__ void vec_set_scalar(T &a, V b) { a.x = b; a.y = b; a.z = b; } 
\ + template<> inline __device__ float3 to_floatN(const T &a) { return make_float3(a.x, a.y, a.z); } \ + template<> inline __device__ T from_floatN(const float3 &a) { return make_ ## T(a.x, a.y, a.z); } + +OPERATORS3(int3) +OPERATORS3(float3) + template inline __device__ void vec_set(int &a, V b) { a = b; } template inline __device__ void vec_set(float &a, V b) { a = b; } template inline __device__ void vec_set(uchar &a, V b) { a = b; } @@ -99,6 +117,15 @@ inline __device__ float2 lerp_scalar(float2 v0, float2 v1, float t) { ); } +template<> +inline __device__ float3 lerp_scalar(float3 v0, float3 v1, float t) { + return make_float3( + lerp_scalar(v0.x, v1.x, t), + lerp_scalar(v0.y, v1.y, t), + lerp_scalar(v0.z, v1.z, t) + ); +} + template<> inline __device__ float4 lerp_scalar(float4 v0, float4 v1, float t) { return make_float4( diff --git a/libavfilter/vf_colorspace_cuda.c b/libavfilter/vf_colorspace_cuda.c index 9d1058ff7fd..4eeef38a920 100644 --- a/libavfilter/vf_colorspace_cuda.c +++ b/libavfilter/vf_colorspace_cuda.c @@ -32,13 +32,16 @@ #include "avfilter.h" #include "filters.h" +#include "colorspace.h" #include "cuda/load_helper.h" +#include "cuda/cuda_async_queue.h" static const enum AVPixelFormat supported_formats[] = { AV_PIX_FMT_NV12, AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV444P, + AV_PIX_FMT_P016LE, }; #define DIV_UP(a, b) (((a) + (b)-1) / (b)) @@ -48,27 +51,79 @@ static const enum AVPixelFormat supported_formats[] = { #define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) typedef struct CUDAColorspaceContext { - const AVClass* class; + const AVClass *class; - AVCUDADeviceContext* hwctx; - AVBufferRef* frames_ctx; - AVFrame* own_frame; - AVFrame* tmp_frame; + AVCUDADeviceContext *hwctx; + AVBufferRef *frames_ctx; + AVFrame *own_frame; + AVFrame *tmp_frame; CUcontext cu_ctx; CUstream cu_stream; CUmodule cu_module; CUfunction cu_convert[AVCOL_RANGE_NB]; + CUfunction cu_convert_colorspace; + CUfunction cu_convert_colorspace_yuv; + CUfunction 
cu_convert_colorspace_nv12; + CUfunction cu_convert_16[AVCOL_RANGE_NB]; + CUfunction cu_convert_colorspace_16; + CUfunction cu_convert_colorspace_yuv_16; + enum AVPixelFormat pix_fmt; enum AVColorRange range; + enum AVColorSpace in_csp, out_csp, user_csp; + enum AVColorPrimaries in_prm, out_prm, user_prm; + enum AVColorTransferCharacteristic in_trc, out_trc, user_trc; + + // Conversion matrices for YUV-to-YUV colorspace conversion + float yuv2yuv_matrix[3][3]; + float yuv_offset[2]; // input and output offsets + int colorspace_conversion_needed; int num_planes; + + int async_depth; + int async_streams; + CudaAsyncQueue async_queue; } CUDAColorspaceContext; -static av_cold int cudacolorspace_init(AVFilterContext* ctx) +static int calculate_yuv2yuv_matrix(CUDAColorspaceContext * s) { - CUDAColorspaceContext* s = ctx->priv; + const AVLumaCoefficients *in_lumacoef, *out_lumacoef; + double yuv2rgb[3][3], rgb2yuv[3][3], yuv2yuv[3][3]; + int i, j; + + // Get luma coefficients for input and output colorspaces + in_lumacoef = av_csp_luma_coeffs_from_avcsp(s->in_csp); + out_lumacoef = av_csp_luma_coeffs_from_avcsp(s->out_csp); + + if (!in_lumacoef || !out_lumacoef) { + return AVERROR(EINVAL); + } + // Calculate YUV to RGB matrix for input colorspace + ff_fill_rgb2yuv_table(in_lumacoef, rgb2yuv); + ff_matrix_invert_3x3(rgb2yuv, yuv2rgb); + + // Calculate RGB to YUV matrix for output colorspace + ff_fill_rgb2yuv_table(out_lumacoef, rgb2yuv); + + // Combine: YUV_in -> RGB -> YUV_out + ff_matrix_mul_3x3(yuv2yuv, yuv2rgb, rgb2yuv); + + // Convert to float for CUDA kernel + for (i = 0; i < 3; i++) { + for (j = 0; j < 3; j++) { + s->yuv2yuv_matrix[i][j] = (float) yuv2yuv[i][j]; + } + } + + return 0; +} + +static av_cold int cudacolorspace_init(AVFilterContext * ctx) +{ + CUDAColorspaceContext *s = ctx->priv; s->own_frame = av_frame_alloc(); if (!s->own_frame) @@ -78,15 +133,21 @@ static av_cold int cudacolorspace_init(AVFilterContext* ctx) if (!s->tmp_frame) return AVERROR(ENOMEM); 
+ return 0; } -static av_cold void cudacolorspace_uninit(AVFilterContext* ctx) +static int process_frame_async(void *filter_ctx, AVFrame * out, + AVFrame * in, CUstream stream); +static int cudacolorspace_filter_frame(AVFilterLink * link, AVFrame * in); + +static av_cold void cudacolorspace_uninit(AVFilterContext * ctx) { - CUDAColorspaceContext* s = ctx->priv; + CUDAColorspaceContext *s = ctx->priv; + if (s->hwctx && s->cu_module) { - CudaFunctions* cu = s->hwctx->internal->cuda_dl; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; CUcontext dummy; CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); @@ -96,22 +157,30 @@ static av_cold void cudacolorspace_uninit(AVFilterContext* ctx) } av_frame_free(&s->own_frame); - av_buffer_unref(&s->frames_ctx); av_frame_free(&s->tmp_frame); + + // Clean up async queue BEFORE freeing frames_ctx to ensure + // that async frames can properly return their buffer references + if (s->async_depth > 1 || s->async_streams > 1) { + ff_cuda_async_queue_uninit(&s->async_queue); + } + + av_buffer_unref(&s->frames_ctx); } -static av_cold int init_hwframe_ctx(CUDAColorspaceContext* s, AVBufferRef* device_ctx, - int width, int height) +static av_cold int init_hwframe_ctx(CUDAColorspaceContext * s, + AVBufferRef * device_ctx, int width, + int height) { - AVBufferRef* out_ref = NULL; - AVHWFramesContext* out_ctx; + AVBufferRef *out_ref = NULL; + AVHWFramesContext *out_ctx; int ret; out_ref = av_hwframe_ctx_alloc(device_ctx); if (!out_ref) return AVERROR(ENOMEM); - out_ctx = (AVHWFramesContext*)out_ref->data; + out_ctx = (AVHWFramesContext *) out_ref->data; out_ctx->format = AV_PIX_FMT_CUDA; out_ctx->sw_format = s->pix_fmt; @@ -134,7 +203,7 @@ static av_cold int init_hwframe_ctx(CUDAColorspaceContext* s, AVBufferRef* devic s->frames_ctx = out_ref; return 0; -fail: + fail: av_buffer_unref(&out_ref); return ret; } @@ -148,13 +217,18 @@ static int format_is_supported(enum AVPixelFormat fmt) return 0; } -static av_cold int 
init_processing_chain(AVFilterContext* ctx, int width, +static int format_is_16bit(enum AVPixelFormat fmt) +{ + return fmt == AV_PIX_FMT_P016LE; +} + +static av_cold int init_processing_chain(AVFilterContext * ctx, int width, int height) { - FilterLink *inl = ff_filter_link(ctx->inputs[0]); - FilterLink *outl = ff_filter_link(ctx->outputs[0]); - CUDAColorspaceContext* s = ctx->priv; - AVHWFramesContext* in_frames_ctx; + FilterLink *inl = ff_filter_link(ctx->inputs[0]); + FilterLink *outl = ff_filter_link(ctx->outputs[0]); + CUDAColorspaceContext *s = ctx->priv; + AVHWFramesContext *in_frames_ctx; int ret; @@ -163,7 +237,7 @@ static av_cold int init_processing_chain(AVFilterContext* ctx, int width, return AVERROR(EINVAL); } - in_frames_ctx = (AVHWFramesContext*)inl->hw_frames_ctx->data; + in_frames_ctx = (AVHWFramesContext *) inl->hw_frames_ctx->data; s->pix_fmt = in_frames_ctx->sw_format; if (!format_is_supported(s->pix_fmt)) { @@ -172,13 +246,32 @@ static av_cold int init_processing_chain(AVFilterContext* ctx, int width, return AVERROR(EINVAL); } - if ((AVCOL_RANGE_MPEG != s->range) && (AVCOL_RANGE_JPEG != s->range)) { + if ((AVCOL_RANGE_MPEG != s->range) && (AVCOL_RANGE_JPEG != s->range) && + (AVCOL_RANGE_UNSPECIFIED != s->range)) { av_log(ctx, AV_LOG_ERROR, "Unsupported color range\n"); return AVERROR(EINVAL); } s->num_planes = av_pix_fmt_count_planes(s->pix_fmt); + // Determine if colorspace conversion is needed + s->colorspace_conversion_needed = (s->in_csp != AVCOL_SPC_UNSPECIFIED + && s->out_csp != + AVCOL_SPC_UNSPECIFIED + && s->in_csp != s->out_csp); + + if (s->colorspace_conversion_needed) { + ret = calculate_yuv2yuv_matrix(s); + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, + "Failed to calculate colorspace conversion matrix\n"); + return ret; + } + av_log(ctx, AV_LOG_INFO, "Colorspace conversion: %s -> %s\n", + av_color_space_name(s->in_csp), + av_color_space_name(s->out_csp)); + } + ret = init_hwframe_ctx(s, in_frames_ctx->device_ref, width, height); if 
(ret < 0) return ret; @@ -190,11 +283,11 @@ static av_cold int init_processing_chain(AVFilterContext* ctx, int width, return 0; } -static av_cold int cudacolorspace_load_functions(AVFilterContext* ctx) +static av_cold int cudacolorspace_load_functions(AVFilterContext * ctx) { - CUDAColorspaceContext* s = ctx->priv; + CUDAColorspaceContext *s = ctx->priv; CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; - CudaFunctions* cu = s->hwctx->internal->cuda_dl; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; int ret; extern const unsigned char ff_vf_colorspace_cuda_ptx_data[]; @@ -210,46 +303,127 @@ static av_cold int cudacolorspace_load_functions(AVFilterContext* ctx) if (ret < 0) goto fail; - ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_convert[AVCOL_RANGE_MPEG], s->cu_module, "to_mpeg_cuda")); + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_convert[AVCOL_RANGE_MPEG], + s->cu_module, "to_mpeg_cuda")); + if (ret < 0) + goto fail; + + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_convert[AVCOL_RANGE_JPEG], + s->cu_module, "to_jpeg_cuda")); + if (ret < 0) + goto fail; + + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_convert_colorspace, + s->cu_module, + "colorspace_convert_cuda")); + if (ret < 0) + goto fail; + + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_convert_colorspace_yuv, + s->cu_module, + "colorspace_convert_yuv_cuda")); + if (ret < 0) + goto fail; + + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_convert_colorspace_nv12, + s->cu_module, + "colorspace_convert_nv12_cuda")); if (ret < 0) goto fail; - ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_convert[AVCOL_RANGE_JPEG], s->cu_module, "to_jpeg_cuda")); + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_convert_16[AVCOL_RANGE_MPEG], + s->cu_module, "to_mpeg_cuda_16")); if (ret < 0) goto fail; -fail: + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_convert_16[AVCOL_RANGE_JPEG], + s->cu_module, "to_jpeg_cuda_16")); + if (ret < 0) + goto fail; + + ret = + CHECK_CU(cu-> + 
cuModuleGetFunction(&s->cu_convert_colorspace_16, + s->cu_module, + "colorspace_convert_cuda_16")); + if (ret < 0) + goto fail; + + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_convert_colorspace_yuv_16, + s->cu_module, + "colorspace_convert_yuv_cuda_16")); + if (ret < 0) + goto fail; + + fail: CHECK_CU(cu->cuCtxPopCurrent(&dummy)); return ret; } -static av_cold int cudacolorspace_config_props(AVFilterLink* outlink) +static av_cold int cudacolorspace_config_props(AVFilterLink * outlink) { - AVFilterContext* ctx = outlink->src; - AVFilterLink* inlink = outlink->src->inputs[0]; - FilterLink *inl = ff_filter_link(inlink); - CUDAColorspaceContext* s = ctx->priv; - AVHWFramesContext* frames_ctx; - AVCUDADeviceContext* device_hwctx; + AVFilterContext *ctx = outlink->src; + AVFilterLink *inlink = outlink->src->inputs[0]; + FilterLink *inl = ff_filter_link(inlink); + CUDAColorspaceContext *s = ctx->priv; + AVHWFramesContext *frames_ctx; + AVCUDADeviceContext *device_hwctx; int ret; outlink->w = inlink->w; outlink->h = inlink->h; + // Set defaults - actual colorspace will be determined from frame properties + s->in_csp = + inlink->colorspace != + AVCOL_SPC_UNSPECIFIED ? inlink->colorspace : AVCOL_SPC_BT709; + s->in_prm = AVCOL_PRI_BT709; // Default, will be updated from frame + s->in_trc = AVCOL_TRC_BT709; // Default, will be updated from frame + + // Set output colorspace properties + s->out_csp = + s->user_csp != AVCOL_SPC_UNSPECIFIED ? s->user_csp : s->in_csp; + s->out_prm = + s->user_prm != AVCOL_PRI_UNSPECIFIED ? s->user_prm : s->in_prm; + s->out_trc = + s->user_trc != AVCOL_TRC_UNSPECIFIED ? 
s->user_trc : s->in_trc; + + // Set output link properties + outlink->colorspace = s->out_csp; + ret = init_processing_chain(ctx, inlink->w, inlink->h); if (ret < 0) return ret; - frames_ctx = (AVHWFramesContext*)inl->hw_frames_ctx->data; + frames_ctx = (AVHWFramesContext *) inl->hw_frames_ctx->data; device_hwctx = frames_ctx->device_ctx->hwctx; s->hwctx = device_hwctx; s->cu_stream = s->hwctx->stream; if (inlink->sample_aspect_ratio.num) { - outlink->sample_aspect_ratio = av_mul_q( - (AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, - inlink->sample_aspect_ratio); + outlink->sample_aspect_ratio = av_mul_q((AVRational) { + outlink->h * inlink->w, + outlink->w * inlink->h} + , + inlink-> + sample_aspect_ratio); } else { outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; } @@ -258,13 +432,29 @@ static av_cold int cudacolorspace_config_props(AVFilterLink* outlink) if (ret < 0) return ret; + if (s->async_depth > 1 || s->async_streams > 1) { + av_log(ctx, AV_LOG_DEBUG, + "Async processing enabled: depth=%d streams=%d\n", + s->async_depth, s->async_streams); + + ret = ff_cuda_async_queue_init(&s->async_queue, s->hwctx, + s->async_depth, s->async_streams, + ctx, process_frame_async); + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, + "Failed to initialize async queue\n"); + return ret; + } + } + return ret; } -static int conv_cuda_convert(AVFilterContext* ctx, AVFrame* out, AVFrame* in) +static int conv_cuda_convert(AVFilterContext * ctx, AVFrame * out, + AVFrame * in) { - CUDAColorspaceContext* s = ctx->priv; - CudaFunctions* cu = s->hwctx->internal->cuda_dl; + CUDAColorspaceContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; int ret; @@ -272,54 +462,238 @@ static int conv_cuda_convert(AVFilterContext* ctx, AVFrame* out, AVFrame* in) if (ret < 0) return ret; - out->color_range = s->range; - - for (int i = 0; i < s->num_planes; i++) { - int width = in->width, height = in->height, 
comp_id = (i > 0); - + // Update input colorspace properties from frame + s->in_csp = + in->colorspace != + AVCOL_SPC_UNSPECIFIED ? in->colorspace : s->in_csp; + s->in_prm = + in->color_primaries != + AVCOL_PRI_UNSPECIFIED ? in->color_primaries : s->in_prm; + s->in_trc = + in->color_trc != AVCOL_TRC_UNSPECIFIED ? in->color_trc : s->in_trc; + + // Update output colorspace if user didn't specify + if (s->user_csp == AVCOL_SPC_UNSPECIFIED) + s->out_csp = s->in_csp; + if (s->user_prm == AVCOL_PRI_UNSPECIFIED) + s->out_prm = s->in_prm; + if (s->user_trc == AVCOL_TRC_UNSPECIFIED) + s->out_trc = s->in_trc; + + // Check if we need to recalculate matrix + int need_recalc = (s->colorspace_conversion_needed && + (s->in_csp != s->out_csp)); + + if (need_recalc) { + ret = calculate_yuv2yuv_matrix(s); + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, + "Failed to recalculate colorspace conversion matrix\n"); + goto fail; + } + } + + // Set output frame properties immediately in conv function + out->color_range = + s->range != AVCOL_RANGE_UNSPECIFIED ? s->range : in->color_range; + out->colorspace = s->out_csp; + out->color_primaries = s->out_prm; + out->color_trc = s->out_trc; + + // If only colorspace conversion is needed (no range conversion) + if (s->colorspace_conversion_needed + && (in->color_range == out->color_range + || s->range == AVCOL_RANGE_UNSPECIFIED)) { + + // Check if we can use the new YUV kernel + int use_yuv_kernel = 0; + int use_nv12_kernel = 0; switch (s->pix_fmt) { case AV_PIX_FMT_YUV444P: - break; case AV_PIX_FMT_YUV420P: - width = comp_id ? in->width / 2 : in->width; - /* fall-through */ + case AV_PIX_FMT_P016LE: + use_yuv_kernel = 1; + break; case AV_PIX_FMT_NV12: - height = comp_id ? 
in->height / 2 : in->height; + use_nv12_kernel = 1; break; default: - av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n", - av_get_pix_fmt_name(s->pix_fmt)); - return AVERROR(EINVAL); - } - - if (!s->cu_convert[out->color_range]) { - av_log(ctx, AV_LOG_ERROR, "Unsupported color range\n"); - return AVERROR(EINVAL); + // Fall back to per-plane conversion for other formats + use_yuv_kernel = 0; + use_nv12_kernel = 0; + break; } - - if (in->color_range != out->color_range) { - void* args[] = {&in->data[i], &out->data[i], &in->linesize[i], - &comp_id}; - ret = CHECK_CU(cu->cuLaunchKernel( - s->cu_convert[out->color_range], DIV_UP(width, BLOCKX), - DIV_UP(height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, s->cu_stream, - args, NULL)); - } else { - ret = av_hwframe_transfer_data(out, in, 0); + + if (use_yuv_kernel) { + // Use new YUV kernel for proper colorspace conversion + int width = in->width, height = in->height; + int src_pitch_uv = in->linesize[1]; + int dst_pitch_uv = out->linesize[1]; + + if (format_is_16bit(s->pix_fmt)) { + // 16-bit YUV conversion + void *args[] = { + &in->data[0], &in->data[1], &in->data[2], // src Y, U, V + &out->data[0], &out->data[1], &out->data[2], // dst Y, U, V + &in->linesize[0], &src_pitch_uv, // src pitches + &out->linesize[0], &dst_pitch_uv, // dst pitches + &width, &height, + &s->yuv2yuv_matrix[0][0], &s->yuv2yuv_matrix[0][1], &s->yuv2yuv_matrix[0][2], + &s->yuv2yuv_matrix[1][0], &s->yuv2yuv_matrix[1][1], &s->yuv2yuv_matrix[1][2], + &s->yuv2yuv_matrix[2][0], &s->yuv2yuv_matrix[2][1], &s->yuv2yuv_matrix[2][2] + }; + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_convert_colorspace_yuv_16, + DIV_UP(width, BLOCKX), DIV_UP(height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL)); + } else { + // 8-bit YUV conversion + void *args[] = { + &in->data[0], &in->data[1], &in->data[2], // src Y, U, V + &out->data[0], &out->data[1], &out->data[2], // dst Y, U, V + &in->linesize[0], &src_pitch_uv, // src pitches + &out->linesize[0], 
&dst_pitch_uv, // dst pitches + &width, &height, + &s->yuv2yuv_matrix[0][0], &s->yuv2yuv_matrix[0][1], &s->yuv2yuv_matrix[0][2], + &s->yuv2yuv_matrix[1][0], &s->yuv2yuv_matrix[1][1], &s->yuv2yuv_matrix[1][2], + &s->yuv2yuv_matrix[2][0], &s->yuv2yuv_matrix[2][1], &s->yuv2yuv_matrix[2][2] + }; + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_convert_colorspace_yuv, + DIV_UP(width, BLOCKX), DIV_UP(height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL)); + } + if (ret < 0) - return ret; + goto fail; + } else if (use_nv12_kernel) { + // Use NV12 kernel for proper colorspace conversion + int width = in->width, height = in->height; + + void *args[] = { + &in->data[0], &in->data[1], // src Y, UV planes + &out->data[0], &out->data[1], // dst Y, UV planes + &in->linesize[0], &in->linesize[1], // src pitches + &out->linesize[0], &out->linesize[1], // dst pitches + &width, &height, + &s->yuv2yuv_matrix[0][0], &s->yuv2yuv_matrix[0][1], &s->yuv2yuv_matrix[0][2], + &s->yuv2yuv_matrix[1][0], &s->yuv2yuv_matrix[1][1], &s->yuv2yuv_matrix[1][2], + &s->yuv2yuv_matrix[2][0], &s->yuv2yuv_matrix[2][1], &s->yuv2yuv_matrix[2][2] + }; + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_convert_colorspace_nv12, + DIV_UP(width, BLOCKX), DIV_UP(height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL)); + + if (ret < 0) + goto fail; + } else { + // Fall back to original per-plane conversion for NV12 and other formats + for (int i = 0; i < s->num_planes; i++) { + int width = in->width, height = in->height; + + switch (s->pix_fmt) { + case AV_PIX_FMT_YUV444P: + break; + case AV_PIX_FMT_YUV420P: + width = (i > 0) ? in->width / 2 : in->width; + /* fall-through */ + case AV_PIX_FMT_NV12: + height = (i > 0) ? in->height / 2 : in->height; + break; + case AV_PIX_FMT_P016LE: + height = (i > 0) ? 
in->height / 2 : in->height; + break; + default: + av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n", + av_get_pix_fmt_name(s->pix_fmt)); + return AVERROR(EINVAL); + } + + void *args[] = + { &in->data[i], &out->data[i], &in->linesize[i], + &out->linesize[i], + &width, &height, &i, + &s->yuv2yuv_matrix[0][0], &s->yuv2yuv_matrix[0][1], + &s->yuv2yuv_matrix[0][2], + &s->yuv2yuv_matrix[1][0], &s->yuv2yuv_matrix[1][1], + &s->yuv2yuv_matrix[1][2], + &s->yuv2yuv_matrix[2][0], &s->yuv2yuv_matrix[2][1], + &s->yuv2yuv_matrix[2][2] + }; + CUfunction kernel = format_is_16bit(s->pix_fmt) ? + s->cu_convert_colorspace_16 : s->cu_convert_colorspace; + ret = + CHECK_CU(cu-> + cuLaunchKernel(kernel, + DIV_UP(width, BLOCKX), + DIV_UP(height, BLOCKY), 1, BLOCKX, + BLOCKY, 1, 0, s->cu_stream, args, + NULL)); + if (ret < 0) + goto fail; + } + } + } else { + // Handle range conversion or passthrough + for (int i = 0; i < s->num_planes; i++) { + int width = in->width, height = in->height, comp_id = (i > 0); + + switch (s->pix_fmt) { + case AV_PIX_FMT_YUV444P: + break; + case AV_PIX_FMT_YUV420P: + width = comp_id ? in->width / 2 : in->width; + /* fall-through */ + case AV_PIX_FMT_NV12: + height = comp_id ? in->height / 2 : in->height; + break; + case AV_PIX_FMT_P016LE: + height = comp_id ? in->height / 2 : in->height; + break; + default: + av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n", + av_get_pix_fmt_name(s->pix_fmt)); + return AVERROR(EINVAL); + } + + if (s->range != AVCOL_RANGE_UNSPECIFIED + && in->color_range != out->color_range) { + CUfunction *convert_funcs = format_is_16bit(s->pix_fmt) ? 
+ s->cu_convert_16 : s->cu_convert; + if (!convert_funcs[out->color_range]) { + av_log(ctx, AV_LOG_ERROR, "Unsupported color range\n"); + return AVERROR(EINVAL); + } + + void *args[] = + { &in->data[i], &out->data[i], &in->linesize[i], + &comp_id }; + ret = + CHECK_CU(cu-> + cuLaunchKernel(convert_funcs[out->color_range], + DIV_UP(width, BLOCKX), + DIV_UP(height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, + s->cu_stream, args, NULL)); + if (ret < 0) + goto fail; + } else { + ret = av_hwframe_transfer_data(out, in, 0); + if (ret < 0) + goto fail; + } } } + fail: CHECK_CU(cu->cuCtxPopCurrent(&dummy)); return ret; } -static int cudacolorspace_conv(AVFilterContext* ctx, AVFrame* out, AVFrame* in) +static int cudacolorspace_conv(AVFilterContext * ctx, AVFrame * out, + AVFrame * in) { - CUDAColorspaceContext* s = ctx->priv; - AVFilterLink* outlink = ctx->outputs[0]; - AVFrame* src = in; + CUDAColorspaceContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + AVFrame *src = in; int ret; ret = conv_cuda_convert(ctx, s->own_frame, src); @@ -341,20 +715,210 @@ static int cudacolorspace_conv(AVFilterContext* ctx, AVFrame* out, AVFrame* in) if (ret < 0) return ret; + // Set output frame properties AFTER av_frame_copy_props to avoid overwriting + out->color_range = + s->range != AVCOL_RANGE_UNSPECIFIED ? 
s->range : in->color_range; + out->colorspace = s->out_csp; + out->color_primaries = s->out_prm; + out->color_trc = s->out_trc; + return 0; } -static int cudacolorspace_filter_frame(AVFilterLink* link, AVFrame* in) + +static int cudacolorspace_activate(AVFilterContext * ctx) +{ + AVFilterLink *inlink = ctx->inputs[0]; + AVFilterLink *outlink = ctx->outputs[0]; + CUDAColorspaceContext *s = ctx->priv; + AVFrame *in = NULL; + int ret, status; + int64_t pts; + + FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink); + + if (s->async_depth > 1 || s->async_streams > 1) { + // Try to receive completed frames first to avoid memory buildup + AVFrame *out = NULL; + ret = ff_cuda_async_queue_receive(&s->async_queue, &out); + if (ret == 0 && out) { + av_reduce(&out->sample_aspect_ratio.num, + &out->sample_aspect_ratio.den, + (int64_t) out->sample_aspect_ratio.num * outlink->h * + inlink->w, + (int64_t) out->sample_aspect_ratio.den * outlink->w * + inlink->h, INT_MAX); + return ff_filter_frame(outlink, out); + } else if (ret < 0 && ret != AVERROR(EAGAIN)) { + return ret; + } + + // Only submit new frames if there's space and no completed frames ready + if (!ff_cuda_async_queue_is_full(&s->async_queue)) { + ret = ff_inlink_consume_frame(inlink, &in); + if (ret < 0) { + if (ret == AVERROR_EOF || ret == AVERROR(EAGAIN)) { + // No input available, but we might have frames in queue + return AVERROR(EAGAIN); + } + return ret; + } + + if (in) { + ret = ff_cuda_async_queue_submit(&s->async_queue, in); + av_frame_free(&in); + if (ret < 0) { + return ret; + } + av_log(ctx, AV_LOG_DEBUG, + "Submitted frame to async queue\n"); + } + } + } else { + // Synchronous processing + ret = ff_inlink_consume_frame(inlink, &in); + if (ret < 0) + return ret; + + if (in) { + ret = cudacolorspace_filter_frame(inlink, in); + if (ret < 0) + return ret; + } + } + + if (ff_inlink_acknowledge_status(inlink, &status, &pts)) { + if (s->async_depth > 1 || s->async_streams > 1) { + // Flush all remaining async frames + 
while (!ff_cuda_async_queue_is_empty(&s->async_queue)) { + AVFrame *out = NULL; + ret = ff_cuda_async_queue_receive(&s->async_queue, &out); + if (ret == 0 && out) { + av_reduce(&out->sample_aspect_ratio.num, + &out->sample_aspect_ratio.den, + (int64_t) out->sample_aspect_ratio.num * + outlink->h * inlink->w, + (int64_t) out->sample_aspect_ratio.den * + outlink->w * inlink->h, INT_MAX); + ret = ff_filter_frame(outlink, out); + if (ret < 0) + return ret; + } else if (ret < 0 && ret != AVERROR(EAGAIN)) { + av_log(ctx, AV_LOG_WARNING, + "Error receiving async frame during flush: %d\n", + ret); + break; + } + } + } + ff_outlink_set_status(outlink, status, pts); + return 0; + } + + FF_FILTER_FORWARD_WANTED(outlink, inlink); + + return FFERROR_NOT_READY; +} + +static int process_frame_async(void *filter_ctx, AVFrame * out, + AVFrame * in, CUstream stream) +{ + AVFilterContext *ctx = (AVFilterContext *) filter_ctx; + CUDAColorspaceContext *s = ctx->priv; + CUstream old_stream = s->cu_stream; + int ret; + + // Don't pre-allocate hardware buffer here - let cudacolorspace_conv handle it + // The previous code had a memory leak by allocating here then overwriting in conv + + // Set the stream and call the conv function (conv manages its own context) + s->cu_stream = stream; + ret = cudacolorspace_conv(ctx, out, in); + s->cu_stream = old_stream; + + if (ret >= 0) { + ret = av_frame_copy_props(out, in); + if (ret >= 0) { + // Set output frame properties AFTER av_frame_copy_props to avoid overwriting + out->color_range = + s->range != AVCOL_RANGE_UNSPECIFIED ? 
s->range : in->color_range; + out->colorspace = s->out_csp; + out->color_primaries = s->out_prm; + out->color_trc = s->out_trc; + } + } + + return ret; +} + +static int cudacolorspace_filter_frame(AVFilterLink * link, AVFrame * in) { - AVFilterContext* ctx = link->dst; - CUDAColorspaceContext* s = ctx->priv; - AVFilterLink* outlink = ctx->outputs[0]; - CudaFunctions* cu = s->hwctx->internal->cuda_dl; + AVFilterContext *ctx = link->dst; + CUDAColorspaceContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; - AVFrame* out = NULL; + AVFrame *out = NULL; CUcontext dummy; int ret = 0; + if (s->async_depth > 1 || s->async_streams > 1) { + ret = ff_cuda_async_queue_submit(&s->async_queue, in); + if (ret == AVERROR(EAGAIN)) { + AVFrame *completed_frame = NULL; + ret = + ff_cuda_async_queue_receive(&s->async_queue, + &completed_frame); + if (ret < 0 && ret != AVERROR(EAGAIN)) { + av_frame_free(&in); + return ret; + } + if (completed_frame) { + av_reduce(&completed_frame->sample_aspect_ratio.num, + &completed_frame->sample_aspect_ratio.den, + (int64_t) in->sample_aspect_ratio.num * + outlink->h * link->w, + (int64_t) in->sample_aspect_ratio.den * + outlink->w * link->h, INT_MAX); + ret = ff_filter_frame(outlink, completed_frame); + if (ret < 0) { + av_frame_free(&in); + return ret; + } + } + ret = ff_cuda_async_queue_submit(&s->async_queue, in); + } + if (ret < 0) { + av_frame_free(&in); + return ret; + } + + while (ff_inlink_queued_frames(link) == 0) { + AVFrame *completed_frame = NULL; + ret = + ff_cuda_async_queue_receive(&s->async_queue, + &completed_frame); + if (ret == AVERROR(EAGAIN)) { + break; + } else if (ret < 0) { + return ret; + } + if (completed_frame) { + av_reduce(&completed_frame->sample_aspect_ratio.num, + &completed_frame->sample_aspect_ratio.den, + (int64_t) in->sample_aspect_ratio.num * + outlink->h * link->w, + (int64_t) in->sample_aspect_ratio.den * + outlink->w * link->h, INT_MAX); + ret = 
ff_filter_frame(outlink, completed_frame); + if (ret < 0) + return ret; + } + } + + return 0; + } + out = av_frame_alloc(); if (!out) { ret = AVERROR(ENOMEM); @@ -372,13 +936,13 @@ static int cudacolorspace_filter_frame(AVFilterLink* link, AVFrame* in) goto fail; av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den, - (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w, - (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h, + (int64_t) in->sample_aspect_ratio.num * outlink->h * link->w, + (int64_t) in->sample_aspect_ratio.den * outlink->w * link->h, INT_MAX); av_frame_free(&in); return ff_filter_frame(outlink, out); -fail: + fail: av_frame_free(&in); av_frame_free(&out); return ret; @@ -387,12 +951,70 @@ static int cudacolorspace_filter_frame(AVFilterLink* link, AVFrame* in) #define OFFSET(x) offsetof(CUDAColorspaceContext, x) #define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM) static const AVOption options[] = { - {"range", "Output video range", OFFSET(range), AV_OPT_TYPE_INT, { .i64 = AVCOL_RANGE_UNSPECIFIED }, AVCOL_RANGE_UNSPECIFIED, AVCOL_RANGE_NB - 1, FLAGS, .unit = "range"}, - {"tv", "Limited range", 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, .unit = "range"}, - {"mpeg", "Limited range", 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, .unit = "range"}, - {"pc", "Full range", 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, .unit = "range"}, - {"jpeg", "Full range", 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, .unit = "range"}, - {NULL}, + { "range", "Output video range", OFFSET(range), AV_OPT_TYPE_INT, + {.i64 = + AVCOL_RANGE_UNSPECIFIED}, AVCOL_RANGE_UNSPECIFIED, + AVCOL_RANGE_NB - 1, FLAGS,.unit = "range" }, + { "tv", "Limited range", 0, AV_OPT_TYPE_CONST, + {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS,.unit = "range" }, + { "mpeg", "Limited range", 0, AV_OPT_TYPE_CONST, + {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS,.unit = "range" }, + { "pc", 
"Full range", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG}, + 0, 0, FLAGS,.unit = "range" }, + { "jpeg", "Full range", 0, AV_OPT_TYPE_CONST, + {.i64 = AVCOL_RANGE_JPEG}, 0, 0, FLAGS,.unit = "range" }, + + { "space", "Output colorspace", OFFSET(user_csp), AV_OPT_TYPE_INT, + {.i64 = + AVCOL_SPC_UNSPECIFIED}, AVCOL_SPC_RGB, AVCOL_SPC_NB - 1, + FLAGS,.unit = "csp" }, + { "bt709", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709}, 0, 0, + FLAGS,.unit = "csp" }, + { "bt470bg", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT470BG}, 0, + 0, FLAGS,.unit = "csp" }, + { "smpte170m", NULL, 0, AV_OPT_TYPE_CONST, + {.i64 = AVCOL_SPC_SMPTE170M}, 0, 0, FLAGS,.unit = "csp" }, + { "bt2020nc", NULL, 0, AV_OPT_TYPE_CONST, + {.i64 = AVCOL_SPC_BT2020_NCL}, 0, 0, FLAGS,.unit = "csp" }, + { "bt2020ncl", NULL, 0, AV_OPT_TYPE_CONST, + {.i64 = AVCOL_SPC_BT2020_NCL}, 0, 0, FLAGS,.unit = "csp" }, + { "bt601", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_SMPTE170M}, 0, + 0, FLAGS,.unit = "csp" }, + + { "primaries", "Output color primaries", OFFSET(user_prm), + AV_OPT_TYPE_INT, {.i64 = + AVCOL_PRI_UNSPECIFIED}, AVCOL_PRI_RESERVED0, + AVCOL_PRI_NB - 1, FLAGS,.unit = "prm" }, + { "bt709", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709}, 0, 0, + FLAGS,.unit = "prm" }, + { "bt470bg", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT470BG}, 0, + 0, FLAGS,.unit = "prm" }, + { "smpte170m", NULL, 0, AV_OPT_TYPE_CONST, + {.i64 = AVCOL_PRI_SMPTE170M}, 0, 0, FLAGS,.unit = "prm" }, + { "bt2020", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020}, 0, + 0, FLAGS,.unit = "prm" }, + + { "trc", "Output transfer characteristics", OFFSET(user_trc), + AV_OPT_TYPE_INT, {.i64 = + AVCOL_TRC_UNSPECIFIED}, AVCOL_TRC_RESERVED0, + AVCOL_TRC_NB - 1, FLAGS,.unit = "trc" }, + { "bt709", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709}, 0, 0, + FLAGS,.unit = "trc" }, + { "bt2020-10", NULL, 0, AV_OPT_TYPE_CONST, + {.i64 = AVCOL_TRC_BT2020_10}, 0, 0, FLAGS,.unit = "trc" }, + { "smpte170m", NULL, 0, 
AV_OPT_TYPE_CONST, + {.i64 = AVCOL_TRC_SMPTE170M}, 0, 0, FLAGS,.unit = "trc" }, + + { "async_depth", "Frame queue depth for async processing", + OFFSET(async_depth), AV_OPT_TYPE_INT, {.i64 = + 1}, 1, MAX_FRAME_QUEUE_SIZE, + FLAGS }, + { "async_streams", "Number of CUDA streams for async processing", + OFFSET(async_streams), AV_OPT_TYPE_INT, {.i64 = + 1}, 1, MAX_CUDA_STREAMS, + FLAGS }, + + { NULL }, }; static const AVClass cudacolorspace_class = { @@ -404,18 +1026,17 @@ static const AVClass cudacolorspace_class = { static const AVFilterPad cudacolorspace_inputs[] = { { - .name = "default", - .type = AVMEDIA_TYPE_VIDEO, - .filter_frame = cudacolorspace_filter_frame, - }, + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + }, }; static const AVFilterPad cudacolorspace_outputs[] = { { - .name = "default", - .type = AVMEDIA_TYPE_VIDEO, - .config_props = cudacolorspace_config_props, - }, + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = cudacolorspace_config_props, + }, }; const AVFilter ff_vf_colorspace_cuda = { @@ -424,6 +1045,7 @@ const AVFilter ff_vf_colorspace_cuda = { .init = cudacolorspace_init, .uninit = cudacolorspace_uninit, + .activate = cudacolorspace_activate, .priv_size = sizeof(CUDAColorspaceContext), .priv_class = &cudacolorspace_class, diff --git a/libavfilter/vf_colorspace_cuda.cu b/libavfilter/vf_colorspace_cuda.cu index 59db9f69f3c..16860742f69 100644 --- a/libavfilter/vf_colorspace_cuda.cu +++ b/libavfilter/vf_colorspace_cuda.cu @@ -31,12 +31,29 @@ extern "C" { #define JPEG_LUMA_MAX (255) #define JPEG_CHROMA_MAX (255) +// 16-bit constants (10-bit values shifted to 16-bit) +#define MPEG_LUMA_MIN_16 (16 << 6) +#define MPEG_CHROMA_MIN_16 (16 << 6) +#define MPEG_LUMA_MAX_16 (235 << 6) +#define MPEG_CHROMA_MAX_16 (240 << 6) + +#define JPEG_LUMA_MIN_16 (0) +#define JPEG_CHROMA_MIN_16 (1 << 6) +#define JPEG_LUMA_MAX_16 (65535) +#define JPEG_CHROMA_MAX_16 (65535) + __device__ int mpeg_min[] = {MPEG_LUMA_MIN, MPEG_CHROMA_MIN}; __device__ 
int mpeg_max[] = {MPEG_LUMA_MAX, MPEG_CHROMA_MAX}; __device__ int jpeg_min[] = {JPEG_LUMA_MIN, JPEG_CHROMA_MIN}; __device__ int jpeg_max[] = {JPEG_LUMA_MAX, JPEG_CHROMA_MAX}; +__device__ int mpeg_min_16[] = {MPEG_LUMA_MIN_16, MPEG_CHROMA_MIN_16}; +__device__ int mpeg_max_16[] = {MPEG_LUMA_MAX_16, MPEG_CHROMA_MAX_16}; + +__device__ int jpeg_min_16[] = {JPEG_LUMA_MIN_16, JPEG_CHROMA_MIN_16}; +__device__ int jpeg_max_16[] = {JPEG_LUMA_MAX_16, JPEG_CHROMA_MAX_16}; + __device__ int clamp(int val, int min, int max) { if (val < min) @@ -91,4 +108,249 @@ __global__ void to_mpeg_cuda(const unsigned char* src, unsigned char* dst, dst[x + y * pitch] = static_cast(dst_); } +__global__ void to_jpeg_cuda_16(const unsigned short* src, unsigned short* dst, + int pitch, int comp_id) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int pitch_pixels = pitch / 2; // Convert byte pitch to pixel pitch + int src_, dst_; + + // 10-bit input in 16-bit container -> scale for processing + src_ = static_cast(src[x + y * pitch_pixels]); + + // Scale to full 16-bit range for conversion + dst_ = comp_id ? (src_ * 65535 - 64 * 65535) / (1023 - 64) // chroma + : (src_ * 65535 - 64 * 65535) / (940 - 64); // luma + + dst[x + y * pitch_pixels] = static_cast(clamp(dst_, jpeg_min_16[comp_id], jpeg_max_16[comp_id])); +} + +__global__ void to_mpeg_cuda_16(const unsigned short* src, unsigned short* dst, + int pitch, int comp_id) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int pitch_pixels = pitch / 2; // Convert byte pitch to pixel pitch + int src_, dst_; + + // 10-bit input in 16-bit container -> scale to limited range + src_ = static_cast(src[x + y * pitch_pixels]); + + // Scale from full range to limited range + dst_ = comp_id ? 
(src_ * (240 - 16) / 65535) + 16 // chroma + : (src_ * (235 - 16) / 65535) + 16; // luma + + // Scale to 16-bit container (10-bit values) + dst_ = dst_ << 6; + + dst[x + y * pitch_pixels] = static_cast(clamp(dst_, mpeg_min_16[comp_id], mpeg_max_16[comp_id])); +} + +__global__ void colorspace_convert_cuda(const unsigned char* src, unsigned char* dst, + int src_pitch, int dst_pitch, int width, int height, int plane_id, + float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= width || y >= height) + return; + + if (plane_id == 0) { + // Luma plane (Y) - apply only first row of matrix + float src_y = (float)src[x + y * src_pitch]; + float dst_y = m00 * src_y + m01 * 128.0f + m02 * 128.0f; + dst[x + y * dst_pitch] = (unsigned char)clamp((int)(dst_y + 0.5f), 0, 255); + } else if (plane_id == 1) { + // U plane - apply second row of matrix, but we only have U component + float src_u = (float)src[x + y * src_pitch] - 128.0f; + // For U plane conversion, we apply the matrix considering U input and zero V + float dst_u = m10 * 128.0f + m11 * (src_u + 128.0f) + m12 * 128.0f; + dst[x + y * dst_pitch] = (unsigned char)clamp((int)(dst_u + 0.5f), 0, 255); + } else { + // V plane - apply third row of matrix, but we only have V component + float src_v = (float)src[x + y * src_pitch] - 128.0f; + // For V plane conversion, we apply the matrix considering zero U input and V + float dst_v = m20 * 128.0f + m21 * 128.0f + m22 * (src_v + 128.0f); + dst[x + y * dst_pitch] = (unsigned char)clamp((int)(dst_v + 0.5f), 0, 255); + } +} + +__global__ void colorspace_convert_cuda_16(const unsigned short* src, unsigned short* dst, + int src_pitch, int dst_pitch, int width, int height, int plane_id, + float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22) +{ + int x = blockIdx.x * blockDim.x + 
threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int src_pitch_pixels = src_pitch / 2; // convert byte pitch to pixel pitch
    int dst_pitch_pixels = dst_pitch / 2; // convert byte pitch to pixel pitch

    if (x >= width || y >= height)
        return;

    // For 10-bit samples in a 16-bit container, the chroma midpoint is 512 << 6 = 32768.
    const float chroma_center = 32768.0f;

    // NOTE(review): this kernel applies one matrix row per plane in isolation;
    // the cross-component terms are fed the chroma midpoint instead of the
    // actual co-sited U/V samples, so it is only exact for (near-)diagonal
    // matrices. The planar/NV12 kernels below handle the general case.
    if (plane_id == 0) {
        // Luma plane (Y): first matrix row.
        float src_y = (float)src[x + y * src_pitch_pixels];
        float dst_y = m00 * src_y + m01 * chroma_center + m02 * chroma_center;
        dst[x + y * dst_pitch_pixels] = (unsigned short)clamp((int)(dst_y + 0.5f), 0, 65535);
    } else if (plane_id == 1) {
        // U plane: second matrix row; V is unknown here, midpoint assumed.
        float src_u = (float)src[x + y * src_pitch_pixels] - chroma_center;
        float dst_u = m10 * chroma_center + m11 * (src_u + chroma_center) + m12 * chroma_center;
        dst[x + y * dst_pitch_pixels] = (unsigned short)clamp((int)(dst_u + 0.5f), 0, 65535);
    } else {
        // V plane: third matrix row; U is unknown here, midpoint assumed.
        float src_v = (float)src[x + y * src_pitch_pixels] - chroma_center;
        float dst_v = m20 * chroma_center + m21 * chroma_center + m22 * (src_v + chroma_center);
        dst[x + y * dst_pitch_pixels] = (unsigned short)clamp((int)(dst_v + 0.5f), 0, 65535);
    }
}

// Full 3x3 YUV->YUV conversion for 8-bit 4:2:0 planar input. All three
// components are read for every luma position, so the cross-component
// matrix terms use the real co-sited chroma samples.
__global__ void colorspace_convert_yuv_cuda(const unsigned char* src_y, const unsigned char* src_u, const unsigned char* src_v,
                                            unsigned char* dst_y, unsigned char* dst_u, unsigned char* dst_v,
                                            int src_pitch_y, int src_pitch_uv, int dst_pitch_y, int dst_pitch_uv,
                                            int width, int height,
                                            float m00, float m01, float m02,
                                            float m10, float m11, float m12,
                                            float m20, float m21, float m22)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height)
        return;

    float src_Y = (float)src_y[x + y * src_pitch_y];

    // 4:2:0 subsampling: one chroma sample covers a 2x2 luma block.
    int chroma_x = x / 2;
    int chroma_y = y / 2;

    float src_U = (float)src_u[chroma_x + chroma_y * src_pitch_uv] - 128.0f;
    float src_V = (float)src_v[chroma_x + chroma_y * src_pitch_uv] - 128.0f;

    // Apply the 3x3 matrix to the raw (unshifted) Y/U/V triple.
    float dst_Y = m00 * src_Y + m01 * (src_U + 128.0f) + m02 * (src_V + 128.0f);
    float dst_U = m10 * src_Y + m11 * (src_U + 128.0f) + m12 * (src_V + 128.0f);
    float dst_V = m20 * src_Y + m21 * (src_U + 128.0f) + m22 * (src_V + 128.0f);

    dst_y[x + y * dst_pitch_y] = (unsigned char)clamp((int)(dst_Y + 0.5f), 0, 255);

    // Chroma is stored once per 2x2 block; the top-left thread writes it.
    if (x % 2 == 0 && y % 2 == 0) {
        dst_u[chroma_x + chroma_y * dst_pitch_uv] = (unsigned char)clamp((int)(dst_U + 0.5f), 0, 255);
        dst_v[chroma_x + chroma_y * dst_pitch_uv] = (unsigned char)clamp((int)(dst_V + 0.5f), 0, 255);
    }
}

// 16-bit variant of the planar YUV conversion (10-bit samples stored MSB-
// aligned in a 16-bit container, hence the 32768 chroma midpoint).
__global__ void colorspace_convert_yuv_cuda_16(const unsigned short* src_y, const unsigned short* src_u, const unsigned short* src_v,
                                               unsigned short* dst_y, unsigned short* dst_u, unsigned short* dst_v,
                                               int src_pitch_y, int src_pitch_uv, int dst_pitch_y, int dst_pitch_uv,
                                               int width, int height,
                                               float m00, float m01, float m02,
                                               float m10, float m11, float m12,
                                               float m20, float m21, float m22)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height)
        return;

    // Convert byte pitches to 16-bit pixel pitches.
    int src_pitch_y_pixels  = src_pitch_y / 2;
    int src_pitch_uv_pixels = src_pitch_uv / 2;
    int dst_pitch_y_pixels  = dst_pitch_y / 2;
    int dst_pitch_uv_pixels = dst_pitch_uv / 2;

    // For 10-bit in a 16-bit container, the chroma midpoint is 512 << 6 = 32768.
    const float chroma_center = 32768.0f;

    float src_Y = (float)src_y[x + y * src_pitch_y_pixels];

    // 4:2:0 subsampling: one chroma sample covers a 2x2 luma block.
    int chroma_x = x / 2;
    int chroma_y = y / 2;

    float src_U = (float)src_u[chroma_x + chroma_y * src_pitch_uv_pixels] - chroma_center;
    float src_V = (float)src_v[chroma_x + chroma_y * src_pitch_uv_pixels] - chroma_center;

    // Apply the 3x3 matrix to the raw (unshifted) Y/U/V triple.
    float dst_Y = m00 * src_Y + m01 * (src_U + chroma_center) + m02 * (src_V + chroma_center);
    float dst_U = m10 * src_Y + m11 * (src_U + chroma_center) + m12 * (src_V + chroma_center);
    float dst_V = m20 * src_Y + m21 * (src_U + chroma_center) + m22 * (src_V + chroma_center);

    dst_y[x + y * dst_pitch_y_pixels] = (unsigned short)clamp((int)(dst_Y + 0.5f), 0, 65535);

    // Chroma is stored once per 2x2 block; the top-left thread writes it.
    if (x % 2 == 0 && y % 2 == 0) {
        dst_u[chroma_x + chroma_y * dst_pitch_uv_pixels] = (unsigned short)clamp((int)(dst_U + 0.5f), 0, 65535);
        dst_v[chroma_x + chroma_y * dst_pitch_uv_pixels] = (unsigned short)clamp((int)(dst_V + 0.5f), 0, 65535);
    }
}

// NV12 variant: Y plane + interleaved UV plane (UVUVUV...).
__global__ void colorspace_convert_nv12_cuda(const unsigned char* src_y, const unsigned char* src_uv,
                                             unsigned char* dst_y, unsigned char* dst_uv,
                                             int src_pitch_y, int src_pitch_uv, int dst_pitch_y, int dst_pitch_uv,
                                             int width, int height,
                                             float m00, float m01, float m02,
                                             float m10, float m11, float m12,
                                             float m20, float m21, float m22)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height)
        return;

    float src_Y = (float)src_y[x + y * src_pitch_y];

    // NV12 chroma planes are ceil(width/2) x ceil(height/2), so x/2 and y/2
    // are always valid chroma coordinates for any in-bounds luma position.
    // (An earlier bounds check against width/2 returned early here, which
    // silently dropped the last Y column/row for odd dimensions.)
    int chroma_x = x / 2;
    int chroma_y = y / 2;

    int uv_idx = chroma_x * 2 + chroma_y * src_pitch_uv; // interleaved UVUVUV...

    float src_U = (float)src_uv[uv_idx] - 128.0f;
    float src_V = (float)src_uv[uv_idx + 1] - 128.0f;

    // Apply the 3x3 matrix to the raw (unshifted) Y/U/V triple.
    float dst_Y = m00 * src_Y + m01 * (src_U + 128.0f) + m02 * (src_V + 128.0f);
    float dst_U = m10 * src_Y + m11 * (src_U + 128.0f) + m12 * (src_V + 128.0f);
    float dst_V = m20 * src_Y + m21 * (src_U + 128.0f) + m22 * (src_V + 128.0f);

    dst_y[x + y * dst_pitch_y] = (unsigned char)clamp((int)(dst_Y + 0.5f), 0, 255);

    // Chroma is stored once per 2x2 block; the top-left thread writes it.
    if (x % 2 == 0 && y % 2 == 0) {
        int dst_uv_idx = chroma_x * 2 + chroma_y * dst_pitch_uv;
        dst_uv[dst_uv_idx]     = (unsigned char)clamp((int)(dst_U + 0.5f), 0, 255);
        dst_uv[dst_uv_idx + 1] = (unsigned char)clamp((int)(dst_V + 0.5f), 0, 255);
    }
}

}

diff --git a/libavfilter/vf_lut3d_cuda.c b/libavfilter/vf_lut3d_cuda.c
new file mode 100644
index 00000000000..d92c5a5a0ee
--- /dev/null
+++ b/libavfilter/vf_lut3d_cuda.c
@@ -0,0 +1,660 @@
/*
 * This file is part of FFmpeg.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +#include "libavutil/common.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/cuda_check.h" +#include "libavutil/internal.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/mem.h" +#include "libavutil/file_open.h" +#include "libavutil/intfloat.h" +#include "libavutil/avassert.h" +#include "libavutil/avstring.h" + +#include "avfilter.h" +#include "filters.h" +#include "video.h" +#include "lut3d.h" + +#include "cuda/load_helper.h" +#include "vf_lut3d_cuda.h" + +static const enum AVPixelFormat supported_formats[] = { + AV_PIX_FMT_0RGB32, + AV_PIX_FMT_0BGR32, + AV_PIX_FMT_RGB32, + AV_PIX_FMT_BGR32, + AV_PIX_FMT_RGB48, + AV_PIX_FMT_BGR48, + AV_PIX_FMT_RGBA64, + AV_PIX_FMT_BGRA64, +}; + +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) +#define BLOCKX 32 +#define BLOCKY 16 + +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) + +typedef struct LUT3DCudaContext { + const AVClass *class; + + AVCUDADeviceContext *hwctx; + + enum AVPixelFormat in_fmt, out_fmt; + const AVPixFmtDescriptor *in_desc, *out_desc; + + AVBufferRef *frames_ctx; + AVFrame *frame; + AVFrame *tmp_frame; + + CUcontext cu_ctx; + CUmodule cu_module; + CUfunction cu_func; + CUstream cu_stream; + CUdeviceptr cu_lut; + + struct rgbvec *lut; + int lutsize; + int lutsize2; + struct rgbvec scale; + int interpolation; + char *file; + + int bit_depth; + int passthrough; +} LUT3DCudaContext; + 
+static int format_is_supported(enum AVPixelFormat fmt) +{ + int i; + + for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) + if (supported_formats[i] == fmt) + return 1; + return 0; +} + +static int get_bit_depth(enum AVPixelFormat fmt) +{ + switch (fmt) { + case AV_PIX_FMT_0RGB32: + case AV_PIX_FMT_0BGR32: + case AV_PIX_FMT_RGB32: + case AV_PIX_FMT_BGR32: + return 8; + case AV_PIX_FMT_RGB48: + case AV_PIX_FMT_BGR48: + return 12; + case AV_PIX_FMT_RGBA64: + case AV_PIX_FMT_BGRA64: + return 10; + default: + return 8; + } +} + +static av_cold int lut3d_cuda_init(AVFilterContext * ctx) +{ + LUT3DCudaContext *s = ctx->priv; + + s->frame = av_frame_alloc(); + if (!s->frame) + return AVERROR(ENOMEM); + + s->tmp_frame = av_frame_alloc(); + if (!s->tmp_frame) + return AVERROR(ENOMEM); + + return 0; +} + +static av_cold void lut3d_cuda_uninit(AVFilterContext * ctx) +{ + LUT3DCudaContext *s = ctx->priv; + + if (s->hwctx && s->cu_module) { + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext dummy; + + CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (s->cu_lut) { + CHECK_CU(cu->cuMemFree(s->cu_lut)); + s->cu_lut = 0; + } + CHECK_CU(cu->cuModuleUnload(s->cu_module)); + s->cu_module = NULL; + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + } + + av_freep(&s->lut); + av_frame_free(&s->frame); + av_buffer_unref(&s->frames_ctx); + av_frame_free(&s->tmp_frame); +} + +static av_cold int init_hwframe_ctx(LUT3DCudaContext * s, + AVBufferRef * device_ctx, int width, + int height) +{ + AVBufferRef *out_ref = NULL; + AVHWFramesContext *out_ctx; + int ret; + + out_ref = av_hwframe_ctx_alloc(device_ctx); + if (!out_ref) + return AVERROR(ENOMEM); + out_ctx = (AVHWFramesContext *) out_ref->data; + + out_ctx->format = AV_PIX_FMT_CUDA; + out_ctx->sw_format = s->out_fmt; + out_ctx->width = FFALIGN(width, 32); + out_ctx->height = FFALIGN(height, 32); + + ret = av_hwframe_ctx_init(out_ref); + if (ret < 0) + goto fail; + + av_frame_unref(s->frame); + ret = 
av_hwframe_get_buffer(out_ref, s->frame, 0); + if (ret < 0) + goto fail; + + s->frame->width = width; + s->frame->height = height; + + av_buffer_unref(&s->frames_ctx); + s->frames_ctx = out_ref; + + return 0; + fail: + av_buffer_unref(&out_ref); + return ret; +} + +#define MAX_LINE_SIZE 512 + +static int skip_line(const char *p) +{ + while (*p && av_isspace(*p)) + p++; + return !*p || *p == '#'; +} + +#define NEXT_LINE(loop_cond) do { \ + if (!fgets(line, sizeof(line), f)) { \ + av_log(ctx, AV_LOG_ERROR, "Unexpected EOF\n"); \ + return AVERROR_INVALIDDATA; \ + } \ +} while (loop_cond) + +static int parse_cube(AVFilterContext * ctx, FILE * f) +{ + LUT3DCudaContext *lut3d = ctx->priv; + char line[MAX_LINE_SIZE]; + float min[3] = { 0.0, 0.0, 0.0 }; + float max[3] = { 1.0, 1.0, 1.0 }; + int size = 0, size2, i, j, k; + + while (fgets(line, sizeof(line), f)) { + if (!strncmp(line, "LUT_3D_SIZE", 11)) { + size = strtol(line + 12, NULL, 0); + if (size < 2 || size > MAX_LEVEL) { + av_log(ctx, AV_LOG_ERROR, + "Too large or invalid 3D LUT size\n"); + return AVERROR(EINVAL); + } + lut3d->lutsize = size; + lut3d->lutsize2 = size * size; + break; + } + } + + if (!size) { + av_log(ctx, AV_LOG_ERROR, "3D LUT size not found\n"); + return AVERROR(EINVAL); + } + + if (! 
+ (lut3d->lut = + av_malloc_array(size * size * size, sizeof(*lut3d->lut)))) + return AVERROR(ENOMEM); + + size2 = size * size; + rewind(f); + + for (i = 0; i < size; i++) { + for (j = 0; j < size; j++) { + for (k = 0; k < size; k++) { + struct rgbvec *vec = &lut3d->lut[i * size2 + j * size + k]; + + do { + try_again: + NEXT_LINE(0); + if (!strncmp(line, "DOMAIN_", 7)) { + float *vals = NULL; + if (!strncmp(line + 7, "MIN ", 4)) + vals = min; + else if (!strncmp(line + 7, "MAX ", 4)) + vals = max; + if (!vals) + return AVERROR_INVALIDDATA; + if (av_sscanf + (line + 11, "%f %f %f", vals, vals + 1, + vals + 2) != 3) + return AVERROR_INVALIDDATA; + av_log(ctx, AV_LOG_DEBUG, + "min: %f %f %f | max: %f %f %f\n", min[0], + min[1], min[2], max[0], max[1], max[2]); + goto try_again; + } else if (!strncmp(line, "TITLE", 5)) { + goto try_again; + } + } while (skip_line(line)); + if (av_sscanf(line, "%f %f %f", &vec->r, &vec->g, &vec->b) + != 3) + return AVERROR_INVALIDDATA; + vec->r = av_clipf(vec->r, 0.f, 1.f); + vec->g = av_clipf(vec->g, 0.f, 1.f); + vec->b = av_clipf(vec->b, 0.f, 1.f); + } + } + } + + lut3d->scale.r = + av_clipf(1. / (max[0] - min[0]), 0.f, 1.f) * (lut3d->lutsize - 1); + lut3d->scale.g = + av_clipf(1. / (max[1] - min[1]), 0.f, 1.f) * (lut3d->lutsize - 1); + lut3d->scale.b = + av_clipf(1. 
/ (max[2] - min[2]), 0.f, 1.f) * (lut3d->lutsize - 1); + + return 0; +} + +static av_cold int lut3d_cuda_load_cube(AVFilterContext * ctx, + const char *fname) +{ + FILE *f; + int ret; + + f = avpriv_fopen_utf8(fname, "r"); + if (!f) { + ret = AVERROR_INVALIDDATA; + av_log(ctx, AV_LOG_ERROR, "Cannot read file '%s': %s\n", fname, + av_err2str(ret)); + return ret; + } + + ret = parse_cube(ctx, f); + fclose(f); + return ret; +} + +static av_cold int lut3d_cuda_load_functions(AVFilterContext * ctx) +{ + LUT3DCudaContext *s = ctx->priv; + CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + char function_name[128]; + int ret; + + extern const unsigned char ff_vf_lut3d_cuda_ptx_data[]; + extern const unsigned int ff_vf_lut3d_cuda_ptx_len; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module, + ff_vf_lut3d_cuda_ptx_data, + ff_vf_lut3d_cuda_ptx_len); + if (ret < 0) + goto fail; + + switch (s->interpolation) { + case INTERPOLATE_NEAREST: + snprintf(function_name, sizeof(function_name), + "lut3d_interp_%d_nearest", s->bit_depth); + break; + case INTERPOLATE_TRILINEAR: + snprintf(function_name, sizeof(function_name), + "lut3d_interp_%d_trilinear", s->bit_depth); + break; + case INTERPOLATE_TETRAHEDRAL: + default: + snprintf(function_name, sizeof(function_name), + "lut3d_interp_%d_tetrahedral", s->bit_depth); + break; + } + + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_func, s->cu_module, + function_name)); + if (ret < 0) { + av_log(ctx, AV_LOG_FATAL, "Failed to load CUDA function %s\n", + function_name); + goto fail; + } + + ret = + CHECK_CU(cu-> + cuMemAlloc(&s->cu_lut, + s->lutsize * s->lutsize * s->lutsize * 3 * + sizeof(float))); + if (ret < 0) + goto fail; + + ret = + CHECK_CU(cu-> + cuMemcpyHtoD(s->cu_lut, s->lut, + s->lutsize * s->lutsize * s->lutsize * + sizeof(struct rgbvec))); + if (ret < 0) + goto fail; + + fail: + 
CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + + return ret; +} + +static av_cold int init_processing_chain(AVFilterContext * ctx, + int in_width, int in_height, + int out_width, int out_height) +{ + LUT3DCudaContext *s = ctx->priv; + FilterLink *inl = ff_filter_link(ctx->inputs[0]); + FilterLink *outl = ff_filter_link(ctx->outputs[0]); + + AVHWFramesContext *in_frames_ctx; + enum AVPixelFormat in_format; + enum AVPixelFormat out_format; + int ret; + + if (!inl->hw_frames_ctx) { + av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n"); + return AVERROR(EINVAL); + } + in_frames_ctx = (AVHWFramesContext *) inl->hw_frames_ctx->data; + in_format = in_frames_ctx->sw_format; + out_format = in_format; + + if (!format_is_supported(in_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n", + av_get_pix_fmt_name(in_format)); + return AVERROR(ENOSYS); + } + if (!format_is_supported(out_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", + av_get_pix_fmt_name(out_format)); + return AVERROR(ENOSYS); + } + + s->in_fmt = in_format; + s->out_fmt = out_format; + s->in_desc = av_pix_fmt_desc_get(s->in_fmt); + s->out_desc = av_pix_fmt_desc_get(s->out_fmt); + s->bit_depth = get_bit_depth(s->in_fmt); + + if (in_width == out_width && in_height == out_height + && in_format == out_format) { + s->frames_ctx = av_buffer_ref(inl->hw_frames_ctx); + if (!s->frames_ctx) + return AVERROR(ENOMEM); + } else { + s->passthrough = 0; + + ret = + init_hwframe_ctx(s, in_frames_ctx->device_ref, out_width, + out_height); + if (ret < 0) + return ret; + } + + outl->hw_frames_ctx = av_buffer_ref(s->frames_ctx); + if (!outl->hw_frames_ctx) + return AVERROR(ENOMEM); + + return 0; +} + +static av_cold int lut3d_cuda_config_props(AVFilterLink * outlink) +{ + AVFilterContext *ctx = outlink->src; + AVFilterLink *inlink = outlink->src->inputs[0]; + FilterLink *inl = ff_filter_link(inlink); + LUT3DCudaContext *s = ctx->priv; + AVHWFramesContext *frames_ctx; + 
AVCUDADeviceContext *device_hwctx; + int ret; + + if (s->file) { + ret = lut3d_cuda_load_cube(ctx, s->file); + if (ret < 0) + return ret; + } + + outlink->w = inlink->w; + outlink->h = inlink->h; + + ret = + init_processing_chain(ctx, inlink->w, inlink->h, outlink->w, + outlink->h); + if (ret < 0) + return ret; + + frames_ctx = (AVHWFramesContext *) inl->hw_frames_ctx->data; + device_hwctx = frames_ctx->device_ctx->hwctx; + + s->hwctx = device_hwctx; + s->cu_stream = s->hwctx->stream; + + av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d fmt:%s -> w:%d h:%d fmt:%s%s\n", + inlink->w, inlink->h, av_get_pix_fmt_name(s->in_fmt), + outlink->w, outlink->h, av_get_pix_fmt_name(s->out_fmt), + s->passthrough ? " (passthrough)" : ""); + + ret = lut3d_cuda_load_functions(ctx); + if (ret < 0) + return ret; + + return 0; +} + +static int call_lut3d_kernel(AVFilterContext * ctx, AVFrame * out, + AVFrame * in) +{ + LUT3DCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; + CUtexObject tex = 0; + int ret; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = + (s->bit_depth > + 8) ? 
CU_AD_FORMAT_UNSIGNED_INT16 : CU_AD_FORMAT_UNSIGNED_INT8, + .res.pitch2D.numChannels = 4, + .res.pitch2D.width = in->width, + .res.pitch2D.height = in->height, + .res.pitch2D.pitchInBytes = in->linesize[0], + .res.pitch2D.devPtr = (CUdeviceptr) in->data[0], + }; + + ret = + CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + + void *args[] = { + &tex, + &out->data[0], + &out->width, &out->height, &out->linesize[0], + &s->cu_lut, &s->lutsize, + &s->scale.r, &s->scale.g, &s->scale.b + }; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func, + DIV_UP(out->width, BLOCKX), + DIV_UP(out->height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, + args, NULL)); + + exit: + if (tex) + CHECK_CU(cu->cuTexObjectDestroy(tex)); + + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + + return ret; +} + +static int lut3d_cuda_filter_frame(AVFilterLink * link, AVFrame * in) +{ + AVFilterContext *ctx = link->dst; + LUT3DCudaContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + AVFrame *out = NULL; + CUcontext dummy; + int ret = 0; + + if (s->passthrough) + return ff_filter_frame(outlink, in); + + out = av_frame_alloc(); + if (!out) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) + goto fail; + + ret = av_hwframe_get_buffer(s->frames_ctx, out, 0); + if (ret < 0) + goto fail; + + ret = call_lut3d_kernel(ctx, out, in); + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + if (ret < 0) + goto fail; + + ret = av_frame_copy_props(out, in); + if (ret < 0) + goto fail; + + av_frame_free(&in); + return ff_filter_frame(outlink, out); + + fail: + av_frame_free(&in); + av_frame_free(&out); + return ret; +} + +#define OFFSET(x) offsetof(LUT3DCudaContext, x) +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM) +#define TFLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM) + +static const 
AVOption lut3d_cuda_options[] = { + { "file", "set 3D LUT file name", OFFSET(file), AV_OPT_TYPE_STRING, + {.str = NULL},.flags = FLAGS }, + { "interp", "select interpolation mode", OFFSET(interpolation), + AV_OPT_TYPE_INT, {.i64 = + INTERPOLATE_TETRAHEDRAL}, 0, NB_INTERP_MODE - 1, + TFLAGS,.unit = "interp_mode" }, + { "nearest", "use values from the nearest defined points", 0, + AV_OPT_TYPE_CONST, {.i64 = INTERPOLATE_NEAREST}, 0, 0, TFLAGS,.unit = + "interp_mode" }, + { "trilinear", "interpolate values using the 8 points defining a cube", + 0, AV_OPT_TYPE_CONST, {.i64 = + INTERPOLATE_TRILINEAR}, 0, 0, TFLAGS,.unit = + "interp_mode" }, + { "tetrahedral", "interpolate values using a tetrahedron", 0, + AV_OPT_TYPE_CONST, {.i64 = + INTERPOLATE_TETRAHEDRAL}, 0, 0, TFLAGS,.unit = + "interp_mode" }, + { "passthrough", "Do not process frames at all if parameters match", + OFFSET(passthrough), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS }, + { NULL }, +}; + +static const AVClass lut3d_cuda_class = { + .class_name = "lut3d_cuda", + .item_name = av_default_item_name, + .option = lut3d_cuda_options, + .version = LIBAVUTIL_VERSION_INT, +}; + +static const AVFilterPad lut3d_cuda_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = lut3d_cuda_filter_frame, + }, +}; + +static const AVFilterPad lut3d_cuda_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = lut3d_cuda_config_props, + }, +}; + +const AVFilter ff_vf_lut3d_cuda = { + .name = "lut3d_cuda", + .description = NULL_IF_CONFIG_SMALL("GPU accelerated 3D LUT filter"), + .priv_class = &lut3d_cuda_class, + + .init = lut3d_cuda_init, + .uninit = lut3d_cuda_uninit, + + .priv_size = sizeof(LUT3DCudaContext), + + FILTER_INPUTS(lut3d_cuda_inputs), + FILTER_OUTPUTS(lut3d_cuda_outputs), + + FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA), + + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; diff --git a/libavfilter/vf_lut3d_cuda.cu b/libavfilter/vf_lut3d_cuda.cu new file mode 
100644 index 00000000000..a142299e13d --- /dev/null +++ b/libavfilter/vf_lut3d_cuda.cu @@ -0,0 +1,423 @@ +/* + * This file is part of FFmpeg. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "cuda/vector_helpers.cuh" + +#define BLOCKX 32 +#define BLOCKY 16 + +extern "C" { + +__device__ static inline float3 lerpf3(float3 a, float3 b, float t) +{ + return make_float3( + a.x + (b.x - a.x) * t, + a.y + (b.y - a.y) * t, + a.z + (b.z - a.z) * t + ); +} + +__device__ static inline float3 interp_nearest_cuda(float3 *lut, int lutsize, float3 s) +{ + int r = (int)(s.x + 0.5f); + int g = (int)(s.y + 0.5f); + int b = (int)(s.z + 0.5f); + + r = min(max(r, 0), lutsize - 1); + g = min(max(g, 0), lutsize - 1); + b = min(max(b, 0), lutsize - 1); + + return lut[r * lutsize * lutsize + g * lutsize + b]; +} + +__device__ static inline float3 interp_trilinear_cuda(float3 *lut, int lutsize, float3 s) +{ + int lutsize2 = lutsize * lutsize; + + int prev_r = (int)s.x; + int prev_g = (int)s.y; + int prev_b = (int)s.z; + + int next_r = min(prev_r + 1, lutsize - 1); + int next_g = min(prev_g + 1, lutsize - 1); + int next_b = min(prev_b + 1, lutsize - 1); + + float3 d = make_float3(s.x - prev_r, s.y - prev_g, s.z - prev_b); + + float3 c000 = lut[prev_r * lutsize2 + prev_g * lutsize + prev_b]; + float3 c001 = lut[prev_r * lutsize2 + prev_g * lutsize + next_b]; + float3 c010 = lut[prev_r * lutsize2 + next_g * lutsize + prev_b]; + float3 c011 = lut[prev_r * lutsize2 + next_g * lutsize + next_b]; + float3 c100 = lut[next_r * lutsize2 + prev_g * lutsize + prev_b]; + float3 c101 = lut[next_r * lutsize2 + prev_g * lutsize + next_b]; + float3 c110 = lut[next_r * lutsize2 + next_g * lutsize + prev_b]; + float3 c111 = lut[next_r * lutsize2 + next_g * lutsize + next_b]; + + float3 c00 = lerpf3(c000, c100, d.x); + float3 c10 = lerpf3(c010, c110, d.x); + float3 c01 = lerpf3(c001, c101, d.x); + float3 c11 = lerpf3(c011, c111, d.x); + float3 c0 = lerpf3(c00, c10, d.y); + float3 c1 = lerpf3(c01, c11, d.y); + + return lerpf3(c0, c1, d.z); +} + +__device__ static inline float3 interp_tetrahedral_cuda(float3 *lut, int lutsize, float3 s) +{ + int lutsize2 = lutsize * lutsize; + + int 
prev_r = (int)s.x; + int prev_g = (int)s.y; + int prev_b = (int)s.z; + + int next_r = min(prev_r + 1, lutsize - 1); + int next_g = min(prev_g + 1, lutsize - 1); + int next_b = min(prev_b + 1, lutsize - 1); + + float3 d = make_float3(s.x - prev_r, s.y - prev_g, s.z - prev_b); + + float3 c000 = lut[prev_r * lutsize2 + prev_g * lutsize + prev_b]; + float3 c111 = lut[next_r * lutsize2 + next_g * lutsize + next_b]; + + float3 c; + if (d.x > d.y && d.x > d.z) { + if (d.y > d.z) { + float3 c100 = lut[next_r * lutsize2 + prev_g * lutsize + prev_b]; + float3 c110 = lut[next_r * lutsize2 + next_g * lutsize + prev_b]; + c = make_float3( + c000.x + d.x * (c100.x - c000.x) + d.y * (c110.x - c100.x) + d.z * (c111.x - c110.x), + c000.y + d.x * (c100.y - c000.y) + d.y * (c110.y - c100.y) + d.z * (c111.y - c110.y), + c000.z + d.x * (c100.z - c000.z) + d.y * (c110.z - c100.z) + d.z * (c111.z - c110.z) + ); + } else { + float3 c100 = lut[next_r * lutsize2 + prev_g * lutsize + prev_b]; + float3 c101 = lut[next_r * lutsize2 + prev_g * lutsize + next_b]; + c = make_float3( + c000.x + d.x * (c100.x - c000.x) + d.z * (c101.x - c100.x) + d.y * (c111.x - c101.x), + c000.y + d.x * (c100.y - c000.y) + d.z * (c101.y - c100.y) + d.y * (c111.y - c101.y), + c000.z + d.x * (c100.z - c000.z) + d.z * (c101.z - c100.z) + d.y * (c111.z - c101.z) + ); + } + } else if (d.y > d.z) { + if (d.x > d.z) { + float3 c010 = lut[prev_r * lutsize2 + next_g * lutsize + prev_b]; + float3 c110 = lut[next_r * lutsize2 + next_g * lutsize + prev_b]; + c = make_float3( + c000.x + d.y * (c010.x - c000.x) + d.x * (c110.x - c010.x) + d.z * (c111.x - c110.x), + c000.y + d.y * (c010.y - c000.y) + d.x * (c110.y - c010.y) + d.z * (c111.y - c110.y), + c000.z + d.y * (c010.z - c000.z) + d.x * (c110.z - c010.z) + d.z * (c111.z - c110.z) + ); + } else { + float3 c010 = lut[prev_r * lutsize2 + next_g * lutsize + prev_b]; + float3 c011 = lut[prev_r * lutsize2 + next_g * lutsize + next_b]; + c = make_float3( + c000.x + d.y * (c010.x 
- c000.x) + d.z * (c011.x - c010.x) + d.x * (c111.x - c011.x), + c000.y + d.y * (c010.y - c000.y) + d.z * (c011.y - c010.y) + d.x * (c111.y - c011.y), + c000.z + d.y * (c010.z - c000.z) + d.z * (c011.z - c010.z) + d.x * (c111.z - c011.z) + ); + } + } else { + if (d.x > d.y) { + float3 c001 = lut[prev_r * lutsize2 + prev_g * lutsize + next_b]; + float3 c101 = lut[next_r * lutsize2 + prev_g * lutsize + next_b]; + c = make_float3( + c000.x + d.z * (c001.x - c000.x) + d.x * (c101.x - c001.x) + d.y * (c111.x - c101.x), + c000.y + d.z * (c001.y - c000.y) + d.x * (c101.y - c001.y) + d.y * (c111.y - c101.y), + c000.z + d.z * (c001.z - c000.z) + d.x * (c101.z - c001.z) + d.y * (c111.z - c101.z) + ); + } else { + float3 c001 = lut[prev_r * lutsize2 + prev_g * lutsize + next_b]; + float3 c011 = lut[prev_r * lutsize2 + next_g * lutsize + next_b]; + c = make_float3( + c000.x + d.z * (c001.x - c000.x) + d.y * (c011.x - c001.x) + d.x * (c111.x - c011.x), + c000.y + d.z * (c001.y - c000.y) + d.y * (c011.y - c001.y) + d.x * (c111.y - c011.y), + c000.z + d.z * (c001.z - c000.z) + d.y * (c011.z - c001.z) + d.x * (c111.z - c011.z) + ); + } + } + + return c; +} + +__global__ void lut3d_interp_8_nearest( + cudaTextureObject_t src_tex, + uchar4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + uchar4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + src.x / 255.0f * scale_r, + src.y / 255.0f * scale_g, + src.z / 255.0f * scale_b + ); + + float3 result = interp_nearest_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(uchar4)) + x] = make_uchar4( + (unsigned char)(result.x * 255.0f + 0.5f), + (unsigned char)(result.y * 255.0f + 0.5f), + (unsigned char)(result.z * 255.0f + 0.5f), + src.w + ); +} + +__global__ void lut3d_interp_10_nearest( + 
cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 6) * scale_r, // Convert 10-bit to normalized range + (src.y >> 6) * scale_g, + (src.z >> 6) * scale_b + ); + + float3 result = interp_nearest_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 1023.0f + 0.5f) << 6, // Convert back to 10-bit in 16-bit container + (unsigned short)(result.y * 1023.0f + 0.5f) << 6, + (unsigned short)(result.z * 1023.0f + 0.5f) << 6, + src.w + ); +} + +__global__ void lut3d_interp_12_nearest( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 4) * scale_r, // Convert 12-bit to normalized range + (src.y >> 4) * scale_g, + (src.z >> 4) * scale_b + ); + + float3 result = interp_nearest_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 4095.0f + 0.5f) << 4, // Convert back to 12-bit in 16-bit container + (unsigned short)(result.y * 4095.0f + 0.5f) << 4, + (unsigned short)(result.z * 4095.0f + 0.5f) << 4, + src.w + ); +} + +__global__ void lut3d_interp_8_trilinear( + cudaTextureObject_t src_tex, + uchar4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + 
int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + uchar4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + src.x / 255.0f * scale_r, + src.y / 255.0f * scale_g, + src.z / 255.0f * scale_b + ); + + float3 result = interp_trilinear_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(uchar4)) + x] = make_uchar4( + (unsigned char)(result.x * 255.0f + 0.5f), + (unsigned char)(result.y * 255.0f + 0.5f), + (unsigned char)(result.z * 255.0f + 0.5f), + src.w + ); +} + +__global__ void lut3d_interp_10_trilinear( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 6) / 1023.0f * scale_r, + (src.y >> 6) / 1023.0f * scale_g, + (src.z >> 6) / 1023.0f * scale_b + ); + + float3 result = interp_trilinear_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 1023.0f + 0.5f) << 6, + (unsigned short)(result.y * 1023.0f + 0.5f) << 6, + (unsigned short)(result.z * 1023.0f + 0.5f) << 6, + src.w + ); +} + +__global__ void lut3d_interp_12_trilinear( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 4) / 4095.0f * scale_r, + (src.y >> 4) / 4095.0f * scale_g, + (src.z >> 4) / 4095.0f * scale_b + ); + + float3 result = interp_trilinear_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / 
sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 4095.0f + 0.5f) << 4, + (unsigned short)(result.y * 4095.0f + 0.5f) << 4, + (unsigned short)(result.z * 4095.0f + 0.5f) << 4, + src.w + ); +} + +__global__ void lut3d_interp_8_tetrahedral( + cudaTextureObject_t src_tex, + uchar4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + uchar4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + src.x / 255.0f * scale_r, + src.y / 255.0f * scale_g, + src.z / 255.0f * scale_b + ); + + float3 result = interp_tetrahedral_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(uchar4)) + x] = make_uchar4( + (unsigned char)(result.x * 255.0f + 0.5f), + (unsigned char)(result.y * 255.0f + 0.5f), + (unsigned char)(result.z * 255.0f + 0.5f), + src.w + ); +} + +__global__ void lut3d_interp_10_tetrahedral( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 6) / 1023.0f * scale_r, + (src.y >> 6) / 1023.0f * scale_g, + (src.z >> 6) / 1023.0f * scale_b + ); + + float3 result = interp_tetrahedral_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 1023.0f + 0.5f) << 6, + (unsigned short)(result.y * 1023.0f + 0.5f) << 6, + (unsigned short)(result.z * 1023.0f + 0.5f) << 6, + src.w + ); +} + +__global__ void lut3d_interp_12_tetrahedral( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 
*lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 4) / 4095.0f * scale_r, + (src.y >> 4) / 4095.0f * scale_g, + (src.z >> 4) / 4095.0f * scale_b + ); + + float3 result = interp_tetrahedral_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 4095.0f + 0.5f) << 4, + (unsigned short)(result.y * 4095.0f + 0.5f) << 4, + (unsigned short)(result.z * 4095.0f + 0.5f) << 4, + src.w + ); +} + +} \ No newline at end of file diff --git a/libavfilter/vf_lut3d_cuda.h b/libavfilter/vf_lut3d_cuda.h new file mode 100644 index 00000000000..967177a3df3 --- /dev/null +++ b/libavfilter/vf_lut3d_cuda.h @@ -0,0 +1,26 @@ +/* + * This file is part of FFmpeg. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef AVFILTER_LUT3D_CUDA_H +#define AVFILTER_LUT3D_CUDA_H + +#endif diff --git a/libavfilter/vf_showinfo_cuda.c b/libavfilter/vf_showinfo_cuda.c new file mode 100644 index 00000000000..f634556e607 --- /dev/null +++ b/libavfilter/vf_showinfo_cuda.c @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2025 Claude Code Assistant + * Based on the original showinfo filter by Stefano Sabatini + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * CUDA-compatible filter for showing textual video frame information + */ + +#include +#include + +#include "libavutil/buffer.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/cuda_check.h" +#include "libavutil/bswap.h" +#include "libavutil/adler32.h" +#include "libavutil/display.h" +#include "libavutil/dovi_meta.h" +#include "libavutil/imgutils.h" +#include "libavutil/internal.h" +#include "libavutil/film_grain_params.h" +#include "libavutil/hdr_dynamic_metadata.h" +#include "libavutil/hdr_dynamic_vivid_metadata.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/spherical.h" +#include "libavutil/stereo3d.h" +#include "libavutil/timestamp.h" +#include "libavutil/timecode.h" +#include "libavutil/mastering_display_metadata.h" +#include "libavutil/video_enc_params.h" +#include "libavutil/detection_bbox.h" +#include "libavutil/ambient_viewing_environment.h" +#include "libavutil/uuid.h" + +#include "avfilter.h" +#include "filters.h" +#include "formats.h" +#include "video.h" + +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) + +typedef struct ShowInfoCudaContext { + const AVClass *class; + + AVCUDADeviceContext *hwctx; + AVBufferRef *hw_frames_ctx; + + int calculate_checksums; + int udu_sei_as_ascii; + + AVFrame *tmp_frame; +} ShowInfoCudaContext; + +#define OFFSET(x) offsetof(ShowInfoCudaContext, x) +#define VF AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM + +static const AVOption showinfo_cuda_options[] = { + { "checksum", "calculate checksums", OFFSET(calculate_checksums), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, VF }, + { "udu_sei_as_ascii", "try to print user data unregistered SEI as ascii character when possible", 
+ OFFSET(udu_sei_as_ascii), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VF }, + { NULL } +}; + +AVFILTER_DEFINE_CLASS(showinfo_cuda); + +static const enum AVPixelFormat supported_formats[] = { + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NV12, + AV_PIX_FMT_YUV444P, + AV_PIX_FMT_P010, + AV_PIX_FMT_P016, + AV_PIX_FMT_YUV444P16, + AV_PIX_FMT_0RGB32, + AV_PIX_FMT_0BGR32, + AV_PIX_FMT_RGB32, + AV_PIX_FMT_BGR32, +}; + +static int format_is_supported(enum AVPixelFormat fmt) +{ + for (int i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) + if (supported_formats[i] == fmt) + return 1; + return 0; +} + +static int showinfo_cuda_query_formats(const AVFilterContext *avctx, + AVFilterFormatsConfig **cfg_in, + AVFilterFormatsConfig **cfg_out) +{ + int err; + + if ((err = ff_formats_ref(ff_formats_pixdesc_filter(AV_PIX_FMT_FLAG_HWACCEL, 0), + &cfg_in[0]->formats)) < 0) + return err; + + if ((err = ff_formats_ref(ff_formats_pixdesc_filter(AV_PIX_FMT_FLAG_HWACCEL, 0), + &cfg_out[0]->formats)) < 0) + return err; + + return 0; +} + +static int showinfo_cuda_config_props(AVFilterLink *outlink) +{ + AVFilterContext *ctx = outlink->src; + AVFilterLink *inlink = outlink->src->inputs[0]; + ShowInfoCudaContext *s = ctx->priv; + FilterLink *il = ff_filter_link(inlink); + FilterLink *ol = ff_filter_link(outlink); + AVHWFramesContext *frames_ctx; + + av_buffer_unref(&s->hw_frames_ctx); + + if (!il->hw_frames_ctx) { + av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n"); + return AVERROR(EINVAL); + } + + s->hw_frames_ctx = av_buffer_ref(il->hw_frames_ctx); + if (!s->hw_frames_ctx) + return AVERROR(ENOMEM); + + frames_ctx = (AVHWFramesContext*)s->hw_frames_ctx->data; + s->hwctx = frames_ctx->device_ctx->hwctx; + + if (!format_is_supported(frames_ctx->sw_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n", + av_get_pix_fmt_name(frames_ctx->sw_format)); + return AVERROR(ENOSYS); + } + + ol->hw_frames_ctx = av_buffer_ref(s->hw_frames_ctx); + if (!ol->hw_frames_ctx) + return 
AVERROR(ENOMEM); + + return 0; +} + +// Include all the dump functions from the original showinfo filter +// (These functions work on metadata which doesn't require CUDA processing) + +static void dump_spherical(AVFilterContext *ctx, AVFrame *frame, const AVFrameSideData *sd) +{ + const AVSphericalMapping *spherical = (const AVSphericalMapping *)sd->data; + double yaw, pitch, roll; + + av_log(ctx, AV_LOG_INFO, "%s ", av_spherical_projection_name(spherical->projection)); + + if (spherical->yaw || spherical->pitch || spherical->roll) { + yaw = ((double)spherical->yaw) / (1 << 16); + pitch = ((double)spherical->pitch) / (1 << 16); + roll = ((double)spherical->roll) / (1 << 16); + av_log(ctx, AV_LOG_INFO, "(%f/%f/%f) ", yaw, pitch, roll); + } + + if (spherical->projection == AV_SPHERICAL_EQUIRECTANGULAR_TILE) { + size_t l, t, r, b; + av_spherical_tile_bounds(spherical, frame->width, frame->height, + &l, &t, &r, &b); + av_log(ctx, AV_LOG_INFO, + "[%"SIZE_SPECIFIER", %"SIZE_SPECIFIER", %"SIZE_SPECIFIER", %"SIZE_SPECIFIER"] ", + l, t, r, b); + } else if (spherical->projection == AV_SPHERICAL_CUBEMAP) { + av_log(ctx, AV_LOG_INFO, "[pad %"PRIu32"] ", spherical->padding); + } +} + +static void dump_color_property(AVFilterContext *ctx, AVFrame *frame) +{ + const char *color_range_str = av_color_range_name(frame->color_range); + const char *colorspace_str = av_color_space_name(frame->colorspace); + const char *color_primaries_str = av_color_primaries_name(frame->color_primaries); + const char *color_trc_str = av_color_transfer_name(frame->color_trc); + + if (!color_range_str || frame->color_range == AVCOL_RANGE_UNSPECIFIED) { + av_log(ctx, AV_LOG_INFO, "color_range:unknown"); + } else { + av_log(ctx, AV_LOG_INFO, "color_range:%s", color_range_str); + } + + if (!colorspace_str || frame->colorspace == AVCOL_SPC_UNSPECIFIED) { + av_log(ctx, AV_LOG_INFO, " color_space:unknown"); + } else { + av_log(ctx, AV_LOG_INFO, " color_space:%s", colorspace_str); + } + + if 
(!color_primaries_str || frame->color_primaries == AVCOL_PRI_UNSPECIFIED) { + av_log(ctx, AV_LOG_INFO, " color_primaries:unknown"); + } else { + av_log(ctx, AV_LOG_INFO, " color_primaries:%s", color_primaries_str); + } + + if (!color_trc_str || frame->color_trc == AVCOL_TRC_UNSPECIFIED) { + av_log(ctx, AV_LOG_INFO, " color_trc:unknown"); + } else { + av_log(ctx, AV_LOG_INFO, " color_trc:%s", color_trc_str); + } + av_log(ctx, AV_LOG_INFO, "\n"); +} + +static int filter_frame(AVFilterLink *inlink, AVFrame *frame) +{ + FilterLink *inl = ff_filter_link(inlink); + AVFilterContext *ctx = inlink->dst; + ShowInfoCudaContext *s = ctx->priv; + const AVPixFmtDescriptor *desc; + AVHWFramesContext *frames_ctx; + uint32_t plane_checksum[4] = {0}, checksum = 0; + int32_t pixelcount[4] = {0}; + int err, i; + + if (!frame->hw_frames_ctx) { + av_log(ctx, AV_LOG_ERROR, "Input frame missing hw_frames_ctx\n"); + return AVERROR(EINVAL); + } + + frames_ctx = (AVHWFramesContext*)frame->hw_frames_ctx->data; + desc = av_pix_fmt_desc_get(frames_ctx->sw_format); + + // For CUDA frames, we show info without downloading unless checksums are requested + av_log(ctx, AV_LOG_INFO, + "n:%4"PRId64" pts:%7s pts_time:%-7s duration:%7"PRId64 + " duration_time:%-7s " + "fmt:%s(cuda) cl:%s sar:%d/%d s:%dx%d i:%c iskey:%d type:%c ", + inl->frame_count_out, + av_ts2str(frame->pts), av_ts2timestr(frame->pts, &inlink->time_base), + frame->duration, av_ts2timestr(frame->duration, &inlink->time_base), + desc->name, av_chroma_location_name(frame->chroma_location), + frame->sample_aspect_ratio.num, frame->sample_aspect_ratio.den, + frame->width, frame->height, + !(frame->flags & AV_FRAME_FLAG_INTERLACED) ? 'P' : /* Progressive */ + (frame->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST) ? 
'T' : 'B', /* Top / Bottom */ + !!(frame->flags & AV_FRAME_FLAG_KEY), + av_get_picture_type_char(frame->pict_type)); + + // If checksums are requested, we need to download the frame + if (s->calculate_checksums) { + if (!s->tmp_frame) { + s->tmp_frame = av_frame_alloc(); + if (!s->tmp_frame) + return AVERROR(ENOMEM); + } + + err = av_hwframe_transfer_data(s->tmp_frame, frame, 0); + if (err < 0) { + av_log(ctx, AV_LOG_WARNING, "Failed to download frame for checksum calculation: %s\n", + av_err2str(err)); + av_log(ctx, AV_LOG_INFO, "checksum:N/A"); + } else { + // Calculate checksums on downloaded frame + int bitdepth = desc->comp[0].depth; + int plane, vsub = desc->log2_chroma_h; + + for (plane = 0; plane < 4 && s->tmp_frame->data[plane] && s->tmp_frame->linesize[plane]; plane++) { + uint8_t *data = s->tmp_frame->data[plane]; + int h = plane == 1 || plane == 2 ? AV_CEIL_RSHIFT(inlink->h, vsub) : inlink->h; + int linesize = av_image_get_linesize(s->tmp_frame->format, s->tmp_frame->width, plane); + int width = linesize >> (bitdepth > 8); + + if (linesize < 0) { + av_frame_unref(s->tmp_frame); + return linesize; + } + + for (i = 0; i < h; i++) { + plane_checksum[plane] = av_adler32_update(plane_checksum[plane], data, linesize); + checksum = av_adler32_update(checksum, data, linesize); + pixelcount[plane] += width; + data += s->tmp_frame->linesize[plane]; + } + } + + av_log(ctx, AV_LOG_INFO, + "checksum:%08"PRIX32" plane_checksum:[%08"PRIX32, + checksum, plane_checksum[0]); + + for (plane = 1; plane < 4 && s->tmp_frame->data[plane] && s->tmp_frame->linesize[plane]; plane++) + av_log(ctx, AV_LOG_INFO, " %08"PRIX32, plane_checksum[plane]); + av_log(ctx, AV_LOG_INFO, "]"); + } + + av_frame_unref(s->tmp_frame); + } + + av_log(ctx, AV_LOG_INFO, "\n"); + + // Show side data (this doesn't require frame data download) + for (i = 0; i < frame->nb_side_data; i++) { + AVFrameSideData *sd = frame->side_data[i]; + const char *name = av_frame_side_data_name(sd->type); + + av_log(ctx, 
AV_LOG_INFO, " side data - "); + if (name) + av_log(ctx, AV_LOG_INFO, "%s: ", name); + + switch (sd->type) { + case AV_FRAME_DATA_SPHERICAL: + dump_spherical(ctx, frame, sd); + break; + case AV_FRAME_DATA_DISPLAYMATRIX: + av_log(ctx, AV_LOG_INFO, "rotation of %.2f degrees", + av_display_rotation_get((int32_t *)sd->data)); + break; + case AV_FRAME_DATA_AFD: + av_log(ctx, AV_LOG_INFO, "value of %"PRIu8, sd->data[0]); + break; + default: + if (name) + av_log(ctx, AV_LOG_INFO, + "(%"SIZE_SPECIFIER" bytes)", sd->size); + else + av_log(ctx, AV_LOG_WARNING, "unknown side data type %d " + "(%"SIZE_SPECIFIER" bytes)", sd->type, sd->size); + break; + } + + av_log(ctx, AV_LOG_INFO, "\n"); + } + + dump_color_property(ctx, frame); + + return ff_filter_frame(inlink->dst->outputs[0], frame); +} + +static av_cold void showinfo_cuda_uninit(AVFilterContext *ctx) +{ + ShowInfoCudaContext *s = ctx->priv; + + av_frame_free(&s->tmp_frame); + av_buffer_unref(&s->hw_frames_ctx); +} + +static const AVFilterPad showinfo_cuda_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = filter_frame, + }, +}; + +static const AVFilterPad showinfo_cuda_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = showinfo_cuda_config_props, + }, +}; + +const AVFilter ff_vf_showinfo_cuda = { + .name = "showinfo_cuda", + .description = NULL_IF_CONFIG_SMALL("Show textual information for each CUDA video frame."), + .priv_class = &showinfo_cuda_class, + .flags = AVFILTER_FLAG_METADATA_ONLY, + .priv_size = sizeof(ShowInfoCudaContext), + .uninit = showinfo_cuda_uninit, + FILTER_INPUTS(showinfo_cuda_inputs), + FILTER_OUTPUTS(showinfo_cuda_outputs), + FILTER_QUERY_FUNC2(showinfo_cuda_query_formats), + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; \ No newline at end of file diff --git a/libavfilter/vf_tonemap_cuda.c b/libavfilter/vf_tonemap_cuda.c new file mode 100644 index 00000000000..02b0347293d --- /dev/null +++ b/libavfilter/vf_tonemap_cuda.c 
@@ -0,0 +1,925 @@ +/* + * This file is part of FFmpeg. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include + +#include "libavutil/common.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/cuda_check.h" +#include "libavutil/internal.h" +#include "colorspace.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/csp.h" + +#include "avfilter.h" +#include "filters.h" +#include "video.h" + +#include "cuda/load_helper.h" +#include "vf_tonemap_cuda.h" +#include "cuda/cuda_async_queue.h" + +static const enum AVPixelFormat supported_formats[] = { + AV_PIX_FMT_GBRPF32, + AV_PIX_FMT_RGB48, + AV_PIX_FMT_RGBA64, + AV_PIX_FMT_P010LE, + AV_PIX_FMT_P016LE, + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_YUV420P10LE, + AV_PIX_FMT_YUV420P16LE, + AV_PIX_FMT_NV12, +}; + +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) +#define BLOCKX 32 +#define BLOCKY 16 + +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) + +enum TonemapAlgorithm { + TONEMAP_NONE, + TONEMAP_LINEAR, + TONEMAP_GAMMA, + TONEMAP_CLIP, + TONEMAP_REINHARD, + TONEMAP_HABLE, + TONEMAP_MOBIUS, + TONEMAP_MAX, +}; + +typedef struct TonemapCudaContext { + const AVClass *class; + + AVCUDADeviceContext *hwctx; + + enum AVPixelFormat in_fmt, out_fmt; + const AVPixFmtDescriptor *in_desc, *out_desc; + + AVBufferRef *frames_ctx; + AVFrame *frame; + AVFrame *tmp_frame; + + CUcontext cu_ctx; + CUmodule cu_module; + CUfunction cu_func; + CUstream cu_stream; + + + enum TonemapAlgorithm tonemap; + double param; + double desat; + double peak; + + const AVLumaCoefficients *coeffs; + int passthrough; + + int async_depth; + int async_streams; + CudaAsyncQueue async_queue; +} TonemapCudaContext; + +static int format_is_supported(enum AVPixelFormat fmt) +{ + int i; + + for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) + if (supported_formats[i] == fmt) + return 1; + return 0; +} + +static av_cold int tonemap_cuda_init(AVFilterContext * ctx) +{ + TonemapCudaContext *s = ctx->priv; + + s->frame = av_frame_alloc(); + if 
(!s->frame) + return AVERROR(ENOMEM); + + s->tmp_frame = av_frame_alloc(); + if (!s->tmp_frame) + return AVERROR(ENOMEM); + + + switch (s->tonemap) { + case TONEMAP_GAMMA: + if (isnan(s->param)) + s->param = 1.8f; + break; + case TONEMAP_REINHARD: + if (!isnan(s->param)) + s->param = (1.0f - s->param) / s->param; + break; + case TONEMAP_MOBIUS: + if (isnan(s->param)) + s->param = 0.3f; + break; + } + + if (isnan(s->param)) + s->param = 1.0f; + + return 0; +} + +static int process_frame_async(void *filter_ctx, AVFrame * out, + AVFrame * in, CUstream stream); + +static av_cold void tonemap_cuda_uninit(AVFilterContext * ctx) +{ + TonemapCudaContext *s = ctx->priv; + + + if (s->hwctx && s->cu_module) { + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext dummy; + + CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + CHECK_CU(cu->cuModuleUnload(s->cu_module)); + s->cu_module = NULL; + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + } + + av_frame_free(&s->frame); + av_frame_free(&s->tmp_frame); + + // Clean up async queue BEFORE freeing frames_ctx to ensure + // that async frames can properly return their buffer references + if (s->async_depth > 1 || s->async_streams > 1) { + ff_cuda_async_queue_uninit(&s->async_queue); + } + + av_buffer_unref(&s->frames_ctx); +} + +static av_cold int init_hwframe_ctx(TonemapCudaContext * s, + AVBufferRef * device_ctx, int width, + int height) +{ + AVBufferRef *out_ref = NULL; + AVHWFramesContext *out_ctx; + int ret; + + out_ref = av_hwframe_ctx_alloc(device_ctx); + if (!out_ref) + return AVERROR(ENOMEM); + out_ctx = (AVHWFramesContext *) out_ref->data; + + out_ctx->format = AV_PIX_FMT_CUDA; + out_ctx->sw_format = s->out_fmt; + out_ctx->width = FFALIGN(width, 32); + out_ctx->height = FFALIGN(height, 32); + + ret = av_hwframe_ctx_init(out_ref); + if (ret < 0) + goto fail; + + av_frame_unref(s->frame); + ret = av_hwframe_get_buffer(out_ref, s->frame, 0); + if (ret < 0) + goto fail; + + s->frame->width = width; + s->frame->height = 
height; + + av_buffer_unref(&s->frames_ctx); + s->frames_ctx = out_ref; + + return 0; + fail: + av_buffer_unref(&out_ref); + return ret; +} + +static av_cold int tonemap_cuda_load_functions(AVFilterContext * ctx) +{ + TonemapCudaContext *s = ctx->priv; + CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + char function_name[128]; + int ret; + + extern const unsigned char ff_vf_tonemap_cuda_ptx_data[]; + extern const unsigned int ff_vf_tonemap_cuda_ptx_len; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module, + ff_vf_tonemap_cuda_ptx_data, + ff_vf_tonemap_cuda_ptx_len); + if (ret < 0) + goto fail; + + if (s->in_fmt == AV_PIX_FMT_GBRPF32) { + snprintf(function_name, sizeof(function_name), + "tonemap_cuda_planar_float"); + } else + if ((s->in_fmt == AV_PIX_FMT_P010LE + || s->in_fmt == AV_PIX_FMT_P016LE) + && s->out_fmt == AV_PIX_FMT_NV12) { + snprintf(function_name, sizeof(function_name), + "tonemap_cuda_p016_to_nv12"); + } else if (s->in_fmt == AV_PIX_FMT_YUV420P10LE + || s->in_fmt == AV_PIX_FMT_YUV420P16LE) { + snprintf(function_name, sizeof(function_name), + "tonemap_cuda_yuv420p10"); + } else { + snprintf(function_name, sizeof(function_name), + "tonemap_cuda_16bit"); + } + + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_func, s->cu_module, + function_name)); + if (ret < 0) { + av_log(ctx, AV_LOG_FATAL, "Failed to load CUDA function %s\n", + function_name); + goto fail; + } + + fail: + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + + return ret; +} + +static av_cold int init_processing_chain(AVFilterContext * ctx, + int in_width, int in_height, + int out_width, int out_height) +{ + TonemapCudaContext *s = ctx->priv; + FilterLink *inl = ff_filter_link(ctx->inputs[0]); + FilterLink *outl = ff_filter_link(ctx->outputs[0]); + + AVHWFramesContext *in_frames_ctx; + enum AVPixelFormat in_format; + enum AVPixelFormat out_format; + int ret; 
+ + if (!inl->hw_frames_ctx) { + av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n"); + return AVERROR(EINVAL); + } + in_frames_ctx = (AVHWFramesContext *) inl->hw_frames_ctx->data; + in_format = in_frames_ctx->sw_format; + // Convert HDR to SDR format for encoder compatibility + if (in_format == AV_PIX_FMT_P010LE || in_format == AV_PIX_FMT_P016LE) { + out_format = AV_PIX_FMT_NV12; + } else if (in_format == AV_PIX_FMT_YUV420P10LE + || in_format == AV_PIX_FMT_YUV420P16LE) { + out_format = AV_PIX_FMT_YUV420P; + } else { + out_format = in_format; + } + + if (!format_is_supported(in_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n", + av_get_pix_fmt_name(in_format)); + return AVERROR(ENOSYS); + } + if (!format_is_supported(out_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", + av_get_pix_fmt_name(out_format)); + return AVERROR(ENOSYS); + } + + s->in_fmt = in_format; + s->out_fmt = out_format; + s->in_desc = av_pix_fmt_desc_get(s->in_fmt); + s->out_desc = av_pix_fmt_desc_get(s->out_fmt); + + if (in_width == out_width && in_height == out_height + && in_format == out_format) { + s->frames_ctx = av_buffer_ref(inl->hw_frames_ctx); + if (!s->frames_ctx) + return AVERROR(ENOMEM); + } else { + s->passthrough = 0; + + ret = + init_hwframe_ctx(s, in_frames_ctx->device_ref, out_width, + out_height); + if (ret < 0) + return ret; + } + + outl->hw_frames_ctx = av_buffer_ref(s->frames_ctx); + if (!outl->hw_frames_ctx) + return AVERROR(ENOMEM); + + return 0; +} + +static av_cold int tonemap_cuda_config_props(AVFilterLink * outlink) +{ + AVFilterContext *ctx = outlink->src; + AVFilterLink *inlink = outlink->src->inputs[0]; + FilterLink *inl = ff_filter_link(inlink); + TonemapCudaContext *s = ctx->priv; + AVHWFramesContext *frames_ctx; + AVCUDADeviceContext *device_hwctx; + int ret; + + outlink->w = inlink->w; + outlink->h = inlink->h; + + ret = + init_processing_chain(ctx, inlink->w, inlink->h, outlink->w, + outlink->h); + if 
(ret < 0) + return ret; + + frames_ctx = (AVHWFramesContext *) inl->hw_frames_ctx->data; + device_hwctx = frames_ctx->device_ctx->hwctx; + + s->hwctx = device_hwctx; + s->cu_stream = s->hwctx->stream; + + s->coeffs = av_csp_luma_coeffs_from_avcsp(inlink->colorspace); + + av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d fmt:%s -> w:%d h:%d fmt:%s%s\n", + inlink->w, inlink->h, av_get_pix_fmt_name(s->in_fmt), + outlink->w, outlink->h, av_get_pix_fmt_name(s->out_fmt), + s->passthrough ? " (passthrough)" : ""); + + ret = tonemap_cuda_load_functions(ctx); + if (ret < 0) + return ret; + + if (s->async_depth > 1 || s->async_streams > 1) { + av_log(ctx, AV_LOG_DEBUG, + "Async processing enabled: depth=%d streams=%d\n", + s->async_depth, s->async_streams); + + ret = ff_cuda_async_queue_init(&s->async_queue, s->hwctx, + s->async_depth, s->async_streams, + ctx, process_frame_async); + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, + "Failed to initialize async queue\n"); + return ret; + } + } + + return 0; +} + +static int call_tonemap_kernel(AVFilterContext * ctx, AVFrame * out, + AVFrame * in) +{ + TonemapCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; + CUtexObject tex = 0; + int ret; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + if (s->in_fmt == AV_PIX_FMT_GBRPF32) { + // Planar float format + float coeff_r = av_q2d(s->coeffs->cr); + float coeff_g = av_q2d(s->coeffs->cg); + float coeff_b = av_q2d(s->coeffs->cb); + void *args[] = { + &in->data[0], &in->data[1], &in->data[2], + &out->data[0], &out->data[1], &out->data[2], + &out->width, &out->height, &in->linesize[0], &out->linesize[0], + &s->tonemap, &s->param, &s->desat, &s->peak, + &coeff_r, &coeff_g, &coeff_b + }; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func, + DIV_UP(out->width, BLOCKX), + DIV_UP(out->height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, + s->cu_stream, args, NULL)); + } else + if ((s->in_fmt == 
AV_PIX_FMT_P010LE + || s->in_fmt == AV_PIX_FMT_P016LE) + && s->out_fmt == AV_PIX_FMT_NV12) { + // P016 to NV12 conversion + CUDA_TEXTURE_DESC tex_desc_y = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_TEXTURE_DESC tex_desc_uv = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc_y = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = 1, + .res.pitch2D.width = in->width, + .res.pitch2D.height = in->height, + .res.pitch2D.pitchInBytes = in->linesize[0], + .res.pitch2D.devPtr = (CUdeviceptr) in->data[0], + }; + + CUDA_RESOURCE_DESC res_desc_uv = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = 2, + .res.pitch2D.width = in->width / 2, + .res.pitch2D.height = in->height / 2, + .res.pitch2D.pitchInBytes = in->linesize[1], + .res.pitch2D.devPtr = (CUdeviceptr) in->data[1], + }; + + CUtexObject tex_y = 0, tex_uv = 0; + ret = + CHECK_CU(cu-> + cuTexObjectCreate(&tex_y, &res_desc_y, &tex_desc_y, + NULL)); + if (ret < 0) + goto exit; + + ret = + CHECK_CU(cu-> + cuTexObjectCreate(&tex_uv, &res_desc_uv, &tex_desc_uv, + NULL)); + if (ret < 0) + goto exit; + + float coeff_r = av_q2d(s->coeffs->cr); + float coeff_g = av_q2d(s->coeffs->cg); + float coeff_b = av_q2d(s->coeffs->cb); + + void *args[] = { + &tex_y, &tex_uv, + &out->data[0], &out->data[1], + &out->width, &out->height, &out->linesize[0], + &out->linesize[1], + &s->tonemap, &s->param, &s->desat, &s->peak, + &coeff_r, &coeff_g, &coeff_b + }; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func, + DIV_UP(out->width, BLOCKX), + DIV_UP(out->height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, + s->cu_stream, args, NULL)); + + if (tex_y) + CHECK_CU(cu->cuTexObjectDestroy(tex_y)); + if (tex_uv) + CHECK_CU(cu->cuTexObjectDestroy(tex_uv)); + } else if (s->in_fmt == AV_PIX_FMT_YUV420P10LE + 
|| s->in_fmt == AV_PIX_FMT_YUV420P16LE) { + // YUV format handling + CUDA_TEXTURE_DESC tex_desc_y = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_TEXTURE_DESC tex_desc_uv = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc_y = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = 1, + .res.pitch2D.width = in->width, + .res.pitch2D.height = in->height, + .res.pitch2D.pitchInBytes = in->linesize[0], + .res.pitch2D.devPtr = (CUdeviceptr) in->data[0], + }; + + CUDA_RESOURCE_DESC res_desc_uv = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = 2, + .res.pitch2D.width = in->width / 2, + .res.pitch2D.height = in->height / 2, + .res.pitch2D.pitchInBytes = in->linesize[1], + .res.pitch2D.devPtr = (CUdeviceptr) in->data[1], + }; + + CUtexObject tex_y = 0, tex_uv = 0; + ret = + CHECK_CU(cu-> + cuTexObjectCreate(&tex_y, &res_desc_y, &tex_desc_y, + NULL)); + if (ret < 0) + goto exit; + + ret = + CHECK_CU(cu-> + cuTexObjectCreate(&tex_uv, &res_desc_uv, &tex_desc_uv, + NULL)); + if (ret < 0) + goto exit; + + float coeff_r = av_q2d(s->coeffs->cr); + float coeff_g = av_q2d(s->coeffs->cg); + float coeff_b = av_q2d(s->coeffs->cb); + + void *args[] = { + &tex_y, &tex_uv, + &out->data[0], &out->data[1], &out->data[2], + &out->width, &out->height, &out->linesize[0], + &out->linesize[1], + &s->tonemap, &s->param, &s->desat, &s->peak, + &coeff_r, &coeff_g, &coeff_b + }; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func, + DIV_UP(out->width, BLOCKX), + DIV_UP(out->height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, + s->cu_stream, args, NULL)); + + if (tex_y) + CHECK_CU(cu->cuTexObjectDestroy(tex_y)); + if (tex_uv) + CHECK_CU(cu->cuTexObjectDestroy(tex_uv)); + } else { + // 16-bit formats with texture + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = 
CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = 4, + .res.pitch2D.width = in->width, + .res.pitch2D.height = in->height, + .res.pitch2D.pitchInBytes = in->linesize[0], + .res.pitch2D.devPtr = (CUdeviceptr) in->data[0], + }; + + ret = + CHECK_CU(cu-> + cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + + // every format reaching this branch is 16 bits per component + // (the texture is created as CU_AD_FORMAT_UNSIGNED_INT16 above); + // the original ternary had two identical arms and was dead code + int bit_depth = 16; + float coeff_r = av_q2d(s->coeffs->cr); + float coeff_g = av_q2d(s->coeffs->cg); + float coeff_b = av_q2d(s->coeffs->cb); + + void *args[] = { + &tex, + &out->data[0], + &out->width, &out->height, &out->linesize[0], + &s->tonemap, &s->param, &s->desat, &s->peak, + &coeff_r, &coeff_g, &coeff_b, + &bit_depth + }; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func, + DIV_UP(out->width, BLOCKX), + DIV_UP(out->height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, + s->cu_stream, args, NULL)); + } + + exit: + if (tex) + CHECK_CU(cu->cuTexObjectDestroy(tex)); + + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + + return ret; +} + + +static int tonemap_cuda_filter_frame(AVFilterLink * link, AVFrame * in); + +static int process_frame_async(void *filter_ctx, AVFrame * out, + AVFrame * in, CUstream stream) +{ + AVFilterContext *ctx = (AVFilterContext *) filter_ctx; + TonemapCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUstream old_stream = s->cu_stream; + CUcontext dummy; + int ret; + + // Allocate output frame buffer (context will be managed by call_tonemap_kernel) + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) + return ret; + + ret = av_hwframe_get_buffer(s->frames_ctx, out, 0); + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + + if (ret < 0) + return ret; + + // Ensure output frame has correct dimensions (not aligned dimensions) + out->width = in->width; + out->height =
in->height; + + // Set the stream and call the kernel (kernel manages its own context) + s->cu_stream = stream; + ret = call_tonemap_kernel(ctx, out, in); + s->cu_stream = old_stream; + + if (ret >= 0) { + ret = av_frame_copy_props(out, in); + } + + return ret; +} + +static int tonemap_cuda_activate(AVFilterContext * ctx) +{ + AVFilterLink *inlink = ctx->inputs[0]; + AVFilterLink *outlink = ctx->outputs[0]; + TonemapCudaContext *s = ctx->priv; + AVFrame *in = NULL; + int ret, status; + int64_t pts; + + FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink); + + if (s->async_depth > 1 || s->async_streams > 1) { + // Try to receive completed frames first to avoid memory buildup + AVFrame *out = NULL; + ret = ff_cuda_async_queue_receive(&s->async_queue, &out); + if (ret == 0 && out) { + return ff_filter_frame(outlink, out); + } else if (ret < 0 && ret != AVERROR(EAGAIN)) { + return ret; + } + + // Only submit new frames if there's space and no completed frames ready + if (!ff_cuda_async_queue_is_full(&s->async_queue)) { + ret = ff_inlink_consume_frame(inlink, &in); + if (ret < 0) { + if (ret == AVERROR_EOF || ret == AVERROR(EAGAIN)) { + // No input available, but we might have frames in queue + return AVERROR(EAGAIN); + } + return ret; + } + + if (in) { + ret = ff_cuda_async_queue_submit(&s->async_queue, in); + av_frame_free(&in); + if (ret < 0) { + return ret; + } + av_log(ctx, AV_LOG_DEBUG, + "Submitted frame to async queue\n"); + } + } + } else { + // Synchronous processing + ret = ff_inlink_consume_frame(inlink, &in); + if (ret < 0) + return ret; + + if (in) { + ret = tonemap_cuda_filter_frame(inlink, in); + if (ret < 0) + return ret; + } + } + + if (ff_inlink_acknowledge_status(inlink, &status, &pts)) { + if (s->async_depth > 1 || s->async_streams > 1) { + // Flush all remaining async frames + while (!ff_cuda_async_queue_is_empty(&s->async_queue)) { + AVFrame *out = NULL; + ret = ff_cuda_async_queue_receive(&s->async_queue, &out); + if (ret == 0 && out) { + ret = 
ff_filter_frame(outlink, out); + if (ret < 0) + return ret; + } else if (ret < 0 && ret != AVERROR(EAGAIN)) { + av_log(ctx, AV_LOG_WARNING, + "Error receiving async frame during flush: %d\n", + ret); + break; + } + } + } + ff_outlink_set_status(outlink, status, pts); + return 0; + } + + FF_FILTER_FORWARD_WANTED(outlink, inlink); + + return FFERROR_NOT_READY; +} + +static int tonemap_cuda_filter_frame(AVFilterLink * link, AVFrame * in) +{ + AVFilterContext *ctx = link->dst; + TonemapCudaContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + AVFrame *out = NULL; + CUcontext dummy; + int ret = 0; + + if (s->passthrough) + return ff_filter_frame(outlink, in); + + /* read peak from side data if not passed in */ + if (!s->peak) { + s->peak = ff_determine_signal_peak(in); + av_log(s, AV_LOG_DEBUG, "Computed signal peak: %f\n", s->peak); + } + + if (s->async_depth > 1 || s->async_streams > 1) { + ret = ff_cuda_async_queue_submit(&s->async_queue, in); + if (ret == AVERROR(EAGAIN)) { + AVFrame *completed_frame = NULL; + ret = + ff_cuda_async_queue_receive(&s->async_queue, + &completed_frame); + if (ret < 0 && ret != AVERROR(EAGAIN)) { + av_frame_free(&in); + return ret; + } + if (completed_frame) { + ret = ff_filter_frame(outlink, completed_frame); + if (ret < 0) { + av_frame_free(&in); + return ret; + } + } + ret = ff_cuda_async_queue_submit(&s->async_queue, in); + } + if (ret < 0) { + av_frame_free(&in); + return ret; + } + + while (ff_inlink_queued_frames(link) == 0) { + AVFrame *completed_frame = NULL; + ret = + ff_cuda_async_queue_receive(&s->async_queue, + &completed_frame); + if (ret == AVERROR(EAGAIN)) { + break; + } else if (ret < 0) { + return ret; + } + if (completed_frame) { + ret = ff_filter_frame(outlink, completed_frame); + if (ret < 0) + return ret; + } + } + + return 0; + } + + out = av_frame_alloc(); + if (!out) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = 
CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) + goto fail; + + ret = av_hwframe_get_buffer(s->frames_ctx, out, 0); + if (ret < 0) + goto fail; + + // Ensure output frame has correct dimensions (not aligned dimensions) + out->width = in->width; + out->height = in->height; + + ret = call_tonemap_kernel(ctx, out, in); + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + if (ret < 0) + goto fail; + + ret = av_frame_copy_props(out, in); + if (ret < 0) + goto fail; + + av_frame_free(&in); + return ff_filter_frame(outlink, out); + + fail: + av_frame_free(&in); + av_frame_free(&out); + return ret; +} + +#define OFFSET(x) offsetof(TonemapCudaContext, x) +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM) + +static const AVOption tonemap_cuda_options[] = { + { "tonemap", "tonemap algorithm selection", OFFSET(tonemap), + AV_OPT_TYPE_INT, {.i64 = + TONEMAP_NONE}, 0, TONEMAP_MAX - 1, FLAGS,.unit = + "tonemap" }, + { "none", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE}, 0, 0, + FLAGS,.unit = "tonemap" }, + { "linear", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR}, 0, 0, + FLAGS,.unit = "tonemap" }, + { "gamma", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_GAMMA}, 0, 0, + FLAGS,.unit = "tonemap" }, + { "clip", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP}, 0, 0, + FLAGS,.unit = "tonemap" }, + { "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD}, 0, 0, + FLAGS,.unit = "tonemap" }, + { "hable", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE}, 0, 0, + FLAGS,.unit = "tonemap" }, + { "mobius", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS}, 0, 0, + FLAGS,.unit = "tonemap" }, + { "param", "tonemap parameter", OFFSET(param), AV_OPT_TYPE_DOUBLE, + {.dbl = NAN}, DBL_MIN, DBL_MAX, FLAGS }, + { "desat", "desaturation strength", OFFSET(desat), AV_OPT_TYPE_DOUBLE, + {.dbl = 2}, 0, DBL_MAX, FLAGS }, + { "peak", "signal peak override", OFFSET(peak), AV_OPT_TYPE_DOUBLE, + {.dbl = 0}, 0, DBL_MAX, FLAGS }, + { "passthrough", "Do not process frames at all 
if parameters match", + OFFSET(passthrough), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS }, + { "async_depth", "Frame queue depth for async processing", + OFFSET(async_depth), AV_OPT_TYPE_INT, {.i64 = + 1}, 1, MAX_FRAME_QUEUE_SIZE, + FLAGS }, + { "async_streams", "Number of CUDA streams for async processing", + OFFSET(async_streams), AV_OPT_TYPE_INT, {.i64 = + 1}, 1, MAX_CUDA_STREAMS, + FLAGS }, + { NULL }, +}; + +static const AVClass tonemap_cuda_class = { + .class_name = "tonemap_cuda", + .item_name = av_default_item_name, + .option = tonemap_cuda_options, + .version = LIBAVUTIL_VERSION_INT, +}; + +static const AVFilterPad tonemap_cuda_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + }, +}; + +static const AVFilterPad tonemap_cuda_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = tonemap_cuda_config_props, + }, +}; + +const AVFilter ff_vf_tonemap_cuda = { + .name = "tonemap_cuda", + .description = + NULL_IF_CONFIG_SMALL("GPU accelerated HDR to SDR tonemap filter"), + .priv_class = &tonemap_cuda_class, + + .init = tonemap_cuda_init, + .uninit = tonemap_cuda_uninit, + .activate = tonemap_cuda_activate, + + .priv_size = sizeof(TonemapCudaContext), + + FILTER_INPUTS(tonemap_cuda_inputs), + FILTER_OUTPUTS(tonemap_cuda_outputs), + + FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA), + + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; diff --git a/libavfilter/vf_tonemap_cuda.cu b/libavfilter/vf_tonemap_cuda.cu new file mode 100644 index 00000000000..742f917db06 --- /dev/null +++ b/libavfilter/vf_tonemap_cuda.cu @@ -0,0 +1,330 @@ +/* + * This file is part of FFmpeg. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "cuda/vector_helpers.cuh" + +#define BLOCKX 32 +#define BLOCKY 16 + +extern "C" { + +enum TonemapAlgorithm { + TONEMAP_NONE, + TONEMAP_LINEAR, + TONEMAP_GAMMA, + TONEMAP_CLIP, + TONEMAP_REINHARD, + TONEMAP_HABLE, + TONEMAP_MOBIUS, + TONEMAP_MAX, +}; + +__device__ static inline float hable_cuda(float in) +{ + float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f; + return (in * (in * a + b * c) + d * e) / (in * (in * a + b) + d * f) - e / f; +} + +__device__ static inline float mobius_cuda(float in, float j, float peak) +{ + float a, b; + + if (in <= j) + return in; + + a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak); + b = (j * j - 2.0f * j * peak + peak) / fmaxf(peak - 1.0f, 1e-6f); + + return (b * b + 2.0f * b * j + j * j) / (b - a) * (in + a) / (in + b); +} + +__device__ static inline float apply_tonemap_cuda(int algorithm, float sig, float param, float peak) +{ + switch (algorithm) { + case TONEMAP_NONE: + return sig; + case TONEMAP_LINEAR: + return sig * param / peak; + case TONEMAP_GAMMA: + return sig > 0.05f ? 
powf(sig / peak, 1.0f / param) + : sig * powf(0.05f / peak, 1.0f / param) / 0.05f; + case TONEMAP_CLIP: + return fminf(fmaxf(sig * param, 0.0f), 1.0f); + case TONEMAP_HABLE: + return hable_cuda(sig) / hable_cuda(peak); + case TONEMAP_REINHARD: + return sig / (sig + param) * (peak + param) / peak; + case TONEMAP_MOBIUS: + return mobius_cuda(sig, param, peak); + default: + return sig; + } +} + +__global__ void tonemap_cuda_float( + cudaTextureObject_t src_tex, + float *dst_r, float *dst_g, float *dst_b, + int dst_width, int dst_height, int dst_pitch, + int algorithm, float param, float desat, float peak, + float coeff_r, float coeff_g, float coeff_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + float4 src = tex2D(src_tex, x, y); + float r = src.x; + float g = src.y; + float b = src.z; + + // Desaturate to prevent unnatural colors + if (desat > 0) { + float luma = coeff_r * r + coeff_g * g + coeff_b * b; + float overbright = fmaxf(luma - desat, 1e-6f) / fmaxf(luma, 1e-6f); + r = r * (1.0f - overbright) + luma * overbright; + g = g * (1.0f - overbright) + luma * overbright; + b = b * (1.0f - overbright) + luma * overbright; + } + + // Pick the brightest component + float sig = fmaxf(fmaxf(fmaxf(r, g), b), 1e-6f); + float sig_orig = sig; + + // Apply tonemap algorithm + sig = apply_tonemap_cuda(algorithm, sig, param, peak); + + // Apply the computed scale factor to the color + float scale = sig / sig_orig; + r *= scale; + g *= scale; + b *= scale; + + int idx = y * (dst_pitch / sizeof(float)) + x; + dst_r[idx] = r; + dst_g[idx] = g; + dst_b[idx] = b; +} + +__global__ void tonemap_cuda_planar_float( + float *src_r, float *src_g, float *src_b, + float *dst_r, float *dst_g, float *dst_b, + int width, int height, int src_pitch, int dst_pitch, + int algorithm, float param, float desat, float peak, + float coeff_r, float coeff_g, float coeff_b) +{ + int x = 
blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= width || y >= height) + return; + + int src_idx = y * (src_pitch / sizeof(float)) + x; + int dst_idx = y * (dst_pitch / sizeof(float)) + x; + + float r = src_r[src_idx]; + float g = src_g[src_idx]; + float b = src_b[src_idx]; + + // Desaturate to prevent unnatural colors + if (desat > 0) { + float luma = coeff_r * r + coeff_g * g + coeff_b * b; + float overbright = fmaxf(luma - desat, 1e-6f) / fmaxf(luma, 1e-6f); + r = r * (1.0f - overbright) + luma * overbright; + g = g * (1.0f - overbright) + luma * overbright; + b = b * (1.0f - overbright) + luma * overbright; + } + + // Pick the brightest component + float sig = fmaxf(fmaxf(fmaxf(r, g), b), 1e-6f); + float sig_orig = sig; + + // Apply tonemap algorithm + sig = apply_tonemap_cuda(algorithm, sig, param, peak); + + // Apply the computed scale factor to the color + float scale = sig / sig_orig; + r *= scale; + g *= scale; + b *= scale; + + dst_r[dst_idx] = r; + dst_g[dst_idx] = g; + dst_b[dst_idx] = b; +} + +__global__ void tonemap_cuda_16bit( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + int algorithm, float param, float desat, float peak, + float coeff_r, float coeff_g, float coeff_b, int bit_depth) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + float max_val = (1 << bit_depth) - 1; + + float r = src.x / max_val; + float g = src.y / max_val; + float b = src.z / max_val; + + // Desaturate to prevent unnatural colors + if (desat > 0) { + float luma = coeff_r * r + coeff_g * g + coeff_b * b; + float overbright = fmaxf(luma - desat, 1e-6f) / fmaxf(luma, 1e-6f); + r = r * (1.0f - overbright) + luma * overbright; + g = g * (1.0f - overbright) + luma * overbright; + b = b * (1.0f - overbright) + luma * overbright; + } + + // 
Pick the brightest component + float sig = fmaxf(fmaxf(fmaxf(r, g), b), 1e-6f); + float sig_orig = sig; + + // Apply tonemap algorithm + sig = apply_tonemap_cuda(algorithm, sig, param, peak); + + // Apply the computed scale factor to the color + float scale = sig / sig_orig; + r *= scale; + g *= scale; + b *= scale; + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(r * max_val + 0.5f), + (unsigned short)(g * max_val + 0.5f), + (unsigned short)(b * max_val + 0.5f), + src.w + ); +} + +__global__ void tonemap_cuda_p016_to_nv12( + cudaTextureObject_t src_y_tex, cudaTextureObject_t src_uv_tex, + unsigned char *dst_y, unsigned char *dst_uv, + int dst_width, int dst_height, int dst_pitch_y, int dst_pitch_uv, + int algorithm, float param, float desat, float peak, + float coeff_r, float coeff_g, float coeff_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + // Sample Y component (16-bit) + unsigned short y_val = tex2D(src_y_tex, x, y); + + // Sample UV components (subsampled, 16-bit) - ensure consistent sampling for 2x2 blocks + int uv_x = (x / 2); + int uv_y = (y / 2); + ushort2 uv_val = tex2D(src_uv_tex, uv_x, uv_y); + + // Convert YUV to RGB (BT.2020 for HDR, P016LE format: 10-bit left-justified in 16-bit) + // P016LE: 10-bit values are stored in upper 10 bits, so divide by 64 to get 10-bit range + // Add safety clamping to prevent extreme values + unsigned short y_10bit = y_val >> 6; + unsigned short u_10bit = uv_val.x >> 6; + unsigned short v_10bit = uv_val.y >> 6; + + // Clamp to valid 10-bit TV range before normalization + y_10bit = max(64, min(940, (int)y_10bit)); + u_10bit = max(64, min(960, (int)u_10bit)); + v_10bit = max(64, min(960, (int)v_10bit)); + + float Y = (y_10bit - 64.0f) / 876.0f; // 10-bit Y: range 64-940 -> 0.0-1.0 + float U = (u_10bit - 512.0f) / 448.0f; // 10-bit U: range 64-960, centered at 512, scale by 
half-range + float V = (v_10bit - 512.0f) / 448.0f; // 10-bit V: range 64-960, centered at 512, scale by half-range + + // BT.2020 YUV to RGB conversion (corrected coefficients) + float r = Y + 1.7166f * V; + float g = Y - 0.1873f * U - 0.6526f * V; + float b = Y + 2.0426f * U; + + r = fmaxf(r, 0.0f); + g = fmaxf(g, 0.0f); + b = fmaxf(b, 0.0f); + + // Desaturate to prevent unnatural colors + if (desat > 0) { + float luma = coeff_r * r + coeff_g * g + coeff_b * b; + float overbright = fmaxf(luma - desat, 1e-6f) / fmaxf(luma, 1e-6f); + r = r * (1.0f - overbright) + luma * overbright; + g = g * (1.0f - overbright) + luma * overbright; + b = b * (1.0f - overbright) + luma * overbright; + } + + // Pick the brightest component + float sig = fmaxf(fmaxf(fmaxf(r, g), b), 1e-6f); + float sig_orig = sig; + + // Apply tonemap algorithm + sig = apply_tonemap_cuda(algorithm, sig, param, peak); + + // Apply the computed scale factor to the color + float scale = sig / sig_orig; + + // Debug: clamp extreme scale values for hable to prevent artifacts + if (algorithm == TONEMAP_HABLE) { + scale = fmaxf(0.01f, fminf(10.0f, scale)); + } + + r *= scale; + g *= scale; + b *= scale; + + // Clamp RGB to valid range [0.0, 1.0] before YUV conversion + r = fmaxf(0.0f, fminf(1.0f, r)); + g = fmaxf(0.0f, fminf(1.0f, g)); + b = fmaxf(0.0f, fminf(1.0f, b)); + + // Convert back to YUV (BT.2020 to preserve input colorspace) + float out_Y = 0.2627f * r + 0.6780f * g + 0.0593f * b; + float out_U = -0.1396f * r - 0.3604f * g + 0.5000f * b; + float out_V = 0.5000f * r - 0.4598f * g - 0.0402f * b; + + // Scale to 8-bit TV range and clamp + int y_out = (int)(out_Y * 219.0f + 16.0f + 0.5f); + int u_out = (int)(out_U * 224.0f + 128.0f + 0.5f); + int v_out = (int)(out_V * 224.0f + 128.0f + 0.5f); + + // Clamp to valid TV range + y_out = max(16, min(235, y_out)); + u_out = max(16, min(240, u_out)); // Proper TV range for chroma + v_out = max(16, min(240, v_out)); + + // Write Y component + dst_y[y * 
dst_pitch_y + x] = y_out; + + // Write UV components in NV12 format (only for one thread per 2x2 block to avoid race conditions) + if (x % 2 == 0 && y % 2 == 0) { + int uv_idx = (y / 2) * dst_pitch_uv + (x / 2) * 2; // Correct NV12 UV indexing + dst_uv[uv_idx] = u_out; + dst_uv[uv_idx + 1] = v_out; + } +} + +} \ No newline at end of file diff --git a/libavfilter/vf_tonemap_cuda.h b/libavfilter/vf_tonemap_cuda.h new file mode 100644 index 00000000000..7ce8d2865dc --- /dev/null +++ b/libavfilter/vf_tonemap_cuda.h @@ -0,0 +1,28 @@ +/* + * This file is part of FFmpeg. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef AVFILTER_VF_TONEMAP_CUDA_H +#define AVFILTER_VF_TONEMAP_CUDA_H + +#include "avfilter.h" + +#endif /* AVFILTER_VF_TONEMAP_CUDA_H */ diff --git a/libavformat/mov.c b/libavformat/mov.c index ff32d4160e1..0c5eb810af7 100644 --- a/libavformat/mov.c +++ b/libavformat/mov.c @@ -85,6 +85,7 @@ typedef struct MOVParseTableEntry { static int mov_read_default(MOVContext *c, AVIOContext *pb, MOVAtom atom); static int mov_read_mfra(MOVContext *c, AVIOContext *f); static void mov_free_stream_context(AVFormatContext *s, AVStream *st); +static int mov_switch_root(AVFormatContext *s, int64_t target, int index); static int64_t add_ctts_entry(MOVCtts** ctts_data, unsigned int* ctts_count, unsigned int* allocated_size, int count, int duration); @@ -1653,6 +1654,31 @@ static int64_t get_frag_time(AVFormatContext *s, AVStream *dst_st, return frag_stream_info->sidx_pts; } + + // Check if the requested stream is present in the fragment + int exists = 0; + for (i = 0; i < frag_index->item[index].nb_stream_info; i++) { + if (dst_st->id != frag_index->item[index].stream_info[i].id) { + continue; + } + + if ( get_stream_info_time(&frag_index->item[index].stream_info[i]) != AV_NOPTS_VALUE) { + exists = 1; + break; + } + if ( mov_switch_root(s,-1,index) < 0) + return AV_NOPTS_VALUE; + + if ( get_stream_info_time(&frag_index->item[index].stream_info[i]) != AV_NOPTS_VALUE) { + exists = 1; + break; + } + } + + if (!exists) + return AV_NOPTS_VALUE; + + for (i = 0; i < frag_index->item[index].nb_stream_info; i++) { AVStream *frag_stream = NULL; frag_stream_info = &frag_index->item[index].stream_info[i];