diff --git a/configure b/configure index d77a55b653c..a445f7fb0e5 100755 --- a/configure +++ b/configure @@ -3310,6 +3310,11 @@ scale_npp_filter_deps="ffnvcodec libnpp" scale2ref_npp_filter_deps="ffnvcodec libnpp" scale_cuda_filter_deps="ffnvcodec" scale_cuda_filter_deps_any="cuda_nvcc cuda_llvm" +showinfo_cuda_filter_deps="ffnvcodec" +lut3d_cuda_filter_deps="ffnvcodec" +lut3d_cuda_filter_deps_any="cuda_nvcc cuda_llvm" +tonemap_cuda_filter_deps="ffnvcodec" +tonemap_cuda_filter_deps_any="cuda_nvcc cuda_llvm" thumbnail_cuda_filter_deps="ffnvcodec" thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm" transpose_npp_filter_deps="ffnvcodec libnpp" diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h index 733d551fa41..06474f1b3c8 100644 --- a/fftools/ffmpeg.h +++ b/fftools/ffmpeg.h @@ -176,6 +176,7 @@ typedef struct OptionsContext { SpecifierOptList hwaccel_output_formats; SpecifierOptList autorotate; SpecifierOptList apply_cropping; + SpecifierOptList force_cfr; /* output options */ StreamMap *stream_maps; @@ -276,6 +277,9 @@ typedef struct InputFilterOptions { * accurate */ AVRational framerate; + /* convert input stream to CFR at this framerate before inserting additional filters */ + AVRational force_cfr; + unsigned crop_top; unsigned crop_bottom; unsigned crop_left; @@ -451,6 +455,9 @@ typedef struct InputStream { /* framerate forced with -r */ AVRational framerate; + + /* convert input stream to CFR at this framerate before inserting additional filters */ + AVRational force_cfr; #if FFMPEG_OPT_TOP int top_field_first; #endif diff --git a/fftools/ffmpeg_demux.c b/fftools/ffmpeg_demux.c index 13aef15eaba..9c500b79369 100644 --- a/fftools/ffmpeg_demux.c +++ b/fftools/ffmpeg_demux.c @@ -1032,6 +1032,10 @@ int ist_filter_add(InputStream *ist, InputFilter *ifilter, int is_simple, (opts->crop_top | opts->crop_bottom | opts->crop_left | opts->crop_right)) opts->flags |= IFILTER_FLAG_CROP; } + if (ist->force_cfr.num > 0 && ist->force_cfr.den > 0) { + opts->force_cfr = 
ist->force_cfr; + opts->flags |= IFILTER_FLAG_CFR; + } } else if (ist->par->codec_type == AVMEDIA_TYPE_SUBTITLE) { /* Compute the size of the canvas for the subtitles stream. If the subtitles codecpar has set a size, use it. Otherwise use the @@ -1241,7 +1245,7 @@ static int ist_add(const OptionsContext *o, Demuxer *d, AVStream *st, AVDictiona AVCodecParameters *par = st->codecpar; DemuxStream *ds; InputStream *ist; - const char *framerate = NULL, *hwaccel_device = NULL; + const char *framerate = NULL, *hwaccel_device = NULL, *forcecfr = NULL; const char *hwaccel = NULL; const char *apply_cropping = NULL; const char *hwaccel_output_format = NULL; @@ -1437,6 +1441,15 @@ static int ist_add(const OptionsContext *o, Demuxer *d, AVStream *st, AVDictiona } } + opt_match_per_stream_str(ist, &o->force_cfr, ic, st, &forcecfr); + if (forcecfr) { + ret = av_parse_video_rate(&ist->force_cfr, forcecfr); + if (ret < 0) { + av_log(ist, AV_LOG_ERROR, "Error parsing framerate %s.\n", forcecfr); + return ret; + } + } + #if FFMPEG_OPT_TOP ist->top_field_first = -1; opt_match_per_stream_int(ist, &o->top_field_first, ic, st, &ist->top_field_first); diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c index 38c7676a7e0..62dd23ebe41 100644 --- a/fftools/ffmpeg_filter.c +++ b/fftools/ffmpeg_filter.c @@ -1781,6 +1781,15 @@ static int configure_input_video_filter(FilterGraph *fg, AVFilterGraph *graph, ifp->displaymatrix_applied = 1; } + if (ifp->opts.force_cfr.num > 0 && ifp->opts.force_cfr.den > 0) { + char force_cfr_buf[64]; + snprintf(force_cfr_buf, sizeof(force_cfr_buf), "%d/%d", + ifp->opts.force_cfr.num, ifp->opts.force_cfr.den); + ret = insert_filter(&last_filter, &pad_idx, "fps", force_cfr_buf); + if (ret < 0) + return ret; + } + snprintf(name, sizeof(name), "trim_in_%s", ifp->opts.name); ret = insert_trim(ifp->opts.trim_start_us, ifp->opts.trim_end_us, &last_filter, &pad_idx, name); diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c index f639a1cf0aa..ce5aabcb590 
100644 --- a/fftools/ffmpeg_opt.c +++ b/fftools/ffmpeg_opt.c @@ -2033,5 +2033,9 @@ const OptionDef options[] = { "set video sync method globally; deprecated, use -fps_mode", "" }, #endif + { "force_cfr", OPT_TYPE_STRING, OPT_VIDEO | OPT_EXPERT | OPT_PERSTREAM | OPT_INPUT, + { .off = OFFSET(force_cfr) }, + "set frame rate (Hz value, fraction or abbreviation)", "force_cfr" }, + { NULL, }, }; diff --git a/libavcodec/libsvtav1.c b/libavcodec/libsvtav1.c index 79b28eb4df5..79f5506c1cd 100644 --- a/libavcodec/libsvtav1.c +++ b/libavcodec/libsvtav1.c @@ -435,7 +435,7 @@ static av_cold int eb_enc_init(AVCodecContext *avctx) svt_enc->eos_flag = EOS_NOT_REACHED; - svt_ret = svt_av1_enc_init_handle(&svt_enc->svt_handle, svt_enc, &svt_enc->enc_params); + svt_ret = svt_av1_enc_init_handle(&svt_enc->svt_handle, &svt_enc->enc_params); if (svt_ret != EB_ErrorNone) { return svt_print_error(avctx, svt_ret, "Error initializing encoder handle"); } diff --git a/libavfilter/Makefile b/libavfilter/Makefile index 91487afb218..bf38be983c8 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -33,6 +33,7 @@ OBJS-$(CONFIG_SCENE_SAD) += scene_sad.o OBJS-$(CONFIG_DNN) += dnn_filter_common.o include $(SRC_PATH)/libavfilter/dnn/Makefile + # audio filters OBJS-$(CONFIG_AAP_FILTER) += af_aap.o OBJS-$(CONFIG_ABENCH_FILTER) += f_bench.o @@ -245,7 +246,8 @@ OBJS-$(CONFIG_COLORMATRIX_FILTER) += vf_colormatrix.o OBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o colorspacedsp.o OBJS-$(CONFIG_COLORSPACE_CUDA_FILTER) += vf_colorspace_cuda.o \ vf_colorspace_cuda.ptx.o \ - cuda/load_helper.o + cuda/load_helper.o \ + cuda/cuda_async_queue.o OBJS-$(CONFIG_COLORTEMPERATURE_FILTER) += vf_colortemperature.o OBJS-$(CONFIG_CONVOLUTION_FILTER) += vf_convolution.o OBJS-$(CONFIG_CONVOLUTION_OPENCL_FILTER) += vf_convolution_opencl.o opencl.o \ @@ -375,6 +377,8 @@ OBJS-$(CONFIG_LUT1D_FILTER) += vf_lut3d.o OBJS-$(CONFIG_LUT_FILTER) += vf_lut.o OBJS-$(CONFIG_LUT2_FILTER) += vf_lut2.o framesync.o 
OBJS-$(CONFIG_LUT3D_FILTER) += vf_lut3d.o framesync.o +OBJS-$(CONFIG_LUT3D_CUDA_FILTER) += vf_lut3d_cuda.o \ + vf_lut3d_cuda.ptx.o cuda/load_helper.o OBJS-$(CONFIG_LUTRGB_FILTER) += vf_lut.o OBJS-$(CONFIG_LUTYUV_FILTER) += vf_lut.o OBJS-$(CONFIG_MASKEDCLAMP_FILTER) += vf_maskedclamp.o framesync.o @@ -486,6 +490,7 @@ OBJS-$(CONFIG_SHARPEN_NPP_FILTER) += vf_sharpen_npp.o OBJS-$(CONFIG_SHARPNESS_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o OBJS-$(CONFIG_SHEAR_FILTER) += vf_shear.o OBJS-$(CONFIG_SHOWINFO_FILTER) += vf_showinfo.o +OBJS-$(CONFIG_SHOWINFO_CUDA_FILTER) += vf_showinfo_cuda.o OBJS-$(CONFIG_SHOWPALETTE_FILTER) += vf_showpalette.o OBJS-$(CONFIG_SHUFFLEFRAMES_FILTER) += vf_shuffleframes.o OBJS-$(CONFIG_SHUFFLEPIXELS_FILTER) += vf_shufflepixels.o @@ -524,6 +529,9 @@ OBJS-$(CONFIG_TMEDIAN_FILTER) += vf_xmedian.o framesync.o OBJS-$(CONFIG_TMIDEQUALIZER_FILTER) += vf_tmidequalizer.o OBJS-$(CONFIG_TMIX_FILTER) += vf_mix.o framesync.o OBJS-$(CONFIG_TONEMAP_FILTER) += vf_tonemap.o +OBJS-$(CONFIG_TONEMAP_CUDA_FILTER) += vf_tonemap_cuda.o \ + vf_tonemap_cuda.ptx.o cuda/load_helper.o \ + cuda/cuda_async_queue.o OBJS-$(CONFIG_TONEMAP_OPENCL_FILTER) += vf_tonemap_opencl.o opencl.o \ opencl/tonemap.o opencl/colorspace_common.o OBJS-$(CONFIG_TONEMAP_VAAPI_FILTER) += vf_tonemap_vaapi.o vaapi_vpp.o diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c index 9819f0f95b7..35c303c7442 100644 --- a/libavfilter/allfilters.c +++ b/libavfilter/allfilters.c @@ -351,6 +351,7 @@ extern const AVFilter ff_vf_lut; extern const AVFilter ff_vf_lut1d; extern const AVFilter ff_vf_lut2; extern const AVFilter ff_vf_lut3d; +extern const AVFilter ff_vf_lut3d_cuda; extern const AVFilter ff_vf_lutrgb; extern const AVFilter ff_vf_lutyuv; extern const AVFilter ff_vf_maskedclamp; @@ -458,6 +459,7 @@ extern const AVFilter ff_vf_sharpen_npp; extern const AVFilter ff_vf_sharpness_vaapi; extern const AVFilter ff_vf_shear; extern const AVFilter ff_vf_showinfo; +extern const AVFilter 
ff_vf_showinfo_cuda; extern const AVFilter ff_vf_showpalette; extern const AVFilter ff_vf_shuffleframes; extern const AVFilter ff_vf_shufflepixels; @@ -494,6 +496,7 @@ extern const AVFilter ff_vf_tmedian; extern const AVFilter ff_vf_tmidequalizer; extern const AVFilter ff_vf_tmix; extern const AVFilter ff_vf_tonemap; +extern const AVFilter ff_vf_tonemap_cuda; extern const AVFilter ff_vf_tonemap_opencl; extern const AVFilter ff_vf_tonemap_vaapi; extern const AVFilter ff_vf_tpad; diff --git a/libavfilter/cuda/cuda_async_queue.c b/libavfilter/cuda/cuda_async_queue.c new file mode 100644 index 00000000000..c7aedf8c3bf --- /dev/null +++ b/libavfilter/cuda/cuda_async_queue.c @@ -0,0 +1,363 @@ +/* + * CUDA Async Queue - Implementation for asynchronous CUDA frame processing + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include + +#include "cuda_async_queue.h" +#include "libavutil/avassert.h" +#include "libavutil/log.h" +#include "libavutil/error.h" +#include "libavutil/hwcontext_cuda_internal.h" + +#define CHECK_CU(x) do { \ + CUresult res = x; \ + if (res != CUDA_SUCCESS) { \ + av_log(NULL, AV_LOG_ERROR, "CUDA error: %d at %s:%d\n", res, __FILE__, __LINE__); \ + return AVERROR_EXTERNAL; \ + } \ +} while(0) + +static int64_t get_time_us(void) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000000LL + tv.tv_usec; +} + +int ff_cuda_async_queue_init(CudaAsyncQueue * queue, + AVCUDADeviceContext * hwctx, int queue_size, + int num_streams, void *filter_ctx, + int (*process_frame)(void *, AVFrame *, + AVFrame *, CUstream)) +{ + + if(!queue || !hwctx || !filter_ctx || !process_frame) + return AVERROR(EINVAL); + + if (queue_size < 1 || queue_size > MAX_FRAME_QUEUE_SIZE) + return AVERROR(EINVAL); + + if (num_streams < 1 || num_streams > MAX_CUDA_STREAMS) + return AVERROR(EINVAL); + + memset(queue, 0, sizeof(*queue)); + + queue->hwctx = hwctx; + queue->cuda_ctx = hwctx->cuda_ctx; + queue->queue_size = queue_size; + queue->num_streams = num_streams; + queue->filter_ctx = filter_ctx; + queue->process_frame = process_frame; + + CudaFunctions *cu = queue->hwctx->internal->cuda_dl; + + CHECK_CU(cu->cuCtxPushCurrent(queue->cuda_ctx)); + + for (int i = 0; i < num_streams; i++) { + CHECK_CU(cu-> + cuStreamCreate(&queue->streams[i], + CU_STREAM_NON_BLOCKING)); + } + + for (int i = 0; i < queue_size; i++) { + CHECK_CU(cu-> + cuEventCreate(&queue->frames[i].event_start, + CU_EVENT_DISABLE_TIMING)); + CHECK_CU(cu-> + cuEventCreate(&queue->frames[i].event_done, + CU_EVENT_DISABLE_TIMING)); + queue->frames[i].in_use = 0; + 
queue->frames[i].stream_idx = -1; + } + + CHECK_CU(cu->cuCtxPopCurrent(NULL)); + + av_log(NULL, AV_LOG_DEBUG, + "Async queue initialized: depth=%d streams=%d\n", queue_size, + num_streams); + + return 0; +} + +void ff_cuda_async_queue_uninit(CudaAsyncQueue * queue) +{ + if (!queue || !queue->cuda_ctx) + return; + + CudaFunctions *cu = queue->hwctx->internal->cuda_dl; + + cu->cuCtxPushCurrent(queue->cuda_ctx); + + // First, synchronize all streams to ensure all operations complete + for (int i = 0; i < queue->num_streams; i++) { + if (queue->streams[i]) { + cu->cuStreamSynchronize(queue->streams[i]); + } + } + + ff_cuda_async_queue_flush(queue); + + for (int i = 0; i < queue->num_streams; i++) { + if (queue->streams[i]) { + cu->cuStreamDestroy(queue->streams[i]); + } + } + + for (int i = 0; i < queue->queue_size; i++) { + // Synchronize events before destroying to prevent memory leaks + if (queue->frames[i].event_start) { + cu->cuEventSynchronize(queue->frames[i].event_start); + cu->cuEventDestroy(queue->frames[i].event_start); + } + if (queue->frames[i].event_done) { + cu->cuEventSynchronize(queue->frames[i].event_done); + cu->cuEventDestroy(queue->frames[i].event_done); + } + + // Ensure frames are properly unreferenced before freeing + if (queue->frames[i].input_frame) { + // For input frames, unreference all frame data including hardware buffers + av_frame_unref(queue->frames[i].input_frame); + av_frame_free(&queue->frames[i].input_frame); + } + if (queue->frames[i].output_frame) { + // For output frames, ensure hardware frame buffers are unreferenced + // This is critical for CUDA frames allocated via av_hwframe_get_buffer() + av_frame_unref(queue->frames[i].output_frame); + av_frame_free(&queue->frames[i].output_frame); + } + } + + cu->cuCtxPopCurrent(NULL); + + memset(queue, 0, sizeof(*queue)); +} + +int ff_cuda_async_queue_submit(CudaAsyncQueue * queue, AVFrame * in_frame) +{ + CudaAsyncFrame *async_frame; + AVFrame *out_frame; + int ret; + int stream_idx; + 
+ if (!queue || !in_frame) + return AVERROR(EINVAL); + + if (ff_cuda_async_queue_is_full(queue)) + return AVERROR(EAGAIN); + + CudaFunctions *cu = queue->hwctx->internal->cuda_dl; + + CHECK_CU(cu->cuCtxPushCurrent(queue->cuda_ctx)); + + async_frame = &queue->frames[queue->write_idx]; + av_assert0(!async_frame->in_use); + + async_frame->input_frame = av_frame_clone(in_frame); + if (!async_frame->input_frame) { + ret = AVERROR(ENOMEM); + goto fail; + } + + out_frame = av_frame_alloc(); + if (!out_frame) { + ret = AVERROR(ENOMEM); + goto fail; + } + + // Frame is already clean from av_frame_alloc(), no need to unref + + stream_idx = queue->submission_counter % queue->num_streams; + async_frame->stream_idx = stream_idx; + async_frame->submission_order = queue->submission_counter++; + async_frame->submit_time_us = get_time_us(); + + CHECK_CU(cu-> + cuEventRecord(async_frame->event_start, + queue->streams[stream_idx])); + + av_log(NULL, AV_LOG_DEBUG, + "[CUDA_ASYNC] Frame %" PRId64 + " submitted to stream %d (queue slot %d) at time %" PRId64 "\n", + async_frame->submission_order, stream_idx, queue->write_idx, + async_frame->submit_time_us); + + ret = + queue->process_frame(queue->filter_ctx, out_frame, + async_frame->input_frame, + queue->streams[stream_idx]); + if (ret < 0) { + av_frame_free(&out_frame); + goto fail; + } + // Don't synchronize here - let the kernel run asynchronously + // The synchronization will happen in ff_cuda_async_queue_receive + + async_frame->output_frame = out_frame; + + CHECK_CU(cu-> + cuEventRecord(async_frame->event_done, + queue->streams[stream_idx])); + + av_log(NULL, AV_LOG_DEBUG, + "[CUDA_ASYNC] Frame %" PRId64 + " processing initiated on stream %d\n", + async_frame->submission_order, stream_idx); + + async_frame->in_use = 1; + queue->write_idx = (queue->write_idx + 1) % queue->queue_size; + queue->frames_in_queue++; + + // Log current queue status to show parallel activity + av_log(NULL, AV_LOG_DEBUG, + "[CUDA_ASYNC] Queue status: %d/%d 
frames active\n", + queue->frames_in_queue, queue->queue_size); + + CHECK_CU(cu->cuCtxPopCurrent(NULL)); + + return 0; + + fail: + av_frame_free(&async_frame->input_frame); + cu->cuCtxPopCurrent(NULL); + return ret; +} + +int ff_cuda_async_queue_receive(CudaAsyncQueue * queue, + AVFrame ** out_frame) +{ + CudaAsyncFrame *async_frame; + CUresult cu_res; + int ret = 0; + + if (!queue || !out_frame) + return AVERROR(EINVAL); + + *out_frame = NULL; + + if (ff_cuda_async_queue_is_empty(queue)) + return AVERROR(EAGAIN); + + CudaFunctions *cu = queue->hwctx->internal->cuda_dl; + + CHECK_CU(cu->cuCtxPushCurrent(queue->cuda_ctx)); + + async_frame = &queue->frames[queue->read_idx]; + av_assert0(async_frame->in_use); + + cu_res = cu->cuEventQuery(async_frame->event_done); + if (cu_res == CUDA_ERROR_NOT_READY) { + ret = AVERROR(EAGAIN); + goto done; + } else if (cu_res != CUDA_SUCCESS) { + av_log(NULL, AV_LOG_ERROR, "CUDA error querying event: %d\n", + cu_res); + ret = AVERROR_EXTERNAL; + goto done; + } + + CHECK_CU(cu->cuEventSynchronize(async_frame->event_done)); + + async_frame->complete_time_us = get_time_us(); + int64_t processing_time = + async_frame->complete_time_us - async_frame->submit_time_us; + + av_log(NULL, AV_LOG_DEBUG, + "[CUDA_ASYNC] Frame %" PRId64 + " completed on stream %d (queue slot %d) at time %" PRId64 + " - processing took %" PRId64 "us\n", + async_frame->submission_order, async_frame->stream_idx, + queue->read_idx, async_frame->complete_time_us, + processing_time); + + *out_frame = async_frame->output_frame; + async_frame->output_frame = NULL; + + av_frame_free(&async_frame->input_frame); + async_frame->in_use = 0; + async_frame->stream_idx = -1; + + queue->read_idx = (queue->read_idx + 1) % queue->queue_size; + queue->frames_in_queue--; + + done: + CHECK_CU(cu->cuCtxPopCurrent(NULL)); + return ret; +} + +int ff_cuda_async_queue_flush(CudaAsyncQueue * queue) +{ + AVFrame *frame; + int ret; + + if (!queue) + return AVERROR(EINVAL); + + CudaFunctions 
*cu = queue->hwctx->internal->cuda_dl; + + CHECK_CU(cu->cuCtxPushCurrent(queue->cuda_ctx)); + + for (int i = 0; i < queue->num_streams; i++) { + if (queue->streams[i]) { + CHECK_CU(cu->cuStreamSynchronize(queue->streams[i])); + } + } + + while (!ff_cuda_async_queue_is_empty(queue)) { + ret = ff_cuda_async_queue_receive(queue, &frame); + if (ret == 0 && frame) { + av_frame_free(&frame); + } else if (ret < 0 && ret != AVERROR(EAGAIN)) { + // Force cleanup of remaining frames + for (int i = 0; i < queue->queue_size; i++) { + if (queue->frames[i].in_use) { + // Properly unreference frames before freeing + if (queue->frames[i].input_frame) { + av_frame_unref(queue->frames[i].input_frame); + av_frame_free(&queue->frames[i].input_frame); + } + if (queue->frames[i].output_frame) { + av_frame_unref(queue->frames[i].output_frame); + av_frame_free(&queue->frames[i].output_frame); + } + queue->frames[i].in_use = 0; + } + } + queue->frames_in_queue = 0; + queue->read_idx = queue->write_idx = 0; + break; + } + } + + CHECK_CU(cu->cuCtxPopCurrent(NULL)); + + return 0; +} + +int ff_cuda_async_queue_is_full(CudaAsyncQueue * queue) +{ + return queue->frames_in_queue >= queue->queue_size; +} + +int ff_cuda_async_queue_is_empty(CudaAsyncQueue * queue) +{ + return queue->frames_in_queue == 0; +} diff --git a/libavfilter/cuda/cuda_async_queue.h b/libavfilter/cuda/cuda_async_queue.h new file mode 100644 index 00000000000..67bbed4e462 --- /dev/null +++ b/libavfilter/cuda/cuda_async_queue.h @@ -0,0 +1,82 @@ +/* + * CUDA Async Queue - Header file for asynchronous CUDA frame processing + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFILTER_CUDA_ASYNC_QUEUE_H +#define AVFILTER_CUDA_ASYNC_QUEUE_H + +#include "libavutil/frame.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "../avfilter.h" + +#define MAX_FRAME_QUEUE_SIZE 8 +#define MAX_CUDA_STREAMS 4 + +typedef struct CudaAsyncFrame { + AVFrame *input_frame; + AVFrame *output_frame; + CUevent event_start; + CUevent event_done; + int in_use; + int stream_idx; + int64_t submission_order; + int64_t submit_time_us; + int64_t complete_time_us; +} CudaAsyncFrame; + +typedef struct CudaAsyncQueue { + CudaAsyncFrame frames[MAX_FRAME_QUEUE_SIZE]; + CUstream streams[MAX_CUDA_STREAMS]; + + int queue_size; + int num_streams; + + int write_idx; + int read_idx; + int frames_in_queue; + int64_t submission_counter; + + CUcontext cuda_ctx; + AVCUDADeviceContext *hwctx; + + void *filter_ctx; + int (*process_frame)(void *filter_ctx, AVFrame * out, AVFrame * in, + CUstream stream); +} CudaAsyncQueue; + +int ff_cuda_async_queue_init(CudaAsyncQueue * queue, + AVCUDADeviceContext * hwctx, int queue_size, + int num_streams, void *filter_ctx, + int (*process_frame)(void *, AVFrame *, + AVFrame *, CUstream)); + +void ff_cuda_async_queue_uninit(CudaAsyncQueue * queue); + +int ff_cuda_async_queue_submit(CudaAsyncQueue * queue, AVFrame * in_frame); + +int ff_cuda_async_queue_receive(CudaAsyncQueue * queue, + AVFrame ** out_frame); + +int ff_cuda_async_queue_flush(CudaAsyncQueue * queue); + +int ff_cuda_async_queue_is_full(CudaAsyncQueue * queue); + +int 
ff_cuda_async_queue_is_empty(CudaAsyncQueue * queue); + +#endif /* AVFILTER_CUDA_ASYNC_QUEUE_H */ diff --git a/libavfilter/cuda/vector_helpers.cuh b/libavfilter/cuda/vector_helpers.cuh index 67332ef0305..334a29b2dc3 100644 --- a/libavfilter/cuda/vector_helpers.cuh +++ b/libavfilter/cuda/vector_helpers.cuh @@ -23,6 +23,8 @@ #ifndef AVFILTER_CUDA_VECTORHELPERS_H #define AVFILTER_CUDA_VECTORHELPERS_H +#include + typedef unsigned char uchar; typedef unsigned short ushort; @@ -35,7 +37,9 @@ template<> struct vector_helper { typedef float2 ftype; typedef int2 it template<> struct vector_helper { typedef float4 ftype; typedef int4 itype; }; template<> struct vector_helper { typedef float ftype; typedef int itype; }; template<> struct vector_helper { typedef float2 ftype; typedef int2 itype; }; +template<> struct vector_helper { typedef float3 ftype; typedef int3 itype; }; template<> struct vector_helper { typedef float4 ftype; typedef int4 itype; }; +template<> struct vector_helper { typedef float3 ftype; typedef int3 itype; }; #define floatT typename vector_helper::ftype #define intT typename vector_helper::itype @@ -77,6 +81,20 @@ OPERATORS4(uchar4) OPERATORS4(ushort4) OPERATORS4(float4) +#define OPERATORS3(T) \ + template inline __device__ T operator+(const T &a, const V &b) { return make_ ## T (a.x + b.x, a.y + b.y, a.z + b.z); } \ + template inline __device__ T operator-(const T &a, const V &b) { return make_ ## T (a.x - b.x, a.y - b.y, a.z - b.z); } \ + template inline __device__ T operator*(const T &a, V b) { return make_ ## T (a.x * b, a.y * b, a.z * b); } \ + template inline __device__ T operator/(const T &a, V b) { return make_ ## T (a.x / b, a.y / b, a.z / b); } \ + template inline __device__ T &operator+=(T &a, const V &b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } \ + template inline __device__ void vec_set(T &a, const V &b) { a.x = b.x; a.y = b.y; a.z = b.z; } \ + template inline __device__ void vec_set_scalar(T &a, V b) { a.x = b; a.y = b; a.z = b; } 
\ + template<> inline __device__ float3 to_floatN(const T &a) { return make_float3(a.x, a.y, a.z); } \ + template<> inline __device__ T from_floatN(const float3 &a) { return make_ ## T(a.x, a.y, a.z); } + +OPERATORS3(int3) +OPERATORS3(float3) + template inline __device__ void vec_set(int &a, V b) { a = b; } template inline __device__ void vec_set(float &a, V b) { a = b; } template inline __device__ void vec_set(uchar &a, V b) { a = b; } @@ -99,6 +117,15 @@ inline __device__ float2 lerp_scalar(float2 v0, float2 v1, float t) { ); } +template<> +inline __device__ float3 lerp_scalar(float3 v0, float3 v1, float t) { + return make_float3( + lerp_scalar(v0.x, v1.x, t), + lerp_scalar(v0.y, v1.y, t), + lerp_scalar(v0.z, v1.z, t) + ); +} + template<> inline __device__ float4 lerp_scalar(float4 v0, float4 v1, float t) { return make_float4( diff --git a/libavfilter/vf_colorspace_cuda.c b/libavfilter/vf_colorspace_cuda.c index 9d1058ff7fd..4eeef38a920 100644 --- a/libavfilter/vf_colorspace_cuda.c +++ b/libavfilter/vf_colorspace_cuda.c @@ -32,13 +32,16 @@ #include "avfilter.h" #include "filters.h" +#include "colorspace.h" #include "cuda/load_helper.h" +#include "cuda/cuda_async_queue.h" static const enum AVPixelFormat supported_formats[] = { AV_PIX_FMT_NV12, AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV444P, + AV_PIX_FMT_P016LE, }; #define DIV_UP(a, b) (((a) + (b)-1) / (b)) @@ -48,27 +51,79 @@ static const enum AVPixelFormat supported_formats[] = { #define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) typedef struct CUDAColorspaceContext { - const AVClass* class; + const AVClass *class; - AVCUDADeviceContext* hwctx; - AVBufferRef* frames_ctx; - AVFrame* own_frame; - AVFrame* tmp_frame; + AVCUDADeviceContext *hwctx; + AVBufferRef *frames_ctx; + AVFrame *own_frame; + AVFrame *tmp_frame; CUcontext cu_ctx; CUstream cu_stream; CUmodule cu_module; CUfunction cu_convert[AVCOL_RANGE_NB]; + CUfunction cu_convert_colorspace; + CUfunction cu_convert_colorspace_yuv; + CUfunction 
cu_convert_colorspace_nv12; + CUfunction cu_convert_16[AVCOL_RANGE_NB]; + CUfunction cu_convert_colorspace_16; + CUfunction cu_convert_colorspace_yuv_16; + enum AVPixelFormat pix_fmt; enum AVColorRange range; + enum AVColorSpace in_csp, out_csp, user_csp; + enum AVColorPrimaries in_prm, out_prm, user_prm; + enum AVColorTransferCharacteristic in_trc, out_trc, user_trc; + + // Conversion matrices for YUV-to-YUV colorspace conversion + float yuv2yuv_matrix[3][3]; + float yuv_offset[2]; // input and output offsets + int colorspace_conversion_needed; int num_planes; + + int async_depth; + int async_streams; + CudaAsyncQueue async_queue; } CUDAColorspaceContext; -static av_cold int cudacolorspace_init(AVFilterContext* ctx) +static int calculate_yuv2yuv_matrix(CUDAColorspaceContext * s) { - CUDAColorspaceContext* s = ctx->priv; + const AVLumaCoefficients *in_lumacoef, *out_lumacoef; + double yuv2rgb[3][3], rgb2yuv[3][3], yuv2yuv[3][3]; + int i, j; + + // Get luma coefficients for input and output colorspaces + in_lumacoef = av_csp_luma_coeffs_from_avcsp(s->in_csp); + out_lumacoef = av_csp_luma_coeffs_from_avcsp(s->out_csp); + + if (!in_lumacoef || !out_lumacoef) { + return AVERROR(EINVAL); + } + // Calculate YUV to RGB matrix for input colorspace + ff_fill_rgb2yuv_table(in_lumacoef, rgb2yuv); + ff_matrix_invert_3x3(rgb2yuv, yuv2rgb); + + // Calculate RGB to YUV matrix for output colorspace + ff_fill_rgb2yuv_table(out_lumacoef, rgb2yuv); + + // Combine: YUV_in -> RGB -> YUV_out + ff_matrix_mul_3x3(yuv2yuv, yuv2rgb, rgb2yuv); + + // Convert to float for CUDA kernel + for (i = 0; i < 3; i++) { + for (j = 0; j < 3; j++) { + s->yuv2yuv_matrix[i][j] = (float) yuv2yuv[i][j]; + } + } + + return 0; +} + +static av_cold int cudacolorspace_init(AVFilterContext * ctx) +{ + CUDAColorspaceContext *s = ctx->priv; s->own_frame = av_frame_alloc(); if (!s->own_frame) @@ -78,15 +133,21 @@ static av_cold int cudacolorspace_init(AVFilterContext* ctx) if (!s->tmp_frame) return AVERROR(ENOMEM); 
+ return 0; } -static av_cold void cudacolorspace_uninit(AVFilterContext* ctx) +static int process_frame_async(void *filter_ctx, AVFrame * out, + AVFrame * in, CUstream stream); +static int cudacolorspace_filter_frame(AVFilterLink * link, AVFrame * in); + +static av_cold void cudacolorspace_uninit(AVFilterContext * ctx) { - CUDAColorspaceContext* s = ctx->priv; + CUDAColorspaceContext *s = ctx->priv; + if (s->hwctx && s->cu_module) { - CudaFunctions* cu = s->hwctx->internal->cuda_dl; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; CUcontext dummy; CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); @@ -96,22 +157,30 @@ static av_cold void cudacolorspace_uninit(AVFilterContext* ctx) } av_frame_free(&s->own_frame); - av_buffer_unref(&s->frames_ctx); av_frame_free(&s->tmp_frame); + + // Clean up async queue BEFORE freeing frames_ctx to ensure + // that async frames can properly return their buffer references + if (s->async_depth > 1 || s->async_streams > 1) { + ff_cuda_async_queue_uninit(&s->async_queue); + } + + av_buffer_unref(&s->frames_ctx); } -static av_cold int init_hwframe_ctx(CUDAColorspaceContext* s, AVBufferRef* device_ctx, - int width, int height) +static av_cold int init_hwframe_ctx(CUDAColorspaceContext * s, + AVBufferRef * device_ctx, int width, + int height) { - AVBufferRef* out_ref = NULL; - AVHWFramesContext* out_ctx; + AVBufferRef *out_ref = NULL; + AVHWFramesContext *out_ctx; int ret; out_ref = av_hwframe_ctx_alloc(device_ctx); if (!out_ref) return AVERROR(ENOMEM); - out_ctx = (AVHWFramesContext*)out_ref->data; + out_ctx = (AVHWFramesContext *) out_ref->data; out_ctx->format = AV_PIX_FMT_CUDA; out_ctx->sw_format = s->pix_fmt; @@ -134,7 +203,7 @@ static av_cold int init_hwframe_ctx(CUDAColorspaceContext* s, AVBufferRef* devic s->frames_ctx = out_ref; return 0; -fail: + fail: av_buffer_unref(&out_ref); return ret; } @@ -148,13 +217,18 @@ static int format_is_supported(enum AVPixelFormat fmt) return 0; } -static av_cold int 
init_processing_chain(AVFilterContext* ctx, int width, +static int format_is_16bit(enum AVPixelFormat fmt) +{ + return fmt == AV_PIX_FMT_P016LE; +} + +static av_cold int init_processing_chain(AVFilterContext * ctx, int width, int height) { - FilterLink *inl = ff_filter_link(ctx->inputs[0]); - FilterLink *outl = ff_filter_link(ctx->outputs[0]); - CUDAColorspaceContext* s = ctx->priv; - AVHWFramesContext* in_frames_ctx; + FilterLink *inl = ff_filter_link(ctx->inputs[0]); + FilterLink *outl = ff_filter_link(ctx->outputs[0]); + CUDAColorspaceContext *s = ctx->priv; + AVHWFramesContext *in_frames_ctx; int ret; @@ -163,7 +237,7 @@ static av_cold int init_processing_chain(AVFilterContext* ctx, int width, return AVERROR(EINVAL); } - in_frames_ctx = (AVHWFramesContext*)inl->hw_frames_ctx->data; + in_frames_ctx = (AVHWFramesContext *) inl->hw_frames_ctx->data; s->pix_fmt = in_frames_ctx->sw_format; if (!format_is_supported(s->pix_fmt)) { @@ -172,13 +246,32 @@ static av_cold int init_processing_chain(AVFilterContext* ctx, int width, return AVERROR(EINVAL); } - if ((AVCOL_RANGE_MPEG != s->range) && (AVCOL_RANGE_JPEG != s->range)) { + if ((AVCOL_RANGE_MPEG != s->range) && (AVCOL_RANGE_JPEG != s->range) && + (AVCOL_RANGE_UNSPECIFIED != s->range)) { av_log(ctx, AV_LOG_ERROR, "Unsupported color range\n"); return AVERROR(EINVAL); } s->num_planes = av_pix_fmt_count_planes(s->pix_fmt); + // Determine if colorspace conversion is needed + s->colorspace_conversion_needed = (s->in_csp != AVCOL_SPC_UNSPECIFIED + && s->out_csp != + AVCOL_SPC_UNSPECIFIED + && s->in_csp != s->out_csp); + + if (s->colorspace_conversion_needed) { + ret = calculate_yuv2yuv_matrix(s); + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, + "Failed to calculate colorspace conversion matrix\n"); + return ret; + } + av_log(ctx, AV_LOG_INFO, "Colorspace conversion: %s -> %s\n", + av_color_space_name(s->in_csp), + av_color_space_name(s->out_csp)); + } + ret = init_hwframe_ctx(s, in_frames_ctx->device_ref, width, height); if 
(ret < 0) return ret; @@ -190,11 +283,11 @@ static av_cold int init_processing_chain(AVFilterContext* ctx, int width, return 0; } -static av_cold int cudacolorspace_load_functions(AVFilterContext* ctx) +static av_cold int cudacolorspace_load_functions(AVFilterContext * ctx) { - CUDAColorspaceContext* s = ctx->priv; + CUDAColorspaceContext *s = ctx->priv; CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; - CudaFunctions* cu = s->hwctx->internal->cuda_dl; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; int ret; extern const unsigned char ff_vf_colorspace_cuda_ptx_data[]; @@ -210,46 +303,127 @@ static av_cold int cudacolorspace_load_functions(AVFilterContext* ctx) if (ret < 0) goto fail; - ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_convert[AVCOL_RANGE_MPEG], s->cu_module, "to_mpeg_cuda")); + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_convert[AVCOL_RANGE_MPEG], + s->cu_module, "to_mpeg_cuda")); + if (ret < 0) + goto fail; + + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_convert[AVCOL_RANGE_JPEG], + s->cu_module, "to_jpeg_cuda")); + if (ret < 0) + goto fail; + + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_convert_colorspace, + s->cu_module, + "colorspace_convert_cuda")); + if (ret < 0) + goto fail; + + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_convert_colorspace_yuv, + s->cu_module, + "colorspace_convert_yuv_cuda")); + if (ret < 0) + goto fail; + + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_convert_colorspace_nv12, + s->cu_module, + "colorspace_convert_nv12_cuda")); if (ret < 0) goto fail; - ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_convert[AVCOL_RANGE_JPEG], s->cu_module, "to_jpeg_cuda")); + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_convert_16[AVCOL_RANGE_MPEG], + s->cu_module, "to_mpeg_cuda_16")); if (ret < 0) goto fail; -fail: + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_convert_16[AVCOL_RANGE_JPEG], + s->cu_module, "to_jpeg_cuda_16")); + if (ret < 0) + goto fail; + + ret = + CHECK_CU(cu-> + 
cuModuleGetFunction(&s->cu_convert_colorspace_16, + s->cu_module, + "colorspace_convert_cuda_16")); + if (ret < 0) + goto fail; + + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_convert_colorspace_yuv_16, + s->cu_module, + "colorspace_convert_yuv_cuda_16")); + if (ret < 0) + goto fail; + + fail: CHECK_CU(cu->cuCtxPopCurrent(&dummy)); return ret; } -static av_cold int cudacolorspace_config_props(AVFilterLink* outlink) +static av_cold int cudacolorspace_config_props(AVFilterLink * outlink) { - AVFilterContext* ctx = outlink->src; - AVFilterLink* inlink = outlink->src->inputs[0]; - FilterLink *inl = ff_filter_link(inlink); - CUDAColorspaceContext* s = ctx->priv; - AVHWFramesContext* frames_ctx; - AVCUDADeviceContext* device_hwctx; + AVFilterContext *ctx = outlink->src; + AVFilterLink *inlink = outlink->src->inputs[0]; + FilterLink *inl = ff_filter_link(inlink); + CUDAColorspaceContext *s = ctx->priv; + AVHWFramesContext *frames_ctx; + AVCUDADeviceContext *device_hwctx; int ret; outlink->w = inlink->w; outlink->h = inlink->h; + // Set defaults - actual colorspace will be determined from frame properties + s->in_csp = + inlink->colorspace != + AVCOL_SPC_UNSPECIFIED ? inlink->colorspace : AVCOL_SPC_BT709; + s->in_prm = AVCOL_PRI_BT709; // Default, will be updated from frame + s->in_trc = AVCOL_TRC_BT709; // Default, will be updated from frame + + // Set output colorspace properties + s->out_csp = + s->user_csp != AVCOL_SPC_UNSPECIFIED ? s->user_csp : s->in_csp; + s->out_prm = + s->user_prm != AVCOL_PRI_UNSPECIFIED ? s->user_prm : s->in_prm; + s->out_trc = + s->user_trc != AVCOL_TRC_UNSPECIFIED ? 
s->user_trc : s->in_trc; + + // Set output link properties + outlink->colorspace = s->out_csp; + ret = init_processing_chain(ctx, inlink->w, inlink->h); if (ret < 0) return ret; - frames_ctx = (AVHWFramesContext*)inl->hw_frames_ctx->data; + frames_ctx = (AVHWFramesContext *) inl->hw_frames_ctx->data; device_hwctx = frames_ctx->device_ctx->hwctx; s->hwctx = device_hwctx; s->cu_stream = s->hwctx->stream; if (inlink->sample_aspect_ratio.num) { - outlink->sample_aspect_ratio = av_mul_q( - (AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, - inlink->sample_aspect_ratio); + outlink->sample_aspect_ratio = av_mul_q((AVRational) { + outlink->h * inlink->w, + outlink->w * inlink->h} + , + inlink-> + sample_aspect_ratio); } else { outlink->sample_aspect_ratio = inlink->sample_aspect_ratio; } @@ -258,13 +432,29 @@ static av_cold int cudacolorspace_config_props(AVFilterLink* outlink) if (ret < 0) return ret; + if (s->async_depth > 1 || s->async_streams > 1) { + av_log(ctx, AV_LOG_DEBUG, + "Async processing enabled: depth=%d streams=%d\n", + s->async_depth, s->async_streams); + + ret = ff_cuda_async_queue_init(&s->async_queue, s->hwctx, + s->async_depth, s->async_streams, + ctx, process_frame_async); + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, + "Failed to initialize async queue\n"); + return ret; + } + } + return ret; } -static int conv_cuda_convert(AVFilterContext* ctx, AVFrame* out, AVFrame* in) +static int conv_cuda_convert(AVFilterContext * ctx, AVFrame * out, + AVFrame * in) { - CUDAColorspaceContext* s = ctx->priv; - CudaFunctions* cu = s->hwctx->internal->cuda_dl; + CUDAColorspaceContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; int ret; @@ -272,54 +462,238 @@ static int conv_cuda_convert(AVFilterContext* ctx, AVFrame* out, AVFrame* in) if (ret < 0) return ret; - out->color_range = s->range; - - for (int i = 0; i < s->num_planes; i++) { - int width = in->width, height = in->height, 
comp_id = (i > 0); - + // Update input colorspace properties from frame + s->in_csp = + in->colorspace != + AVCOL_SPC_UNSPECIFIED ? in->colorspace : s->in_csp; + s->in_prm = + in->color_primaries != + AVCOL_PRI_UNSPECIFIED ? in->color_primaries : s->in_prm; + s->in_trc = + in->color_trc != AVCOL_TRC_UNSPECIFIED ? in->color_trc : s->in_trc; + + // Update output colorspace if user didn't specify + if (s->user_csp == AVCOL_SPC_UNSPECIFIED) + s->out_csp = s->in_csp; + if (s->user_prm == AVCOL_PRI_UNSPECIFIED) + s->out_prm = s->in_prm; + if (s->user_trc == AVCOL_TRC_UNSPECIFIED) + s->out_trc = s->in_trc; + + // Check if we need to recalculate matrix + int need_recalc = (s->colorspace_conversion_needed && + (s->in_csp != s->out_csp)); + + if (need_recalc) { + ret = calculate_yuv2yuv_matrix(s); + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, + "Failed to recalculate colorspace conversion matrix\n"); + goto fail; + } + } + + // Set output frame properties immediately in conv function + out->color_range = + s->range != AVCOL_RANGE_UNSPECIFIED ? s->range : in->color_range; + out->colorspace = s->out_csp; + out->color_primaries = s->out_prm; + out->color_trc = s->out_trc; + + // If only colorspace conversion is needed (no range conversion) + if (s->colorspace_conversion_needed + && (in->color_range == out->color_range + || s->range == AVCOL_RANGE_UNSPECIFIED)) { + + // Check if we can use the new YUV kernel + int use_yuv_kernel = 0; + int use_nv12_kernel = 0; switch (s->pix_fmt) { case AV_PIX_FMT_YUV444P: - break; case AV_PIX_FMT_YUV420P: - width = comp_id ? in->width / 2 : in->width; - /* fall-through */ + case AV_PIX_FMT_P016LE: + use_yuv_kernel = 1; + break; case AV_PIX_FMT_NV12: - height = comp_id ? 
in->height / 2 : in->height; + use_nv12_kernel = 1; break; default: - av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n", - av_get_pix_fmt_name(s->pix_fmt)); - return AVERROR(EINVAL); - } - - if (!s->cu_convert[out->color_range]) { - av_log(ctx, AV_LOG_ERROR, "Unsupported color range\n"); - return AVERROR(EINVAL); + // Fall back to per-plane conversion for other formats + use_yuv_kernel = 0; + use_nv12_kernel = 0; + break; } - - if (in->color_range != out->color_range) { - void* args[] = {&in->data[i], &out->data[i], &in->linesize[i], - &comp_id}; - ret = CHECK_CU(cu->cuLaunchKernel( - s->cu_convert[out->color_range], DIV_UP(width, BLOCKX), - DIV_UP(height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, s->cu_stream, - args, NULL)); - } else { - ret = av_hwframe_transfer_data(out, in, 0); + + if (use_yuv_kernel) { + // Use new YUV kernel for proper colorspace conversion + int width = in->width, height = in->height; + int src_pitch_uv = in->linesize[1]; + int dst_pitch_uv = out->linesize[1]; + + if (format_is_16bit(s->pix_fmt)) { + // 16-bit YUV conversion + void *args[] = { + &in->data[0], &in->data[1], &in->data[2], // src Y, U, V + &out->data[0], &out->data[1], &out->data[2], // dst Y, U, V + &in->linesize[0], &src_pitch_uv, // src pitches + &out->linesize[0], &dst_pitch_uv, // dst pitches + &width, &height, + &s->yuv2yuv_matrix[0][0], &s->yuv2yuv_matrix[0][1], &s->yuv2yuv_matrix[0][2], + &s->yuv2yuv_matrix[1][0], &s->yuv2yuv_matrix[1][1], &s->yuv2yuv_matrix[1][2], + &s->yuv2yuv_matrix[2][0], &s->yuv2yuv_matrix[2][1], &s->yuv2yuv_matrix[2][2] + }; + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_convert_colorspace_yuv_16, + DIV_UP(width, BLOCKX), DIV_UP(height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL)); + } else { + // 8-bit YUV conversion + void *args[] = { + &in->data[0], &in->data[1], &in->data[2], // src Y, U, V + &out->data[0], &out->data[1], &out->data[2], // dst Y, U, V + &in->linesize[0], &src_pitch_uv, // src pitches + &out->linesize[0], 
&dst_pitch_uv, // dst pitches + &width, &height, + &s->yuv2yuv_matrix[0][0], &s->yuv2yuv_matrix[0][1], &s->yuv2yuv_matrix[0][2], + &s->yuv2yuv_matrix[1][0], &s->yuv2yuv_matrix[1][1], &s->yuv2yuv_matrix[1][2], + &s->yuv2yuv_matrix[2][0], &s->yuv2yuv_matrix[2][1], &s->yuv2yuv_matrix[2][2] + }; + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_convert_colorspace_yuv, + DIV_UP(width, BLOCKX), DIV_UP(height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL)); + } + if (ret < 0) - return ret; + goto fail; + } else if (use_nv12_kernel) { + // Use NV12 kernel for proper colorspace conversion + int width = in->width, height = in->height; + + void *args[] = { + &in->data[0], &in->data[1], // src Y, UV planes + &out->data[0], &out->data[1], // dst Y, UV planes + &in->linesize[0], &in->linesize[1], // src pitches + &out->linesize[0], &out->linesize[1], // dst pitches + &width, &height, + &s->yuv2yuv_matrix[0][0], &s->yuv2yuv_matrix[0][1], &s->yuv2yuv_matrix[0][2], + &s->yuv2yuv_matrix[1][0], &s->yuv2yuv_matrix[1][1], &s->yuv2yuv_matrix[1][2], + &s->yuv2yuv_matrix[2][0], &s->yuv2yuv_matrix[2][1], &s->yuv2yuv_matrix[2][2] + }; + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_convert_colorspace_nv12, + DIV_UP(width, BLOCKX), DIV_UP(height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL)); + + if (ret < 0) + goto fail; + } else { + // Fall back to original per-plane conversion for NV12 and other formats + for (int i = 0; i < s->num_planes; i++) { + int width = in->width, height = in->height; + + switch (s->pix_fmt) { + case AV_PIX_FMT_YUV444P: + break; + case AV_PIX_FMT_YUV420P: + width = (i > 0) ? in->width / 2 : in->width; + /* fall-through */ + case AV_PIX_FMT_NV12: + height = (i > 0) ? in->height / 2 : in->height; + break; + case AV_PIX_FMT_P016LE: + height = (i > 0) ? 
in->height / 2 : in->height; + break; + default: + av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n", + av_get_pix_fmt_name(s->pix_fmt)); + return AVERROR(EINVAL); + } + + void *args[] = + { &in->data[i], &out->data[i], &in->linesize[i], + &out->linesize[i], + &width, &height, &i, + &s->yuv2yuv_matrix[0][0], &s->yuv2yuv_matrix[0][1], + &s->yuv2yuv_matrix[0][2], + &s->yuv2yuv_matrix[1][0], &s->yuv2yuv_matrix[1][1], + &s->yuv2yuv_matrix[1][2], + &s->yuv2yuv_matrix[2][0], &s->yuv2yuv_matrix[2][1], + &s->yuv2yuv_matrix[2][2] + }; + CUfunction kernel = format_is_16bit(s->pix_fmt) ? + s->cu_convert_colorspace_16 : s->cu_convert_colorspace; + ret = + CHECK_CU(cu-> + cuLaunchKernel(kernel, + DIV_UP(width, BLOCKX), + DIV_UP(height, BLOCKY), 1, BLOCKX, + BLOCKY, 1, 0, s->cu_stream, args, + NULL)); + if (ret < 0) + goto fail; + } + } + } else { + // Handle range conversion or passthrough + for (int i = 0; i < s->num_planes; i++) { + int width = in->width, height = in->height, comp_id = (i > 0); + + switch (s->pix_fmt) { + case AV_PIX_FMT_YUV444P: + break; + case AV_PIX_FMT_YUV420P: + width = comp_id ? in->width / 2 : in->width; + /* fall-through */ + case AV_PIX_FMT_NV12: + height = comp_id ? in->height / 2 : in->height; + break; + case AV_PIX_FMT_P016LE: + height = comp_id ? in->height / 2 : in->height; + break; + default: + av_log(ctx, AV_LOG_ERROR, "Unsupported pixel format: %s\n", + av_get_pix_fmt_name(s->pix_fmt)); + return AVERROR(EINVAL); + } + + if (s->range != AVCOL_RANGE_UNSPECIFIED + && in->color_range != out->color_range) { + CUfunction *convert_funcs = format_is_16bit(s->pix_fmt) ? 
+ s->cu_convert_16 : s->cu_convert; + if (!convert_funcs[out->color_range]) { + av_log(ctx, AV_LOG_ERROR, "Unsupported color range\n"); + return AVERROR(EINVAL); + } + + void *args[] = + { &in->data[i], &out->data[i], &in->linesize[i], + &comp_id }; + ret = + CHECK_CU(cu-> + cuLaunchKernel(convert_funcs[out->color_range], + DIV_UP(width, BLOCKX), + DIV_UP(height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, + s->cu_stream, args, NULL)); + if (ret < 0) + goto fail; + } else { + ret = av_hwframe_transfer_data(out, in, 0); + if (ret < 0) + goto fail; + } } } + fail: CHECK_CU(cu->cuCtxPopCurrent(&dummy)); return ret; } -static int cudacolorspace_conv(AVFilterContext* ctx, AVFrame* out, AVFrame* in) +static int cudacolorspace_conv(AVFilterContext * ctx, AVFrame * out, + AVFrame * in) { - CUDAColorspaceContext* s = ctx->priv; - AVFilterLink* outlink = ctx->outputs[0]; - AVFrame* src = in; + CUDAColorspaceContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + AVFrame *src = in; int ret; ret = conv_cuda_convert(ctx, s->own_frame, src); @@ -341,20 +715,210 @@ static int cudacolorspace_conv(AVFilterContext* ctx, AVFrame* out, AVFrame* in) if (ret < 0) return ret; + // Set output frame properties AFTER av_frame_copy_props to avoid overwriting + out->color_range = + s->range != AVCOL_RANGE_UNSPECIFIED ? 
s->range : in->color_range; + out->colorspace = s->out_csp; + out->color_primaries = s->out_prm; + out->color_trc = s->out_trc; + return 0; } -static int cudacolorspace_filter_frame(AVFilterLink* link, AVFrame* in) + +static int cudacolorspace_activate(AVFilterContext * ctx) +{ + AVFilterLink *inlink = ctx->inputs[0]; + AVFilterLink *outlink = ctx->outputs[0]; + CUDAColorspaceContext *s = ctx->priv; + AVFrame *in = NULL; + int ret, status; + int64_t pts; + + FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink); + + if (s->async_depth > 1 || s->async_streams > 1) { + // Try to receive completed frames first to avoid memory buildup + AVFrame *out = NULL; + ret = ff_cuda_async_queue_receive(&s->async_queue, &out); + if (ret == 0 && out) { + av_reduce(&out->sample_aspect_ratio.num, + &out->sample_aspect_ratio.den, + (int64_t) out->sample_aspect_ratio.num * outlink->h * + inlink->w, + (int64_t) out->sample_aspect_ratio.den * outlink->w * + inlink->h, INT_MAX); + return ff_filter_frame(outlink, out); + } else if (ret < 0 && ret != AVERROR(EAGAIN)) { + return ret; + } + + // Only submit new frames if there's space and no completed frames ready + if (!ff_cuda_async_queue_is_full(&s->async_queue)) { + ret = ff_inlink_consume_frame(inlink, &in); + if (ret < 0) { + if (ret == AVERROR_EOF || ret == AVERROR(EAGAIN)) { + // No input available, but we might have frames in queue + return AVERROR(EAGAIN); + } + return ret; + } + + if (in) { + ret = ff_cuda_async_queue_submit(&s->async_queue, in); + av_frame_free(&in); + if (ret < 0) { + return ret; + } + av_log(ctx, AV_LOG_DEBUG, + "Submitted frame to async queue\n"); + } + } + } else { + // Synchronous processing + ret = ff_inlink_consume_frame(inlink, &in); + if (ret < 0) + return ret; + + if (in) { + ret = cudacolorspace_filter_frame(inlink, in); + if (ret < 0) + return ret; + } + } + + if (ff_inlink_acknowledge_status(inlink, &status, &pts)) { + if (s->async_depth > 1 || s->async_streams > 1) { + // Flush all remaining async frames + 
while (!ff_cuda_async_queue_is_empty(&s->async_queue)) { + AVFrame *out = NULL; + ret = ff_cuda_async_queue_receive(&s->async_queue, &out); + if (ret == 0 && out) { + av_reduce(&out->sample_aspect_ratio.num, + &out->sample_aspect_ratio.den, + (int64_t) out->sample_aspect_ratio.num * + outlink->h * inlink->w, + (int64_t) out->sample_aspect_ratio.den * + outlink->w * inlink->h, INT_MAX); + ret = ff_filter_frame(outlink, out); + if (ret < 0) + return ret; + } else if (ret < 0 && ret != AVERROR(EAGAIN)) { + av_log(ctx, AV_LOG_WARNING, + "Error receiving async frame during flush: %d\n", + ret); + break; + } + } + } + ff_outlink_set_status(outlink, status, pts); + return 0; + } + + FF_FILTER_FORWARD_WANTED(outlink, inlink); + + return FFERROR_NOT_READY; +} + +static int process_frame_async(void *filter_ctx, AVFrame * out, + AVFrame * in, CUstream stream) +{ + AVFilterContext *ctx = (AVFilterContext *) filter_ctx; + CUDAColorspaceContext *s = ctx->priv; + CUstream old_stream = s->cu_stream; + int ret; + + // Don't pre-allocate hardware buffer here - let cudacolorspace_conv handle it + // The previous code had a memory leak by allocating here then overwriting in conv + + // Set the stream and call the conv function (conv manages its own context) + s->cu_stream = stream; + ret = cudacolorspace_conv(ctx, out, in); + s->cu_stream = old_stream; + + if (ret >= 0) { + ret = av_frame_copy_props(out, in); + if (ret >= 0) { + // Set output frame properties AFTER av_frame_copy_props to avoid overwriting + out->color_range = + s->range != AVCOL_RANGE_UNSPECIFIED ? 
s->range : in->color_range; + out->colorspace = s->out_csp; + out->color_primaries = s->out_prm; + out->color_trc = s->out_trc; + } + } + + return ret; +} + +static int cudacolorspace_filter_frame(AVFilterLink * link, AVFrame * in) { - AVFilterContext* ctx = link->dst; - CUDAColorspaceContext* s = ctx->priv; - AVFilterLink* outlink = ctx->outputs[0]; - CudaFunctions* cu = s->hwctx->internal->cuda_dl; + AVFilterContext *ctx = link->dst; + CUDAColorspaceContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; - AVFrame* out = NULL; + AVFrame *out = NULL; CUcontext dummy; int ret = 0; + if (s->async_depth > 1 || s->async_streams > 1) { + ret = ff_cuda_async_queue_submit(&s->async_queue, in); + if (ret == AVERROR(EAGAIN)) { + AVFrame *completed_frame = NULL; + ret = + ff_cuda_async_queue_receive(&s->async_queue, + &completed_frame); + if (ret < 0 && ret != AVERROR(EAGAIN)) { + av_frame_free(&in); + return ret; + } + if (completed_frame) { + av_reduce(&completed_frame->sample_aspect_ratio.num, + &completed_frame->sample_aspect_ratio.den, + (int64_t) in->sample_aspect_ratio.num * + outlink->h * link->w, + (int64_t) in->sample_aspect_ratio.den * + outlink->w * link->h, INT_MAX); + ret = ff_filter_frame(outlink, completed_frame); + if (ret < 0) { + av_frame_free(&in); + return ret; + } + } + ret = ff_cuda_async_queue_submit(&s->async_queue, in); + } + if (ret < 0) { + av_frame_free(&in); + return ret; + } + + while (ff_inlink_queued_frames(link) == 0) { + AVFrame *completed_frame = NULL; + ret = + ff_cuda_async_queue_receive(&s->async_queue, + &completed_frame); + if (ret == AVERROR(EAGAIN)) { + break; + } else if (ret < 0) { + return ret; + } + if (completed_frame) { + av_reduce(&completed_frame->sample_aspect_ratio.num, + &completed_frame->sample_aspect_ratio.den, + (int64_t) in->sample_aspect_ratio.num * + outlink->h * link->w, + (int64_t) in->sample_aspect_ratio.den * + outlink->w * link->h, INT_MAX); + ret = 
ff_filter_frame(outlink, completed_frame); + if (ret < 0) + return ret; + } + } + + return 0; + } + out = av_frame_alloc(); if (!out) { ret = AVERROR(ENOMEM); @@ -372,13 +936,13 @@ static int cudacolorspace_filter_frame(AVFilterLink* link, AVFrame* in) goto fail; av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den, - (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w, - (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h, + (int64_t) in->sample_aspect_ratio.num * outlink->h * link->w, + (int64_t) in->sample_aspect_ratio.den * outlink->w * link->h, INT_MAX); av_frame_free(&in); return ff_filter_frame(outlink, out); -fail: + fail: av_frame_free(&in); av_frame_free(&out); return ret; @@ -387,12 +951,70 @@ static int cudacolorspace_filter_frame(AVFilterLink* link, AVFrame* in) #define OFFSET(x) offsetof(CUDAColorspaceContext, x) #define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM) static const AVOption options[] = { - {"range", "Output video range", OFFSET(range), AV_OPT_TYPE_INT, { .i64 = AVCOL_RANGE_UNSPECIFIED }, AVCOL_RANGE_UNSPECIFIED, AVCOL_RANGE_NB - 1, FLAGS, .unit = "range"}, - {"tv", "Limited range", 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, .unit = "range"}, - {"mpeg", "Limited range", 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, .unit = "range"}, - {"pc", "Full range", 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, .unit = "range"}, - {"jpeg", "Full range", 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, .unit = "range"}, - {NULL}, + { "range", "Output video range", OFFSET(range), AV_OPT_TYPE_INT, + {.i64 = + AVCOL_RANGE_UNSPECIFIED}, AVCOL_RANGE_UNSPECIFIED, + AVCOL_RANGE_NB - 1, FLAGS,.unit = "range" }, + { "tv", "Limited range", 0, AV_OPT_TYPE_CONST, + {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS,.unit = "range" }, + { "mpeg", "Limited range", 0, AV_OPT_TYPE_CONST, + {.i64 = AVCOL_RANGE_MPEG}, 0, 0, FLAGS,.unit = "range" }, + { "pc", 
"Full range", 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_RANGE_JPEG}, + 0, 0, FLAGS,.unit = "range" }, + { "jpeg", "Full range", 0, AV_OPT_TYPE_CONST, + {.i64 = AVCOL_RANGE_JPEG}, 0, 0, FLAGS,.unit = "range" }, + + { "space", "Output colorspace", OFFSET(user_csp), AV_OPT_TYPE_INT, + {.i64 = + AVCOL_SPC_UNSPECIFIED}, AVCOL_SPC_RGB, AVCOL_SPC_NB - 1, + FLAGS,.unit = "csp" }, + { "bt709", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT709}, 0, 0, + FLAGS,.unit = "csp" }, + { "bt470bg", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_BT470BG}, 0, + 0, FLAGS,.unit = "csp" }, + { "smpte170m", NULL, 0, AV_OPT_TYPE_CONST, + {.i64 = AVCOL_SPC_SMPTE170M}, 0, 0, FLAGS,.unit = "csp" }, + { "bt2020nc", NULL, 0, AV_OPT_TYPE_CONST, + {.i64 = AVCOL_SPC_BT2020_NCL}, 0, 0, FLAGS,.unit = "csp" }, + { "bt2020ncl", NULL, 0, AV_OPT_TYPE_CONST, + {.i64 = AVCOL_SPC_BT2020_NCL}, 0, 0, FLAGS,.unit = "csp" }, + { "bt601", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_SPC_SMPTE170M}, 0, + 0, FLAGS,.unit = "csp" }, + + { "primaries", "Output color primaries", OFFSET(user_prm), + AV_OPT_TYPE_INT, {.i64 = + AVCOL_PRI_UNSPECIFIED}, AVCOL_PRI_RESERVED0, + AVCOL_PRI_NB - 1, FLAGS,.unit = "prm" }, + { "bt709", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT709}, 0, 0, + FLAGS,.unit = "prm" }, + { "bt470bg", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT470BG}, 0, + 0, FLAGS,.unit = "prm" }, + { "smpte170m", NULL, 0, AV_OPT_TYPE_CONST, + {.i64 = AVCOL_PRI_SMPTE170M}, 0, 0, FLAGS,.unit = "prm" }, + { "bt2020", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_PRI_BT2020}, 0, + 0, FLAGS,.unit = "prm" }, + + { "trc", "Output transfer characteristics", OFFSET(user_trc), + AV_OPT_TYPE_INT, {.i64 = + AVCOL_TRC_UNSPECIFIED}, AVCOL_TRC_RESERVED0, + AVCOL_TRC_NB - 1, FLAGS,.unit = "trc" }, + { "bt709", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = AVCOL_TRC_BT709}, 0, 0, + FLAGS,.unit = "trc" }, + { "bt2020-10", NULL, 0, AV_OPT_TYPE_CONST, + {.i64 = AVCOL_TRC_BT2020_10}, 0, 0, FLAGS,.unit = "trc" }, + { "smpte170m", NULL, 0, 
AV_OPT_TYPE_CONST, + {.i64 = AVCOL_TRC_SMPTE170M}, 0, 0, FLAGS,.unit = "trc" }, + + { "async_depth", "Frame queue depth for async processing", + OFFSET(async_depth), AV_OPT_TYPE_INT, {.i64 = + 1}, 1, MAX_FRAME_QUEUE_SIZE, + FLAGS }, + { "async_streams", "Number of CUDA streams for async processing", + OFFSET(async_streams), AV_OPT_TYPE_INT, {.i64 = + 1}, 1, MAX_CUDA_STREAMS, + FLAGS }, + + { NULL }, }; static const AVClass cudacolorspace_class = { @@ -404,18 +1026,17 @@ static const AVClass cudacolorspace_class = { static const AVFilterPad cudacolorspace_inputs[] = { { - .name = "default", - .type = AVMEDIA_TYPE_VIDEO, - .filter_frame = cudacolorspace_filter_frame, - }, + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + }, }; static const AVFilterPad cudacolorspace_outputs[] = { { - .name = "default", - .type = AVMEDIA_TYPE_VIDEO, - .config_props = cudacolorspace_config_props, - }, + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = cudacolorspace_config_props, + }, }; const AVFilter ff_vf_colorspace_cuda = { @@ -424,6 +1045,7 @@ const AVFilter ff_vf_colorspace_cuda = { .init = cudacolorspace_init, .uninit = cudacolorspace_uninit, + .activate = cudacolorspace_activate, .priv_size = sizeof(CUDAColorspaceContext), .priv_class = &cudacolorspace_class, diff --git a/libavfilter/vf_colorspace_cuda.cu b/libavfilter/vf_colorspace_cuda.cu index 59db9f69f3c..16860742f69 100644 --- a/libavfilter/vf_colorspace_cuda.cu +++ b/libavfilter/vf_colorspace_cuda.cu @@ -31,12 +31,29 @@ extern "C" { #define JPEG_LUMA_MAX (255) #define JPEG_CHROMA_MAX (255) +// 16-bit constants (10-bit values shifted to 16-bit) +#define MPEG_LUMA_MIN_16 (16 << 6) +#define MPEG_CHROMA_MIN_16 (16 << 6) +#define MPEG_LUMA_MAX_16 (235 << 6) +#define MPEG_CHROMA_MAX_16 (240 << 6) + +#define JPEG_LUMA_MIN_16 (0) +#define JPEG_CHROMA_MIN_16 (1 << 6) +#define JPEG_LUMA_MAX_16 (65535) +#define JPEG_CHROMA_MAX_16 (65535) + __device__ int mpeg_min[] = {MPEG_LUMA_MIN, MPEG_CHROMA_MIN}; __device__ 
int mpeg_max[] = {MPEG_LUMA_MAX, MPEG_CHROMA_MAX}; __device__ int jpeg_min[] = {JPEG_LUMA_MIN, JPEG_CHROMA_MIN}; __device__ int jpeg_max[] = {JPEG_LUMA_MAX, JPEG_CHROMA_MAX}; +__device__ int mpeg_min_16[] = {MPEG_LUMA_MIN_16, MPEG_CHROMA_MIN_16}; +__device__ int mpeg_max_16[] = {MPEG_LUMA_MAX_16, MPEG_CHROMA_MAX_16}; + +__device__ int jpeg_min_16[] = {JPEG_LUMA_MIN_16, JPEG_CHROMA_MIN_16}; +__device__ int jpeg_max_16[] = {JPEG_LUMA_MAX_16, JPEG_CHROMA_MAX_16}; + __device__ int clamp(int val, int min, int max) { if (val < min) @@ -91,4 +108,249 @@ __global__ void to_mpeg_cuda(const unsigned char* src, unsigned char* dst, dst[x + y * pitch] = static_cast(dst_); } +__global__ void to_jpeg_cuda_16(const unsigned short* src, unsigned short* dst, + int pitch, int comp_id) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int pitch_pixels = pitch / 2; // Convert byte pitch to pixel pitch + int src_, dst_; + + // 10-bit input in 16-bit container -> scale for processing + src_ = static_cast(src[x + y * pitch_pixels]); + + // Scale to full 16-bit range for conversion + dst_ = comp_id ? (src_ * 65535 - 64 * 65535) / (1023 - 64) // chroma + : (src_ * 65535 - 64 * 65535) / (940 - 64); // luma + + dst[x + y * pitch_pixels] = static_cast(clamp(dst_, jpeg_min_16[comp_id], jpeg_max_16[comp_id])); +} + +__global__ void to_mpeg_cuda_16(const unsigned short* src, unsigned short* dst, + int pitch, int comp_id) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + int pitch_pixels = pitch / 2; // Convert byte pitch to pixel pitch + int src_, dst_; + + // 10-bit input in 16-bit container -> scale to limited range + src_ = static_cast(src[x + y * pitch_pixels]); + + // Scale from full range to limited range + dst_ = comp_id ? 
(src_ * (240 - 16) / 65535) + 16 // chroma + : (src_ * (235 - 16) / 65535) + 16; // luma + + // Scale to 16-bit container (10-bit values) + dst_ = dst_ << 6; + + dst[x + y * pitch_pixels] = static_cast(clamp(dst_, mpeg_min_16[comp_id], mpeg_max_16[comp_id])); +} + +__global__ void colorspace_convert_cuda(const unsigned char* src, unsigned char* dst, + int src_pitch, int dst_pitch, int width, int height, int plane_id, + float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= width || y >= height) + return; + + if (plane_id == 0) { + // Luma plane (Y) - apply only first row of matrix + float src_y = (float)src[x + y * src_pitch]; + float dst_y = m00 * src_y + m01 * 128.0f + m02 * 128.0f; + dst[x + y * dst_pitch] = (unsigned char)clamp((int)(dst_y + 0.5f), 0, 255); + } else if (plane_id == 1) { + // U plane - apply second row of matrix, but we only have U component + float src_u = (float)src[x + y * src_pitch] - 128.0f; + // For U plane conversion, we apply the matrix considering U input and zero V + float dst_u = m10 * 128.0f + m11 * (src_u + 128.0f) + m12 * 128.0f; + dst[x + y * dst_pitch] = (unsigned char)clamp((int)(dst_u + 0.5f), 0, 255); + } else { + // V plane - apply third row of matrix, but we only have V component + float src_v = (float)src[x + y * src_pitch] - 128.0f; + // For V plane conversion, we apply the matrix considering zero U input and V + float dst_v = m20 * 128.0f + m21 * 128.0f + m22 * (src_v + 128.0f); + dst[x + y * dst_pitch] = (unsigned char)clamp((int)(dst_v + 0.5f), 0, 255); + } +} + +__global__ void colorspace_convert_cuda_16(const unsigned short* src, unsigned short* dst, + int src_pitch, int dst_pitch, int width, int height, int plane_id, + float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22) +{ + int x = blockIdx.x * blockDim.x + 
threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int src_pitch_pixels = src_pitch / 2; // convert byte pitch to pixel pitch
    int dst_pitch_pixels = dst_pitch / 2; // convert byte pitch to pixel pitch

    if (x >= width || y >= height)
        return;

    // For 10-bit samples in a 16-bit container, the chroma midpoint is 512 << 6 = 32768.
    const float chroma_center = 32768.0f;

    // NOTE(review): this kernel applies one matrix row per plane in isolation;
    // the cross-component terms are fed the chroma midpoint instead of the
    // actual co-sited U/V samples, so it is only exact for (near-)diagonal
    // matrices. The planar/NV12 kernels below handle the general case.
    if (plane_id == 0) {
        // Luma plane (Y): first matrix row.
        float src_y = (float)src[x + y * src_pitch_pixels];
        float dst_y = m00 * src_y + m01 * chroma_center + m02 * chroma_center;
        dst[x + y * dst_pitch_pixels] = (unsigned short)clamp((int)(dst_y + 0.5f), 0, 65535);
    } else if (plane_id == 1) {
        // U plane: second matrix row; V is unknown here, midpoint assumed.
        float src_u = (float)src[x + y * src_pitch_pixels] - chroma_center;
        float dst_u = m10 * chroma_center + m11 * (src_u + chroma_center) + m12 * chroma_center;
        dst[x + y * dst_pitch_pixels] = (unsigned short)clamp((int)(dst_u + 0.5f), 0, 65535);
    } else {
        // V plane: third matrix row; U is unknown here, midpoint assumed.
        float src_v = (float)src[x + y * src_pitch_pixels] - chroma_center;
        float dst_v = m20 * chroma_center + m21 * chroma_center + m22 * (src_v + chroma_center);
        dst[x + y * dst_pitch_pixels] = (unsigned short)clamp((int)(dst_v + 0.5f), 0, 65535);
    }
}

// Full 3x3 YUV->YUV conversion for 8-bit 4:2:0 planar input. All three
// components are read for every luma position, so the cross-component
// matrix terms use the real co-sited chroma samples.
__global__ void colorspace_convert_yuv_cuda(const unsigned char* src_y, const unsigned char* src_u, const unsigned char* src_v,
                                            unsigned char* dst_y, unsigned char* dst_u, unsigned char* dst_v,
                                            int src_pitch_y, int src_pitch_uv, int dst_pitch_y, int dst_pitch_uv,
                                            int width, int height,
                                            float m00, float m01, float m02,
                                            float m10, float m11, float m12,
                                            float m20, float m21, float m22)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height)
        return;

    float src_Y = (float)src_y[x + y * src_pitch_y];

    // 4:2:0 subsampling: one chroma sample covers a 2x2 luma block.
    int chroma_x = x / 2;
    int chroma_y = y / 2;

    float src_U = (float)src_u[chroma_x + chroma_y * src_pitch_uv] - 128.0f;
    float src_V = (float)src_v[chroma_x + chroma_y * src_pitch_uv] - 128.0f;

    // Apply the 3x3 matrix to the raw (unshifted) Y/U/V triple.
    float dst_Y = m00 * src_Y + m01 * (src_U + 128.0f) + m02 * (src_V + 128.0f);
    float dst_U = m10 * src_Y + m11 * (src_U + 128.0f) + m12 * (src_V + 128.0f);
    float dst_V = m20 * src_Y + m21 * (src_U + 128.0f) + m22 * (src_V + 128.0f);

    dst_y[x + y * dst_pitch_y] = (unsigned char)clamp((int)(dst_Y + 0.5f), 0, 255);

    // Chroma is stored once per 2x2 block; the top-left thread writes it.
    if (x % 2 == 0 && y % 2 == 0) {
        dst_u[chroma_x + chroma_y * dst_pitch_uv] = (unsigned char)clamp((int)(dst_U + 0.5f), 0, 255);
        dst_v[chroma_x + chroma_y * dst_pitch_uv] = (unsigned char)clamp((int)(dst_V + 0.5f), 0, 255);
    }
}

// 16-bit variant of the planar YUV conversion (10-bit samples stored MSB-
// aligned in a 16-bit container, hence the 32768 chroma midpoint).
__global__ void colorspace_convert_yuv_cuda_16(const unsigned short* src_y, const unsigned short* src_u, const unsigned short* src_v,
                                               unsigned short* dst_y, unsigned short* dst_u, unsigned short* dst_v,
                                               int src_pitch_y, int src_pitch_uv, int dst_pitch_y, int dst_pitch_uv,
                                               int width, int height,
                                               float m00, float m01, float m02,
                                               float m10, float m11, float m12,
                                               float m20, float m21, float m22)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height)
        return;

    // Convert byte pitches to 16-bit pixel pitches.
    int src_pitch_y_pixels  = src_pitch_y / 2;
    int src_pitch_uv_pixels = src_pitch_uv / 2;
    int dst_pitch_y_pixels  = dst_pitch_y / 2;
    int dst_pitch_uv_pixels = dst_pitch_uv / 2;

    // For 10-bit in a 16-bit container, the chroma midpoint is 512 << 6 = 32768.
    const float chroma_center = 32768.0f;

    float src_Y = (float)src_y[x + y * src_pitch_y_pixels];

    // 4:2:0 subsampling: one chroma sample covers a 2x2 luma block.
    int chroma_x = x / 2;
    int chroma_y = y / 2;

    float src_U = (float)src_u[chroma_x + chroma_y * src_pitch_uv_pixels] - chroma_center;
    float src_V = (float)src_v[chroma_x + chroma_y * src_pitch_uv_pixels] - chroma_center;

    // Apply the 3x3 matrix to the raw (unshifted) Y/U/V triple.
    float dst_Y = m00 * src_Y + m01 * (src_U + chroma_center) + m02 * (src_V + chroma_center);
    float dst_U = m10 * src_Y + m11 * (src_U + chroma_center) + m12 * (src_V + chroma_center);
    float dst_V = m20 * src_Y + m21 * (src_U + chroma_center) + m22 * (src_V + chroma_center);

    dst_y[x + y * dst_pitch_y_pixels] = (unsigned short)clamp((int)(dst_Y + 0.5f), 0, 65535);

    // Chroma is stored once per 2x2 block; the top-left thread writes it.
    if (x % 2 == 0 && y % 2 == 0) {
        dst_u[chroma_x + chroma_y * dst_pitch_uv_pixels] = (unsigned short)clamp((int)(dst_U + 0.5f), 0, 65535);
        dst_v[chroma_x + chroma_y * dst_pitch_uv_pixels] = (unsigned short)clamp((int)(dst_V + 0.5f), 0, 65535);
    }
}

// NV12 variant: Y plane + interleaved UV plane (UVUVUV...).
__global__ void colorspace_convert_nv12_cuda(const unsigned char* src_y, const unsigned char* src_uv,
                                             unsigned char* dst_y, unsigned char* dst_uv,
                                             int src_pitch_y, int src_pitch_uv, int dst_pitch_y, int dst_pitch_uv,
                                             int width, int height,
                                             float m00, float m01, float m02,
                                             float m10, float m11, float m12,
                                             float m20, float m21, float m22)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height)
        return;

    float src_Y = (float)src_y[x + y * src_pitch_y];

    // NV12 chroma planes are ceil(width/2) x ceil(height/2), so x/2 and y/2
    // are always valid chroma coordinates for any in-bounds luma position.
    // (An earlier bounds check against width/2 returned early here, which
    // silently dropped the last Y column/row for odd dimensions.)
    int chroma_x = x / 2;
    int chroma_y = y / 2;

    int uv_idx = chroma_x * 2 + chroma_y * src_pitch_uv; // interleaved UVUVUV...

    float src_U = (float)src_uv[uv_idx] - 128.0f;
    float src_V = (float)src_uv[uv_idx + 1] - 128.0f;

    // Apply the 3x3 matrix to the raw (unshifted) Y/U/V triple.
    float dst_Y = m00 * src_Y + m01 * (src_U + 128.0f) + m02 * (src_V + 128.0f);
    float dst_U = m10 * src_Y + m11 * (src_U + 128.0f) + m12 * (src_V + 128.0f);
    float dst_V = m20 * src_Y + m21 * (src_U + 128.0f) + m22 * (src_V + 128.0f);

    dst_y[x + y * dst_pitch_y] = (unsigned char)clamp((int)(dst_Y + 0.5f), 0, 255);

    // Chroma is stored once per 2x2 block; the top-left thread writes it.
    if (x % 2 == 0 && y % 2 == 0) {
        int dst_uv_idx = chroma_x * 2 + chroma_y * dst_pitch_uv;
        dst_uv[dst_uv_idx]     = (unsigned char)clamp((int)(dst_U + 0.5f), 0, 255);
        dst_uv[dst_uv_idx + 1] = (unsigned char)clamp((int)(dst_V + 0.5f), 0, 255);
    }
}

}

diff --git a/libavfilter/vf_lut3d_cuda.c b/libavfilter/vf_lut3d_cuda.c
new file mode 100644
index 00000000000..d92c5a5a0ee
--- /dev/null
+++ b/libavfilter/vf_lut3d_cuda.c
@@ -0,0 +1,660 @@
/*
 * This file is part of FFmpeg.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include + +#include "libavutil/common.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/cuda_check.h" +#include "libavutil/internal.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/mem.h" +#include "libavutil/file_open.h" +#include "libavutil/intfloat.h" +#include "libavutil/avassert.h" +#include "libavutil/avstring.h" + +#include "avfilter.h" +#include "filters.h" +#include "video.h" +#include "lut3d.h" + +#include "cuda/load_helper.h" +#include "vf_lut3d_cuda.h" + +static const enum AVPixelFormat supported_formats[] = { + AV_PIX_FMT_0RGB32, + AV_PIX_FMT_0BGR32, + AV_PIX_FMT_RGB32, + AV_PIX_FMT_BGR32, + AV_PIX_FMT_RGB48, + AV_PIX_FMT_BGR48, + AV_PIX_FMT_RGBA64, + AV_PIX_FMT_BGRA64, +}; + +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) +#define BLOCKX 32 +#define BLOCKY 16 + +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) + +typedef struct LUT3DCudaContext { + const AVClass *class; + + AVCUDADeviceContext *hwctx; + + enum AVPixelFormat in_fmt, out_fmt; + const AVPixFmtDescriptor *in_desc, *out_desc; + + AVBufferRef *frames_ctx; + AVFrame *frame; + AVFrame *tmp_frame; + + CUcontext cu_ctx; + CUmodule cu_module; + CUfunction cu_func; + CUstream cu_stream; + CUdeviceptr cu_lut; + + struct rgbvec *lut; + int lutsize; + int lutsize2; + struct rgbvec scale; + int interpolation; + char *file; + + int bit_depth; + int passthrough; +} LUT3DCudaContext; + 
+static int format_is_supported(enum AVPixelFormat fmt) +{ + int i; + + for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) + if (supported_formats[i] == fmt) + return 1; + return 0; +} + +static int get_bit_depth(enum AVPixelFormat fmt) +{ + switch (fmt) { + case AV_PIX_FMT_0RGB32: + case AV_PIX_FMT_0BGR32: + case AV_PIX_FMT_RGB32: + case AV_PIX_FMT_BGR32: + return 8; + case AV_PIX_FMT_RGB48: + case AV_PIX_FMT_BGR48: + return 12; + case AV_PIX_FMT_RGBA64: + case AV_PIX_FMT_BGRA64: + return 10; + default: + return 8; + } +} + +static av_cold int lut3d_cuda_init(AVFilterContext * ctx) +{ + LUT3DCudaContext *s = ctx->priv; + + s->frame = av_frame_alloc(); + if (!s->frame) + return AVERROR(ENOMEM); + + s->tmp_frame = av_frame_alloc(); + if (!s->tmp_frame) + return AVERROR(ENOMEM); + + return 0; +} + +static av_cold void lut3d_cuda_uninit(AVFilterContext * ctx) +{ + LUT3DCudaContext *s = ctx->priv; + + if (s->hwctx && s->cu_module) { + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext dummy; + + CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (s->cu_lut) { + CHECK_CU(cu->cuMemFree(s->cu_lut)); + s->cu_lut = 0; + } + CHECK_CU(cu->cuModuleUnload(s->cu_module)); + s->cu_module = NULL; + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + } + + av_freep(&s->lut); + av_frame_free(&s->frame); + av_buffer_unref(&s->frames_ctx); + av_frame_free(&s->tmp_frame); +} + +static av_cold int init_hwframe_ctx(LUT3DCudaContext * s, + AVBufferRef * device_ctx, int width, + int height) +{ + AVBufferRef *out_ref = NULL; + AVHWFramesContext *out_ctx; + int ret; + + out_ref = av_hwframe_ctx_alloc(device_ctx); + if (!out_ref) + return AVERROR(ENOMEM); + out_ctx = (AVHWFramesContext *) out_ref->data; + + out_ctx->format = AV_PIX_FMT_CUDA; + out_ctx->sw_format = s->out_fmt; + out_ctx->width = FFALIGN(width, 32); + out_ctx->height = FFALIGN(height, 32); + + ret = av_hwframe_ctx_init(out_ref); + if (ret < 0) + goto fail; + + av_frame_unref(s->frame); + ret = 
av_hwframe_get_buffer(out_ref, s->frame, 0); + if (ret < 0) + goto fail; + + s->frame->width = width; + s->frame->height = height; + + av_buffer_unref(&s->frames_ctx); + s->frames_ctx = out_ref; + + return 0; + fail: + av_buffer_unref(&out_ref); + return ret; +} + +#define MAX_LINE_SIZE 512 + +static int skip_line(const char *p) +{ + while (*p && av_isspace(*p)) + p++; + return !*p || *p == '#'; +} + +#define NEXT_LINE(loop_cond) do { \ + if (!fgets(line, sizeof(line), f)) { \ + av_log(ctx, AV_LOG_ERROR, "Unexpected EOF\n"); \ + return AVERROR_INVALIDDATA; \ + } \ +} while (loop_cond) + +static int parse_cube(AVFilterContext * ctx, FILE * f) +{ + LUT3DCudaContext *lut3d = ctx->priv; + char line[MAX_LINE_SIZE]; + float min[3] = { 0.0, 0.0, 0.0 }; + float max[3] = { 1.0, 1.0, 1.0 }; + int size = 0, size2, i, j, k; + + while (fgets(line, sizeof(line), f)) { + if (!strncmp(line, "LUT_3D_SIZE", 11)) { + size = strtol(line + 12, NULL, 0); + if (size < 2 || size > MAX_LEVEL) { + av_log(ctx, AV_LOG_ERROR, + "Too large or invalid 3D LUT size\n"); + return AVERROR(EINVAL); + } + lut3d->lutsize = size; + lut3d->lutsize2 = size * size; + break; + } + } + + if (!size) { + av_log(ctx, AV_LOG_ERROR, "3D LUT size not found\n"); + return AVERROR(EINVAL); + } + + if (! 
+ (lut3d->lut = + av_malloc_array(size * size * size, sizeof(*lut3d->lut)))) + return AVERROR(ENOMEM); + + size2 = size * size; + rewind(f); + + for (i = 0; i < size; i++) { + for (j = 0; j < size; j++) { + for (k = 0; k < size; k++) { + struct rgbvec *vec = &lut3d->lut[i * size2 + j * size + k]; + + do { + try_again: + NEXT_LINE(0); + if (!strncmp(line, "DOMAIN_", 7)) { + float *vals = NULL; + if (!strncmp(line + 7, "MIN ", 4)) + vals = min; + else if (!strncmp(line + 7, "MAX ", 4)) + vals = max; + if (!vals) + return AVERROR_INVALIDDATA; + if (av_sscanf + (line + 11, "%f %f %f", vals, vals + 1, + vals + 2) != 3) + return AVERROR_INVALIDDATA; + av_log(ctx, AV_LOG_DEBUG, + "min: %f %f %f | max: %f %f %f\n", min[0], + min[1], min[2], max[0], max[1], max[2]); + goto try_again; + } else if (!strncmp(line, "TITLE", 5)) { + goto try_again; + } + } while (skip_line(line)); + if (av_sscanf(line, "%f %f %f", &vec->r, &vec->g, &vec->b) + != 3) + return AVERROR_INVALIDDATA; + vec->r = av_clipf(vec->r, 0.f, 1.f); + vec->g = av_clipf(vec->g, 0.f, 1.f); + vec->b = av_clipf(vec->b, 0.f, 1.f); + } + } + } + + lut3d->scale.r = + av_clipf(1. / (max[0] - min[0]), 0.f, 1.f) * (lut3d->lutsize - 1); + lut3d->scale.g = + av_clipf(1. / (max[1] - min[1]), 0.f, 1.f) * (lut3d->lutsize - 1); + lut3d->scale.b = + av_clipf(1. 
/ (max[2] - min[2]), 0.f, 1.f) * (lut3d->lutsize - 1); + + return 0; +} + +static av_cold int lut3d_cuda_load_cube(AVFilterContext * ctx, + const char *fname) +{ + FILE *f; + int ret; + + f = avpriv_fopen_utf8(fname, "r"); + if (!f) { + ret = AVERROR_INVALIDDATA; + av_log(ctx, AV_LOG_ERROR, "Cannot read file '%s': %s\n", fname, + av_err2str(ret)); + return ret; + } + + ret = parse_cube(ctx, f); + fclose(f); + return ret; +} + +static av_cold int lut3d_cuda_load_functions(AVFilterContext * ctx) +{ + LUT3DCudaContext *s = ctx->priv; + CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + char function_name[128]; + int ret; + + extern const unsigned char ff_vf_lut3d_cuda_ptx_data[]; + extern const unsigned int ff_vf_lut3d_cuda_ptx_len; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module, + ff_vf_lut3d_cuda_ptx_data, + ff_vf_lut3d_cuda_ptx_len); + if (ret < 0) + goto fail; + + switch (s->interpolation) { + case INTERPOLATE_NEAREST: + snprintf(function_name, sizeof(function_name), + "lut3d_interp_%d_nearest", s->bit_depth); + break; + case INTERPOLATE_TRILINEAR: + snprintf(function_name, sizeof(function_name), + "lut3d_interp_%d_trilinear", s->bit_depth); + break; + case INTERPOLATE_TETRAHEDRAL: + default: + snprintf(function_name, sizeof(function_name), + "lut3d_interp_%d_tetrahedral", s->bit_depth); + break; + } + + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_func, s->cu_module, + function_name)); + if (ret < 0) { + av_log(ctx, AV_LOG_FATAL, "Failed to load CUDA function %s\n", + function_name); + goto fail; + } + + ret = + CHECK_CU(cu-> + cuMemAlloc(&s->cu_lut, + s->lutsize * s->lutsize * s->lutsize * 3 * + sizeof(float))); + if (ret < 0) + goto fail; + + ret = + CHECK_CU(cu-> + cuMemcpyHtoD(s->cu_lut, s->lut, + s->lutsize * s->lutsize * s->lutsize * + sizeof(struct rgbvec))); + if (ret < 0) + goto fail; + + fail: + 
CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + + return ret; +} + +static av_cold int init_processing_chain(AVFilterContext * ctx, + int in_width, int in_height, + int out_width, int out_height) +{ + LUT3DCudaContext *s = ctx->priv; + FilterLink *inl = ff_filter_link(ctx->inputs[0]); + FilterLink *outl = ff_filter_link(ctx->outputs[0]); + + AVHWFramesContext *in_frames_ctx; + enum AVPixelFormat in_format; + enum AVPixelFormat out_format; + int ret; + + if (!inl->hw_frames_ctx) { + av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n"); + return AVERROR(EINVAL); + } + in_frames_ctx = (AVHWFramesContext *) inl->hw_frames_ctx->data; + in_format = in_frames_ctx->sw_format; + out_format = in_format; + + if (!format_is_supported(in_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n", + av_get_pix_fmt_name(in_format)); + return AVERROR(ENOSYS); + } + if (!format_is_supported(out_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", + av_get_pix_fmt_name(out_format)); + return AVERROR(ENOSYS); + } + + s->in_fmt = in_format; + s->out_fmt = out_format; + s->in_desc = av_pix_fmt_desc_get(s->in_fmt); + s->out_desc = av_pix_fmt_desc_get(s->out_fmt); + s->bit_depth = get_bit_depth(s->in_fmt); + + if (in_width == out_width && in_height == out_height + && in_format == out_format) { + s->frames_ctx = av_buffer_ref(inl->hw_frames_ctx); + if (!s->frames_ctx) + return AVERROR(ENOMEM); + } else { + s->passthrough = 0; + + ret = + init_hwframe_ctx(s, in_frames_ctx->device_ref, out_width, + out_height); + if (ret < 0) + return ret; + } + + outl->hw_frames_ctx = av_buffer_ref(s->frames_ctx); + if (!outl->hw_frames_ctx) + return AVERROR(ENOMEM); + + return 0; +} + +static av_cold int lut3d_cuda_config_props(AVFilterLink * outlink) +{ + AVFilterContext *ctx = outlink->src; + AVFilterLink *inlink = outlink->src->inputs[0]; + FilterLink *inl = ff_filter_link(inlink); + LUT3DCudaContext *s = ctx->priv; + AVHWFramesContext *frames_ctx; + 
AVCUDADeviceContext *device_hwctx; + int ret; + + if (s->file) { + ret = lut3d_cuda_load_cube(ctx, s->file); + if (ret < 0) + return ret; + } + + outlink->w = inlink->w; + outlink->h = inlink->h; + + ret = + init_processing_chain(ctx, inlink->w, inlink->h, outlink->w, + outlink->h); + if (ret < 0) + return ret; + + frames_ctx = (AVHWFramesContext *) inl->hw_frames_ctx->data; + device_hwctx = frames_ctx->device_ctx->hwctx; + + s->hwctx = device_hwctx; + s->cu_stream = s->hwctx->stream; + + av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d fmt:%s -> w:%d h:%d fmt:%s%s\n", + inlink->w, inlink->h, av_get_pix_fmt_name(s->in_fmt), + outlink->w, outlink->h, av_get_pix_fmt_name(s->out_fmt), + s->passthrough ? " (passthrough)" : ""); + + ret = lut3d_cuda_load_functions(ctx); + if (ret < 0) + return ret; + + return 0; +} + +static int call_lut3d_kernel(AVFilterContext * ctx, AVFrame * out, + AVFrame * in) +{ + LUT3DCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; + CUtexObject tex = 0; + int ret; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = + (s->bit_depth > + 8) ? 
CU_AD_FORMAT_UNSIGNED_INT16 : CU_AD_FORMAT_UNSIGNED_INT8, + .res.pitch2D.numChannels = 4, + .res.pitch2D.width = in->width, + .res.pitch2D.height = in->height, + .res.pitch2D.pitchInBytes = in->linesize[0], + .res.pitch2D.devPtr = (CUdeviceptr) in->data[0], + }; + + ret = + CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + + void *args[] = { + &tex, + &out->data[0], + &out->width, &out->height, &out->linesize[0], + &s->cu_lut, &s->lutsize, + &s->scale.r, &s->scale.g, &s->scale.b + }; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func, + DIV_UP(out->width, BLOCKX), + DIV_UP(out->height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, + args, NULL)); + + exit: + if (tex) + CHECK_CU(cu->cuTexObjectDestroy(tex)); + + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + + return ret; +} + +static int lut3d_cuda_filter_frame(AVFilterLink * link, AVFrame * in) +{ + AVFilterContext *ctx = link->dst; + LUT3DCudaContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + AVFrame *out = NULL; + CUcontext dummy; + int ret = 0; + + if (s->passthrough) + return ff_filter_frame(outlink, in); + + out = av_frame_alloc(); + if (!out) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) + goto fail; + + ret = av_hwframe_get_buffer(s->frames_ctx, out, 0); + if (ret < 0) + goto fail; + + ret = call_lut3d_kernel(ctx, out, in); + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + if (ret < 0) + goto fail; + + ret = av_frame_copy_props(out, in); + if (ret < 0) + goto fail; + + av_frame_free(&in); + return ff_filter_frame(outlink, out); + + fail: + av_frame_free(&in); + av_frame_free(&out); + return ret; +} + +#define OFFSET(x) offsetof(LUT3DCudaContext, x) +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM) +#define TFLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_RUNTIME_PARAM) + +static const 
AVOption lut3d_cuda_options[] = { + { "file", "set 3D LUT file name", OFFSET(file), AV_OPT_TYPE_STRING, + {.str = NULL},.flags = FLAGS }, + { "interp", "select interpolation mode", OFFSET(interpolation), + AV_OPT_TYPE_INT, {.i64 = + INTERPOLATE_TETRAHEDRAL}, 0, NB_INTERP_MODE - 1, + TFLAGS,.unit = "interp_mode" }, + { "nearest", "use values from the nearest defined points", 0, + AV_OPT_TYPE_CONST, {.i64 = INTERPOLATE_NEAREST}, 0, 0, TFLAGS,.unit = + "interp_mode" }, + { "trilinear", "interpolate values using the 8 points defining a cube", + 0, AV_OPT_TYPE_CONST, {.i64 = + INTERPOLATE_TRILINEAR}, 0, 0, TFLAGS,.unit = + "interp_mode" }, + { "tetrahedral", "interpolate values using a tetrahedron", 0, + AV_OPT_TYPE_CONST, {.i64 = + INTERPOLATE_TETRAHEDRAL}, 0, 0, TFLAGS,.unit = + "interp_mode" }, + { "passthrough", "Do not process frames at all if parameters match", + OFFSET(passthrough), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS }, + { NULL }, +}; + +static const AVClass lut3d_cuda_class = { + .class_name = "lut3d_cuda", + .item_name = av_default_item_name, + .option = lut3d_cuda_options, + .version = LIBAVUTIL_VERSION_INT, +}; + +static const AVFilterPad lut3d_cuda_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = lut3d_cuda_filter_frame, + }, +}; + +static const AVFilterPad lut3d_cuda_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = lut3d_cuda_config_props, + }, +}; + +const AVFilter ff_vf_lut3d_cuda = { + .name = "lut3d_cuda", + .description = NULL_IF_CONFIG_SMALL("GPU accelerated 3D LUT filter"), + .priv_class = &lut3d_cuda_class, + + .init = lut3d_cuda_init, + .uninit = lut3d_cuda_uninit, + + .priv_size = sizeof(LUT3DCudaContext), + + FILTER_INPUTS(lut3d_cuda_inputs), + FILTER_OUTPUTS(lut3d_cuda_outputs), + + FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA), + + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; diff --git a/libavfilter/vf_lut3d_cuda.cu b/libavfilter/vf_lut3d_cuda.cu new file mode 
100644 index 00000000000..a142299e13d --- /dev/null +++ b/libavfilter/vf_lut3d_cuda.cu @@ -0,0 +1,423 @@ +/* + * This file is part of FFmpeg. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "cuda/vector_helpers.cuh" + +#define BLOCKX 32 +#define BLOCKY 16 + +extern "C" { + +__device__ static inline float3 lerpf3(float3 a, float3 b, float t) +{ + return make_float3( + a.x + (b.x - a.x) * t, + a.y + (b.y - a.y) * t, + a.z + (b.z - a.z) * t + ); +} + +__device__ static inline float3 interp_nearest_cuda(float3 *lut, int lutsize, float3 s) +{ + int r = (int)(s.x + 0.5f); + int g = (int)(s.y + 0.5f); + int b = (int)(s.z + 0.5f); + + r = min(max(r, 0), lutsize - 1); + g = min(max(g, 0), lutsize - 1); + b = min(max(b, 0), lutsize - 1); + + return lut[r * lutsize * lutsize + g * lutsize + b]; +} + +__device__ static inline float3 interp_trilinear_cuda(float3 *lut, int lutsize, float3 s) +{ + int lutsize2 = lutsize * lutsize; + + int prev_r = (int)s.x; + int prev_g = (int)s.y; + int prev_b = (int)s.z; + + int next_r = min(prev_r + 1, lutsize - 1); + int next_g = min(prev_g + 1, lutsize - 1); + int next_b = min(prev_b + 1, lutsize - 1); + + float3 d = make_float3(s.x - prev_r, s.y - prev_g, s.z - prev_b); + + float3 c000 = lut[prev_r * lutsize2 + prev_g * lutsize + prev_b]; + float3 c001 = lut[prev_r * lutsize2 + prev_g * lutsize + next_b]; + float3 c010 = lut[prev_r * lutsize2 + next_g * lutsize + prev_b]; + float3 c011 = lut[prev_r * lutsize2 + next_g * lutsize + next_b]; + float3 c100 = lut[next_r * lutsize2 + prev_g * lutsize + prev_b]; + float3 c101 = lut[next_r * lutsize2 + prev_g * lutsize + next_b]; + float3 c110 = lut[next_r * lutsize2 + next_g * lutsize + prev_b]; + float3 c111 = lut[next_r * lutsize2 + next_g * lutsize + next_b]; + + float3 c00 = lerpf3(c000, c100, d.x); + float3 c10 = lerpf3(c010, c110, d.x); + float3 c01 = lerpf3(c001, c101, d.x); + float3 c11 = lerpf3(c011, c111, d.x); + float3 c0 = lerpf3(c00, c10, d.y); + float3 c1 = lerpf3(c01, c11, d.y); + + return lerpf3(c0, c1, d.z); +} + +__device__ static inline float3 interp_tetrahedral_cuda(float3 *lut, int lutsize, float3 s) +{ + int lutsize2 = lutsize * lutsize; + + int 
prev_r = (int)s.x; + int prev_g = (int)s.y; + int prev_b = (int)s.z; + + int next_r = min(prev_r + 1, lutsize - 1); + int next_g = min(prev_g + 1, lutsize - 1); + int next_b = min(prev_b + 1, lutsize - 1); + + float3 d = make_float3(s.x - prev_r, s.y - prev_g, s.z - prev_b); + + float3 c000 = lut[prev_r * lutsize2 + prev_g * lutsize + prev_b]; + float3 c111 = lut[next_r * lutsize2 + next_g * lutsize + next_b]; + + float3 c; + if (d.x > d.y && d.x > d.z) { + if (d.y > d.z) { + float3 c100 = lut[next_r * lutsize2 + prev_g * lutsize + prev_b]; + float3 c110 = lut[next_r * lutsize2 + next_g * lutsize + prev_b]; + c = make_float3( + c000.x + d.x * (c100.x - c000.x) + d.y * (c110.x - c100.x) + d.z * (c111.x - c110.x), + c000.y + d.x * (c100.y - c000.y) + d.y * (c110.y - c100.y) + d.z * (c111.y - c110.y), + c000.z + d.x * (c100.z - c000.z) + d.y * (c110.z - c100.z) + d.z * (c111.z - c110.z) + ); + } else { + float3 c100 = lut[next_r * lutsize2 + prev_g * lutsize + prev_b]; + float3 c101 = lut[next_r * lutsize2 + prev_g * lutsize + next_b]; + c = make_float3( + c000.x + d.x * (c100.x - c000.x) + d.z * (c101.x - c100.x) + d.y * (c111.x - c101.x), + c000.y + d.x * (c100.y - c000.y) + d.z * (c101.y - c100.y) + d.y * (c111.y - c101.y), + c000.z + d.x * (c100.z - c000.z) + d.z * (c101.z - c100.z) + d.y * (c111.z - c101.z) + ); + } + } else if (d.y > d.z) { + if (d.x > d.z) { + float3 c010 = lut[prev_r * lutsize2 + next_g * lutsize + prev_b]; + float3 c110 = lut[next_r * lutsize2 + next_g * lutsize + prev_b]; + c = make_float3( + c000.x + d.y * (c010.x - c000.x) + d.x * (c110.x - c010.x) + d.z * (c111.x - c110.x), + c000.y + d.y * (c010.y - c000.y) + d.x * (c110.y - c010.y) + d.z * (c111.y - c110.y), + c000.z + d.y * (c010.z - c000.z) + d.x * (c110.z - c010.z) + d.z * (c111.z - c110.z) + ); + } else { + float3 c010 = lut[prev_r * lutsize2 + next_g * lutsize + prev_b]; + float3 c011 = lut[prev_r * lutsize2 + next_g * lutsize + next_b]; + c = make_float3( + c000.x + d.y * (c010.x 
- c000.x) + d.z * (c011.x - c010.x) + d.x * (c111.x - c011.x), + c000.y + d.y * (c010.y - c000.y) + d.z * (c011.y - c010.y) + d.x * (c111.y - c011.y), + c000.z + d.y * (c010.z - c000.z) + d.z * (c011.z - c010.z) + d.x * (c111.z - c011.z) + ); + } + } else { + if (d.x > d.y) { + float3 c001 = lut[prev_r * lutsize2 + prev_g * lutsize + next_b]; + float3 c101 = lut[next_r * lutsize2 + prev_g * lutsize + next_b]; + c = make_float3( + c000.x + d.z * (c001.x - c000.x) + d.x * (c101.x - c001.x) + d.y * (c111.x - c101.x), + c000.y + d.z * (c001.y - c000.y) + d.x * (c101.y - c001.y) + d.y * (c111.y - c101.y), + c000.z + d.z * (c001.z - c000.z) + d.x * (c101.z - c001.z) + d.y * (c111.z - c101.z) + ); + } else { + float3 c001 = lut[prev_r * lutsize2 + prev_g * lutsize + next_b]; + float3 c011 = lut[prev_r * lutsize2 + next_g * lutsize + next_b]; + c = make_float3( + c000.x + d.z * (c001.x - c000.x) + d.y * (c011.x - c001.x) + d.x * (c111.x - c011.x), + c000.y + d.z * (c001.y - c000.y) + d.y * (c011.y - c001.y) + d.x * (c111.y - c011.y), + c000.z + d.z * (c001.z - c000.z) + d.y * (c011.z - c001.z) + d.x * (c111.z - c011.z) + ); + } + } + + return c; +} + +__global__ void lut3d_interp_8_nearest( + cudaTextureObject_t src_tex, + uchar4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + uchar4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + src.x / 255.0f * scale_r, + src.y / 255.0f * scale_g, + src.z / 255.0f * scale_b + ); + + float3 result = interp_nearest_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(uchar4)) + x] = make_uchar4( + (unsigned char)(result.x * 255.0f + 0.5f), + (unsigned char)(result.y * 255.0f + 0.5f), + (unsigned char)(result.z * 255.0f + 0.5f), + src.w + ); +} + +__global__ void lut3d_interp_10_nearest( + 
cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 6) * scale_r, // Convert 10-bit to normalized range + (src.y >> 6) * scale_g, + (src.z >> 6) * scale_b + ); + + float3 result = interp_nearest_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 1023.0f + 0.5f) << 6, // Convert back to 10-bit in 16-bit container + (unsigned short)(result.y * 1023.0f + 0.5f) << 6, + (unsigned short)(result.z * 1023.0f + 0.5f) << 6, + src.w + ); +} + +__global__ void lut3d_interp_12_nearest( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 4) * scale_r, // Convert 12-bit to normalized range + (src.y >> 4) * scale_g, + (src.z >> 4) * scale_b + ); + + float3 result = interp_nearest_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 4095.0f + 0.5f) << 4, // Convert back to 12-bit in 16-bit container + (unsigned short)(result.y * 4095.0f + 0.5f) << 4, + (unsigned short)(result.z * 4095.0f + 0.5f) << 4, + src.w + ); +} + +__global__ void lut3d_interp_8_trilinear( + cudaTextureObject_t src_tex, + uchar4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + 
int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + uchar4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + src.x / 255.0f * scale_r, + src.y / 255.0f * scale_g, + src.z / 255.0f * scale_b + ); + + float3 result = interp_trilinear_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(uchar4)) + x] = make_uchar4( + (unsigned char)(result.x * 255.0f + 0.5f), + (unsigned char)(result.y * 255.0f + 0.5f), + (unsigned char)(result.z * 255.0f + 0.5f), + src.w + ); +} + +__global__ void lut3d_interp_10_trilinear( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 6) / 1023.0f * scale_r, + (src.y >> 6) / 1023.0f * scale_g, + (src.z >> 6) / 1023.0f * scale_b + ); + + float3 result = interp_trilinear_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 1023.0f + 0.5f) << 6, + (unsigned short)(result.y * 1023.0f + 0.5f) << 6, + (unsigned short)(result.z * 1023.0f + 0.5f) << 6, + src.w + ); +} + +__global__ void lut3d_interp_12_trilinear( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 4) / 4095.0f * scale_r, + (src.y >> 4) / 4095.0f * scale_g, + (src.z >> 4) / 4095.0f * scale_b + ); + + float3 result = interp_trilinear_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / 
sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 4095.0f + 0.5f) << 4, + (unsigned short)(result.y * 4095.0f + 0.5f) << 4, + (unsigned short)(result.z * 4095.0f + 0.5f) << 4, + src.w + ); +} + +__global__ void lut3d_interp_8_tetrahedral( + cudaTextureObject_t src_tex, + uchar4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + uchar4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + src.x / 255.0f * scale_r, + src.y / 255.0f * scale_g, + src.z / 255.0f * scale_b + ); + + float3 result = interp_tetrahedral_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(uchar4)) + x] = make_uchar4( + (unsigned char)(result.x * 255.0f + 0.5f), + (unsigned char)(result.y * 255.0f + 0.5f), + (unsigned char)(result.z * 255.0f + 0.5f), + src.w + ); +} + +__global__ void lut3d_interp_10_tetrahedral( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 *lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 6) / 1023.0f * scale_r, + (src.y >> 6) / 1023.0f * scale_g, + (src.z >> 6) / 1023.0f * scale_b + ); + + float3 result = interp_tetrahedral_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 1023.0f + 0.5f) << 6, + (unsigned short)(result.y * 1023.0f + 0.5f) << 6, + (unsigned short)(result.z * 1023.0f + 0.5f) << 6, + src.w + ); +} + +__global__ void lut3d_interp_12_tetrahedral( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + float3 
*lut, int lutsize, float scale_r, float scale_g, float scale_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + + float3 s = make_float3( + (src.x >> 4) / 4095.0f * scale_r, + (src.y >> 4) / 4095.0f * scale_g, + (src.z >> 4) / 4095.0f * scale_b + ); + + float3 result = interp_tetrahedral_cuda(lut, lutsize, s); + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(result.x * 4095.0f + 0.5f) << 4, + (unsigned short)(result.y * 4095.0f + 0.5f) << 4, + (unsigned short)(result.z * 4095.0f + 0.5f) << 4, + src.w + ); +} + +} \ No newline at end of file diff --git a/libavfilter/vf_lut3d_cuda.h b/libavfilter/vf_lut3d_cuda.h new file mode 100644 index 00000000000..967177a3df3 --- /dev/null +++ b/libavfilter/vf_lut3d_cuda.h @@ -0,0 +1,26 @@ +/* + * This file is part of FFmpeg. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef AVFILTER_LUT3D_CUDA_H +#define AVFILTER_LUT3D_CUDA_H + +#endif diff --git a/libavfilter/vf_showinfo_cuda.c b/libavfilter/vf_showinfo_cuda.c new file mode 100644 index 00000000000..f634556e607 --- /dev/null +++ b/libavfilter/vf_showinfo_cuda.c @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2025 Claude Code Assistant + * Based on the original showinfo filter by Stefano Sabatini + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * CUDA-compatible filter for showing textual video frame information + */ + +#include +#include + +#include "libavutil/buffer.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/cuda_check.h" +#include "libavutil/bswap.h" +#include "libavutil/adler32.h" +#include "libavutil/display.h" +#include "libavutil/dovi_meta.h" +#include "libavutil/imgutils.h" +#include "libavutil/internal.h" +#include "libavutil/film_grain_params.h" +#include "libavutil/hdr_dynamic_metadata.h" +#include "libavutil/hdr_dynamic_vivid_metadata.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/spherical.h" +#include "libavutil/stereo3d.h" +#include "libavutil/timestamp.h" +#include "libavutil/timecode.h" +#include "libavutil/mastering_display_metadata.h" +#include "libavutil/video_enc_params.h" +#include "libavutil/detection_bbox.h" +#include "libavutil/ambient_viewing_environment.h" +#include "libavutil/uuid.h" + +#include "avfilter.h" +#include "filters.h" +#include "formats.h" +#include "video.h" + +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) + +typedef struct ShowInfoCudaContext { + const AVClass *class; + + AVCUDADeviceContext *hwctx; + AVBufferRef *hw_frames_ctx; + + int calculate_checksums; + int udu_sei_as_ascii; + + AVFrame *tmp_frame; +} ShowInfoCudaContext; + +#define OFFSET(x) offsetof(ShowInfoCudaContext, x) +#define VF AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM + +static const AVOption showinfo_cuda_options[] = { + { "checksum", "calculate checksums", OFFSET(calculate_checksums), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, VF }, + { "udu_sei_as_ascii", "try to print user data unregistered SEI as ascii character when possible", 
+ OFFSET(udu_sei_as_ascii), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, VF }, + { NULL } +}; + +AVFILTER_DEFINE_CLASS(showinfo_cuda); + +static const enum AVPixelFormat supported_formats[] = { + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NV12, + AV_PIX_FMT_YUV444P, + AV_PIX_FMT_P010, + AV_PIX_FMT_P016, + AV_PIX_FMT_YUV444P16, + AV_PIX_FMT_0RGB32, + AV_PIX_FMT_0BGR32, + AV_PIX_FMT_RGB32, + AV_PIX_FMT_BGR32, +}; + +static int format_is_supported(enum AVPixelFormat fmt) +{ + for (int i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) + if (supported_formats[i] == fmt) + return 1; + return 0; +} + +static int showinfo_cuda_query_formats(const AVFilterContext *avctx, + AVFilterFormatsConfig **cfg_in, + AVFilterFormatsConfig **cfg_out) +{ + int err; + + if ((err = ff_formats_ref(ff_formats_pixdesc_filter(AV_PIX_FMT_FLAG_HWACCEL, 0), + &cfg_in[0]->formats)) < 0) + return err; + + if ((err = ff_formats_ref(ff_formats_pixdesc_filter(AV_PIX_FMT_FLAG_HWACCEL, 0), + &cfg_out[0]->formats)) < 0) + return err; + + return 0; +} + +static int showinfo_cuda_config_props(AVFilterLink *outlink) +{ + AVFilterContext *ctx = outlink->src; + AVFilterLink *inlink = outlink->src->inputs[0]; + ShowInfoCudaContext *s = ctx->priv; + FilterLink *il = ff_filter_link(inlink); + FilterLink *ol = ff_filter_link(outlink); + AVHWFramesContext *frames_ctx; + + av_buffer_unref(&s->hw_frames_ctx); + + if (!il->hw_frames_ctx) { + av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n"); + return AVERROR(EINVAL); + } + + s->hw_frames_ctx = av_buffer_ref(il->hw_frames_ctx); + if (!s->hw_frames_ctx) + return AVERROR(ENOMEM); + + frames_ctx = (AVHWFramesContext*)s->hw_frames_ctx->data; + s->hwctx = frames_ctx->device_ctx->hwctx; + + if (!format_is_supported(frames_ctx->sw_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n", + av_get_pix_fmt_name(frames_ctx->sw_format)); + return AVERROR(ENOSYS); + } + + ol->hw_frames_ctx = av_buffer_ref(s->hw_frames_ctx); + if (!ol->hw_frames_ctx) + return 
AVERROR(ENOMEM); + + return 0; +} + +// Include all the dump functions from the original showinfo filter +// (These functions work on metadata which doesn't require CUDA processing) + +static void dump_spherical(AVFilterContext *ctx, AVFrame *frame, const AVFrameSideData *sd) +{ + const AVSphericalMapping *spherical = (const AVSphericalMapping *)sd->data; + double yaw, pitch, roll; + + av_log(ctx, AV_LOG_INFO, "%s ", av_spherical_projection_name(spherical->projection)); + + if (spherical->yaw || spherical->pitch || spherical->roll) { + yaw = ((double)spherical->yaw) / (1 << 16); + pitch = ((double)spherical->pitch) / (1 << 16); + roll = ((double)spherical->roll) / (1 << 16); + av_log(ctx, AV_LOG_INFO, "(%f/%f/%f) ", yaw, pitch, roll); + } + + if (spherical->projection == AV_SPHERICAL_EQUIRECTANGULAR_TILE) { + size_t l, t, r, b; + av_spherical_tile_bounds(spherical, frame->width, frame->height, + &l, &t, &r, &b); + av_log(ctx, AV_LOG_INFO, + "[%"SIZE_SPECIFIER", %"SIZE_SPECIFIER", %"SIZE_SPECIFIER", %"SIZE_SPECIFIER"] ", + l, t, r, b); + } else if (spherical->projection == AV_SPHERICAL_CUBEMAP) { + av_log(ctx, AV_LOG_INFO, "[pad %"PRIu32"] ", spherical->padding); + } +} + +static void dump_color_property(AVFilterContext *ctx, AVFrame *frame) +{ + const char *color_range_str = av_color_range_name(frame->color_range); + const char *colorspace_str = av_color_space_name(frame->colorspace); + const char *color_primaries_str = av_color_primaries_name(frame->color_primaries); + const char *color_trc_str = av_color_transfer_name(frame->color_trc); + + if (!color_range_str || frame->color_range == AVCOL_RANGE_UNSPECIFIED) { + av_log(ctx, AV_LOG_INFO, "color_range:unknown"); + } else { + av_log(ctx, AV_LOG_INFO, "color_range:%s", color_range_str); + } + + if (!colorspace_str || frame->colorspace == AVCOL_SPC_UNSPECIFIED) { + av_log(ctx, AV_LOG_INFO, " color_space:unknown"); + } else { + av_log(ctx, AV_LOG_INFO, " color_space:%s", colorspace_str); + } + + if 
(!color_primaries_str || frame->color_primaries == AVCOL_PRI_UNSPECIFIED) { + av_log(ctx, AV_LOG_INFO, " color_primaries:unknown"); + } else { + av_log(ctx, AV_LOG_INFO, " color_primaries:%s", color_primaries_str); + } + + if (!color_trc_str || frame->color_trc == AVCOL_TRC_UNSPECIFIED) { + av_log(ctx, AV_LOG_INFO, " color_trc:unknown"); + } else { + av_log(ctx, AV_LOG_INFO, " color_trc:%s", color_trc_str); + } + av_log(ctx, AV_LOG_INFO, "\n"); +} + +static int filter_frame(AVFilterLink *inlink, AVFrame *frame) +{ + FilterLink *inl = ff_filter_link(inlink); + AVFilterContext *ctx = inlink->dst; + ShowInfoCudaContext *s = ctx->priv; + const AVPixFmtDescriptor *desc; + AVHWFramesContext *frames_ctx; + uint32_t plane_checksum[4] = {0}, checksum = 0; + int32_t pixelcount[4] = {0}; + int err, i; + + if (!frame->hw_frames_ctx) { + av_log(ctx, AV_LOG_ERROR, "Input frame missing hw_frames_ctx\n"); + return AVERROR(EINVAL); + } + + frames_ctx = (AVHWFramesContext*)frame->hw_frames_ctx->data; + desc = av_pix_fmt_desc_get(frames_ctx->sw_format); + + // For CUDA frames, we show info without downloading unless checksums are requested + av_log(ctx, AV_LOG_INFO, + "n:%4"PRId64" pts:%7s pts_time:%-7s duration:%7"PRId64 + " duration_time:%-7s " + "fmt:%s(cuda) cl:%s sar:%d/%d s:%dx%d i:%c iskey:%d type:%c ", + inl->frame_count_out, + av_ts2str(frame->pts), av_ts2timestr(frame->pts, &inlink->time_base), + frame->duration, av_ts2timestr(frame->duration, &inlink->time_base), + desc->name, av_chroma_location_name(frame->chroma_location), + frame->sample_aspect_ratio.num, frame->sample_aspect_ratio.den, + frame->width, frame->height, + !(frame->flags & AV_FRAME_FLAG_INTERLACED) ? 'P' : /* Progressive */ + (frame->flags & AV_FRAME_FLAG_TOP_FIELD_FIRST) ? 
'T' : 'B', /* Top / Bottom */ + !!(frame->flags & AV_FRAME_FLAG_KEY), + av_get_picture_type_char(frame->pict_type)); + + // If checksums are requested, we need to download the frame + if (s->calculate_checksums) { + if (!s->tmp_frame) { + s->tmp_frame = av_frame_alloc(); + if (!s->tmp_frame) + return AVERROR(ENOMEM); + } + + err = av_hwframe_transfer_data(s->tmp_frame, frame, 0); + if (err < 0) { + av_log(ctx, AV_LOG_WARNING, "Failed to download frame for checksum calculation: %s\n", + av_err2str(err)); + av_log(ctx, AV_LOG_INFO, "checksum:N/A"); + } else { + // Calculate checksums on downloaded frame + int bitdepth = desc->comp[0].depth; + int plane, vsub = desc->log2_chroma_h; + + for (plane = 0; plane < 4 && s->tmp_frame->data[plane] && s->tmp_frame->linesize[plane]; plane++) { + uint8_t *data = s->tmp_frame->data[plane]; + int h = plane == 1 || plane == 2 ? AV_CEIL_RSHIFT(inlink->h, vsub) : inlink->h; + int linesize = av_image_get_linesize(s->tmp_frame->format, s->tmp_frame->width, plane); + int width = linesize >> (bitdepth > 8); + + if (linesize < 0) { + av_frame_unref(s->tmp_frame); + return linesize; + } + + for (i = 0; i < h; i++) { + plane_checksum[plane] = av_adler32_update(plane_checksum[plane], data, linesize); + checksum = av_adler32_update(checksum, data, linesize); + pixelcount[plane] += width; + data += s->tmp_frame->linesize[plane]; + } + } + + av_log(ctx, AV_LOG_INFO, + "checksum:%08"PRIX32" plane_checksum:[%08"PRIX32, + checksum, plane_checksum[0]); + + for (plane = 1; plane < 4 && s->tmp_frame->data[plane] && s->tmp_frame->linesize[plane]; plane++) + av_log(ctx, AV_LOG_INFO, " %08"PRIX32, plane_checksum[plane]); + av_log(ctx, AV_LOG_INFO, "]"); + } + + av_frame_unref(s->tmp_frame); + } + + av_log(ctx, AV_LOG_INFO, "\n"); + + // Show side data (this doesn't require frame data download) + for (i = 0; i < frame->nb_side_data; i++) { + AVFrameSideData *sd = frame->side_data[i]; + const char *name = av_frame_side_data_name(sd->type); + + av_log(ctx, 
AV_LOG_INFO, " side data - "); + if (name) + av_log(ctx, AV_LOG_INFO, "%s: ", name); + + switch (sd->type) { + case AV_FRAME_DATA_SPHERICAL: + dump_spherical(ctx, frame, sd); + break; + case AV_FRAME_DATA_DISPLAYMATRIX: + av_log(ctx, AV_LOG_INFO, "rotation of %.2f degrees", + av_display_rotation_get((int32_t *)sd->data)); + break; + case AV_FRAME_DATA_AFD: + av_log(ctx, AV_LOG_INFO, "value of %"PRIu8, sd->data[0]); + break; + default: + if (name) + av_log(ctx, AV_LOG_INFO, + "(%"SIZE_SPECIFIER" bytes)", sd->size); + else + av_log(ctx, AV_LOG_WARNING, "unknown side data type %d " + "(%"SIZE_SPECIFIER" bytes)", sd->type, sd->size); + break; + } + + av_log(ctx, AV_LOG_INFO, "\n"); + } + + dump_color_property(ctx, frame); + + return ff_filter_frame(inlink->dst->outputs[0], frame); +} + +static av_cold void showinfo_cuda_uninit(AVFilterContext *ctx) +{ + ShowInfoCudaContext *s = ctx->priv; + + av_frame_free(&s->tmp_frame); + av_buffer_unref(&s->hw_frames_ctx); +} + +static const AVFilterPad showinfo_cuda_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .filter_frame = filter_frame, + }, +}; + +static const AVFilterPad showinfo_cuda_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = showinfo_cuda_config_props, + }, +}; + +const AVFilter ff_vf_showinfo_cuda = { + .name = "showinfo_cuda", + .description = NULL_IF_CONFIG_SMALL("Show textual information for each CUDA video frame."), + .priv_class = &showinfo_cuda_class, + .flags = AVFILTER_FLAG_METADATA_ONLY, + .priv_size = sizeof(ShowInfoCudaContext), + .uninit = showinfo_cuda_uninit, + FILTER_INPUTS(showinfo_cuda_inputs), + FILTER_OUTPUTS(showinfo_cuda_outputs), + FILTER_QUERY_FUNC2(showinfo_cuda_query_formats), + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; \ No newline at end of file diff --git a/libavfilter/vf_tonemap_cuda.c b/libavfilter/vf_tonemap_cuda.c new file mode 100644 index 00000000000..02b0347293d --- /dev/null +++ b/libavfilter/vf_tonemap_cuda.c 
@@ -0,0 +1,925 @@ +/* + * This file is part of FFmpeg. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include +#include + +#include "libavutil/common.h" +#include "libavutil/hwcontext.h" +#include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/cuda_check.h" +#include "libavutil/internal.h" +#include "colorspace.h" +#include "libavutil/opt.h" +#include "libavutil/pixdesc.h" +#include "libavutil/csp.h" + +#include "avfilter.h" +#include "filters.h" +#include "video.h" + +#include "cuda/load_helper.h" +#include "vf_tonemap_cuda.h" +#include "cuda/cuda_async_queue.h" + +static const enum AVPixelFormat supported_formats[] = { + AV_PIX_FMT_GBRPF32, + AV_PIX_FMT_RGB48, + AV_PIX_FMT_RGBA64, + AV_PIX_FMT_P010LE, + AV_PIX_FMT_P016LE, + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_YUV420P10LE, + AV_PIX_FMT_YUV420P16LE, + AV_PIX_FMT_NV12, +}; + +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) +#define BLOCKX 32 +#define BLOCKY 16 + +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) + +enum TonemapAlgorithm { + TONEMAP_NONE, + TONEMAP_LINEAR, + TONEMAP_GAMMA, + TONEMAP_CLIP, + TONEMAP_REINHARD, + TONEMAP_HABLE, + TONEMAP_MOBIUS, + TONEMAP_MAX, +}; + +typedef struct TonemapCudaContext { + const AVClass *class; + + AVCUDADeviceContext *hwctx; + + enum AVPixelFormat in_fmt, out_fmt; + const AVPixFmtDescriptor *in_desc, *out_desc; + + AVBufferRef *frames_ctx; + AVFrame *frame; + AVFrame *tmp_frame; + + CUcontext cu_ctx; + CUmodule cu_module; + CUfunction cu_func; + CUstream cu_stream; + + + enum TonemapAlgorithm tonemap; + double param; + double desat; + double peak; + + const AVLumaCoefficients *coeffs; + int passthrough; + + int async_depth; + int async_streams; + CudaAsyncQueue async_queue; +} TonemapCudaContext; + +static int format_is_supported(enum AVPixelFormat fmt) +{ + int i; + + for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) + if (supported_formats[i] == fmt) + return 1; + return 0; +} + +static av_cold int tonemap_cuda_init(AVFilterContext * ctx) +{ + TonemapCudaContext *s = ctx->priv; + + s->frame = av_frame_alloc(); + if 
(!s->frame) + return AVERROR(ENOMEM); + + s->tmp_frame = av_frame_alloc(); + if (!s->tmp_frame) + return AVERROR(ENOMEM); + + + switch (s->tonemap) { + case TONEMAP_GAMMA: + if (isnan(s->param)) + s->param = 1.8f; + break; + case TONEMAP_REINHARD: + if (!isnan(s->param)) + s->param = (1.0f - s->param) / s->param; + break; + case TONEMAP_MOBIUS: + if (isnan(s->param)) + s->param = 0.3f; + break; + } + + if (isnan(s->param)) + s->param = 1.0f; + + return 0; +} + +static int process_frame_async(void *filter_ctx, AVFrame * out, + AVFrame * in, CUstream stream); + +static av_cold void tonemap_cuda_uninit(AVFilterContext * ctx) +{ + TonemapCudaContext *s = ctx->priv; + + + if (s->hwctx && s->cu_module) { + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext dummy; + + CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + CHECK_CU(cu->cuModuleUnload(s->cu_module)); + s->cu_module = NULL; + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + } + + av_frame_free(&s->frame); + av_frame_free(&s->tmp_frame); + + // Clean up async queue BEFORE freeing frames_ctx to ensure + // that async frames can properly return their buffer references + if (s->async_depth > 1 || s->async_streams > 1) { + ff_cuda_async_queue_uninit(&s->async_queue); + } + + av_buffer_unref(&s->frames_ctx); +} + +static av_cold int init_hwframe_ctx(TonemapCudaContext * s, + AVBufferRef * device_ctx, int width, + int height) +{ + AVBufferRef *out_ref = NULL; + AVHWFramesContext *out_ctx; + int ret; + + out_ref = av_hwframe_ctx_alloc(device_ctx); + if (!out_ref) + return AVERROR(ENOMEM); + out_ctx = (AVHWFramesContext *) out_ref->data; + + out_ctx->format = AV_PIX_FMT_CUDA; + out_ctx->sw_format = s->out_fmt; + out_ctx->width = FFALIGN(width, 32); + out_ctx->height = FFALIGN(height, 32); + + ret = av_hwframe_ctx_init(out_ref); + if (ret < 0) + goto fail; + + av_frame_unref(s->frame); + ret = av_hwframe_get_buffer(out_ref, s->frame, 0); + if (ret < 0) + goto fail; + + s->frame->width = width; + s->frame->height = 
height; + + av_buffer_unref(&s->frames_ctx); + s->frames_ctx = out_ref; + + return 0; + fail: + av_buffer_unref(&out_ref); + return ret; +} + +static av_cold int tonemap_cuda_load_functions(AVFilterContext * ctx) +{ + TonemapCudaContext *s = ctx->priv; + CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + char function_name[128]; + int ret; + + extern const unsigned char ff_vf_tonemap_cuda_ptx_data[]; + extern const unsigned int ff_vf_tonemap_cuda_ptx_len; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + ret = ff_cuda_load_module(ctx, s->hwctx, &s->cu_module, + ff_vf_tonemap_cuda_ptx_data, + ff_vf_tonemap_cuda_ptx_len); + if (ret < 0) + goto fail; + + if (s->in_fmt == AV_PIX_FMT_GBRPF32) { + snprintf(function_name, sizeof(function_name), + "tonemap_cuda_planar_float"); + } else + if ((s->in_fmt == AV_PIX_FMT_P010LE + || s->in_fmt == AV_PIX_FMT_P016LE) + && s->out_fmt == AV_PIX_FMT_NV12) { + snprintf(function_name, sizeof(function_name), + "tonemap_cuda_p016_to_nv12"); + } else if (s->in_fmt == AV_PIX_FMT_YUV420P10LE + || s->in_fmt == AV_PIX_FMT_YUV420P16LE) { + snprintf(function_name, sizeof(function_name), + "tonemap_cuda_yuv420p10"); + } else { + snprintf(function_name, sizeof(function_name), + "tonemap_cuda_16bit"); + } + + ret = + CHECK_CU(cu-> + cuModuleGetFunction(&s->cu_func, s->cu_module, + function_name)); + if (ret < 0) { + av_log(ctx, AV_LOG_FATAL, "Failed to load CUDA function %s\n", + function_name); + goto fail; + } + + fail: + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + + return ret; +} + +static av_cold int init_processing_chain(AVFilterContext * ctx, + int in_width, int in_height, + int out_width, int out_height) +{ + TonemapCudaContext *s = ctx->priv; + FilterLink *inl = ff_filter_link(ctx->inputs[0]); + FilterLink *outl = ff_filter_link(ctx->outputs[0]); + + AVHWFramesContext *in_frames_ctx; + enum AVPixelFormat in_format; + enum AVPixelFormat out_format; + int ret; 
+ + if (!inl->hw_frames_ctx) { + av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n"); + return AVERROR(EINVAL); + } + in_frames_ctx = (AVHWFramesContext *) inl->hw_frames_ctx->data; + in_format = in_frames_ctx->sw_format; + // Convert HDR to SDR format for encoder compatibility + if (in_format == AV_PIX_FMT_P010LE || in_format == AV_PIX_FMT_P016LE) { + out_format = AV_PIX_FMT_NV12; + } else if (in_format == AV_PIX_FMT_YUV420P10LE + || in_format == AV_PIX_FMT_YUV420P16LE) { + out_format = AV_PIX_FMT_YUV420P; + } else { + out_format = in_format; + } + + if (!format_is_supported(in_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported input format: %s\n", + av_get_pix_fmt_name(in_format)); + return AVERROR(ENOSYS); + } + if (!format_is_supported(out_format)) { + av_log(ctx, AV_LOG_ERROR, "Unsupported output format: %s\n", + av_get_pix_fmt_name(out_format)); + return AVERROR(ENOSYS); + } + + s->in_fmt = in_format; + s->out_fmt = out_format; + s->in_desc = av_pix_fmt_desc_get(s->in_fmt); + s->out_desc = av_pix_fmt_desc_get(s->out_fmt); + + if (in_width == out_width && in_height == out_height + && in_format == out_format) { + s->frames_ctx = av_buffer_ref(inl->hw_frames_ctx); + if (!s->frames_ctx) + return AVERROR(ENOMEM); + } else { + s->passthrough = 0; + + ret = + init_hwframe_ctx(s, in_frames_ctx->device_ref, out_width, + out_height); + if (ret < 0) + return ret; + } + + outl->hw_frames_ctx = av_buffer_ref(s->frames_ctx); + if (!outl->hw_frames_ctx) + return AVERROR(ENOMEM); + + return 0; +} + +static av_cold int tonemap_cuda_config_props(AVFilterLink * outlink) +{ + AVFilterContext *ctx = outlink->src; + AVFilterLink *inlink = outlink->src->inputs[0]; + FilterLink *inl = ff_filter_link(inlink); + TonemapCudaContext *s = ctx->priv; + AVHWFramesContext *frames_ctx; + AVCUDADeviceContext *device_hwctx; + int ret; + + outlink->w = inlink->w; + outlink->h = inlink->h; + + ret = + init_processing_chain(ctx, inlink->w, inlink->h, outlink->w, + outlink->h); + if 
(ret < 0) + return ret; + + frames_ctx = (AVHWFramesContext *) inl->hw_frames_ctx->data; + device_hwctx = frames_ctx->device_ctx->hwctx; + + s->hwctx = device_hwctx; + s->cu_stream = s->hwctx->stream; + + s->coeffs = av_csp_luma_coeffs_from_avcsp(inlink->colorspace); + + av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d fmt:%s -> w:%d h:%d fmt:%s%s\n", + inlink->w, inlink->h, av_get_pix_fmt_name(s->in_fmt), + outlink->w, outlink->h, av_get_pix_fmt_name(s->out_fmt), + s->passthrough ? " (passthrough)" : ""); + + ret = tonemap_cuda_load_functions(ctx); + if (ret < 0) + return ret; + + if (s->async_depth > 1 || s->async_streams > 1) { + av_log(ctx, AV_LOG_DEBUG, + "Async processing enabled: depth=%d streams=%d\n", + s->async_depth, s->async_streams); + + ret = ff_cuda_async_queue_init(&s->async_queue, s->hwctx, + s->async_depth, s->async_streams, + ctx, process_frame_async); + if (ret < 0) { + av_log(ctx, AV_LOG_ERROR, + "Failed to initialize async queue\n"); + return ret; + } + } + + return 0; +} + +static int call_tonemap_kernel(AVFilterContext * ctx, AVFrame * out, + AVFrame * in) +{ + TonemapCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUcontext dummy, cuda_ctx = s->hwctx->cuda_ctx; + CUtexObject tex = 0; + int ret; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; + + if (s->in_fmt == AV_PIX_FMT_GBRPF32) { + // Planar float format + float coeff_r = av_q2d(s->coeffs->cr); + float coeff_g = av_q2d(s->coeffs->cg); + float coeff_b = av_q2d(s->coeffs->cb); + void *args[] = { + &in->data[0], &in->data[1], &in->data[2], + &out->data[0], &out->data[1], &out->data[2], + &out->width, &out->height, &in->linesize[0], &out->linesize[0], + &s->tonemap, &s->param, &s->desat, &s->peak, + &coeff_r, &coeff_g, &coeff_b + }; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func, + DIV_UP(out->width, BLOCKX), + DIV_UP(out->height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, + s->cu_stream, args, NULL)); + } else + if ((s->in_fmt == 
AV_PIX_FMT_P010LE + || s->in_fmt == AV_PIX_FMT_P016LE) + && s->out_fmt == AV_PIX_FMT_NV12) { + // P016 to NV12 conversion + CUDA_TEXTURE_DESC tex_desc_y = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_TEXTURE_DESC tex_desc_uv = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc_y = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = 1, + .res.pitch2D.width = in->width, + .res.pitch2D.height = in->height, + .res.pitch2D.pitchInBytes = in->linesize[0], + .res.pitch2D.devPtr = (CUdeviceptr) in->data[0], + }; + + CUDA_RESOURCE_DESC res_desc_uv = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = 2, + .res.pitch2D.width = in->width / 2, + .res.pitch2D.height = in->height / 2, + .res.pitch2D.pitchInBytes = in->linesize[1], + .res.pitch2D.devPtr = (CUdeviceptr) in->data[1], + }; + + CUtexObject tex_y = 0, tex_uv = 0; + ret = + CHECK_CU(cu-> + cuTexObjectCreate(&tex_y, &res_desc_y, &tex_desc_y, + NULL)); + if (ret < 0) + goto exit; + + ret = + CHECK_CU(cu-> + cuTexObjectCreate(&tex_uv, &res_desc_uv, &tex_desc_uv, + NULL)); + if (ret < 0) + goto exit; + + float coeff_r = av_q2d(s->coeffs->cr); + float coeff_g = av_q2d(s->coeffs->cg); + float coeff_b = av_q2d(s->coeffs->cb); + + void *args[] = { + &tex_y, &tex_uv, + &out->data[0], &out->data[1], + &out->width, &out->height, &out->linesize[0], + &out->linesize[1], + &s->tonemap, &s->param, &s->desat, &s->peak, + &coeff_r, &coeff_g, &coeff_b + }; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func, + DIV_UP(out->width, BLOCKX), + DIV_UP(out->height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, + s->cu_stream, args, NULL)); + + if (tex_y) + CHECK_CU(cu->cuTexObjectDestroy(tex_y)); + if (tex_uv) + CHECK_CU(cu->cuTexObjectDestroy(tex_uv)); + } else if (s->in_fmt == AV_PIX_FMT_YUV420P10LE + 
|| s->in_fmt == AV_PIX_FMT_YUV420P16LE) { + // YUV format handling + CUDA_TEXTURE_DESC tex_desc_y = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_TEXTURE_DESC tex_desc_uv = { + .filterMode = CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc_y = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = 1, + .res.pitch2D.width = in->width, + .res.pitch2D.height = in->height, + .res.pitch2D.pitchInBytes = in->linesize[0], + .res.pitch2D.devPtr = (CUdeviceptr) in->data[0], + }; + + CUDA_RESOURCE_DESC res_desc_uv = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = 2, + .res.pitch2D.width = in->width / 2, + .res.pitch2D.height = in->height / 2, + .res.pitch2D.pitchInBytes = in->linesize[1], + .res.pitch2D.devPtr = (CUdeviceptr) in->data[1], + }; + + CUtexObject tex_y = 0, tex_uv = 0; + ret = + CHECK_CU(cu-> + cuTexObjectCreate(&tex_y, &res_desc_y, &tex_desc_y, + NULL)); + if (ret < 0) + goto exit; + + ret = + CHECK_CU(cu-> + cuTexObjectCreate(&tex_uv, &res_desc_uv, &tex_desc_uv, + NULL)); + if (ret < 0) + goto exit; + + float coeff_r = av_q2d(s->coeffs->cr); + float coeff_g = av_q2d(s->coeffs->cg); + float coeff_b = av_q2d(s->coeffs->cb); + + void *args[] = { + &tex_y, &tex_uv, + &out->data[0], &out->data[1], &out->data[2], + &out->width, &out->height, &out->linesize[0], + &out->linesize[1], + &s->tonemap, &s->param, &s->desat, &s->peak, + &coeff_r, &coeff_g, &coeff_b + }; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func, + DIV_UP(out->width, BLOCKX), + DIV_UP(out->height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, + s->cu_stream, args, NULL)); + + if (tex_y) + CHECK_CU(cu->cuTexObjectDestroy(tex_y)); + if (tex_uv) + CHECK_CU(cu->cuTexObjectDestroy(tex_uv)); + } else { + // 16-bit formats with texture + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = 
CU_TR_FILTER_MODE_POINT, + .flags = CU_TRSF_READ_AS_INTEGER, + }; + + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = 4, + .res.pitch2D.width = in->width, + .res.pitch2D.height = in->height, + .res.pitch2D.pitchInBytes = in->linesize[0], + .res.pitch2D.devPtr = (CUdeviceptr) in->data[0], + }; + + ret = + CHECK_CU(cu-> + cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + + // every format reaching this branch is 16 bits per component + // (the texture is created as CU_AD_FORMAT_UNSIGNED_INT16 above); + // the original ternary had two identical arms and was dead code + int bit_depth = 16; + float coeff_r = av_q2d(s->coeffs->cr); + float coeff_g = av_q2d(s->coeffs->cg); + float coeff_b = av_q2d(s->coeffs->cb); + + void *args[] = { + &tex, + &out->data[0], + &out->width, &out->height, &out->linesize[0], + &s->tonemap, &s->param, &s->desat, &s->peak, + &coeff_r, &coeff_g, &coeff_b, + &bit_depth + }; + + ret = CHECK_CU(cu->cuLaunchKernel(s->cu_func, + DIV_UP(out->width, BLOCKX), + DIV_UP(out->height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, + s->cu_stream, args, NULL)); + } + + exit: + if (tex) + CHECK_CU(cu->cuTexObjectDestroy(tex)); + + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + + return ret; +} + + +static int tonemap_cuda_filter_frame(AVFilterLink * link, AVFrame * in); + +static int process_frame_async(void *filter_ctx, AVFrame * out, + AVFrame * in, CUstream stream) +{ + AVFilterContext *ctx = (AVFilterContext *) filter_ctx; + TonemapCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUstream old_stream = s->cu_stream; + CUcontext dummy; + int ret; + + // Allocate output frame buffer (context will be managed by call_tonemap_kernel) + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) + return ret; + + ret = av_hwframe_get_buffer(s->frames_ctx, out, 0); + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + + if (ret < 0) + return ret; + + // Ensure output frame has correct dimensions (not aligned dimensions) + out->width = in->width; + out->height =
in->height; + + // Set the stream and call the kernel (kernel manages its own context) + s->cu_stream = stream; + ret = call_tonemap_kernel(ctx, out, in); + s->cu_stream = old_stream; + + if (ret >= 0) { + ret = av_frame_copy_props(out, in); + } + + return ret; +} + +static int tonemap_cuda_activate(AVFilterContext * ctx) +{ + AVFilterLink *inlink = ctx->inputs[0]; + AVFilterLink *outlink = ctx->outputs[0]; + TonemapCudaContext *s = ctx->priv; + AVFrame *in = NULL; + int ret, status; + int64_t pts; + + FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink); + + if (s->async_depth > 1 || s->async_streams > 1) { + // Try to receive completed frames first to avoid memory buildup + AVFrame *out = NULL; + ret = ff_cuda_async_queue_receive(&s->async_queue, &out); + if (ret == 0 && out) { + return ff_filter_frame(outlink, out); + } else if (ret < 0 && ret != AVERROR(EAGAIN)) { + return ret; + } + + // Only submit new frames if there's space and no completed frames ready + if (!ff_cuda_async_queue_is_full(&s->async_queue)) { + ret = ff_inlink_consume_frame(inlink, &in); + if (ret < 0) { + if (ret == AVERROR_EOF || ret == AVERROR(EAGAIN)) { + // No input available, but we might have frames in queue + return AVERROR(EAGAIN); + } + return ret; + } + + if (in) { + ret = ff_cuda_async_queue_submit(&s->async_queue, in); + av_frame_free(&in); + if (ret < 0) { + return ret; + } + av_log(ctx, AV_LOG_DEBUG, + "Submitted frame to async queue\n"); + } + } + } else { + // Synchronous processing + ret = ff_inlink_consume_frame(inlink, &in); + if (ret < 0) + return ret; + + if (in) { + ret = tonemap_cuda_filter_frame(inlink, in); + if (ret < 0) + return ret; + } + } + + if (ff_inlink_acknowledge_status(inlink, &status, &pts)) { + if (s->async_depth > 1 || s->async_streams > 1) { + // Flush all remaining async frames + while (!ff_cuda_async_queue_is_empty(&s->async_queue)) { + AVFrame *out = NULL; + ret = ff_cuda_async_queue_receive(&s->async_queue, &out); + if (ret == 0 && out) { + ret = 
ff_filter_frame(outlink, out); + if (ret < 0) + return ret; + } else if (ret < 0 && ret != AVERROR(EAGAIN)) { + av_log(ctx, AV_LOG_WARNING, + "Error receiving async frame during flush: %d\n", + ret); + break; + } + } + } + ff_outlink_set_status(outlink, status, pts); + return 0; + } + + FF_FILTER_FORWARD_WANTED(outlink, inlink); + + return FFERROR_NOT_READY; +} + +static int tonemap_cuda_filter_frame(AVFilterLink * link, AVFrame * in) +{ + AVFilterContext *ctx = link->dst; + TonemapCudaContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + + AVFrame *out = NULL; + CUcontext dummy; + int ret = 0; + + if (s->passthrough) + return ff_filter_frame(outlink, in); + + /* read peak from side data if not passed in */ + if (!s->peak) { + s->peak = ff_determine_signal_peak(in); + av_log(s, AV_LOG_DEBUG, "Computed signal peak: %f\n", s->peak); + } + + if (s->async_depth > 1 || s->async_streams > 1) { + ret = ff_cuda_async_queue_submit(&s->async_queue, in); + if (ret == AVERROR(EAGAIN)) { + AVFrame *completed_frame = NULL; + ret = + ff_cuda_async_queue_receive(&s->async_queue, + &completed_frame); + if (ret < 0 && ret != AVERROR(EAGAIN)) { + av_frame_free(&in); + return ret; + } + if (completed_frame) { + ret = ff_filter_frame(outlink, completed_frame); + if (ret < 0) { + av_frame_free(&in); + return ret; + } + } + ret = ff_cuda_async_queue_submit(&s->async_queue, in); + } + if (ret < 0) { + av_frame_free(&in); + return ret; + } + + while (ff_inlink_queued_frames(link) == 0) { + AVFrame *completed_frame = NULL; + ret = + ff_cuda_async_queue_receive(&s->async_queue, + &completed_frame); + if (ret == AVERROR(EAGAIN)) { + break; + } else if (ret < 0) { + return ret; + } + if (completed_frame) { + ret = ff_filter_frame(outlink, completed_frame); + if (ret < 0) + return ret; + } + } + + return 0; + } + + out = av_frame_alloc(); + if (!out) { + ret = AVERROR(ENOMEM); + goto fail; + } + + ret = 
CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) + goto fail; + + ret = av_hwframe_get_buffer(s->frames_ctx, out, 0); + if (ret < 0) + goto fail; + + // Ensure output frame has correct dimensions (not aligned dimensions) + out->width = in->width; + out->height = in->height; + + ret = call_tonemap_kernel(ctx, out, in); + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); + if (ret < 0) + goto fail; + + ret = av_frame_copy_props(out, in); + if (ret < 0) + goto fail; + + av_frame_free(&in); + return ff_filter_frame(outlink, out); + + fail: + av_frame_free(&in); + av_frame_free(&out); + return ret; +} + +#define OFFSET(x) offsetof(TonemapCudaContext, x) +#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM) + +static const AVOption tonemap_cuda_options[] = { + { "tonemap", "tonemap algorithm selection", OFFSET(tonemap), + AV_OPT_TYPE_INT, {.i64 = + TONEMAP_NONE}, 0, TONEMAP_MAX - 1, FLAGS,.unit = + "tonemap" }, + { "none", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_NONE}, 0, 0, + FLAGS,.unit = "tonemap" }, + { "linear", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_LINEAR}, 0, 0, + FLAGS,.unit = "tonemap" }, + { "gamma", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_GAMMA}, 0, 0, + FLAGS,.unit = "tonemap" }, + { "clip", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_CLIP}, 0, 0, + FLAGS,.unit = "tonemap" }, + { "reinhard", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_REINHARD}, 0, 0, + FLAGS,.unit = "tonemap" }, + { "hable", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_HABLE}, 0, 0, + FLAGS,.unit = "tonemap" }, + { "mobius", 0, 0, AV_OPT_TYPE_CONST, {.i64 = TONEMAP_MOBIUS}, 0, 0, + FLAGS,.unit = "tonemap" }, + { "param", "tonemap parameter", OFFSET(param), AV_OPT_TYPE_DOUBLE, + {.dbl = NAN}, DBL_MIN, DBL_MAX, FLAGS }, + { "desat", "desaturation strength", OFFSET(desat), AV_OPT_TYPE_DOUBLE, + {.dbl = 2}, 0, DBL_MAX, FLAGS }, + { "peak", "signal peak override", OFFSET(peak), AV_OPT_TYPE_DOUBLE, + {.dbl = 0}, 0, DBL_MAX, FLAGS }, + { "passthrough", "Do not process frames at all 
if parameters match", + OFFSET(passthrough), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS }, + { "async_depth", "Frame queue depth for async processing", + OFFSET(async_depth), AV_OPT_TYPE_INT, {.i64 = + 1}, 1, MAX_FRAME_QUEUE_SIZE, + FLAGS }, + { "async_streams", "Number of CUDA streams for async processing", + OFFSET(async_streams), AV_OPT_TYPE_INT, {.i64 = + 1}, 1, MAX_CUDA_STREAMS, + FLAGS }, + { NULL }, +}; + +static const AVClass tonemap_cuda_class = { + .class_name = "tonemap_cuda", + .item_name = av_default_item_name, + .option = tonemap_cuda_options, + .version = LIBAVUTIL_VERSION_INT, +}; + +static const AVFilterPad tonemap_cuda_inputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + }, +}; + +static const AVFilterPad tonemap_cuda_outputs[] = { + { + .name = "default", + .type = AVMEDIA_TYPE_VIDEO, + .config_props = tonemap_cuda_config_props, + }, +}; + +const AVFilter ff_vf_tonemap_cuda = { + .name = "tonemap_cuda", + .description = + NULL_IF_CONFIG_SMALL("GPU accelerated HDR to SDR tonemap filter"), + .priv_class = &tonemap_cuda_class, + + .init = tonemap_cuda_init, + .uninit = tonemap_cuda_uninit, + .activate = tonemap_cuda_activate, + + .priv_size = sizeof(TonemapCudaContext), + + FILTER_INPUTS(tonemap_cuda_inputs), + FILTER_OUTPUTS(tonemap_cuda_outputs), + + FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA), + + .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE, +}; diff --git a/libavfilter/vf_tonemap_cuda.cu b/libavfilter/vf_tonemap_cuda.cu new file mode 100644 index 00000000000..742f917db06 --- /dev/null +++ b/libavfilter/vf_tonemap_cuda.cu @@ -0,0 +1,330 @@ +/* + * This file is part of FFmpeg. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "cuda/vector_helpers.cuh" + +#define BLOCKX 32 +#define BLOCKY 16 + +extern "C" { + +enum TonemapAlgorithm { + TONEMAP_NONE, + TONEMAP_LINEAR, + TONEMAP_GAMMA, + TONEMAP_CLIP, + TONEMAP_REINHARD, + TONEMAP_HABLE, + TONEMAP_MOBIUS, + TONEMAP_MAX, +}; + +__device__ static inline float hable_cuda(float in) +{ + float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f; + return (in * (in * a + b * c) + d * e) / (in * (in * a + b) + d * f) - e / f; +} + +__device__ static inline float mobius_cuda(float in, float j, float peak) +{ + float a, b; + + if (in <= j) + return in; + + a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak); + b = (j * j - 2.0f * j * peak + peak) / fmaxf(peak - 1.0f, 1e-6f); + + return (b * b + 2.0f * b * j + j * j) / (b - a) * (in + a) / (in + b); +} + +__device__ static inline float apply_tonemap_cuda(int algorithm, float sig, float param, float peak) +{ + switch (algorithm) { + case TONEMAP_NONE: + return sig; + case TONEMAP_LINEAR: + return sig * param / peak; + case TONEMAP_GAMMA: + return sig > 0.05f ? 
powf(sig / peak, 1.0f / param) + : sig * powf(0.05f / peak, 1.0f / param) / 0.05f; + case TONEMAP_CLIP: + return fminf(fmaxf(sig * param, 0.0f), 1.0f); + case TONEMAP_HABLE: + return hable_cuda(sig) / hable_cuda(peak); + case TONEMAP_REINHARD: + return sig / (sig + param) * (peak + param) / peak; + case TONEMAP_MOBIUS: + return mobius_cuda(sig, param, peak); + default: + return sig; + } +} + +__global__ void tonemap_cuda_float( + cudaTextureObject_t src_tex, + float *dst_r, float *dst_g, float *dst_b, + int dst_width, int dst_height, int dst_pitch, + int algorithm, float param, float desat, float peak, + float coeff_r, float coeff_g, float coeff_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + float4 src = tex2D(src_tex, x, y); + float r = src.x; + float g = src.y; + float b = src.z; + + // Desaturate to prevent unnatural colors + if (desat > 0) { + float luma = coeff_r * r + coeff_g * g + coeff_b * b; + float overbright = fmaxf(luma - desat, 1e-6f) / fmaxf(luma, 1e-6f); + r = r * (1.0f - overbright) + luma * overbright; + g = g * (1.0f - overbright) + luma * overbright; + b = b * (1.0f - overbright) + luma * overbright; + } + + // Pick the brightest component + float sig = fmaxf(fmaxf(fmaxf(r, g), b), 1e-6f); + float sig_orig = sig; + + // Apply tonemap algorithm + sig = apply_tonemap_cuda(algorithm, sig, param, peak); + + // Apply the computed scale factor to the color + float scale = sig / sig_orig; + r *= scale; + g *= scale; + b *= scale; + + int idx = y * (dst_pitch / sizeof(float)) + x; + dst_r[idx] = r; + dst_g[idx] = g; + dst_b[idx] = b; +} + +__global__ void tonemap_cuda_planar_float( + float *src_r, float *src_g, float *src_b, + float *dst_r, float *dst_g, float *dst_b, + int width, int height, int src_pitch, int dst_pitch, + int algorithm, float param, float desat, float peak, + float coeff_r, float coeff_g, float coeff_b) +{ + int x = 
blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= width || y >= height) + return; + + int src_idx = y * (src_pitch / sizeof(float)) + x; + int dst_idx = y * (dst_pitch / sizeof(float)) + x; + + float r = src_r[src_idx]; + float g = src_g[src_idx]; + float b = src_b[src_idx]; + + // Desaturate to prevent unnatural colors + if (desat > 0) { + float luma = coeff_r * r + coeff_g * g + coeff_b * b; + float overbright = fmaxf(luma - desat, 1e-6f) / fmaxf(luma, 1e-6f); + r = r * (1.0f - overbright) + luma * overbright; + g = g * (1.0f - overbright) + luma * overbright; + b = b * (1.0f - overbright) + luma * overbright; + } + + // Pick the brightest component + float sig = fmaxf(fmaxf(fmaxf(r, g), b), 1e-6f); + float sig_orig = sig; + + // Apply tonemap algorithm + sig = apply_tonemap_cuda(algorithm, sig, param, peak); + + // Apply the computed scale factor to the color + float scale = sig / sig_orig; + r *= scale; + g *= scale; + b *= scale; + + dst_r[dst_idx] = r; + dst_g[dst_idx] = g; + dst_b[dst_idx] = b; +} + +__global__ void tonemap_cuda_16bit( + cudaTextureObject_t src_tex, + ushort4 *dst, int dst_width, int dst_height, int dst_pitch, + int algorithm, float param, float desat, float peak, + float coeff_r, float coeff_g, float coeff_b, int bit_depth) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + ushort4 src = tex2D(src_tex, x, y); + float max_val = (1 << bit_depth) - 1; + + float r = src.x / max_val; + float g = src.y / max_val; + float b = src.z / max_val; + + // Desaturate to prevent unnatural colors + if (desat > 0) { + float luma = coeff_r * r + coeff_g * g + coeff_b * b; + float overbright = fmaxf(luma - desat, 1e-6f) / fmaxf(luma, 1e-6f); + r = r * (1.0f - overbright) + luma * overbright; + g = g * (1.0f - overbright) + luma * overbright; + b = b * (1.0f - overbright) + luma * overbright; + } + + // 
Pick the brightest component + float sig = fmaxf(fmaxf(fmaxf(r, g), b), 1e-6f); + float sig_orig = sig; + + // Apply tonemap algorithm + sig = apply_tonemap_cuda(algorithm, sig, param, peak); + + // Apply the computed scale factor to the color + float scale = sig / sig_orig; + r *= scale; + g *= scale; + b *= scale; + + dst[y * (dst_pitch / sizeof(ushort4)) + x] = make_ushort4( + (unsigned short)(r * max_val + 0.5f), + (unsigned short)(g * max_val + 0.5f), + (unsigned short)(b * max_val + 0.5f), + src.w + ); +} + +__global__ void tonemap_cuda_p016_to_nv12( + cudaTextureObject_t src_y_tex, cudaTextureObject_t src_uv_tex, + unsigned char *dst_y, unsigned char *dst_uv, + int dst_width, int dst_height, int dst_pitch_y, int dst_pitch_uv, + int algorithm, float param, float desat, float peak, + float coeff_r, float coeff_g, float coeff_b) +{ + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst_width || y >= dst_height) + return; + + // Sample Y component (16-bit) + unsigned short y_val = tex2D(src_y_tex, x, y); + + // Sample UV components (subsampled, 16-bit) - ensure consistent sampling for 2x2 blocks + int uv_x = (x / 2); + int uv_y = (y / 2); + ushort2 uv_val = tex2D(src_uv_tex, uv_x, uv_y); + + // Convert YUV to RGB (BT.2020 for HDR, P016LE format: 10-bit left-justified in 16-bit) + // P016LE: 10-bit values are stored in upper 10 bits, so divide by 64 to get 10-bit range + // Add safety clamping to prevent extreme values + unsigned short y_10bit = y_val >> 6; + unsigned short u_10bit = uv_val.x >> 6; + unsigned short v_10bit = uv_val.y >> 6; + + // Clamp to valid 10-bit TV range before normalization + y_10bit = max(64, min(940, (int)y_10bit)); + u_10bit = max(64, min(960, (int)u_10bit)); + v_10bit = max(64, min(960, (int)v_10bit)); + + float Y = (y_10bit - 64.0f) / 876.0f; // 10-bit Y: range 64-940 -> 0.0-1.0 + float U = (u_10bit - 512.0f) / 448.0f; // 10-bit U: range 64-960, centered at 512, scale by 
half-range + float V = (v_10bit - 512.0f) / 448.0f; // 10-bit V: range 64-960, centered at 512, scale by half-range + + // BT.2020 YUV to RGB conversion (corrected coefficients) + float r = Y + 1.7166f * V; + float g = Y - 0.1873f * U - 0.6526f * V; + float b = Y + 2.0426f * U; + + r = fmaxf(r, 0.0f); + g = fmaxf(g, 0.0f); + b = fmaxf(b, 0.0f); + + // Desaturate to prevent unnatural colors + if (desat > 0) { + float luma = coeff_r * r + coeff_g * g + coeff_b * b; + float overbright = fmaxf(luma - desat, 1e-6f) / fmaxf(luma, 1e-6f); + r = r * (1.0f - overbright) + luma * overbright; + g = g * (1.0f - overbright) + luma * overbright; + b = b * (1.0f - overbright) + luma * overbright; + } + + // Pick the brightest component + float sig = fmaxf(fmaxf(fmaxf(r, g), b), 1e-6f); + float sig_orig = sig; + + // Apply tonemap algorithm + sig = apply_tonemap_cuda(algorithm, sig, param, peak); + + // Apply the computed scale factor to the color + float scale = sig / sig_orig; + + // Debug: clamp extreme scale values for hable to prevent artifacts + if (algorithm == TONEMAP_HABLE) { + scale = fmaxf(0.01f, fminf(10.0f, scale)); + } + + r *= scale; + g *= scale; + b *= scale; + + // Clamp RGB to valid range [0.0, 1.0] before YUV conversion + r = fmaxf(0.0f, fminf(1.0f, r)); + g = fmaxf(0.0f, fminf(1.0f, g)); + b = fmaxf(0.0f, fminf(1.0f, b)); + + // Convert back to YUV (BT.2020 to preserve input colorspace) + float out_Y = 0.2627f * r + 0.6780f * g + 0.0593f * b; + float out_U = -0.1396f * r - 0.3604f * g + 0.5000f * b; + float out_V = 0.5000f * r - 0.4598f * g - 0.0402f * b; + + // Scale to 8-bit TV range and clamp + int y_out = (int)(out_Y * 219.0f + 16.0f + 0.5f); + int u_out = (int)(out_U * 224.0f + 128.0f + 0.5f); + int v_out = (int)(out_V * 224.0f + 128.0f + 0.5f); + + // Clamp to valid TV range + y_out = max(16, min(235, y_out)); + u_out = max(16, min(240, u_out)); // Proper TV range for chroma + v_out = max(16, min(240, v_out)); + + // Write Y component + dst_y[y * 
dst_pitch_y + x] = y_out; + + // Write UV components in NV12 format (only for one thread per 2x2 block to avoid race conditions) + if (x % 2 == 0 && y % 2 == 0) { + int uv_idx = (y / 2) * dst_pitch_uv + (x / 2) * 2; // Correct NV12 UV indexing + dst_uv[uv_idx] = u_out; + dst_uv[uv_idx + 1] = v_out; + } +} + +} \ No newline at end of file diff --git a/libavfilter/vf_tonemap_cuda.h b/libavfilter/vf_tonemap_cuda.h new file mode 100644 index 00000000000..7ce8d2865dc --- /dev/null +++ b/libavfilter/vf_tonemap_cuda.h @@ -0,0 +1,28 @@ +/* + * This file is part of FFmpeg. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef AVFILTER_VF_TONEMAP_CUDA_H +#define AVFILTER_VF_TONEMAP_CUDA_H + +#include "avfilter.h" + +#endif /* AVFILTER_VF_TONEMAP_CUDA_H */ diff --git a/libavformat/mov.c b/libavformat/mov.c index ff32d4160e1..0c5eb810af7 100644 --- a/libavformat/mov.c +++ b/libavformat/mov.c @@ -85,6 +85,7 @@ typedef struct MOVParseTableEntry { static int mov_read_default(MOVContext *c, AVIOContext *pb, MOVAtom atom); static int mov_read_mfra(MOVContext *c, AVIOContext *f); static void mov_free_stream_context(AVFormatContext *s, AVStream *st); +static int mov_switch_root(AVFormatContext *s, int64_t target, int index); static int64_t add_ctts_entry(MOVCtts** ctts_data, unsigned int* ctts_count, unsigned int* allocated_size, int count, int duration); @@ -1653,6 +1654,31 @@ static int64_t get_frag_time(AVFormatContext *s, AVStream *dst_st, return frag_stream_info->sidx_pts; } + + // Check if the requested stream is present in the fragment + int exists = 0; + for (i = 0; i < frag_index->item[index].nb_stream_info; i++) { + if (dst_st->id != frag_index->item[index].stream_info[i].id) { + continue; + } + + if ( get_stream_info_time(&frag_index->item[index].stream_info[i]) != AV_NOPTS_VALUE) { + exists = 1; + break; + } + if ( mov_switch_root(s,-1,index) < 0) + return AV_NOPTS_VALUE; + + if ( get_stream_info_time(&frag_index->item[index].stream_info[i]) != AV_NOPTS_VALUE) { + exists = 1; + break; + } + } + + if (!exists) + return AV_NOPTS_VALUE; + + for (i = 0; i < frag_index->item[index].nb_stream_info; i++) { AVStream *frag_stream = NULL; frag_stream_info = &frag_index->item[index].stream_info[i];