diff --git a/.gitignore b/.gitignore index 80ad06ff..c19231fd 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ va-api-nvidia.files va-api-nvidia.includes meson.build.user .idea +pr_summary.md diff --git a/cross-i386.txt b/cross-i386.txt new file mode 100644 index 00000000..c7c4f2bd --- /dev/null +++ b/cross-i386.txt @@ -0,0 +1,22 @@ +[binaries] +c = 'gcc' +cpp = 'g++' +ar = 'ar' +strip = 'strip' +pkg-config = 'pkg-config' + +[built-in options] +c_args = ['-m32'] +c_link_args = ['-m32'] +cpp_args = ['-m32'] +cpp_link_args = ['-m32'] + +[properties] +pkg_config_libdir = ['/usr/lib/i386-linux-gnu/pkgconfig', '/usr/share/pkgconfig', '/usr/lib/pkgconfig'] +sys_root = '/' + +[host_machine] +system = 'linux' +cpu_family = 'x86' +cpu = 'i686' +endian = 'little' diff --git a/docs/install-fedora.md b/docs/install-fedora.md new file mode 100644 index 00000000..9089f86d --- /dev/null +++ b/docs/install-fedora.md @@ -0,0 +1,141 @@ +# Installation on Fedora + +Tested on Fedora 43 with NVIDIA driver 580.126.18 (RPM Fusion). + +## Prerequisites + +NVIDIA proprietary driver installed via RPM Fusion (`akmod-nvidia`). + +Verify: +```bash +nvidia-smi --query-gpu=driver_version --format=csv,noheader +``` + +## Step 1 — Install build dependencies (64-bit) + +```bash +sudo dnf install -y \ + meson ninja-build gcc pkg-config \ + libva-devel libdrm-devel mesa-libEGL-devel nv-codec-headers \ + libva-utils +``` + +## Step 2 — Install build dependencies (32-bit, for Steam) + +```bash +sudo dnf install -y \ + glibc-devel.i686 \ + libva-devel.i686 libdrm-devel.i686 mesa-libEGL-devel.i686 +``` + +## Step 3 — Remove stock libva-nvidia-driver + +If you have the Fedora-packaged version (v0.0.16, decode-only), remove it first: + +```bash +sudo dnf remove -y libva-nvidia-driver +``` + +## Step 4 — Build 64-bit + +```bash +meson setup build64 . 
--wipe --prefix=/usr +meson compile -C build64 +``` + +## Step 5 — Build 32-bit (cross-compile) + +Fedora uses `/usr/lib/pkgconfig` for 32-bit `.pc` files (not `/usr/lib/i386-linux-gnu/`). +Create a cross-file: + +```bash +cat > cross-i386-fedora.txt << 'EOF' +[binaries] +c = 'gcc' +cpp = 'g++' +ar = 'ar' +strip = 'strip' +pkg-config = 'pkg-config' + +[built-in options] +c_args = ['-m32'] +c_link_args = ['-m32'] +cpp_args = ['-m32'] +cpp_link_args = ['-m32'] + +[properties] +pkg_config_libdir = ['/usr/lib/pkgconfig', '/usr/share/pkgconfig'] +sys_root = '/' + +[host_machine] +system = 'linux' +cpu_family = 'x86' +cpu = 'i686' +endian = 'little' +EOF +``` + +Then build: + +```bash +meson setup build32 . --wipe --cross-file cross-i386-fedora.txt +meson compile -C build32 +``` + +## Step 6 — Install + +```bash +sudo meson install -C build64 +sudo mkdir -p /usr/lib/dri +sudo cp build32/nvidia_drv_video.so /usr/lib/dri/nvidia_drv_video.so +``` + +This installs: +- 64-bit driver → `/usr/lib64/dri/nvidia_drv_video.so` +- 32-bit driver → `/usr/lib/dri/nvidia_drv_video.so` +- nvenc-helper → `/usr/libexec/nvenc-helper` + +## Step 7 — Systemd user service + +```bash +mkdir -p ~/.config/systemd/user +cat > ~/.config/systemd/user/nvenc-helper.service << 'EOF' +[Unit] +Description=NVENC encode helper for nvidia-vaapi-driver +Documentation=https://github.com/efortin/nvidia-vaapi-driver +After=graphical-session.target + +[Service] +Type=simple +ExecStart=/usr/libexec/nvenc-helper +Restart=on-failure +RestartSec=2 + +[Install] +WantedBy=graphical-session.target +EOF + +systemctl --user daemon-reload +systemctl --user enable nvenc-helper.service +systemctl --user restart nvenc-helper.service +``` + +## Step 8 — Verify + +```bash +# Check helper is running +systemctl --user is-active nvenc-helper.service + +# Check VA-API profiles (should show VAEntrypointEncSlice for encode) +vainfo --display drm --device /dev/dri/renderD128 +``` + +Expected output includes both decode (VLD) and 
encode (EncSlice) entrypoints: +``` +VAProfileH264Main : VAEntrypointVLD +VAProfileH264Main : VAEntrypointEncSlice +VAProfileHEVCMain : VAEntrypointVLD +VAProfileHEVCMain : VAEntrypointEncSlice +``` + +No environment variables needed. Just launch Steam. diff --git a/docs/install-ubuntu.md b/docs/install-ubuntu.md new file mode 100644 index 00000000..42134f14 --- /dev/null +++ b/docs/install-ubuntu.md @@ -0,0 +1,114 @@ +# Installation on Ubuntu + +Tested on Ubuntu 22.04+ with NVIDIA proprietary driver. + +## Prerequisites + +NVIDIA proprietary driver installed. + +Verify: +```bash +nvidia-smi --query-gpu=driver_version --format=csv,noheader +``` + +Detect the driver version (used for 32-bit packages): +```bash +NV_VER=$(dpkg -l | grep 'libnvidia-compute-.*amd64' | awk '{print $2}' | sed 's/libnvidia-compute-//' | sed 's/:amd64//' | head -1) +echo "NVIDIA driver: $NV_VER" +``` + +## Step 1 — Install build dependencies (64-bit) + +```bash +sudo apt-get install -y --no-install-recommends \ + meson ninja-build gcc pkg-config \ + libva-dev libdrm-dev libegl-dev libffmpeg-nvenc-dev \ + vainfo +``` + +## Step 2 — Install build dependencies (32-bit, for Steam) + +```bash +sudo dpkg --add-architecture i386 +sudo apt-get update + +sudo apt-get install -y --no-install-recommends \ + gcc-multilib \ + libva-dev:i386 libdrm-dev:i386 libegl-dev:i386 \ + libnvidia-compute-${NV_VER}:i386 \ + libnvidia-encode-${NV_VER}:i386 +``` + +## Step 3 — Build 64-bit + +```bash +meson setup build64 . --wipe --prefix=/usr +meson compile -C build64 +``` + +## Step 4 — Build 32-bit (cross-compile) + +The repo includes `cross-i386.txt` configured for Ubuntu paths (`/usr/lib/i386-linux-gnu/`). + +```bash +meson setup build32 . 
--wipe --cross-file cross-i386.txt +meson compile -C build32 +``` + +## Step 5 — Install + +```bash +sudo meson install -C build64 +sudo mkdir -p /usr/lib/i386-linux-gnu/dri +sudo cp build32/nvidia_drv_video.so /usr/lib/i386-linux-gnu/dri/nvidia_drv_video.so +``` + +This installs: +- 64-bit driver → `/usr/lib/x86_64-linux-gnu/dri/nvidia_drv_video.so` +- 32-bit driver → `/usr/lib/i386-linux-gnu/dri/nvidia_drv_video.so` +- nvenc-helper → `/usr/libexec/nvenc-helper` + +## Step 6 — Systemd user service + +```bash +mkdir -p ~/.config/systemd/user +cat > ~/.config/systemd/user/nvenc-helper.service << 'EOF' +[Unit] +Description=NVENC encode helper for nvidia-vaapi-driver +Documentation=https://github.com/efortin/nvidia-vaapi-driver +After=graphical-session.target + +[Service] +Type=simple +ExecStart=/usr/libexec/nvenc-helper +Restart=on-failure +RestartSec=2 + +[Install] +WantedBy=graphical-session.target +EOF + +systemctl --user daemon-reload +systemctl --user enable nvenc-helper.service +systemctl --user restart nvenc-helper.service +``` + +## Step 7 — Verify + +```bash +# Check helper is running +systemctl --user is-active nvenc-helper.service + +# Check VA-API profiles (should show VAEntrypointEncSlice for encode) +vainfo --display drm --device /dev/dri/renderD128 +``` + +Expected output includes both decode (VLD) and encode (EncSlice) entrypoints: +``` +VAProfileH264Main : VAEntrypointVLD +VAProfileH264Main : VAEntrypointEncSlice +VAProfileHEVCMain : VAEntrypointVLD +VAProfileHEVCMain : VAEntrypointEncSlice +``` + +No environment variables needed. Just launch Steam. 
diff --git a/meson.build b/meson.build index 990c2b21..6a9e0447 100644 --- a/meson.build +++ b/meson.build @@ -55,10 +55,14 @@ sources = [ 'src/direct/direct-export-buf.c', 'src/direct/nv-driver.c', 'src/h264.c', + 'src/h264_encode.c', 'src/hevc.c', + 'src/hevc_encode.c', 'src/jpeg.c', 'src/mpeg2.c', 'src/mpeg4.c', + 'src/nvenc.c', + 'src/nvenc-ipc-client.c', 'src/vabackend.c', 'src/vc1.c', 'src/vp8.c', @@ -84,6 +88,47 @@ shared_library( gnu_symbol_visibility: 'hidden', ) +# Build the 64-bit NVENC helper daemon (only for native builds, not cross-compiled i386) +if host_machine.cpu_family() == 'x86_64' or host_machine.cpu_family() == 'aarch64' + helper_deps = [ + cc.find_library('dl', required : false), + dependency('ffnvcodec', version: '>= 11.1.5.1'), + dependency('threads'), + ] + executable( + 'nvenc-helper', + 'src/nvenc-helper.c', + 'src/nvenc-ipc-client.c', # for nvenc_ipc_get_socket_path + dependencies: helper_deps, + install: true, + install_dir: get_option('libexecdir'), + ) +endif + +# Tests (native builds only, not cross-compiled) +if not meson.is_cross_build() + libva_test_deps = [ + dependency('libva'), + dependency('libva-drm'), + cc.find_library('m', required : false), + ] + + test_encode = executable('test_encode', 'tests/test_encode.c', + dependencies : libva_test_deps, install : false) + test('encode', test_encode, timeout : 60) + + test_encode_config = executable('test_encode_config', 'tests/test_encode_config.c', + dependencies : libva_test_deps, install : false) + test('encode_config', test_encode_config, timeout : 60) + + gst_launch = find_program('gst-launch-1.0', required : false) + if gst_launch.found() + test('gstreamer', find_program('tests/test_gstreamer.sh'), + timeout : 120, + env : ['GST_VAAPI_ALL_DRIVERS=1', 'LIBVA_DRIVER_NAME=nvidia']) + endif +endif + meson.add_devenv(environment({ 'NVD_LOG': '1', 'LIBVA_DRIVER_NAME': 'nvidia', diff --git a/nvenc-helper.service b/nvenc-helper.service new file mode 100644 index 00000000..30317f6c 
--- /dev/null +++ b/nvenc-helper.service @@ -0,0 +1,13 @@ +[Unit] +Description=NVENC encode helper for nvidia-vaapi-driver +Documentation=https://github.com/efortin/nvidia-vaapi-driver +After=graphical-session.target + +[Service] +Type=simple +ExecStart=/usr/libexec/nvenc-helper +Restart=on-failure +RestartSec=2 + +[Install] +WantedBy=graphical-session.target diff --git a/src/direct/direct-export-buf.c b/src/direct/direct-export-buf.c index 47843e92..5a53108e 100644 --- a/src/direct/direct-export-buf.c +++ b/src/direct/direct-export-buf.c @@ -23,6 +23,12 @@ static void findGPUIndexFromFd(NVDriver *drv) { uint8_t drmUuid[16]; get_device_uuid(&drv->driverContext, drmUuid); + /* If CUDA is not available (32-bit encode-only mode), default to GPU 0 */ + if (!drv->cudaAvailable) { + drv->cudaGpuId = 0; + return; + } + int gpuCount = 0; if (CHECK_CUDA_RESULT(drv->cu->cuDeviceGetCount(&gpuCount))) { return; @@ -193,9 +199,26 @@ static BackingImage *direct_allocateBackingImage(NVDriver *drv, NVSurface *surfa p[i].channelCount, 8 * fmtInfo->bppc, p[i].fourcc, &driverImages[i]); } - for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) { - if (!import_to_cuda(drv, &driverImages[i], 8 * fmtInfo->bppc, p[i].channelCount, &backingImage->cudaImages[i], &backingImage->arrays[i])) - goto bail; + /* Import into CUDA only when CUDA is available. + * In IPC encode-only mode, surfaces are allocated via DRM but not imported + * into CUDA — the 64-bit helper handles CUDA import from the DMA-BUF fd. */ + if (drv->cudaAvailable) { + for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) { + if (!import_to_cuda(drv, &driverImages[i], 8 * fmtInfo->bppc, p[i].channelCount, &backingImage->cudaImages[i], &backingImage->arrays[i])) + goto bail; + } + } else { + /* Without CUDA, keep the nvFd handles for the IPC helper to import. + * Close nvFd2 which import_to_cuda would normally close. 
*/ + for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) { + backingImage->nvFds[i] = driverImages[i].nvFd; + backingImage->memorySizes[i] = driverImages[i].memorySize; + driverImages[i].nvFd = 0; /* Ownership transferred to backingImage */ + if (driverImages[i].nvFd2 != 0) { + close(driverImages[i].nvFd2); + driverImages[i].nvFd2 = 0; + } + } } backingImage->width = surface->width; @@ -241,6 +264,10 @@ static void destroyBackingImage(NVDriver *drv, BackingImage *img) { if (img->fds[i] > 0) { close(img->fds[i]); } + /* Close NVIDIA opaque fds kept for IPC encode mode */ + if (img->nvFds[i] > 0) { + close(img->nvFds[i]); + } } for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) { diff --git a/src/h264_encode.c b/src/h264_encode.c new file mode 100644 index 00000000..c3e0bc37 --- /dev/null +++ b/src/h264_encode.c @@ -0,0 +1,132 @@ +#include "vabackend.h" +#include "nvenc.h" +#include +#include + +void h264enc_handle_sequence_params(NVENCContext *nvencCtx, NVBuffer *buffer) +{ + VAEncSequenceParameterBufferH264 *seq = + (VAEncSequenceParameterBufferH264*) buffer->ptr; + + LOG("H264 encode: seq params %ux%u, intra_period=%u, ip_period=%u", + seq->picture_width_in_mbs * 16, seq->picture_height_in_mbs * 16, + seq->intra_period, seq->ip_period); + + /* Store basic sequence-level encode parameters */ + nvencCtx->width = seq->picture_width_in_mbs * 16; + nvencCtx->height = seq->picture_height_in_mbs * 16; + + if (seq->intra_period > 0) { + nvencCtx->intraPeriod = seq->intra_period; + } + if (seq->ip_period > 0) { + nvencCtx->ipPeriod = seq->ip_period; + } + + /* Frame rate from time_scale / num_units_in_tick / 2 if provided */ + if (seq->num_units_in_tick > 0 && seq->time_scale > 0) { + nvencCtx->frameRateNum = seq->time_scale; + nvencCtx->frameRateDen = seq->num_units_in_tick * 2; + } + + /* Bitrate (VA-API provides in bits/sec) */ + if (seq->bits_per_second > 0) { + nvencCtx->bitrate = seq->bits_per_second; + if (nvencCtx->maxBitrate == 0) { + nvencCtx->maxBitrate = 
seq->bits_per_second; + } + } + + nvencCtx->seqParamSet = true; +} + +void h264enc_handle_picture_params(NVENCContext *nvencCtx, NVBuffer *buffer) +{ + VAEncPictureParameterBufferH264 *pic = + (VAEncPictureParameterBufferH264*) buffer->ptr; + + /* Only log first few frames to avoid flooding at 60fps */ + if (nvencCtx->frameCount < 3) { + LOG("H264 encode: picture params, coded_buf=%d, pic_fields=0x%x", + pic->coded_buf, pic->pic_fields.value); + } + + nvencCtx->currentCodedBufId = pic->coded_buf; + nvencCtx->forceIDR = (pic->pic_fields.bits.idr_pic_flag != 0); + if (nvencCtx->forceIDR) { + LOG("H264 encode: IDR requested, coded_buf=%d", pic->coded_buf); + } +} + +void h264enc_handle_slice_params(NVENCContext *nvencCtx, NVBuffer *buffer) +{ + const VAEncSliceParameterBufferH264 *slice = + (VAEncSliceParameterBufferH264*) buffer->ptr; + + /* Map VA-API H.264 slice_type to NVENC picture type. + * Currently unused (enablePTD=1), but kept for future B-frame support. */ + switch (slice->slice_type) { + case 2: case 7: /* I / SI */ + nvencCtx->picType = nvencCtx->forceIDR + ? 
NV_ENC_PIC_TYPE_IDR : NV_ENC_PIC_TYPE_I; + break; + case 0: case 5: /* P / SP */ + nvencCtx->picType = NV_ENC_PIC_TYPE_P; + break; + case 1: case 6: /* B */ + nvencCtx->picType = NV_ENC_PIC_TYPE_B; + break; + default: + nvencCtx->picType = NV_ENC_PIC_TYPE_UNKNOWN; + break; + } +} + +void h264enc_handle_misc_params(NVENCContext *nvencCtx, NVBuffer *buffer) +{ + VAEncMiscParameterBuffer *misc = (VAEncMiscParameterBuffer*) buffer->ptr; + + switch (misc->type) { + case VAEncMiscParameterTypeRateControl: { + VAEncMiscParameterRateControl *rc = + (VAEncMiscParameterRateControl*) misc->data; + LOG("H264 encode: rate control bits_per_second=%u, target_percentage=%u", + rc->bits_per_second, rc->target_percentage); + if (rc->bits_per_second > 0) { + nvencCtx->maxBitrate = rc->bits_per_second; + if (rc->target_percentage > 0) { + nvencCtx->bitrate = (uint32_t)((uint64_t)rc->bits_per_second * rc->target_percentage / 100); + } else { + nvencCtx->bitrate = rc->bits_per_second; + } + } + break; + } + case VAEncMiscParameterTypeFrameRate: { + const VAEncMiscParameterFrameRate *fr = + (VAEncMiscParameterFrameRate*) misc->data; + if (fr->framerate > 0) { + /* framerate can be packed as (num | (den << 16)) or just num */ + uint32_t num = fr->framerate & 0xffff; + uint32_t den = (fr->framerate >> 16) & 0xffff; + if (den == 0) den = 1; + nvencCtx->frameRateNum = num; + nvencCtx->frameRateDen = den; + LOG("H264 encode: framerate %u/%u", num, den); + } + break; + } + case VAEncMiscParameterTypeHRD: { + VAEncMiscParameterHRD *hrd = + (VAEncMiscParameterHRD*) misc->data; + if (hrd->buffer_size > 0) + nvencCtx->vbvBufferSize = hrd->buffer_size; + if (hrd->initial_buffer_fullness > 0) + nvencCtx->vbvInitialDelay = hrd->initial_buffer_fullness; + break; + } + default: + LOG("H264 encode: unhandled misc param type %d", misc->type); + break; + } +} diff --git a/src/hevc_encode.c b/src/hevc_encode.c new file mode 100644 index 00000000..14a9df2d --- /dev/null +++ b/src/hevc_encode.c @@ -0,0 
+1,121 @@ +#include "vabackend.h" +#include "nvenc.h" +#include +#include + +void hevcenc_handle_sequence_params(NVENCContext *nvencCtx, NVBuffer *buffer) +{ + VAEncSequenceParameterBufferHEVC *seq = + (VAEncSequenceParameterBufferHEVC*) buffer->ptr; + + LOG("HEVC encode: seq params %ux%u, intra_period=%u, ip_period=%u", + seq->pic_width_in_luma_samples, seq->pic_height_in_luma_samples, + seq->intra_period, seq->ip_period); + + nvencCtx->width = seq->pic_width_in_luma_samples; + nvencCtx->height = seq->pic_height_in_luma_samples; + + if (seq->intra_period > 0) { + nvencCtx->intraPeriod = seq->intra_period; + } + if (seq->ip_period > 0) { + nvencCtx->ipPeriod = seq->ip_period; + } + + /* VUI timing info */ + if (seq->vui_num_units_in_tick > 0 && seq->vui_time_scale > 0) { + nvencCtx->frameRateNum = seq->vui_time_scale; + nvencCtx->frameRateDen = seq->vui_num_units_in_tick * 2; + } + + /* Bitrate (VA-API provides in bits/sec) */ + if (seq->bits_per_second > 0) { + nvencCtx->bitrate = seq->bits_per_second; + if (nvencCtx->maxBitrate == 0) { + nvencCtx->maxBitrate = seq->bits_per_second; + } + } + + nvencCtx->seqParamSet = true; +} + +void hevcenc_handle_picture_params(NVENCContext *nvencCtx, NVBuffer *buffer) +{ + VAEncPictureParameterBufferHEVC *pic = + (VAEncPictureParameterBufferHEVC*) buffer->ptr; + + nvencCtx->currentCodedBufId = pic->coded_buf; + nvencCtx->forceIDR = (pic->pic_fields.bits.idr_pic_flag != 0); + if (nvencCtx->forceIDR) { + LOG("HEVC encode: picture params, coded_buf=%d, IDR requested", pic->coded_buf); + } +} + +void hevcenc_handle_slice_params(NVENCContext *nvencCtx, NVBuffer *buffer) +{ + const VAEncSliceParameterBufferHEVC *slice = + (VAEncSliceParameterBufferHEVC*) buffer->ptr; + + /* Map VA-API HEVC slice_type to NVENC picture type. + * HEVC slice types: 0=B, 1=P, 2=I */ + switch (slice->slice_type) { + case 2: /* I */ + nvencCtx->picType = nvencCtx->forceIDR + ? 
NV_ENC_PIC_TYPE_IDR : NV_ENC_PIC_TYPE_I; + break; + case 1: /* P */ + nvencCtx->picType = NV_ENC_PIC_TYPE_P; + break; + case 0: /* B */ + nvencCtx->picType = NV_ENC_PIC_TYPE_B; + break; + default: + nvencCtx->picType = NV_ENC_PIC_TYPE_UNKNOWN; + break; + } +} + +void hevcenc_handle_misc_params(NVENCContext *nvencCtx, NVBuffer *buffer) +{ + VAEncMiscParameterBuffer *misc = (VAEncMiscParameterBuffer*) buffer->ptr; + + switch (misc->type) { + case VAEncMiscParameterTypeRateControl: { + VAEncMiscParameterRateControl *rc = + (VAEncMiscParameterRateControl*) misc->data; + LOG("HEVC encode: rate control bits_per_second=%u", rc->bits_per_second); + if (rc->bits_per_second > 0) { + nvencCtx->maxBitrate = rc->bits_per_second; + if (rc->target_percentage > 0) { + nvencCtx->bitrate = (uint32_t)((uint64_t)rc->bits_per_second * rc->target_percentage / 100); + } else { + nvencCtx->bitrate = rc->bits_per_second; + } + } + break; + } + case VAEncMiscParameterTypeFrameRate: { + const VAEncMiscParameterFrameRate *fr = + (VAEncMiscParameterFrameRate*) misc->data; + if (fr->framerate > 0) { + uint32_t num = fr->framerate & 0xffff; + uint32_t den = (fr->framerate >> 16) & 0xffff; + if (den == 0) den = 1; + nvencCtx->frameRateNum = num; + nvencCtx->frameRateDen = den; + } + break; + } + case VAEncMiscParameterTypeHRD: { + VAEncMiscParameterHRD *hrd = + (VAEncMiscParameterHRD*) misc->data; + if (hrd->buffer_size > 0) + nvencCtx->vbvBufferSize = hrd->buffer_size; + if (hrd->initial_buffer_fullness > 0) + nvencCtx->vbvInitialDelay = hrd->initial_buffer_fullness; + break; + } + default: + break; + } +} diff --git a/src/nvenc-helper.c b/src/nvenc-helper.c new file mode 100644 index 00000000..2c873f0c --- /dev/null +++ b/src/nvenc-helper.c @@ -0,0 +1,1179 @@ +/* + * nvenc-helper: 64-bit NVENC encode helper daemon. + * + * This standalone process runs as 64-bit, where CUDA works on all GPUs. 
+ * It receives raw NV12/P010 frames from the VA-API driver via + * a Unix domain socket, encodes them with NVENC, and returns the + * encoded bitstream. + * + * Usage: nvenc-helper [--foreground] + * The socket is created at $XDG_RUNTIME_DIR/nvenc-helper.sock + * + * The helper runs persistently until stopped via SIGTERM/SIGINT. + * It is managed by a systemd user service (nvenc-helper.service). + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "nvenc-ipc.h" + +static CudaFunctions *cu; +static NvencFunctions *nv_dl; +static volatile sig_atomic_t running = 1; +static int log_enabled = 0; + +/* Force an IDR keyframe every N frames for streaming error recovery. + * At 60fps this is ~1 second. At 30fps this is ~2 seconds. */ +#define NVENC_HELPER_IDR_INTERVAL 60 + +static inline bool check_cuda_helper(CUresult err, const char *func, int line) { + if (err != CUDA_SUCCESS) { + const char *s = NULL; + cu->cuGetErrorString(err, &s); + fprintf(stderr, "[nvenc-helper] CUDA error: %s (%d) at %s:%d\n", + s ? s : "?", err, func, line); + return true; + } + return false; +} +#define CHECK_CUDA_RESULT_HELPER(err) check_cuda_helper(err, __func__, __LINE__) + +static void helper_log(const char *fmt, ...) __attribute__((format(printf, 1, 2))); +static void helper_log(const char *fmt, ...) 
{ + if (!log_enabled) return; + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + fprintf(stderr, "[nvenc-helper %ld.%03ld] ", (long)ts.tv_sec, ts.tv_nsec / 1000000); + va_list args; + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); + fputc('\n', stderr); +} +#define HELPER_LOG helper_log + +/* Per-client encoder state */ +typedef struct { + CUcontext cudaCtx; + void *encoder; + NV_ENCODE_API_FUNCTION_LIST funcs; + bool initialized; + NV_ENC_INPUT_PTR inputBuffer; /* NVENC-managed (fallback) */ + NV_ENC_OUTPUT_PTR outputBuffer; + /* Persistent CUDA buffer for GPU-side encode (avoids nvEncLockInputBuffer) */ + CUdeviceptr gpuBuf; /* Linear CUDA VRAM buffer */ + uint32_t gpuBufPitch; /* Aligned pitch */ + uint32_t gpuBufSize; /* Total allocation size */ + NV_ENC_REGISTERED_PTR gpuBufReg; /* Persistent NVENC registration */ + bool gpuBufReady; /* true if GPU path available */ + uint32_t width; + uint32_t height; + uint32_t is10bit; + uint64_t frameCount; + uint8_t *bsBuf; /* pre-allocated bitstream output */ + uint32_t bsBufSize; +} HelperEncoder; + +/* Reliable I/O */ +static bool send_all(int fd, const void *buf, size_t len) +{ + const char *p = buf; + while (len > 0) { + ssize_t n = send(fd, p, len, MSG_NOSIGNAL); + if (n <= 0) { + if (n < 0 && errno == EINTR) continue; + return false; + } + p += n; + len -= (size_t)n; + } + return true; +} + +static bool recv_all(int fd, void *buf, size_t len) +{ + char *p = buf; + while (len > 0) { + ssize_t n = recv(fd, p, len, 0); + if (n <= 0) { + if (n < 0 && errno == EINTR) continue; + return false; + } + p += n; + len -= (size_t)n; + } + return true; +} + +static bool send_response(int fd, int32_t status, const void *data, uint32_t size) +{ + NVEncIPCRespHeader resp = { .status = status, .payload_size = size }; + if (!send_all(fd, &resp, sizeof(resp))) return false; + if (size > 0 && data != NULL) { + if (!send_all(fd, data, size)) return false; + } + return true; +} + +/* Send response header 
with an fd attached via SCM_RIGHTS */ +static bool send_response_with_fd(int sock, int32_t status, int send_fd, + const void *data, uint32_t size) +{ + NVEncIPCRespHeader resp = { .status = status, .payload_size = size }; + + struct iovec iov = { .iov_base = &resp, .iov_len = sizeof(resp) }; + union { + char buf[CMSG_SPACE(sizeof(int))]; + struct cmsghdr align; + } cmsg_buf; + memset(&cmsg_buf, 0, sizeof(cmsg_buf)); + + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = cmsg_buf.buf, + .msg_controllen = sizeof(cmsg_buf.buf), + }; + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &send_fd, sizeof(int)); + + ssize_t n = sendmsg(sock, &msg, MSG_NOSIGNAL); + if (n != sizeof(resp)) return false; + + if (size > 0 && data != NULL) { + if (!send_all(sock, data, size)) return false; + } + return true; +} + +/* Encoder lifecycle */ +static bool encoder_init(HelperEncoder *enc, const NVEncIPCInitParams *params) +{ + HELPER_LOG("Init: %ux%u codec=%u profile=%u bitrate=%u", + params->width, params->height, params->codec, params->profile, + params->bitrate); + + /* Create CUDA context */ + if (CHECK_CUDA_RESULT_HELPER(cu->cuCtxCreate(&enc->cudaCtx, 0, 0))) { + return false; + } + + /* Get NVENC function list */ + enc->funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER; + NVENCSTATUS st = nv_dl->NvEncodeAPICreateInstance(&enc->funcs); + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("NvEncodeAPICreateInstance failed: %d", st); + cu->cuCtxDestroy(enc->cudaCtx); + return false; + } + + /* Open NVENC session */ + NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS sessParams = {0}; + sessParams.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER; + sessParams.deviceType = NV_ENC_DEVICE_TYPE_CUDA; + sessParams.device = enc->cudaCtx; + sessParams.apiVersion = NVENCAPI_VERSION; + + st = enc->funcs.nvEncOpenEncodeSessionEx(&sessParams, &enc->encoder); + if 
(st != NV_ENC_SUCCESS) { + HELPER_LOG("nvEncOpenEncodeSessionEx failed: %d", st); + cu->cuCtxDestroy(enc->cudaCtx); + return false; + } + + /* Select codec and profile GUIDs */ + GUID codecGuid = (params->codec == 0) ? NV_ENC_CODEC_H264_GUID : NV_ENC_CODEC_HEVC_GUID; + GUID profileGuid; + if (params->codec == 0) { + /* H.264 */ + profileGuid = NV_ENC_H264_PROFILE_HIGH_GUID; + } else { + /* HEVC */ + profileGuid = params->is10bit ? NV_ENC_HEVC_PROFILE_MAIN10_GUID : NV_ENC_HEVC_PROFILE_MAIN_GUID; + } + + /* Get preset config */ + NV_ENC_PRESET_CONFIG presetConfig = {0}; + presetConfig.version = NV_ENC_PRESET_CONFIG_VER; + presetConfig.presetCfg.version = NV_ENC_CONFIG_VER; + + st = enc->funcs.nvEncGetEncodePresetConfigEx( + enc->encoder, codecGuid, NV_ENC_PRESET_P4_GUID, + NV_ENC_TUNING_INFO_LOW_LATENCY, &presetConfig); + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("nvEncGetEncodePresetConfigEx failed: %d", st); + goto fail; + } + + NV_ENC_CONFIG encConfig; + memcpy(&encConfig, &presetConfig.presetCfg, sizeof(encConfig)); + encConfig.version = NV_ENC_CONFIG_VER; + encConfig.profileGUID = profileGuid; + encConfig.frameIntervalP = 1; /* No B-frames for synchronous encode */ + + if (params->bitrate > 0) { + encConfig.rcParams.averageBitRate = params->bitrate; + } + if (params->maxBitrate > 0) { + encConfig.rcParams.maxBitRate = params->maxBitrate; + } + if (params->gopLength > 0) { + encConfig.gopLength = params->gopLength; + } + + /* Initialize encoder */ + NV_ENC_INITIALIZE_PARAMS initParams = {0}; + initParams.version = NV_ENC_INITIALIZE_PARAMS_VER; + initParams.encodeGUID = codecGuid; + initParams.presetGUID = NV_ENC_PRESET_P4_GUID; + initParams.encodeWidth = params->width; + initParams.encodeHeight = params->height; + initParams.darWidth = params->width; + initParams.darHeight = params->height; + initParams.frameRateNum = params->frameRateNum > 0 ? params->frameRateNum : 30; + initParams.frameRateDen = params->frameRateDen > 0 ? 
params->frameRateDen : 1; + initParams.enablePTD = 1; + initParams.encodeConfig = &encConfig; + initParams.maxEncodeWidth = params->width; + initParams.maxEncodeHeight = params->height; + initParams.tuningInfo = NV_ENC_TUNING_INFO_LOW_LATENCY; + + st = enc->funcs.nvEncInitializeEncoder(enc->encoder, &initParams); + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("nvEncInitializeEncoder failed: %d", st); + goto fail; + } + + /* Create NVENC-managed input buffer */ + NV_ENC_CREATE_INPUT_BUFFER createIn = {0}; + createIn.version = NV_ENC_CREATE_INPUT_BUFFER_VER; + createIn.width = params->width; + createIn.height = params->height; + createIn.bufferFmt = params->is10bit ? NV_ENC_BUFFER_FORMAT_YUV420_10BIT : NV_ENC_BUFFER_FORMAT_NV12; + + st = enc->funcs.nvEncCreateInputBuffer(enc->encoder, &createIn); + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("nvEncCreateInputBuffer failed: %d", st); + goto fail; + } + enc->inputBuffer = createIn.inputBuffer; + + /* Create output bitstream buffer */ + NV_ENC_CREATE_BITSTREAM_BUFFER createOut = {0}; + createOut.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER; + + st = enc->funcs.nvEncCreateBitstreamBuffer(enc->encoder, &createOut); + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("nvEncCreateBitstreamBuffer failed: %d", st); + enc->funcs.nvEncDestroyInputBuffer(enc->encoder, enc->inputBuffer); + goto fail; + } + enc->outputBuffer = createOut.bitstreamBuffer; + + enc->width = params->width; + enc->height = params->height; + enc->is10bit = params->is10bit; + enc->frameCount = 0; + enc->bsBufSize = 4 * 1024 * 1024; + enc->bsBuf = malloc(enc->bsBufSize); + enc->initialized = true; + + /* Allocate persistent CUDA linear buffer for GPU-side encode. + * This replaces nvEncLockInputBuffer (host memory) with a CUDA device + * buffer registered once with NVENC. Per-frame: single cuMemcpy2D + * (host→device with pitch conversion) + nvEncMapInputResource. */ + uint32_t bpp = params->is10bit ? 
2 : 1; + enc->gpuBufPitch = params->width * bpp; + enc->gpuBufPitch = (enc->gpuBufPitch + 255) & ~255; /* Align to 256 */ + enc->gpuBufSize = enc->gpuBufPitch * params->height * 3 / 2; + enc->gpuBufReady = false; + + CUresult cres = cu->cuMemAlloc(&enc->gpuBuf, enc->gpuBufSize); + if (cres == CUDA_SUCCESS) { + NV_ENC_BUFFER_FORMAT bufFmt = params->is10bit + ? NV_ENC_BUFFER_FORMAT_YUV420_10BIT : NV_ENC_BUFFER_FORMAT_NV12; + + NV_ENC_REGISTER_RESOURCE regRes = {0}; + regRes.version = NV_ENC_REGISTER_RESOURCE_VER; + regRes.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR; + regRes.resourceToRegister = (void *)enc->gpuBuf; + regRes.width = params->width; + regRes.height = params->height; + regRes.pitch = enc->gpuBufPitch; + regRes.bufferFormat = bufFmt; + regRes.bufferUsage = NV_ENC_INPUT_IMAGE; + + st = enc->funcs.nvEncRegisterResource(enc->encoder, ®Res); + if (st == NV_ENC_SUCCESS) { + enc->gpuBufReg = regRes.registeredResource; + enc->gpuBufReady = true; + HELPER_LOG("GPU buffer: %u bytes, pitch=%u (persistent CUDA+NVENC)", + enc->gpuBufSize, enc->gpuBufPitch); + } else { + HELPER_LOG("GPU buffer register failed (%d), falling back to host path", st); + cu->cuMemFree(enc->gpuBuf); + enc->gpuBuf = 0; + } + } else { + HELPER_LOG("GPU buffer alloc failed (%d), falling back to host path", cres); + enc->gpuBuf = 0; + } + + HELPER_LOG("Encoder initialized: %ux%u %s %s (gpu=%s)", + params->width, params->height, + params->codec == 0 ? "H.264" : "HEVC", + params->is10bit ? "10-bit" : "8-bit", + enc->gpuBufReady ? "yes" : "no"); + return true; + +fail: + enc->funcs.nvEncDestroyEncoder(enc->encoder); + enc->encoder = NULL; + cu->cuCtxDestroy(enc->cudaCtx); + enc->cudaCtx = NULL; + return false; +} + +static bool encoder_encode(HelperEncoder *enc, const void *frame_data, + uint32_t frame_width, uint32_t frame_height, + uint32_t frame_size, bool force_idr, + void **out_data, uint32_t *out_size) +{ + NVENCSTATUS st; + uint32_t bpp = enc->is10bit ? 
2 : 1; + uint32_t srcPitch = frame_width * bpp; + NV_ENC_INPUT_PTR encodeInput; + NV_ENC_BUFFER_FORMAT encFmt = enc->is10bit + ? NV_ENC_BUFFER_FORMAT_YUV420_10BIT : NV_ENC_BUFFER_FORMAT_NV12; + uint32_t encodePitch; + bool usedGpuPath = false; + + if (enc->gpuBufReady) { + /* GPU FAST PATH: cuMemcpy2D host→device with pitch conversion. + * Single CUDA call replaces 1080+ individual memcpy calls. + * GPU DMA engine handles pitch conversion in hardware. + * NVENC reads from VRAM — no PCIe upload at encode time. */ + uint32_t padLines = enc->height - frame_height; + + /* Luma: host SHM → GPU buffer */ + CUDA_MEMCPY2D cpyLuma = {0}; + cpyLuma.srcMemoryType = CU_MEMORYTYPE_HOST; + cpyLuma.srcHost = frame_data; + cpyLuma.srcPitch = srcPitch; + cpyLuma.dstMemoryType = CU_MEMORYTYPE_DEVICE; + cpyLuma.dstDevice = enc->gpuBuf; + cpyLuma.dstPitch = enc->gpuBufPitch; + cpyLuma.WidthInBytes = srcPitch; + cpyLuma.Height = frame_height; + + CUresult cres = cu->cuMemcpy2D(&cpyLuma); + if (cres != CUDA_SUCCESS) { + HELPER_LOG("GPU path: luma cuMemcpy2D failed: %d, falling back", cres); + goto host_fallback; + } + + /* Chroma: host SHM → GPU buffer */ + uint32_t chromaOff_src = srcPitch * frame_height; + uint32_t chromaOff_dst = enc->gpuBufPitch * enc->height; + uint32_t chromaHeight = frame_height / 2; + + CUDA_MEMCPY2D cpyChroma = {0}; + cpyChroma.srcMemoryType = CU_MEMORYTYPE_HOST; + cpyChroma.srcHost = (const uint8_t *)frame_data + chromaOff_src; + cpyChroma.srcPitch = srcPitch; + cpyChroma.dstMemoryType = CU_MEMORYTYPE_DEVICE; + cpyChroma.dstDevice = enc->gpuBuf + chromaOff_dst; + cpyChroma.dstPitch = enc->gpuBufPitch; + cpyChroma.WidthInBytes = srcPitch; + cpyChroma.Height = chromaHeight; + + cres = cu->cuMemcpy2D(&cpyChroma); + if (cres != CUDA_SUCCESS) { + HELPER_LOG("GPU path: chroma cuMemcpy2D failed: %d, falling back", cres); + goto host_fallback; + } + + /* Zero padding rows on GPU (async, only if needed) */ + if (padLines > 0) { + cu->cuMemsetD8Async(enc->gpuBuf + 
enc->gpuBufPitch * frame_height, + 0, enc->gpuBufPitch * padLines, 0); + cu->cuMemsetD8Async(enc->gpuBuf + chromaOff_dst + enc->gpuBufPitch * chromaHeight, + 128, enc->gpuBufPitch * (padLines / 2), 0); + } + + /* Map the persistent registered resource */ + NV_ENC_MAP_INPUT_RESOURCE mapRes = {0}; + mapRes.version = NV_ENC_MAP_INPUT_RESOURCE_VER; + mapRes.registeredResource = enc->gpuBufReg; + + st = enc->funcs.nvEncMapInputResource(enc->encoder, &mapRes); + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("GPU path: nvEncMapInputResource failed: %d, falling back", st); + goto host_fallback; + } + + encodeInput = mapRes.mappedResource; + encFmt = mapRes.mappedBufferFmt; + encodePitch = enc->gpuBufPitch; + usedGpuPath = true; + goto do_encode; + } + +host_fallback: + /* HOST FALLBACK: nvEncLockInputBuffer + memcpy (original path) */ + { + NV_ENC_LOCK_INPUT_BUFFER lockIn = {0}; + lockIn.version = NV_ENC_LOCK_INPUT_BUFFER_VER; + lockIn.inputBuffer = enc->inputBuffer; + + st = enc->funcs.nvEncLockInputBuffer(enc->encoder, &lockIn); + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("nvEncLockInputBuffer failed: %d", st); + return false; + } + + uint32_t dstPitch = lockIn.pitch; + uint8_t *src = (uint8_t *)frame_data; + uint8_t *dst = (uint8_t *)lockIn.bufferDataPtr; + uint32_t chromaOffset_src = srcPitch * frame_height; + uint32_t chromaOffset_dst = dstPitch * enc->height; + uint32_t chromaHeight = frame_height / 2; + uint32_t padLines = enc->height - frame_height; + + if (srcPitch == dstPitch) { + memcpy(dst, src, srcPitch * frame_height); + memcpy(dst + chromaOffset_dst, src + chromaOffset_src, srcPitch * chromaHeight); + } else { + for (uint32_t y = 0; y < frame_height; y++) + memcpy(dst + y * dstPitch, src + y * srcPitch, srcPitch); + for (uint32_t y = 0; y < chromaHeight; y++) + memcpy(dst + chromaOffset_dst + y * dstPitch, + src + chromaOffset_src + y * srcPitch, srcPitch); + } + + if (padLines > 0) { + memset(dst + dstPitch * frame_height, 0, dstPitch * padLines); + memset(dst + 
chromaOffset_dst + dstPitch * chromaHeight, 128, dstPitch * (padLines / 2)); + } + + enc->funcs.nvEncUnlockInputBuffer(enc->encoder, enc->inputBuffer); + encodeInput = enc->inputBuffer; + encodePitch = dstPitch; + } + +do_encode:; + /* Encode */ + NV_ENC_PIC_PARAMS picParams = {0}; + picParams.version = NV_ENC_PIC_PARAMS_VER; + picParams.inputBuffer = encodeInput; + picParams.bufferFmt = encFmt; + picParams.inputWidth = enc->width; + picParams.inputHeight = enc->height; + picParams.inputPitch = encodePitch; + picParams.outputBitstream = enc->outputBuffer; + picParams.pictureStruct = NV_ENC_PIC_STRUCT_FRAME; + picParams.pictureType = NV_ENC_PIC_TYPE_UNKNOWN; + /* Force IDR: on first frame, on explicit request, or every 60 frames + * for streaming recovery. Without periodic IDR, a single lost packet + * causes the client to freeze until the next intra_period (up to 60s). */ + bool needIDR = (enc->frameCount == 0) || force_idr || (enc->frameCount % NVENC_HELPER_IDR_INTERVAL == 0); + picParams.encodePicFlags = needIDR + ? 
(NV_ENC_PIC_FLAG_OUTPUT_SPSPPS | NV_ENC_PIC_FLAG_FORCEIDR) + : 0; + picParams.frameIdx = (uint32_t)enc->frameCount; + picParams.inputTimeStamp = enc->frameCount; + + st = enc->funcs.nvEncEncodePicture(enc->encoder, &picParams); + + /* Unmap the GPU resource after encode (must happen before next map) */ + if (usedGpuPath) { + enc->funcs.nvEncUnmapInputResource(enc->encoder, encodeInput); + } + + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("nvEncEncodePicture failed: %d", st); + return false; + } + + enc->frameCount++; + + if (enc->frameCount % 300 == 0) { + HELPER_LOG("Encoded %lu frames", (unsigned long)enc->frameCount); + } + + /* Lock output bitstream */ + NV_ENC_LOCK_BITSTREAM lockOut = {0}; + lockOut.version = NV_ENC_LOCK_BITSTREAM_VER; + lockOut.outputBitstream = enc->outputBuffer; + + st = enc->funcs.nvEncLockBitstream(enc->encoder, &lockOut); + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("nvEncLockBitstream failed: %d", st); + return false; + } + + /* Copy bitstream data */ + *out_size = lockOut.bitstreamSizeInBytes; + + //grow pre-allocated buffer if needed + if (lockOut.bitstreamSizeInBytes > enc->bsBufSize) { + uint32_t newSize = lockOut.bitstreamSizeInBytes + (lockOut.bitstreamSizeInBytes >> 1); + uint8_t *newBuf = realloc(enc->bsBuf, newSize); + if (newBuf == NULL) { + enc->funcs.nvEncUnlockBitstream(enc->encoder, enc->outputBuffer); + return false; + } + enc->bsBuf = newBuf; + enc->bsBufSize = newSize; + } + memcpy(enc->bsBuf, lockOut.bitstreamBufferPtr, lockOut.bitstreamSizeInBytes); + *out_data = enc->bsBuf; + + enc->funcs.nvEncUnlockBitstream(enc->encoder, enc->outputBuffer); + + return true; +} + +static void encoder_close(HelperEncoder *enc) +{ + if (enc->encoder == NULL) return; + + /* Flush */ + if (enc->initialized) { + NV_ENC_PIC_PARAMS picParams = {0}; + picParams.version = NV_ENC_PIC_PARAMS_VER; + picParams.encodePicFlags = NV_ENC_PIC_FLAG_EOS; + enc->funcs.nvEncEncodePicture(enc->encoder, &picParams); + } + + if (enc->outputBuffer) { + 
enc->funcs.nvEncDestroyBitstreamBuffer(enc->encoder, enc->outputBuffer); + } + if (enc->inputBuffer) { + enc->funcs.nvEncDestroyInputBuffer(enc->encoder, enc->inputBuffer); + } + /* Free persistent GPU buffer */ + if (enc->gpuBufReady) { + enc->funcs.nvEncUnregisterResource(enc->encoder, enc->gpuBufReg); + enc->gpuBufReady = false; + } + if (enc->gpuBuf) { + cu->cuMemFree(enc->gpuBuf); + enc->gpuBuf = 0; + } + + enc->funcs.nvEncDestroyEncoder(enc->encoder); + enc->encoder = NULL; + + if (enc->cudaCtx) { + cu->cuCtxDestroy(enc->cudaCtx); + enc->cudaCtx = NULL; + } + + free(enc->bsBuf); + enc->bsBuf = NULL; + enc->bsBufSize = 0; + enc->initialized = false; + HELPER_LOG("Encoder closed (encoded %lu frames)", (unsigned long)enc->frameCount); +} + +/* Handle one client connection */ +static void handle_client(int client_fd) +{ + HelperEncoder enc = {0}; + void *shm_ptr = MAP_FAILED; + uint32_t shm_size = 0; + int shm_fd = -1; + + HELPER_LOG("Client connected (fd=%d)", client_fd); + + while (running) { + //wait for data with 5s timeout (detect dead clients) + struct pollfd cpfd = { .fd = client_fd, .events = POLLIN }; + int pr = poll(&cpfd, 1, 5000); + if (pr == 0) { + HELPER_LOG("Client timeout (5s), disconnecting"); + break; + } + if (pr < 0) { + if (errno == EINTR) continue; + break; + } + + NVEncIPCMsgHeader hdr; + if (!recv_all(client_fd, &hdr, sizeof(hdr))) { + HELPER_LOG("Client disconnected"); + break; + } + + switch (hdr.cmd) { + case NVENC_IPC_CMD_INIT: { + if (hdr.payload_size != sizeof(NVEncIPCInitParams)) { + send_response(client_fd, -1, NULL, 0); + break; + } + NVEncIPCInitParams params; + if (!recv_all(client_fd, ¶ms, sizeof(params))) goto done; + + if (enc.initialized) { + encoder_close(&enc); + } + + /* Clean up old shm if any */ + if (shm_ptr != MAP_FAILED) { + munmap(shm_ptr, shm_size); + shm_ptr = MAP_FAILED; + } + if (shm_fd >= 0) { + close(shm_fd); + shm_fd = -1; + } + + bool ok = encoder_init(&enc, ¶ms); + if (!ok) { + send_response(client_fd, -1, 
NULL, 0); + break; + } + + /* Create shared memory for frame transfer. + * NV12 = w*h*1.5, P010 = w*h*3 */ + uint32_t bpp = params.is10bit ? 2 : 1; + shm_size = params.width * bpp * params.height * 3 / 2; + shm_fd = memfd_create("nvenc-frame", MFD_CLOEXEC); + if (shm_fd < 0 || ftruncate(shm_fd, shm_size) < 0) { + HELPER_LOG("Failed to create shm: %s", strerror(errno)); + if (shm_fd >= 0) { close(shm_fd); shm_fd = -1; } + /* Fall back to socket-based transfer (no shm). + * Send normal response without fd (no SCM_RIGHTS with fd=-1). */ + NVEncIPCInitResponse iresp = { .shm_size = 0 }; + send_response(client_fd, 0, &iresp, sizeof(iresp)); + break; + } + + shm_ptr = mmap(NULL, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0); + if (shm_ptr == MAP_FAILED) { + HELPER_LOG("Failed to mmap shm: %s", strerror(errno)); + close(shm_fd); + shm_fd = -1; + NVEncIPCInitResponse iresp = { .shm_size = 0 }; + send_response(client_fd, 0, &iresp, sizeof(iresp)); + break; + } + + /* Send shm fd to client */ + int client_shm_fd = dup(shm_fd); /* dup because SCM_RIGHTS transfers ownership */ + NVEncIPCInitResponse iresp = { .shm_size = shm_size }; + HELPER_LOG("Created shm: %u bytes, fd=%d", shm_size, client_shm_fd); + send_response_with_fd(client_fd, 0, client_shm_fd, &iresp, sizeof(iresp)); + close(client_shm_fd); + break; + } + + case NVENC_IPC_CMD_ENCODE: { + if (!enc.initialized || hdr.payload_size > NVENC_IPC_MAX_FRAME_SIZE + sizeof(NVEncIPCEncodeParams)) { + /* Drain the payload with a fixed buffer to avoid huge malloc */ + char drain[4096]; + uint32_t remaining = hdr.payload_size; + while (remaining > 0) { + uint32_t chunk = remaining < sizeof(drain) ? 
remaining : sizeof(drain); + if (!recv_all(client_fd, drain, chunk)) goto done; + remaining -= chunk; + } + send_response(client_fd, -1, NULL, 0); + break; + } + + NVEncIPCEncodeParams ep; + if (!recv_all(client_fd, &ep, sizeof(ep))) goto done; + + if (ep.frame_size > NVENC_IPC_MAX_FRAME_SIZE) { + HELPER_LOG("CMD_ENCODE: frame_size %u exceeds max %u", ep.frame_size, NVENC_IPC_MAX_FRAME_SIZE); + send_response(client_fd, -1, NULL, 0); + goto done; + } + + /* Receive frame data */ + void *frame = malloc(ep.frame_size); + if (frame == NULL) { + send_response(client_fd, -1, NULL, 0); + goto done; + } + if (!recv_all(client_fd, frame, ep.frame_size)) { + free(frame); + goto done; + } + + + void *bitstream = NULL; + uint32_t bsSize = 0; + bool ok = encoder_encode(&enc, frame, ep.width, ep.height, ep.frame_size, ep.force_idr, &bitstream, &bsSize); + free(frame); + + + if (ok) { + send_response(client_fd, 0, bitstream, bsSize); + } else { + send_response(client_fd, -1, NULL, 0); + } + break; + } + + case NVENC_IPC_CMD_ENCODE_DMABUF: { + if (!enc.initialized) { + if (hdr.payload_size > 0) { + void *tmp = malloc(hdr.payload_size); + if (tmp) { recv_all(client_fd, tmp, hdr.payload_size); free(tmp); } + } + send_response(client_fd, -1, NULL, 0); + break; + } + + /* Receive params WITH per-plane DMA-BUF fds via SCM_RIGHTS */ + NVEncIPCEncodeDmaBufParams dp; + int dmabuf_fds[4] = {-1, -1, -1, -1}; + int num_fds = 0; + { + struct iovec iov = { .iov_base = &dp, .iov_len = sizeof(dp) }; + union { + char buf[CMSG_SPACE(sizeof(int) * 4)]; + struct cmsghdr align; + } cmsg_buf; + memset(&cmsg_buf, 0, sizeof(cmsg_buf)); + + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = cmsg_buf.buf, + .msg_controllen = sizeof(cmsg_buf.buf), + }; + + ssize_t n = recvmsg(client_fd, &msg, 0); + if (n != sizeof(dp)) { + HELPER_LOG("DMABUF: recvmsg failed: %zd (errno=%d)", n, errno); + send_response(client_fd, -1, NULL, 0); + break; + } + + struct cmsghdr *cmsg = 
CMSG_FIRSTHDR(&msg); + if (cmsg && cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + num_fds = (int)((cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int)); + if (num_fds > 4) num_fds = 4; + memcpy(dmabuf_fds, CMSG_DATA(cmsg), (size_t)num_fds * sizeof(int)); + } + } + + if (num_fds < 1 || dmabuf_fds[0] < 0) { + HELPER_LOG("DMABUF: no fds received"); + send_response(client_fd, -1, NULL, 0); + break; + } + + + if (enc.frameCount < 3) { + HELPER_LOG("DMABUF: fds=[%d,%d] %ux%u planes=%u bppc=%u sizes=[%u,%u]", + dmabuf_fds[0], dmabuf_fds[1], + dp.width, dp.height, dp.num_planes, dp.bppc, + dp.sizes[0], dp.sizes[1]); + } + + /* Import each plane's DMA-BUF into CUDA as a CUarray, + * same as the driver's import_to_cuda in direct-export-buf.c */ + CUexternalMemory extMems[4] = {0}; + CUmipmappedArray mipmaps[4] = {0}; + CUarray arrays[4] = {0}; + bool importOk = true; + + for (int i = 0; i < (int)dp.num_planes && i < num_fds; i++) { + CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = { + .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, + .handle.fd = dmabuf_fds[i], + .size = dp.sizes[i], + .flags = 0, + }; + + CUresult cres = cu->cuImportExternalMemory(&extMems[i], &extMemDesc); + /* CUDA takes ownership of the fd on success */ + if (cres != CUDA_SUCCESS) { + HELPER_LOG("DMABUF: cuImportExternalMemory plane %d failed: %d", i, cres); + close(dmabuf_fds[i]); + importOk = false; + break; + } + + /* Determine plane format */ + int bpc = 8 * dp.bppc; + int channels = (i == 0) ? 1 : 2; /* Y=1ch, UV=2ch interleaved */ + uint32_t planeW = (i == 0) ? dp.width : dp.width / 2; + uint32_t planeH = (i == 0) ? dp.height : dp.height / 2; + + CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC mipmapDesc = { + .arrayDesc = { + .Width = planeW, + .Height = planeH, + .Depth = 0, + .Format = (bpc == 8) ? 
CU_AD_FORMAT_UNSIGNED_INT8 : CU_AD_FORMAT_UNSIGNED_INT16, + .NumChannels = (unsigned int)channels, + .Flags = 0, + }, + .numLevels = 1, + .offset = 0, + }; + + cres = cu->cuExternalMemoryGetMappedMipmappedArray(&mipmaps[i], extMems[i], &mipmapDesc); + if (cres != CUDA_SUCCESS) { + HELPER_LOG("DMABUF: cuExternalMemoryGetMappedMipmappedArray plane %d failed: %d", i, cres); + importOk = false; + break; + } + + cres = cu->cuMipmappedArrayGetLevel(&arrays[i], mipmaps[i], 0); + if (cres != CUDA_SUCCESS) { + HELPER_LOG("DMABUF: cuMipmappedArrayGetLevel plane %d failed: %d", i, cres); + importOk = false; + break; + } + } + + if (!importOk) { + for (int i = 0; i < 4; i++) { + if (mipmaps[i]) cu->cuMipmappedArrayDestroy(mipmaps[i]); + if (extMems[i]) cu->cuDestroyExternalMemory(extMems[i]); + /* Close any fds that CUDA didn't take ownership of */ + else if (i < num_fds && dmabuf_fds[i] >= 0) close(dmabuf_fds[i]); + } + /* Close remaining fds beyond what we tried to import */ + for (int i = (int)dp.num_planes; i < num_fds; i++) { + if (dmabuf_fds[i] >= 0) close(dmabuf_fds[i]); + } + send_response(client_fd, -1, NULL, 0); + break; + } + + /* Copy CUarrays to linear buffer (same as nvEndPictureEncode direct path) */ + uint32_t bpp = dp.is10bit ? 
2 : 1; + uint32_t pitch = dp.width * bpp; + pitch = (pitch + 255) & ~255; /* Align to 256 */ + uint32_t lumaSize = pitch * dp.height; + uint32_t chromaSize = pitch * (dp.height / 2); + uint32_t totalSize = lumaSize + chromaSize; + + CUdeviceptr linearBuf = 0; + CUresult cres = cu->cuMemAlloc(&linearBuf, totalSize); + if (cres != CUDA_SUCCESS) { + HELPER_LOG("DMABUF: cuMemAlloc(%u) failed: %d", totalSize, cres); + goto dmabuf_cleanup; + } + cu->cuMemsetD8Async(linearBuf, 0, totalSize, 0); + + /* Copy luma */ + CUDA_MEMCPY2D cpy = {0}; + cpy.srcMemoryType = CU_MEMORYTYPE_ARRAY; + cpy.srcArray = arrays[0]; + cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE; + cpy.dstDevice = linearBuf; + cpy.dstPitch = pitch; + cpy.WidthInBytes = dp.width * bpp; + cpy.Height = dp.height; + cres = cu->cuMemcpy2D(&cpy); + if (cres != CUDA_SUCCESS) { + HELPER_LOG("DMABUF: luma cuMemcpy2D failed: %d", cres); + cu->cuMemFree(linearBuf); + goto dmabuf_cleanup; + } + + /* Copy chroma */ + if (dp.num_planes >= 2 && arrays[1]) { + memset(&cpy, 0, sizeof(cpy)); + cpy.srcMemoryType = CU_MEMORYTYPE_ARRAY; + cpy.srcArray = arrays[1]; + cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE; + cpy.dstDevice = linearBuf + lumaSize; + cpy.dstPitch = pitch; + cpy.WidthInBytes = dp.width * bpp; + cpy.Height = dp.height / 2; + cres = cu->cuMemcpy2D(&cpy); + if (cres != CUDA_SUCCESS) { + HELPER_LOG("DMABUF: chroma cuMemcpy2D failed: %d", cres); + cu->cuMemFree(linearBuf); + goto dmabuf_cleanup; + } + } + + /* Register linear buffer with NVENC */ + NV_ENC_BUFFER_FORMAT bufFmt = dp.is10bit + ? 
NV_ENC_BUFFER_FORMAT_YUV420_10BIT : NV_ENC_BUFFER_FORMAT_NV12; + + NV_ENC_REGISTER_RESOURCE regRes = {0}; + regRes.version = NV_ENC_REGISTER_RESOURCE_VER; + regRes.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR; + regRes.resourceToRegister = (void *)linearBuf; + regRes.width = dp.width; + regRes.height = dp.height; + regRes.pitch = pitch; + regRes.bufferFormat = bufFmt; + regRes.bufferUsage = NV_ENC_INPUT_IMAGE; + + NVENCSTATUS nvst = enc.funcs.nvEncRegisterResource(enc.encoder, ®Res); + if (nvst != NV_ENC_SUCCESS) { + HELPER_LOG("DMABUF: nvEncRegisterResource failed: %d", nvst); + cu->cuMemFree(linearBuf); + goto dmabuf_cleanup; + } + + NV_ENC_MAP_INPUT_RESOURCE mapRes = {0}; + mapRes.version = NV_ENC_MAP_INPUT_RESOURCE_VER; + mapRes.registeredResource = regRes.registeredResource; + nvst = enc.funcs.nvEncMapInputResource(enc.encoder, &mapRes); + if (nvst != NV_ENC_SUCCESS) { + enc.funcs.nvEncUnregisterResource(enc.encoder, regRes.registeredResource); + cu->cuMemFree(linearBuf); + goto dmabuf_cleanup; + } + + /* Encode */ + NV_ENC_PIC_PARAMS picParams = {0}; + picParams.version = NV_ENC_PIC_PARAMS_VER; + picParams.inputBuffer = mapRes.mappedResource; + picParams.bufferFmt = mapRes.mappedBufferFmt; + picParams.inputWidth = dp.width; + picParams.inputHeight = dp.height; + picParams.inputPitch = pitch; + picParams.outputBitstream = enc.outputBuffer; + picParams.pictureStruct = NV_ENC_PIC_STRUCT_FRAME; + picParams.pictureType = NV_ENC_PIC_TYPE_UNKNOWN; + picParams.encodePicFlags = (enc.frameCount == 0) + ? 
(NV_ENC_PIC_FLAG_OUTPUT_SPSPPS | NV_ENC_PIC_FLAG_FORCEIDR) : 0; + picParams.frameIdx = (uint32_t)enc.frameCount; + picParams.inputTimeStamp = enc.frameCount; + + nvst = enc.funcs.nvEncEncodePicture(enc.encoder, &picParams); + + enc.funcs.nvEncUnmapInputResource(enc.encoder, mapRes.mappedResource); + enc.funcs.nvEncUnregisterResource(enc.encoder, regRes.registeredResource); + cu->cuMemFree(linearBuf); + + if (nvst != NV_ENC_SUCCESS) { + HELPER_LOG("DMABUF: nvEncEncodePicture failed: %d", nvst); + goto dmabuf_cleanup; + } + + enc.frameCount++; + if (enc.frameCount % 300 == 0) { + HELPER_LOG("Encoded %lu frames (DMABUF)", (unsigned long)enc.frameCount); + } + + /* Lock and send bitstream */ + { + NV_ENC_LOCK_BITSTREAM lockOut = {0}; + lockOut.version = NV_ENC_LOCK_BITSTREAM_VER; + lockOut.outputBitstream = enc.outputBuffer; + nvst = enc.funcs.nvEncLockBitstream(enc.encoder, &lockOut); + if (nvst == NV_ENC_SUCCESS) { + send_response(client_fd, 0, lockOut.bitstreamBufferPtr, + lockOut.bitstreamSizeInBytes); + enc.funcs.nvEncUnlockBitstream(enc.encoder, enc.outputBuffer); + } else { + send_response(client_fd, -1, NULL, 0); + } + } + +dmabuf_cleanup: + for (int i = 0; i < 4; i++) { + if (mipmaps[i]) cu->cuMipmappedArrayDestroy(mipmaps[i]); + if (extMems[i]) cu->cuDestroyExternalMemory(extMems[i]); + } + break; + } + + case NVENC_IPC_CMD_ENCODE_SHM: { + if (!enc.initialized || shm_ptr == MAP_FAILED) { + /* Drain payload */ + if (hdr.payload_size > 0) { + void *tmp = malloc(hdr.payload_size); + if (tmp) { recv_all(client_fd, tmp, hdr.payload_size); free(tmp); } + } + send_response(client_fd, -1, NULL, 0); + break; + } + + NVEncIPCEncodeShmParams sp; + if (!recv_all(client_fd, &sp, sizeof(sp))) goto done; + + + /* Encode directly from shared memory — no socket data transfer */ + void *bitstream = NULL; + uint32_t bsSize = 0; + bool ok = encoder_encode(&enc, shm_ptr, sp.width, sp.height, + sp.frame_size, sp.force_idr, + &bitstream, &bsSize); + + + if (ok) { + 
send_response(client_fd, 0, bitstream, bsSize); + } else { + send_response(client_fd, -1, NULL, 0); + } + break; + } + + case NVENC_IPC_CMD_CLOSE: + encoder_close(&enc); + send_response(client_fd, 0, NULL, 0); + goto done; + + default: + HELPER_LOG("Unknown command: %u", hdr.cmd); + send_response(client_fd, -1, NULL, 0); + break; + } + } + +done: + if (enc.initialized) { + cu->cuCtxPushCurrent(enc.cudaCtx); + encoder_close(&enc); + cu->cuCtxPopCurrent(NULL); + } + if (shm_ptr != MAP_FAILED) { + munmap(shm_ptr, shm_size); + } + if (shm_fd >= 0) { + close(shm_fd); + } + close(client_fd); + HELPER_LOG("Client handler done"); +} + +static void sighandler(int sig) +{ + (void)sig; + running = 0; +} + +int main(int argc, char **argv) +{ + (void)argc; (void)argv; + + /* Always log to stderr — this is a daemon, logs are essential for diagnostics */ + log_enabled = 1; + + signal(SIGTERM, sighandler); + signal(SIGINT, sighandler); + signal(SIGPIPE, SIG_IGN); + + HELPER_LOG("Starting nvenc-helper (pid=%d)", getpid()); + + /* Load CUDA */ + if (cuda_load_functions(&cu, NULL) != 0 || cu == NULL) { + HELPER_LOG("Failed to load CUDA"); + return 1; + } + + CUresult cres = cu->cuInit(0); + if (cres != CUDA_SUCCESS) { + HELPER_LOG("cuInit failed: %d", cres); + cuda_free_functions(&cu); + return 1; + } + + /* Load NVENC */ + if (nvenc_load_functions(&nv_dl, NULL) != 0 || nv_dl == NULL) { + HELPER_LOG("Failed to load NVENC"); + cuda_free_functions(&cu); + return 1; + } + + HELPER_LOG("CUDA and NVENC loaded"); + + /* Create socket */ + char sock_path[256]; + if (!nvenc_ipc_get_socket_path(sock_path, sizeof(sock_path))) { + HELPER_LOG("Failed to get socket path"); + return 1; + } + + unlink(sock_path); /* Remove stale socket */ + + int listen_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (listen_fd < 0) { + HELPER_LOG("socket: %s", strerror(errno)); + return 1; + } + + struct sockaddr_un addr; + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, sock_path, 
sizeof(addr.sun_path) - 1); + + mode_t old_umask = umask(0077); //socket created with 0700 permissions + if (bind(listen_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + HELPER_LOG("bind(%s): %s", sock_path, strerror(errno)); + umask(old_umask); + close(listen_fd); + return 1; + } + umask(old_umask); + + if (listen(listen_fd, 8) < 0) { + HELPER_LOG("listen: %s", strerror(errno)); + close(listen_fd); + unlink(sock_path); + return 1; + } + + HELPER_LOG("Listening on %s", sock_path); + + /* Accept loop — runs until SIGTERM/SIGINT */ + while (running) { + struct pollfd pfd = { .fd = listen_fd, .events = POLLIN }; + int ret = poll(&pfd, 1, -1); /* Block forever until connection or signal */ + + if (ret < 0) { + if (errno == EINTR) continue; + HELPER_LOG("poll: %s", strerror(errno)); + break; + } + + int client_fd = accept(listen_fd, NULL, NULL); + if (client_fd < 0) { + if (errno == EINTR) continue; + HELPER_LOG("accept: %s", strerror(errno)); + continue; /* Don't exit on accept error — keep listening */ + } + + /* Handle one client at a time (sufficient for Steam's single encode stream) */ + handle_client(client_fd); + HELPER_LOG("Ready for next client"); + } + + close(listen_fd); + unlink(sock_path); + nvenc_free_functions(&nv_dl); + cuda_free_functions(&cu); + HELPER_LOG("Exiting"); + return 0; +} diff --git a/src/nvenc-ipc-client.c b/src/nvenc-ipc-client.c new file mode 100644 index 00000000..e910e08e --- /dev/null +++ b/src/nvenc-ipc-client.c @@ -0,0 +1,379 @@ +#define _GNU_SOURCE +#include "nvenc-ipc.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Reliable send: loop until all bytes sent */ +static bool send_all(int fd, const void *buf, size_t len) +{ + const char *p = buf; + while (len > 0) { + ssize_t n = send(fd, p, len, MSG_NOSIGNAL); + if (n <= 0) { + if (n < 0 && errno == EINTR) continue; + return false; + } + p += n; + len -= (size_t)n; + } + return true; +} + +/* Reliable recv: loop until 
all bytes received */ +static bool recv_all(int fd, void *buf, size_t len) +{ + char *p = buf; + while (len > 0) { + ssize_t n = recv(fd, p, len, 0); + if (n <= 0) { + if (n < 0 && errno == EINTR) continue; + return false; + } + p += n; + len -= (size_t)n; + } + return true; +} + +bool nvenc_ipc_get_socket_path(char *buf, size_t bufsize) +{ + const char *runtime_dir = getenv("XDG_RUNTIME_DIR"); + if (runtime_dir == NULL) { + runtime_dir = "/tmp"; + } + int ret = snprintf(buf, bufsize, "%s/%s", runtime_dir, NVENC_IPC_SOCK_NAME); + return ret > 0 && (size_t)ret < bufsize; +} + +int nvenc_ipc_connect(void) +{ + char path[256]; + if (!nvenc_ipc_get_socket_path(path, sizeof(path))) { + return -1; + } + + int fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) { + return -1; + } + + struct sockaddr_un addr; + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1); + + if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + close(fd); + return -1; + } + + return fd; +} + +int nvenc_ipc_connect_or_start(const char *helper_path) +{ + /* Try connecting first */ + int fd = nvenc_ipc_connect(); + if (fd >= 0) { + return fd; + } + + /* Helper not running — start it */ + pid_t pid = fork(); + if (pid < 0) { + return -1; + } + + if (pid == 0) { + /* Child: exec the helper. + * Detach from parent's session so it survives parent exit. 
*/ + setsid(); + + /* Close inherited fds */ + for (int i = 3; i < 1024; i++) { + close(i); + } + + /* Redirect stdout/stderr to /dev/null unless NVD_LOG is set */ + if (getenv("NVD_LOG") == NULL) { + int devnull = open("/dev/null", O_WRONLY); + if (devnull >= 0) { + dup2(devnull, STDOUT_FILENO); + dup2(devnull, STDERR_FILENO); + close(devnull); + } + } + + execl(helper_path, helper_path, NULL); + _exit(127); + } + + /* Parent: wait for the helper to create the socket */ + for (int attempt = 0; attempt < 50; attempt++) { + usleep(100000); /* 100ms */ + fd = nvenc_ipc_connect(); + if (fd >= 0) { + return fd; + } + } + + /* Timed out — kill the child */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + return -1; +} + +/* Receive a single fd via SCM_RIGHTS */ +static int recv_fd(int sock, void *buf, size_t len) +{ + struct iovec iov = { .iov_base = buf, .iov_len = len }; + union { + char buf[CMSG_SPACE(sizeof(int))]; + struct cmsghdr align; + } cmsg_buf; + memset(&cmsg_buf, 0, sizeof(cmsg_buf)); + + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = cmsg_buf.buf, + .msg_controllen = sizeof(cmsg_buf.buf), + }; + + ssize_t n = recvmsg(sock, &msg, 0); + if (n != (ssize_t)len) return -1; + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (cmsg && cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) { + int received_fd = -1; + memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int)); + return received_fd; + } + return -1; +} + +int nvenc_ipc_init(int fd, const NVEncIPCInitParams *params, + int *shm_fd_out, uint32_t *shm_size_out) +{ + NVEncIPCMsgHeader hdr = { + .cmd = NVENC_IPC_CMD_INIT, + .payload_size = sizeof(*params) + }; + + if (!send_all(fd, &hdr, sizeof(hdr))) return -1; + if (!send_all(fd, params, sizeof(*params))) return -1; + + /* Response includes shm fd via SCM_RIGHTS + NVEncIPCInitResponse payload */ + NVEncIPCRespHeader resp; + NVEncIPCInitResponse init_resp = {0}; + + int shm_fd = recv_fd(fd, &resp, sizeof(resp)); + + if 
(resp.status != 0) { + if (shm_fd >= 0) close(shm_fd); + return resp.status; + } + + if (resp.payload_size >= sizeof(init_resp)) { + if (!recv_all(fd, &init_resp, sizeof(init_resp))) { + if (shm_fd >= 0) close(shm_fd); + return -1; + } + } + + if (shm_fd_out) { + *shm_fd_out = shm_fd; + } else if (shm_fd >= 0) { + close(shm_fd); + } + if (shm_size_out) *shm_size_out = init_resp.shm_size; + + return 0; +} + +int nvenc_ipc_encode(int fd, const void *frame_data, + uint32_t width, uint32_t height, uint32_t frame_size, + uint32_t force_idr, + void **bitstream_out, uint32_t *bitstream_size_out) +{ + NVEncIPCEncodeParams enc_params = { + .width = width, + .height = height, + .frame_size = frame_size, + .force_idr = force_idr, + }; + + NVEncIPCMsgHeader hdr = { + .cmd = NVENC_IPC_CMD_ENCODE, + .payload_size = sizeof(enc_params) + frame_size + }; + + if (!send_all(fd, &hdr, sizeof(hdr))) return -1; + if (!send_all(fd, &enc_params, sizeof(enc_params))) return -1; + if (!send_all(fd, frame_data, frame_size)) return -1; + + NVEncIPCRespHeader resp; + if (!recv_all(fd, &resp, sizeof(resp))) return -1; + + if (resp.status != 0) { + *bitstream_out = NULL; + *bitstream_size_out = 0; + return resp.status; + } + + if (resp.payload_size > 0) { + void *data = malloc(resp.payload_size); + if (data == NULL) return -1; + if (!recv_all(fd, data, resp.payload_size)) { + free(data); + return -1; + } + *bitstream_out = data; + *bitstream_size_out = resp.payload_size; + } else { + *bitstream_out = NULL; + *bitstream_size_out = 0; + } + + return 0; +} + +/* Send multiple DMA-BUF fds via SCM_RIGHTS ancillary data */ +static bool send_fds(int sock, const int *fds, int num_fds, const void *data, size_t len) +{ + struct iovec iov = { .iov_base = (void *)data, .iov_len = len }; + union { + char buf[CMSG_SPACE(sizeof(int) * 4)]; /* up to 4 fds */ + struct cmsghdr align; + } cmsg_buf; + memset(&cmsg_buf, 0, sizeof(cmsg_buf)); + + size_t fd_size = sizeof(int) * (size_t)num_fds; + struct msghdr msg = { 
+ .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = cmsg_buf.buf, + .msg_controllen = CMSG_SPACE(fd_size), + }; + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(fd_size); + memcpy(CMSG_DATA(cmsg), fds, fd_size); + + ssize_t n = sendmsg(sock, &msg, MSG_NOSIGNAL); + return n == (ssize_t)len; +} + +int nvenc_ipc_encode_dmabuf(int fd, const int *dmabuf_fds, int num_fds, + const NVEncIPCEncodeDmaBufParams *params, + void **bitstream_out, uint32_t *bitstream_size_out) +{ + NVEncIPCMsgHeader hdr = { + .cmd = NVENC_IPC_CMD_ENCODE_DMABUF, + .payload_size = sizeof(*params) + }; + + /* Send the header normally */ + if (!send_all(fd, &hdr, sizeof(hdr))) return -1; + + /* Send the params WITH the fds attached via SCM_RIGHTS */ + if (!send_fds(fd, dmabuf_fds, num_fds, params, sizeof(*params))) return -1; + + /* Receive response */ + NVEncIPCRespHeader resp; + if (!recv_all(fd, &resp, sizeof(resp))) return -1; + + if (resp.status != 0) { + *bitstream_out = NULL; + *bitstream_size_out = 0; + return resp.status; + } + + if (resp.payload_size > 0) { + void *data = malloc(resp.payload_size); + if (data == NULL) return -1; + if (!recv_all(fd, data, resp.payload_size)) { + free(data); + return -1; + } + *bitstream_out = data; + *bitstream_size_out = resp.payload_size; + } else { + *bitstream_out = NULL; + *bitstream_size_out = 0; + } + + return 0; +} + +int nvenc_ipc_encode_shm(int fd, uint32_t width, uint32_t height, + uint32_t frame_size, uint32_t force_idr, + void **bitstream_out, uint32_t *bitstream_size_out) +{ + NVEncIPCEncodeShmParams sp = { + .width = width, + .height = height, + .frame_size = frame_size, + .force_idr = force_idr, + }; + + NVEncIPCMsgHeader hdr = { + .cmd = NVENC_IPC_CMD_ENCODE_SHM, + .payload_size = sizeof(sp) + }; + + /* Only send the small header + params — pixel data is already in shm */ + if (!send_all(fd, &hdr, sizeof(hdr))) return -1; + if (!send_all(fd, &sp, 
sizeof(sp))) return -1; + + NVEncIPCRespHeader resp; + if (!recv_all(fd, &resp, sizeof(resp))) return -1; + + if (resp.status != 0) { + *bitstream_out = NULL; + *bitstream_size_out = 0; + return resp.status; + } + + if (resp.payload_size > 0) { + void *data = malloc(resp.payload_size); + if (data == NULL) return -1; + if (!recv_all(fd, data, resp.payload_size)) { + free(data); + return -1; + } + *bitstream_out = data; + *bitstream_size_out = resp.payload_size; + } else { + *bitstream_out = NULL; + *bitstream_size_out = 0; + } + + return 0; +} + +void nvenc_ipc_close(int fd) +{ + NVEncIPCMsgHeader hdr = { + .cmd = NVENC_IPC_CMD_CLOSE, + .payload_size = 0 + }; + /* Best-effort send; ignore errors since we're closing anyway */ + send_all(fd, &hdr, sizeof(hdr)); + + NVEncIPCRespHeader resp; + recv_all(fd, &resp, sizeof(resp)); + + close(fd); +} diff --git a/src/nvenc-ipc.h b/src/nvenc-ipc.h new file mode 100644 index 00000000..e4532bbc --- /dev/null +++ b/src/nvenc-ipc.h @@ -0,0 +1,139 @@ +#ifndef NVENC_IPC_H +#define NVENC_IPC_H + +#include +#include +#include + +/* + * IPC protocol between the VA-API driver and the 64-bit NVENC helper. + * + * When CUDA is unavailable (e.g. 32-bit process on Blackwell GPUs where + * cuInit fails), the driver delegates encoding to a 64-bit helper process + * via a Unix domain socket. On systems where CUDA works, the driver uses + * NVENC directly without the helper. + * + * Socket path: /run/user//nvenc-helper.sock + * + * All integers are in host byte order (both processes are on the same machine). + * Messages are: header + payload. Responses are: header + payload. 
+ */ + +#define NVENC_IPC_SOCK_NAME "nvenc-helper.sock" + +/* Maximum frame size we'll accept over the socket (64MB, enough for 8K NV12) */ +#define NVENC_IPC_MAX_FRAME_SIZE (64 * 1024 * 1024) + +/* Commands */ +#define NVENC_IPC_CMD_INIT 1 /* Initialize encoder */ +#define NVENC_IPC_CMD_ENCODE 2 /* Encode a frame (host pixel data) */ +#define NVENC_IPC_CMD_CLOSE 3 /* Close encoder and disconnect */ +#define NVENC_IPC_CMD_ENCODE_DMABUF 4 /* Encode from DMA-BUF fd (GPU zero-copy) */ +#define NVENC_IPC_CMD_ENCODE_SHM 5 /* Encode from shared memory (zero-copy host) */ + +/* Message header (client → helper) */ +typedef struct { + uint32_t cmd; + uint32_t payload_size; +} NVEncIPCMsgHeader; + +/* Response header (helper → client) */ +typedef struct { + int32_t status; /* 0 = success, <0 = error code */ + uint32_t payload_size; /* size of following data */ +} NVEncIPCRespHeader; + +/* CMD_INIT payload */ +typedef struct { + uint32_t width; + uint32_t height; + uint32_t codec; /* 0 = H.264, 1 = HEVC */ + uint32_t profile; /* VA-API profile value */ + uint32_t frameRateNum; + uint32_t frameRateDen; + uint32_t bitrate; + uint32_t maxBitrate; + uint32_t gopLength; + uint32_t is10bit; /* 0 = 8-bit NV12, 1 = 10-bit P010 */ +} NVEncIPCInitParams; + +/* CMD_ENCODE payload header (followed by frame_size bytes of NV12/P010 data) */ +typedef struct { + uint32_t width; + uint32_t height; + uint32_t frame_size; /* total bytes of pixel data */ + uint32_t force_idr; /* 1 = force IDR keyframe */ +} NVEncIPCEncodeParams; + +/* CMD_ENCODE_DMABUF payload. + * Multiple DMA-BUF fds (one per plane) sent via SCM_RIGHTS ancillary data. + * For NV12: 2 fds (Y plane, UV plane). 
*/ +typedef struct { + uint32_t width; + uint32_t height; + uint32_t pitches[4]; /* stride per plane */ + uint32_t offsets[4]; /* offset per plane */ + uint32_t sizes[4]; /* memory size per plane */ + uint32_t num_planes; + uint32_t bppc; /* bytes per pixel per channel */ + uint32_t is10bit; +} NVEncIPCEncodeDmaBufParams; + +/* CMD_INIT response includes a shm fd via SCM_RIGHTS. + * The shm region is large enough for one NV12/P010 frame. */ +typedef struct { + uint32_t shm_size; /* size of the shared memory region */ +} NVEncIPCInitResponse; + +/* CMD_ENCODE_SHM payload (frame data is already in shared memory) */ +typedef struct { + uint32_t width; + uint32_t height; + uint32_t frame_size; + uint32_t force_idr; +} NVEncIPCEncodeShmParams; + +/* IPC client functions (used by the driver when CUDA is unavailable) */ + +/* Get the socket path for this user */ +bool nvenc_ipc_get_socket_path(char *buf, size_t bufsize); + +/* Try to connect to the helper. Returns socket fd or -1. */ +int nvenc_ipc_connect(void); + +/* Start the helper if not running, then connect. Returns socket fd or -1. */ +int nvenc_ipc_connect_or_start(const char *helper_path); + +/* Send init command. Returns 0 on success. + * If shm_fd_out is non-NULL, receives the shared memory fd from the helper. + * If shm_size_out is non-NULL, receives the shm region size. */ +int nvenc_ipc_init(int fd, const NVEncIPCInitParams *params, + int *shm_fd_out, uint32_t *shm_size_out); + +/* Send frame data and receive encoded bitstream. + * bitstream_out is malloc'd by this function, caller must free. + * Returns 0 on success. */ +int nvenc_ipc_encode(int fd, const void *frame_data, + uint32_t width, uint32_t height, uint32_t frame_size, + uint32_t force_idr, + void **bitstream_out, uint32_t *bitstream_size_out); + +/* Send DMA-BUF fd and receive encoded bitstream (GPU zero-copy path). + * The fd is sent via SCM_RIGHTS ancillary data. + * bitstream_out is malloc'd by this function, caller must free. 
+ * Returns 0 on success. */ +int nvenc_ipc_encode_dmabuf(int fd, const int *dmabuf_fds, int num_fds, + const NVEncIPCEncodeDmaBufParams *params, + void **bitstream_out, uint32_t *bitstream_size_out); + +/* Encode from shared memory — frame data already written to shm. + * Only sends a small header, no pixel data over the socket. + * Returns 0 on success. */ +int nvenc_ipc_encode_shm(int fd, uint32_t width, uint32_t height, + uint32_t frame_size, uint32_t force_idr, + void **bitstream_out, uint32_t *bitstream_size_out); + +/* Send close command and close the socket. */ +void nvenc_ipc_close(int fd); + +#endif /* NVENC_IPC_H */ diff --git a/src/nvenc.c b/src/nvenc.c new file mode 100644 index 00000000..d946c2d9 --- /dev/null +++ b/src/nvenc.c @@ -0,0 +1,436 @@ +#include "nvenc.h" +#include "vabackend.h" + +#include +#include + +static bool check_nvenc_status(NVENCSTATUS status, const char *func, int line) +{ + if (status != NV_ENC_SUCCESS) { + LOG("NVENC error %d at %s:%d", status, func, line); + return false; + } + return true; +} +#define CHECK_NVENC(status) check_nvenc_status(status, __func__, __LINE__) + +bool nvenc_load(NvencFunctions **nvenc_dl) +{ + int ret = nvenc_load_functions(nvenc_dl, NULL); + if (ret != 0) { + LOG("Failed to load NVENC functions (libnvidia-encode.so)"); + *nvenc_dl = NULL; + return false; + } + //version format: API returns (major << 4 | minor) + uint32_t maxVersion = 0; + NVENCSTATUS st = (*nvenc_dl)->NvEncodeAPIGetMaxSupportedVersion(&maxVersion); + if (st != NV_ENC_SUCCESS) { + LOG("NvEncodeAPIGetMaxSupportedVersion failed: %d", st); + nvenc_free_functions(nvenc_dl); + *nvenc_dl = NULL; + return false; + } + uint32_t currentVersion = (NVENCAPI_MAJOR_VERSION << 4) | NVENCAPI_MINOR_VERSION; + LOG("NVENC max supported version: %u.%u, header version: %u.%u", + maxVersion >> 4, maxVersion & 0xf, + NVENCAPI_MAJOR_VERSION, NVENCAPI_MINOR_VERSION); + + if (currentVersion > maxVersion) { + LOG("NVENC header version (%u) is newer than driver 
supports (%u)", + currentVersion, maxVersion); + nvenc_free_functions(nvenc_dl); + *nvenc_dl = NULL; + return false; + } + return true; +} + +void nvenc_unload(NvencFunctions **nvenc_dl) +{ + if (*nvenc_dl != NULL) { + nvenc_free_functions(nvenc_dl); + *nvenc_dl = NULL; + } +} + +bool nvenc_open_session(NVENCContext *nvencCtx, NvencFunctions *nvenc_dl, CUcontext cudaCtx) +{ + /* Fill function list */ + nvencCtx->funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER; + NVENCSTATUS st = nvenc_dl->NvEncodeAPICreateInstance(&nvencCtx->funcs); + if (!CHECK_NVENC(st)) { + return false; + } + + /* Open encode session */ + NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS sessionParams = {0}; + sessionParams.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER; + sessionParams.deviceType = NV_ENC_DEVICE_TYPE_CUDA; + sessionParams.device = cudaCtx; + sessionParams.apiVersion = NVENCAPI_VERSION; + + st = nvencCtx->funcs.nvEncOpenEncodeSessionEx(&sessionParams, &nvencCtx->encoder); + if (!CHECK_NVENC(st)) { + nvencCtx->encoder = NULL; + return false; + } + + LOG("NVENC session opened: %p", nvencCtx->encoder); + return true; +} + +void nvenc_close_session(NVENCContext *nvencCtx) +{ + if (nvencCtx->encoder == NULL) { + return; + } + + /* Send EOS to flush encoder before freeing any buffers */ + if (nvencCtx->initialized) { + NV_ENC_PIC_PARAMS picParams = {0}; + picParams.version = NV_ENC_PIC_PARAMS_VER; + picParams.encodePicFlags = NV_ENC_PIC_FLAG_EOS; + nvencCtx->funcs.nvEncEncodePicture(nvencCtx->encoder, &picParams); + } + + /* Free output buffer after flush */ + nvenc_free_output_buffer(nvencCtx); + + /* Destroy encoder */ + NVENCSTATUS st = nvencCtx->funcs.nvEncDestroyEncoder(nvencCtx->encoder); + if (st != NV_ENC_SUCCESS) { + LOG("nvEncDestroyEncoder failed: %d", st); + } + + LOG("NVENC session closed"); + nvencCtx->encoder = NULL; + nvencCtx->initialized = false; +} + +bool nvenc_init_encoder(NVENCContext *nvencCtx, uint32_t width, uint32_t height, + GUID codecGuid, GUID profileGuid, GUID 
presetGuid, + NV_ENC_TUNING_INFO tuningInfo) +{ + NVENCSTATUS st; + + nvencCtx->codecGuid = codecGuid; + nvencCtx->profileGuid = profileGuid; + nvencCtx->width = width; + nvencCtx->height = height; + + //get preset config + NV_ENC_PRESET_CONFIG presetConfig = {0}; + presetConfig.version = NV_ENC_PRESET_CONFIG_VER; + presetConfig.presetCfg.version = NV_ENC_CONFIG_VER; + + st = nvencCtx->funcs.nvEncGetEncodePresetConfigEx( + nvencCtx->encoder, codecGuid, presetGuid, tuningInfo, &presetConfig); + if (!CHECK_NVENC(st)) { + return false; + } + + //apply overrides + memcpy(&nvencCtx->encodeConfig, &presetConfig.presetCfg, sizeof(NV_ENC_CONFIG)); + nvencCtx->encodeConfig.encodeCodecConfig.hevcConfig.pixelBitDepthMinus8 = nvencCtx->inputFormat == NV_ENC_BUFFER_FORMAT_YUV420_10BIT? 2: 0; + nvencCtx->encodeConfig.version = NV_ENC_CONFIG_VER; + nvencCtx->encodeConfig.profileGUID = profileGuid; + + if (nvencCtx->rcMode != 0) { + nvencCtx->encodeConfig.rcParams.rateControlMode = (NV_ENC_PARAMS_RC_MODE)nvencCtx->rcMode; + } + if (nvencCtx->bitrate > 0) { + nvencCtx->encodeConfig.rcParams.averageBitRate = nvencCtx->bitrate; + } + if (nvencCtx->maxBitrate > 0) { + nvencCtx->encodeConfig.rcParams.maxBitRate = nvencCtx->maxBitrate; + } + if (nvencCtx->vbvBufferSize > 0) { + nvencCtx->encodeConfig.rcParams.vbvBufferSize = nvencCtx->vbvBufferSize; + } + if (nvencCtx->vbvInitialDelay > 0) { + nvencCtx->encodeConfig.rcParams.vbvInitialDelay = nvencCtx->vbvInitialDelay; + } + + if (nvencCtx->intraPeriod > 0) { + nvencCtx->encodeConfig.gopLength = nvencCtx->intraPeriod; + } + //no B-frames: NVENC needs DPB management or returns NEED_MORE_INPUT which ffmpeg 6.x can't handle + nvencCtx->encodeConfig.frameIntervalP = 1; + + memset(&nvencCtx->initParams, 0, sizeof(nvencCtx->initParams)); + nvencCtx->initParams.version = NV_ENC_INITIALIZE_PARAMS_VER; + nvencCtx->initParams.encodeGUID = codecGuid; + nvencCtx->initParams.presetGUID = presetGuid; + nvencCtx->initParams.encodeWidth = width; + 
nvencCtx->initParams.encodeHeight = height; + nvencCtx->initParams.darWidth = width; + nvencCtx->initParams.darHeight = height; + nvencCtx->initParams.frameRateNum = nvencCtx->frameRateNum > 0 ? nvencCtx->frameRateNum : 30; + nvencCtx->initParams.frameRateDen = nvencCtx->frameRateDen > 0 ? nvencCtx->frameRateDen : 1; + nvencCtx->initParams.enablePTD = 1; + nvencCtx->initParams.encodeConfig = &nvencCtx->encodeConfig; + nvencCtx->initParams.maxEncodeWidth = width; + nvencCtx->initParams.maxEncodeHeight = height; + nvencCtx->initParams.tuningInfo = tuningInfo; + + st = nvencCtx->funcs.nvEncInitializeEncoder(nvencCtx->encoder, &nvencCtx->initParams); + if (!CHECK_NVENC(st)) { + return false; + } + + nvencCtx->initialized = true; + LOG("NVENC encoder initialized: %ux%u codec=%s", + width, height, + memcmp(&codecGuid, &NV_ENC_CODEC_H264_GUID, sizeof(GUID)) == 0 ? "H.264" : "HEVC"); + + return true; +} + +bool nvenc_alloc_output_buffer(NVENCContext *nvencCtx) +{ + if (nvencCtx->outputBuffer.allocated) { + return true; + } + + NV_ENC_CREATE_BITSTREAM_BUFFER createBuf = {0}; + createBuf.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER; + + NVENCSTATUS st = nvencCtx->funcs.nvEncCreateBitstreamBuffer( + nvencCtx->encoder, &createBuf); + if (!CHECK_NVENC(st)) { + return false; + } + + nvencCtx->outputBuffer.bitstreamBuffer = createBuf.bitstreamBuffer; + nvencCtx->outputBuffer.allocated = true; + nvencCtx->outputBuffer.locked = false; + nvencCtx->outputBuffer.lockedPtr = NULL; + nvencCtx->outputBuffer.lockedSize = 0; + + return true; +} + +void nvenc_free_output_buffer(NVENCContext *nvencCtx) +{ + if (!nvencCtx->outputBuffer.allocated || nvencCtx->encoder == NULL) { + return; + } + + /* Unlock if still locked */ + if (nvencCtx->outputBuffer.locked) { + nvenc_unlock_bitstream(nvencCtx); + } + + nvencCtx->funcs.nvEncDestroyBitstreamBuffer( + nvencCtx->encoder, nvencCtx->outputBuffer.bitstreamBuffer); + nvencCtx->outputBuffer.bitstreamBuffer = NULL; + nvencCtx->outputBuffer.allocated 
= false; +} + +bool nvenc_register_cuda_resource(NVENCContext *nvencCtx, CUdeviceptr devPtr, + uint32_t width, uint32_t height, uint32_t pitch, + NV_ENC_BUFFER_FORMAT format, + NV_ENC_REGISTERED_PTR *outRegistered) +{ + NV_ENC_REGISTER_RESOURCE regRes = {0}; + regRes.version = NV_ENC_REGISTER_RESOURCE_VER; + regRes.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR; + regRes.resourceToRegister = (void*)devPtr; + regRes.width = width; + regRes.height = height; + regRes.pitch = pitch; + regRes.bufferFormat = format; + regRes.bufferUsage = NV_ENC_INPUT_IMAGE; + + NVENCSTATUS st = nvencCtx->funcs.nvEncRegisterResource( + nvencCtx->encoder, ®Res); + if (!CHECK_NVENC(st)) { + return false; + } + + *outRegistered = regRes.registeredResource; + return true; +} + +bool nvenc_map_resource(NVENCContext *nvencCtx, NV_ENC_REGISTERED_PTR registered, + NV_ENC_INPUT_PTR *outMapped, NV_ENC_BUFFER_FORMAT *outFmt) +{ + NV_ENC_MAP_INPUT_RESOURCE mapRes = {0}; + mapRes.version = NV_ENC_MAP_INPUT_RESOURCE_VER; + mapRes.registeredResource = registered; + + NVENCSTATUS st = nvencCtx->funcs.nvEncMapInputResource( + nvencCtx->encoder, &mapRes); + if (!CHECK_NVENC(st)) { + return false; + } + + *outMapped = mapRes.mappedResource; + if (outFmt) { + *outFmt = mapRes.mappedBufferFmt; + } + return true; +} + +bool nvenc_unmap_resource(NVENCContext *nvencCtx, NV_ENC_INPUT_PTR mapped) +{ + NVENCSTATUS st = nvencCtx->funcs.nvEncUnmapInputResource( + nvencCtx->encoder, mapped); + return CHECK_NVENC(st); +} + +bool nvenc_unregister_resource(NVENCContext *nvencCtx, NV_ENC_REGISTERED_PTR registered) +{ + NVENCSTATUS st = nvencCtx->funcs.nvEncUnregisterResource( + nvencCtx->encoder, registered); + return CHECK_NVENC(st); +} + +/* + * Encode a frame. 
Returns: + * 1 = encoded successfully, output available + * 0 = needs more input (B-frame buffering), no output yet + * -1 = error + */ +int nvenc_encode_frame(NVENCContext *nvencCtx, NV_ENC_INPUT_PTR inputBuffer, + NV_ENC_BUFFER_FORMAT bufferFmt, + uint32_t inputWidth, uint32_t inputHeight, uint32_t inputPitch, + NV_ENC_PIC_TYPE picType, uint32_t picFlags) +{ + if (!nvencCtx->outputBuffer.allocated) { + if (!nvenc_alloc_output_buffer(nvencCtx)) { + return -1; + } + } + + NV_ENC_PIC_PARAMS picParams = {0}; + picParams.version = NV_ENC_PIC_PARAMS_VER; + picParams.inputBuffer = inputBuffer; + picParams.bufferFmt = bufferFmt; + picParams.inputWidth = inputWidth; + picParams.inputHeight = inputHeight; + picParams.inputPitch = inputPitch; + picParams.outputBitstream = nvencCtx->outputBuffer.bitstreamBuffer; + picParams.pictureStruct = NV_ENC_PIC_STRUCT_FRAME; + picParams.pictureType = picType; + picParams.encodePicFlags = picFlags; + picParams.frameIdx = (uint32_t)nvencCtx->frameCount; + picParams.inputTimeStamp = nvencCtx->frameCount; + + NVENCSTATUS st = nvencCtx->funcs.nvEncEncodePicture( + nvencCtx->encoder, &picParams); + + nvencCtx->frameCount++; + + if (st == NV_ENC_ERR_NEED_MORE_INPUT) { + /* B-frame reordering: NVENC needs more frames before producing output */ + return 0; + } + if (st != NV_ENC_SUCCESS) { + LOG("nvEncEncodePicture failed: %d", st); + return -1; + } + + return 1; +} + +bool nvenc_lock_bitstream(NVENCContext *nvencCtx, void **outPtr, uint32_t *outSize) +{ + NV_ENC_LOCK_BITSTREAM lockParams = {0}; + lockParams.version = NV_ENC_LOCK_BITSTREAM_VER; + lockParams.outputBitstream = nvencCtx->outputBuffer.bitstreamBuffer; + lockParams.doNotWait = 0; + + NVENCSTATUS st = nvencCtx->funcs.nvEncLockBitstream( + nvencCtx->encoder, &lockParams); + if (!CHECK_NVENC(st)) { + return false; + } + + *outPtr = lockParams.bitstreamBufferPtr; + *outSize = lockParams.bitstreamSizeInBytes; + nvencCtx->outputBuffer.locked = true; + nvencCtx->outputBuffer.lockedPtr = 
lockParams.bitstreamBufferPtr; + nvencCtx->outputBuffer.lockedSize = lockParams.bitstreamSizeInBytes; + + return true; +} + +bool nvenc_unlock_bitstream(NVENCContext *nvencCtx) +{ + if (!nvencCtx->outputBuffer.locked) { + return true; + } + + NVENCSTATUS st = nvencCtx->funcs.nvEncUnlockBitstream( + nvencCtx->encoder, nvencCtx->outputBuffer.bitstreamBuffer); + nvencCtx->outputBuffer.locked = false; + nvencCtx->outputBuffer.lockedPtr = NULL; + nvencCtx->outputBuffer.lockedSize = 0; + + return CHECK_NVENC(st); +} + +/* Profile/entrypoint helpers */ + +bool nvenc_is_encode_profile(VAProfile profile) +{ + switch (profile) { + case VAProfileH264ConstrainedBaseline: + case VAProfileH264Main: + case VAProfileH264High: + case VAProfileHEVCMain: + case VAProfileHEVCMain10: + return true; + default: + return false; + } +} + +GUID nvenc_va_profile_to_codec_guid(VAProfile profile) +{ + switch (profile) { + case VAProfileH264ConstrainedBaseline: + case VAProfileH264Main: + case VAProfileH264High: + return NV_ENC_CODEC_H264_GUID; + case VAProfileHEVCMain: + case VAProfileHEVCMain10: + return NV_ENC_CODEC_HEVC_GUID; + default: { + GUID empty = {0}; + return empty; + } + } +} + +GUID nvenc_va_profile_to_profile_guid(VAProfile profile) +{ + switch (profile) { + case VAProfileH264ConstrainedBaseline: + return NV_ENC_H264_PROFILE_BASELINE_GUID; + case VAProfileH264Main: + return NV_ENC_H264_PROFILE_MAIN_GUID; + case VAProfileH264High: + return NV_ENC_H264_PROFILE_HIGH_GUID; + case VAProfileHEVCMain: + return NV_ENC_HEVC_PROFILE_MAIN_GUID; + case VAProfileHEVCMain10: + return NV_ENC_HEVC_PROFILE_MAIN10_GUID; + default: { + GUID empty = {0}; + return empty; + } + } +} + +NV_ENC_BUFFER_FORMAT nvenc_surface_format(VAProfile profile) +{ + switch (profile) { + case VAProfileHEVCMain10: + return NV_ENC_BUFFER_FORMAT_YUV420_10BIT; + default: + return NV_ENC_BUFFER_FORMAT_NV12; + } +} diff --git a/src/nvenc.h b/src/nvenc.h new file mode 100644 index 00000000..dccbb4ab --- /dev/null +++ 
b/src/nvenc.h @@ -0,0 +1,108 @@ +#ifndef NVENC_H +#define NVENC_H + +#include +#include +#include +#include +#include +#include "vabackend.h" + +// Encode-specific context, stored in NVContext->encodeData +// when created with VAEntrypointEncSlice. + +typedef struct { + NV_ENC_OUTPUT_PTR bitstreamBuffer; + bool allocated; + void *lockedPtr; //locked bitstream pointer + uint32_t lockedSize; + bool locked; +} NVENCOutputBuffer; + +typedef struct { + void *encoder; //NVENC session handle + NV_ENCODE_API_FUNCTION_LIST funcs; + bool initialized; + GUID codecGuid; + GUID profileGuid; + NV_ENC_CONFIG encodeConfig; + NV_ENC_INITIALIZE_PARAMS initParams; + uint32_t width; + uint32_t height; + NV_ENC_BUFFER_FORMAT inputFormat; + bool seqParamSet; + uint32_t rcMode; //VA-API rate control mode + uint32_t bitrate; //bits/sec + uint32_t maxBitrate; + uint32_t frameRateNum; + uint32_t frameRateDen; + uint32_t intraPeriod; //GOP length + uint32_t ipPeriod; + uint32_t vbvBufferSize; //HRD buffer size (bits) + uint32_t vbvInitialDelay; //HRD initial fullness (bits) + uint64_t frameCount; + NVENCOutputBuffer outputBuffer; + VABufferID currentCodedBufId; + bool forceIDR; //from idr_pic_flag + NV_ENC_PIC_TYPE picType; //from slice params + bool useIPC; //encode via 64-bit helper + int ipcFd; //socket fd, -1 if not connected + void *shmPtr; //mmap'd shared memory for frame data + uint32_t shmSize; + int shmFd; +} NVENCContext; + +// Wraps VACodedBufferSegment with NVENC bitstream storage +typedef struct { + VACodedBufferSegment segment; + void *bitstreamData; + uint32_t bitstreamSize; + uint32_t bitstreamAlloc; + bool hasData; +} NVCodedBuffer; + +bool nvenc_load(NvencFunctions **nvenc_dl); +void nvenc_unload(NvencFunctions **nvenc_dl); + +bool nvenc_open_session(NVENCContext *nvencCtx, NvencFunctions *nvenc_dl, CUcontext cudaCtx); +void nvenc_close_session(NVENCContext *nvencCtx); + +bool nvenc_init_encoder(NVENCContext *nvencCtx, uint32_t width, uint32_t height, + GUID codecGuid, GUID 
profileGuid, GUID presetGuid, + NV_ENC_TUNING_INFO tuningInfo); + +bool nvenc_alloc_output_buffer(NVENCContext *nvencCtx); +void nvenc_free_output_buffer(NVENCContext *nvencCtx); + +bool nvenc_register_cuda_resource(NVENCContext *nvencCtx, CUdeviceptr devPtr, + uint32_t width, uint32_t height, uint32_t pitch, + NV_ENC_BUFFER_FORMAT format, + NV_ENC_REGISTERED_PTR *outRegistered); +bool nvenc_map_resource(NVENCContext *nvencCtx, NV_ENC_REGISTERED_PTR registered, + NV_ENC_INPUT_PTR *outMapped, NV_ENC_BUFFER_FORMAT *outFmt); +bool nvenc_unmap_resource(NVENCContext *nvencCtx, NV_ENC_INPUT_PTR mapped); +bool nvenc_unregister_resource(NVENCContext *nvencCtx, NV_ENC_REGISTERED_PTR registered); + +int nvenc_encode_frame(NVENCContext *nvencCtx, NV_ENC_INPUT_PTR inputBuffer, + NV_ENC_BUFFER_FORMAT bufferFmt, + uint32_t inputWidth, uint32_t inputHeight, uint32_t inputPitch, + NV_ENC_PIC_TYPE picType, uint32_t picFlags); + +bool nvenc_lock_bitstream(NVENCContext *nvencCtx, void **outPtr, uint32_t *outSize); +bool nvenc_unlock_bitstream(NVENCContext *nvencCtx); + +bool nvenc_is_encode_profile(VAProfile profile); +GUID nvenc_va_profile_to_codec_guid(VAProfile profile); +GUID nvenc_va_profile_to_profile_guid(VAProfile profile); +NV_ENC_BUFFER_FORMAT nvenc_surface_format(VAProfile profile); + +void h264enc_handle_sequence_params(NVENCContext *ctx, NVBuffer *buf); +void h264enc_handle_picture_params(NVENCContext *ctx, NVBuffer *buf); +void h264enc_handle_slice_params(NVENCContext *ctx, NVBuffer *buf); +void h264enc_handle_misc_params(NVENCContext *ctx, NVBuffer *buf); +void hevcenc_handle_sequence_params(NVENCContext *ctx, NVBuffer *buf); +void hevcenc_handle_picture_params(NVENCContext *ctx, NVBuffer *buf); +void hevcenc_handle_slice_params(NVENCContext *ctx, NVBuffer *buf); +void hevcenc_handle_misc_params(NVENCContext *ctx, NVBuffer *buf); + +#endif // NVENC_H diff --git a/src/vabackend.c b/src/vabackend.c index fb964f50..581210c3 100644 --- a/src/vabackend.c +++ 
b/src/vabackend.c @@ -2,6 +2,9 @@ #include "vabackend.h" #include "backend-common.h" +#include "nvenc.h" +#include "nvenc-ipc.h" + #include #include @@ -11,6 +14,7 @@ #include #include #include +#include #include #include @@ -67,6 +71,8 @@ static uint32_t max_instances; static CudaFunctions *cu; static CuvidFunctions *cv; +static NvencFunctions *nv; +static bool cudaInitSuccess; extern const NVCodec __start_nvd_codecs[]; extern const NVCodec __stop_nvd_codecs[]; @@ -164,12 +170,22 @@ static void init() { return; } + /* Load NVENC functions (optional — encoding won't work without it but decode still will) */ + if (!nvenc_load(&nv)) { + LOG("NVENC not available, encoding support disabled"); + /* nv is already NULL from nvenc_load on failure */ + } + //Not really much we can do here to abort the loading of the library - CHECK_CUDA_RESULT(cu->cuInit(0)); + cudaInitSuccess = !CHECK_CUDA_RESULT(cu->cuInit(0)); + if (!cudaInitSuccess) { + LOG("CUDA init failed — encode-only mode via IPC helper"); + } } __attribute__ ((destructor)) static void cleanup() { + nvenc_unload(&nv); if (cv != NULL) { cuvid_free_functions(&cv); } @@ -318,7 +334,34 @@ static void deleteObject(NVDriver *drv, VAGenericID id) { } static bool destroyContext(NVDriver *drv, NVContext *nvCtx) { - CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), false); + if (drv->cudaAvailable) { + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), false); + } + + if (nvCtx->isEncode) { + /* Encode context cleanup */ + NVENCContext *nvencCtx = (NVENCContext*) nvCtx->encodeData; + if (nvencCtx != NULL) { + if (nvencCtx->useIPC) { + if (nvencCtx->shmPtr != NULL) { + munmap(nvencCtx->shmPtr, nvencCtx->shmSize); + nvencCtx->shmPtr = NULL; + } + if (nvencCtx->ipcFd >= 0) { + nvenc_ipc_close(nvencCtx->ipcFd); + nvencCtx->ipcFd = -1; + } + } else { + nvenc_close_session(nvencCtx); + } + free(nvencCtx); + nvCtx->encodeData = NULL; + } + if (drv->cudaAvailable) { + 
CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), false); + } + return true; + } LOG("Signaling resolve thread to exit"); struct timespec timeout; @@ -607,30 +650,31 @@ static VAStatus nvQueryConfigEntrypoints( int *num_entrypoints /* out */ ) { - entrypoint_list[0] = VAEntrypointVLD; - *num_entrypoints = 1; + NVDriver *drv = (NVDriver*) ctx->pDriverData; + int count = 0; + + /* Decode entrypoint — supported for all profiles that have a codec (requires CUDA) */ + if (drv->cudaAvailable && vaToCuCodec(profile) != cudaVideoCodec_NONE) { + entrypoint_list[count++] = VAEntrypointVLD; + } + + /* Encode entrypoint — supported for H.264 and HEVC if NVENC is available */ + if (drv->nvencAvailable && nvenc_is_encode_profile(profile)) { + entrypoint_list[count++] = VAEntrypointEncSlice; + } + + *num_entrypoints = count; return VA_STATUS_SUCCESS; } -static VAStatus nvGetConfigAttributes( - VADriverContextP ctx, +static void nvGetConfigAttributesDecode( + NVDriver *drv, VAProfile profile, - VAEntrypoint entrypoint, - VAConfigAttrib *attrib_list, /* in/out */ + VAConfigAttrib *attrib_list, int num_attribs ) { - if (entrypoint != VAEntrypointVLD) { - return VA_STATUS_ERROR_UNSUPPORTED_ENTRYPOINT; - } - - NVDriver *drv = (NVDriver*) ctx->pDriverData; - if (vaToCuCodec(profile) == cudaVideoCodec_NONE) { - return VA_STATUS_ERROR_UNSUPPORTED_PROFILE; - } - //LOG("Got here with profile: %d == %d", profile, vaToCuCodec(profile)); - for (int i = 0; i < num_attribs; i++) { if (attrib_list[i].type == VAConfigAttribRTFormat) @@ -683,6 +727,81 @@ static VAStatus nvGetConfigAttributes( LOG("unhandled config attribute: %d", attrib_list[i].type); } } +} + +static void nvGetConfigAttributesEncode( + VAProfile profile, + VAConfigAttrib *attrib_list, + int num_attribs + ) +{ + for (int i = 0; i < num_attribs; i++) + { + switch (attrib_list[i].type) { + case VAConfigAttribRTFormat: + attrib_list[i].value = VA_RT_FORMAT_YUV420; + if (profile == VAProfileHEVCMain10) { + attrib_list[i].value |= 
VA_RT_FORMAT_YUV420_10; + } + break; + case VAConfigAttribRateControl: + attrib_list[i].value = VA_RC_CQP | VA_RC_CBR | VA_RC_VBR; + break; + case VAConfigAttribEncPackedHeaders: + //accept all packed header types; NVENC generates its own but + //apps (Steam) expect the driver to accept them without warning + attrib_list[i].value = VA_ENC_PACKED_HEADER_SEQUENCE + | VA_ENC_PACKED_HEADER_PICTURE + | VA_ENC_PACKED_HEADER_SLICE + | VA_ENC_PACKED_HEADER_MISC; + break; + case VAConfigAttribEncMaxRefFrames: + /* NVENC supports multiple reference frames; report a safe value */ + attrib_list[i].value = 1 | (1 << 16); /* 1 L0, 1 L1 */ + break; + case VAConfigAttribMaxPictureWidth: + attrib_list[i].value = 4096; + break; + case VAConfigAttribMaxPictureHeight: + attrib_list[i].value = 4096; + break; + case VAConfigAttribEncQualityRange: + attrib_list[i].value = 7; //NVENC presets P1-P7 + break; + default: + attrib_list[i].value = VA_ATTRIB_NOT_SUPPORTED; + break; + } + } +} + +static VAStatus nvGetConfigAttributes( + VADriverContextP ctx, + VAProfile profile, + VAEntrypoint entrypoint, + VAConfigAttrib *attrib_list, /* in/out */ + int num_attribs + ) +{ + NVDriver *drv = (NVDriver*) ctx->pDriverData; + + if (entrypoint == VAEntrypointEncSlice) { + if (!drv->nvencAvailable || !nvenc_is_encode_profile(profile)) { + return VA_STATUS_ERROR_UNSUPPORTED_ENTRYPOINT; + } + nvGetConfigAttributesEncode(profile, attrib_list, num_attribs); + return VA_STATUS_SUCCESS; + } + + if (entrypoint != VAEntrypointVLD) { + return VA_STATUS_ERROR_UNSUPPORTED_ENTRYPOINT; + } + + if (vaToCuCodec(profile) == cudaVideoCodec_NONE) { + return VA_STATUS_ERROR_UNSUPPORTED_PROFILE; + } + + nvGetConfigAttributesDecode(drv, profile, attrib_list, num_attribs); return VA_STATUS_SUCCESS; } @@ -697,6 +816,28 @@ static VAStatus nvCreateConfig( ) { NVDriver *drv = (NVDriver*) ctx->pDriverData; + + if (entrypoint == VAEntrypointEncSlice) { + /* Encode config */ + if (!drv->nvencAvailable || 
!nvenc_is_encode_profile(profile)) { + LOG("Encode not supported for profile: %d", profile); + return VA_STATUS_ERROR_UNSUPPORTED_PROFILE; + } + Object obj = allocateObject(drv, OBJECT_TYPE_CONFIG, sizeof(NVConfig)); + NVConfig *cfg = (NVConfig*) obj->obj; + cfg->profile = profile; + cfg->entrypoint = entrypoint; + cfg->isEncode = true; + cfg->cudaCodec = cudaVideoCodec_NONE; + cfg->chromaFormat = cudaVideoChromaFormat_420; + cfg->bitDepth = (profile == VAProfileHEVCMain10) ? 10 : 8; + cfg->surfaceFormat = (profile == VAProfileHEVCMain10) + ? cudaVideoSurfaceFormat_P016 + : cudaVideoSurfaceFormat_NV12; + *config_id = obj->id; + return VA_STATUS_SUCCESS; + } + //LOG("got profile: %d with %d attributes", profile, num_attribs); cudaVideoCodec cudaCodec = vaToCuCodec(profile); @@ -867,6 +1008,20 @@ static VAStatus nvQueryConfigAttributes( *profile = cfg->profile; *entrypoint = cfg->entrypoint; + + /* Encode config attributes */ + if (cfg->isEncode) { + int i = 0; + attrib_list[i].type = VAConfigAttribRTFormat; + attrib_list[i].value = VA_RT_FORMAT_YUV420; + if (cfg->profile == VAProfileHEVCMain10) { + attrib_list[i].value |= VA_RT_FORMAT_YUV420_10; + } + i++; + *num_attribs = i; + return VA_STATUS_SUCCESS; + } + int i = 0; attrib_list[i].value = VA_RT_FORMAT_YUV420; attrib_list[i].type = VAConfigAttribRTFormat; @@ -922,6 +1077,35 @@ static VAStatus nvCreateSurfaces2( { NVDriver *drv = (NVDriver*) ctx->pDriverData; + /* Log surface attributes for diagnostics */ + uint32_t memType = VA_SURFACE_ATTRIB_MEM_TYPE_VA; + VASurfaceAttribExternalBuffers *extBuf = NULL; + for (unsigned int a = 0; a < num_attribs; a++) { + LOG("Surface attrib[%u]: type=%d, flags=0x%x, value_type=%d", + a, attrib_list[a].type, attrib_list[a].flags, + attrib_list[a].value.type); + if (attrib_list[a].type == VASurfaceAttribMemoryType && + attrib_list[a].value.type == VAGenericValueTypeInteger) { + memType = attrib_list[a].value.value.i; + LOG(" MemoryType: 0x%x", memType); + } + if 
(attrib_list[a].type == VASurfaceAttribExternalBufferDescriptor && + attrib_list[a].value.type == VAGenericValueTypePointer) { + extBuf = (VASurfaceAttribExternalBuffers*)attrib_list[a].value.value.p; + if (extBuf) { + LOG(" ExternalBuffers: %ux%u fmt=0x%x planes=%u bufs=%u size=%u", + extBuf->width, extBuf->height, extBuf->pixel_format, + extBuf->num_planes, extBuf->num_buffers, extBuf->data_size); + for (unsigned int b = 0; b < extBuf->num_buffers && b < 4; b++) { + LOG(" buffer[%u] = %lu (fd or ptr)", b, (unsigned long)extBuf->buffers[b]); + } + for (unsigned int p = 0; p < extBuf->num_planes && p < 4; p++) { + LOG(" plane[%u]: pitch=%u offset=%u", p, extBuf->pitches[p], extBuf->offsets[p]); + } + } + } + } + cudaVideoSurfaceFormat nvFormat; cudaVideoChromaFormat chromaFormat; int bitdepth; @@ -978,7 +1162,9 @@ static VAStatus nvCreateSurfaces2( break; } - CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); + if (drv->cudaAvailable) { + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); + } for (uint32_t i = 0; i < num_surfaces; i++) { Object surfaceObject = allocateObject(drv, OBJECT_TYPE_SURFACE, sizeof(NVSurface)); @@ -991,13 +1177,47 @@ static VAStatus nvCreateSurfaces2( suf->bitDepth = bitdepth; suf->context = NULL; suf->chromaFormat = chromaFormat; + suf->hostPixelData = NULL; + suf->hostPixelSize = 0; + suf->importedDmaBufFd = -1; + suf->importedNumPlanes = 0; + suf->importedDataSize = 0; pthread_mutex_init(&suf->mutex, NULL); pthread_cond_init(&suf->cond, NULL); - LOG("Creating surface %ux%u, format %X (%p)", width, height, format, suf); + /* Store imported DMA-BUF if provided via external buffer attribs */ + if (extBuf != NULL && extBuf->num_buffers > 0) { + /* DRM_PRIME: buffers[] contains DMA-BUF fds. + * dup() the fd so the surface owns its own copy. 
*/ + if (memType & (VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME | VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2)) { + int srcFd = (int)extBuf->buffers[i < extBuf->num_buffers ? i : 0]; + suf->importedDmaBufFd = dup(srcFd); + suf->importedNumPlanes = extBuf->num_planes; + suf->importedDataSize = extBuf->data_size; + for (uint32_t p = 0; p < extBuf->num_planes && p < 4; p++) { + suf->importedPitches[p] = extBuf->pitches[p]; + suf->importedOffsets[p] = extBuf->offsets[p]; + } + LOG(" Surface %u: imported DMA-BUF fd=%d (dup of %d), size=%u", + i, suf->importedDmaBufFd, srcFd, suf->importedDataSize); + } + } + + /* In IPC encode-only mode, eagerly allocate the backing image now + * so the surface has GPU memory that can be exported via DMA-BUF. + * Steam's OpenGL capture needs to render into these surfaces BEFORE + * the encode begins. Without early allocation, the surface is empty. */ + if (!drv->cudaAvailable && drv->backend != NULL) { + drv->backend->realiseSurface(drv, suf); + } + + LOG("Creating surface %ux%u, format %X (%p) dmabuf=%d backing=%p", + width, height, format, suf, suf->importedDmaBufFd, suf->backingImage); } - CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); + if (drv->cudaAvailable) { + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); + } return VA_STATUS_SUCCESS; } @@ -1031,7 +1251,19 @@ static VAStatus nvDestroySurfaces( LOG("Destroying surface %d (%p)", surface->pictureIdx, surface); - drv->backend->detachBackingImageFromSurface(drv, surface); + if (!surface->hostPixelIsShm) { + free(surface->hostPixelData); + } + surface->hostPixelData = NULL; + + if (surface->importedDmaBufFd >= 0) { + close(surface->importedDmaBufFd); + surface->importedDmaBufFd = -1; + } + + if (drv->backend != NULL) { + drv->backend->detachBackingImageFromSurface(drv, surface); + } deleteObject(drv, surface_list[i]); } @@ -1057,7 +1289,68 @@ static VAStatus nvCreateContext( return VA_STATUS_ERROR_INVALID_CONFIG; } - 
LOG("Creating context with %d render targets, at %dx%d", num_render_targets, picture_width, picture_height); + LOG("Creating context with %d render targets, at %dx%d (encode=%d)", + num_render_targets, picture_width, picture_height, cfg->isEncode); + + /* Encode context path */ + if (cfg->isEncode) { + NVENCContext *nvencCtx = (NVENCContext*) calloc(1, sizeof(NVENCContext)); + if (nvencCtx == NULL) { + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + + nvencCtx->width = picture_width; + nvencCtx->height = picture_height; + nvencCtx->inputFormat = nvenc_surface_format(cfg->profile); + nvencCtx->frameRateNum = 30; + nvencCtx->frameRateDen = 1; + nvencCtx->ipcFd = -1; + nvencCtx->shmPtr = NULL; + nvencCtx->shmSize = 0; + nvencCtx->shmFd = -1; + + if (drv->cudaAvailable) { + /* Direct NVENC path (64-bit, CUDA works) */ + if (CHECK_CUDA_RESULT(cu->cuCtxPushCurrent(drv->cudaContext))) { + free(nvencCtx); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + + if (!nvenc_open_session(nvencCtx, drv->nv, drv->cudaContext)) { + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + free(nvencCtx); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + + if (CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL))) { + nvenc_close_session(nvencCtx); + free(nvencCtx); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + nvencCtx->useIPC = false; + } else { + /* IPC path: CUDA unavailable (e.g. 32-bit on Blackwell). + * Encoding delegated to 64-bit nvenc-helper via Unix socket. 
*/ + LOG("Using IPC encode path (CUDA unavailable)"); + nvencCtx->useIPC = true; + } + + Object contextObj = allocateObject(drv, OBJECT_TYPE_CONTEXT, sizeof(NVContext)); + NVContext *nvCtx = (NVContext*) contextObj->obj; + nvCtx->drv = drv; + nvCtx->profile = cfg->profile; + nvCtx->entrypoint = cfg->entrypoint; + nvCtx->width = picture_width; + nvCtx->height = picture_height; + nvCtx->isEncode = true; + nvCtx->encodeData = nvencCtx; + nvCtx->decoder = NULL; + nvCtx->codec = NULL; + + *context = contextObj->id; + LOG("Created encode context id: %d, ipc=%d", contextObj->id, nvencCtx->useIPC); + return VA_STATUS_SUCCESS; + } //find the codec they've selected const NVCodec *selectedCodec = NULL; @@ -1214,6 +1507,35 @@ static VAStatus nvCreateBuffer( return VA_STATUS_ERROR_INVALID_CONTEXT; } + /* Coded buffer for encoding: allocate NVCodedBuffer */ + if (type == VAEncCodedBufferType) { + NVCodedBuffer *coded = (NVCodedBuffer*) calloc(1, sizeof(NVCodedBuffer)); + if (coded == NULL) { + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + + /* Pre-allocate the bitstream storage */ + coded->bitstreamAlloc = size; /* size requested by app is the max coded size */ + coded->bitstreamData = malloc(size); + if (coded->bitstreamData == NULL) { + free(coded); + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + coded->hasData = false; + + Object bufferObject = allocateObject(drv, OBJECT_TYPE_BUFFER, sizeof(NVBuffer)); + *buf_id = bufferObject->id; + + NVBuffer *buf = (NVBuffer*) bufferObject->obj; + buf->bufferType = type; + buf->elements = 1; + buf->size = sizeof(NVCodedBuffer); + buf->ptr = coded; + buf->offset = 0; + + return VA_STATUS_SUCCESS; + } + //HACK: This is an awful hack to support VP8 videos when running within FFMPEG. //VA-API doesn't pass enough information for NVDEC to work with, but the information is there //just before the start of the buffer that was passed to us. 
@@ -1266,10 +1588,34 @@ static VAStatus nvMapBuffer( NVDriver *drv = (NVDriver*) ctx->pDriverData; NVBuffer *buf = getObjectPtr(drv, OBJECT_TYPE_BUFFER, buf_id); - if (buf == NULL) { + if (buf == NULL || buf->ptr == NULL) { return VA_STATUS_ERROR_INVALID_BUFFER; } + /* Coded buffer: return pointer to VACodedBufferSegment */ + if (buf->bufferType == VAEncCodedBufferType) { + NVCodedBuffer *coded = (NVCodedBuffer*) buf->ptr; + if (coded->hasData) { + coded->segment.size = coded->bitstreamSize; + coded->segment.bit_offset = 0; + coded->segment.status = 0; + coded->segment.reserved = 0; + coded->segment.buf = coded->bitstreamData; + coded->segment.next = NULL; + *pbuf = &coded->segment; + } else { + /* No data yet — return empty segment */ + coded->segment.size = 0; + coded->segment.bit_offset = 0; + coded->segment.status = 0; + coded->segment.reserved = 0; + coded->segment.buf = NULL; + coded->segment.next = NULL; + *pbuf = &coded->segment; + } + return VA_STATUS_SUCCESS; + } + *pbuf = buf->ptr; return VA_STATUS_SUCCESS; @@ -1296,6 +1642,12 @@ static VAStatus nvDestroyBuffer( } if (buf->ptr != NULL) { + /* Free coded buffer internals before freeing the NVCodedBuffer itself */ + if (buf->bufferType == VAEncCodedBufferType) { + NVCodedBuffer *coded = (NVCodedBuffer*) buf->ptr; + free(coded->bitstreamData); + coded->bitstreamData = NULL; + } free(buf->ptr); } @@ -1322,6 +1674,13 @@ static VAStatus nvBeginPicture( return VA_STATUS_ERROR_INVALID_SURFACE; } + /* Encode path: just record the render target */ + if (nvCtx->isEncode) { + nvCtx->renderTarget = surface; + surface->context = nvCtx; + return VA_STATUS_SUCCESS; + } + if (surface->context != NULL && surface->context != nvCtx) { //this surface was last used on a different context, we need to free up the backing image (it might not be the correct size) if (surface->backingImage != NULL) { @@ -1356,6 +1715,55 @@ static VAStatus nvBeginPicture( return VA_STATUS_SUCCESS; } +static void nvRenderPictureEncode(NVContext 
*nvCtx, NVBuffer *buf) +{ + NVENCContext *nvencCtx = (NVENCContext*) nvCtx->encodeData; + bool isH264 = (nvCtx->profile == VAProfileH264ConstrainedBaseline || + nvCtx->profile == VAProfileH264Main || + nvCtx->profile == VAProfileH264High); + + switch (buf->bufferType) { + case VAEncSequenceParameterBufferType: + if (isH264) { + h264enc_handle_sequence_params(nvencCtx, buf); + } else { + hevcenc_handle_sequence_params(nvencCtx, buf); + } + break; + case VAEncPictureParameterBufferType: + if (isH264) { + h264enc_handle_picture_params(nvencCtx, buf); + } else { + hevcenc_handle_picture_params(nvencCtx, buf); + } + break; + case VAEncSliceParameterBufferType: + if (isH264) { + h264enc_handle_slice_params(nvencCtx, buf); + } else { + hevcenc_handle_slice_params(nvencCtx, buf); + } + break; + case VAEncMiscParameterBufferType: + if (isH264) { + h264enc_handle_misc_params(nvencCtx, buf); + } else { + hevcenc_handle_misc_params(nvencCtx, buf); + } + break; + case VAEncCodedBufferType: + /* Coded buffer is handled at EndPicture */ + break; + case VAEncPackedHeaderParameterBufferType: + case VAEncPackedHeaderDataBufferType: + /* Packed headers: NVENC generates its own headers, skip these */ + break; + default: + LOG("Encode: unhandled buffer type: %d", buf->bufferType); + break; + } +} + static VAStatus nvRenderPicture( VADriverContextP ctx, VAContextID context, @@ -1370,14 +1778,19 @@ static VAStatus nvRenderPicture( return VA_STATUS_ERROR_INVALID_CONTEXT; } - CUVIDPICPARAMS *picParams = &nvCtx->pPicParams; - for (int i = 0; i < num_buffers; i++) { NVBuffer *buf = (NVBuffer*) getObjectPtr(drv, OBJECT_TYPE_BUFFER, buffers[i]); if (buf == NULL || buf->ptr == NULL) { LOG("Invalid buffer detected, skipping: %d", buffers[i]); continue; } + + if (nvCtx->isEncode) { + nvRenderPictureEncode(nvCtx, buf); + continue; + } + + CUVIDPICPARAMS *picParams = &nvCtx->pPicParams; HandlerFunc func = nvCtx->codec->handlers[buf->bufferType]; if (func != NULL) { func(nvCtx, buf, picParams); @@ 
-1389,6 +1802,450 @@ static VAStatus nvRenderPicture( return VA_STATUS_SUCCESS; } +static VAStatus nvEndPictureEncodeIPC(NVDriver *drv, NVContext *nvCtx); + +static VAStatus nvEndPictureEncode(NVDriver *drv, NVContext *nvCtx) +{ + NVENCContext *nvencCtx = (NVENCContext*) nvCtx->encodeData; + NVSurface *surface = nvCtx->renderTarget; + + if (nvencCtx == NULL) { + return VA_STATUS_ERROR_INVALID_CONTEXT; + } + + /* IPC path: delegate to 64-bit helper */ + if (nvencCtx->useIPC) { + return nvEndPictureEncodeIPC(drv, nvCtx); + } + + if (nvencCtx->encoder == NULL) { + return VA_STATUS_ERROR_INVALID_CONTEXT; + } + + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); + + /* Initialize encoder on first frame (we now have all params from sequence/picture buffers) */ + if (!nvencCtx->initialized) { + GUID codecGuid = nvenc_va_profile_to_codec_guid(nvCtx->profile); + GUID profileGuid = nvenc_va_profile_to_profile_guid(nvCtx->profile); + + if (!nvenc_init_encoder(nvencCtx, nvencCtx->width, nvencCtx->height, + codecGuid, profileGuid, + NV_ENC_PRESET_P4_GUID, + NV_ENC_TUNING_INFO_LOW_LATENCY)) { + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + + if (!nvenc_alloc_output_buffer(nvencCtx)) { + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + } + + /* Realise the surface so we have a backing image with CUDA memory */ + if (!drv->backend->realiseSurface(drv, surface)) { + LOG("Encode: failed to realise input surface"); + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + + BackingImage *img = surface->backingImage; + if (img == NULL) { + LOG("Encode: surface has no backing image"); + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + + /* + * The backing image contains CUarray(s) for each plane. + * NVENC needs a linear CUdeviceptr. 
We need to allocate a linear buffer,
+     * copy the CUarray contents into it, then register with NVENC.
+     *
+     * Use surface dimensions for the copy (the CUarray matches the surface).
+     * NVENC width/height may differ due to MB/CTU alignment.
+     */
+    uint32_t surfWidth = surface->width;
+    uint32_t surfHeight = surface->height;
+    uint32_t encWidth = nvencCtx->width;
+    uint32_t encHeight = nvencCtx->height;
+    NV_ENC_BUFFER_FORMAT encFmt = nvencCtx->inputFormat;
+
+    /* Calculate pitch and size for NV12/P010 linear buffer.
+     * Allocate for the full encode height (may be larger than surface due to alignment)
+     * but only copy surfHeight rows from the CUarray. */
+    uint32_t bytesPerPixel = (encFmt == NV_ENC_BUFFER_FORMAT_YUV420_10BIT) ? 2 : 1;
+    uint32_t pitch = encWidth * bytesPerPixel;
+    /* Align pitch to 256 bytes for NVENC */
+    pitch = (pitch + 255) & ~255;
+    uint32_t lumaSize = pitch * encHeight;
+    uint32_t chromaSize = pitch * (encHeight / 2);
+    uint32_t totalSize = lumaSize + chromaSize;
+
+    CUdeviceptr linearBuffer = 0;
+    CUresult cuRes = cu->cuMemAlloc(&linearBuffer, totalSize);
+    if (cuRes != CUDA_SUCCESS) {
+        LOG("Encode: failed to allocate linear buffer (%u bytes): %d", totalSize, cuRes);
+        CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL));
+        return VA_STATUS_ERROR_ALLOCATION_FAILED;
+    }
+
+    /* Zero the buffer so padded rows are clean */
+    cu->cuMemsetD8Async(linearBuffer, 0, totalSize, 0);
+
+    /* Copy luma plane from CUarray to linear buffer */
+    CUDA_MEMCPY2D copy = {0};
+    copy.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+    copy.srcArray = img->arrays[0];
+    copy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    copy.dstDevice = linearBuffer;
+    copy.dstPitch = pitch;
+    copy.WidthInBytes = surfWidth * bytesPerPixel;
+    copy.Height = surfHeight;
+
+    cuRes = cu->cuMemcpy2D(&copy);
+    if (cuRes != CUDA_SUCCESS) {
+        LOG("Encode: luma copy failed: %d (surface=%ux%u, pitch=%u)", cuRes, surfWidth, surfHeight, pitch);
+        cu->cuMemFree(linearBuffer);
+        CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL));
+        
return VA_STATUS_ERROR_OPERATION_FAILED;
+    }
+
+    /* Copy chroma plane (interleaved UV) */
+    memset(&copy, 0, sizeof(copy));
+    copy.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+    copy.srcArray = img->arrays[1];
+    copy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    copy.dstDevice = linearBuffer + lumaSize;
+    copy.dstPitch = pitch;
+    /* Chroma plane: each pixel has 2 channels (U,V) interleaved */
+    copy.WidthInBytes = surfWidth * bytesPerPixel;
+    copy.Height = surfHeight / 2;
+
+    cuRes = cu->cuMemcpy2D(&copy);
+    if (cuRes != CUDA_SUCCESS) {
+        LOG("Encode: chroma copy failed: %d", cuRes);
+        cu->cuMemFree(linearBuffer);
+        CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL));
+        return VA_STATUS_ERROR_OPERATION_FAILED;
+    }
+
+    /* Register the linear buffer with NVENC */
+    NV_ENC_REGISTERED_PTR registeredRes = NULL;
+    if (!nvenc_register_cuda_resource(nvencCtx, linearBuffer,
+                                      encWidth, encHeight, pitch,
+                                      encFmt, &registeredRes)) {
+        cu->cuMemFree(linearBuffer);
+        CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL));
+        return VA_STATUS_ERROR_OPERATION_FAILED;
+    }
+
+    /* Map the registered resource */
+    NV_ENC_INPUT_PTR mappedResource = NULL;
+    NV_ENC_BUFFER_FORMAT mappedFmt = encFmt;
+    if (!nvenc_map_resource(nvencCtx, registeredRes, &mappedResource, &mappedFmt)) {
+        nvenc_unregister_resource(nvencCtx, registeredRes);
+        cu->cuMemFree(linearBuffer);
+        CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL));
+        return VA_STATUS_ERROR_OPERATION_FAILED;
+    }
+
+    /* Encode the frame.
+     * Use only OUTPUT_SPSPPS on the first frame; after that let NVENC handle it. */
+    uint32_t picFlags = (nvencCtx->frameCount == 0 || nvencCtx->forceIDR)
+                        ? 
(NV_ENC_PIC_FLAG_OUTPUT_SPSPPS | NV_ENC_PIC_FLAG_FORCEIDR) + : 0; + nvencCtx->forceIDR = false; + int encResult = nvenc_encode_frame(nvencCtx, mappedResource, mappedFmt, + encWidth, encHeight, pitch, + nvencCtx->picType, picFlags); + + /* Unmap and unregister regardless of encode result */ + nvenc_unmap_resource(nvencCtx, mappedResource); + nvenc_unregister_resource(nvencCtx, registeredRes); + cu->cuMemFree(linearBuffer); + + if (encResult < 0) { + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_ENCODING_ERROR; + } + + /* Find the coded buffer */ + NVBuffer *codedBuf = (NVBuffer*) getObjectPtr(drv, OBJECT_TYPE_BUFFER, + nvencCtx->currentCodedBufId); + + if (encResult == 0) { + /* NVENC needs more input (B-frame reordering). Mark coded buffer as empty. */ + if (codedBuf != NULL && codedBuf->ptr != NULL) { + NVCodedBuffer *coded = (NVCodedBuffer*) codedBuf->ptr; + coded->bitstreamSize = 0; + coded->hasData = false; + } + LOG("Encode: frame %lu buffered (needs more input)", + (unsigned long)(nvencCtx->frameCount - 1)); + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); + return VA_STATUS_SUCCESS; + } + + /* Lock bitstream and copy into the coded buffer */ + void *bitstreamPtr = NULL; + uint32_t bitstreamSize = 0; + if (!nvenc_lock_bitstream(nvencCtx, &bitstreamPtr, &bitstreamSize)) { + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_ENCODING_ERROR; + } + + if (codedBuf != NULL && codedBuf->ptr != NULL) { + NVCodedBuffer *coded = (NVCodedBuffer*) codedBuf->ptr; + /* Grow the buffer if needed */ + if (bitstreamSize > coded->bitstreamAlloc) { + void *newBuf = realloc(coded->bitstreamData, bitstreamSize); + if (newBuf != NULL) { + coded->bitstreamData = newBuf; + coded->bitstreamAlloc = bitstreamSize; + } else { + LOG("Encode: failed to grow coded buffer to %u bytes", bitstreamSize); + nvenc_unlock_bitstream(nvencCtx); + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return 
VA_STATUS_ERROR_ALLOCATION_FAILED; + } + } + memcpy(coded->bitstreamData, bitstreamPtr, bitstreamSize); + coded->bitstreamSize = bitstreamSize; + coded->hasData = true; + LOG("Encode: frame %lu encoded, %u bytes", + (unsigned long)(nvencCtx->frameCount - 1), bitstreamSize); + } else { + LOG("Encode: WARNING - no coded buffer found for id %d", nvencCtx->currentCodedBufId); + } + + nvenc_unlock_bitstream(nvencCtx); + + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); + return VA_STATUS_SUCCESS; +} + +/* IPC encode path: send frame data to 64-bit helper, receive bitstream */ +static VAStatus nvEndPictureEncodeIPC(NVDriver *drv, NVContext *nvCtx) +{ + NVENCContext *nvencCtx = (NVENCContext*) nvCtx->encodeData; + NVSurface *surface = nvCtx->renderTarget; + + (void)drv; + + /* Connect to helper on first use */ + if (nvencCtx->ipcFd < 0) { + /* Try connecting to an already-running helper first, then start one */ + static const char *helper_paths[] = { + "/usr/libexec/nvenc-helper", + "/usr/local/libexec/nvenc-helper", + "/usr/lib/nvidia-vaapi-driver/nvenc-helper", + NULL + }; + nvencCtx->ipcFd = nvenc_ipc_connect(); + if (nvencCtx->ipcFd < 0) { + for (int pi = 0; helper_paths[pi] != NULL; pi++) { + if (access(helper_paths[pi], X_OK) == 0) { + LOG("IPC encode: starting helper: %s", helper_paths[pi]); + nvencCtx->ipcFd = nvenc_ipc_connect_or_start(helper_paths[pi]); + if (nvencCtx->ipcFd >= 0) break; + } + } + } + if (nvencCtx->ipcFd < 0) { + LOG("IPC encode: failed to connect to nvenc-helper (is it installed?)"); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + LOG("IPC encode: connected to nvenc-helper (fd=%d)", nvencCtx->ipcFd); + } + + /* Initialize encoder via IPC on first frame */ + if (!nvencCtx->initialized) { + bool isH264 = (nvCtx->profile == VAProfileH264ConstrainedBaseline || + nvCtx->profile == VAProfileH264Main || + nvCtx->profile == VAProfileH264High); + NVEncIPCInitParams params = { + .width = nvencCtx->width, + .height 
= nvencCtx->height,
+            .codec = isH264 ? 0 : 1,
+            .profile = (uint32_t)nvCtx->profile,
+            .frameRateNum = nvencCtx->frameRateNum,
+            .frameRateDen = nvencCtx->frameRateDen,
+            .bitrate = nvencCtx->bitrate,
+            .maxBitrate = nvencCtx->maxBitrate,
+            .gopLength = nvencCtx->intraPeriod,
+            .is10bit = (nvencCtx->inputFormat == NV_ENC_BUFFER_FORMAT_YUV420_10BIT) ? 1 : 0,
+        };
+
+        int shm_fd = -1;
+        uint32_t shm_size = 0;
+        if (nvenc_ipc_init(nvencCtx->ipcFd, &params, &shm_fd, &shm_size) != 0) {
+            LOG("IPC encode: init failed");
+            return VA_STATUS_ERROR_OPERATION_FAILED;
+        }
+        nvencCtx->initialized = true;
+
+        /* Map shared memory if the helper provided one */
+        if (shm_fd >= 0 && shm_size > 0) {
+            nvencCtx->shmPtr = mmap(NULL, shm_size, PROT_READ | PROT_WRITE,
+                                    MAP_SHARED, shm_fd, 0);
+            if (nvencCtx->shmPtr == MAP_FAILED) {
+                nvencCtx->shmPtr = NULL;
+                LOG("IPC encode: shm mmap failed, falling back to socket");
+            } else {
+                nvencCtx->shmSize = shm_size;
+                nvencCtx->shmFd = shm_fd;
+
+                /* Redirect the surface's hostPixelData to the SHM region.
+                 * This eliminates the memcpy in EndPicture — Steam writes
+                 * directly to shared memory via vaDeriveImage → vaMapBuffer.
+                 * The helper reads from the same physical pages. Zero copy. */
+                if (surface->hostPixelSize <= shm_size) {
+                    if (!surface->hostPixelIsShm) {
+                        free(surface->hostPixelData);
+                    }
+                    surface->hostPixelData = nvencCtx->shmPtr;
+                    surface->hostPixelSize = shm_size;
+                    surface->hostPixelIsShm = true;
+                    LOG("IPC encode: shm zero-copy enabled, %u bytes", shm_size);
+                } else {
+                    LOG("IPC encode: shm enabled (copy mode), %u bytes", shm_size);
+                }
+            }
+            close(shm_fd); /* mmap keeps the mapping alive after close */
+        }
+
+        LOG("IPC encode: encoder initialized %ux%u shm=%s",
+            params.width, params.height, nvencCtx->shmPtr ? "yes" : "no");
+    }
+
+    /* Encode via IPC.
+ * Priority: 1) Host pixel data from vaDeriveImage/vaPutImage (has actual captured pixels) + * 2) DRM-backed surface via NVIDIA opaque fds (GPU zero-copy) + * Host data takes priority because vaDeriveImage is how Steam writes captured + * frames — the GPU surface may exist but not contain the capture. */ + void *bitstream = NULL; + uint32_t bsSize = 0; + int ret; + int dmabuf_fds[4] = {-1, -1, -1, -1}; + int num_dmabuf_fds = 0; + NVEncIPCEncodeDmaBufParams dp = {0}; + bool useDmaBuf = false; + bool useHostData = false; + + /* Prefer host pixel data if available (written by vaDeriveImage → vaMapBuffer) */ + if (surface->hostPixelData != NULL && surface->hostPixelSize > 0) { + useHostData = true; + } else if (surface->backingImage != NULL && surface->backingImage->nvFds[0] > 0) { + /* DRM-backed surface: send per-plane NVIDIA opaque fds to helper. + * The helper imports each into CUDA (cuImportExternalMemory with + * OPAQUE_FD), maps to CUarray, copies to linear buffer, encodes. + * We dup() the fds because CUDA takes ownership on import. */ + BackingImage *img = surface->backingImage; + const NVFormatInfo *fmtInfo = &formatsInfo[img->format]; + dp.width = surface->width; + dp.height = surface->height; + dp.num_planes = fmtInfo->numPlanes; + dp.bppc = fmtInfo->bppc; + dp.is10bit = (nvencCtx->inputFormat == NV_ENC_BUFFER_FORMAT_YUV420_10BIT) ? 1 : 0; + for (uint32_t p = 0; p < fmtInfo->numPlanes && p < 4; p++) { + dmabuf_fds[p] = dup(img->nvFds[p]); + dp.pitches[p] = img->strides[p]; + dp.offsets[p] = 0; + dp.sizes[p] = img->memorySizes[p]; + } + num_dmabuf_fds = (int)fmtInfo->numPlanes; + useDmaBuf = true; + } + + if (useHostData) { + /* Host memory path: pixel data from vaDeriveImage/vaPutImage. + * IMPORTANT: use the SURFACE dimensions (e.g. 1920x1080), not the + * encoder dimensions (e.g. 1920x1088). */ + uint32_t surfW = surface->width; + uint32_t surfH = surface->height; + uint32_t frameSize = surface->hostPixelSize; + uint32_t forceIDR = nvencCtx->forceIDR ? 
1 : 0; + nvencCtx->forceIDR = false; + + if (nvencCtx->shmPtr != NULL && frameSize <= nvencCtx->shmSize) { + /* SHM path: if hostPixelData IS the shm (zero-copy), skip memcpy. + * Otherwise copy frame to shared memory. */ + if (surface->hostPixelData != nvencCtx->shmPtr) { + memcpy(nvencCtx->shmPtr, surface->hostPixelData, frameSize); + } + if (nvencCtx->frameCount < 3) { + LOG("IPC encode: SHM path %ux%u %u bytes", surfW, surfH, frameSize); + } + ret = nvenc_ipc_encode_shm(nvencCtx->ipcFd, surfW, surfH, + frameSize, forceIDR, + &bitstream, &bsSize); + } else { + /* Socket fallback: snapshot + full send */ + void *snapshot = malloc(frameSize); + if (snapshot == NULL) { + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + memcpy(snapshot, surface->hostPixelData, frameSize); + if (nvencCtx->frameCount < 3) { + LOG("IPC encode: SOCKET path %ux%u %u bytes", surfW, surfH, frameSize); + } + ret = nvenc_ipc_encode(nvencCtx->ipcFd, snapshot, + surfW, surfH, frameSize, forceIDR, + &bitstream, &bsSize); + free(snapshot); + } + } else if (useDmaBuf) { + if (nvencCtx->frameCount < 3) { + LOG("IPC encode: DMABUF planes=%d fds=[%d,%d] %ux%u pitch=%u sizes=[%u,%u]", + num_dmabuf_fds, dmabuf_fds[0], dmabuf_fds[1], + dp.width, dp.height, dp.pitches[0], dp.sizes[0], dp.sizes[1]); + } + ret = nvenc_ipc_encode_dmabuf(nvencCtx->ipcFd, dmabuf_fds, num_dmabuf_fds, + &dp, &bitstream, &bsSize); + } else { + LOG("IPC encode: surface has no pixel data (no DMA-BUF, no host data)"); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + + if (ret != 0) { + LOG("IPC encode: encode failed (ret=%d)", ret); + return VA_STATUS_ERROR_ENCODING_ERROR; + } + + /* Copy bitstream into coded buffer */ + NVBuffer *codedBuf = (NVBuffer*) getObjectPtr(drv, OBJECT_TYPE_BUFFER, + nvencCtx->currentCodedBufId); + if (codedBuf != NULL && codedBuf->ptr != NULL) { + NVCodedBuffer *coded = (NVCodedBuffer*) codedBuf->ptr; + if (bsSize > coded->bitstreamAlloc) { + void *newBuf = realloc(coded->bitstreamData, bsSize); + if 
(newBuf != NULL) { + coded->bitstreamData = newBuf; + coded->bitstreamAlloc = bsSize; + } else { + free(bitstream); + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + } + memcpy(coded->bitstreamData, bitstream, bsSize); + coded->bitstreamSize = bsSize; + coded->hasData = true; + if (nvencCtx->frameCount < 5 || nvencCtx->frameCount % 300 == 0) { + unsigned char *bs = (unsigned char *)coded->bitstreamData; + LOG("IPC encode: frame %lu, %u bytes, first4=[%02x %02x %02x %02x]", + (unsigned long)nvencCtx->frameCount, bsSize, + bsSize > 0 ? bs[0] : 0, bsSize > 1 ? bs[1] : 0, + bsSize > 2 ? bs[2] : 0, bsSize > 3 ? bs[3] : 0); + } + } + + free(bitstream); + nvencCtx->frameCount++; + + return VA_STATUS_SUCCESS; +} + static VAStatus nvEndPicture( VADriverContextP ctx, VAContextID context @@ -1397,7 +2254,16 @@ static VAStatus nvEndPicture( NVDriver *drv = (NVDriver*) ctx->pDriverData; NVContext *nvCtx = (NVContext*) getObjectPtr(drv, OBJECT_TYPE_CONTEXT, context); - if (nvCtx == NULL || nvCtx->decoder == NULL) { + if (nvCtx == NULL) { + return VA_STATUS_ERROR_INVALID_CONTEXT; + } + + /* Encode path */ + if (nvCtx->isEncode) { + return nvEndPictureEncode(drv, nvCtx); + } + + if (nvCtx->decoder == NULL) { return VA_STATUS_ERROR_INVALID_CONTEXT; } @@ -1453,6 +2319,11 @@ static VAStatus nvSyncSurface( return VA_STATUS_ERROR_INVALID_SURFACE; } + /* Encode is synchronous — EndPicture blocks until encode is done */ + if (surface->context != NULL && surface->context->isEncode) { + return VA_STATUS_SUCCESS; + } + //LOG("Syncing on surface: %d (%p)", surface->pictureIdx, surface); //wait for resolve to occur before synchronising @@ -1611,8 +2482,75 @@ static VAStatus nvDeriveImage( VAImage *image /* out */ ) { - //LOG("In %s", __func__); - //FAILED because we don't support it + NVDriver *drv = (NVDriver*) ctx->pDriverData; + NVSurface *surfaceObj = (NVSurface*) getObjectPtr(drv, OBJECT_TYPE_SURFACE, surface); + + if (surfaceObj == NULL) { + return VA_STATUS_ERROR_INVALID_SURFACE; + } + 
+ /* In IPC encode-only mode, derive a host-memory image so Steam's ffmpeg + * can write captured NV12 frames into it via vaMapBuffer. The encoder + * then reads from this host memory via the IPC pixel-data path. */ + if (!drv->cudaAvailable) { + uint32_t width = surfaceObj->width; + uint32_t height = surfaceObj->height; + int bpp = (surfaceObj->bitDepth > 8) ? 2 : 1; + uint32_t lumaSize = width * bpp * height; + uint32_t chromaSize = width * bpp * (height / 2); + uint32_t totalSize = lumaSize + chromaSize; + + /* Allocate or reuse the surface's host pixel buffer */ + if (surfaceObj->hostPixelData == NULL || surfaceObj->hostPixelSize < totalSize) { + free(surfaceObj->hostPixelData); + surfaceObj->hostPixelData = malloc(totalSize); + if (surfaceObj->hostPixelData == NULL) { + surfaceObj->hostPixelSize = 0; + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + surfaceObj->hostPixelSize = totalSize; + memset(surfaceObj->hostPixelData, 0, totalSize); + } + + /* Create a buffer object for the image data (points to the surface's host memory) */ + Object imageBufferObj = allocateObject(drv, OBJECT_TYPE_BUFFER, sizeof(NVBuffer)); + NVBuffer *imageBuf = (NVBuffer*) imageBufferObj->obj; + imageBuf->bufferType = VAImageBufferType; + imageBuf->size = totalSize; + imageBuf->elements = 1; + imageBuf->ptr = surfaceObj->hostPixelData; /* Shared with surface! */ + imageBuf->offset = (size_t)-1; /* Sentinel: don't free ptr on destroy */ + + /* Create the image object */ + Object imageObj = allocateObject(drv, OBJECT_TYPE_IMAGE, sizeof(NVImage)); + NVImage *img = (NVImage*) imageObj->obj; + img->width = width; + img->height = height; + img->format = (bpp == 1) ? NV_FORMAT_NV12 : NV_FORMAT_P010; + img->imageBuffer = imageBuf; + + /* Fill VAImage output */ + memset(image, 0, sizeof(*image)); + image->image_id = imageObj->id; + image->format.fourcc = (bpp == 1) ? VA_FOURCC_NV12 : VA_FOURCC_P010; + image->format.byte_order = VA_LSB_FIRST; + image->format.bits_per_pixel = (bpp == 1) ? 
12 : 24; + image->buf = imageBufferObj->id; + image->width = width; + image->height = height; + image->data_size = totalSize; + image->num_planes = 2; + image->pitches[0] = width * bpp; + image->pitches[1] = width * bpp; + image->offsets[0] = 0; + image->offsets[1] = lumaSize; + + LOG("DeriveImage: surface %d → host image %d (%ux%u, %u bytes)", + surface, imageObj->id, width, height, totalSize); + return VA_STATUS_SUCCESS; + } + + /* Normal CUDA path: not supported */ return VA_STATUS_ERROR_OPERATION_FAILED; } @@ -1631,7 +2569,10 @@ static VAStatus nvDestroyImage( Object imageBufferObj = getObjectByPtr(drv, OBJECT_TYPE_BUFFER, img->imageBuffer); if (imageBufferObj != NULL) { - if (img->imageBuffer->ptr != NULL) { + /* For derived images, the buffer ptr is shared with the surface's + * hostPixelData — don't free it (the surface owns the memory). + * For regular images (from vaCreateImage), we own the buffer. */ + if (img->imageBuffer->ptr != NULL && img->imageBuffer->offset != (size_t)-1) { free(img->imageBuffer->ptr); } @@ -1735,7 +2676,98 @@ static VAStatus nvPutImage( unsigned int dest_height ) { - LOG("In %s", __func__); + NVDriver *drv = (NVDriver*) ctx->pDriverData; + + NVSurface *surfaceObj = (NVSurface*) getObjectPtr(drv, OBJECT_TYPE_SURFACE, surface); + NVImage *imageObj = (NVImage*) getObjectPtr(drv, OBJECT_TYPE_IMAGE, image); + + if (surfaceObj == NULL) { + return VA_STATUS_ERROR_INVALID_SURFACE; + } + if (imageObj == NULL) { + return VA_STATUS_ERROR_INVALID_IMAGE; + } + + const NVFormatInfo *fmtInfo = &formatsInfo[imageObj->format]; + + /* Host-memory path: when CUDA is unavailable (IPC encode-only mode), + * store pixel data directly in the surface for later IPC transmission. 
*/ + if (!drv->cudaAvailable) { + uint32_t totalSize = imageObj->imageBuffer->size; + if (surfaceObj->hostPixelData == NULL || surfaceObj->hostPixelSize < totalSize) { + free(surfaceObj->hostPixelData); + surfaceObj->hostPixelData = malloc(totalSize); + if (surfaceObj->hostPixelData == NULL) { + surfaceObj->hostPixelSize = 0; + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + surfaceObj->hostPixelSize = totalSize; + } + memcpy(surfaceObj->hostPixelData, imageObj->imageBuffer->ptr, totalSize); + return VA_STATUS_SUCCESS; + } + + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); + + /* Ensure the surface has a backing image to write into */ + if (!drv->backend->realiseSurface(drv, surfaceObj)) { + LOG("PutImage: failed to realise surface"); + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + + BackingImage *backImg = surfaceObj->backingImage; + if (backImg == NULL) { + LOG("PutImage: no backing image"); + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + + /* Copy each plane from host memory (image buffer) to GPU (CUarray). + * Apply source/destination offsets per the VA-API spec. */ + uint32_t copyWidth = src_width > 0 ? src_width : imageObj->width; + uint32_t copyHeight = src_height > 0 ? src_height : imageObj->height; + uint32_t imgWidth = imageObj->width; + uint32_t imgHeight = imageObj->height; + uint32_t imgPlaneOffset = 0; + + for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) { + const NVFormatPlane *p = &fmtInfo->plane[i]; + /* Subsampled offsets and dimensions */ + uint32_t planeSrcX = (uint32_t)((src_x > 0 ? src_x : 0)) >> p->ss.x; + uint32_t planeSrcY = (uint32_t)((src_y > 0 ? src_y : 0)) >> p->ss.y; + uint32_t planeDstX = (uint32_t)((dest_x > 0 ? dest_x : 0)) >> p->ss.x; + uint32_t planeDstY = (uint32_t)((dest_y > 0 ? 
dest_y : 0)) >> p->ss.y; + uint32_t planeCopyW = copyWidth >> p->ss.x; + uint32_t planeCopyH = copyHeight >> p->ss.y; + uint32_t imgPlanePitch = imgWidth * fmtInfo->bppc; + + CUDA_MEMCPY2D memcpy2d = { + .srcXInBytes = planeSrcX * fmtInfo->bppc * p->channelCount, + .srcY = planeSrcY, + .srcMemoryType = CU_MEMORYTYPE_HOST, + .srcHost = (char*)imageObj->imageBuffer->ptr + imgPlaneOffset, + .srcPitch = imgPlanePitch, + + .dstXInBytes = planeDstX * fmtInfo->bppc * p->channelCount, + .dstY = planeDstY, + .dstMemoryType = CU_MEMORYTYPE_ARRAY, + .dstArray = backImg->arrays[i], + + .WidthInBytes = planeCopyW * fmtInfo->bppc * p->channelCount, + .Height = planeCopyH, + }; + + CUresult result = cu->cuMemcpy2D(&memcpy2d); + if (result != CUDA_SUCCESS) { + LOG("PutImage: cuMemcpy2D failed for plane %u: %d", i, result); + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + imgPlaneOffset += ((imgWidth * imgHeight) >> (p->ss.x + p->ss.y)) * fmtInfo->bppc * p->channelCount; + } + + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); return VA_STATUS_SUCCESS; } @@ -1882,6 +2914,41 @@ static VAStatus nvQuerySurfaceAttributes( return VA_STATUS_ERROR_INVALID_CONFIG; } + /* Encode config surface attributes — GStreamer needs min/max dimensions */ + if (cfg->isEncode) { + int cnt = 5; + if (num_attribs != NULL) { + *num_attribs = cnt; + } + if (attrib_list != NULL) { + attrib_list[0].type = VASurfaceAttribMinWidth; + attrib_list[0].flags = VA_SURFACE_ATTRIB_GETTABLE; + attrib_list[0].value.type = VAGenericValueTypeInteger; + attrib_list[0].value.value.i = 16; + + attrib_list[1].type = VASurfaceAttribMinHeight; + attrib_list[1].flags = VA_SURFACE_ATTRIB_GETTABLE; + attrib_list[1].value.type = VAGenericValueTypeInteger; + attrib_list[1].value.value.i = 16; + + attrib_list[2].type = VASurfaceAttribMaxWidth; + attrib_list[2].flags = VA_SURFACE_ATTRIB_GETTABLE; + attrib_list[2].value.type = 
VAGenericValueTypeInteger; + attrib_list[2].value.value.i = 4096; + + attrib_list[3].type = VASurfaceAttribMaxHeight; + attrib_list[3].flags = VA_SURFACE_ATTRIB_GETTABLE; + attrib_list[3].value.type = VAGenericValueTypeInteger; + attrib_list[3].value.value.i = 4096; + + attrib_list[4].type = VASurfaceAttribPixelFormat; + attrib_list[4].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE; + attrib_list[4].value.type = VAGenericValueTypeInteger; + attrib_list[4].value.value.i = (cfg->bitDepth > 8) ? VA_FOURCC_P010 : VA_FOURCC_NV12; + } + return VA_STATUS_SUCCESS; + } + //LOG("with %d (%d) %p %d", cfg->cudaCodec, cfg->bitDepth, attrib_list, *num_attribs); if (cfg->chromaFormat != cudaVideoChromaFormat_420 && cfg->chromaFormat != cudaVideoChromaFormat_444) { @@ -2152,9 +3219,11 @@ static VAStatus nvExportSurfaceHandle( return VA_STATUS_ERROR_INVALID_SURFACE; } - //LOG("Exporting surface: %d (%p)", surface->pictureIdx, surface); + LOG("Exporting surface: %d (%p)", surface->pictureIdx, surface); - CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); + if (drv->cudaAvailable) { + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); + } if (!drv->backend->realiseSurface(drv, surface)) { LOG("Unable to export surface"); @@ -2170,7 +3239,9 @@ static VAStatus nvExportSurfaceHandle( // ptr->layers[1].offset[0], ptr->layers[1].pitch[0], // ptr->objects[1].drm_format_modifier); - CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); + if (drv->cudaAvailable) { + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); + } return VA_STATUS_SUCCESS; } @@ -2180,23 +3251,32 @@ static VAStatus nvTerminate( VADriverContextP ctx ) NVDriver *drv = (NVDriver*) ctx->pDriverData; LOG("Terminating %p", ctx); - CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); - - 
drv->backend->destroyAllBackingImage(drv); - - deleteAllObjects(drv); + if (drv->cudaAvailable) { + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); - drv->backend->releaseExporter(drv); + drv->backend->destroyAllBackingImage(drv); + deleteAllObjects(drv); + drv->backend->releaseExporter(drv); - CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); + } else { + deleteAllObjects(drv); + /* Release the DRM backend if it was initialized for IPC mode */ + if (drv->backend != NULL) { + drv->backend->destroyAllBackingImage(drv); + drv->backend->releaseExporter(drv); + } + } pthread_mutex_lock(&concurrency_mutex); instances--; LOG("Now have %d (%d max) instances", instances, max_instances); pthread_mutex_unlock(&concurrency_mutex); - CHECK_CUDA_RESULT_RETURN(cu->cuCtxDestroy(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); - drv->cudaContext = NULL; + if (drv->cudaAvailable && drv->cudaContext != NULL) { + CHECK_CUDA_RESULT_RETURN(cu->cuCtxDestroy(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); + drv->cudaContext = NULL; + } free(drv); @@ -2299,7 +3379,8 @@ VAStatus __vaDriverInit_1_0(VADriverContextP ctx) { pthread_mutex_unlock(&concurrency_mutex); //check to make sure we initialised the CUDA functions correctly - if (cu == NULL || cv == NULL) { + //If CUDA loaded but cuInit failed, we can still do encode-only via IPC + if (cu == NULL) { return VA_STATUS_ERROR_OPERATION_FAILED; } @@ -2308,6 +3389,9 @@ VAStatus __vaDriverInit_1_0(VADriverContextP ctx) { drv->cu = cu; drv->cv = cv; + drv->nv = nv; + drv->nvencAvailable = (nv != NULL); + drv->cudaAvailable = cudaInitSuccess; drv->useCorrectNV12Format = true; drv->cudaGpuId = gpu; //make sure that we want the default GPU, and that a DRM fd that we care about is passed in @@ -2322,16 +3406,24 @@ VAStatus __vaDriverInit_1_0(VADriverContextP ctx) 
{ } ctx->max_profiles = MAX_PROFILES; - ctx->max_entrypoints = 1; + ctx->max_entrypoints = 2; ctx->max_attributes = 1; ctx->max_display_attributes = 1; ctx->max_image_formats = ARRAY_SIZE(formatsInfo) - 1; ctx->max_subpic_formats = 1; - if (backend == DIRECT) { - ctx->str_vendor = "VA-API NVDEC driver [direct backend]"; - } else if (backend == EGL) { - ctx->str_vendor = "VA-API NVDEC driver [egl backend]"; + if (drv->cudaAvailable) { + if (backend == DIRECT) { + ctx->str_vendor = drv->nvencAvailable + ? "VA-API NVDEC/NVENC driver [direct backend]" + : "VA-API NVDEC driver [direct backend]"; + } else if (backend == EGL) { + ctx->str_vendor = drv->nvencAvailable + ? "VA-API NVDEC/NVENC driver [egl backend]" + : "VA-API NVDEC driver [egl backend]"; + } + } else { + ctx->str_vendor = "VA-API NVENC driver [IPC encode-only]"; } pthread_mutexattr_t attrib; @@ -2341,21 +3433,43 @@ VAStatus __vaDriverInit_1_0(VADriverContextP ctx) { pthread_mutex_init(&drv->imagesMutex, &attrib); pthread_mutex_init(&drv->exportMutex, NULL); - if (!drv->backend->initExporter(drv)) { - LOG("Exporter failed"); - free(drv); - return VA_STATUS_ERROR_OPERATION_FAILED; - } + if (drv->cudaAvailable) { + /* Full CUDA path: init exporter and create CUDA context */ + if (!drv->backend->initExporter(drv)) { + LOG("Exporter failed"); + free(drv); + return VA_STATUS_ERROR_OPERATION_FAILED; + } - if (CHECK_CUDA_RESULT(cu->cuCtxCreate(&drv->cudaContext, CU_CTX_SCHED_BLOCKING_SYNC, drv->cudaGpuId))) { - drv->backend->releaseExporter(drv); - free(drv); - return VA_STATUS_ERROR_OPERATION_FAILED; - } + if (CHECK_CUDA_RESULT(cu->cuCtxCreate(&drv->cudaContext, CU_CTX_SCHED_BLOCKING_SYNC, drv->cudaGpuId))) { + drv->backend->releaseExporter(drv); + free(drv); + return VA_STATUS_ERROR_OPERATION_FAILED; + } - //CHECK_CUDA_RESULT_RETURN(cv->cuvidCtxLockCreate(&drv->vidLock, drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); + nvQueryConfigProfiles2(ctx, drv->profiles, &drv->profileCount); + } else { + /* 
Encode-only IPC path: no CUDA context, no decode profiles. + * Init the direct backend for GPU surface allocation via DRM. + * This lets Steam render into our surfaces via OpenGL/EGL, + * and we send the DMA-BUF fds to the 64-bit helper for encoding. */ + LOG("CUDA unavailable — encode-only mode, init DRM backend for surfaces"); + drv->cudaContext = NULL; + + if (backend == DIRECT && drv->backend->initExporter(drv)) { + LOG("DRM backend initialized for surface allocation"); + } else { + LOG("DRM backend init failed — surfaces will have no GPU backing"); + } - nvQueryConfigProfiles2(ctx, drv->profiles, &drv->profileCount); + int p = 0; + drv->profiles[p++] = VAProfileH264ConstrainedBaseline; + drv->profiles[p++] = VAProfileH264Main; + drv->profiles[p++] = VAProfileH264High; + drv->profiles[p++] = VAProfileHEVCMain; + drv->profiles[p++] = VAProfileHEVCMain10; + drv->profileCount = p; + } *ctx->vtable = vtable; return VA_STATUS_SUCCESS; diff --git a/src/vabackend.h b/src/vabackend.h index 672c489f..df6b4412 100644 --- a/src/vabackend.h +++ b/src/vabackend.h @@ -2,6 +2,7 @@ #define VABACKEND_H #include +#include #include #include #include @@ -69,6 +70,16 @@ typedef struct pthread_mutex_t mutex; pthread_cond_t cond; bool decodeFailed; + /* Host-memory pixel buffer for encode-only IPC path (no CUDA) */ + void *hostPixelData; + uint32_t hostPixelSize; + bool hostPixelIsShm; /* true if hostPixelData points to SHM (don't free) */ + /* Imported DMA-BUF for IPC encode (fd from Steam's GPU capture) */ + int importedDmaBufFd; + uint32_t importedPitches[4]; + uint32_t importedOffsets[4]; + uint32_t importedNumPlanes; + uint32_t importedDataSize; } NVSurface; typedef enum @@ -110,6 +121,9 @@ typedef struct _BackingImage { //direct backend only NVCudaImage cudaImages[3]; NVFormat format; + /* NVIDIA opaque fds for CUDA import (IPC encode path) */ + int nvFds[4]; + uint32_t memorySizes[4]; } BackingImage; struct _NVDriver; @@ -129,6 +143,7 @@ typedef struct _NVDriver { 
CudaFunctions *cu; CuvidFunctions *cv; + NvencFunctions *nv; CUcontext cudaContext; CUvideoctxlock vidLock; Array/**/ objects; @@ -154,6 +169,8 @@ typedef struct _NVDriver int numFramesPresented; int profileCount; VAProfile profiles[MAX_PROFILES]; + bool nvencAvailable; + bool cudaAvailable; /* false when 32-bit CUDA fails */ } NVDriver; struct _NVCodec; @@ -185,6 +202,8 @@ typedef struct _NVContext pthread_mutex_t surfaceCreationMutex; int surfaceCount; bool firstKeyframeValid; + bool isEncode; + void *encodeData; /* NVENCContext* for encode contexts */ } NVContext; typedef struct @@ -195,6 +214,7 @@ typedef struct cudaVideoChromaFormat chromaFormat; int bitDepth; cudaVideoCodec cudaCodec; + bool isEncode; } NVConfig; typedef void (*HandlerFunc)(NVContext*, NVBuffer* , CUVIDPICPARAMS*); diff --git a/tests/test_common.h b/tests/test_common.h new file mode 100644 index 00000000..1648adde --- /dev/null +++ b/tests/test_common.h @@ -0,0 +1,138 @@ +/* + * test_common.h — Shared test utilities for nvidia-vaapi-driver tests. + * Inspired by Intel's i965 test infrastructure. 
+ */ + +#ifndef TEST_COMMON_H +#define TEST_COMMON_H + +#define _POSIX_C_SOURCE 199309L + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRM_DEVICE "/dev/dri/renderD128" + +/* Test counters */ +static int g_pass = 0; +static int g_fail = 0; +static int g_skip = 0; + +/* Colors */ +#define C_GREEN "\033[32m" +#define C_RED "\033[31m" +#define C_YELLOW "\033[33m" +#define C_RESET "\033[0m" + +/* Test macros */ +#define TEST_START(name) \ + printf(" %-55s ", name); fflush(stdout); + +#define TEST_PASS() do { \ + printf(C_GREEN "PASS" C_RESET "\n"); g_pass++; \ +} while (0) + +#define TEST_FAIL(reason) do { \ + printf(C_RED "FAIL" C_RESET " (%s)\n", reason); g_fail++; \ +} while (0) + +#define TEST_SKIP(reason) do { \ + printf(C_YELLOW "SKIP" C_RESET " (%s)\n", reason); g_skip++; \ +} while (0) + +/* Assert that aborts current test function on failure */ +#define EXPECT_STATUS(st) do { \ + if ((st) != VA_STATUS_SUCCESS) { \ + char _msg[64]; snprintf(_msg, sizeof(_msg), "VA status %d", (st)); \ + TEST_FAIL(_msg); return; \ + } \ +} while (0) + +#define EXPECT_STATUS_EQ(expect, st) do { \ + VAStatus _s = (st); \ + if (_s != (expect)) { \ + char _msg[64]; snprintf(_msg, sizeof(_msg), \ + "expected status %d, got %d", (expect), _s); \ + TEST_FAIL(_msg); return; \ + } \ +} while (0) + +#define EXPECT_TRUE(cond, reason) do { \ + if (!(cond)) { TEST_FAIL(reason); return; } \ +} while (0) + +#define EXPECT_NOT_NULL(ptr, reason) do { \ + if ((ptr) == NULL) { TEST_FAIL(reason); return; } \ +} while (0) + +/* Timer for performance measurement */ +typedef struct { + struct timespec start; + struct timespec end; +} TestTimer; + +static inline void timer_start(TestTimer *t) { + clock_gettime(CLOCK_MONOTONIC, &t->start); +} + +static inline double timer_stop_ms(TestTimer *t) { + clock_gettime(CLOCK_MONOTONIC, &t->end); + return (t->end.tv_sec - t->start.tv_sec) * 1000.0 + + (t->end.tv_nsec - t->start.tv_nsec) / 1000000.0; +} + +/* 
Global VA display setup */ +static VADisplay g_dpy; +static int g_drm_fd; + +static void test_global_setup(void) { + g_drm_fd = open(DRM_DEVICE, O_RDWR); + if (g_drm_fd < 0) { + fprintf(stderr, "Cannot open %s\n", DRM_DEVICE); + exit(1); + } + g_dpy = vaGetDisplayDRM(g_drm_fd); + if (!g_dpy) { + fprintf(stderr, "vaGetDisplayDRM failed\n"); + exit(1); + } + int major, minor; + VAStatus st = vaInitialize(g_dpy, &major, &minor); + if (st != VA_STATUS_SUCCESS) { + fprintf(stderr, "vaInitialize failed: %d\n", st); + exit(1); + } +} + +static void test_global_teardown(void) { + vaTerminate(g_dpy); + close(g_drm_fd); +} + +static void test_print_summary(const char *suite_name) { + printf("\n=== %s: %d passed, %d failed, %d skipped ===\n\n", + suite_name, g_pass, g_fail, g_skip); +} + +/* Check if a profile+entrypoint combination is supported */ +static bool test_has_entrypoint(VADisplay dpy, VAProfile profile, VAEntrypoint ep) { + int ne = vaMaxNumEntrypoints(dpy); + VAEntrypoint *eps = calloc(ne, sizeof(VAEntrypoint)); + int n = 0; + vaQueryConfigEntrypoints(dpy, profile, eps, &n); + bool found = false; + for (int i = 0; i < n; i++) { + if (eps[i] == ep) { found = true; break; } + } + free(eps); + return found; +} + +#endif /* TEST_COMMON_H */ diff --git a/tests/test_encode.c b/tests/test_encode.c new file mode 100644 index 00000000..454b3b6c --- /dev/null +++ b/tests/test_encode.c @@ -0,0 +1,488 @@ +/* + * test_encode.c — Encode path integration tests for nvidia-vaapi-driver. 
+ * + * Build: + * gcc -o test_encode test_encode.c -lva -lva-drm -lm + * + * Run: + * ./test_encode # all tests + * ./test_encode h264 # H.264 tests only + * ./test_encode hevc # HEVC tests only + * + * Exit code: 0 = all pass, 1 = failure + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRM_DEVICE "/dev/dri/renderD128" + +static int pass_count = 0; +static int fail_count = 0; + +#define TEST_START(name) \ + printf(" %-50s ", name); fflush(stdout); + +#define TEST_PASS() do { \ + printf("\033[32mPASS\033[0m\n"); pass_count++; \ +} while (0) + +#define TEST_FAIL(reason) do { \ + printf("\033[31mFAIL\033[0m (%s)\n", reason); fail_count++; \ +} while (0) + +#define TEST_ASSERT(cond, reason) do { \ + if (!(cond)) { TEST_FAIL(reason); return; } \ +} while (0) + +static VADisplay dpy; +static int drm_fd; + +static void setup(void) +{ + drm_fd = open(DRM_DEVICE, O_RDWR); + if (drm_fd < 0) { + fprintf(stderr, "Cannot open %s\n", DRM_DEVICE); + exit(1); + } + dpy = vaGetDisplayDRM(drm_fd); + if (!dpy) { + fprintf(stderr, "vaGetDisplayDRM failed\n"); + exit(1); + } + int major, minor; + VAStatus st = vaInitialize(dpy, &major, &minor); + if (st != VA_STATUS_SUCCESS) { + fprintf(stderr, "vaInitialize failed: %d\n", st); + exit(1); + } +} + +static void teardown(void) +{ + vaTerminate(dpy); + close(drm_fd); +} + +/* --- Test: Entrypoints --- */ + +static void test_entrypoints_h264(void) +{ + TEST_START("H.264 EncSlice entrypoint exists"); + int ne = vaMaxNumEntrypoints(dpy); + VAEntrypoint *eps = calloc(ne, sizeof(VAEntrypoint)); + int n = 0; + vaQueryConfigEntrypoints(dpy, VAProfileH264High, eps, &n); + bool found = false; + for (int i = 0; i < n; i++) { + if (eps[i] == VAEntrypointEncSlice) found = true; + } + free(eps); + TEST_ASSERT(found, "VAEntrypointEncSlice not found for H264High"); + TEST_PASS(); +} + +static void test_entrypoints_hevc(void) +{ + TEST_START("HEVC EncSlice entrypoint 
exists"); + int ne = vaMaxNumEntrypoints(dpy); + VAEntrypoint *eps = calloc(ne, sizeof(VAEntrypoint)); + int n = 0; + vaQueryConfigEntrypoints(dpy, VAProfileHEVCMain, eps, &n); + bool found = false; + for (int i = 0; i < n; i++) { + if (eps[i] == VAEntrypointEncSlice) found = true; + } + free(eps); + TEST_ASSERT(found, "VAEntrypointEncSlice not found for HEVCMain"); + TEST_PASS(); +} + +/* --- Test: Config attributes --- */ + +static void test_config_attributes(void) +{ + TEST_START("Encode config attributes (RTFormat, RateControl)"); + VAConfigAttrib attribs[3] = { + { .type = VAConfigAttribRTFormat }, + { .type = VAConfigAttribRateControl }, + { .type = VAConfigAttribEncMaxRefFrames }, + }; + VAStatus st = vaGetConfigAttributes(dpy, VAProfileH264High, + VAEntrypointEncSlice, attribs, 3); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaGetConfigAttributes failed"); + TEST_ASSERT(attribs[0].value & VA_RT_FORMAT_YUV420, "no YUV420 RTFormat"); + TEST_ASSERT(attribs[1].value & VA_RC_CQP, "no CQP rate control"); + TEST_ASSERT(attribs[1].value & VA_RC_CBR, "no CBR rate control"); + TEST_ASSERT(attribs[1].value & VA_RC_VBR, "no VBR rate control"); + TEST_PASS(); +} + +/* --- Test: Create/destroy config+surfaces+context --- */ + +static void test_create_destroy(void) +{ + TEST_START("Create and destroy encode config/surfaces/context"); + + VAConfigAttrib attrib = { .type = VAConfigAttribRTFormat, + .value = VA_RT_FORMAT_YUV420 }; + VAConfigID config; + VAStatus st = vaCreateConfig(dpy, VAProfileH264High, + VAEntrypointEncSlice, &attrib, 1, &config); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaCreateConfig failed"); + + VASurfaceID surfaces[4]; + st = vaCreateSurfaces(dpy, VA_RT_FORMAT_YUV420, 320, 240, + surfaces, 4, NULL, 0); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaCreateSurfaces failed"); + + VAContextID context; + st = vaCreateContext(dpy, config, 320, 240, VA_PROGRESSIVE, + surfaces, 4, &context); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaCreateContext failed"); + + st = 
vaDestroyContext(dpy, context); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaDestroyContext failed"); + st = vaDestroySurfaces(dpy, surfaces, 4); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaDestroySurfaces failed"); + st = vaDestroyConfig(dpy, config); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaDestroyConfig failed"); + TEST_PASS(); +} + +/* --- Test: Full encode cycle (1 frame) --- */ + +static void test_encode_one_frame(VAProfile profile, const char *codec_name) +{ + char name[64]; + snprintf(name, sizeof(name), "%s encode 1 frame (320x240)", codec_name); + TEST_START(name); + + VAConfigAttrib attrib = { .type = VAConfigAttribRTFormat, + .value = VA_RT_FORMAT_YUV420 }; + VAConfigID config; + VAStatus st = vaCreateConfig(dpy, profile, VAEntrypointEncSlice, + &attrib, 1, &config); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "config"); + + VASurfaceID surface; + st = vaCreateSurfaces(dpy, VA_RT_FORMAT_YUV420, 320, 240, + &surface, 1, NULL, 0); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "surface"); + + VAContextID context; + st = vaCreateContext(dpy, config, 320, 240, VA_PROGRESSIVE, + &surface, 1, &context); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "context"); + + /* Coded buffer */ + VABufferID coded_buf; + st = vaCreateBuffer(dpy, context, VAEncCodedBufferType, 320 * 240, + 1, NULL, &coded_buf); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "coded_buf"); + + /* Create NV12 image and fill with gray */ + VAImageFormat fmt = { .fourcc = VA_FOURCC_NV12 }; + VAImage image; + st = vaCreateImage(dpy, &fmt, 320, 240, &image); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "image"); + void *img_data; + vaMapBuffer(dpy, image.buf, &img_data); + memset(img_data, 128, image.data_size); + vaUnmapBuffer(dpy, image.buf); + vaPutImage(dpy, surface, image.image_id, 0, 0, 320, 240, 0, 0, 320, 240); + + /* Sequence params */ + VABufferID seq_buf; + if (profile == VAProfileH264High || profile == VAProfileH264Main || + profile == VAProfileH264ConstrainedBaseline) { + VAEncSequenceParameterBufferH264 seq = { + 
.picture_width_in_mbs = 320 / 16, + .picture_height_in_mbs = 240 / 16, + .intra_period = 30, .ip_period = 1, + }; + vaCreateBuffer(dpy, context, VAEncSequenceParameterBufferType, + sizeof(seq), 1, &seq, &seq_buf); + } else { + VAEncSequenceParameterBufferHEVC seq = { + .pic_width_in_luma_samples = 320, + .pic_height_in_luma_samples = 240, + .intra_period = 30, .ip_period = 1, + }; + vaCreateBuffer(dpy, context, VAEncSequenceParameterBufferType, + sizeof(seq), 1, &seq, &seq_buf); + } + + /* Picture params */ + VABufferID pic_buf; + if (profile == VAProfileH264High || profile == VAProfileH264Main || + profile == VAProfileH264ConstrainedBaseline) { + VAEncPictureParameterBufferH264 pic = { + .coded_buf = coded_buf, + .pic_fields.bits.idr_pic_flag = 1, + }; + vaCreateBuffer(dpy, context, VAEncPictureParameterBufferType, + sizeof(pic), 1, &pic, &pic_buf); + } else { + VAEncPictureParameterBufferHEVC pic = { + .coded_buf = coded_buf, + .pic_fields.bits.idr_pic_flag = 1, + }; + vaCreateBuffer(dpy, context, VAEncPictureParameterBufferType, + sizeof(pic), 1, &pic, &pic_buf); + } + + /* Slice params */ + VABufferID slice_buf; + if (profile == VAProfileH264High || profile == VAProfileH264Main || + profile == VAProfileH264ConstrainedBaseline) { + VAEncSliceParameterBufferH264 slice = { .slice_type = 2 }; + vaCreateBuffer(dpy, context, VAEncSliceParameterBufferType, + sizeof(slice), 1, &slice, &slice_buf); + } else { + VAEncSliceParameterBufferHEVC slice = { .slice_type = 2 }; + vaCreateBuffer(dpy, context, VAEncSliceParameterBufferType, + sizeof(slice), 1, &slice, &slice_buf); + } + + /* Encode */ + st = vaBeginPicture(dpy, context, surface); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaBeginPicture"); + VABufferID bufs[] = { seq_buf, pic_buf, slice_buf }; + st = vaRenderPicture(dpy, context, bufs, 3); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaRenderPicture"); + st = vaEndPicture(dpy, context); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaEndPicture"); + + st = vaSyncSurface(dpy, 
surface); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaSyncSurface"); + + /* Map coded buffer and check output */ + VACodedBufferSegment *seg = NULL; + st = vaMapBuffer(dpy, coded_buf, (void **)&seg); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaMapBuffer"); + TEST_ASSERT(seg != NULL, "coded segment is NULL"); + TEST_ASSERT(seg->buf != NULL, "coded data is NULL"); + TEST_ASSERT(seg->size > 0, "coded size is 0"); + + /* Check for valid NAL start code */ + unsigned char *bs = (unsigned char *)seg->buf; + bool has_start_code = (bs[0] == 0 && bs[1] == 0 && bs[2] == 0 && bs[3] == 1); + TEST_ASSERT(has_start_code, "no NAL start code 00 00 00 01"); + + vaUnmapBuffer(dpy, coded_buf); + + /* Cleanup */ + vaDestroyBuffer(dpy, coded_buf); + vaDestroyBuffer(dpy, seq_buf); + vaDestroyBuffer(dpy, pic_buf); + vaDestroyBuffer(dpy, slice_buf); + vaDestroyImage(dpy, image.image_id); + vaDestroyContext(dpy, context); + vaDestroySurfaces(dpy, &surface, 1); + vaDestroyConfig(dpy, config); + TEST_PASS(); +} + +/* --- Test: Sequential encodes (leak check) --- */ + +static void test_sequential_encodes(void) +{ + TEST_START("10 sequential H.264 encodes (leak check)"); + + for (int run = 0; run < 10; run++) { + VAConfigAttrib attrib = { .type = VAConfigAttribRTFormat, + .value = VA_RT_FORMAT_YUV420 }; + VAConfigID config; + vaCreateConfig(dpy, VAProfileH264High, VAEntrypointEncSlice, + &attrib, 1, &config); + VASurfaceID surface; + vaCreateSurfaces(dpy, VA_RT_FORMAT_YUV420, 320, 240, &surface, 1, NULL, 0); + VAContextID context; + vaCreateContext(dpy, config, 320, 240, VA_PROGRESSIVE, &surface, 1, &context); + VABufferID coded; + vaCreateBuffer(dpy, context, VAEncCodedBufferType, 320 * 240, 1, NULL, &coded); + + VAEncSequenceParameterBufferH264 seq = { + .picture_width_in_mbs = 20, .picture_height_in_mbs = 15, + .intra_period = 30, .ip_period = 1, + }; + VAEncPictureParameterBufferH264 pic = { + .coded_buf = coded, .pic_fields.bits.idr_pic_flag = 1, + }; + VAEncSliceParameterBufferH264 slice = { 
.slice_type = 2 }; + VABufferID bufs[3]; + vaCreateBuffer(dpy, context, VAEncSequenceParameterBufferType, + sizeof(seq), 1, &seq, &bufs[0]); + vaCreateBuffer(dpy, context, VAEncPictureParameterBufferType, + sizeof(pic), 1, &pic, &bufs[1]); + vaCreateBuffer(dpy, context, VAEncSliceParameterBufferType, + sizeof(slice), 1, &slice, &bufs[2]); + + vaBeginPicture(dpy, context, surface); + vaRenderPicture(dpy, context, bufs, 3); + VAStatus st = vaEndPicture(dpy, context); + if (st != VA_STATUS_SUCCESS) { + TEST_FAIL("vaEndPicture failed in sequential run"); + return; + } + + vaDestroyBuffer(dpy, coded); + vaDestroyBuffer(dpy, bufs[0]); + vaDestroyBuffer(dpy, bufs[1]); + vaDestroyBuffer(dpy, bufs[2]); + vaDestroyContext(dpy, context); + vaDestroySurfaces(dpy, &surface, 1); + vaDestroyConfig(dpy, config); + } + TEST_PASS(); +} + +/* --- Test: Coded buffer reuse across frames --- */ + +static void test_coded_buffer_reuse(void) +{ + TEST_START("Coded buffer reuse across 5 frames"); + + VAConfigAttrib attrib = { .type = VAConfigAttribRTFormat, + .value = VA_RT_FORMAT_YUV420 }; + VAConfigID config; + vaCreateConfig(dpy, VAProfileH264High, VAEntrypointEncSlice, + &attrib, 1, &config); + VASurfaceID surface; + vaCreateSurfaces(dpy, VA_RT_FORMAT_YUV420, 320, 240, &surface, 1, NULL, 0); + VAContextID context; + vaCreateContext(dpy, config, 320, 240, VA_PROGRESSIVE, &surface, 1, &context); + VABufferID coded; + vaCreateBuffer(dpy, context, VAEncCodedBufferType, 320 * 240, 1, NULL, &coded); + + for (int frame = 0; frame < 5; frame++) { + VAEncSequenceParameterBufferH264 seq = { + .picture_width_in_mbs = 20, .picture_height_in_mbs = 15, + .intra_period = 30, .ip_period = 1, + }; + VAEncPictureParameterBufferH264 pic = { + .coded_buf = coded, + .pic_fields.bits.idr_pic_flag = (frame == 0) ? 1 : 0, + }; + VAEncSliceParameterBufferH264 slice = { + .slice_type = (frame == 0) ? 
2 : 0, + }; + VABufferID bufs[3]; + vaCreateBuffer(dpy, context, VAEncSequenceParameterBufferType, + sizeof(seq), 1, &seq, &bufs[0]); + vaCreateBuffer(dpy, context, VAEncPictureParameterBufferType, + sizeof(pic), 1, &pic, &bufs[1]); + vaCreateBuffer(dpy, context, VAEncSliceParameterBufferType, + sizeof(slice), 1, &slice, &bufs[2]); + + vaBeginPicture(dpy, context, surface); + vaRenderPicture(dpy, context, bufs, 3); + VAStatus st = vaEndPicture(dpy, context); + if (st != VA_STATUS_SUCCESS) { + TEST_FAIL("vaEndPicture failed"); + goto cleanup; + } + + VACodedBufferSegment *seg; + vaMapBuffer(dpy, coded, (void **)&seg); + if (!seg || !seg->buf || seg->size == 0) { + TEST_FAIL("empty coded buffer"); + vaUnmapBuffer(dpy, coded); + goto cleanup; + } + vaUnmapBuffer(dpy, coded); + + vaDestroyBuffer(dpy, bufs[0]); + vaDestroyBuffer(dpy, bufs[1]); + vaDestroyBuffer(dpy, bufs[2]); + } + TEST_PASS(); + +cleanup: + vaDestroyBuffer(dpy, coded); + vaDestroyContext(dpy, context); + vaDestroySurfaces(dpy, &surface, 1); + vaDestroyConfig(dpy, config); +} + +/* --- Test: Decode regression --- */ + +static void test_decode_still_works(void) +{ + TEST_START("Decode entrypoints still present (VLD)"); + int ne = vaMaxNumEntrypoints(dpy); + VAEntrypoint *eps = calloc(ne, sizeof(VAEntrypoint)); + int n = 0; + vaQueryConfigEntrypoints(dpy, VAProfileH264High, eps, &n); + bool found_vld = false; + bool found_enc = false; + for (int i = 0; i < n; i++) { + if (eps[i] == VAEntrypointVLD) found_vld = true; + if (eps[i] == VAEntrypointEncSlice) found_enc = true; + } + free(eps); + TEST_ASSERT(found_vld, "VAEntrypointVLD missing"); + TEST_ASSERT(found_enc, "VAEntrypointEncSlice missing"); + TEST_PASS(); +} + +/* --- Main --- */ + +int main(int argc, char **argv) +{ + bool run_h264 = true, run_hevc = true; + if (argc > 1) { + if (strcmp(argv[1], "h264") == 0) run_hevc = false; + else if (strcmp(argv[1], "hevc") == 0) run_h264 = false; + } + + setup(); + + printf("\n=== nvidia-vaapi-driver encode 
tests ===\n"); + printf("Driver: %s\n\n", vaQueryVendorString(dpy)); + + printf("Entrypoints:\n"); + test_entrypoints_h264(); + test_entrypoints_hevc(); + + printf("\nConfig:\n"); + test_config_attributes(); + + printf("\nLifecycle:\n"); + test_create_destroy(); + + if (run_h264) { + printf("\nH.264 Encode:\n"); + test_encode_one_frame(VAProfileH264High, "H.264 High"); + test_encode_one_frame(VAProfileH264Main, "H.264 Main"); + test_encode_one_frame(VAProfileH264ConstrainedBaseline, "H.264 CB"); + } + + if (run_hevc) { + printf("\nHEVC Encode:\n"); + test_encode_one_frame(VAProfileHEVCMain, "HEVC Main"); + } + + printf("\nStress:\n"); + test_sequential_encodes(); + test_coded_buffer_reuse(); + + printf("\nRegression:\n"); + test_decode_still_works(); + + printf("\n=== Results: %d passed, %d failed ===\n\n", + pass_count, fail_count); + + teardown(); + return fail_count > 0 ? 1 : 0; +} diff --git a/tests/test_encode_config.c b/tests/test_encode_config.c new file mode 100644 index 00000000..1780d5fb --- /dev/null +++ b/tests/test_encode_config.c @@ -0,0 +1,259 @@ +/* + * test_encode_config.c — Config and capability tests. + * Tests profile/entrypoint validation, config attributes, and error paths. 
+ * + * Build: gcc -o test_encode_config tests/test_encode_config.c -lva -lva-drm + * Run: ./test_encode_config + */ + +#include "test_common.h" + +/* --- Profile/Entrypoint matrix --- */ + +typedef struct { + VAProfile profile; + const char *name; + bool expect_encode; + bool expect_decode; +} ProfileTest; + +static const ProfileTest profile_tests[] = { + { VAProfileH264ConstrainedBaseline, "H264 CB", true, true }, + { VAProfileH264Main, "H264 Main", true, true }, + { VAProfileH264High, "H264 High", true, true }, + { VAProfileHEVCMain, "HEVC Main", true, true }, + { VAProfileHEVCMain10, "HEVC M10", true, true }, + { VAProfileMPEG2Simple, "MPEG2", false, true }, + { VAProfileVP9Profile0, "VP9 P0", false, false }, /* VP9 requires gstreamer-codecparsers */ + { VAProfileAV1Profile0, "AV1 P0", false, true }, + { VAProfileJPEGBaseline, "JPEG", false, true }, +}; +#define NUM_PROFILE_TESTS (sizeof(profile_tests) / sizeof(profile_tests[0])) + +static void test_encode_entrypoints(void) { + for (int i = 0; i < (int)NUM_PROFILE_TESTS; i++) { + char name[64]; + snprintf(name, sizeof(name), "EncSlice for %-10s → %s", + profile_tests[i].name, + profile_tests[i].expect_encode ? "present" : "absent"); + TEST_START(name); + + bool has = test_has_entrypoint(g_dpy, profile_tests[i].profile, + VAEntrypointEncSlice); + if (has == profile_tests[i].expect_encode) { + TEST_PASS(); + } else { + TEST_FAIL(has ? "unexpected EncSlice" : "missing EncSlice"); + } + } +} + +static void test_decode_entrypoints(void) { + for (int i = 0; i < (int)NUM_PROFILE_TESTS; i++) { + char name[64]; + snprintf(name, sizeof(name), "VLD for %-10s → %s", + profile_tests[i].name, + profile_tests[i].expect_decode ? "present" : "absent"); + TEST_START(name); + + bool has = test_has_entrypoint(g_dpy, profile_tests[i].profile, + VAEntrypointVLD); + if (has == profile_tests[i].expect_decode) { + TEST_PASS(); + } else { + TEST_FAIL(has ? 
"unexpected VLD" : "missing VLD"); + } + } +} + +/* --- Config attribute validation --- */ + +static void test_config_rtformat(void) { + TEST_START("H264 High RTFormat includes YUV420"); + VAConfigAttrib a = { .type = VAConfigAttribRTFormat }; + EXPECT_STATUS(vaGetConfigAttributes(g_dpy, VAProfileH264High, + VAEntrypointEncSlice, &a, 1)); + EXPECT_TRUE(a.value & VA_RT_FORMAT_YUV420, "no YUV420"); + TEST_PASS(); +} + +static void test_config_ratecontrol(void) { + TEST_START("Rate control: CQP + CBR + VBR supported"); + VAConfigAttrib a = { .type = VAConfigAttribRateControl }; + EXPECT_STATUS(vaGetConfigAttributes(g_dpy, VAProfileH264High, + VAEntrypointEncSlice, &a, 1)); + EXPECT_TRUE(a.value & VA_RC_CQP, "no CQP"); + EXPECT_TRUE(a.value & VA_RC_CBR, "no CBR"); + EXPECT_TRUE(a.value & VA_RC_VBR, "no VBR"); + TEST_PASS(); +} + +static void test_config_packed_headers(void) { + TEST_START("Packed headers: SEQ + PIC advertised"); + VAConfigAttrib a = { .type = VAConfigAttribEncPackedHeaders }; + EXPECT_STATUS(vaGetConfigAttributes(g_dpy, VAProfileH264High, + VAEntrypointEncSlice, &a, 1)); + EXPECT_TRUE(a.value & VA_ENC_PACKED_HEADER_SEQUENCE, "no SEQ"); + EXPECT_TRUE(a.value & VA_ENC_PACKED_HEADER_PICTURE, "no PIC"); + TEST_PASS(); +} + +static void test_config_max_ref_frames(void) { + TEST_START("Max ref frames reported"); + VAConfigAttrib a = { .type = VAConfigAttribEncMaxRefFrames }; + EXPECT_STATUS(vaGetConfigAttributes(g_dpy, VAProfileH264High, + VAEntrypointEncSlice, &a, 1)); + EXPECT_TRUE(a.value != VA_ATTRIB_NOT_SUPPORTED, "not supported"); + EXPECT_TRUE((a.value & 0xffff) >= 1, "L0 refs < 1"); + TEST_PASS(); +} + +static void test_config_quality_range(void) { + TEST_START("Quality range attribute reported"); + VAConfigAttrib a = { .type = VAConfigAttribEncQualityRange }; + EXPECT_STATUS(vaGetConfigAttributes(g_dpy, VAProfileH264High, + VAEntrypointEncSlice, &a, 1)); + EXPECT_TRUE(a.value != VA_ATTRIB_NOT_SUPPORTED, "not supported"); + EXPECT_TRUE(a.value >= 1, 
"quality range < 1"); + TEST_PASS(); +} + +/* --- Error path tests --- */ + +static void test_invalid_entrypoint(void) { + TEST_START("vaCreateConfig with invalid entrypoint → error"); + VAConfigAttrib a = { .type = VAConfigAttribRTFormat, .value = VA_RT_FORMAT_YUV420 }; + VAConfigID config; + /* Use a valid profile but wrong entrypoint type (0xFF) */ + VAStatus st = vaCreateConfig(g_dpy, VAProfileH264High, (VAEntrypoint)0xFF, + &a, 1, &config); + EXPECT_TRUE(st != VA_STATUS_SUCCESS, "should fail for invalid entrypoint"); + TEST_PASS(); +} + +static void test_encode_on_decode_only_profile(void) { + TEST_START("vaCreateConfig encode on MPEG2 (decode-only) → error"); + VAConfigAttrib a = { .type = VAConfigAttribRTFormat, .value = VA_RT_FORMAT_YUV420 }; + VAConfigID config; + VAStatus st = vaCreateConfig(g_dpy, VAProfileMPEG2Simple, + VAEntrypointEncSlice, &a, 1, &config); + EXPECT_TRUE(st != VA_STATUS_SUCCESS, "should fail for decode-only profile"); + TEST_PASS(); +} + +static void test_create_config_all_encode_profiles(void) { + VAProfile profiles[] = { + VAProfileH264ConstrainedBaseline, VAProfileH264Main, VAProfileH264High, + VAProfileHEVCMain, VAProfileHEVCMain10, + }; + for (int i = 0; i < 5; i++) { + char name[64]; + snprintf(name, sizeof(name), "vaCreateConfig for encode profile %d", profiles[i]); + TEST_START(name); + + VAConfigAttrib a = { .type = VAConfigAttribRTFormat, .value = VA_RT_FORMAT_YUV420 }; + VAConfigID config; + VAStatus st = vaCreateConfig(g_dpy, profiles[i], VAEntrypointEncSlice, + &a, 1, &config); + EXPECT_STATUS(st); + st = vaDestroyConfig(g_dpy, config); + EXPECT_STATUS(st); + TEST_PASS(); + } +} + +/* --- Surface creation tests --- */ + +static void test_surface_nv12(void) { + TEST_START("Create NV12 surface 1920x1080"); + VASurfaceID surface; + VAStatus st = vaCreateSurfaces(g_dpy, VA_RT_FORMAT_YUV420, 1920, 1080, + &surface, 1, NULL, 0); + EXPECT_STATUS(st); + vaDestroySurfaces(g_dpy, &surface, 1); + TEST_PASS(); +} + +static void 
test_surface_p010(void) { + TEST_START("Create P010 surface 1920x1080 (10-bit)"); + VASurfaceID surface; + VAStatus st = vaCreateSurfaces(g_dpy, VA_RT_FORMAT_YUV420_10, 1920, 1080, + &surface, 1, NULL, 0); + if (st != VA_STATUS_SUCCESS) { + TEST_SKIP("10-bit surfaces not supported"); + return; + } + vaDestroySurfaces(g_dpy, &surface, 1); + TEST_PASS(); +} + +static void test_surface_multiple(void) { + TEST_START("Create 16 surfaces simultaneously"); + VASurfaceID surfaces[16]; + VAStatus st = vaCreateSurfaces(g_dpy, VA_RT_FORMAT_YUV420, 640, 480, + surfaces, 16, NULL, 0); + EXPECT_STATUS(st); + vaDestroySurfaces(g_dpy, surfaces, 16); + TEST_PASS(); +} + +static void test_surface_small(void) { + TEST_START("Create tiny surface 16x16"); + VASurfaceID surface; + VAStatus st = vaCreateSurfaces(g_dpy, VA_RT_FORMAT_YUV420, 16, 16, + &surface, 1, NULL, 0); + EXPECT_STATUS(st); + vaDestroySurfaces(g_dpy, &surface, 1); + TEST_PASS(); +} + +static void test_surface_4k(void) { + TEST_START("Create 4K surface 3840x2160"); + VASurfaceID surface; + VAStatus st = vaCreateSurfaces(g_dpy, VA_RT_FORMAT_YUV420, 3840, 2160, + &surface, 1, NULL, 0); + EXPECT_STATUS(st); + vaDestroySurfaces(g_dpy, &surface, 1); + TEST_PASS(); +} + +/* --- Main --- */ + +int main(void) +{ + test_global_setup(); + + printf("\n=== nvidia-vaapi-driver config & capability tests ===\n"); + printf("Driver: %s\n\n", vaQueryVendorString(g_dpy)); + + printf("Encode entrypoints:\n"); + test_encode_entrypoints(); + + printf("\nDecode entrypoints:\n"); + test_decode_entrypoints(); + + printf("\nConfig attributes:\n"); + test_config_rtformat(); + test_config_ratecontrol(); + test_config_packed_headers(); + test_config_max_ref_frames(); + test_config_quality_range(); + + printf("\nError paths:\n"); + test_invalid_entrypoint(); + test_encode_on_decode_only_profile(); + + printf("\nConfig creation:\n"); + test_create_config_all_encode_profiles(); + + printf("\nSurface creation:\n"); + test_surface_nv12(); + 
test_surface_p010(); + test_surface_multiple(); + test_surface_small(); + test_surface_4k(); + + test_print_summary("Config tests"); + test_global_teardown(); + return g_fail > 0 ? 1 : 0; +} diff --git a/tests/test_gstreamer.sh b/tests/test_gstreamer.sh new file mode 100755 index 00000000..58834620 --- /dev/null +++ b/tests/test_gstreamer.sh @@ -0,0 +1,217 @@ +#!/bin/bash +# +# test_gstreamer.sh — GStreamer VA-API encode integration tests +# +# Requires: gstreamer1-vaapi (Fedora) or gstreamer1.0-vaapi (Ubuntu) +# +# Exit code: 0 = all pass, 1 = failure + +set -u + +export GST_VAAPI_ALL_DRIVERS=1 +export LIBVA_DRIVER_NAME=nvidia + +PASS=0 +FAIL=0 +SKIP=0 +TMPDIR=$(mktemp -d) +trap 'rm -rf "$TMPDIR"' EXIT + +pass() { printf " %-55s \033[32mPASS\033[0m\n" "$1"; PASS=$((PASS+1)); } +fail() { printf " %-55s \033[31mFAIL\033[0m (%s)\n" "$1" "$2"; FAIL=$((FAIL+1)); } +skip() { printf " %-55s \033[33mSKIP\033[0m (%s)\n" "$1" "$2"; SKIP=$((SKIP+1)); } + +has_element() { gst-inspect-1.0 "$1" >/dev/null 2>&1; } + +echo "" +echo "=== nvidia-vaapi-driver GStreamer tests ===" +echo "" + +# --- Check prerequisites --- + +echo "Prerequisites:" + +if ! has_element vaapih264enc; then + skip "vaapih264enc available" "gstreamer-vaapi not installed" + echo "" + echo "=== Results: $PASS passed, $FAIL failed, $SKIP skipped ===" + exit 1 +fi +pass "vaapih264enc available" + +if ! has_element vaapih265enc; then + skip "vaapih265enc available" "element not found" +else + pass "vaapih265enc available" +fi + +# --- H.264 encode tests --- + +echo "" +echo "H.264 Encode:" + +# Basic encode to fakesink +if gst-launch-1.0 -e videotestsrc num-buffers=30 \ + ! video/x-raw,width=320,height=240,framerate=30/1 \ + ! vaapih264enc ! h264parse ! fakesink 2>&1 | grep -q "EOS"; then + pass "H.264 320x240 30 frames → fakesink" +else + fail "H.264 320x240 30 frames → fakesink" "pipeline error" +fi + +# Encode to file and validate +OUT="$TMPDIR/h264.mp4" +if gst-launch-1.0 -e videotestsrc num-buffers=60 \ + ! 
video/x-raw,width=1920,height=1080,framerate=30/1 \ + ! vaapih264enc bitrate=5000 ! h264parse \ + ! mp4mux ! filesink location="$OUT" 2>&1 | grep -q "EOS"; then + SIZE=$(stat -c%s "$OUT" 2>/dev/null || echo 0) + if [ "$SIZE" -gt 1000 ]; then + pass "H.264 1080p 60 frames → mp4 (${SIZE} bytes)" + else + fail "H.264 1080p 60 frames → mp4" "file too small: ${SIZE} bytes" + fi +else + fail "H.264 1080p 60 frames → mp4" "pipeline error" +fi + +# CBR bitrate control +OUT="$TMPDIR/h264_cbr.mp4" +if gst-launch-1.0 -e videotestsrc num-buffers=90 \ + ! video/x-raw,width=1280,height=720,framerate=30/1 \ + ! vaapih264enc rate-control=cbr bitrate=2000 ! h264parse \ + ! mp4mux ! filesink location="$OUT" 2>&1 | grep -q "EOS"; then + SIZE=$(stat -c%s "$OUT" 2>/dev/null || echo 0) + if [ "$SIZE" -gt 1000 ]; then + pass "H.264 720p CBR 2Mbps 90 frames" + else + fail "H.264 720p CBR 2Mbps 90 frames" "file too small" + fi +else + fail "H.264 720p CBR 2Mbps 90 frames" "pipeline error" +fi + +# Small resolution (GStreamer vaapi requires ~256x256 minimum) +if gst-launch-1.0 -e videotestsrc num-buffers=10 \ + ! video/x-raw,width=256,height=256,framerate=30/1 \ + ! vaapih264enc ! h264parse ! fakesink 2>&1 | grep -q "EOS"; then + pass "H.264 256x256 small resolution" +else + fail "H.264 256x256 small resolution" "pipeline error" +fi + +# 4K resolution +if gst-launch-1.0 -e videotestsrc num-buffers=5 \ + ! video/x-raw,width=3840,height=2160,framerate=30/1 \ + ! vaapih264enc ! h264parse ! fakesink 2>&1 | grep -q "EOS"; then + pass "H.264 4K 5 frames" +else + fail "H.264 4K 5 frames" "pipeline error" +fi + +# --- HEVC encode tests --- + +echo "" +echo "HEVC Encode:" + +if has_element vaapih265enc; then + # Basic encode + if gst-launch-1.0 -e videotestsrc num-buffers=30 \ + ! video/x-raw,width=320,height=240,framerate=30/1 \ + ! vaapih265enc ! h265parse ! 
fakesink 2>&1 | grep -q "EOS"; then + pass "HEVC 320x240 30 frames → fakesink" + else + fail "HEVC 320x240 30 frames → fakesink" "pipeline error" + fi + + # Encode to file + OUT="$TMPDIR/hevc.mp4" + if gst-launch-1.0 -e videotestsrc num-buffers=60 \ + ! video/x-raw,width=1920,height=1080,framerate=30/1 \ + ! vaapih265enc bitrate=5000 ! h265parse \ + ! mp4mux ! filesink location="$OUT" 2>&1 | grep -q "EOS"; then + SIZE=$(stat -c%s "$OUT" 2>/dev/null || echo 0) + if [ "$SIZE" -gt 1000 ]; then + pass "HEVC 1080p 60 frames → mp4 (${SIZE} bytes)" + else + fail "HEVC 1080p 60 frames → mp4" "file too small: ${SIZE} bytes" + fi + else + fail "HEVC 1080p 60 frames → mp4" "pipeline error" + fi + + # 4K + if gst-launch-1.0 -e videotestsrc num-buffers=5 \ + ! video/x-raw,width=3840,height=2160,framerate=30/1 \ + ! vaapih265enc ! h265parse ! fakesink 2>&1 | grep -q "EOS"; then + pass "HEVC 4K 5 frames" + else + fail "HEVC 4K 5 frames" "pipeline error" + fi +else + skip "HEVC tests" "vaapih265enc not available" +fi + +# --- Decode regression --- + +echo "" +echo "Decode regression:" + +if has_element vaapih264dec; then + pass "vaapih264dec still available" +else + fail "vaapih264dec still available" "element missing" +fi + +if has_element vaapih265dec; then + pass "vaapih265dec still available" +else + fail "vaapih265dec still available" "element missing" +fi + +# Decode an encoded file (round-trip) +if [ -f "$TMPDIR/h264.mp4" ]; then + if gst-launch-1.0 -e filesrc location="$TMPDIR/h264.mp4" \ + ! qtdemux ! h264parse ! vaapih264dec ! fakesink 2>&1 | grep -q "EOS"; then + pass "H.264 encode → decode round-trip" + else + fail "H.264 encode → decode round-trip" "decode pipeline error" + fi +fi + +# --- Stress --- + +echo "" +echo "Stress:" + +# Sequential pipeline restarts (leak check) +ALL_OK=1 +for i in $(seq 1 10); do + if ! gst-launch-1.0 -e videotestsrc num-buffers=10 \ + ! video/x-raw,width=320,height=240,framerate=30/1 \ + ! vaapih264enc ! 
fakesink 2>&1 | grep -q "EOS"; then
+        ALL_OK=0
+        break
+    fi
+done
+if [ "$ALL_OK" = "1" ]; then
+    pass "10 sequential H.264 pipeline restarts"
+else
+    fail "10 sequential H.264 pipeline restarts" "failed at iteration $i"
+fi
+
+# Long encode (300 frames)
+if gst-launch-1.0 -e videotestsrc num-buffers=300 \
+    ! video/x-raw,width=1920,height=1080,framerate=60/1 \
+    ! vaapih264enc bitrate=8000 ! h264parse ! fakesink 2>&1 | grep -q "EOS"; then
+    pass "H.264 1080p60 300 frames sustained"
+else
+    fail "H.264 1080p60 300 frames sustained" "pipeline error"
+fi
+
+# --- Summary ---
+
+echo ""
+echo "=== Results: $PASS passed, $FAIL failed, $SKIP skipped ==="
+echo ""
+exit $FAIL
diff --git a/tests/test_ipc_fuzz.c b/tests/test_ipc_fuzz.c
new file mode 100644
index 00000000..c579f201
--- /dev/null
+++ b/tests/test_ipc_fuzz.c
@@ -0,0 +1,204 @@
+/*
+ * test_ipc_fuzz.c — Fuzz the nvenc-helper IPC protocol with malformed messages.
+ * Tests robustness against corrupt/malicious data from the socket.
+ *
+ * Build: gcc -o test_ipc_fuzz tests/test_ipc_fuzz.c src/nvenc-ipc-client.c -lm
+ * Run:   ./test_ipc_fuzz   (nvenc-helper must be running)
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "../src/nvenc-ipc.h"
+
+static int g_pass = 0, g_fail = 0;
+#define C_GREEN "\033[32m"
+#define C_RED   "\033[31m"
+#define C_RESET "\033[0m"
+#define TEST_START(n) printf("  %-55s ", n); fflush(stdout);
+#define TEST_PASS() do { printf(C_GREEN "PASS" C_RESET "\n"); g_pass++; } while(0)
+#define TEST_FAIL(r) do { printf(C_RED "FAIL" C_RESET " (%s)\n", r); g_fail++; } while(0)
+#define EXPECT_TRUE(c, r) do { if(!(c)) { TEST_FAIL(r); return; } } while(0)
+
+static bool send_raw(int fd, const void *buf, size_t len) {
+    const char *p = buf;
+    while (len > 0) {
+        ssize_t n = send(fd, p, len, MSG_NOSIGNAL);
+        if (n <= 0) return false;
+        p += n;
+        len -= (size_t)n;
+    }
+    return true;
+}
+
+static int connect_helper(void) {
+    char 
path[256]; + nvenc_ipc_get_socket_path(path, sizeof(path)); + int fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) return -1; + struct sockaddr_un addr = {0}; + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1); + if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + close(fd); + return -1; + } + return fd; +} + +static void test_invalid_command(void) { + TEST_START("Invalid command ID (0xFF)"); + int fd = connect_helper(); + EXPECT_TRUE(fd >= 0, "can't connect to helper"); + NVEncIPCMsgHeader hdr = { .cmd = 0xFF, .payload_size = 0 }; + send_raw(fd, &hdr, sizeof(hdr)); + NVEncIPCRespHeader resp = {0}; + recv(fd, &resp, sizeof(resp), 0); + EXPECT_TRUE(resp.status != 0, "should reject unknown command"); + close(fd); + TEST_PASS(); +} + +static void test_zero_payload(void) { + TEST_START("CMD_INIT with zero payload"); + int fd = connect_helper(); + EXPECT_TRUE(fd >= 0, "can't connect"); + NVEncIPCMsgHeader hdr = { .cmd = NVENC_IPC_CMD_INIT, .payload_size = 0 }; + send_raw(fd, &hdr, sizeof(hdr)); + NVEncIPCRespHeader resp = {0}; + recv(fd, &resp, sizeof(resp), 0); + EXPECT_TRUE(resp.status != 0, "should reject zero-size init"); + close(fd); + TEST_PASS(); +} + +static void test_truncated_init(void) { + TEST_START("CMD_INIT with truncated payload (5 bytes)"); + int fd = connect_helper(); + EXPECT_TRUE(fd >= 0, "can't connect"); + NVEncIPCMsgHeader hdr = { .cmd = NVENC_IPC_CMD_INIT, .payload_size = sizeof(NVEncIPCInitParams) }; + send_raw(fd, &hdr, sizeof(hdr)); + char partial[5] = {1, 2, 3, 4, 5}; + send_raw(fd, partial, sizeof(partial)); + close(fd); //disconnect mid-message + TEST_PASS(); //helper should not crash +} + +static void test_huge_payload_size(void) { + TEST_START("CMD_ENCODE with payload_size=0xFFFFFFFF"); + int fd = connect_helper(); + EXPECT_TRUE(fd >= 0, "can't connect"); + //first init a valid encoder + NVEncIPCMsgHeader ihdr = { .cmd = NVENC_IPC_CMD_INIT, .payload_size = sizeof(NVEncIPCInitParams) }; + 
NVEncIPCInitParams params = { .width = 320, .height = 240, .codec = 0,
+                                  .frameRateNum = 30, .frameRateDen = 1 };
+    send_raw(fd, &ihdr, sizeof(ihdr));
+    send_raw(fd, &params, sizeof(params));
+    //drain init response (may include shm fd)
+    char drain[256];
+    recv(fd, drain, sizeof(drain), 0);
+
+    //now send encode with huge size
+    NVEncIPCMsgHeader hdr = { .cmd = NVENC_IPC_CMD_ENCODE, .payload_size = 0xFFFFFFFF };
+    send_raw(fd, &hdr, sizeof(hdr));
+    close(fd);
+    TEST_PASS(); //helper should not malloc 4GB and crash
+}
+
+static void test_encode_without_init(void) {
+    TEST_START("CMD_ENCODE_SHM without prior CMD_INIT");
+    int fd = connect_helper();
+    EXPECT_TRUE(fd >= 0, "can't connect");
+    NVEncIPCMsgHeader hdr = { .cmd = NVENC_IPC_CMD_ENCODE_SHM,
+                              .payload_size = sizeof(NVEncIPCEncodeShmParams) };
+    NVEncIPCEncodeShmParams sp = { .width = 320, .height = 240, .frame_size = 115200 };
+    send_raw(fd, &hdr, sizeof(hdr));
+    send_raw(fd, &sp, sizeof(sp));
+    NVEncIPCRespHeader resp = {0};
+    recv(fd, &resp, sizeof(resp), 0);
+    EXPECT_TRUE(resp.status != 0, "should reject encode without init");
+    close(fd);
+    TEST_PASS();
+}
+
+static void test_rapid_connect_disconnect(void) {
+    TEST_START("50 rapid connect/disconnect cycles");
+    for (int i = 0; i < 50; i++) {
+        int fd = connect_helper();
+        if (fd >= 0) close(fd);
+    }
+    //verify helper still alive
+    int fd = connect_helper();
+    EXPECT_TRUE(fd >= 0, "helper died after rapid cycles");
+    close(fd);
+    TEST_PASS();
+}
+
+static void test_close_without_init(void) {
+    TEST_START("CMD_CLOSE without prior CMD_INIT");
+    int fd = connect_helper();
+    EXPECT_TRUE(fd >= 0, "can't connect");
+    NVEncIPCMsgHeader hdr = { .cmd = NVENC_IPC_CMD_CLOSE, .payload_size = 0 };
+    send_raw(fd, &hdr, sizeof(hdr));
+    NVEncIPCRespHeader resp = {0};
+    recv(fd, &resp, sizeof(resp), 0);
+    EXPECT_TRUE(resp.status == 0, "close should succeed even without init");
+    close(fd);
+    TEST_PASS();
+}
+
+static void test_double_init(void) {
+    TEST_START("Two CMD_INIT in 
a row (re-init)");
+    int fd = connect_helper();
+    EXPECT_TRUE(fd >= 0, "can't connect");
+    NVEncIPCInitParams params = { .width = 320, .height = 240, .codec = 0,
+                                  .frameRateNum = 30, .frameRateDen = 1 };
+
+    for (int i = 0; i < 2; i++) {
+        NVEncIPCMsgHeader hdr = { .cmd = NVENC_IPC_CMD_INIT, .payload_size = sizeof(params) };
+        send_raw(fd, &hdr, sizeof(hdr));
+        send_raw(fd, &params, sizeof(params));
+        char drain[256];
+        recv(fd, drain, sizeof(drain), 0);
+    }
+    //clean close
+    NVEncIPCMsgHeader chdr = { .cmd = NVENC_IPC_CMD_CLOSE, .payload_size = 0 };
+    send_raw(fd, &chdr, sizeof(chdr));
+    char drain[64];
+    recv(fd, drain, sizeof(drain), 0);
+    close(fd);
+    TEST_PASS();
+}
+
+int main(void) {
+    signal(SIGPIPE, SIG_IGN);
+
+    printf("\n=== nvenc-helper IPC fuzz tests ===\n\n");
+
+    //check helper is running
+    int fd = connect_helper();
+    if (fd < 0) {
+        printf("ERROR: nvenc-helper not running\n");
+        return 1;
+    }
+    close(fd);
+
+    test_invalid_command();
+    test_zero_payload();
+    test_truncated_init();
+    test_huge_payload_size();
+    test_encode_without_init();
+    test_rapid_connect_disconnect();
+    test_close_without_init();
+    test_double_init();
+
+    printf("\n=== Results: %d passed, %d failed ===\n\n", g_pass, g_fail);
+    return g_fail > 0 ? 1 : 0;
+}