diff --git a/.gitignore b/.gitignore index 80ad06ff..c19231fd 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ va-api-nvidia.files va-api-nvidia.includes meson.build.user .idea +pr_summary.md diff --git a/cross-i386.txt b/cross-i386.txt new file mode 100644 index 00000000..c7c4f2bd --- /dev/null +++ b/cross-i386.txt @@ -0,0 +1,22 @@ +[binaries] +c = 'gcc' +cpp = 'g++' +ar = 'ar' +strip = 'strip' +pkg-config = 'pkg-config' + +[built-in options] +c_args = ['-m32'] +c_link_args = ['-m32'] +cpp_args = ['-m32'] +cpp_link_args = ['-m32'] + +[properties] +pkg_config_libdir = ['/usr/lib/i386-linux-gnu/pkgconfig', '/usr/share/pkgconfig', '/usr/lib/pkgconfig'] +sys_root = '/' + +[host_machine] +system = 'linux' +cpu_family = 'x86' +cpu = 'i686' +endian = 'little' diff --git a/docs/install-fedora.md b/docs/install-fedora.md new file mode 100644 index 00000000..9089f86d --- /dev/null +++ b/docs/install-fedora.md @@ -0,0 +1,141 @@ +# Installation on Fedora + +Tested on Fedora 43 with NVIDIA driver 580.126.18 (RPM Fusion). + +## Prerequisites + +NVIDIA proprietary driver installed via RPM Fusion (`akmod-nvidia`). + +Verify: +```bash +nvidia-smi --query-gpu=driver_version --format=csv,noheader +``` + +## Step 1 — Install build dependencies (64-bit) + +```bash +sudo dnf install -y \ + meson ninja-build gcc pkg-config \ + libva-devel libdrm-devel mesa-libEGL-devel nv-codec-headers \ + libva-utils +``` + +## Step 2 — Install build dependencies (32-bit, for Steam) + +```bash +sudo dnf install -y \ + glibc-devel.i686 \ + libva-devel.i686 libdrm-devel.i686 mesa-libEGL-devel.i686 +``` + +## Step 3 — Remove stock libva-nvidia-driver + +If you have the Fedora-packaged version (v0.0.16, decode-only), remove it first: + +```bash +sudo dnf remove -y libva-nvidia-driver +``` + +## Step 4 — Build 64-bit + +```bash +meson setup build64 . 
--wipe --prefix=/usr +meson compile -C build64 +``` + +## Step 5 — Build 32-bit (cross-compile) + +Fedora uses `/usr/lib/pkgconfig` for 32-bit `.pc` files (not `/usr/lib/i386-linux-gnu/`). +Create a cross-file: + +```bash +cat > cross-i386-fedora.txt << 'EOF' +[binaries] +c = 'gcc' +cpp = 'g++' +ar = 'ar' +strip = 'strip' +pkg-config = 'pkg-config' + +[built-in options] +c_args = ['-m32'] +c_link_args = ['-m32'] +cpp_args = ['-m32'] +cpp_link_args = ['-m32'] + +[properties] +pkg_config_libdir = ['/usr/lib/pkgconfig', '/usr/share/pkgconfig'] +sys_root = '/' + +[host_machine] +system = 'linux' +cpu_family = 'x86' +cpu = 'i686' +endian = 'little' +EOF +``` + +Then build: + +```bash +meson setup build32 . --wipe --cross-file cross-i386-fedora.txt +meson compile -C build32 +``` + +## Step 6 — Install + +```bash +sudo meson install -C build64 +sudo mkdir -p /usr/lib/dri +sudo cp build32/nvidia_drv_video.so /usr/lib/dri/nvidia_drv_video.so +``` + +This installs: +- 64-bit driver → `/usr/lib64/dri/nvidia_drv_video.so` +- 32-bit driver → `/usr/lib/dri/nvidia_drv_video.so` +- nvenc-helper → `/usr/libexec/nvenc-helper` + +## Step 7 — Systemd user service + +```bash +mkdir -p ~/.config/systemd/user +cat > ~/.config/systemd/user/nvenc-helper.service << 'EOF' +[Unit] +Description=NVENC encode helper for nvidia-vaapi-driver +Documentation=https://github.com/efortin/nvidia-vaapi-driver +After=graphical-session.target + +[Service] +Type=simple +ExecStart=/usr/libexec/nvenc-helper +Restart=on-failure +RestartSec=2 + +[Install] +WantedBy=graphical-session.target +EOF + +systemctl --user daemon-reload +systemctl --user enable nvenc-helper.service +systemctl --user restart nvenc-helper.service +``` + +## Step 8 — Verify + +```bash +# Check helper is running +systemctl --user is-active nvenc-helper.service + +# Check VA-API profiles (should show VAEntrypointEncSlice for encode) +vainfo --display drm --device /dev/dri/renderD128 +``` + +Expected output includes both decode (VLD) and 
encode (EncSlice) entrypoints: +``` +VAProfileH264Main : VAEntrypointVLD +VAProfileH264Main : VAEntrypointEncSlice +VAProfileHEVCMain : VAEntrypointVLD +VAProfileHEVCMain : VAEntrypointEncSlice +``` + +No environment variables needed. Just launch Steam. diff --git a/docs/install-ubuntu.md b/docs/install-ubuntu.md new file mode 100644 index 00000000..42134f14 --- /dev/null +++ b/docs/install-ubuntu.md @@ -0,0 +1,114 @@ +# Installation on Ubuntu + +Tested on Ubuntu 22.04+ with NVIDIA proprietary driver. + +## Prerequisites + +NVIDIA proprietary driver installed. + +Verify: +```bash +nvidia-smi --query-gpu=driver_version --format=csv,noheader +``` + +Detect the driver version (used for 32-bit packages): +```bash +NV_VER=$(dpkg -l | grep 'libnvidia-compute-.*amd64' | awk '{print $2}' | sed 's/libnvidia-compute-//' | sed 's/:amd64//' | head -1) +echo "NVIDIA driver: $NV_VER" +``` + +## Step 1 — Install build dependencies (64-bit) + +```bash +sudo apt-get install -y --no-install-recommends \ + meson ninja-build gcc pkg-config \ + libva-dev libdrm-dev libegl-dev libffmpeg-nvenc-dev \ + vainfo +``` + +## Step 2 — Install build dependencies (32-bit, for Steam) + +```bash +sudo dpkg --add-architecture i386 +sudo apt-get update + +sudo apt-get install -y --no-install-recommends \ + gcc-multilib \ + libva-dev:i386 libdrm-dev:i386 libegl-dev:i386 \ + libnvidia-compute-${NV_VER}:i386 \ + libnvidia-encode-${NV_VER}:i386 +``` + +## Step 3 — Build 64-bit + +```bash +meson setup build64 . --wipe --prefix=/usr +meson compile -C build64 +``` + +## Step 4 — Build 32-bit (cross-compile) + +The repo includes `cross-i386.txt` configured for Ubuntu paths (`/usr/lib/i386-linux-gnu/`). + +```bash +meson setup build32 . 
--wipe --cross-file cross-i386.txt +meson compile -C build32 +``` + +## Step 5 — Install + +```bash +sudo meson install -C build64 +sudo mkdir -p /usr/lib/i386-linux-gnu/dri +sudo cp build32/nvidia_drv_video.so /usr/lib/i386-linux-gnu/dri/nvidia_drv_video.so +``` + +This installs: +- 64-bit driver → `/usr/lib/x86_64-linux-gnu/dri/nvidia_drv_video.so` +- 32-bit driver → `/usr/lib/i386-linux-gnu/dri/nvidia_drv_video.so` +- nvenc-helper → `/usr/libexec/nvenc-helper` + +## Step 6 — Systemd user service + +```bash +mkdir -p ~/.config/systemd/user +cat > ~/.config/systemd/user/nvenc-helper.service << 'EOF' +[Unit] +Description=NVENC encode helper for nvidia-vaapi-driver +Documentation=https://github.com/efortin/nvidia-vaapi-driver +After=graphical-session.target + +[Service] +Type=simple +ExecStart=/usr/libexec/nvenc-helper +Restart=on-failure +RestartSec=2 + +[Install] +WantedBy=graphical-session.target +EOF + +systemctl --user daemon-reload +systemctl --user enable nvenc-helper.service +systemctl --user restart nvenc-helper.service +``` + +## Step 7 — Verify + +```bash +# Check helper is running +systemctl --user is-active nvenc-helper.service + +# Check VA-API profiles (should show VAEntrypointEncSlice for encode) +vainfo --display drm --device /dev/dri/renderD128 +``` + +Expected output includes both decode (VLD) and encode (EncSlice) entrypoints: +``` +VAProfileH264Main : VAEntrypointVLD +VAProfileH264Main : VAEntrypointEncSlice +VAProfileHEVCMain : VAEntrypointVLD +VAProfileHEVCMain : VAEntrypointEncSlice +``` + +No environment variables needed. Just launch Steam. 
diff --git a/meson.build b/meson.build index 990c2b21..6a9e0447 100644 --- a/meson.build +++ b/meson.build @@ -55,10 +55,14 @@ sources = [ 'src/direct/direct-export-buf.c', 'src/direct/nv-driver.c', 'src/h264.c', + 'src/h264_encode.c', 'src/hevc.c', + 'src/hevc_encode.c', 'src/jpeg.c', 'src/mpeg2.c', 'src/mpeg4.c', + 'src/nvenc.c', + 'src/nvenc-ipc-client.c', 'src/vabackend.c', 'src/vc1.c', 'src/vp8.c', @@ -84,6 +88,47 @@ shared_library( gnu_symbol_visibility: 'hidden', ) +# Build the 64-bit NVENC helper daemon (only for native builds, not cross-compiled i386) +if host_machine.cpu_family() == 'x86_64' or host_machine.cpu_family() == 'aarch64' + helper_deps = [ + cc.find_library('dl', required : false), + dependency('ffnvcodec', version: '>= 11.1.5.1'), + dependency('threads'), + ] + executable( + 'nvenc-helper', + 'src/nvenc-helper.c', + 'src/nvenc-ipc-client.c', # for nvenc_ipc_get_socket_path + dependencies: helper_deps, + install: true, + install_dir: get_option('libexecdir'), + ) +endif + +# Tests (native builds only, not cross-compiled) +if not meson.is_cross_build() + libva_test_deps = [ + dependency('libva'), + dependency('libva-drm'), + cc.find_library('m', required : false), + ] + + test_encode = executable('test_encode', 'tests/test_encode.c', + dependencies : libva_test_deps, install : false) + test('encode', test_encode, timeout : 60) + + test_encode_config = executable('test_encode_config', 'tests/test_encode_config.c', + dependencies : libva_test_deps, install : false) + test('encode_config', test_encode_config, timeout : 60) + + gst_launch = find_program('gst-launch-1.0', required : false) + if gst_launch.found() + test('gstreamer', find_program('tests/test_gstreamer.sh'), + timeout : 120, + env : ['GST_VAAPI_ALL_DRIVERS=1', 'LIBVA_DRIVER_NAME=nvidia']) + endif +endif + meson.add_devenv(environment({ 'NVD_LOG': '1', 'LIBVA_DRIVER_NAME': 'nvidia', diff --git a/nvenc-helper.service b/nvenc-helper.service new file mode 100644 index 00000000..30317f6c 
--- /dev/null +++ b/nvenc-helper.service @@ -0,0 +1,13 @@ +[Unit] +Description=NVENC encode helper for nvidia-vaapi-driver +Documentation=https://github.com/efortin/nvidia-vaapi-driver +After=graphical-session.target + +[Service] +Type=simple +ExecStart=/usr/libexec/nvenc-helper +Restart=on-failure +RestartSec=2 + +[Install] +WantedBy=graphical-session.target diff --git a/src/direct/direct-export-buf.c b/src/direct/direct-export-buf.c index 47843e92..5a53108e 100644 --- a/src/direct/direct-export-buf.c +++ b/src/direct/direct-export-buf.c @@ -23,6 +23,12 @@ static void findGPUIndexFromFd(NVDriver *drv) { uint8_t drmUuid[16]; get_device_uuid(&drv->driverContext, drmUuid); + /* If CUDA is not available (32-bit encode-only mode), default to GPU 0 */ + if (!drv->cudaAvailable) { + drv->cudaGpuId = 0; + return; + } + int gpuCount = 0; if (CHECK_CUDA_RESULT(drv->cu->cuDeviceGetCount(&gpuCount))) { return; @@ -193,9 +199,26 @@ static BackingImage *direct_allocateBackingImage(NVDriver *drv, NVSurface *surfa p[i].channelCount, 8 * fmtInfo->bppc, p[i].fourcc, &driverImages[i]); } - for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) { - if (!import_to_cuda(drv, &driverImages[i], 8 * fmtInfo->bppc, p[i].channelCount, &backingImage->cudaImages[i], &backingImage->arrays[i])) - goto bail; + /* Import into CUDA only when CUDA is available. + * In IPC encode-only mode, surfaces are allocated via DRM but not imported + * into CUDA — the 64-bit helper handles CUDA import from the DMA-BUF fd. */ + if (drv->cudaAvailable) { + for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) { + if (!import_to_cuda(drv, &driverImages[i], 8 * fmtInfo->bppc, p[i].channelCount, &backingImage->cudaImages[i], &backingImage->arrays[i])) + goto bail; + } + } else { + /* Without CUDA, keep the nvFd handles for the IPC helper to import. + * Close nvFd2 which import_to_cuda would normally close. 
*/ + for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) { + backingImage->nvFds[i] = driverImages[i].nvFd; + backingImage->memorySizes[i] = driverImages[i].memorySize; + driverImages[i].nvFd = 0; /* Ownership transferred to backingImage */ + if (driverImages[i].nvFd2 != 0) { + close(driverImages[i].nvFd2); + driverImages[i].nvFd2 = 0; + } + } } backingImage->width = surface->width; @@ -241,6 +264,10 @@ static void destroyBackingImage(NVDriver *drv, BackingImage *img) { if (img->fds[i] > 0) { close(img->fds[i]); } + /* Close NVIDIA opaque fds kept for IPC encode mode */ + if (img->nvFds[i] > 0) { + close(img->nvFds[i]); + } } for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) { diff --git a/src/h264_encode.c b/src/h264_encode.c new file mode 100644 index 00000000..c3e0bc37 --- /dev/null +++ b/src/h264_encode.c @@ -0,0 +1,132 @@ +#include "vabackend.h" +#include "nvenc.h" +#include +#include + +void h264enc_handle_sequence_params(NVENCContext *nvencCtx, NVBuffer *buffer) +{ + VAEncSequenceParameterBufferH264 *seq = + (VAEncSequenceParameterBufferH264*) buffer->ptr; + + LOG("H264 encode: seq params %ux%u, intra_period=%u, ip_period=%u", + seq->picture_width_in_mbs * 16, seq->picture_height_in_mbs * 16, + seq->intra_period, seq->ip_period); + + /* Store basic sequence-level encode parameters */ + nvencCtx->width = seq->picture_width_in_mbs * 16; + nvencCtx->height = seq->picture_height_in_mbs * 16; + + if (seq->intra_period > 0) { + nvencCtx->intraPeriod = seq->intra_period; + } + if (seq->ip_period > 0) { + nvencCtx->ipPeriod = seq->ip_period; + } + + /* Frame rate from time_scale / num_units_in_tick / 2 if provided */ + if (seq->num_units_in_tick > 0 && seq->time_scale > 0) { + nvencCtx->frameRateNum = seq->time_scale; + nvencCtx->frameRateDen = seq->num_units_in_tick * 2; + } + + /* Bitrate (VA-API provides in bits/sec) */ + if (seq->bits_per_second > 0) { + nvencCtx->bitrate = seq->bits_per_second; + if (nvencCtx->maxBitrate == 0) { + nvencCtx->maxBitrate = 
seq->bits_per_second; + } + } + + nvencCtx->seqParamSet = true; +} + +void h264enc_handle_picture_params(NVENCContext *nvencCtx, NVBuffer *buffer) +{ + VAEncPictureParameterBufferH264 *pic = + (VAEncPictureParameterBufferH264*) buffer->ptr; + + /* Only log first few frames to avoid flooding at 60fps */ + if (nvencCtx->frameCount < 3) { + LOG("H264 encode: picture params, coded_buf=%d, pic_fields=0x%x", + pic->coded_buf, pic->pic_fields.value); + } + + nvencCtx->currentCodedBufId = pic->coded_buf; + nvencCtx->forceIDR = (pic->pic_fields.bits.idr_pic_flag != 0); + if (nvencCtx->forceIDR) { + LOG("H264 encode: IDR requested, coded_buf=%d", pic->coded_buf); + } +} + +void h264enc_handle_slice_params(NVENCContext *nvencCtx, NVBuffer *buffer) +{ + const VAEncSliceParameterBufferH264 *slice = + (VAEncSliceParameterBufferH264*) buffer->ptr; + + /* Map VA-API H.264 slice_type to NVENC picture type. + * Currently unused (enablePTD=1), but kept for future B-frame support. */ + switch (slice->slice_type) { + case 2: case 7: /* I / SI */ + nvencCtx->picType = nvencCtx->forceIDR + ? 
NV_ENC_PIC_TYPE_IDR : NV_ENC_PIC_TYPE_I; + break; + case 0: case 5: /* P / SP */ + nvencCtx->picType = NV_ENC_PIC_TYPE_P; + break; + case 1: case 6: /* B */ + nvencCtx->picType = NV_ENC_PIC_TYPE_B; + break; + default: + nvencCtx->picType = NV_ENC_PIC_TYPE_UNKNOWN; + break; + } +} + +void h264enc_handle_misc_params(NVENCContext *nvencCtx, NVBuffer *buffer) +{ + VAEncMiscParameterBuffer *misc = (VAEncMiscParameterBuffer*) buffer->ptr; + + switch (misc->type) { + case VAEncMiscParameterTypeRateControl: { + VAEncMiscParameterRateControl *rc = + (VAEncMiscParameterRateControl*) misc->data; + LOG("H264 encode: rate control bits_per_second=%u, target_percentage=%u", + rc->bits_per_second, rc->target_percentage); + if (rc->bits_per_second > 0) { + nvencCtx->maxBitrate = rc->bits_per_second; + if (rc->target_percentage > 0) { + nvencCtx->bitrate = (uint32_t)((uint64_t)rc->bits_per_second * rc->target_percentage / 100); + } else { + nvencCtx->bitrate = rc->bits_per_second; + } + } + break; + } + case VAEncMiscParameterTypeFrameRate: { + const VAEncMiscParameterFrameRate *fr = + (VAEncMiscParameterFrameRate*) misc->data; + if (fr->framerate > 0) { + /* framerate can be packed as (num | (den << 16)) or just num */ + uint32_t num = fr->framerate & 0xffff; + uint32_t den = (fr->framerate >> 16) & 0xffff; + if (den == 0) den = 1; + nvencCtx->frameRateNum = num; + nvencCtx->frameRateDen = den; + LOG("H264 encode: framerate %u/%u", num, den); + } + break; + } + case VAEncMiscParameterTypeHRD: { + VAEncMiscParameterHRD *hrd = + (VAEncMiscParameterHRD*) misc->data; + if (hrd->buffer_size > 0) + nvencCtx->vbvBufferSize = hrd->buffer_size; + if (hrd->initial_buffer_fullness > 0) + nvencCtx->vbvInitialDelay = hrd->initial_buffer_fullness; + break; + } + default: + LOG("H264 encode: unhandled misc param type %d", misc->type); + break; + } +} diff --git a/src/hevc_encode.c b/src/hevc_encode.c new file mode 100644 index 00000000..14a9df2d --- /dev/null +++ b/src/hevc_encode.c @@ -0,0 
+1,121 @@ +#include "vabackend.h" +#include "nvenc.h" +#include +#include + +void hevcenc_handle_sequence_params(NVENCContext *nvencCtx, NVBuffer *buffer) +{ + VAEncSequenceParameterBufferHEVC *seq = + (VAEncSequenceParameterBufferHEVC*) buffer->ptr; + + LOG("HEVC encode: seq params %ux%u, intra_period=%u, ip_period=%u", + seq->pic_width_in_luma_samples, seq->pic_height_in_luma_samples, + seq->intra_period, seq->ip_period); + + nvencCtx->width = seq->pic_width_in_luma_samples; + nvencCtx->height = seq->pic_height_in_luma_samples; + + if (seq->intra_period > 0) { + nvencCtx->intraPeriod = seq->intra_period; + } + if (seq->ip_period > 0) { + nvencCtx->ipPeriod = seq->ip_period; + } + + /* VUI timing info */ + if (seq->vui_num_units_in_tick > 0 && seq->vui_time_scale > 0) { + nvencCtx->frameRateNum = seq->vui_time_scale; + nvencCtx->frameRateDen = seq->vui_num_units_in_tick * 2; + } + + /* Bitrate (VA-API provides in bits/sec) */ + if (seq->bits_per_second > 0) { + nvencCtx->bitrate = seq->bits_per_second; + if (nvencCtx->maxBitrate == 0) { + nvencCtx->maxBitrate = seq->bits_per_second; + } + } + + nvencCtx->seqParamSet = true; +} + +void hevcenc_handle_picture_params(NVENCContext *nvencCtx, NVBuffer *buffer) +{ + VAEncPictureParameterBufferHEVC *pic = + (VAEncPictureParameterBufferHEVC*) buffer->ptr; + + nvencCtx->currentCodedBufId = pic->coded_buf; + nvencCtx->forceIDR = (pic->pic_fields.bits.idr_pic_flag != 0); + if (nvencCtx->forceIDR) { + LOG("HEVC encode: picture params, coded_buf=%d, IDR requested", pic->coded_buf); + } +} + +void hevcenc_handle_slice_params(NVENCContext *nvencCtx, NVBuffer *buffer) +{ + const VAEncSliceParameterBufferHEVC *slice = + (VAEncSliceParameterBufferHEVC*) buffer->ptr; + + /* Map VA-API HEVC slice_type to NVENC picture type. + * HEVC slice types: 0=B, 1=P, 2=I */ + switch (slice->slice_type) { + case 2: /* I */ + nvencCtx->picType = nvencCtx->forceIDR + ? 
NV_ENC_PIC_TYPE_IDR : NV_ENC_PIC_TYPE_I; + break; + case 1: /* P */ + nvencCtx->picType = NV_ENC_PIC_TYPE_P; + break; + case 0: /* B */ + nvencCtx->picType = NV_ENC_PIC_TYPE_B; + break; + default: + nvencCtx->picType = NV_ENC_PIC_TYPE_UNKNOWN; + break; + } +} + +void hevcenc_handle_misc_params(NVENCContext *nvencCtx, NVBuffer *buffer) +{ + VAEncMiscParameterBuffer *misc = (VAEncMiscParameterBuffer*) buffer->ptr; + + switch (misc->type) { + case VAEncMiscParameterTypeRateControl: { + VAEncMiscParameterRateControl *rc = + (VAEncMiscParameterRateControl*) misc->data; + LOG("HEVC encode: rate control bits_per_second=%u", rc->bits_per_second); + if (rc->bits_per_second > 0) { + nvencCtx->maxBitrate = rc->bits_per_second; + if (rc->target_percentage > 0) { + nvencCtx->bitrate = (uint32_t)((uint64_t)rc->bits_per_second * rc->target_percentage / 100); + } else { + nvencCtx->bitrate = rc->bits_per_second; + } + } + break; + } + case VAEncMiscParameterTypeFrameRate: { + const VAEncMiscParameterFrameRate *fr = + (VAEncMiscParameterFrameRate*) misc->data; + if (fr->framerate > 0) { + uint32_t num = fr->framerate & 0xffff; + uint32_t den = (fr->framerate >> 16) & 0xffff; + if (den == 0) den = 1; + nvencCtx->frameRateNum = num; + nvencCtx->frameRateDen = den; + } + break; + } + case VAEncMiscParameterTypeHRD: { + VAEncMiscParameterHRD *hrd = + (VAEncMiscParameterHRD*) misc->data; + if (hrd->buffer_size > 0) + nvencCtx->vbvBufferSize = hrd->buffer_size; + if (hrd->initial_buffer_fullness > 0) + nvencCtx->vbvInitialDelay = hrd->initial_buffer_fullness; + break; + } + default: + break; + } +} diff --git a/src/nvenc-helper.c b/src/nvenc-helper.c new file mode 100644 index 00000000..2c873f0c --- /dev/null +++ b/src/nvenc-helper.c @@ -0,0 +1,1179 @@ +/* + * nvenc-helper: 64-bit NVENC encode helper daemon. + * + * This standalone process runs as 64-bit, where CUDA works on all GPUs. 
+ * It receives raw NV12/P010 frames from the VA-API driver via + * a Unix domain socket, encodes them with NVENC, and returns the + * encoded bitstream. + * + * Usage: nvenc-helper [--foreground] + * The socket is created at $XDG_RUNTIME_DIR/nvenc-helper.sock + * + * The helper runs persistently until stopped via SIGTERM/SIGINT. + * It is managed by a systemd user service (nvenc-helper.service). + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "nvenc-ipc.h" + +static CudaFunctions *cu; +static NvencFunctions *nv_dl; +static volatile sig_atomic_t running = 1; +static int log_enabled = 0; + +/* Force an IDR keyframe every N frames for streaming error recovery. + * At 60fps this is ~1 second. At 30fps this is ~2 seconds. */ +#define NVENC_HELPER_IDR_INTERVAL 60 + +static inline bool check_cuda_helper(CUresult err, const char *func, int line) { + if (err != CUDA_SUCCESS) { + const char *s = NULL; + cu->cuGetErrorString(err, &s); + fprintf(stderr, "[nvenc-helper] CUDA error: %s (%d) at %s:%d\n", + s ? s : "?", err, func, line); + return true; + } + return false; +} +#define CHECK_CUDA_RESULT_HELPER(err) check_cuda_helper(err, __func__, __LINE__) + +static void helper_log(const char *fmt, ...) __attribute__((format(printf, 1, 2))); +static void helper_log(const char *fmt, ...) 
{ + if (!log_enabled) return; + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + fprintf(stderr, "[nvenc-helper %ld.%03ld] ", (long)ts.tv_sec, ts.tv_nsec / 1000000); + va_list args; + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); + fputc('\n', stderr); +} +#define HELPER_LOG helper_log + +/* Per-client encoder state */ +typedef struct { + CUcontext cudaCtx; + void *encoder; + NV_ENCODE_API_FUNCTION_LIST funcs; + bool initialized; + NV_ENC_INPUT_PTR inputBuffer; /* NVENC-managed (fallback) */ + NV_ENC_OUTPUT_PTR outputBuffer; + /* Persistent CUDA buffer for GPU-side encode (avoids nvEncLockInputBuffer) */ + CUdeviceptr gpuBuf; /* Linear CUDA VRAM buffer */ + uint32_t gpuBufPitch; /* Aligned pitch */ + uint32_t gpuBufSize; /* Total allocation size */ + NV_ENC_REGISTERED_PTR gpuBufReg; /* Persistent NVENC registration */ + bool gpuBufReady; /* true if GPU path available */ + uint32_t width; + uint32_t height; + uint32_t is10bit; + uint64_t frameCount; + uint8_t *bsBuf; /* pre-allocated bitstream output */ + uint32_t bsBufSize; +} HelperEncoder; + +/* Reliable I/O */ +static bool send_all(int fd, const void *buf, size_t len) +{ + const char *p = buf; + while (len > 0) { + ssize_t n = send(fd, p, len, MSG_NOSIGNAL); + if (n <= 0) { + if (n < 0 && errno == EINTR) continue; + return false; + } + p += n; + len -= (size_t)n; + } + return true; +} + +static bool recv_all(int fd, void *buf, size_t len) +{ + char *p = buf; + while (len > 0) { + ssize_t n = recv(fd, p, len, 0); + if (n <= 0) { + if (n < 0 && errno == EINTR) continue; + return false; + } + p += n; + len -= (size_t)n; + } + return true; +} + +static bool send_response(int fd, int32_t status, const void *data, uint32_t size) +{ + NVEncIPCRespHeader resp = { .status = status, .payload_size = size }; + if (!send_all(fd, &resp, sizeof(resp))) return false; + if (size > 0 && data != NULL) { + if (!send_all(fd, data, size)) return false; + } + return true; +} + +/* Send response header 
with an fd attached via SCM_RIGHTS */ +static bool send_response_with_fd(int sock, int32_t status, int send_fd, + const void *data, uint32_t size) +{ + NVEncIPCRespHeader resp = { .status = status, .payload_size = size }; + + struct iovec iov = { .iov_base = &resp, .iov_len = sizeof(resp) }; + union { + char buf[CMSG_SPACE(sizeof(int))]; + struct cmsghdr align; + } cmsg_buf; + memset(&cmsg_buf, 0, sizeof(cmsg_buf)); + + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = cmsg_buf.buf, + .msg_controllen = sizeof(cmsg_buf.buf), + }; + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &send_fd, sizeof(int)); + + ssize_t n = sendmsg(sock, &msg, MSG_NOSIGNAL); + if (n != sizeof(resp)) return false; + + if (size > 0 && data != NULL) { + if (!send_all(sock, data, size)) return false; + } + return true; +} + +/* Encoder lifecycle */ +static bool encoder_init(HelperEncoder *enc, const NVEncIPCInitParams *params) +{ + HELPER_LOG("Init: %ux%u codec=%u profile=%u bitrate=%u", + params->width, params->height, params->codec, params->profile, + params->bitrate); + + /* Create CUDA context */ + if (CHECK_CUDA_RESULT_HELPER(cu->cuCtxCreate(&enc->cudaCtx, 0, 0))) { + return false; + } + + /* Get NVENC function list */ + enc->funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER; + NVENCSTATUS st = nv_dl->NvEncodeAPICreateInstance(&enc->funcs); + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("NvEncodeAPICreateInstance failed: %d", st); + cu->cuCtxDestroy(enc->cudaCtx); + return false; + } + + /* Open NVENC session */ + NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS sessParams = {0}; + sessParams.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER; + sessParams.deviceType = NV_ENC_DEVICE_TYPE_CUDA; + sessParams.device = enc->cudaCtx; + sessParams.apiVersion = NVENCAPI_VERSION; + + st = enc->funcs.nvEncOpenEncodeSessionEx(&sessParams, &enc->encoder); + if 
(st != NV_ENC_SUCCESS) { + HELPER_LOG("nvEncOpenEncodeSessionEx failed: %d", st); + cu->cuCtxDestroy(enc->cudaCtx); + return false; + } + + /* Select codec and profile GUIDs */ + GUID codecGuid = (params->codec == 0) ? NV_ENC_CODEC_H264_GUID : NV_ENC_CODEC_HEVC_GUID; + GUID profileGuid; + if (params->codec == 0) { + /* H.264 */ + profileGuid = NV_ENC_H264_PROFILE_HIGH_GUID; + } else { + /* HEVC */ + profileGuid = params->is10bit ? NV_ENC_HEVC_PROFILE_MAIN10_GUID : NV_ENC_HEVC_PROFILE_MAIN_GUID; + } + + /* Get preset config */ + NV_ENC_PRESET_CONFIG presetConfig = {0}; + presetConfig.version = NV_ENC_PRESET_CONFIG_VER; + presetConfig.presetCfg.version = NV_ENC_CONFIG_VER; + + st = enc->funcs.nvEncGetEncodePresetConfigEx( + enc->encoder, codecGuid, NV_ENC_PRESET_P4_GUID, + NV_ENC_TUNING_INFO_LOW_LATENCY, &presetConfig); + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("nvEncGetEncodePresetConfigEx failed: %d", st); + goto fail; + } + + NV_ENC_CONFIG encConfig; + memcpy(&encConfig, &presetConfig.presetCfg, sizeof(encConfig)); + encConfig.version = NV_ENC_CONFIG_VER; + encConfig.profileGUID = profileGuid; + encConfig.frameIntervalP = 1; /* No B-frames for synchronous encode */ + + if (params->bitrate > 0) { + encConfig.rcParams.averageBitRate = params->bitrate; + } + if (params->maxBitrate > 0) { + encConfig.rcParams.maxBitRate = params->maxBitrate; + } + if (params->gopLength > 0) { + encConfig.gopLength = params->gopLength; + } + + /* Initialize encoder */ + NV_ENC_INITIALIZE_PARAMS initParams = {0}; + initParams.version = NV_ENC_INITIALIZE_PARAMS_VER; + initParams.encodeGUID = codecGuid; + initParams.presetGUID = NV_ENC_PRESET_P4_GUID; + initParams.encodeWidth = params->width; + initParams.encodeHeight = params->height; + initParams.darWidth = params->width; + initParams.darHeight = params->height; + initParams.frameRateNum = params->frameRateNum > 0 ? params->frameRateNum : 30; + initParams.frameRateDen = params->frameRateDen > 0 ? 
params->frameRateDen : 1; + initParams.enablePTD = 1; + initParams.encodeConfig = &encConfig; + initParams.maxEncodeWidth = params->width; + initParams.maxEncodeHeight = params->height; + initParams.tuningInfo = NV_ENC_TUNING_INFO_LOW_LATENCY; + + st = enc->funcs.nvEncInitializeEncoder(enc->encoder, &initParams); + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("nvEncInitializeEncoder failed: %d", st); + goto fail; + } + + /* Create NVENC-managed input buffer */ + NV_ENC_CREATE_INPUT_BUFFER createIn = {0}; + createIn.version = NV_ENC_CREATE_INPUT_BUFFER_VER; + createIn.width = params->width; + createIn.height = params->height; + createIn.bufferFmt = params->is10bit ? NV_ENC_BUFFER_FORMAT_YUV420_10BIT : NV_ENC_BUFFER_FORMAT_NV12; + + st = enc->funcs.nvEncCreateInputBuffer(enc->encoder, &createIn); + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("nvEncCreateInputBuffer failed: %d", st); + goto fail; + } + enc->inputBuffer = createIn.inputBuffer; + + /* Create output bitstream buffer */ + NV_ENC_CREATE_BITSTREAM_BUFFER createOut = {0}; + createOut.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER; + + st = enc->funcs.nvEncCreateBitstreamBuffer(enc->encoder, &createOut); + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("nvEncCreateBitstreamBuffer failed: %d", st); + enc->funcs.nvEncDestroyInputBuffer(enc->encoder, enc->inputBuffer); + goto fail; + } + enc->outputBuffer = createOut.bitstreamBuffer; + + enc->width = params->width; + enc->height = params->height; + enc->is10bit = params->is10bit; + enc->frameCount = 0; + enc->bsBufSize = 4 * 1024 * 1024; + enc->bsBuf = malloc(enc->bsBufSize); + enc->initialized = true; + + /* Allocate persistent CUDA linear buffer for GPU-side encode. + * This replaces nvEncLockInputBuffer (host memory) with a CUDA device + * buffer registered once with NVENC. Per-frame: single cuMemcpy2D + * (host→device with pitch conversion) + nvEncMapInputResource. */ + uint32_t bpp = params->is10bit ? 
2 : 1; + enc->gpuBufPitch = params->width * bpp; + enc->gpuBufPitch = (enc->gpuBufPitch + 255) & ~255; /* Align to 256 */ + enc->gpuBufSize = enc->gpuBufPitch * params->height * 3 / 2; + enc->gpuBufReady = false; + + CUresult cres = cu->cuMemAlloc(&enc->gpuBuf, enc->gpuBufSize); + if (cres == CUDA_SUCCESS) { + NV_ENC_BUFFER_FORMAT bufFmt = params->is10bit + ? NV_ENC_BUFFER_FORMAT_YUV420_10BIT : NV_ENC_BUFFER_FORMAT_NV12; + + NV_ENC_REGISTER_RESOURCE regRes = {0}; + regRes.version = NV_ENC_REGISTER_RESOURCE_VER; + regRes.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR; + regRes.resourceToRegister = (void *)enc->gpuBuf; + regRes.width = params->width; + regRes.height = params->height; + regRes.pitch = enc->gpuBufPitch; + regRes.bufferFormat = bufFmt; + regRes.bufferUsage = NV_ENC_INPUT_IMAGE; + + st = enc->funcs.nvEncRegisterResource(enc->encoder, ®Res); + if (st == NV_ENC_SUCCESS) { + enc->gpuBufReg = regRes.registeredResource; + enc->gpuBufReady = true; + HELPER_LOG("GPU buffer: %u bytes, pitch=%u (persistent CUDA+NVENC)", + enc->gpuBufSize, enc->gpuBufPitch); + } else { + HELPER_LOG("GPU buffer register failed (%d), falling back to host path", st); + cu->cuMemFree(enc->gpuBuf); + enc->gpuBuf = 0; + } + } else { + HELPER_LOG("GPU buffer alloc failed (%d), falling back to host path", cres); + enc->gpuBuf = 0; + } + + HELPER_LOG("Encoder initialized: %ux%u %s %s (gpu=%s)", + params->width, params->height, + params->codec == 0 ? "H.264" : "HEVC", + params->is10bit ? "10-bit" : "8-bit", + enc->gpuBufReady ? "yes" : "no"); + return true; + +fail: + enc->funcs.nvEncDestroyEncoder(enc->encoder); + enc->encoder = NULL; + cu->cuCtxDestroy(enc->cudaCtx); + enc->cudaCtx = NULL; + return false; +} + +static bool encoder_encode(HelperEncoder *enc, const void *frame_data, + uint32_t frame_width, uint32_t frame_height, + uint32_t frame_size, bool force_idr, + void **out_data, uint32_t *out_size) +{ + NVENCSTATUS st; + uint32_t bpp = enc->is10bit ? 
2 : 1; + uint32_t srcPitch = frame_width * bpp; + NV_ENC_INPUT_PTR encodeInput; + NV_ENC_BUFFER_FORMAT encFmt = enc->is10bit + ? NV_ENC_BUFFER_FORMAT_YUV420_10BIT : NV_ENC_BUFFER_FORMAT_NV12; + uint32_t encodePitch; + bool usedGpuPath = false; + + if (enc->gpuBufReady) { + /* GPU FAST PATH: cuMemcpy2D host→device with pitch conversion. + * Single CUDA call replaces 1080+ individual memcpy calls. + * GPU DMA engine handles pitch conversion in hardware. + * NVENC reads from VRAM — no PCIe upload at encode time. */ + uint32_t padLines = enc->height - frame_height; + + /* Luma: host SHM → GPU buffer */ + CUDA_MEMCPY2D cpyLuma = {0}; + cpyLuma.srcMemoryType = CU_MEMORYTYPE_HOST; + cpyLuma.srcHost = frame_data; + cpyLuma.srcPitch = srcPitch; + cpyLuma.dstMemoryType = CU_MEMORYTYPE_DEVICE; + cpyLuma.dstDevice = enc->gpuBuf; + cpyLuma.dstPitch = enc->gpuBufPitch; + cpyLuma.WidthInBytes = srcPitch; + cpyLuma.Height = frame_height; + + CUresult cres = cu->cuMemcpy2D(&cpyLuma); + if (cres != CUDA_SUCCESS) { + HELPER_LOG("GPU path: luma cuMemcpy2D failed: %d, falling back", cres); + goto host_fallback; + } + + /* Chroma: host SHM → GPU buffer */ + uint32_t chromaOff_src = srcPitch * frame_height; + uint32_t chromaOff_dst = enc->gpuBufPitch * enc->height; + uint32_t chromaHeight = frame_height / 2; + + CUDA_MEMCPY2D cpyChroma = {0}; + cpyChroma.srcMemoryType = CU_MEMORYTYPE_HOST; + cpyChroma.srcHost = (const uint8_t *)frame_data + chromaOff_src; + cpyChroma.srcPitch = srcPitch; + cpyChroma.dstMemoryType = CU_MEMORYTYPE_DEVICE; + cpyChroma.dstDevice = enc->gpuBuf + chromaOff_dst; + cpyChroma.dstPitch = enc->gpuBufPitch; + cpyChroma.WidthInBytes = srcPitch; + cpyChroma.Height = chromaHeight; + + cres = cu->cuMemcpy2D(&cpyChroma); + if (cres != CUDA_SUCCESS) { + HELPER_LOG("GPU path: chroma cuMemcpy2D failed: %d, falling back", cres); + goto host_fallback; + } + + /* Zero padding rows on GPU (async, only if needed) */ + if (padLines > 0) { + cu->cuMemsetD8Async(enc->gpuBuf + 
enc->gpuBufPitch * frame_height, + 0, enc->gpuBufPitch * padLines, 0); + cu->cuMemsetD8Async(enc->gpuBuf + chromaOff_dst + enc->gpuBufPitch * chromaHeight, + 128, enc->gpuBufPitch * (padLines / 2), 0); + } + + /* Map the persistent registered resource */ + NV_ENC_MAP_INPUT_RESOURCE mapRes = {0}; + mapRes.version = NV_ENC_MAP_INPUT_RESOURCE_VER; + mapRes.registeredResource = enc->gpuBufReg; + + st = enc->funcs.nvEncMapInputResource(enc->encoder, &mapRes); + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("GPU path: nvEncMapInputResource failed: %d, falling back", st); + goto host_fallback; + } + + encodeInput = mapRes.mappedResource; + encFmt = mapRes.mappedBufferFmt; + encodePitch = enc->gpuBufPitch; + usedGpuPath = true; + goto do_encode; + } + +host_fallback: + /* HOST FALLBACK: nvEncLockInputBuffer + memcpy (original path) */ + { + NV_ENC_LOCK_INPUT_BUFFER lockIn = {0}; + lockIn.version = NV_ENC_LOCK_INPUT_BUFFER_VER; + lockIn.inputBuffer = enc->inputBuffer; + + st = enc->funcs.nvEncLockInputBuffer(enc->encoder, &lockIn); + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("nvEncLockInputBuffer failed: %d", st); + return false; + } + + uint32_t dstPitch = lockIn.pitch; + uint8_t *src = (uint8_t *)frame_data; + uint8_t *dst = (uint8_t *)lockIn.bufferDataPtr; + uint32_t chromaOffset_src = srcPitch * frame_height; + uint32_t chromaOffset_dst = dstPitch * enc->height; + uint32_t chromaHeight = frame_height / 2; + uint32_t padLines = enc->height - frame_height; + + if (srcPitch == dstPitch) { + memcpy(dst, src, srcPitch * frame_height); + memcpy(dst + chromaOffset_dst, src + chromaOffset_src, srcPitch * chromaHeight); + } else { + for (uint32_t y = 0; y < frame_height; y++) + memcpy(dst + y * dstPitch, src + y * srcPitch, srcPitch); + for (uint32_t y = 0; y < chromaHeight; y++) + memcpy(dst + chromaOffset_dst + y * dstPitch, + src + chromaOffset_src + y * srcPitch, srcPitch); + } + + if (padLines > 0) { + memset(dst + dstPitch * frame_height, 0, dstPitch * padLines); + memset(dst + 
chromaOffset_dst + dstPitch * chromaHeight, 128, dstPitch * (padLines / 2)); + } + + enc->funcs.nvEncUnlockInputBuffer(enc->encoder, enc->inputBuffer); + encodeInput = enc->inputBuffer; + encodePitch = dstPitch; + } + +do_encode:; + /* Encode */ + NV_ENC_PIC_PARAMS picParams = {0}; + picParams.version = NV_ENC_PIC_PARAMS_VER; + picParams.inputBuffer = encodeInput; + picParams.bufferFmt = encFmt; + picParams.inputWidth = enc->width; + picParams.inputHeight = enc->height; + picParams.inputPitch = encodePitch; + picParams.outputBitstream = enc->outputBuffer; + picParams.pictureStruct = NV_ENC_PIC_STRUCT_FRAME; + picParams.pictureType = NV_ENC_PIC_TYPE_UNKNOWN; + /* Force IDR: on first frame, on explicit request, or every 60 frames + * for streaming recovery. Without periodic IDR, a single lost packet + * causes the client to freeze until the next intra_period (up to 60s). */ + bool needIDR = (enc->frameCount == 0) || force_idr || (enc->frameCount % NVENC_HELPER_IDR_INTERVAL == 0); + picParams.encodePicFlags = needIDR + ? 
(NV_ENC_PIC_FLAG_OUTPUT_SPSPPS | NV_ENC_PIC_FLAG_FORCEIDR) + : 0; + picParams.frameIdx = (uint32_t)enc->frameCount; + picParams.inputTimeStamp = enc->frameCount; + + st = enc->funcs.nvEncEncodePicture(enc->encoder, &picParams); + + /* Unmap the GPU resource after encode (must happen before next map) */ + if (usedGpuPath) { + enc->funcs.nvEncUnmapInputResource(enc->encoder, encodeInput); + } + + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("nvEncEncodePicture failed: %d", st); + return false; + } + + enc->frameCount++; + + if (enc->frameCount % 300 == 0) { + HELPER_LOG("Encoded %lu frames", (unsigned long)enc->frameCount); + } + + /* Lock output bitstream */ + NV_ENC_LOCK_BITSTREAM lockOut = {0}; + lockOut.version = NV_ENC_LOCK_BITSTREAM_VER; + lockOut.outputBitstream = enc->outputBuffer; + + st = enc->funcs.nvEncLockBitstream(enc->encoder, &lockOut); + if (st != NV_ENC_SUCCESS) { + HELPER_LOG("nvEncLockBitstream failed: %d", st); + return false; + } + + /* Copy bitstream data */ + *out_size = lockOut.bitstreamSizeInBytes; + + //grow pre-allocated buffer if needed + if (lockOut.bitstreamSizeInBytes > enc->bsBufSize) { + uint32_t newSize = lockOut.bitstreamSizeInBytes + (lockOut.bitstreamSizeInBytes >> 1); + uint8_t *newBuf = realloc(enc->bsBuf, newSize); + if (newBuf == NULL) { + enc->funcs.nvEncUnlockBitstream(enc->encoder, enc->outputBuffer); + return false; + } + enc->bsBuf = newBuf; + enc->bsBufSize = newSize; + } + memcpy(enc->bsBuf, lockOut.bitstreamBufferPtr, lockOut.bitstreamSizeInBytes); + *out_data = enc->bsBuf; + + enc->funcs.nvEncUnlockBitstream(enc->encoder, enc->outputBuffer); + + return true; +} + +static void encoder_close(HelperEncoder *enc) +{ + if (enc->encoder == NULL) return; + + /* Flush */ + if (enc->initialized) { + NV_ENC_PIC_PARAMS picParams = {0}; + picParams.version = NV_ENC_PIC_PARAMS_VER; + picParams.encodePicFlags = NV_ENC_PIC_FLAG_EOS; + enc->funcs.nvEncEncodePicture(enc->encoder, &picParams); + } + + if (enc->outputBuffer) { + 
enc->funcs.nvEncDestroyBitstreamBuffer(enc->encoder, enc->outputBuffer); + } + if (enc->inputBuffer) { + enc->funcs.nvEncDestroyInputBuffer(enc->encoder, enc->inputBuffer); + } + /* Free persistent GPU buffer */ + if (enc->gpuBufReady) { + enc->funcs.nvEncUnregisterResource(enc->encoder, enc->gpuBufReg); + enc->gpuBufReady = false; + } + if (enc->gpuBuf) { + cu->cuMemFree(enc->gpuBuf); + enc->gpuBuf = 0; + } + + enc->funcs.nvEncDestroyEncoder(enc->encoder); + enc->encoder = NULL; + + if (enc->cudaCtx) { + cu->cuCtxDestroy(enc->cudaCtx); + enc->cudaCtx = NULL; + } + + free(enc->bsBuf); + enc->bsBuf = NULL; + enc->bsBufSize = 0; + enc->initialized = false; + HELPER_LOG("Encoder closed (encoded %lu frames)", (unsigned long)enc->frameCount); +} + +/* Handle one client connection */ +static void handle_client(int client_fd) +{ + HelperEncoder enc = {0}; + void *shm_ptr = MAP_FAILED; + uint32_t shm_size = 0; + int shm_fd = -1; + + HELPER_LOG("Client connected (fd=%d)", client_fd); + + while (running) { + //wait for data with 5s timeout (detect dead clients) + struct pollfd cpfd = { .fd = client_fd, .events = POLLIN }; + int pr = poll(&cpfd, 1, 5000); + if (pr == 0) { + HELPER_LOG("Client timeout (5s), disconnecting"); + break; + } + if (pr < 0) { + if (errno == EINTR) continue; + break; + } + + NVEncIPCMsgHeader hdr; + if (!recv_all(client_fd, &hdr, sizeof(hdr))) { + HELPER_LOG("Client disconnected"); + break; + } + + switch (hdr.cmd) { + case NVENC_IPC_CMD_INIT: { + if (hdr.payload_size != sizeof(NVEncIPCInitParams)) { + send_response(client_fd, -1, NULL, 0); + break; + } + NVEncIPCInitParams params; + if (!recv_all(client_fd, ¶ms, sizeof(params))) goto done; + + if (enc.initialized) { + encoder_close(&enc); + } + + /* Clean up old shm if any */ + if (shm_ptr != MAP_FAILED) { + munmap(shm_ptr, shm_size); + shm_ptr = MAP_FAILED; + } + if (shm_fd >= 0) { + close(shm_fd); + shm_fd = -1; + } + + bool ok = encoder_init(&enc, ¶ms); + if (!ok) { + send_response(client_fd, -1, 
NULL, 0); + break; + } + + /* Create shared memory for frame transfer. + * NV12 = w*h*1.5, P010 = w*h*3 */ + uint32_t bpp = params.is10bit ? 2 : 1; + shm_size = params.width * bpp * params.height * 3 / 2; + shm_fd = memfd_create("nvenc-frame", MFD_CLOEXEC); + if (shm_fd < 0 || ftruncate(shm_fd, shm_size) < 0) { + HELPER_LOG("Failed to create shm: %s", strerror(errno)); + if (shm_fd >= 0) { close(shm_fd); shm_fd = -1; } + /* Fall back to socket-based transfer (no shm). + * Send normal response without fd (no SCM_RIGHTS with fd=-1). */ + NVEncIPCInitResponse iresp = { .shm_size = 0 }; + send_response(client_fd, 0, &iresp, sizeof(iresp)); + break; + } + + shm_ptr = mmap(NULL, shm_size, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0); + if (shm_ptr == MAP_FAILED) { + HELPER_LOG("Failed to mmap shm: %s", strerror(errno)); + close(shm_fd); + shm_fd = -1; + NVEncIPCInitResponse iresp = { .shm_size = 0 }; + send_response(client_fd, 0, &iresp, sizeof(iresp)); + break; + } + + /* Send shm fd to client */ + int client_shm_fd = dup(shm_fd); /* dup because SCM_RIGHTS transfers ownership */ + NVEncIPCInitResponse iresp = { .shm_size = shm_size }; + HELPER_LOG("Created shm: %u bytes, fd=%d", shm_size, client_shm_fd); + send_response_with_fd(client_fd, 0, client_shm_fd, &iresp, sizeof(iresp)); + close(client_shm_fd); + break; + } + + case NVENC_IPC_CMD_ENCODE: { + if (!enc.initialized || hdr.payload_size > NVENC_IPC_MAX_FRAME_SIZE + sizeof(NVEncIPCEncodeParams)) { + /* Drain the payload with a fixed buffer to avoid huge malloc */ + char drain[4096]; + uint32_t remaining = hdr.payload_size; + while (remaining > 0) { + uint32_t chunk = remaining < sizeof(drain) ? 
remaining : sizeof(drain); + if (!recv_all(client_fd, drain, chunk)) goto done; + remaining -= chunk; + } + send_response(client_fd, -1, NULL, 0); + break; + } + + NVEncIPCEncodeParams ep; + if (!recv_all(client_fd, &ep, sizeof(ep))) goto done; + + if (ep.frame_size > NVENC_IPC_MAX_FRAME_SIZE) { + HELPER_LOG("CMD_ENCODE: frame_size %u exceeds max %u", ep.frame_size, NVENC_IPC_MAX_FRAME_SIZE); + send_response(client_fd, -1, NULL, 0); + goto done; + } + + /* Receive frame data */ + void *frame = malloc(ep.frame_size); + if (frame == NULL) { + send_response(client_fd, -1, NULL, 0); + goto done; + } + if (!recv_all(client_fd, frame, ep.frame_size)) { + free(frame); + goto done; + } + + + void *bitstream = NULL; + uint32_t bsSize = 0; + bool ok = encoder_encode(&enc, frame, ep.width, ep.height, ep.frame_size, ep.force_idr, &bitstream, &bsSize); + free(frame); + + + if (ok) { + send_response(client_fd, 0, bitstream, bsSize); + } else { + send_response(client_fd, -1, NULL, 0); + } + break; + } + + case NVENC_IPC_CMD_ENCODE_DMABUF: { + if (!enc.initialized) { + if (hdr.payload_size > 0) { + void *tmp = malloc(hdr.payload_size); + if (tmp) { recv_all(client_fd, tmp, hdr.payload_size); free(tmp); } + } + send_response(client_fd, -1, NULL, 0); + break; + } + + /* Receive params WITH per-plane DMA-BUF fds via SCM_RIGHTS */ + NVEncIPCEncodeDmaBufParams dp; + int dmabuf_fds[4] = {-1, -1, -1, -1}; + int num_fds = 0; + { + struct iovec iov = { .iov_base = &dp, .iov_len = sizeof(dp) }; + union { + char buf[CMSG_SPACE(sizeof(int) * 4)]; + struct cmsghdr align; + } cmsg_buf; + memset(&cmsg_buf, 0, sizeof(cmsg_buf)); + + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = cmsg_buf.buf, + .msg_controllen = sizeof(cmsg_buf.buf), + }; + + ssize_t n = recvmsg(client_fd, &msg, 0); + if (n != sizeof(dp)) { + HELPER_LOG("DMABUF: recvmsg failed: %zd (errno=%d)", n, errno); + send_response(client_fd, -1, NULL, 0); + break; + } + + struct cmsghdr *cmsg = 
CMSG_FIRSTHDR(&msg); + if (cmsg && cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS) { + num_fds = (int)((cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int)); + if (num_fds > 4) num_fds = 4; + memcpy(dmabuf_fds, CMSG_DATA(cmsg), (size_t)num_fds * sizeof(int)); + } + } + + if (num_fds < 1 || dmabuf_fds[0] < 0) { + HELPER_LOG("DMABUF: no fds received"); + send_response(client_fd, -1, NULL, 0); + break; + } + + + if (enc.frameCount < 3) { + HELPER_LOG("DMABUF: fds=[%d,%d] %ux%u planes=%u bppc=%u sizes=[%u,%u]", + dmabuf_fds[0], dmabuf_fds[1], + dp.width, dp.height, dp.num_planes, dp.bppc, + dp.sizes[0], dp.sizes[1]); + } + + /* Import each plane's DMA-BUF into CUDA as a CUarray, + * same as the driver's import_to_cuda in direct-export-buf.c */ + CUexternalMemory extMems[4] = {0}; + CUmipmappedArray mipmaps[4] = {0}; + CUarray arrays[4] = {0}; + bool importOk = true; + + for (int i = 0; i < (int)dp.num_planes && i < num_fds; i++) { + CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = { + .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, + .handle.fd = dmabuf_fds[i], + .size = dp.sizes[i], + .flags = 0, + }; + + CUresult cres = cu->cuImportExternalMemory(&extMems[i], &extMemDesc); + /* CUDA takes ownership of the fd on success */ + if (cres != CUDA_SUCCESS) { + HELPER_LOG("DMABUF: cuImportExternalMemory plane %d failed: %d", i, cres); + close(dmabuf_fds[i]); + importOk = false; + break; + } + + /* Determine plane format */ + int bpc = 8 * dp.bppc; + int channels = (i == 0) ? 1 : 2; /* Y=1ch, UV=2ch interleaved */ + uint32_t planeW = (i == 0) ? dp.width : dp.width / 2; + uint32_t planeH = (i == 0) ? dp.height : dp.height / 2; + + CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC mipmapDesc = { + .arrayDesc = { + .Width = planeW, + .Height = planeH, + .Depth = 0, + .Format = (bpc == 8) ? 
CU_AD_FORMAT_UNSIGNED_INT8 : CU_AD_FORMAT_UNSIGNED_INT16, + .NumChannels = (unsigned int)channels, + .Flags = 0, + }, + .numLevels = 1, + .offset = 0, + }; + + cres = cu->cuExternalMemoryGetMappedMipmappedArray(&mipmaps[i], extMems[i], &mipmapDesc); + if (cres != CUDA_SUCCESS) { + HELPER_LOG("DMABUF: cuExternalMemoryGetMappedMipmappedArray plane %d failed: %d", i, cres); + importOk = false; + break; + } + + cres = cu->cuMipmappedArrayGetLevel(&arrays[i], mipmaps[i], 0); + if (cres != CUDA_SUCCESS) { + HELPER_LOG("DMABUF: cuMipmappedArrayGetLevel plane %d failed: %d", i, cres); + importOk = false; + break; + } + } + + if (!importOk) { + for (int i = 0; i < 4; i++) { + if (mipmaps[i]) cu->cuMipmappedArrayDestroy(mipmaps[i]); + if (extMems[i]) cu->cuDestroyExternalMemory(extMems[i]); + /* Close any fds that CUDA didn't take ownership of */ + else if (i < num_fds && dmabuf_fds[i] >= 0) close(dmabuf_fds[i]); + } + /* Close remaining fds beyond what we tried to import */ + for (int i = (int)dp.num_planes; i < num_fds; i++) { + if (dmabuf_fds[i] >= 0) close(dmabuf_fds[i]); + } + send_response(client_fd, -1, NULL, 0); + break; + } + + /* Copy CUarrays to linear buffer (same as nvEndPictureEncode direct path) */ + uint32_t bpp = dp.is10bit ? 
2 : 1; + uint32_t pitch = dp.width * bpp; + pitch = (pitch + 255) & ~255; /* Align to 256 */ + uint32_t lumaSize = pitch * dp.height; + uint32_t chromaSize = pitch * (dp.height / 2); + uint32_t totalSize = lumaSize + chromaSize; + + CUdeviceptr linearBuf = 0; + CUresult cres = cu->cuMemAlloc(&linearBuf, totalSize); + if (cres != CUDA_SUCCESS) { + HELPER_LOG("DMABUF: cuMemAlloc(%u) failed: %d", totalSize, cres); + goto dmabuf_cleanup; + } + cu->cuMemsetD8Async(linearBuf, 0, totalSize, 0); + + /* Copy luma */ + CUDA_MEMCPY2D cpy = {0}; + cpy.srcMemoryType = CU_MEMORYTYPE_ARRAY; + cpy.srcArray = arrays[0]; + cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE; + cpy.dstDevice = linearBuf; + cpy.dstPitch = pitch; + cpy.WidthInBytes = dp.width * bpp; + cpy.Height = dp.height; + cres = cu->cuMemcpy2D(&cpy); + if (cres != CUDA_SUCCESS) { + HELPER_LOG("DMABUF: luma cuMemcpy2D failed: %d", cres); + cu->cuMemFree(linearBuf); + goto dmabuf_cleanup; + } + + /* Copy chroma */ + if (dp.num_planes >= 2 && arrays[1]) { + memset(&cpy, 0, sizeof(cpy)); + cpy.srcMemoryType = CU_MEMORYTYPE_ARRAY; + cpy.srcArray = arrays[1]; + cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE; + cpy.dstDevice = linearBuf + lumaSize; + cpy.dstPitch = pitch; + cpy.WidthInBytes = dp.width * bpp; + cpy.Height = dp.height / 2; + cres = cu->cuMemcpy2D(&cpy); + if (cres != CUDA_SUCCESS) { + HELPER_LOG("DMABUF: chroma cuMemcpy2D failed: %d", cres); + cu->cuMemFree(linearBuf); + goto dmabuf_cleanup; + } + } + + /* Register linear buffer with NVENC */ + NV_ENC_BUFFER_FORMAT bufFmt = dp.is10bit + ? 
NV_ENC_BUFFER_FORMAT_YUV420_10BIT : NV_ENC_BUFFER_FORMAT_NV12; + + NV_ENC_REGISTER_RESOURCE regRes = {0}; + regRes.version = NV_ENC_REGISTER_RESOURCE_VER; + regRes.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR; + regRes.resourceToRegister = (void *)linearBuf; + regRes.width = dp.width; + regRes.height = dp.height; + regRes.pitch = pitch; + regRes.bufferFormat = bufFmt; + regRes.bufferUsage = NV_ENC_INPUT_IMAGE; + + NVENCSTATUS nvst = enc.funcs.nvEncRegisterResource(enc.encoder, ®Res); + if (nvst != NV_ENC_SUCCESS) { + HELPER_LOG("DMABUF: nvEncRegisterResource failed: %d", nvst); + cu->cuMemFree(linearBuf); + goto dmabuf_cleanup; + } + + NV_ENC_MAP_INPUT_RESOURCE mapRes = {0}; + mapRes.version = NV_ENC_MAP_INPUT_RESOURCE_VER; + mapRes.registeredResource = regRes.registeredResource; + nvst = enc.funcs.nvEncMapInputResource(enc.encoder, &mapRes); + if (nvst != NV_ENC_SUCCESS) { + enc.funcs.nvEncUnregisterResource(enc.encoder, regRes.registeredResource); + cu->cuMemFree(linearBuf); + goto dmabuf_cleanup; + } + + /* Encode */ + NV_ENC_PIC_PARAMS picParams = {0}; + picParams.version = NV_ENC_PIC_PARAMS_VER; + picParams.inputBuffer = mapRes.mappedResource; + picParams.bufferFmt = mapRes.mappedBufferFmt; + picParams.inputWidth = dp.width; + picParams.inputHeight = dp.height; + picParams.inputPitch = pitch; + picParams.outputBitstream = enc.outputBuffer; + picParams.pictureStruct = NV_ENC_PIC_STRUCT_FRAME; + picParams.pictureType = NV_ENC_PIC_TYPE_UNKNOWN; + picParams.encodePicFlags = (enc.frameCount == 0) + ? 
(NV_ENC_PIC_FLAG_OUTPUT_SPSPPS | NV_ENC_PIC_FLAG_FORCEIDR) : 0; + picParams.frameIdx = (uint32_t)enc.frameCount; + picParams.inputTimeStamp = enc.frameCount; + + nvst = enc.funcs.nvEncEncodePicture(enc.encoder, &picParams); + + enc.funcs.nvEncUnmapInputResource(enc.encoder, mapRes.mappedResource); + enc.funcs.nvEncUnregisterResource(enc.encoder, regRes.registeredResource); + cu->cuMemFree(linearBuf); + + if (nvst != NV_ENC_SUCCESS) { + HELPER_LOG("DMABUF: nvEncEncodePicture failed: %d", nvst); + goto dmabuf_cleanup; + } + + enc.frameCount++; + if (enc.frameCount % 300 == 0) { + HELPER_LOG("Encoded %lu frames (DMABUF)", (unsigned long)enc.frameCount); + } + + /* Lock and send bitstream */ + { + NV_ENC_LOCK_BITSTREAM lockOut = {0}; + lockOut.version = NV_ENC_LOCK_BITSTREAM_VER; + lockOut.outputBitstream = enc.outputBuffer; + nvst = enc.funcs.nvEncLockBitstream(enc.encoder, &lockOut); + if (nvst == NV_ENC_SUCCESS) { + send_response(client_fd, 0, lockOut.bitstreamBufferPtr, + lockOut.bitstreamSizeInBytes); + enc.funcs.nvEncUnlockBitstream(enc.encoder, enc.outputBuffer); + } else { + send_response(client_fd, -1, NULL, 0); + } + } + +dmabuf_cleanup: + for (int i = 0; i < 4; i++) { + if (mipmaps[i]) cu->cuMipmappedArrayDestroy(mipmaps[i]); + if (extMems[i]) cu->cuDestroyExternalMemory(extMems[i]); + } + break; + } + + case NVENC_IPC_CMD_ENCODE_SHM: { + if (!enc.initialized || shm_ptr == MAP_FAILED) { + /* Drain payload */ + if (hdr.payload_size > 0) { + void *tmp = malloc(hdr.payload_size); + if (tmp) { recv_all(client_fd, tmp, hdr.payload_size); free(tmp); } + } + send_response(client_fd, -1, NULL, 0); + break; + } + + NVEncIPCEncodeShmParams sp; + if (!recv_all(client_fd, &sp, sizeof(sp))) goto done; + + + /* Encode directly from shared memory — no socket data transfer */ + void *bitstream = NULL; + uint32_t bsSize = 0; + bool ok = encoder_encode(&enc, shm_ptr, sp.width, sp.height, + sp.frame_size, sp.force_idr, + &bitstream, &bsSize); + + + if (ok) { + 
send_response(client_fd, 0, bitstream, bsSize); + } else { + send_response(client_fd, -1, NULL, 0); + } + break; + } + + case NVENC_IPC_CMD_CLOSE: + encoder_close(&enc); + send_response(client_fd, 0, NULL, 0); + goto done; + + default: + HELPER_LOG("Unknown command: %u", hdr.cmd); + send_response(client_fd, -1, NULL, 0); + break; + } + } + +done: + if (enc.initialized) { + cu->cuCtxPushCurrent(enc.cudaCtx); + encoder_close(&enc); + cu->cuCtxPopCurrent(NULL); + } + if (shm_ptr != MAP_FAILED) { + munmap(shm_ptr, shm_size); + } + if (shm_fd >= 0) { + close(shm_fd); + } + close(client_fd); + HELPER_LOG("Client handler done"); +} + +static void sighandler(int sig) +{ + (void)sig; + running = 0; +} + +int main(int argc, char **argv) +{ + (void)argc; (void)argv; + + /* Always log to stderr — this is a daemon, logs are essential for diagnostics */ + log_enabled = 1; + + signal(SIGTERM, sighandler); + signal(SIGINT, sighandler); + signal(SIGPIPE, SIG_IGN); + + HELPER_LOG("Starting nvenc-helper (pid=%d)", getpid()); + + /* Load CUDA */ + if (cuda_load_functions(&cu, NULL) != 0 || cu == NULL) { + HELPER_LOG("Failed to load CUDA"); + return 1; + } + + CUresult cres = cu->cuInit(0); + if (cres != CUDA_SUCCESS) { + HELPER_LOG("cuInit failed: %d", cres); + cuda_free_functions(&cu); + return 1; + } + + /* Load NVENC */ + if (nvenc_load_functions(&nv_dl, NULL) != 0 || nv_dl == NULL) { + HELPER_LOG("Failed to load NVENC"); + cuda_free_functions(&cu); + return 1; + } + + HELPER_LOG("CUDA and NVENC loaded"); + + /* Create socket */ + char sock_path[256]; + if (!nvenc_ipc_get_socket_path(sock_path, sizeof(sock_path))) { + HELPER_LOG("Failed to get socket path"); + return 1; + } + + unlink(sock_path); /* Remove stale socket */ + + int listen_fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (listen_fd < 0) { + HELPER_LOG("socket: %s", strerror(errno)); + return 1; + } + + struct sockaddr_un addr; + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, sock_path, 
sizeof(addr.sun_path) - 1); + + mode_t old_umask = umask(0077); //socket created with 0700 permissions + if (bind(listen_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + HELPER_LOG("bind(%s): %s", sock_path, strerror(errno)); + umask(old_umask); + close(listen_fd); + return 1; + } + umask(old_umask); + + if (listen(listen_fd, 8) < 0) { + HELPER_LOG("listen: %s", strerror(errno)); + close(listen_fd); + unlink(sock_path); + return 1; + } + + HELPER_LOG("Listening on %s", sock_path); + + /* Accept loop — runs until SIGTERM/SIGINT */ + while (running) { + struct pollfd pfd = { .fd = listen_fd, .events = POLLIN }; + int ret = poll(&pfd, 1, -1); /* Block forever until connection or signal */ + + if (ret < 0) { + if (errno == EINTR) continue; + HELPER_LOG("poll: %s", strerror(errno)); + break; + } + + int client_fd = accept(listen_fd, NULL, NULL); + if (client_fd < 0) { + if (errno == EINTR) continue; + HELPER_LOG("accept: %s", strerror(errno)); + continue; /* Don't exit on accept error — keep listening */ + } + + /* Handle one client at a time (sufficient for Steam's single encode stream) */ + handle_client(client_fd); + HELPER_LOG("Ready for next client"); + } + + close(listen_fd); + unlink(sock_path); + nvenc_free_functions(&nv_dl); + cuda_free_functions(&cu); + HELPER_LOG("Exiting"); + return 0; +} diff --git a/src/nvenc-ipc-client.c b/src/nvenc-ipc-client.c new file mode 100644 index 00000000..e910e08e --- /dev/null +++ b/src/nvenc-ipc-client.c @@ -0,0 +1,379 @@ +#define _GNU_SOURCE +#include "nvenc-ipc.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Reliable send: loop until all bytes sent */ +static bool send_all(int fd, const void *buf, size_t len) +{ + const char *p = buf; + while (len > 0) { + ssize_t n = send(fd, p, len, MSG_NOSIGNAL); + if (n <= 0) { + if (n < 0 && errno == EINTR) continue; + return false; + } + p += n; + len -= (size_t)n; + } + return true; +} + +/* Reliable recv: loop until 
all bytes received */ +static bool recv_all(int fd, void *buf, size_t len) +{ + char *p = buf; + while (len > 0) { + ssize_t n = recv(fd, p, len, 0); + if (n <= 0) { + if (n < 0 && errno == EINTR) continue; + return false; + } + p += n; + len -= (size_t)n; + } + return true; +} + +bool nvenc_ipc_get_socket_path(char *buf, size_t bufsize) +{ + const char *runtime_dir = getenv("XDG_RUNTIME_DIR"); + if (runtime_dir == NULL) { + runtime_dir = "/tmp"; + } + int ret = snprintf(buf, bufsize, "%s/%s", runtime_dir, NVENC_IPC_SOCK_NAME); + return ret > 0 && (size_t)ret < bufsize; +} + +int nvenc_ipc_connect(void) +{ + char path[256]; + if (!nvenc_ipc_get_socket_path(path, sizeof(path))) { + return -1; + } + + int fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) { + return -1; + } + + struct sockaddr_un addr; + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1); + + if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + close(fd); + return -1; + } + + return fd; +} + +int nvenc_ipc_connect_or_start(const char *helper_path) +{ + /* Try connecting first */ + int fd = nvenc_ipc_connect(); + if (fd >= 0) { + return fd; + } + + /* Helper not running — start it */ + pid_t pid = fork(); + if (pid < 0) { + return -1; + } + + if (pid == 0) { + /* Child: exec the helper. + * Detach from parent's session so it survives parent exit. 
*/ + setsid(); + + /* Close inherited fds */ + for (int i = 3; i < 1024; i++) { + close(i); + } + + /* Redirect stdout/stderr to /dev/null unless NVD_LOG is set */ + if (getenv("NVD_LOG") == NULL) { + int devnull = open("/dev/null", O_WRONLY); + if (devnull >= 0) { + dup2(devnull, STDOUT_FILENO); + dup2(devnull, STDERR_FILENO); + close(devnull); + } + } + + execl(helper_path, helper_path, NULL); + _exit(127); + } + + /* Parent: wait for the helper to create the socket */ + for (int attempt = 0; attempt < 50; attempt++) { + usleep(100000); /* 100ms */ + fd = nvenc_ipc_connect(); + if (fd >= 0) { + return fd; + } + } + + /* Timed out — kill the child */ + kill(pid, SIGTERM); + waitpid(pid, NULL, 0); + return -1; +} + +/* Receive a single fd via SCM_RIGHTS */ +static int recv_fd(int sock, void *buf, size_t len) +{ + struct iovec iov = { .iov_base = buf, .iov_len = len }; + union { + char buf[CMSG_SPACE(sizeof(int))]; + struct cmsghdr align; + } cmsg_buf; + memset(&cmsg_buf, 0, sizeof(cmsg_buf)); + + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = cmsg_buf.buf, + .msg_controllen = sizeof(cmsg_buf.buf), + }; + + ssize_t n = recvmsg(sock, &msg, 0); + if (n != (ssize_t)len) return -1; + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + if (cmsg && cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) { + int received_fd = -1; + memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int)); + return received_fd; + } + return -1; +} + +int nvenc_ipc_init(int fd, const NVEncIPCInitParams *params, + int *shm_fd_out, uint32_t *shm_size_out) +{ + NVEncIPCMsgHeader hdr = { + .cmd = NVENC_IPC_CMD_INIT, + .payload_size = sizeof(*params) + }; + + if (!send_all(fd, &hdr, sizeof(hdr))) return -1; + if (!send_all(fd, params, sizeof(*params))) return -1; + + /* Response includes shm fd via SCM_RIGHTS + NVEncIPCInitResponse payload */ + NVEncIPCRespHeader resp; + NVEncIPCInitResponse init_resp = {0}; + + int shm_fd = recv_fd(fd, &resp, sizeof(resp)); + + if 
(resp.status != 0) { + if (shm_fd >= 0) close(shm_fd); + return resp.status; + } + + if (resp.payload_size >= sizeof(init_resp)) { + if (!recv_all(fd, &init_resp, sizeof(init_resp))) { + if (shm_fd >= 0) close(shm_fd); + return -1; + } + } + + if (shm_fd_out) { + *shm_fd_out = shm_fd; + } else if (shm_fd >= 0) { + close(shm_fd); + } + if (shm_size_out) *shm_size_out = init_resp.shm_size; + + return 0; +} + +int nvenc_ipc_encode(int fd, const void *frame_data, + uint32_t width, uint32_t height, uint32_t frame_size, + uint32_t force_idr, + void **bitstream_out, uint32_t *bitstream_size_out) +{ + NVEncIPCEncodeParams enc_params = { + .width = width, + .height = height, + .frame_size = frame_size, + .force_idr = force_idr, + }; + + NVEncIPCMsgHeader hdr = { + .cmd = NVENC_IPC_CMD_ENCODE, + .payload_size = sizeof(enc_params) + frame_size + }; + + if (!send_all(fd, &hdr, sizeof(hdr))) return -1; + if (!send_all(fd, &enc_params, sizeof(enc_params))) return -1; + if (!send_all(fd, frame_data, frame_size)) return -1; + + NVEncIPCRespHeader resp; + if (!recv_all(fd, &resp, sizeof(resp))) return -1; + + if (resp.status != 0) { + *bitstream_out = NULL; + *bitstream_size_out = 0; + return resp.status; + } + + if (resp.payload_size > 0) { + void *data = malloc(resp.payload_size); + if (data == NULL) return -1; + if (!recv_all(fd, data, resp.payload_size)) { + free(data); + return -1; + } + *bitstream_out = data; + *bitstream_size_out = resp.payload_size; + } else { + *bitstream_out = NULL; + *bitstream_size_out = 0; + } + + return 0; +} + +/* Send multiple DMA-BUF fds via SCM_RIGHTS ancillary data */ +static bool send_fds(int sock, const int *fds, int num_fds, const void *data, size_t len) +{ + struct iovec iov = { .iov_base = (void *)data, .iov_len = len }; + union { + char buf[CMSG_SPACE(sizeof(int) * 4)]; /* up to 4 fds */ + struct cmsghdr align; + } cmsg_buf; + memset(&cmsg_buf, 0, sizeof(cmsg_buf)); + + size_t fd_size = sizeof(int) * (size_t)num_fds; + struct msghdr msg = { 
+ .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = cmsg_buf.buf, + .msg_controllen = CMSG_SPACE(fd_size), + }; + + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(fd_size); + memcpy(CMSG_DATA(cmsg), fds, fd_size); + + ssize_t n = sendmsg(sock, &msg, MSG_NOSIGNAL); + return n == (ssize_t)len; +} + +int nvenc_ipc_encode_dmabuf(int fd, const int *dmabuf_fds, int num_fds, + const NVEncIPCEncodeDmaBufParams *params, + void **bitstream_out, uint32_t *bitstream_size_out) +{ + NVEncIPCMsgHeader hdr = { + .cmd = NVENC_IPC_CMD_ENCODE_DMABUF, + .payload_size = sizeof(*params) + }; + + /* Send the header normally */ + if (!send_all(fd, &hdr, sizeof(hdr))) return -1; + + /* Send the params WITH the fds attached via SCM_RIGHTS */ + if (!send_fds(fd, dmabuf_fds, num_fds, params, sizeof(*params))) return -1; + + /* Receive response */ + NVEncIPCRespHeader resp; + if (!recv_all(fd, &resp, sizeof(resp))) return -1; + + if (resp.status != 0) { + *bitstream_out = NULL; + *bitstream_size_out = 0; + return resp.status; + } + + if (resp.payload_size > 0) { + void *data = malloc(resp.payload_size); + if (data == NULL) return -1; + if (!recv_all(fd, data, resp.payload_size)) { + free(data); + return -1; + } + *bitstream_out = data; + *bitstream_size_out = resp.payload_size; + } else { + *bitstream_out = NULL; + *bitstream_size_out = 0; + } + + return 0; +} + +int nvenc_ipc_encode_shm(int fd, uint32_t width, uint32_t height, + uint32_t frame_size, uint32_t force_idr, + void **bitstream_out, uint32_t *bitstream_size_out) +{ + NVEncIPCEncodeShmParams sp = { + .width = width, + .height = height, + .frame_size = frame_size, + .force_idr = force_idr, + }; + + NVEncIPCMsgHeader hdr = { + .cmd = NVENC_IPC_CMD_ENCODE_SHM, + .payload_size = sizeof(sp) + }; + + /* Only send the small header + params — pixel data is already in shm */ + if (!send_all(fd, &hdr, sizeof(hdr))) return -1; + if (!send_all(fd, &sp, 
sizeof(sp))) return -1; + + NVEncIPCRespHeader resp; + if (!recv_all(fd, &resp, sizeof(resp))) return -1; + + if (resp.status != 0) { + *bitstream_out = NULL; + *bitstream_size_out = 0; + return resp.status; + } + + if (resp.payload_size > 0) { + void *data = malloc(resp.payload_size); + if (data == NULL) return -1; + if (!recv_all(fd, data, resp.payload_size)) { + free(data); + return -1; + } + *bitstream_out = data; + *bitstream_size_out = resp.payload_size; + } else { + *bitstream_out = NULL; + *bitstream_size_out = 0; + } + + return 0; +} + +void nvenc_ipc_close(int fd) +{ + NVEncIPCMsgHeader hdr = { + .cmd = NVENC_IPC_CMD_CLOSE, + .payload_size = 0 + }; + /* Best-effort send; ignore errors since we're closing anyway */ + send_all(fd, &hdr, sizeof(hdr)); + + NVEncIPCRespHeader resp; + recv_all(fd, &resp, sizeof(resp)); + + close(fd); +} diff --git a/src/nvenc-ipc.h b/src/nvenc-ipc.h new file mode 100644 index 00000000..e4532bbc --- /dev/null +++ b/src/nvenc-ipc.h @@ -0,0 +1,139 @@ +#ifndef NVENC_IPC_H +#define NVENC_IPC_H + +#include +#include +#include + +/* + * IPC protocol between the VA-API driver and the 64-bit NVENC helper. + * + * When CUDA is unavailable (e.g. 32-bit process on Blackwell GPUs where + * cuInit fails), the driver delegates encoding to a 64-bit helper process + * via a Unix domain socket. On systems where CUDA works, the driver uses + * NVENC directly without the helper. + * + * Socket path: /run/user//nvenc-helper.sock + * + * All integers are in host byte order (both processes are on the same machine). + * Messages are: header + payload. Responses are: header + payload. 
+ */ + +#define NVENC_IPC_SOCK_NAME "nvenc-helper.sock" + +/* Maximum frame size we'll accept over the socket (64MB, enough for 8K NV12) */ +#define NVENC_IPC_MAX_FRAME_SIZE (64 * 1024 * 1024) + +/* Commands */ +#define NVENC_IPC_CMD_INIT 1 /* Initialize encoder */ +#define NVENC_IPC_CMD_ENCODE 2 /* Encode a frame (host pixel data) */ +#define NVENC_IPC_CMD_CLOSE 3 /* Close encoder and disconnect */ +#define NVENC_IPC_CMD_ENCODE_DMABUF 4 /* Encode from DMA-BUF fd (GPU zero-copy) */ +#define NVENC_IPC_CMD_ENCODE_SHM 5 /* Encode from shared memory (zero-copy host) */ + +/* Message header (client → helper) */ +typedef struct { + uint32_t cmd; + uint32_t payload_size; +} NVEncIPCMsgHeader; + +/* Response header (helper → client) */ +typedef struct { + int32_t status; /* 0 = success, <0 = error code */ + uint32_t payload_size; /* size of following data */ +} NVEncIPCRespHeader; + +/* CMD_INIT payload */ +typedef struct { + uint32_t width; + uint32_t height; + uint32_t codec; /* 0 = H.264, 1 = HEVC */ + uint32_t profile; /* VA-API profile value */ + uint32_t frameRateNum; + uint32_t frameRateDen; + uint32_t bitrate; + uint32_t maxBitrate; + uint32_t gopLength; + uint32_t is10bit; /* 0 = 8-bit NV12, 1 = 10-bit P010 */ +} NVEncIPCInitParams; + +/* CMD_ENCODE payload header (followed by frame_size bytes of NV12/P010 data) */ +typedef struct { + uint32_t width; + uint32_t height; + uint32_t frame_size; /* total bytes of pixel data */ + uint32_t force_idr; /* 1 = force IDR keyframe */ +} NVEncIPCEncodeParams; + +/* CMD_ENCODE_DMABUF payload. + * Multiple DMA-BUF fds (one per plane) sent via SCM_RIGHTS ancillary data. + * For NV12: 2 fds (Y plane, UV plane). 
*/ +typedef struct { + uint32_t width; + uint32_t height; + uint32_t pitches[4]; /* stride per plane */ + uint32_t offsets[4]; /* offset per plane */ + uint32_t sizes[4]; /* memory size per plane */ + uint32_t num_planes; + uint32_t bppc; /* bytes per pixel per channel */ + uint32_t is10bit; +} NVEncIPCEncodeDmaBufParams; + +/* CMD_INIT response includes a shm fd via SCM_RIGHTS. + * The shm region is large enough for one NV12/P010 frame. */ +typedef struct { + uint32_t shm_size; /* size of the shared memory region */ +} NVEncIPCInitResponse; + +/* CMD_ENCODE_SHM payload (frame data is already in shared memory) */ +typedef struct { + uint32_t width; + uint32_t height; + uint32_t frame_size; + uint32_t force_idr; +} NVEncIPCEncodeShmParams; + +/* IPC client functions (used by the driver when CUDA is unavailable) */ + +/* Get the socket path for this user */ +bool nvenc_ipc_get_socket_path(char *buf, size_t bufsize); + +/* Try to connect to the helper. Returns socket fd or -1. */ +int nvenc_ipc_connect(void); + +/* Start the helper if not running, then connect. Returns socket fd or -1. */ +int nvenc_ipc_connect_or_start(const char *helper_path); + +/* Send init command. Returns 0 on success. + * If shm_fd_out is non-NULL, receives the shared memory fd from the helper. + * If shm_size_out is non-NULL, receives the shm region size. */ +int nvenc_ipc_init(int fd, const NVEncIPCInitParams *params, + int *shm_fd_out, uint32_t *shm_size_out); + +/* Send frame data and receive encoded bitstream. + * bitstream_out is malloc'd by this function, caller must free. + * Returns 0 on success. */ +int nvenc_ipc_encode(int fd, const void *frame_data, + uint32_t width, uint32_t height, uint32_t frame_size, + uint32_t force_idr, + void **bitstream_out, uint32_t *bitstream_size_out); + +/* Send DMA-BUF fd and receive encoded bitstream (GPU zero-copy path). + * The fd is sent via SCM_RIGHTS ancillary data. + * bitstream_out is malloc'd by this function, caller must free. 
+ * Returns 0 on success. */ +int nvenc_ipc_encode_dmabuf(int fd, const int *dmabuf_fds, int num_fds, + const NVEncIPCEncodeDmaBufParams *params, + void **bitstream_out, uint32_t *bitstream_size_out); + +/* Encode from shared memory — frame data already written to shm. + * Only sends a small header, no pixel data over the socket. + * Returns 0 on success. */ +int nvenc_ipc_encode_shm(int fd, uint32_t width, uint32_t height, + uint32_t frame_size, uint32_t force_idr, + void **bitstream_out, uint32_t *bitstream_size_out); + +/* Send close command and close the socket. */ +void nvenc_ipc_close(int fd); + +#endif /* NVENC_IPC_H */ diff --git a/src/nvenc.c b/src/nvenc.c new file mode 100644 index 00000000..d946c2d9 --- /dev/null +++ b/src/nvenc.c @@ -0,0 +1,436 @@ +#include "nvenc.h" +#include "vabackend.h" + +#include +#include + +static bool check_nvenc_status(NVENCSTATUS status, const char *func, int line) +{ + if (status != NV_ENC_SUCCESS) { + LOG("NVENC error %d at %s:%d", status, func, line); + return false; + } + return true; +} +#define CHECK_NVENC(status) check_nvenc_status(status, __func__, __LINE__) + +bool nvenc_load(NvencFunctions **nvenc_dl) +{ + int ret = nvenc_load_functions(nvenc_dl, NULL); + if (ret != 0) { + LOG("Failed to load NVENC functions (libnvidia-encode.so)"); + *nvenc_dl = NULL; + return false; + } + //version format: API returns (major << 4 | minor) + uint32_t maxVersion = 0; + NVENCSTATUS st = (*nvenc_dl)->NvEncodeAPIGetMaxSupportedVersion(&maxVersion); + if (st != NV_ENC_SUCCESS) { + LOG("NvEncodeAPIGetMaxSupportedVersion failed: %d", st); + nvenc_free_functions(nvenc_dl); + *nvenc_dl = NULL; + return false; + } + uint32_t currentVersion = (NVENCAPI_MAJOR_VERSION << 4) | NVENCAPI_MINOR_VERSION; + LOG("NVENC max supported version: %u.%u, header version: %u.%u", + maxVersion >> 4, maxVersion & 0xf, + NVENCAPI_MAJOR_VERSION, NVENCAPI_MINOR_VERSION); + + if (currentVersion > maxVersion) { + LOG("NVENC header version (%u) is newer than driver 
supports (%u)", + currentVersion, maxVersion); + nvenc_free_functions(nvenc_dl); + *nvenc_dl = NULL; + return false; + } + return true; +} + +void nvenc_unload(NvencFunctions **nvenc_dl) +{ + if (*nvenc_dl != NULL) { + nvenc_free_functions(nvenc_dl); + *nvenc_dl = NULL; + } +} + +bool nvenc_open_session(NVENCContext *nvencCtx, NvencFunctions *nvenc_dl, CUcontext cudaCtx) +{ + /* Fill function list */ + nvencCtx->funcs.version = NV_ENCODE_API_FUNCTION_LIST_VER; + NVENCSTATUS st = nvenc_dl->NvEncodeAPICreateInstance(&nvencCtx->funcs); + if (!CHECK_NVENC(st)) { + return false; + } + + /* Open encode session */ + NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS sessionParams = {0}; + sessionParams.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER; + sessionParams.deviceType = NV_ENC_DEVICE_TYPE_CUDA; + sessionParams.device = cudaCtx; + sessionParams.apiVersion = NVENCAPI_VERSION; + + st = nvencCtx->funcs.nvEncOpenEncodeSessionEx(&sessionParams, &nvencCtx->encoder); + if (!CHECK_NVENC(st)) { + nvencCtx->encoder = NULL; + return false; + } + + LOG("NVENC session opened: %p", nvencCtx->encoder); + return true; +} + +void nvenc_close_session(NVENCContext *nvencCtx) +{ + if (nvencCtx->encoder == NULL) { + return; + } + + /* Send EOS to flush encoder before freeing any buffers */ + if (nvencCtx->initialized) { + NV_ENC_PIC_PARAMS picParams = {0}; + picParams.version = NV_ENC_PIC_PARAMS_VER; + picParams.encodePicFlags = NV_ENC_PIC_FLAG_EOS; + nvencCtx->funcs.nvEncEncodePicture(nvencCtx->encoder, &picParams); + } + + /* Free output buffer after flush */ + nvenc_free_output_buffer(nvencCtx); + + /* Destroy encoder */ + NVENCSTATUS st = nvencCtx->funcs.nvEncDestroyEncoder(nvencCtx->encoder); + if (st != NV_ENC_SUCCESS) { + LOG("nvEncDestroyEncoder failed: %d", st); + } + + LOG("NVENC session closed"); + nvencCtx->encoder = NULL; + nvencCtx->initialized = false; +} + +bool nvenc_init_encoder(NVENCContext *nvencCtx, uint32_t width, uint32_t height, + GUID codecGuid, GUID profileGuid, GUID 
presetGuid, + NV_ENC_TUNING_INFO tuningInfo) +{ + NVENCSTATUS st; + + nvencCtx->codecGuid = codecGuid; + nvencCtx->profileGuid = profileGuid; + nvencCtx->width = width; + nvencCtx->height = height; + + //get preset config + NV_ENC_PRESET_CONFIG presetConfig = {0}; + presetConfig.version = NV_ENC_PRESET_CONFIG_VER; + presetConfig.presetCfg.version = NV_ENC_CONFIG_VER; + + st = nvencCtx->funcs.nvEncGetEncodePresetConfigEx( + nvencCtx->encoder, codecGuid, presetGuid, tuningInfo, &presetConfig); + if (!CHECK_NVENC(st)) { + return false; + } + + //apply overrides + memcpy(&nvencCtx->encodeConfig, &presetConfig.presetCfg, sizeof(NV_ENC_CONFIG)); + nvencCtx->encodeConfig.encodeCodecConfig.hevcConfig.pixelBitDepthMinus8 = nvencCtx->inputFormat == NV_ENC_BUFFER_FORMAT_YUV420_10BIT? 2: 0; + nvencCtx->encodeConfig.version = NV_ENC_CONFIG_VER; + nvencCtx->encodeConfig.profileGUID = profileGuid; + + if (nvencCtx->rcMode != 0) { + nvencCtx->encodeConfig.rcParams.rateControlMode = (NV_ENC_PARAMS_RC_MODE)nvencCtx->rcMode; + } + if (nvencCtx->bitrate > 0) { + nvencCtx->encodeConfig.rcParams.averageBitRate = nvencCtx->bitrate; + } + if (nvencCtx->maxBitrate > 0) { + nvencCtx->encodeConfig.rcParams.maxBitRate = nvencCtx->maxBitrate; + } + if (nvencCtx->vbvBufferSize > 0) { + nvencCtx->encodeConfig.rcParams.vbvBufferSize = nvencCtx->vbvBufferSize; + } + if (nvencCtx->vbvInitialDelay > 0) { + nvencCtx->encodeConfig.rcParams.vbvInitialDelay = nvencCtx->vbvInitialDelay; + } + + if (nvencCtx->intraPeriod > 0) { + nvencCtx->encodeConfig.gopLength = nvencCtx->intraPeriod; + } + //no B-frames: NVENC needs DPB management or returns NEED_MORE_INPUT which ffmpeg 6.x can't handle + nvencCtx->encodeConfig.frameIntervalP = 1; + + memset(&nvencCtx->initParams, 0, sizeof(nvencCtx->initParams)); + nvencCtx->initParams.version = NV_ENC_INITIALIZE_PARAMS_VER; + nvencCtx->initParams.encodeGUID = codecGuid; + nvencCtx->initParams.presetGUID = presetGuid; + nvencCtx->initParams.encodeWidth = width; + 
nvencCtx->initParams.encodeHeight = height; + nvencCtx->initParams.darWidth = width; + nvencCtx->initParams.darHeight = height; + nvencCtx->initParams.frameRateNum = nvencCtx->frameRateNum > 0 ? nvencCtx->frameRateNum : 30; + nvencCtx->initParams.frameRateDen = nvencCtx->frameRateDen > 0 ? nvencCtx->frameRateDen : 1; + nvencCtx->initParams.enablePTD = 1; + nvencCtx->initParams.encodeConfig = &nvencCtx->encodeConfig; + nvencCtx->initParams.maxEncodeWidth = width; + nvencCtx->initParams.maxEncodeHeight = height; + nvencCtx->initParams.tuningInfo = tuningInfo; + + st = nvencCtx->funcs.nvEncInitializeEncoder(nvencCtx->encoder, &nvencCtx->initParams); + if (!CHECK_NVENC(st)) { + return false; + } + + nvencCtx->initialized = true; + LOG("NVENC encoder initialized: %ux%u codec=%s", + width, height, + memcmp(&codecGuid, &NV_ENC_CODEC_H264_GUID, sizeof(GUID)) == 0 ? "H.264" : "HEVC"); + + return true; +} + +bool nvenc_alloc_output_buffer(NVENCContext *nvencCtx) +{ + if (nvencCtx->outputBuffer.allocated) { + return true; + } + + NV_ENC_CREATE_BITSTREAM_BUFFER createBuf = {0}; + createBuf.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER; + + NVENCSTATUS st = nvencCtx->funcs.nvEncCreateBitstreamBuffer( + nvencCtx->encoder, &createBuf); + if (!CHECK_NVENC(st)) { + return false; + } + + nvencCtx->outputBuffer.bitstreamBuffer = createBuf.bitstreamBuffer; + nvencCtx->outputBuffer.allocated = true; + nvencCtx->outputBuffer.locked = false; + nvencCtx->outputBuffer.lockedPtr = NULL; + nvencCtx->outputBuffer.lockedSize = 0; + + return true; +} + +void nvenc_free_output_buffer(NVENCContext *nvencCtx) +{ + if (!nvencCtx->outputBuffer.allocated || nvencCtx->encoder == NULL) { + return; + } + + /* Unlock if still locked */ + if (nvencCtx->outputBuffer.locked) { + nvenc_unlock_bitstream(nvencCtx); + } + + nvencCtx->funcs.nvEncDestroyBitstreamBuffer( + nvencCtx->encoder, nvencCtx->outputBuffer.bitstreamBuffer); + nvencCtx->outputBuffer.bitstreamBuffer = NULL; + nvencCtx->outputBuffer.allocated 
= false; +} + +bool nvenc_register_cuda_resource(NVENCContext *nvencCtx, CUdeviceptr devPtr, + uint32_t width, uint32_t height, uint32_t pitch, + NV_ENC_BUFFER_FORMAT format, + NV_ENC_REGISTERED_PTR *outRegistered) +{ + NV_ENC_REGISTER_RESOURCE regRes = {0}; + regRes.version = NV_ENC_REGISTER_RESOURCE_VER; + regRes.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR; + regRes.resourceToRegister = (void*)devPtr; + regRes.width = width; + regRes.height = height; + regRes.pitch = pitch; + regRes.bufferFormat = format; + regRes.bufferUsage = NV_ENC_INPUT_IMAGE; + + NVENCSTATUS st = nvencCtx->funcs.nvEncRegisterResource( + nvencCtx->encoder, ®Res); + if (!CHECK_NVENC(st)) { + return false; + } + + *outRegistered = regRes.registeredResource; + return true; +} + +bool nvenc_map_resource(NVENCContext *nvencCtx, NV_ENC_REGISTERED_PTR registered, + NV_ENC_INPUT_PTR *outMapped, NV_ENC_BUFFER_FORMAT *outFmt) +{ + NV_ENC_MAP_INPUT_RESOURCE mapRes = {0}; + mapRes.version = NV_ENC_MAP_INPUT_RESOURCE_VER; + mapRes.registeredResource = registered; + + NVENCSTATUS st = nvencCtx->funcs.nvEncMapInputResource( + nvencCtx->encoder, &mapRes); + if (!CHECK_NVENC(st)) { + return false; + } + + *outMapped = mapRes.mappedResource; + if (outFmt) { + *outFmt = mapRes.mappedBufferFmt; + } + return true; +} + +bool nvenc_unmap_resource(NVENCContext *nvencCtx, NV_ENC_INPUT_PTR mapped) +{ + NVENCSTATUS st = nvencCtx->funcs.nvEncUnmapInputResource( + nvencCtx->encoder, mapped); + return CHECK_NVENC(st); +} + +bool nvenc_unregister_resource(NVENCContext *nvencCtx, NV_ENC_REGISTERED_PTR registered) +{ + NVENCSTATUS st = nvencCtx->funcs.nvEncUnregisterResource( + nvencCtx->encoder, registered); + return CHECK_NVENC(st); +} + +/* + * Encode a frame. 
Returns: + * 1 = encoded successfully, output available + * 0 = needs more input (B-frame buffering), no output yet + * -1 = error + */ +int nvenc_encode_frame(NVENCContext *nvencCtx, NV_ENC_INPUT_PTR inputBuffer, + NV_ENC_BUFFER_FORMAT bufferFmt, + uint32_t inputWidth, uint32_t inputHeight, uint32_t inputPitch, + NV_ENC_PIC_TYPE picType, uint32_t picFlags) +{ + if (!nvencCtx->outputBuffer.allocated) { + if (!nvenc_alloc_output_buffer(nvencCtx)) { + return -1; + } + } + + NV_ENC_PIC_PARAMS picParams = {0}; + picParams.version = NV_ENC_PIC_PARAMS_VER; + picParams.inputBuffer = inputBuffer; + picParams.bufferFmt = bufferFmt; + picParams.inputWidth = inputWidth; + picParams.inputHeight = inputHeight; + picParams.inputPitch = inputPitch; + picParams.outputBitstream = nvencCtx->outputBuffer.bitstreamBuffer; + picParams.pictureStruct = NV_ENC_PIC_STRUCT_FRAME; + picParams.pictureType = picType; + picParams.encodePicFlags = picFlags; + picParams.frameIdx = (uint32_t)nvencCtx->frameCount; + picParams.inputTimeStamp = nvencCtx->frameCount; + + NVENCSTATUS st = nvencCtx->funcs.nvEncEncodePicture( + nvencCtx->encoder, &picParams); + + nvencCtx->frameCount++; + + if (st == NV_ENC_ERR_NEED_MORE_INPUT) { + /* B-frame reordering: NVENC needs more frames before producing output */ + return 0; + } + if (st != NV_ENC_SUCCESS) { + LOG("nvEncEncodePicture failed: %d", st); + return -1; + } + + return 1; +} + +bool nvenc_lock_bitstream(NVENCContext *nvencCtx, void **outPtr, uint32_t *outSize) +{ + NV_ENC_LOCK_BITSTREAM lockParams = {0}; + lockParams.version = NV_ENC_LOCK_BITSTREAM_VER; + lockParams.outputBitstream = nvencCtx->outputBuffer.bitstreamBuffer; + lockParams.doNotWait = 0; + + NVENCSTATUS st = nvencCtx->funcs.nvEncLockBitstream( + nvencCtx->encoder, &lockParams); + if (!CHECK_NVENC(st)) { + return false; + } + + *outPtr = lockParams.bitstreamBufferPtr; + *outSize = lockParams.bitstreamSizeInBytes; + nvencCtx->outputBuffer.locked = true; + nvencCtx->outputBuffer.lockedPtr = 
lockParams.bitstreamBufferPtr; + nvencCtx->outputBuffer.lockedSize = lockParams.bitstreamSizeInBytes; + + return true; +} + +bool nvenc_unlock_bitstream(NVENCContext *nvencCtx) +{ + if (!nvencCtx->outputBuffer.locked) { + return true; + } + + NVENCSTATUS st = nvencCtx->funcs.nvEncUnlockBitstream( + nvencCtx->encoder, nvencCtx->outputBuffer.bitstreamBuffer); + nvencCtx->outputBuffer.locked = false; + nvencCtx->outputBuffer.lockedPtr = NULL; + nvencCtx->outputBuffer.lockedSize = 0; + + return CHECK_NVENC(st); +} + +/* Profile/entrypoint helpers */ + +bool nvenc_is_encode_profile(VAProfile profile) +{ + switch (profile) { + case VAProfileH264ConstrainedBaseline: + case VAProfileH264Main: + case VAProfileH264High: + case VAProfileHEVCMain: + case VAProfileHEVCMain10: + return true; + default: + return false; + } +} + +GUID nvenc_va_profile_to_codec_guid(VAProfile profile) +{ + switch (profile) { + case VAProfileH264ConstrainedBaseline: + case VAProfileH264Main: + case VAProfileH264High: + return NV_ENC_CODEC_H264_GUID; + case VAProfileHEVCMain: + case VAProfileHEVCMain10: + return NV_ENC_CODEC_HEVC_GUID; + default: { + GUID empty = {0}; + return empty; + } + } +} + +GUID nvenc_va_profile_to_profile_guid(VAProfile profile) +{ + switch (profile) { + case VAProfileH264ConstrainedBaseline: + return NV_ENC_H264_PROFILE_BASELINE_GUID; + case VAProfileH264Main: + return NV_ENC_H264_PROFILE_MAIN_GUID; + case VAProfileH264High: + return NV_ENC_H264_PROFILE_HIGH_GUID; + case VAProfileHEVCMain: + return NV_ENC_HEVC_PROFILE_MAIN_GUID; + case VAProfileHEVCMain10: + return NV_ENC_HEVC_PROFILE_MAIN10_GUID; + default: { + GUID empty = {0}; + return empty; + } + } +} + +NV_ENC_BUFFER_FORMAT nvenc_surface_format(VAProfile profile) +{ + switch (profile) { + case VAProfileHEVCMain10: + return NV_ENC_BUFFER_FORMAT_YUV420_10BIT; + default: + return NV_ENC_BUFFER_FORMAT_NV12; + } +} diff --git a/src/nvenc.h b/src/nvenc.h new file mode 100644 index 00000000..dccbb4ab --- /dev/null +++ 
b/src/nvenc.h @@ -0,0 +1,108 @@ +#ifndef NVENC_H +#define NVENC_H + +#include +#include +#include +#include +#include +#include "vabackend.h" + +// Encode-specific context, stored in NVContext->encodeData +// when created with VAEntrypointEncSlice. + +typedef struct { + NV_ENC_OUTPUT_PTR bitstreamBuffer; + bool allocated; + void *lockedPtr; //locked bitstream pointer + uint32_t lockedSize; + bool locked; +} NVENCOutputBuffer; + +typedef struct { + void *encoder; //NVENC session handle + NV_ENCODE_API_FUNCTION_LIST funcs; + bool initialized; + GUID codecGuid; + GUID profileGuid; + NV_ENC_CONFIG encodeConfig; + NV_ENC_INITIALIZE_PARAMS initParams; + uint32_t width; + uint32_t height; + NV_ENC_BUFFER_FORMAT inputFormat; + bool seqParamSet; + uint32_t rcMode; //VA-API rate control mode + uint32_t bitrate; //bits/sec + uint32_t maxBitrate; + uint32_t frameRateNum; + uint32_t frameRateDen; + uint32_t intraPeriod; //GOP length + uint32_t ipPeriod; + uint32_t vbvBufferSize; //HRD buffer size (bits) + uint32_t vbvInitialDelay; //HRD initial fullness (bits) + uint64_t frameCount; + NVENCOutputBuffer outputBuffer; + VABufferID currentCodedBufId; + bool forceIDR; //from idr_pic_flag + NV_ENC_PIC_TYPE picType; //from slice params + bool useIPC; //encode via 64-bit helper + int ipcFd; //socket fd, -1 if not connected + void *shmPtr; //mmap'd shared memory for frame data + uint32_t shmSize; + int shmFd; +} NVENCContext; + +// Wraps VACodedBufferSegment with NVENC bitstream storage +typedef struct { + VACodedBufferSegment segment; + void *bitstreamData; + uint32_t bitstreamSize; + uint32_t bitstreamAlloc; + bool hasData; +} NVCodedBuffer; + +bool nvenc_load(NvencFunctions **nvenc_dl); +void nvenc_unload(NvencFunctions **nvenc_dl); + +bool nvenc_open_session(NVENCContext *nvencCtx, NvencFunctions *nvenc_dl, CUcontext cudaCtx); +void nvenc_close_session(NVENCContext *nvencCtx); + +bool nvenc_init_encoder(NVENCContext *nvencCtx, uint32_t width, uint32_t height, + GUID codecGuid, GUID 
profileGuid, GUID presetGuid, + NV_ENC_TUNING_INFO tuningInfo); + +bool nvenc_alloc_output_buffer(NVENCContext *nvencCtx); +void nvenc_free_output_buffer(NVENCContext *nvencCtx); + +bool nvenc_register_cuda_resource(NVENCContext *nvencCtx, CUdeviceptr devPtr, + uint32_t width, uint32_t height, uint32_t pitch, + NV_ENC_BUFFER_FORMAT format, + NV_ENC_REGISTERED_PTR *outRegistered); +bool nvenc_map_resource(NVENCContext *nvencCtx, NV_ENC_REGISTERED_PTR registered, + NV_ENC_INPUT_PTR *outMapped, NV_ENC_BUFFER_FORMAT *outFmt); +bool nvenc_unmap_resource(NVENCContext *nvencCtx, NV_ENC_INPUT_PTR mapped); +bool nvenc_unregister_resource(NVENCContext *nvencCtx, NV_ENC_REGISTERED_PTR registered); + +int nvenc_encode_frame(NVENCContext *nvencCtx, NV_ENC_INPUT_PTR inputBuffer, + NV_ENC_BUFFER_FORMAT bufferFmt, + uint32_t inputWidth, uint32_t inputHeight, uint32_t inputPitch, + NV_ENC_PIC_TYPE picType, uint32_t picFlags); + +bool nvenc_lock_bitstream(NVENCContext *nvencCtx, void **outPtr, uint32_t *outSize); +bool nvenc_unlock_bitstream(NVENCContext *nvencCtx); + +bool nvenc_is_encode_profile(VAProfile profile); +GUID nvenc_va_profile_to_codec_guid(VAProfile profile); +GUID nvenc_va_profile_to_profile_guid(VAProfile profile); +NV_ENC_BUFFER_FORMAT nvenc_surface_format(VAProfile profile); + +void h264enc_handle_sequence_params(NVENCContext *ctx, NVBuffer *buf); +void h264enc_handle_picture_params(NVENCContext *ctx, NVBuffer *buf); +void h264enc_handle_slice_params(NVENCContext *ctx, NVBuffer *buf); +void h264enc_handle_misc_params(NVENCContext *ctx, NVBuffer *buf); +void hevcenc_handle_sequence_params(NVENCContext *ctx, NVBuffer *buf); +void hevcenc_handle_picture_params(NVENCContext *ctx, NVBuffer *buf); +void hevcenc_handle_slice_params(NVENCContext *ctx, NVBuffer *buf); +void hevcenc_handle_misc_params(NVENCContext *ctx, NVBuffer *buf); + +#endif // NVENC_H diff --git a/src/vabackend.c b/src/vabackend.c index fb964f50..581210c3 100644 --- a/src/vabackend.c +++ 
b/src/vabackend.c @@ -2,6 +2,9 @@ #include "vabackend.h" #include "backend-common.h" +#include "nvenc.h" +#include "nvenc-ipc.h" + #include #include @@ -11,6 +14,7 @@ #include #include #include +#include #include #include @@ -67,6 +71,8 @@ static uint32_t max_instances; static CudaFunctions *cu; static CuvidFunctions *cv; +static NvencFunctions *nv; +static bool cudaInitSuccess; extern const NVCodec __start_nvd_codecs[]; extern const NVCodec __stop_nvd_codecs[]; @@ -164,12 +170,22 @@ static void init() { return; } + /* Load NVENC functions (optional — encoding won't work without it but decode still will) */ + if (!nvenc_load(&nv)) { + LOG("NVENC not available, encoding support disabled"); + /* nv is already NULL from nvenc_load on failure */ + } + //Not really much we can do here to abort the loading of the library - CHECK_CUDA_RESULT(cu->cuInit(0)); + cudaInitSuccess = !CHECK_CUDA_RESULT(cu->cuInit(0)); + if (!cudaInitSuccess) { + LOG("CUDA init failed — encode-only mode via IPC helper"); + } } __attribute__ ((destructor)) static void cleanup() { + nvenc_unload(&nv); if (cv != NULL) { cuvid_free_functions(&cv); } @@ -318,7 +334,34 @@ static void deleteObject(NVDriver *drv, VAGenericID id) { } static bool destroyContext(NVDriver *drv, NVContext *nvCtx) { - CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), false); + if (drv->cudaAvailable) { + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), false); + } + + if (nvCtx->isEncode) { + /* Encode context cleanup */ + NVENCContext *nvencCtx = (NVENCContext*) nvCtx->encodeData; + if (nvencCtx != NULL) { + if (nvencCtx->useIPC) { + if (nvencCtx->shmPtr != NULL) { + munmap(nvencCtx->shmPtr, nvencCtx->shmSize); + nvencCtx->shmPtr = NULL; + } + if (nvencCtx->ipcFd >= 0) { + nvenc_ipc_close(nvencCtx->ipcFd); + nvencCtx->ipcFd = -1; + } + } else { + nvenc_close_session(nvencCtx); + } + free(nvencCtx); + nvCtx->encodeData = NULL; + } + if (drv->cudaAvailable) { + 
CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), false); + } + return true; + } LOG("Signaling resolve thread to exit"); struct timespec timeout; @@ -607,30 +650,31 @@ static VAStatus nvQueryConfigEntrypoints( int *num_entrypoints /* out */ ) { - entrypoint_list[0] = VAEntrypointVLD; - *num_entrypoints = 1; + NVDriver *drv = (NVDriver*) ctx->pDriverData; + int count = 0; + + /* Decode entrypoint — supported for all profiles that have a codec (requires CUDA) */ + if (drv->cudaAvailable && vaToCuCodec(profile) != cudaVideoCodec_NONE) { + entrypoint_list[count++] = VAEntrypointVLD; + } + + /* Encode entrypoint — supported for H.264 and HEVC if NVENC is available */ + if (drv->nvencAvailable && nvenc_is_encode_profile(profile)) { + entrypoint_list[count++] = VAEntrypointEncSlice; + } + + *num_entrypoints = count; return VA_STATUS_SUCCESS; } -static VAStatus nvGetConfigAttributes( - VADriverContextP ctx, +static void nvGetConfigAttributesDecode( + NVDriver *drv, VAProfile profile, - VAEntrypoint entrypoint, - VAConfigAttrib *attrib_list, /* in/out */ + VAConfigAttrib *attrib_list, int num_attribs ) { - if (entrypoint != VAEntrypointVLD) { - return VA_STATUS_ERROR_UNSUPPORTED_ENTRYPOINT; - } - - NVDriver *drv = (NVDriver*) ctx->pDriverData; - if (vaToCuCodec(profile) == cudaVideoCodec_NONE) { - return VA_STATUS_ERROR_UNSUPPORTED_PROFILE; - } - //LOG("Got here with profile: %d == %d", profile, vaToCuCodec(profile)); - for (int i = 0; i < num_attribs; i++) { if (attrib_list[i].type == VAConfigAttribRTFormat) @@ -683,6 +727,81 @@ static VAStatus nvGetConfigAttributes( LOG("unhandled config attribute: %d", attrib_list[i].type); } } +} + +static void nvGetConfigAttributesEncode( + VAProfile profile, + VAConfigAttrib *attrib_list, + int num_attribs + ) +{ + for (int i = 0; i < num_attribs; i++) + { + switch (attrib_list[i].type) { + case VAConfigAttribRTFormat: + attrib_list[i].value = VA_RT_FORMAT_YUV420; + if (profile == VAProfileHEVCMain10) { + attrib_list[i].value |= 
VA_RT_FORMAT_YUV420_10; + } + break; + case VAConfigAttribRateControl: + attrib_list[i].value = VA_RC_CQP | VA_RC_CBR | VA_RC_VBR; + break; + case VAConfigAttribEncPackedHeaders: + //accept all packed header types; NVENC generates its own but + //apps (Steam) expect the driver to accept them without warning + attrib_list[i].value = VA_ENC_PACKED_HEADER_SEQUENCE + | VA_ENC_PACKED_HEADER_PICTURE + | VA_ENC_PACKED_HEADER_SLICE + | VA_ENC_PACKED_HEADER_MISC; + break; + case VAConfigAttribEncMaxRefFrames: + /* NVENC supports multiple reference frames; report a safe value */ + attrib_list[i].value = 1 | (1 << 16); /* 1 L0, 1 L1 */ + break; + case VAConfigAttribMaxPictureWidth: + attrib_list[i].value = 4096; + break; + case VAConfigAttribMaxPictureHeight: + attrib_list[i].value = 4096; + break; + case VAConfigAttribEncQualityRange: + attrib_list[i].value = 7; //NVENC presets P1-P7 + break; + default: + attrib_list[i].value = VA_ATTRIB_NOT_SUPPORTED; + break; + } + } +} + +static VAStatus nvGetConfigAttributes( + VADriverContextP ctx, + VAProfile profile, + VAEntrypoint entrypoint, + VAConfigAttrib *attrib_list, /* in/out */ + int num_attribs + ) +{ + NVDriver *drv = (NVDriver*) ctx->pDriverData; + + if (entrypoint == VAEntrypointEncSlice) { + if (!drv->nvencAvailable || !nvenc_is_encode_profile(profile)) { + return VA_STATUS_ERROR_UNSUPPORTED_ENTRYPOINT; + } + nvGetConfigAttributesEncode(profile, attrib_list, num_attribs); + return VA_STATUS_SUCCESS; + } + + if (entrypoint != VAEntrypointVLD) { + return VA_STATUS_ERROR_UNSUPPORTED_ENTRYPOINT; + } + + if (vaToCuCodec(profile) == cudaVideoCodec_NONE) { + return VA_STATUS_ERROR_UNSUPPORTED_PROFILE; + } + + nvGetConfigAttributesDecode(drv, profile, attrib_list, num_attribs); return VA_STATUS_SUCCESS; } @@ -697,6 +816,28 @@ static VAStatus nvCreateConfig( ) { NVDriver *drv = (NVDriver*) ctx->pDriverData; + + if (entrypoint == VAEntrypointEncSlice) { + /* Encode config */ + if (!drv->nvencAvailable || 
!nvenc_is_encode_profile(profile)) { + LOG("Encode not supported for profile: %d", profile); + return VA_STATUS_ERROR_UNSUPPORTED_PROFILE; + } + Object obj = allocateObject(drv, OBJECT_TYPE_CONFIG, sizeof(NVConfig)); + NVConfig *cfg = (NVConfig*) obj->obj; + cfg->profile = profile; + cfg->entrypoint = entrypoint; + cfg->isEncode = true; + cfg->cudaCodec = cudaVideoCodec_NONE; + cfg->chromaFormat = cudaVideoChromaFormat_420; + cfg->bitDepth = (profile == VAProfileHEVCMain10) ? 10 : 8; + cfg->surfaceFormat = (profile == VAProfileHEVCMain10) + ? cudaVideoSurfaceFormat_P016 + : cudaVideoSurfaceFormat_NV12; + *config_id = obj->id; + return VA_STATUS_SUCCESS; + } + //LOG("got profile: %d with %d attributes", profile, num_attribs); cudaVideoCodec cudaCodec = vaToCuCodec(profile); @@ -867,6 +1008,20 @@ static VAStatus nvQueryConfigAttributes( *profile = cfg->profile; *entrypoint = cfg->entrypoint; + + /* Encode config attributes */ + if (cfg->isEncode) { + int i = 0; + attrib_list[i].type = VAConfigAttribRTFormat; + attrib_list[i].value = VA_RT_FORMAT_YUV420; + if (cfg->profile == VAProfileHEVCMain10) { + attrib_list[i].value |= VA_RT_FORMAT_YUV420_10; + } + i++; + *num_attribs = i; + return VA_STATUS_SUCCESS; + } + int i = 0; attrib_list[i].value = VA_RT_FORMAT_YUV420; attrib_list[i].type = VAConfigAttribRTFormat; @@ -922,6 +1077,35 @@ static VAStatus nvCreateSurfaces2( { NVDriver *drv = (NVDriver*) ctx->pDriverData; + /* Log surface attributes for diagnostics */ + uint32_t memType = VA_SURFACE_ATTRIB_MEM_TYPE_VA; + VASurfaceAttribExternalBuffers *extBuf = NULL; + for (unsigned int a = 0; a < num_attribs; a++) { + LOG("Surface attrib[%u]: type=%d, flags=0x%x, value_type=%d", + a, attrib_list[a].type, attrib_list[a].flags, + attrib_list[a].value.type); + if (attrib_list[a].type == VASurfaceAttribMemoryType && + attrib_list[a].value.type == VAGenericValueTypeInteger) { + memType = attrib_list[a].value.value.i; + LOG(" MemoryType: 0x%x", memType); + } + if 
(attrib_list[a].type == VASurfaceAttribExternalBufferDescriptor && + attrib_list[a].value.type == VAGenericValueTypePointer) { + extBuf = (VASurfaceAttribExternalBuffers*)attrib_list[a].value.value.p; + if (extBuf) { + LOG(" ExternalBuffers: %ux%u fmt=0x%x planes=%u bufs=%u size=%u", + extBuf->width, extBuf->height, extBuf->pixel_format, + extBuf->num_planes, extBuf->num_buffers, extBuf->data_size); + for (unsigned int b = 0; b < extBuf->num_buffers && b < 4; b++) { + LOG(" buffer[%u] = %lu (fd or ptr)", b, (unsigned long)extBuf->buffers[b]); + } + for (unsigned int p = 0; p < extBuf->num_planes && p < 4; p++) { + LOG(" plane[%u]: pitch=%u offset=%u", p, extBuf->pitches[p], extBuf->offsets[p]); + } + } + } + } + cudaVideoSurfaceFormat nvFormat; cudaVideoChromaFormat chromaFormat; int bitdepth; @@ -978,7 +1162,9 @@ static VAStatus nvCreateSurfaces2( break; } - CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); + if (drv->cudaAvailable) { + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); + } for (uint32_t i = 0; i < num_surfaces; i++) { Object surfaceObject = allocateObject(drv, OBJECT_TYPE_SURFACE, sizeof(NVSurface)); @@ -991,13 +1177,47 @@ static VAStatus nvCreateSurfaces2( suf->bitDepth = bitdepth; suf->context = NULL; suf->chromaFormat = chromaFormat; + suf->hostPixelData = NULL; + suf->hostPixelSize = 0; + suf->importedDmaBufFd = -1; + suf->importedNumPlanes = 0; + suf->importedDataSize = 0; pthread_mutex_init(&suf->mutex, NULL); pthread_cond_init(&suf->cond, NULL); - LOG("Creating surface %ux%u, format %X (%p)", width, height, format, suf); + /* Store imported DMA-BUF if provided via external buffer attribs */ + if (extBuf != NULL && extBuf->num_buffers > 0) { + /* DRM_PRIME: buffers[] contains DMA-BUF fds. + * dup() the fd so the surface owns its own copy. 
*/ + if (memType & (VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME | VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2)) { + int srcFd = (int)extBuf->buffers[i < extBuf->num_buffers ? i : 0]; + suf->importedDmaBufFd = dup(srcFd); + suf->importedNumPlanes = extBuf->num_planes; + suf->importedDataSize = extBuf->data_size; + for (uint32_t p = 0; p < extBuf->num_planes && p < 4; p++) { + suf->importedPitches[p] = extBuf->pitches[p]; + suf->importedOffsets[p] = extBuf->offsets[p]; + } + LOG(" Surface %u: imported DMA-BUF fd=%d (dup of %d), size=%u", + i, suf->importedDmaBufFd, srcFd, suf->importedDataSize); + } + } + + /* In IPC encode-only mode, eagerly allocate the backing image now + * so the surface has GPU memory that can be exported via DMA-BUF. + * Steam's OpenGL capture needs to render into these surfaces BEFORE + * the encode begins. Without early allocation, the surface is empty. */ + if (!drv->cudaAvailable && drv->backend != NULL) { + drv->backend->realiseSurface(drv, suf); + } + + LOG("Creating surface %ux%u, format %X (%p) dmabuf=%d backing=%p", + width, height, format, suf, suf->importedDmaBufFd, suf->backingImage); } - CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); + if (drv->cudaAvailable) { + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); + } return VA_STATUS_SUCCESS; } @@ -1031,7 +1251,19 @@ static VAStatus nvDestroySurfaces( LOG("Destroying surface %d (%p)", surface->pictureIdx, surface); - drv->backend->detachBackingImageFromSurface(drv, surface); + if (!surface->hostPixelIsShm) { + free(surface->hostPixelData); + } + surface->hostPixelData = NULL; + + if (surface->importedDmaBufFd >= 0) { + close(surface->importedDmaBufFd); + surface->importedDmaBufFd = -1; + } + + if (drv->backend != NULL) { + drv->backend->detachBackingImageFromSurface(drv, surface); + } deleteObject(drv, surface_list[i]); } @@ -1057,7 +1289,68 @@ static VAStatus nvCreateContext( return VA_STATUS_ERROR_INVALID_CONFIG; } - 
LOG("Creating context with %d render targets, at %dx%d", num_render_targets, picture_width, picture_height); + LOG("Creating context with %d render targets, at %dx%d (encode=%d)", + num_render_targets, picture_width, picture_height, cfg->isEncode); + + /* Encode context path */ + if (cfg->isEncode) { + NVENCContext *nvencCtx = (NVENCContext*) calloc(1, sizeof(NVENCContext)); + if (nvencCtx == NULL) { + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + + nvencCtx->width = picture_width; + nvencCtx->height = picture_height; + nvencCtx->inputFormat = nvenc_surface_format(cfg->profile); + nvencCtx->frameRateNum = 30; + nvencCtx->frameRateDen = 1; + nvencCtx->ipcFd = -1; + nvencCtx->shmPtr = NULL; + nvencCtx->shmSize = 0; + nvencCtx->shmFd = -1; + + if (drv->cudaAvailable) { + /* Direct NVENC path (64-bit, CUDA works) */ + if (CHECK_CUDA_RESULT(cu->cuCtxPushCurrent(drv->cudaContext))) { + free(nvencCtx); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + + if (!nvenc_open_session(nvencCtx, drv->nv, drv->cudaContext)) { + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + free(nvencCtx); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + + if (CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL))) { + nvenc_close_session(nvencCtx); + free(nvencCtx); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + nvencCtx->useIPC = false; + } else { + /* IPC path: CUDA unavailable (e.g. 32-bit on Blackwell). + * Encoding delegated to 64-bit nvenc-helper via Unix socket. 
*/ + LOG("Using IPC encode path (CUDA unavailable)"); + nvencCtx->useIPC = true; + } + + Object contextObj = allocateObject(drv, OBJECT_TYPE_CONTEXT, sizeof(NVContext)); + NVContext *nvCtx = (NVContext*) contextObj->obj; + nvCtx->drv = drv; + nvCtx->profile = cfg->profile; + nvCtx->entrypoint = cfg->entrypoint; + nvCtx->width = picture_width; + nvCtx->height = picture_height; + nvCtx->isEncode = true; + nvCtx->encodeData = nvencCtx; + nvCtx->decoder = NULL; + nvCtx->codec = NULL; + + *context = contextObj->id; + LOG("Created encode context id: %d, ipc=%d", contextObj->id, nvencCtx->useIPC); + return VA_STATUS_SUCCESS; + } //find the codec they've selected const NVCodec *selectedCodec = NULL; @@ -1214,6 +1507,35 @@ static VAStatus nvCreateBuffer( return VA_STATUS_ERROR_INVALID_CONTEXT; } + /* Coded buffer for encoding: allocate NVCodedBuffer */ + if (type == VAEncCodedBufferType) { + NVCodedBuffer *coded = (NVCodedBuffer*) calloc(1, sizeof(NVCodedBuffer)); + if (coded == NULL) { + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + + /* Pre-allocate the bitstream storage */ + coded->bitstreamAlloc = size; /* size requested by app is the max coded size */ + coded->bitstreamData = malloc(size); + if (coded->bitstreamData == NULL) { + free(coded); + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + coded->hasData = false; + + Object bufferObject = allocateObject(drv, OBJECT_TYPE_BUFFER, sizeof(NVBuffer)); + *buf_id = bufferObject->id; + + NVBuffer *buf = (NVBuffer*) bufferObject->obj; + buf->bufferType = type; + buf->elements = 1; + buf->size = sizeof(NVCodedBuffer); + buf->ptr = coded; + buf->offset = 0; + + return VA_STATUS_SUCCESS; + } + //HACK: This is an awful hack to support VP8 videos when running within FFMPEG. //VA-API doesn't pass enough information for NVDEC to work with, but the information is there //just before the start of the buffer that was passed to us. 
@@ -1266,10 +1588,34 @@ static VAStatus nvMapBuffer( NVDriver *drv = (NVDriver*) ctx->pDriverData; NVBuffer *buf = getObjectPtr(drv, OBJECT_TYPE_BUFFER, buf_id); - if (buf == NULL) { + if (buf == NULL || buf->ptr == NULL) { return VA_STATUS_ERROR_INVALID_BUFFER; } + /* Coded buffer: return pointer to VACodedBufferSegment */ + if (buf->bufferType == VAEncCodedBufferType) { + NVCodedBuffer *coded = (NVCodedBuffer*) buf->ptr; + if (coded->hasData) { + coded->segment.size = coded->bitstreamSize; + coded->segment.bit_offset = 0; + coded->segment.status = 0; + coded->segment.reserved = 0; + coded->segment.buf = coded->bitstreamData; + coded->segment.next = NULL; + *pbuf = &coded->segment; + } else { + /* No data yet — return empty segment */ + coded->segment.size = 0; + coded->segment.bit_offset = 0; + coded->segment.status = 0; + coded->segment.reserved = 0; + coded->segment.buf = NULL; + coded->segment.next = NULL; + *pbuf = &coded->segment; + } + return VA_STATUS_SUCCESS; + } + *pbuf = buf->ptr; return VA_STATUS_SUCCESS; @@ -1296,6 +1642,12 @@ static VAStatus nvDestroyBuffer( } if (buf->ptr != NULL) { + /* Free coded buffer internals before freeing the NVCodedBuffer itself */ + if (buf->bufferType == VAEncCodedBufferType) { + NVCodedBuffer *coded = (NVCodedBuffer*) buf->ptr; + free(coded->bitstreamData); + coded->bitstreamData = NULL; + } free(buf->ptr); } @@ -1322,6 +1674,13 @@ static VAStatus nvBeginPicture( return VA_STATUS_ERROR_INVALID_SURFACE; } + /* Encode path: just record the render target */ + if (nvCtx->isEncode) { + nvCtx->renderTarget = surface; + surface->context = nvCtx; + return VA_STATUS_SUCCESS; + } + if (surface->context != NULL && surface->context != nvCtx) { //this surface was last used on a different context, we need to free up the backing image (it might not be the correct size) if (surface->backingImage != NULL) { @@ -1356,6 +1715,55 @@ static VAStatus nvBeginPicture( return VA_STATUS_SUCCESS; } +static void nvRenderPictureEncode(NVContext 
*nvCtx, NVBuffer *buf) +{ + NVENCContext *nvencCtx = (NVENCContext*) nvCtx->encodeData; + bool isH264 = (nvCtx->profile == VAProfileH264ConstrainedBaseline || + nvCtx->profile == VAProfileH264Main || + nvCtx->profile == VAProfileH264High); + + switch (buf->bufferType) { + case VAEncSequenceParameterBufferType: + if (isH264) { + h264enc_handle_sequence_params(nvencCtx, buf); + } else { + hevcenc_handle_sequence_params(nvencCtx, buf); + } + break; + case VAEncPictureParameterBufferType: + if (isH264) { + h264enc_handle_picture_params(nvencCtx, buf); + } else { + hevcenc_handle_picture_params(nvencCtx, buf); + } + break; + case VAEncSliceParameterBufferType: + if (isH264) { + h264enc_handle_slice_params(nvencCtx, buf); + } else { + hevcenc_handle_slice_params(nvencCtx, buf); + } + break; + case VAEncMiscParameterBufferType: + if (isH264) { + h264enc_handle_misc_params(nvencCtx, buf); + } else { + hevcenc_handle_misc_params(nvencCtx, buf); + } + break; + case VAEncCodedBufferType: + /* Coded buffer is handled at EndPicture */ + break; + case VAEncPackedHeaderParameterBufferType: + case VAEncPackedHeaderDataBufferType: + /* Packed headers: NVENC generates its own headers, skip these */ + break; + default: + LOG("Encode: unhandled buffer type: %d", buf->bufferType); + break; + } +} + static VAStatus nvRenderPicture( VADriverContextP ctx, VAContextID context, @@ -1370,14 +1778,19 @@ static VAStatus nvRenderPicture( return VA_STATUS_ERROR_INVALID_CONTEXT; } - CUVIDPICPARAMS *picParams = &nvCtx->pPicParams; - for (int i = 0; i < num_buffers; i++) { NVBuffer *buf = (NVBuffer*) getObjectPtr(drv, OBJECT_TYPE_BUFFER, buffers[i]); if (buf == NULL || buf->ptr == NULL) { LOG("Invalid buffer detected, skipping: %d", buffers[i]); continue; } + + if (nvCtx->isEncode) { + nvRenderPictureEncode(nvCtx, buf); + continue; + } + + CUVIDPICPARAMS *picParams = &nvCtx->pPicParams; HandlerFunc func = nvCtx->codec->handlers[buf->bufferType]; if (func != NULL) { func(nvCtx, buf, picParams); @@ 
-1389,6 +1802,450 @@ static VAStatus nvRenderPicture( return VA_STATUS_SUCCESS; } +static VAStatus nvEndPictureEncodeIPC(NVDriver *drv, NVContext *nvCtx); + +static VAStatus nvEndPictureEncode(NVDriver *drv, NVContext *nvCtx) +{ + NVENCContext *nvencCtx = (NVENCContext*) nvCtx->encodeData; + NVSurface *surface = nvCtx->renderTarget; + + if (nvencCtx == NULL) { + return VA_STATUS_ERROR_INVALID_CONTEXT; + } + + /* IPC path: delegate to 64-bit helper */ + if (nvencCtx->useIPC) { + return nvEndPictureEncodeIPC(drv, nvCtx); + } + + if (nvencCtx->encoder == NULL) { + return VA_STATUS_ERROR_INVALID_CONTEXT; + } + + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); + + /* Initialize encoder on first frame (we now have all params from sequence/picture buffers) */ + if (!nvencCtx->initialized) { + GUID codecGuid = nvenc_va_profile_to_codec_guid(nvCtx->profile); + GUID profileGuid = nvenc_va_profile_to_profile_guid(nvCtx->profile); + + if (!nvenc_init_encoder(nvencCtx, nvencCtx->width, nvencCtx->height, + codecGuid, profileGuid, + NV_ENC_PRESET_P4_GUID, + NV_ENC_TUNING_INFO_LOW_LATENCY)) { + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + + if (!nvenc_alloc_output_buffer(nvencCtx)) { + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + } + + /* Realise the surface so we have a backing image with CUDA memory */ + if (!drv->backend->realiseSurface(drv, surface)) { + LOG("Encode: failed to realise input surface"); + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + + BackingImage *img = surface->backingImage; + if (img == NULL) { + LOG("Encode: surface has no backing image"); + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + + /* + * The backing image contains CUarray(s) for each plane. + * NVENC needs a linear CUdeviceptr. 
We need to allocate a linear buffer,
+     * copy the CUarray contents into it, then register with NVENC.
+     *
+     * Use surface dimensions for the copy (the CUarray matches the surface).
+     * NVENC width/height may differ due to MB/CTU alignment.
+     */
+    uint32_t surfWidth = surface->width;
+    uint32_t surfHeight = surface->height;
+    uint32_t encWidth = nvencCtx->width;
+    uint32_t encHeight = nvencCtx->height;
+    NV_ENC_BUFFER_FORMAT encFmt = nvencCtx->inputFormat;
+
+    /* Calculate pitch and size for NV12/P010 linear buffer.
+     * Allocate for the full encode height (may be larger than surface due to alignment)
+     * but only copy surfHeight rows from the CUarray. */
+    uint32_t bytesPerPixel = (encFmt == NV_ENC_BUFFER_FORMAT_YUV420_10BIT) ? 2 : 1;
+    uint32_t pitch = encWidth * bytesPerPixel;
+    /* Align pitch to 256 bytes for NVENC */
+    pitch = (pitch + 255) & ~255;
+    uint32_t lumaSize = pitch * encHeight;
+    uint32_t chromaSize = pitch * (encHeight / 2);
+    uint32_t totalSize = lumaSize + chromaSize;
+
+    CUdeviceptr linearBuffer = 0;
+    CUresult cuRes = cu->cuMemAlloc(&linearBuffer, totalSize);
+    if (cuRes != CUDA_SUCCESS) {
+        LOG("Encode: failed to allocate linear buffer (%u bytes): %d", totalSize, cuRes);
+        CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL));
+        return VA_STATUS_ERROR_ALLOCATION_FAILED;
+    }
+
+    /* Zero the buffer so padded rows are clean */
+    cu->cuMemsetD8Async(linearBuffer, 0, totalSize, 0);
+
+    /* Copy luma plane from CUarray to linear buffer */
+    CUDA_MEMCPY2D copy = {0};
+    copy.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+    copy.srcArray = img->arrays[0];
+    copy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    copy.dstDevice = linearBuffer;
+    copy.dstPitch = pitch;
+    copy.WidthInBytes = surfWidth * bytesPerPixel;
+    copy.Height = surfHeight;
+
+    cuRes = cu->cuMemcpy2D(&copy);
+    if (cuRes != CUDA_SUCCESS) {
+        LOG("Encode: luma copy failed: %d (surface=%ux%u, pitch=%u)", cuRes, surfWidth, surfHeight, pitch);
+        cu->cuMemFree(linearBuffer);
+        CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL));
+        
return VA_STATUS_ERROR_OPERATION_FAILED;
+    }
+
+    /* Copy chroma plane (interleaved UV) */
+    memset(&copy, 0, sizeof(copy));
+    copy.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+    copy.srcArray = img->arrays[1];
+    copy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    copy.dstDevice = linearBuffer + lumaSize;
+    copy.dstPitch = pitch;
+    /* Chroma plane: each pixel has 2 channels (U,V) interleaved */
+    copy.WidthInBytes = surfWidth * bytesPerPixel;
+    copy.Height = surfHeight / 2;
+
+    cuRes = cu->cuMemcpy2D(&copy);
+    if (cuRes != CUDA_SUCCESS) {
+        LOG("Encode: chroma copy failed: %d", cuRes);
+        cu->cuMemFree(linearBuffer);
+        CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL));
+        return VA_STATUS_ERROR_OPERATION_FAILED;
+    }
+
+    /* Register the linear buffer with NVENC */
+    NV_ENC_REGISTERED_PTR registeredRes = NULL;
+    if (!nvenc_register_cuda_resource(nvencCtx, linearBuffer,
+                                      encWidth, encHeight, pitch,
+                                      encFmt, &registeredRes)) {
+        cu->cuMemFree(linearBuffer);
+        CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL));
+        return VA_STATUS_ERROR_OPERATION_FAILED;
+    }
+
+    /* Map the registered resource */
+    NV_ENC_INPUT_PTR mappedResource = NULL;
+    NV_ENC_BUFFER_FORMAT mappedFmt = encFmt;
+    if (!nvenc_map_resource(nvencCtx, registeredRes, &mappedResource, &mappedFmt)) {
+        nvenc_unregister_resource(nvencCtx, registeredRes);
+        cu->cuMemFree(linearBuffer);
+        CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL));
+        return VA_STATUS_ERROR_OPERATION_FAILED;
+    }
+
+    /* Encode the frame.
+     * Use only OUTPUT_SPSPPS on the first frame; after that let NVENC handle it. */
+    uint32_t picFlags = (nvencCtx->frameCount == 0 || nvencCtx->forceIDR)
+                        ? 
(NV_ENC_PIC_FLAG_OUTPUT_SPSPPS | NV_ENC_PIC_FLAG_FORCEIDR) + : 0; + nvencCtx->forceIDR = false; + int encResult = nvenc_encode_frame(nvencCtx, mappedResource, mappedFmt, + encWidth, encHeight, pitch, + nvencCtx->picType, picFlags); + + /* Unmap and unregister regardless of encode result */ + nvenc_unmap_resource(nvencCtx, mappedResource); + nvenc_unregister_resource(nvencCtx, registeredRes); + cu->cuMemFree(linearBuffer); + + if (encResult < 0) { + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_ENCODING_ERROR; + } + + /* Find the coded buffer */ + NVBuffer *codedBuf = (NVBuffer*) getObjectPtr(drv, OBJECT_TYPE_BUFFER, + nvencCtx->currentCodedBufId); + + if (encResult == 0) { + /* NVENC needs more input (B-frame reordering). Mark coded buffer as empty. */ + if (codedBuf != NULL && codedBuf->ptr != NULL) { + NVCodedBuffer *coded = (NVCodedBuffer*) codedBuf->ptr; + coded->bitstreamSize = 0; + coded->hasData = false; + } + LOG("Encode: frame %lu buffered (needs more input)", + (unsigned long)(nvencCtx->frameCount - 1)); + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); + return VA_STATUS_SUCCESS; + } + + /* Lock bitstream and copy into the coded buffer */ + void *bitstreamPtr = NULL; + uint32_t bitstreamSize = 0; + if (!nvenc_lock_bitstream(nvencCtx, &bitstreamPtr, &bitstreamSize)) { + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_ENCODING_ERROR; + } + + if (codedBuf != NULL && codedBuf->ptr != NULL) { + NVCodedBuffer *coded = (NVCodedBuffer*) codedBuf->ptr; + /* Grow the buffer if needed */ + if (bitstreamSize > coded->bitstreamAlloc) { + void *newBuf = realloc(coded->bitstreamData, bitstreamSize); + if (newBuf != NULL) { + coded->bitstreamData = newBuf; + coded->bitstreamAlloc = bitstreamSize; + } else { + LOG("Encode: failed to grow coded buffer to %u bytes", bitstreamSize); + nvenc_unlock_bitstream(nvencCtx); + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return 
VA_STATUS_ERROR_ALLOCATION_FAILED; + } + } + memcpy(coded->bitstreamData, bitstreamPtr, bitstreamSize); + coded->bitstreamSize = bitstreamSize; + coded->hasData = true; + LOG("Encode: frame %lu encoded, %u bytes", + (unsigned long)(nvencCtx->frameCount - 1), bitstreamSize); + } else { + LOG("Encode: WARNING - no coded buffer found for id %d", nvencCtx->currentCodedBufId); + } + + nvenc_unlock_bitstream(nvencCtx); + + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); + return VA_STATUS_SUCCESS; +} + +/* IPC encode path: send frame data to 64-bit helper, receive bitstream */ +static VAStatus nvEndPictureEncodeIPC(NVDriver *drv, NVContext *nvCtx) +{ + NVENCContext *nvencCtx = (NVENCContext*) nvCtx->encodeData; + NVSurface *surface = nvCtx->renderTarget; + + (void)drv; + + /* Connect to helper on first use */ + if (nvencCtx->ipcFd < 0) { + /* Try connecting to an already-running helper first, then start one */ + static const char *helper_paths[] = { + "/usr/libexec/nvenc-helper", + "/usr/local/libexec/nvenc-helper", + "/usr/lib/nvidia-vaapi-driver/nvenc-helper", + NULL + }; + nvencCtx->ipcFd = nvenc_ipc_connect(); + if (nvencCtx->ipcFd < 0) { + for (int pi = 0; helper_paths[pi] != NULL; pi++) { + if (access(helper_paths[pi], X_OK) == 0) { + LOG("IPC encode: starting helper: %s", helper_paths[pi]); + nvencCtx->ipcFd = nvenc_ipc_connect_or_start(helper_paths[pi]); + if (nvencCtx->ipcFd >= 0) break; + } + } + } + if (nvencCtx->ipcFd < 0) { + LOG("IPC encode: failed to connect to nvenc-helper (is it installed?)"); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + LOG("IPC encode: connected to nvenc-helper (fd=%d)", nvencCtx->ipcFd); + } + + /* Initialize encoder via IPC on first frame */ + if (!nvencCtx->initialized) { + bool isH264 = (nvCtx->profile == VAProfileH264ConstrainedBaseline || + nvCtx->profile == VAProfileH264Main || + nvCtx->profile == VAProfileH264High); + NVEncIPCInitParams params = { + .width = nvencCtx->width, + .height 
= nvencCtx->height,
+            .codec = isH264 ? 0 : 1,
+            .profile = (uint32_t)nvCtx->profile,
+            .frameRateNum = nvencCtx->frameRateNum,
+            .frameRateDen = nvencCtx->frameRateDen,
+            .bitrate = nvencCtx->bitrate,
+            .maxBitrate = nvencCtx->maxBitrate,
+            .gopLength = nvencCtx->intraPeriod,
+            .is10bit = (nvencCtx->inputFormat == NV_ENC_BUFFER_FORMAT_YUV420_10BIT) ? 1 : 0,
+        };
+
+        int shm_fd = -1;
+        uint32_t shm_size = 0;
+        if (nvenc_ipc_init(nvencCtx->ipcFd, &params, &shm_fd, &shm_size) != 0) {
+            LOG("IPC encode: init failed");
+            return VA_STATUS_ERROR_OPERATION_FAILED;
+        }
+        nvencCtx->initialized = true;
+
+        /* Map shared memory if the helper provided one */
+        if (shm_fd >= 0 && shm_size > 0) {
+            nvencCtx->shmPtr = mmap(NULL, shm_size, PROT_READ | PROT_WRITE,
+                                    MAP_SHARED, shm_fd, 0);
+            if (nvencCtx->shmPtr == MAP_FAILED) {
+                nvencCtx->shmPtr = NULL;
+                LOG("IPC encode: shm mmap failed, falling back to socket");
+            } else {
+                nvencCtx->shmSize = shm_size;
+                nvencCtx->shmFd = shm_fd;
+
+                /* Redirect the surface's hostPixelData to the SHM region.
+                 * This eliminates the memcpy in EndPicture — Steam writes
+                 * directly to shared memory via vaDeriveImage → vaMapBuffer.
+                 * The helper reads from the same physical pages. Zero copy. */
+                if (surface->hostPixelSize <= shm_size) {
+                    if (!surface->hostPixelIsShm) {
+                        free(surface->hostPixelData);
+                    }
+                    surface->hostPixelData = nvencCtx->shmPtr;
+                    surface->hostPixelSize = shm_size;
+                    surface->hostPixelIsShm = true;
+                    LOG("IPC encode: shm zero-copy enabled, %u bytes", shm_size);
+                } else {
+                    LOG("IPC encode: shm enabled (copy mode), %u bytes", shm_size);
+                }
+            }
+            close(shm_fd); /* mmap keeps the mapping alive after close */
+        }
+
+        LOG("IPC encode: encoder initialized %ux%u shm=%s",
+            params.width, params.height, nvencCtx->shmPtr ? "yes" : "no");
+    }
+
+    /* Encode via IPC.
+ * Priority: 1) Host pixel data from vaDeriveImage/vaPutImage (has actual captured pixels) + * 2) DRM-backed surface via NVIDIA opaque fds (GPU zero-copy) + * Host data takes priority because vaDeriveImage is how Steam writes captured + * frames — the GPU surface may exist but not contain the capture. */ + void *bitstream = NULL; + uint32_t bsSize = 0; + int ret; + int dmabuf_fds[4] = {-1, -1, -1, -1}; + int num_dmabuf_fds = 0; + NVEncIPCEncodeDmaBufParams dp = {0}; + bool useDmaBuf = false; + bool useHostData = false; + + /* Prefer host pixel data if available (written by vaDeriveImage → vaMapBuffer) */ + if (surface->hostPixelData != NULL && surface->hostPixelSize > 0) { + useHostData = true; + } else if (surface->backingImage != NULL && surface->backingImage->nvFds[0] > 0) { + /* DRM-backed surface: send per-plane NVIDIA opaque fds to helper. + * The helper imports each into CUDA (cuImportExternalMemory with + * OPAQUE_FD), maps to CUarray, copies to linear buffer, encodes. + * We dup() the fds because CUDA takes ownership on import. */ + BackingImage *img = surface->backingImage; + const NVFormatInfo *fmtInfo = &formatsInfo[img->format]; + dp.width = surface->width; + dp.height = surface->height; + dp.num_planes = fmtInfo->numPlanes; + dp.bppc = fmtInfo->bppc; + dp.is10bit = (nvencCtx->inputFormat == NV_ENC_BUFFER_FORMAT_YUV420_10BIT) ? 1 : 0; + for (uint32_t p = 0; p < fmtInfo->numPlanes && p < 4; p++) { + dmabuf_fds[p] = dup(img->nvFds[p]); + dp.pitches[p] = img->strides[p]; + dp.offsets[p] = 0; + dp.sizes[p] = img->memorySizes[p]; + } + num_dmabuf_fds = (int)fmtInfo->numPlanes; + useDmaBuf = true; + } + + if (useHostData) { + /* Host memory path: pixel data from vaDeriveImage/vaPutImage. + * IMPORTANT: use the SURFACE dimensions (e.g. 1920x1080), not the + * encoder dimensions (e.g. 1920x1088). */ + uint32_t surfW = surface->width; + uint32_t surfH = surface->height; + uint32_t frameSize = surface->hostPixelSize; + uint32_t forceIDR = nvencCtx->forceIDR ? 
1 : 0; + nvencCtx->forceIDR = false; + + if (nvencCtx->shmPtr != NULL && frameSize <= nvencCtx->shmSize) { + /* SHM path: if hostPixelData IS the shm (zero-copy), skip memcpy. + * Otherwise copy frame to shared memory. */ + if (surface->hostPixelData != nvencCtx->shmPtr) { + memcpy(nvencCtx->shmPtr, surface->hostPixelData, frameSize); + } + if (nvencCtx->frameCount < 3) { + LOG("IPC encode: SHM path %ux%u %u bytes", surfW, surfH, frameSize); + } + ret = nvenc_ipc_encode_shm(nvencCtx->ipcFd, surfW, surfH, + frameSize, forceIDR, + &bitstream, &bsSize); + } else { + /* Socket fallback: snapshot + full send */ + void *snapshot = malloc(frameSize); + if (snapshot == NULL) { + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + memcpy(snapshot, surface->hostPixelData, frameSize); + if (nvencCtx->frameCount < 3) { + LOG("IPC encode: SOCKET path %ux%u %u bytes", surfW, surfH, frameSize); + } + ret = nvenc_ipc_encode(nvencCtx->ipcFd, snapshot, + surfW, surfH, frameSize, forceIDR, + &bitstream, &bsSize); + free(snapshot); + } + } else if (useDmaBuf) { + if (nvencCtx->frameCount < 3) { + LOG("IPC encode: DMABUF planes=%d fds=[%d,%d] %ux%u pitch=%u sizes=[%u,%u]", + num_dmabuf_fds, dmabuf_fds[0], dmabuf_fds[1], + dp.width, dp.height, dp.pitches[0], dp.sizes[0], dp.sizes[1]); + } + ret = nvenc_ipc_encode_dmabuf(nvencCtx->ipcFd, dmabuf_fds, num_dmabuf_fds, + &dp, &bitstream, &bsSize); + } else { + LOG("IPC encode: surface has no pixel data (no DMA-BUF, no host data)"); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + + if (ret != 0) { + LOG("IPC encode: encode failed (ret=%d)", ret); + return VA_STATUS_ERROR_ENCODING_ERROR; + } + + /* Copy bitstream into coded buffer */ + NVBuffer *codedBuf = (NVBuffer*) getObjectPtr(drv, OBJECT_TYPE_BUFFER, + nvencCtx->currentCodedBufId); + if (codedBuf != NULL && codedBuf->ptr != NULL) { + NVCodedBuffer *coded = (NVCodedBuffer*) codedBuf->ptr; + if (bsSize > coded->bitstreamAlloc) { + void *newBuf = realloc(coded->bitstreamData, bsSize); + if 
(newBuf != NULL) { + coded->bitstreamData = newBuf; + coded->bitstreamAlloc = bsSize; + } else { + free(bitstream); + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + } + memcpy(coded->bitstreamData, bitstream, bsSize); + coded->bitstreamSize = bsSize; + coded->hasData = true; + if (nvencCtx->frameCount < 5 || nvencCtx->frameCount % 300 == 0) { + unsigned char *bs = (unsigned char *)coded->bitstreamData; + LOG("IPC encode: frame %lu, %u bytes, first4=[%02x %02x %02x %02x]", + (unsigned long)nvencCtx->frameCount, bsSize, + bsSize > 0 ? bs[0] : 0, bsSize > 1 ? bs[1] : 0, + bsSize > 2 ? bs[2] : 0, bsSize > 3 ? bs[3] : 0); + } + } + + free(bitstream); + nvencCtx->frameCount++; + + return VA_STATUS_SUCCESS; +} + static VAStatus nvEndPicture( VADriverContextP ctx, VAContextID context @@ -1397,7 +2254,16 @@ static VAStatus nvEndPicture( NVDriver *drv = (NVDriver*) ctx->pDriverData; NVContext *nvCtx = (NVContext*) getObjectPtr(drv, OBJECT_TYPE_CONTEXT, context); - if (nvCtx == NULL || nvCtx->decoder == NULL) { + if (nvCtx == NULL) { + return VA_STATUS_ERROR_INVALID_CONTEXT; + } + + /* Encode path */ + if (nvCtx->isEncode) { + return nvEndPictureEncode(drv, nvCtx); + } + + if (nvCtx->decoder == NULL) { return VA_STATUS_ERROR_INVALID_CONTEXT; } @@ -1453,6 +2319,11 @@ static VAStatus nvSyncSurface( return VA_STATUS_ERROR_INVALID_SURFACE; } + /* Encode is synchronous — EndPicture blocks until encode is done */ + if (surface->context != NULL && surface->context->isEncode) { + return VA_STATUS_SUCCESS; + } + //LOG("Syncing on surface: %d (%p)", surface->pictureIdx, surface); //wait for resolve to occur before synchronising @@ -1611,8 +2482,75 @@ static VAStatus nvDeriveImage( VAImage *image /* out */ ) { - //LOG("In %s", __func__); - //FAILED because we don't support it + NVDriver *drv = (NVDriver*) ctx->pDriverData; + NVSurface *surfaceObj = (NVSurface*) getObjectPtr(drv, OBJECT_TYPE_SURFACE, surface); + + if (surfaceObj == NULL) { + return VA_STATUS_ERROR_INVALID_SURFACE; + } + 
+ /* In IPC encode-only mode, derive a host-memory image so Steam's ffmpeg + * can write captured NV12 frames into it via vaMapBuffer. The encoder + * then reads from this host memory via the IPC pixel-data path. */ + if (!drv->cudaAvailable) { + uint32_t width = surfaceObj->width; + uint32_t height = surfaceObj->height; + int bpp = (surfaceObj->bitDepth > 8) ? 2 : 1; + uint32_t lumaSize = width * bpp * height; + uint32_t chromaSize = width * bpp * (height / 2); + uint32_t totalSize = lumaSize + chromaSize; + + /* Allocate or reuse the surface's host pixel buffer */ + if (surfaceObj->hostPixelData == NULL || surfaceObj->hostPixelSize < totalSize) { + free(surfaceObj->hostPixelData); + surfaceObj->hostPixelData = malloc(totalSize); + if (surfaceObj->hostPixelData == NULL) { + surfaceObj->hostPixelSize = 0; + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + surfaceObj->hostPixelSize = totalSize; + memset(surfaceObj->hostPixelData, 0, totalSize); + } + + /* Create a buffer object for the image data (points to the surface's host memory) */ + Object imageBufferObj = allocateObject(drv, OBJECT_TYPE_BUFFER, sizeof(NVBuffer)); + NVBuffer *imageBuf = (NVBuffer*) imageBufferObj->obj; + imageBuf->bufferType = VAImageBufferType; + imageBuf->size = totalSize; + imageBuf->elements = 1; + imageBuf->ptr = surfaceObj->hostPixelData; /* Shared with surface! */ + imageBuf->offset = (size_t)-1; /* Sentinel: don't free ptr on destroy */ + + /* Create the image object */ + Object imageObj = allocateObject(drv, OBJECT_TYPE_IMAGE, sizeof(NVImage)); + NVImage *img = (NVImage*) imageObj->obj; + img->width = width; + img->height = height; + img->format = (bpp == 1) ? NV_FORMAT_NV12 : NV_FORMAT_P010; + img->imageBuffer = imageBuf; + + /* Fill VAImage output */ + memset(image, 0, sizeof(*image)); + image->image_id = imageObj->id; + image->format.fourcc = (bpp == 1) ? VA_FOURCC_NV12 : VA_FOURCC_P010; + image->format.byte_order = VA_LSB_FIRST; + image->format.bits_per_pixel = (bpp == 1) ? 
12 : 24; + image->buf = imageBufferObj->id; + image->width = width; + image->height = height; + image->data_size = totalSize; + image->num_planes = 2; + image->pitches[0] = width * bpp; + image->pitches[1] = width * bpp; + image->offsets[0] = 0; + image->offsets[1] = lumaSize; + + LOG("DeriveImage: surface %d → host image %d (%ux%u, %u bytes)", + surface, imageObj->id, width, height, totalSize); + return VA_STATUS_SUCCESS; + } + + /* Normal CUDA path: not supported */ return VA_STATUS_ERROR_OPERATION_FAILED; } @@ -1631,7 +2569,10 @@ static VAStatus nvDestroyImage( Object imageBufferObj = getObjectByPtr(drv, OBJECT_TYPE_BUFFER, img->imageBuffer); if (imageBufferObj != NULL) { - if (img->imageBuffer->ptr != NULL) { + /* For derived images, the buffer ptr is shared with the surface's + * hostPixelData — don't free it (the surface owns the memory). + * For regular images (from vaCreateImage), we own the buffer. */ + if (img->imageBuffer->ptr != NULL && img->imageBuffer->offset != (size_t)-1) { free(img->imageBuffer->ptr); } @@ -1735,7 +2676,98 @@ static VAStatus nvPutImage( unsigned int dest_height ) { - LOG("In %s", __func__); + NVDriver *drv = (NVDriver*) ctx->pDriverData; + + NVSurface *surfaceObj = (NVSurface*) getObjectPtr(drv, OBJECT_TYPE_SURFACE, surface); + NVImage *imageObj = (NVImage*) getObjectPtr(drv, OBJECT_TYPE_IMAGE, image); + + if (surfaceObj == NULL) { + return VA_STATUS_ERROR_INVALID_SURFACE; + } + if (imageObj == NULL) { + return VA_STATUS_ERROR_INVALID_IMAGE; + } + + const NVFormatInfo *fmtInfo = &formatsInfo[imageObj->format]; + + /* Host-memory path: when CUDA is unavailable (IPC encode-only mode), + * store pixel data directly in the surface for later IPC transmission. 
*/ + if (!drv->cudaAvailable) { + uint32_t totalSize = imageObj->imageBuffer->size; + if (surfaceObj->hostPixelData == NULL || surfaceObj->hostPixelSize < totalSize) { + free(surfaceObj->hostPixelData); + surfaceObj->hostPixelData = malloc(totalSize); + if (surfaceObj->hostPixelData == NULL) { + surfaceObj->hostPixelSize = 0; + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + surfaceObj->hostPixelSize = totalSize; + } + memcpy(surfaceObj->hostPixelData, imageObj->imageBuffer->ptr, totalSize); + return VA_STATUS_SUCCESS; + } + + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); + + /* Ensure the surface has a backing image to write into */ + if (!drv->backend->realiseSurface(drv, surfaceObj)) { + LOG("PutImage: failed to realise surface"); + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + + BackingImage *backImg = surfaceObj->backingImage; + if (backImg == NULL) { + LOG("PutImage: no backing image"); + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + + /* Copy each plane from host memory (image buffer) to GPU (CUarray). + * Apply source/destination offsets per the VA-API spec. */ + uint32_t copyWidth = src_width > 0 ? src_width : imageObj->width; + uint32_t copyHeight = src_height > 0 ? src_height : imageObj->height; + uint32_t imgWidth = imageObj->width; + uint32_t imgHeight = imageObj->height; + uint32_t imgPlaneOffset = 0; + + for (uint32_t i = 0; i < fmtInfo->numPlanes; i++) { + const NVFormatPlane *p = &fmtInfo->plane[i]; + /* Subsampled offsets and dimensions */ + uint32_t planeSrcX = (uint32_t)((src_x > 0 ? src_x : 0)) >> p->ss.x; + uint32_t planeSrcY = (uint32_t)((src_y > 0 ? src_y : 0)) >> p->ss.y; + uint32_t planeDstX = (uint32_t)((dest_x > 0 ? dest_x : 0)) >> p->ss.x; + uint32_t planeDstY = (uint32_t)((dest_y > 0 ? 
dest_y : 0)) >> p->ss.y; + uint32_t planeCopyW = copyWidth >> p->ss.x; + uint32_t planeCopyH = copyHeight >> p->ss.y; + uint32_t imgPlanePitch = imgWidth * fmtInfo->bppc; + + CUDA_MEMCPY2D memcpy2d = { + .srcXInBytes = planeSrcX * fmtInfo->bppc * p->channelCount, + .srcY = planeSrcY, + .srcMemoryType = CU_MEMORYTYPE_HOST, + .srcHost = (char*)imageObj->imageBuffer->ptr + imgPlaneOffset, + .srcPitch = imgPlanePitch, + + .dstXInBytes = planeDstX * fmtInfo->bppc * p->channelCount, + .dstY = planeDstY, + .dstMemoryType = CU_MEMORYTYPE_ARRAY, + .dstArray = backImg->arrays[i], + + .WidthInBytes = planeCopyW * fmtInfo->bppc * p->channelCount, + .Height = planeCopyH, + }; + + CUresult result = cu->cuMemcpy2D(&memcpy2d); + if (result != CUDA_SUCCESS) { + LOG("PutImage: cuMemcpy2D failed for plane %u: %d", i, result); + CHECK_CUDA_RESULT(cu->cuCtxPopCurrent(NULL)); + return VA_STATUS_ERROR_OPERATION_FAILED; + } + imgPlaneOffset += ((imgWidth * imgHeight) >> (p->ss.x + p->ss.y)) * fmtInfo->bppc * p->channelCount; + } + + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); return VA_STATUS_SUCCESS; } @@ -1882,6 +2914,41 @@ static VAStatus nvQuerySurfaceAttributes( return VA_STATUS_ERROR_INVALID_CONFIG; } + /* Encode config surface attributes — GStreamer needs min/max dimensions */ + if (cfg->isEncode) { + int cnt = 5; + if (num_attribs != NULL) { + *num_attribs = cnt; + } + if (attrib_list != NULL) { + attrib_list[0].type = VASurfaceAttribMinWidth; + attrib_list[0].flags = VA_SURFACE_ATTRIB_GETTABLE; + attrib_list[0].value.type = VAGenericValueTypeInteger; + attrib_list[0].value.value.i = 16; + + attrib_list[1].type = VASurfaceAttribMinHeight; + attrib_list[1].flags = VA_SURFACE_ATTRIB_GETTABLE; + attrib_list[1].value.type = VAGenericValueTypeInteger; + attrib_list[1].value.value.i = 16; + + attrib_list[2].type = VASurfaceAttribMaxWidth; + attrib_list[2].flags = VA_SURFACE_ATTRIB_GETTABLE; + attrib_list[2].value.type = 
VAGenericValueTypeInteger; + attrib_list[2].value.value.i = 4096; + + attrib_list[3].type = VASurfaceAttribMaxHeight; + attrib_list[3].flags = VA_SURFACE_ATTRIB_GETTABLE; + attrib_list[3].value.type = VAGenericValueTypeInteger; + attrib_list[3].value.value.i = 4096; + + attrib_list[4].type = VASurfaceAttribPixelFormat; + attrib_list[4].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE; + attrib_list[4].value.type = VAGenericValueTypeInteger; + attrib_list[4].value.value.i = (cfg->bitDepth > 8) ? VA_FOURCC_P010 : VA_FOURCC_NV12; + } + return VA_STATUS_SUCCESS; + } + //LOG("with %d (%d) %p %d", cfg->cudaCodec, cfg->bitDepth, attrib_list, *num_attribs); if (cfg->chromaFormat != cudaVideoChromaFormat_420 && cfg->chromaFormat != cudaVideoChromaFormat_444) { @@ -2152,9 +3219,11 @@ static VAStatus nvExportSurfaceHandle( return VA_STATUS_ERROR_INVALID_SURFACE; } - //LOG("Exporting surface: %d (%p)", surface->pictureIdx, surface); + LOG("Exporting surface: %d (%p)", surface->pictureIdx, surface); - CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); + if (drv->cudaAvailable) { + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); + } if (!drv->backend->realiseSurface(drv, surface)) { LOG("Unable to export surface"); @@ -2170,7 +3239,9 @@ static VAStatus nvExportSurfaceHandle( // ptr->layers[1].offset[0], ptr->layers[1].pitch[0], // ptr->objects[1].drm_format_modifier); - CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); + if (drv->cudaAvailable) { + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); + } return VA_STATUS_SUCCESS; } @@ -2180,23 +3251,32 @@ static VAStatus nvTerminate( VADriverContextP ctx ) NVDriver *drv = (NVDriver*) ctx->pDriverData; LOG("Terminating %p", ctx); - CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); - - 
drv->backend->destroyAllBackingImage(drv); - - deleteAllObjects(drv); + if (drv->cudaAvailable) { + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPushCurrent(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); - drv->backend->releaseExporter(drv); + drv->backend->destroyAllBackingImage(drv); + deleteAllObjects(drv); + drv->backend->releaseExporter(drv); - CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); + CHECK_CUDA_RESULT_RETURN(cu->cuCtxPopCurrent(NULL), VA_STATUS_ERROR_OPERATION_FAILED); + } else { + deleteAllObjects(drv); + /* Release the DRM backend if it was initialized for IPC mode */ + if (drv->backend != NULL) { + drv->backend->destroyAllBackingImage(drv); + drv->backend->releaseExporter(drv); + } + } pthread_mutex_lock(&concurrency_mutex); instances--; LOG("Now have %d (%d max) instances", instances, max_instances); pthread_mutex_unlock(&concurrency_mutex); - CHECK_CUDA_RESULT_RETURN(cu->cuCtxDestroy(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); - drv->cudaContext = NULL; + if (drv->cudaAvailable && drv->cudaContext != NULL) { + CHECK_CUDA_RESULT_RETURN(cu->cuCtxDestroy(drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); + drv->cudaContext = NULL; + } free(drv); @@ -2299,7 +3379,8 @@ VAStatus __vaDriverInit_1_0(VADriverContextP ctx) { pthread_mutex_unlock(&concurrency_mutex); //check to make sure we initialised the CUDA functions correctly - if (cu == NULL || cv == NULL) { + //If CUDA loaded but cuInit failed, we can still do encode-only via IPC + if (cu == NULL) { return VA_STATUS_ERROR_OPERATION_FAILED; } @@ -2308,6 +3389,9 @@ VAStatus __vaDriverInit_1_0(VADriverContextP ctx) { drv->cu = cu; drv->cv = cv; + drv->nv = nv; + drv->nvencAvailable = (nv != NULL); + drv->cudaAvailable = cudaInitSuccess; drv->useCorrectNV12Format = true; drv->cudaGpuId = gpu; //make sure that we want the default GPU, and that a DRM fd that we care about is passed in @@ -2322,16 +3406,24 @@ VAStatus __vaDriverInit_1_0(VADriverContextP ctx) 
{ } ctx->max_profiles = MAX_PROFILES; - ctx->max_entrypoints = 1; + ctx->max_entrypoints = 2; ctx->max_attributes = 1; ctx->max_display_attributes = 1; ctx->max_image_formats = ARRAY_SIZE(formatsInfo) - 1; ctx->max_subpic_formats = 1; - if (backend == DIRECT) { - ctx->str_vendor = "VA-API NVDEC driver [direct backend]"; - } else if (backend == EGL) { - ctx->str_vendor = "VA-API NVDEC driver [egl backend]"; + if (drv->cudaAvailable) { + if (backend == DIRECT) { + ctx->str_vendor = drv->nvencAvailable + ? "VA-API NVDEC/NVENC driver [direct backend]" + : "VA-API NVDEC driver [direct backend]"; + } else if (backend == EGL) { + ctx->str_vendor = drv->nvencAvailable + ? "VA-API NVDEC/NVENC driver [egl backend]" + : "VA-API NVDEC driver [egl backend]"; + } + } else { + ctx->str_vendor = "VA-API NVENC driver [IPC encode-only]"; } pthread_mutexattr_t attrib; @@ -2341,21 +3433,43 @@ VAStatus __vaDriverInit_1_0(VADriverContextP ctx) { pthread_mutex_init(&drv->imagesMutex, &attrib); pthread_mutex_init(&drv->exportMutex, NULL); - if (!drv->backend->initExporter(drv)) { - LOG("Exporter failed"); - free(drv); - return VA_STATUS_ERROR_OPERATION_FAILED; - } + if (drv->cudaAvailable) { + /* Full CUDA path: init exporter and create CUDA context */ + if (!drv->backend->initExporter(drv)) { + LOG("Exporter failed"); + free(drv); + return VA_STATUS_ERROR_OPERATION_FAILED; + } - if (CHECK_CUDA_RESULT(cu->cuCtxCreate(&drv->cudaContext, CU_CTX_SCHED_BLOCKING_SYNC, drv->cudaGpuId))) { - drv->backend->releaseExporter(drv); - free(drv); - return VA_STATUS_ERROR_OPERATION_FAILED; - } + if (CHECK_CUDA_RESULT(cu->cuCtxCreate(&drv->cudaContext, CU_CTX_SCHED_BLOCKING_SYNC, drv->cudaGpuId))) { + drv->backend->releaseExporter(drv); + free(drv); + return VA_STATUS_ERROR_OPERATION_FAILED; + } - //CHECK_CUDA_RESULT_RETURN(cv->cuvidCtxLockCreate(&drv->vidLock, drv->cudaContext), VA_STATUS_ERROR_OPERATION_FAILED); + nvQueryConfigProfiles2(ctx, drv->profiles, &drv->profileCount); + } else { + /* 
Encode-only IPC path: no CUDA context, no decode profiles. + * Init the direct backend for GPU surface allocation via DRM. + * This lets Steam render into our surfaces via OpenGL/EGL, + * and we send the DMA-BUF fds to the 64-bit helper for encoding. */ + LOG("CUDA unavailable — encode-only mode, init DRM backend for surfaces"); + drv->cudaContext = NULL; + + if (backend == DIRECT && drv->backend->initExporter(drv)) { + LOG("DRM backend initialized for surface allocation"); + } else { + LOG("DRM backend init failed — surfaces will have no GPU backing"); + } - nvQueryConfigProfiles2(ctx, drv->profiles, &drv->profileCount); + int p = 0; + drv->profiles[p++] = VAProfileH264ConstrainedBaseline; + drv->profiles[p++] = VAProfileH264Main; + drv->profiles[p++] = VAProfileH264High; + drv->profiles[p++] = VAProfileHEVCMain; + drv->profiles[p++] = VAProfileHEVCMain10; + drv->profileCount = p; + } *ctx->vtable = vtable; return VA_STATUS_SUCCESS; diff --git a/src/vabackend.h b/src/vabackend.h index 672c489f..df6b4412 100644 --- a/src/vabackend.h +++ b/src/vabackend.h @@ -2,6 +2,7 @@ #define VABACKEND_H #include +#include #include #include #include @@ -69,6 +70,16 @@ typedef struct pthread_mutex_t mutex; pthread_cond_t cond; bool decodeFailed; + /* Host-memory pixel buffer for encode-only IPC path (no CUDA) */ + void *hostPixelData; + uint32_t hostPixelSize; + bool hostPixelIsShm; /* true if hostPixelData points to SHM (don't free) */ + /* Imported DMA-BUF for IPC encode (fd from Steam's GPU capture) */ + int importedDmaBufFd; + uint32_t importedPitches[4]; + uint32_t importedOffsets[4]; + uint32_t importedNumPlanes; + uint32_t importedDataSize; } NVSurface; typedef enum @@ -110,6 +121,9 @@ typedef struct _BackingImage { //direct backend only NVCudaImage cudaImages[3]; NVFormat format; + /* NVIDIA opaque fds for CUDA import (IPC encode path) */ + int nvFds[4]; + uint32_t memorySizes[4]; } BackingImage; struct _NVDriver; @@ -129,6 +143,7 @@ typedef struct _NVDriver { 
CudaFunctions *cu; CuvidFunctions *cv; + NvencFunctions *nv; CUcontext cudaContext; CUvideoctxlock vidLock; Array/**/ objects; @@ -154,6 +169,8 @@ typedef struct _NVDriver int numFramesPresented; int profileCount; VAProfile profiles[MAX_PROFILES]; + bool nvencAvailable; + bool cudaAvailable; /* false when 32-bit CUDA fails */ } NVDriver; struct _NVCodec; @@ -185,6 +202,8 @@ typedef struct _NVContext pthread_mutex_t surfaceCreationMutex; int surfaceCount; bool firstKeyframeValid; + bool isEncode; + void *encodeData; /* NVENCContext* for encode contexts */ } NVContext; typedef struct @@ -195,6 +214,7 @@ typedef struct cudaVideoChromaFormat chromaFormat; int bitDepth; cudaVideoCodec cudaCodec; + bool isEncode; } NVConfig; typedef void (*HandlerFunc)(NVContext*, NVBuffer* , CUVIDPICPARAMS*); diff --git a/tests/test_common.h b/tests/test_common.h new file mode 100644 index 00000000..1648adde --- /dev/null +++ b/tests/test_common.h @@ -0,0 +1,138 @@ +/* + * test_common.h — Shared test utilities for nvidia-vaapi-driver tests. + * Inspired by Intel's i965 test infrastructure. 
+ */ + +#ifndef TEST_COMMON_H +#define TEST_COMMON_H + +#define _POSIX_C_SOURCE 199309L + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRM_DEVICE "/dev/dri/renderD128" + +/* Test counters */ +static int g_pass = 0; +static int g_fail = 0; +static int g_skip = 0; + +/* Colors */ +#define C_GREEN "\033[32m" +#define C_RED "\033[31m" +#define C_YELLOW "\033[33m" +#define C_RESET "\033[0m" + +/* Test macros */ +#define TEST_START(name) \ + printf(" %-55s ", name); fflush(stdout); + +#define TEST_PASS() do { \ + printf(C_GREEN "PASS" C_RESET "\n"); g_pass++; \ +} while (0) + +#define TEST_FAIL(reason) do { \ + printf(C_RED "FAIL" C_RESET " (%s)\n", reason); g_fail++; \ +} while (0) + +#define TEST_SKIP(reason) do { \ + printf(C_YELLOW "SKIP" C_RESET " (%s)\n", reason); g_skip++; \ +} while (0) + +/* Assert that aborts current test function on failure */ +#define EXPECT_STATUS(st) do { \ + if ((st) != VA_STATUS_SUCCESS) { \ + char _msg[64]; snprintf(_msg, sizeof(_msg), "VA status %d", (st)); \ + TEST_FAIL(_msg); return; \ + } \ +} while (0) + +#define EXPECT_STATUS_EQ(expect, st) do { \ + VAStatus _s = (st); \ + if (_s != (expect)) { \ + char _msg[64]; snprintf(_msg, sizeof(_msg), \ + "expected status %d, got %d", (expect), _s); \ + TEST_FAIL(_msg); return; \ + } \ +} while (0) + +#define EXPECT_TRUE(cond, reason) do { \ + if (!(cond)) { TEST_FAIL(reason); return; } \ +} while (0) + +#define EXPECT_NOT_NULL(ptr, reason) do { \ + if ((ptr) == NULL) { TEST_FAIL(reason); return; } \ +} while (0) + +/* Timer for performance measurement */ +typedef struct { + struct timespec start; + struct timespec end; +} TestTimer; + +static inline void timer_start(TestTimer *t) { + clock_gettime(CLOCK_MONOTONIC, &t->start); +} + +static inline double timer_stop_ms(TestTimer *t) { + clock_gettime(CLOCK_MONOTONIC, &t->end); + return (t->end.tv_sec - t->start.tv_sec) * 1000.0 + + (t->end.tv_nsec - t->start.tv_nsec) / 1000000.0; +} + +/* 
Global VA display setup */ +static VADisplay g_dpy; +static int g_drm_fd; + +static void test_global_setup(void) { + g_drm_fd = open(DRM_DEVICE, O_RDWR); + if (g_drm_fd < 0) { + fprintf(stderr, "Cannot open %s\n", DRM_DEVICE); + exit(1); + } + g_dpy = vaGetDisplayDRM(g_drm_fd); + if (!g_dpy) { + fprintf(stderr, "vaGetDisplayDRM failed\n"); + exit(1); + } + int major, minor; + VAStatus st = vaInitialize(g_dpy, &major, &minor); + if (st != VA_STATUS_SUCCESS) { + fprintf(stderr, "vaInitialize failed: %d\n", st); + exit(1); + } +} + +static void test_global_teardown(void) { + vaTerminate(g_dpy); + close(g_drm_fd); +} + +static void test_print_summary(const char *suite_name) { + printf("\n=== %s: %d passed, %d failed, %d skipped ===\n\n", + suite_name, g_pass, g_fail, g_skip); +} + +/* Check if a profile+entrypoint combination is supported */ +static bool test_has_entrypoint(VADisplay dpy, VAProfile profile, VAEntrypoint ep) { + int ne = vaMaxNumEntrypoints(dpy); + VAEntrypoint *eps = calloc(ne, sizeof(VAEntrypoint)); + int n = 0; + vaQueryConfigEntrypoints(dpy, profile, eps, &n); + bool found = false; + for (int i = 0; i < n; i++) { + if (eps[i] == ep) { found = true; break; } + } + free(eps); + return found; +} + +#endif /* TEST_COMMON_H */ diff --git a/tests/test_encode.c b/tests/test_encode.c new file mode 100644 index 00000000..454b3b6c --- /dev/null +++ b/tests/test_encode.c @@ -0,0 +1,488 @@ +/* + * test_encode.c — Encode path integration tests for nvidia-vaapi-driver. 
+ * + * Build: + * gcc -o test_encode test_encode.c -lva -lva-drm -lm + * + * Run: + * ./test_encode # all tests + * ./test_encode h264 # H.264 tests only + * ./test_encode hevc # HEVC tests only + * + * Exit code: 0 = all pass, 1 = failure + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRM_DEVICE "/dev/dri/renderD128" + +static int pass_count = 0; +static int fail_count = 0; + +#define TEST_START(name) \ + printf(" %-50s ", name); fflush(stdout); + +#define TEST_PASS() do { \ + printf("\033[32mPASS\033[0m\n"); pass_count++; \ +} while (0) + +#define TEST_FAIL(reason) do { \ + printf("\033[31mFAIL\033[0m (%s)\n", reason); fail_count++; \ +} while (0) + +#define TEST_ASSERT(cond, reason) do { \ + if (!(cond)) { TEST_FAIL(reason); return; } \ +} while (0) + +static VADisplay dpy; +static int drm_fd; + +static void setup(void) +{ + drm_fd = open(DRM_DEVICE, O_RDWR); + if (drm_fd < 0) { + fprintf(stderr, "Cannot open %s\n", DRM_DEVICE); + exit(1); + } + dpy = vaGetDisplayDRM(drm_fd); + if (!dpy) { + fprintf(stderr, "vaGetDisplayDRM failed\n"); + exit(1); + } + int major, minor; + VAStatus st = vaInitialize(dpy, &major, &minor); + if (st != VA_STATUS_SUCCESS) { + fprintf(stderr, "vaInitialize failed: %d\n", st); + exit(1); + } +} + +static void teardown(void) +{ + vaTerminate(dpy); + close(drm_fd); +} + +/* --- Test: Entrypoints --- */ + +static void test_entrypoints_h264(void) +{ + TEST_START("H.264 EncSlice entrypoint exists"); + int ne = vaMaxNumEntrypoints(dpy); + VAEntrypoint *eps = calloc(ne, sizeof(VAEntrypoint)); + int n = 0; + vaQueryConfigEntrypoints(dpy, VAProfileH264High, eps, &n); + bool found = false; + for (int i = 0; i < n; i++) { + if (eps[i] == VAEntrypointEncSlice) found = true; + } + free(eps); + TEST_ASSERT(found, "VAEntrypointEncSlice not found for H264High"); + TEST_PASS(); +} + +static void test_entrypoints_hevc(void) +{ + TEST_START("HEVC EncSlice entrypoint 
exists"); + int ne = vaMaxNumEntrypoints(dpy); + VAEntrypoint *eps = calloc(ne, sizeof(VAEntrypoint)); + int n = 0; + vaQueryConfigEntrypoints(dpy, VAProfileHEVCMain, eps, &n); + bool found = false; + for (int i = 0; i < n; i++) { + if (eps[i] == VAEntrypointEncSlice) found = true; + } + free(eps); + TEST_ASSERT(found, "VAEntrypointEncSlice not found for HEVCMain"); + TEST_PASS(); +} + +/* --- Test: Config attributes --- */ + +static void test_config_attributes(void) +{ + TEST_START("Encode config attributes (RTFormat, RateControl)"); + VAConfigAttrib attribs[3] = { + { .type = VAConfigAttribRTFormat }, + { .type = VAConfigAttribRateControl }, + { .type = VAConfigAttribEncMaxRefFrames }, + }; + VAStatus st = vaGetConfigAttributes(dpy, VAProfileH264High, + VAEntrypointEncSlice, attribs, 3); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaGetConfigAttributes failed"); + TEST_ASSERT(attribs[0].value & VA_RT_FORMAT_YUV420, "no YUV420 RTFormat"); + TEST_ASSERT(attribs[1].value & VA_RC_CQP, "no CQP rate control"); + TEST_ASSERT(attribs[1].value & VA_RC_CBR, "no CBR rate control"); + TEST_ASSERT(attribs[1].value & VA_RC_VBR, "no VBR rate control"); + TEST_PASS(); +} + +/* --- Test: Create/destroy config+surfaces+context --- */ + +static void test_create_destroy(void) +{ + TEST_START("Create and destroy encode config/surfaces/context"); + + VAConfigAttrib attrib = { .type = VAConfigAttribRTFormat, + .value = VA_RT_FORMAT_YUV420 }; + VAConfigID config; + VAStatus st = vaCreateConfig(dpy, VAProfileH264High, + VAEntrypointEncSlice, &attrib, 1, &config); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaCreateConfig failed"); + + VASurfaceID surfaces[4]; + st = vaCreateSurfaces(dpy, VA_RT_FORMAT_YUV420, 320, 240, + surfaces, 4, NULL, 0); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaCreateSurfaces failed"); + + VAContextID context; + st = vaCreateContext(dpy, config, 320, 240, VA_PROGRESSIVE, + surfaces, 4, &context); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaCreateContext failed"); + + st = 
vaDestroyContext(dpy, context); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaDestroyContext failed"); + st = vaDestroySurfaces(dpy, surfaces, 4); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaDestroySurfaces failed"); + st = vaDestroyConfig(dpy, config); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaDestroyConfig failed"); + TEST_PASS(); +} + +/* --- Test: Full encode cycle (1 frame) --- */ + +static void test_encode_one_frame(VAProfile profile, const char *codec_name) +{ + char name[64]; + snprintf(name, sizeof(name), "%s encode 1 frame (320x240)", codec_name); + TEST_START(name); + + VAConfigAttrib attrib = { .type = VAConfigAttribRTFormat, + .value = VA_RT_FORMAT_YUV420 }; + VAConfigID config; + VAStatus st = vaCreateConfig(dpy, profile, VAEntrypointEncSlice, + &attrib, 1, &config); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "config"); + + VASurfaceID surface; + st = vaCreateSurfaces(dpy, VA_RT_FORMAT_YUV420, 320, 240, + &surface, 1, NULL, 0); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "surface"); + + VAContextID context; + st = vaCreateContext(dpy, config, 320, 240, VA_PROGRESSIVE, + &surface, 1, &context); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "context"); + + /* Coded buffer */ + VABufferID coded_buf; + st = vaCreateBuffer(dpy, context, VAEncCodedBufferType, 320 * 240, + 1, NULL, &coded_buf); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "coded_buf"); + + /* Create NV12 image and fill with gray */ + VAImageFormat fmt = { .fourcc = VA_FOURCC_NV12 }; + VAImage image; + st = vaCreateImage(dpy, &fmt, 320, 240, &image); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "image"); + void *img_data; + vaMapBuffer(dpy, image.buf, &img_data); + memset(img_data, 128, image.data_size); + vaUnmapBuffer(dpy, image.buf); + vaPutImage(dpy, surface, image.image_id, 0, 0, 320, 240, 0, 0, 320, 240); + + /* Sequence params */ + VABufferID seq_buf; + if (profile == VAProfileH264High || profile == VAProfileH264Main || + profile == VAProfileH264ConstrainedBaseline) { + VAEncSequenceParameterBufferH264 seq = { + 
.picture_width_in_mbs = 320 / 16, + .picture_height_in_mbs = 240 / 16, + .intra_period = 30, .ip_period = 1, + }; + vaCreateBuffer(dpy, context, VAEncSequenceParameterBufferType, + sizeof(seq), 1, &seq, &seq_buf); + } else { + VAEncSequenceParameterBufferHEVC seq = { + .pic_width_in_luma_samples = 320, + .pic_height_in_luma_samples = 240, + .intra_period = 30, .ip_period = 1, + }; + vaCreateBuffer(dpy, context, VAEncSequenceParameterBufferType, + sizeof(seq), 1, &seq, &seq_buf); + } + + /* Picture params */ + VABufferID pic_buf; + if (profile == VAProfileH264High || profile == VAProfileH264Main || + profile == VAProfileH264ConstrainedBaseline) { + VAEncPictureParameterBufferH264 pic = { + .coded_buf = coded_buf, + .pic_fields.bits.idr_pic_flag = 1, + }; + vaCreateBuffer(dpy, context, VAEncPictureParameterBufferType, + sizeof(pic), 1, &pic, &pic_buf); + } else { + VAEncPictureParameterBufferHEVC pic = { + .coded_buf = coded_buf, + .pic_fields.bits.idr_pic_flag = 1, + }; + vaCreateBuffer(dpy, context, VAEncPictureParameterBufferType, + sizeof(pic), 1, &pic, &pic_buf); + } + + /* Slice params */ + VABufferID slice_buf; + if (profile == VAProfileH264High || profile == VAProfileH264Main || + profile == VAProfileH264ConstrainedBaseline) { + VAEncSliceParameterBufferH264 slice = { .slice_type = 2 }; + vaCreateBuffer(dpy, context, VAEncSliceParameterBufferType, + sizeof(slice), 1, &slice, &slice_buf); + } else { + VAEncSliceParameterBufferHEVC slice = { .slice_type = 2 }; + vaCreateBuffer(dpy, context, VAEncSliceParameterBufferType, + sizeof(slice), 1, &slice, &slice_buf); + } + + /* Encode */ + st = vaBeginPicture(dpy, context, surface); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaBeginPicture"); + VABufferID bufs[] = { seq_buf, pic_buf, slice_buf }; + st = vaRenderPicture(dpy, context, bufs, 3); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaRenderPicture"); + st = vaEndPicture(dpy, context); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaEndPicture"); + + st = vaSyncSurface(dpy, 
surface); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaSyncSurface"); + + /* Map coded buffer and check output */ + VACodedBufferSegment *seg = NULL; + st = vaMapBuffer(dpy, coded_buf, (void **)&seg); + TEST_ASSERT(st == VA_STATUS_SUCCESS, "vaMapBuffer"); + TEST_ASSERT(seg != NULL, "coded segment is NULL"); + TEST_ASSERT(seg->buf != NULL, "coded data is NULL"); + TEST_ASSERT(seg->size > 0, "coded size is 0"); + + /* Check for valid NAL start code */ + unsigned char *bs = (unsigned char *)seg->buf; + bool has_start_code = (bs[0] == 0 && bs[1] == 0 && bs[2] == 0 && bs[3] == 1); + TEST_ASSERT(has_start_code, "no NAL start code 00 00 00 01"); + + vaUnmapBuffer(dpy, coded_buf); + + /* Cleanup */ + vaDestroyBuffer(dpy, coded_buf); + vaDestroyBuffer(dpy, seq_buf); + vaDestroyBuffer(dpy, pic_buf); + vaDestroyBuffer(dpy, slice_buf); + vaDestroyImage(dpy, image.image_id); + vaDestroyContext(dpy, context); + vaDestroySurfaces(dpy, &surface, 1); + vaDestroyConfig(dpy, config); + TEST_PASS(); +} + +/* --- Test: Sequential encodes (leak check) --- */ + +static void test_sequential_encodes(void) +{ + TEST_START("10 sequential H.264 encodes (leak check)"); + + for (int run = 0; run < 10; run++) { + VAConfigAttrib attrib = { .type = VAConfigAttribRTFormat, + .value = VA_RT_FORMAT_YUV420 }; + VAConfigID config; + vaCreateConfig(dpy, VAProfileH264High, VAEntrypointEncSlice, + &attrib, 1, &config); + VASurfaceID surface; + vaCreateSurfaces(dpy, VA_RT_FORMAT_YUV420, 320, 240, &surface, 1, NULL, 0); + VAContextID context; + vaCreateContext(dpy, config, 320, 240, VA_PROGRESSIVE, &surface, 1, &context); + VABufferID coded; + vaCreateBuffer(dpy, context, VAEncCodedBufferType, 320 * 240, 1, NULL, &coded); + + VAEncSequenceParameterBufferH264 seq = { + .picture_width_in_mbs = 20, .picture_height_in_mbs = 15, + .intra_period = 30, .ip_period = 1, + }; + VAEncPictureParameterBufferH264 pic = { + .coded_buf = coded, .pic_fields.bits.idr_pic_flag = 1, + }; + VAEncSliceParameterBufferH264 slice = { 
.slice_type = 2 }; + VABufferID bufs[3]; + vaCreateBuffer(dpy, context, VAEncSequenceParameterBufferType, + sizeof(seq), 1, &seq, &bufs[0]); + vaCreateBuffer(dpy, context, VAEncPictureParameterBufferType, + sizeof(pic), 1, &pic, &bufs[1]); + vaCreateBuffer(dpy, context, VAEncSliceParameterBufferType, + sizeof(slice), 1, &slice, &bufs[2]); + + vaBeginPicture(dpy, context, surface); + vaRenderPicture(dpy, context, bufs, 3); + VAStatus st = vaEndPicture(dpy, context); + if (st != VA_STATUS_SUCCESS) { + TEST_FAIL("vaEndPicture failed in sequential run"); + return; + } + + vaDestroyBuffer(dpy, coded); + vaDestroyBuffer(dpy, bufs[0]); + vaDestroyBuffer(dpy, bufs[1]); + vaDestroyBuffer(dpy, bufs[2]); + vaDestroyContext(dpy, context); + vaDestroySurfaces(dpy, &surface, 1); + vaDestroyConfig(dpy, config); + } + TEST_PASS(); +} + +/* --- Test: Coded buffer reuse across frames --- */ + +static void test_coded_buffer_reuse(void) +{ + TEST_START("Coded buffer reuse across 5 frames"); + + VAConfigAttrib attrib = { .type = VAConfigAttribRTFormat, + .value = VA_RT_FORMAT_YUV420 }; + VAConfigID config; + vaCreateConfig(dpy, VAProfileH264High, VAEntrypointEncSlice, + &attrib, 1, &config); + VASurfaceID surface; + vaCreateSurfaces(dpy, VA_RT_FORMAT_YUV420, 320, 240, &surface, 1, NULL, 0); + VAContextID context; + vaCreateContext(dpy, config, 320, 240, VA_PROGRESSIVE, &surface, 1, &context); + VABufferID coded; + vaCreateBuffer(dpy, context, VAEncCodedBufferType, 320 * 240, 1, NULL, &coded); + + for (int frame = 0; frame < 5; frame++) { + VAEncSequenceParameterBufferH264 seq = { + .picture_width_in_mbs = 20, .picture_height_in_mbs = 15, + .intra_period = 30, .ip_period = 1, + }; + VAEncPictureParameterBufferH264 pic = { + .coded_buf = coded, + .pic_fields.bits.idr_pic_flag = (frame == 0) ? 1 : 0, + }; + VAEncSliceParameterBufferH264 slice = { + .slice_type = (frame == 0) ? 
2 : 0, + }; + VABufferID bufs[3]; + vaCreateBuffer(dpy, context, VAEncSequenceParameterBufferType, + sizeof(seq), 1, &seq, &bufs[0]); + vaCreateBuffer(dpy, context, VAEncPictureParameterBufferType, + sizeof(pic), 1, &pic, &bufs[1]); + vaCreateBuffer(dpy, context, VAEncSliceParameterBufferType, + sizeof(slice), 1, &slice, &bufs[2]); + + vaBeginPicture(dpy, context, surface); + vaRenderPicture(dpy, context, bufs, 3); + VAStatus st = vaEndPicture(dpy, context); + if (st != VA_STATUS_SUCCESS) { + TEST_FAIL("vaEndPicture failed"); + goto cleanup; + } + + VACodedBufferSegment *seg; + vaMapBuffer(dpy, coded, (void **)&seg); + if (!seg || !seg->buf || seg->size == 0) { + TEST_FAIL("empty coded buffer"); + vaUnmapBuffer(dpy, coded); + goto cleanup; + } + vaUnmapBuffer(dpy, coded); + + vaDestroyBuffer(dpy, bufs[0]); + vaDestroyBuffer(dpy, bufs[1]); + vaDestroyBuffer(dpy, bufs[2]); + } + TEST_PASS(); + +cleanup: + vaDestroyBuffer(dpy, coded); + vaDestroyContext(dpy, context); + vaDestroySurfaces(dpy, &surface, 1); + vaDestroyConfig(dpy, config); +} + +/* --- Test: Decode regression --- */ + +static void test_decode_still_works(void) +{ + TEST_START("Decode entrypoints still present (VLD)"); + int ne = vaMaxNumEntrypoints(dpy); + VAEntrypoint *eps = calloc(ne, sizeof(VAEntrypoint)); + int n = 0; + vaQueryConfigEntrypoints(dpy, VAProfileH264High, eps, &n); + bool found_vld = false; + bool found_enc = false; + for (int i = 0; i < n; i++) { + if (eps[i] == VAEntrypointVLD) found_vld = true; + if (eps[i] == VAEntrypointEncSlice) found_enc = true; + } + free(eps); + TEST_ASSERT(found_vld, "VAEntrypointVLD missing"); + TEST_ASSERT(found_enc, "VAEntrypointEncSlice missing"); + TEST_PASS(); +} + +/* --- Main --- */ + +int main(int argc, char **argv) +{ + bool run_h264 = true, run_hevc = true; + if (argc > 1) { + if (strcmp(argv[1], "h264") == 0) run_hevc = false; + else if (strcmp(argv[1], "hevc") == 0) run_h264 = false; + } + + setup(); + + printf("\n=== nvidia-vaapi-driver encode 
tests ===\n"); + printf("Driver: %s\n\n", vaQueryVendorString(dpy)); + + printf("Entrypoints:\n"); + test_entrypoints_h264(); + test_entrypoints_hevc(); + + printf("\nConfig:\n"); + test_config_attributes(); + + printf("\nLifecycle:\n"); + test_create_destroy(); + + if (run_h264) { + printf("\nH.264 Encode:\n"); + test_encode_one_frame(VAProfileH264High, "H.264 High"); + test_encode_one_frame(VAProfileH264Main, "H.264 Main"); + test_encode_one_frame(VAProfileH264ConstrainedBaseline, "H.264 CB"); + } + + if (run_hevc) { + printf("\nHEVC Encode:\n"); + test_encode_one_frame(VAProfileHEVCMain, "HEVC Main"); + } + + printf("\nStress:\n"); + test_sequential_encodes(); + test_coded_buffer_reuse(); + + printf("\nRegression:\n"); + test_decode_still_works(); + + printf("\n=== Results: %d passed, %d failed ===\n\n", + pass_count, fail_count); + + teardown(); + return fail_count > 0 ? 1 : 0; +} diff --git a/tests/test_encode_config.c b/tests/test_encode_config.c new file mode 100644 index 00000000..1780d5fb --- /dev/null +++ b/tests/test_encode_config.c @@ -0,0 +1,259 @@ +/* + * test_encode_config.c — Config and capability tests. + * Tests profile/entrypoint validation, config attributes, and error paths. 
+ * + * Build: gcc -o test_encode_config tests/test_encode_config.c -lva -lva-drm + * Run: ./test_encode_config + */ + +#include "test_common.h" + +/* --- Profile/Entrypoint matrix --- */ + +typedef struct { + VAProfile profile; + const char *name; + bool expect_encode; + bool expect_decode; +} ProfileTest; + +static const ProfileTest profile_tests[] = { + { VAProfileH264ConstrainedBaseline, "H264 CB", true, true }, + { VAProfileH264Main, "H264 Main", true, true }, + { VAProfileH264High, "H264 High", true, true }, + { VAProfileHEVCMain, "HEVC Main", true, true }, + { VAProfileHEVCMain10, "HEVC M10", true, true }, + { VAProfileMPEG2Simple, "MPEG2", false, true }, + { VAProfileVP9Profile0, "VP9 P0", false, false }, /* VP9 requires gstreamer-codecparsers */ + { VAProfileAV1Profile0, "AV1 P0", false, true }, + { VAProfileJPEGBaseline, "JPEG", false, true }, +}; +#define NUM_PROFILE_TESTS (sizeof(profile_tests) / sizeof(profile_tests[0])) + +static void test_encode_entrypoints(void) { + for (int i = 0; i < (int)NUM_PROFILE_TESTS; i++) { + char name[64]; + snprintf(name, sizeof(name), "EncSlice for %-10s → %s", + profile_tests[i].name, + profile_tests[i].expect_encode ? "present" : "absent"); + TEST_START(name); + + bool has = test_has_entrypoint(g_dpy, profile_tests[i].profile, + VAEntrypointEncSlice); + if (has == profile_tests[i].expect_encode) { + TEST_PASS(); + } else { + TEST_FAIL(has ? "unexpected EncSlice" : "missing EncSlice"); + } + } +} + +static void test_decode_entrypoints(void) { + for (int i = 0; i < (int)NUM_PROFILE_TESTS; i++) { + char name[64]; + snprintf(name, sizeof(name), "VLD for %-10s → %s", + profile_tests[i].name, + profile_tests[i].expect_decode ? "present" : "absent"); + TEST_START(name); + + bool has = test_has_entrypoint(g_dpy, profile_tests[i].profile, + VAEntrypointVLD); + if (has == profile_tests[i].expect_decode) { + TEST_PASS(); + } else { + TEST_FAIL(has ? 
"unexpected VLD" : "missing VLD"); + } + } +} + +/* --- Config attribute validation --- */ + +static void test_config_rtformat(void) { + TEST_START("H264 High RTFormat includes YUV420"); + VAConfigAttrib a = { .type = VAConfigAttribRTFormat }; + EXPECT_STATUS(vaGetConfigAttributes(g_dpy, VAProfileH264High, + VAEntrypointEncSlice, &a, 1)); + EXPECT_TRUE(a.value & VA_RT_FORMAT_YUV420, "no YUV420"); + TEST_PASS(); +} + +static void test_config_ratecontrol(void) { + TEST_START("Rate control: CQP + CBR + VBR supported"); + VAConfigAttrib a = { .type = VAConfigAttribRateControl }; + EXPECT_STATUS(vaGetConfigAttributes(g_dpy, VAProfileH264High, + VAEntrypointEncSlice, &a, 1)); + EXPECT_TRUE(a.value & VA_RC_CQP, "no CQP"); + EXPECT_TRUE(a.value & VA_RC_CBR, "no CBR"); + EXPECT_TRUE(a.value & VA_RC_VBR, "no VBR"); + TEST_PASS(); +} + +static void test_config_packed_headers(void) { + TEST_START("Packed headers: SEQ + PIC advertised"); + VAConfigAttrib a = { .type = VAConfigAttribEncPackedHeaders }; + EXPECT_STATUS(vaGetConfigAttributes(g_dpy, VAProfileH264High, + VAEntrypointEncSlice, &a, 1)); + EXPECT_TRUE(a.value & VA_ENC_PACKED_HEADER_SEQUENCE, "no SEQ"); + EXPECT_TRUE(a.value & VA_ENC_PACKED_HEADER_PICTURE, "no PIC"); + TEST_PASS(); +} + +static void test_config_max_ref_frames(void) { + TEST_START("Max ref frames reported"); + VAConfigAttrib a = { .type = VAConfigAttribEncMaxRefFrames }; + EXPECT_STATUS(vaGetConfigAttributes(g_dpy, VAProfileH264High, + VAEntrypointEncSlice, &a, 1)); + EXPECT_TRUE(a.value != VA_ATTRIB_NOT_SUPPORTED, "not supported"); + EXPECT_TRUE((a.value & 0xffff) >= 1, "L0 refs < 1"); + TEST_PASS(); +} + +static void test_config_quality_range(void) { + TEST_START("Quality range attribute reported"); + VAConfigAttrib a = { .type = VAConfigAttribEncQualityRange }; + EXPECT_STATUS(vaGetConfigAttributes(g_dpy, VAProfileH264High, + VAEntrypointEncSlice, &a, 1)); + EXPECT_TRUE(a.value != VA_ATTRIB_NOT_SUPPORTED, "not supported"); + EXPECT_TRUE(a.value >= 1, 
"quality range < 1"); + TEST_PASS(); +} + +/* --- Error path tests --- */ + +static void test_invalid_entrypoint(void) { + TEST_START("vaCreateConfig with invalid entrypoint → error"); + VAConfigAttrib a = { .type = VAConfigAttribRTFormat, .value = VA_RT_FORMAT_YUV420 }; + VAConfigID config; + /* Use a valid profile but wrong entrypoint type (0xFF) */ + VAStatus st = vaCreateConfig(g_dpy, VAProfileH264High, (VAEntrypoint)0xFF, + &a, 1, &config); + EXPECT_TRUE(st != VA_STATUS_SUCCESS, "should fail for invalid entrypoint"); + TEST_PASS(); +} + +static void test_encode_on_decode_only_profile(void) { + TEST_START("vaCreateConfig encode on MPEG2 (decode-only) → error"); + VAConfigAttrib a = { .type = VAConfigAttribRTFormat, .value = VA_RT_FORMAT_YUV420 }; + VAConfigID config; + VAStatus st = vaCreateConfig(g_dpy, VAProfileMPEG2Simple, + VAEntrypointEncSlice, &a, 1, &config); + EXPECT_TRUE(st != VA_STATUS_SUCCESS, "should fail for decode-only profile"); + TEST_PASS(); +} + +static void test_create_config_all_encode_profiles(void) { + VAProfile profiles[] = { + VAProfileH264ConstrainedBaseline, VAProfileH264Main, VAProfileH264High, + VAProfileHEVCMain, VAProfileHEVCMain10, + }; + for (int i = 0; i < 5; i++) { + char name[64]; + snprintf(name, sizeof(name), "vaCreateConfig for encode profile %d", profiles[i]); + TEST_START(name); + + VAConfigAttrib a = { .type = VAConfigAttribRTFormat, .value = VA_RT_FORMAT_YUV420 }; + VAConfigID config; + VAStatus st = vaCreateConfig(g_dpy, profiles[i], VAEntrypointEncSlice, + &a, 1, &config); + EXPECT_STATUS(st); + st = vaDestroyConfig(g_dpy, config); + EXPECT_STATUS(st); + TEST_PASS(); + } +} + +/* --- Surface creation tests --- */ + +static void test_surface_nv12(void) { + TEST_START("Create NV12 surface 1920x1080"); + VASurfaceID surface; + VAStatus st = vaCreateSurfaces(g_dpy, VA_RT_FORMAT_YUV420, 1920, 1080, + &surface, 1, NULL, 0); + EXPECT_STATUS(st); + vaDestroySurfaces(g_dpy, &surface, 1); + TEST_PASS(); +} + +static void 
test_surface_p010(void) { + TEST_START("Create P010 surface 1920x1080 (10-bit)"); + VASurfaceID surface; + VAStatus st = vaCreateSurfaces(g_dpy, VA_RT_FORMAT_YUV420_10, 1920, 1080, + &surface, 1, NULL, 0); + if (st != VA_STATUS_SUCCESS) { + TEST_SKIP("10-bit surfaces not supported"); + return; + } + vaDestroySurfaces(g_dpy, &surface, 1); + TEST_PASS(); +} + +static void test_surface_multiple(void) { + TEST_START("Create 16 surfaces simultaneously"); + VASurfaceID surfaces[16]; + VAStatus st = vaCreateSurfaces(g_dpy, VA_RT_FORMAT_YUV420, 640, 480, + surfaces, 16, NULL, 0); + EXPECT_STATUS(st); + vaDestroySurfaces(g_dpy, surfaces, 16); + TEST_PASS(); +} + +static void test_surface_small(void) { + TEST_START("Create tiny surface 16x16"); + VASurfaceID surface; + VAStatus st = vaCreateSurfaces(g_dpy, VA_RT_FORMAT_YUV420, 16, 16, + &surface, 1, NULL, 0); + EXPECT_STATUS(st); + vaDestroySurfaces(g_dpy, &surface, 1); + TEST_PASS(); +} + +static void test_surface_4k(void) { + TEST_START("Create 4K surface 3840x2160"); + VASurfaceID surface; + VAStatus st = vaCreateSurfaces(g_dpy, VA_RT_FORMAT_YUV420, 3840, 2160, + &surface, 1, NULL, 0); + EXPECT_STATUS(st); + vaDestroySurfaces(g_dpy, &surface, 1); + TEST_PASS(); +} + +/* --- Main --- */ + +int main(void) +{ + test_global_setup(); + + printf("\n=== nvidia-vaapi-driver config & capability tests ===\n"); + printf("Driver: %s\n\n", vaQueryVendorString(g_dpy)); + + printf("Encode entrypoints:\n"); + test_encode_entrypoints(); + + printf("\nDecode entrypoints:\n"); + test_decode_entrypoints(); + + printf("\nConfig attributes:\n"); + test_config_rtformat(); + test_config_ratecontrol(); + test_config_packed_headers(); + test_config_max_ref_frames(); + test_config_quality_range(); + + printf("\nError paths:\n"); + test_invalid_entrypoint(); + test_encode_on_decode_only_profile(); + + printf("\nConfig creation:\n"); + test_create_config_all_encode_profiles(); + + printf("\nSurface creation:\n"); + test_surface_nv12(); + 
test_surface_p010(); + test_surface_multiple(); + test_surface_small(); + test_surface_4k(); + + test_print_summary("Config tests"); + test_global_teardown(); + return g_fail > 0 ? 1 : 0; +} diff --git a/tests/test_gstreamer.sh b/tests/test_gstreamer.sh new file mode 100755 index 00000000..58834620 --- /dev/null +++ b/tests/test_gstreamer.sh @@ -0,0 +1,217 @@ +#!/bin/bash +# +# test_gstreamer.sh — GStreamer VA-API encode integration tests +# +# Requires: gstreamer1-vaapi (Fedora) or gstreamer1.0-vaapi (Ubuntu) +# +# Exit code: 0 = all pass, 1 = failure + +set -u + +export GST_VAAPI_ALL_DRIVERS=1 +export LIBVA_DRIVER_NAME=nvidia + +PASS=0 +FAIL=0 +SKIP=0 +TMPDIR=$(mktemp -d) +trap 'rm -rf "$TMPDIR"' EXIT + +pass() { printf " %-55s \033[32mPASS\033[0m\n" "$1"; PASS=$((PASS+1)); } +fail() { printf " %-55s \033[31mFAIL\033[0m (%s)\n" "$1" "$2"; FAIL=$((FAIL+1)); } +skip() { printf " %-55s \033[33mSKIP\033[0m (%s)\n" "$1" "$2"; SKIP=$((SKIP+1)); } + +has_element() { gst-inspect-1.0 "$1" >/dev/null 2>&1; } + +echo "" +echo "=== nvidia-vaapi-driver GStreamer tests ===" +echo "" + +# --- Check prerequisites --- + +echo "Prerequisites:" + +if ! has_element vaapih264enc; then + skip "vaapih264enc available" "gstreamer-vaapi not installed" + echo "" + echo "=== Results: $PASS passed, $FAIL failed, $SKIP skipped ===" + exit 1 +fi +pass "vaapih264enc available" + +if ! has_element vaapih265enc; then + skip "vaapih265enc available" "element not found" +else + pass "vaapih265enc available" +fi + +# --- H.264 encode tests --- + +echo "" +echo "H.264 Encode:" + +# Basic encode to fakesink +if gst-launch-1.0 -e videotestsrc num-buffers=30 \ + ! video/x-raw,width=320,height=240,framerate=30/1 \ + ! vaapih264enc ! h264parse ! fakesink 2>&1 | grep -q "EOS"; then + pass "H.264 320x240 30 frames → fakesink" +else + fail "H.264 320x240 30 frames → fakesink" "pipeline error" +fi + +# Encode to file and validate +OUT="$TMPDIR/h264.mp4" +if gst-launch-1.0 -e videotestsrc num-buffers=60 \ + ! 
video/x-raw,width=1920,height=1080,framerate=30/1 \ + ! vaapih264enc bitrate=5000 ! h264parse \ + ! mp4mux ! filesink location="$OUT" 2>&1 | grep -q "EOS"; then + SIZE=$(stat -c%s "$OUT" 2>/dev/null || echo 0) + if [ "$SIZE" -gt 1000 ]; then + pass "H.264 1080p 60 frames → mp4 (${SIZE} bytes)" + else + fail "H.264 1080p 60 frames → mp4" "file too small: ${SIZE} bytes" + fi +else + fail "H.264 1080p 60 frames → mp4" "pipeline error" +fi + +# CBR bitrate control +OUT="$TMPDIR/h264_cbr.mp4" +if gst-launch-1.0 -e videotestsrc num-buffers=90 \ + ! video/x-raw,width=1280,height=720,framerate=30/1 \ + ! vaapih264enc rate-control=cbr bitrate=2000 ! h264parse \ + ! mp4mux ! filesink location="$OUT" 2>&1 | grep -q "EOS"; then + SIZE=$(stat -c%s "$OUT" 2>/dev/null || echo 0) + if [ "$SIZE" -gt 1000 ]; then + pass "H.264 720p CBR 2Mbps 90 frames" + else + fail "H.264 720p CBR 2Mbps 90 frames" "file too small" + fi +else + fail "H.264 720p CBR 2Mbps 90 frames" "pipeline error" +fi + +# Small resolution (GStreamer vaapi requires ~256x256 minimum) +if gst-launch-1.0 -e videotestsrc num-buffers=10 \ + ! video/x-raw,width=256,height=256,framerate=30/1 \ + ! vaapih264enc ! h264parse ! fakesink 2>&1 | grep -q "EOS"; then + pass "H.264 256x256 small resolution" +else + fail "H.264 256x256 small resolution" "pipeline error" +fi + +# 4K resolution +if gst-launch-1.0 -e videotestsrc num-buffers=5 \ + ! video/x-raw,width=3840,height=2160,framerate=30/1 \ + ! vaapih264enc ! h264parse ! fakesink 2>&1 | grep -q "EOS"; then + pass "H.264 4K 5 frames" +else + fail "H.264 4K 5 frames" "pipeline error" +fi + +# --- HEVC encode tests --- + +echo "" +echo "HEVC Encode:" + +if has_element vaapih265enc; then + # Basic encode + if gst-launch-1.0 -e videotestsrc num-buffers=30 \ + ! video/x-raw,width=320,height=240,framerate=30/1 \ + ! vaapih265enc ! h265parse ! 
fakesink 2>&1 | grep -q "EOS"; then + pass "HEVC 320x240 30 frames → fakesink" + else + fail "HEVC 320x240 30 frames → fakesink" "pipeline error" + fi + + # Encode to file + OUT="$TMPDIR/hevc.mp4" + if gst-launch-1.0 -e videotestsrc num-buffers=60 \ + ! video/x-raw,width=1920,height=1080,framerate=30/1 \ + ! vaapih265enc bitrate=5000 ! h265parse \ + ! mp4mux ! filesink location="$OUT" 2>&1 | grep -q "EOS"; then + SIZE=$(stat -c%s "$OUT" 2>/dev/null || echo 0) + if [ "$SIZE" -gt 1000 ]; then + pass "HEVC 1080p 60 frames → mp4 (${SIZE} bytes)" + else + fail "HEVC 1080p 60 frames → mp4" "file too small: ${SIZE} bytes" + fi + else + fail "HEVC 1080p 60 frames → mp4" "pipeline error" + fi + + # 4K + if gst-launch-1.0 -e videotestsrc num-buffers=5 \ + ! video/x-raw,width=3840,height=2160,framerate=30/1 \ + ! vaapih265enc ! h265parse ! fakesink 2>&1 | grep -q "EOS"; then + pass "HEVC 4K 5 frames" + else + fail "HEVC 4K 5 frames" "pipeline error" + fi +else + skip "HEVC tests" "vaapih265enc not available" +fi + +# --- Decode regression --- + +echo "" +echo "Decode regression:" + +if has_element vaapih264dec; then + pass "vaapih264dec still available" +else + fail "vaapih264dec still available" "element missing" +fi + +if has_element vaapih265dec; then + pass "vaapih265dec still available" +else + fail "vaapih265dec still available" "element missing" +fi + +# Decode an encoded file (round-trip) +if [ -f "$TMPDIR/h264.mp4" ]; then + if gst-launch-1.0 -e filesrc location="$TMPDIR/h264.mp4" \ + ! qtdemux ! h264parse ! vaapih264dec ! fakesink 2>&1 | grep -q "EOS"; then + pass "H.264 encode → decode round-trip" + else + fail "H.264 encode → decode round-trip" "decode pipeline error" + fi +fi + +# --- Stress --- + +echo "" +echo "Stress:" + +# Sequential pipeline restarts (leak check) +ALL_OK=1 +for i in $(seq 1 10); do + if ! gst-launch-1.0 -e videotestsrc num-buffers=10 \ + ! video/x-raw,width=320,height=240,framerate=30/1 \ + ! vaapih264enc ! 
fakesink 2>&1 | grep -q "EOS"; then
+        ALL_OK=0
+        break
+    fi
+done
+if [ "$ALL_OK" = "1" ]; then
+    pass "10 sequential H.264 pipeline restarts"
+else
+    fail "10 sequential H.264 pipeline restarts" "failed at iteration $i"
+fi
+
+# Long encode (300 frames)
+if gst-launch-1.0 -e videotestsrc num-buffers=300 \
+    ! video/x-raw,width=1920,height=1080,framerate=60/1 \
+    ! vaapih264enc bitrate=8000 ! h264parse ! fakesink 2>&1 | grep -q "EOS"; then
+    pass "H.264 1080p60 300 frames sustained"
+else
+    fail "H.264 1080p60 300 frames sustained" "pipeline error"
+fi
+
+# --- Summary ---
+
+echo ""
+echo "=== Results: $PASS passed, $FAIL failed, $SKIP skipped ==="
+echo ""
+exit $FAIL
diff --git a/tests/test_ipc_fuzz.c b/tests/test_ipc_fuzz.c
new file mode 100644
index 00000000..c579f201
--- /dev/null
+++ b/tests/test_ipc_fuzz.c
@@ -0,0 +1,204 @@
+/*
+ * test_ipc_fuzz.c — Fuzz the nvenc-helper IPC protocol with malformed messages.
+ * Tests robustness against corrupt/malicious data from the socket.
+ *
+ * Build: gcc -o test_ipc_fuzz tests/test_ipc_fuzz.c src/nvenc-ipc-client.c -lm
+ * Run:   ./test_ipc_fuzz   (nvenc-helper must be running)
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "../src/nvenc-ipc.h"
+
+static int g_pass = 0, g_fail = 0;
+#define C_GREEN "\033[32m"
+#define C_RED   "\033[31m"
+#define C_RESET "\033[0m"
+#define TEST_START(n) printf("  %-55s ", n); fflush(stdout);
+#define TEST_PASS() do { printf(C_GREEN "PASS" C_RESET "\n"); g_pass++; } while(0)
+#define TEST_FAIL(r) do { printf(C_RED "FAIL" C_RESET " (%s)\n", r); g_fail++; } while(0)
+#define EXPECT_TRUE(c, r) do { if(!(c)) { TEST_FAIL(r); return; } } while(0)
+
+static bool send_raw(int fd, const void *buf, size_t len) {
+    const char *p = buf;
+    while (len > 0) {
+        ssize_t n = send(fd, p, len, MSG_NOSIGNAL);
+        if (n <= 0) return false;
+        p += n;
+        len -= (size_t)n;
+    }
+    return true;
+}
+
+static int connect_helper(void) {
+    char 
path[256]; + nvenc_ipc_get_socket_path(path, sizeof(path)); + int fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) return -1; + struct sockaddr_un addr = {0}; + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1); + if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + close(fd); + return -1; + } + return fd; +} + +static void test_invalid_command(void) { + TEST_START("Invalid command ID (0xFF)"); + int fd = connect_helper(); + EXPECT_TRUE(fd >= 0, "can't connect to helper"); + NVEncIPCMsgHeader hdr = { .cmd = 0xFF, .payload_size = 0 }; + send_raw(fd, &hdr, sizeof(hdr)); + NVEncIPCRespHeader resp = {0}; + recv(fd, &resp, sizeof(resp), 0); + EXPECT_TRUE(resp.status != 0, "should reject unknown command"); + close(fd); + TEST_PASS(); +} + +static void test_zero_payload(void) { + TEST_START("CMD_INIT with zero payload"); + int fd = connect_helper(); + EXPECT_TRUE(fd >= 0, "can't connect"); + NVEncIPCMsgHeader hdr = { .cmd = NVENC_IPC_CMD_INIT, .payload_size = 0 }; + send_raw(fd, &hdr, sizeof(hdr)); + NVEncIPCRespHeader resp = {0}; + recv(fd, &resp, sizeof(resp), 0); + EXPECT_TRUE(resp.status != 0, "should reject zero-size init"); + close(fd); + TEST_PASS(); +} + +static void test_truncated_init(void) { + TEST_START("CMD_INIT with truncated payload (5 bytes)"); + int fd = connect_helper(); + EXPECT_TRUE(fd >= 0, "can't connect"); + NVEncIPCMsgHeader hdr = { .cmd = NVENC_IPC_CMD_INIT, .payload_size = sizeof(NVEncIPCInitParams) }; + send_raw(fd, &hdr, sizeof(hdr)); + char partial[5] = {1, 2, 3, 4, 5}; + send_raw(fd, partial, sizeof(partial)); + close(fd); //disconnect mid-message + TEST_PASS(); //helper should not crash +} + +static void test_huge_payload_size(void) { + TEST_START("CMD_ENCODE with payload_size=0xFFFFFFFF"); + int fd = connect_helper(); + EXPECT_TRUE(fd >= 0, "can't connect"); + //first init a valid encoder + NVEncIPCMsgHeader ihdr = { .cmd = NVENC_IPC_CMD_INIT, .payload_size = sizeof(NVEncIPCInitParams) }; + 
NVEncIPCInitParams params = { .width = 320, .height = 240, .codec = 0,
+                                  .frameRateNum = 30, .frameRateDen = 1 };
+    send_raw(fd, &ihdr, sizeof(ihdr));
+    send_raw(fd, &params, sizeof(params));
+    //drain init response (may include shm fd)
+    char drain[256];
+    recv(fd, drain, sizeof(drain), 0);
+
+    //now send encode with huge size
+    NVEncIPCMsgHeader hdr = { .cmd = NVENC_IPC_CMD_ENCODE, .payload_size = 0xFFFFFFFF };
+    send_raw(fd, &hdr, sizeof(hdr));
+    close(fd);
+    TEST_PASS(); //helper should not malloc 4GB and crash
+}
+
+static void test_encode_without_init(void) {
+    TEST_START("CMD_ENCODE_SHM without prior CMD_INIT");
+    int fd = connect_helper();
+    EXPECT_TRUE(fd >= 0, "can't connect");
+    NVEncIPCMsgHeader hdr = { .cmd = NVENC_IPC_CMD_ENCODE_SHM,
+                              .payload_size = sizeof(NVEncIPCEncodeShmParams) };
+    NVEncIPCEncodeShmParams sp = { .width = 320, .height = 240, .frame_size = 115200 };
+    send_raw(fd, &hdr, sizeof(hdr));
+    send_raw(fd, &sp, sizeof(sp));
+    NVEncIPCRespHeader resp = {0};
+    recv(fd, &resp, sizeof(resp), 0);
+    EXPECT_TRUE(resp.status != 0, "should reject encode without init");
+    close(fd);
+    TEST_PASS();
+}
+
+static void test_rapid_connect_disconnect(void) {
+    TEST_START("50 rapid connect/disconnect cycles");
+    for (int i = 0; i < 50; i++) {
+        int fd = connect_helper();
+        if (fd >= 0) close(fd);
+    }
+    //verify helper still alive
+    int fd = connect_helper();
+    EXPECT_TRUE(fd >= 0, "helper died after rapid cycles");
+    close(fd);
+    TEST_PASS();
+}
+
+static void test_close_without_init(void) {
+    TEST_START("CMD_CLOSE without prior CMD_INIT");
+    int fd = connect_helper();
+    EXPECT_TRUE(fd >= 0, "can't connect");
+    NVEncIPCMsgHeader hdr = { .cmd = NVENC_IPC_CMD_CLOSE, .payload_size = 0 };
+    send_raw(fd, &hdr, sizeof(hdr));
+    NVEncIPCRespHeader resp = {0};
+    recv(fd, &resp, sizeof(resp), 0);
+    EXPECT_TRUE(resp.status == 0, "close should succeed even without init");
+    close(fd);
+    TEST_PASS();
+}
+
+static void test_double_init(void) {
+    TEST_START("Two CMD_INIT in 
a row (re-init)");
+    int fd = connect_helper();
+    EXPECT_TRUE(fd >= 0, "can't connect");
+    NVEncIPCInitParams params = { .width = 320, .height = 240, .codec = 0,
+                                  .frameRateNum = 30, .frameRateDen = 1 };
+
+    for (int i = 0; i < 2; i++) {
+        NVEncIPCMsgHeader hdr = { .cmd = NVENC_IPC_CMD_INIT, .payload_size = sizeof(params) };
+        send_raw(fd, &hdr, sizeof(hdr));
+        send_raw(fd, &params, sizeof(params));
+        char drain[256];
+        recv(fd, drain, sizeof(drain), 0);
+    }
+    //clean close
+    NVEncIPCMsgHeader chdr = { .cmd = NVENC_IPC_CMD_CLOSE, .payload_size = 0 };
+    send_raw(fd, &chdr, sizeof(chdr));
+    char drain[64];
+    recv(fd, drain, sizeof(drain), 0);
+    close(fd);
+    TEST_PASS();
+}
+
+int main(void) {
+    signal(SIGPIPE, SIG_IGN);
+
+    printf("\n=== nvenc-helper IPC fuzz tests ===\n\n");
+
+    //check helper is running
+    int fd = connect_helper();
+    if (fd < 0) {
+        printf("ERROR: nvenc-helper not running\n");
+        return 1;
+    }
+    close(fd);
+
+    test_invalid_command();
+    test_zero_payload();
+    test_truncated_init();
+    test_huge_payload_size();
+    test_encode_without_init();
+    test_rapid_connect_disconnect();
+    test_close_without_init();
+    test_double_init();
+
+    printf("\n=== Results: %d passed, %d failed ===\n\n", g_pass, g_fail);
+    return g_fail > 0 ? 1 : 0;
+}