From 794d63d2e0134838e5bc79ad65df01e9b7b6fa34 Mon Sep 17 00:00:00 2001
From: Rahul Sharma
Date: Wed, 4 Feb 2026 15:16:53 -0800
Subject: [PATCH] [improvement] capture NVIDIADriver CRs in must-gather

Changes include:
1. add a step to capture NVIDIADriver CRs as well
2. capture pod logs with timestamps
3. use a common label selector that works for both ClusterPolicy and NVIDIADriver
4. add logic to capture which processes, if any, are using the GPU. This helps identify cases where no GPU usage is expected.

Signed-off-by: Rahul Sharma
---
 hack/must-gather.sh | 77 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 76 insertions(+), 1 deletion(-)

diff --git a/hack/must-gather.sh b/hack/must-gather.sh
index 841e81442..dabe0ba6c 100755
--- a/hack/must-gather.sh
+++ b/hack/must-gather.sh
@@ -101,6 +101,21 @@ else
     touch "${ARTIFACT_DIR}/cluster_policy.missing"
 fi
 
+echo
+echo "#"
+echo "# NVIDIADriver"
+echo "#"
+echo
+
+NVIDIA_DRIVERS=$($K get nvidiadrivers.nvidia.com -A -oname)
+
+if [[ "${NVIDIA_DRIVERS}" ]]; then
+    echo "Get NVIDIADriver resources"
+    $K get nvidiadrivers.nvidia.com -A -oyaml > "${ARTIFACT_DIR}/nvidiadrivers.yaml"
+else
+    echo "NVIDIADriver resource(s) not found in the cluster."
+fi
+
 echo
 echo "#"
 echo "# Nodes and machines"
@@ -166,10 +181,12 @@ $K get "${OPERATOR_POD_NAME}" \
 echo "Get the GPU Operator Pod logs"
 $K logs "${OPERATOR_POD_NAME}" \
     -n "${OPERATOR_NAMESPACE}" \
+    --timestamps \
     > "${ARTIFACT_DIR}/gpu_operator_pod.log"
 
 $K logs "${OPERATOR_POD_NAME}" \
     -n "${OPERATOR_NAMESPACE}" \
+    --timestamps \
     --previous \
     > "${ARTIFACT_DIR}/gpu_operator_pod.previous.log"
 
@@ -212,11 +229,13 @@ do
     $K logs "${pod}" \
         -n "${OPERATOR_NAMESPACE}" \
         --all-containers --prefix \
+        --timestamps \
         > "${ARTIFACT_DIR}/gpu_operand_pod_$pod_name.log"
 
     $K logs "${pod}" \
         -n "${OPERATOR_NAMESPACE}" \
         --all-containers --prefix \
+        --timestamps \
         --previous \
         > "${ARTIFACT_DIR}/gpu_operand_pod_$pod_name.previous.log"
 
@@ -262,7 +281,16 @@ echo "# nvidia-bug-report.sh"
 echo "#"
 echo ""
 
-for pod in $($K get pods -lopenshift.driver-toolkit -oname -n "${OPERATOR_NAMESPACE}"; $K get pods -lapp=nvidia-driver-daemonset -oname -n "${OPERATOR_NAMESPACE}"; $K get pods -lapp=nvidia-vgpu-manager-daemonset -oname -n "${OPERATOR_NAMESPACE}");
+# Find driver pods using multiple label selectors to support different deployment methods
+driver_pods=""
+driver_pods="$driver_pods $($K get pods -lopenshift.driver-toolkit -oname -n "${OPERATOR_NAMESPACE}" 2>/dev/null || true)"
+driver_pods="$driver_pods $($K get pods -lapp.kubernetes.io/component=nvidia-driver -oname -n "${OPERATOR_NAMESPACE}" 2>/dev/null || true)"
+driver_pods="$driver_pods $($K get pods -lapp=nvidia-vgpu-manager-daemonset -oname -n "${OPERATOR_NAMESPACE}" 2>/dev/null || true)"
+
+# Deduplicate and filter out empty entries
+driver_pods=$(echo "$driver_pods" | tr ' ' '\n' | grep -v '^$' | sort -u)
+
+for pod in $driver_pods;
 do
     pod_nodename=$($K get "${pod}" -ojsonpath={.spec.nodeName} -n "${OPERATOR_NAMESPACE}")
     echo "Saving nvidia-bug-report from ${pod_nodename} ..."
@@ -276,6 +304,53 @@ do
     mv /tmp/nvidia-bug-report.log.gz "${ARTIFACT_DIR}/nvidia-bug-report_${pod_nodename}.log.gz"
 done
 
+echo ""
+echo "#"
+echo "# GPU device usage (nvidia-smi, lsof, fuser, ps)"
+echo "#"
+echo ""
+
+# Using the driver pods list from above
+for pod in $driver_pods;
+do
+    pod_nodename=$($K get "${pod}" -ojsonpath={.spec.nodeName} -n "${OPERATOR_NAMESPACE}")
+    echo "Collecting GPU device usage info from ${pod_nodename} ..."
+
+    # Capture nvidia-smi output showing processes using GPUs
+    echo "# nvidia-smi output from ${pod_nodename}" > "${ARTIFACT_DIR}/gpu_device_usage_${pod_nodename}.log"
+    $K exec -n "${OPERATOR_NAMESPACE}" "${pod}" -- nvidia-smi >> "${ARTIFACT_DIR}/gpu_device_usage_${pod_nodename}.log" 2>&1 || \
+        echo "Failed to run nvidia-smi on ${pod_nodename}" >> "${ARTIFACT_DIR}/gpu_device_usage_${pod_nodename}.log"
+
+    # Capture lsof output for nvidia devices (run in host namespace)
+    echo -e "\n# lsof /run/nvidia/driver/dev/nvidia* output from ${pod_nodename}" >> "${ARTIFACT_DIR}/gpu_device_usage_${pod_nodename}.log"
+    $K exec -n "${OPERATOR_NAMESPACE}" "${pod}" -- nsenter --target 1 --mount --pid -- bash -c 'lsof /run/nvidia/driver/dev/nvidia* 2>&1 || true' >> "${ARTIFACT_DIR}/gpu_device_usage_${pod_nodename}.log"
+
+    # Extract PIDs from lsof output and get detailed process information
+    echo -e "\n# Process details for PIDs using GPU devices from ${pod_nodename}" >> "${ARTIFACT_DIR}/gpu_device_usage_${pod_nodename}.log"
+    $K exec -n "${OPERATOR_NAMESPACE}" "${pod}" -- nsenter --target 1 --mount --pid -- bash -c '
+        pids=$(lsof /run/nvidia/driver/dev/nvidia* 2>/dev/null | awk "NR>1 {print \$2}" | sort -u)
+        if [ -n "$pids" ]; then
+            echo "PIDs using GPU: $pids"
+            echo ""
+            for pid in $pids; do
+                echo "=== Process $pid ==="
+                ps -p $pid -o pid,ppid,user,stat,start,etime,pcpu,pmem,vsz,rss,args 2>&1 || echo "Process $pid not found"
+                echo ""
+            done
+        else
+            echo "No processes found using GPU devices"
+        fi
+    ' >> "${ARTIFACT_DIR}/gpu_device_usage_${pod_nodename}.log" 2>&1 || true
+
+    # Capture fuser output for nvidia devices (run in host namespace)
+    echo -e "\n# fuser -v /run/nvidia/driver/dev/nvidia* output from ${pod_nodename}" >> "${ARTIFACT_DIR}/gpu_device_usage_${pod_nodename}.log"
+    $K exec -n "${OPERATOR_NAMESPACE}" "${pod}" -- nsenter --target 1 --mount --pid -- bash -c 'fuser -v /run/nvidia/driver/dev/nvidia* 2>&1 || true' >> "${ARTIFACT_DIR}/gpu_device_usage_${pod_nodename}.log"
+
+    # List all nvidia device files
+    echo -e "\n# ls -la /run/nvidia/driver/dev/nvidia* output from ${pod_nodename}" >> "${ARTIFACT_DIR}/gpu_device_usage_${pod_nodename}.log"
+    $K exec -n "${OPERATOR_NAMESPACE}" "${pod}" -- nsenter --target 1 --mount --pid -- bash -c 'ls -la /run/nvidia/driver/dev/nvidia* 2>&1 || true' >> "${ARTIFACT_DIR}/gpu_device_usage_${pod_nodename}.log"
+done
+
 echo ""
 echo "#"
 echo "# All done!"
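
A minimal way to exercise the new collection paths after applying the patch. This is a sketch, not part of the patch: it assumes must-gather.sh honors an ARTIFACT_DIR override from the environment, that KUBECONFIG points at a cluster running the GPU Operator, and the output directory name is illustrative only.

    # Run the collector, then inspect the artifacts this patch adds.
    ARTIFACT_DIR=/tmp/gpu-must-gather ./hack/must-gather.sh    # ARTIFACT_DIR override is an assumption
    ls /tmp/gpu-must-gather/nvidiadrivers.yaml                 # NVIDIADriver CRs, written only when any exist
    ls /tmp/gpu-must-gather/gpu_device_usage_*.log             # per-node GPU usage (nvidia-smi, lsof, fuser, ps)
    grep "PIDs using GPU" /tmp/gpu-must-gather/gpu_device_usage_*.log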