From c2d1ef676bdf2f7a264d170d69c50b13ebcbd3c9 Mon Sep 17 00:00:00 2001 From: alex-bene Date: Fri, 15 May 2026 15:43:17 +0200 Subject: [PATCH 1/4] relax torch dep; update ultralytics dep --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index fbfca46..b75ffc3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ -torch>=2.0,<=2.5 +torch>=2.0 torchvision>=0.15.1 smplx==0.1.28 timm einops -ultralytics==8.1.34 +ultralytics>=8.3.4 opencv-python huggingface_hub scikit-image From 9766463c9f5fdf008a0d5e87010a272663feaaff Mon Sep 17 00:00:00 2001 From: alex-bene Date: Fri, 15 May 2026 20:32:32 +0200 Subject: [PATCH 2/4] focal_length as inference input --- wilor_mini/pipelines/wilor_hand_pose3d_estimation_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wilor_mini/pipelines/wilor_hand_pose3d_estimation_pipeline.py b/wilor_mini/pipelines/wilor_hand_pose3d_estimation_pipeline.py index 056eedf..53cd9d0 100644 --- a/wilor_mini/pipelines/wilor_hand_pose3d_estimation_pipeline.py +++ b/wilor_mini/pipelines/wilor_hand_pose3d_estimation_pipeline.py @@ -144,7 +144,7 @@ def predict(self, image, **kwargs): wilor_output_i["hand_pose"] = np.concatenate( (wilor_output_i["hand_pose"][:, :, 0:1], -wilor_output_i["hand_pose"][:, :, 1:3]), axis=-1) - scaled_focal_length = self.FOCAL_LENGTH / self.IMAGE_SIZE * img_size.max() + scaled_focal_length = kwargs.get("focal_length", self.FOCAL_LENGTH / self.IMAGE_SIZE * img_size.max()) pred_cam_t_full = utils.cam_crop_to_full(pred_cam, box_center[None], bbox_size, img_size[None], scaled_focal_length) wilor_output_i["pred_cam_t_full"] = pred_cam_t_full @@ -221,7 +221,7 @@ def predict_with_bboxes(self, image, bboxes, is_rights, **kwargs): wilor_output_i["hand_pose"] = np.concatenate( (wilor_output_i["hand_pose"][:, :, 0:1], -wilor_output_i["hand_pose"][:, :, 1:3]), axis=-1) - scaled_focal_length = self.FOCAL_LENGTH / self.IMAGE_SIZE * img_size.max() + scaled_focal_length = kwargs.get("focal_length", self.FOCAL_LENGTH / self.IMAGE_SIZE * img_size.max()) pred_cam_t_full = utils.cam_crop_to_full(pred_cam, box_center[None], bbox_size, img_size[None], scaled_focal_length) wilor_output_i["pred_cam_t_full"] = pred_cam_t_full From 99d41de3f77a29c2d902e16908b3aa72b58a37cd Mon Sep 17 00:00:00 2001 From: alex-bene Date: Mon, 18 May 2026 15:19:03 +0200 Subject: [PATCH 3/4] expose detection confidence from YOLO to the return dict --- .../pipelines/wilor_hand_pose3d_estimation_pipeline.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/wilor_mini/pipelines/wilor_hand_pose3d_estimation_pipeline.py b/wilor_mini/pipelines/wilor_hand_pose3d_estimation_pipeline.py index 53cd9d0..be0ea7b 100644 --- a/wilor_mini/pipelines/wilor_hand_pose3d_estimation_pipeline.py +++ b/wilor_mini/pipelines/wilor_hand_pose3d_estimation_pipeline.py @@ -86,7 +86,11 @@ def predict(self, image, **kwargs): hand_bbox = det.boxes.data.cpu().detach().squeeze().numpy() is_rights.append(det.boxes.cls.cpu().detach().squeeze().item()) bboxes.append(hand_bbox[:4].tolist()) - detect_rets.append({"hand_bbox": bboxes[-1], "is_right": is_rights[-1]}) + detect_rets.append({ + "hand_bbox": bboxes[-1], + "is_right": is_rights[-1], + "hand_conf": det.boxes.conf.cpu().detach().squeeze().item(), + }) if len(bboxes) == 0: self.logger.warn("No hand detected!") From cf8f582223142a57f2f486cd7b2fc0545663473d Mon Sep 17 00:00:00 2001 From: alex-bene Date: Fri, 22 May 2026 15:29:32 +0200 Subject: [PATCH 4/4] use chumpy pr with install fix --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b75ffc3..dc73ece 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,4 @@ opencv-python huggingface_hub scikit-image roma -chumpy @ git+https://github.com/mattloper/chumpy +chumpy @ git+https://github.com/mattloper/chumpy@4228d703b622e172e843438fe0fada102979361a