warmshao · alex-bene · May 15, 2026 · May 15, 2026 · May 18, 2026 · May 22, 2026
diff --git a/requirements.txt b/requirements.txt
@@ -1,11 +1,11 @@
-torch>=2.0,<=2.5
+torch>=2.0
 torchvision>=0.15.1
 smplx==0.1.28
 timm
 einops
-ultralytics==8.1.34
+ultralytics>=8.3.4
 opencv-python
 huggingface_hub
 scikit-image
 roma
-chumpy @ git+https://github.com/mattloper/chumpy
+chumpy @ git+https://github.com/mattloper/chumpy@4228d703b622e172e843438fe0fada102979361a
diff --git a/wilor_mini/pipelines/wilor_hand_pose3d_estimation_pipeline.py b/wilor_mini/pipelines/wilor_hand_pose3d_estimation_pipeline.py
@@ -86,7 +86,11 @@ def predict(self, image, **kwargs):
             hand_bbox = det.boxes.data.cpu().detach().squeeze().numpy()
             is_rights.append(det.boxes.cls.cpu().detach().squeeze().item())
             bboxes.append(hand_bbox[:4].tolist())
-            detect_rets.append({"hand_bbox": bboxes[-1], "is_right": is_rights[-1]})
+            detect_rets.append({
+                "hand_bbox": bboxes[-1],
+                "is_right": is_rights[-1],
+                "hand_conf": det.boxes.conf.cpu().detach().squeeze().item(),
+            })
 
         if len(bboxes) == 0:
             self.logger.warn("No hand detected!")
@@ -144,7 +148,7 @@ def predict(self, image, **kwargs):
                 wilor_output_i["hand_pose"] = np.concatenate(
                     (wilor_output_i["hand_pose"][:, :, 0:1], -wilor_output_i["hand_pose"][:, :, 1:3]),
                     axis=-1)
-            scaled_focal_length = self.FOCAL_LENGTH / self.IMAGE_SIZE * img_size.max()
+            scaled_focal_length = kwargs.get("focal_length", self.FOCAL_LENGTH / self.IMAGE_SIZE * img_size.max())
             pred_cam_t_full = utils.cam_crop_to_full(pred_cam, box_center[None], bbox_size, img_size[None],
                                                      scaled_focal_length)
             wilor_output_i["pred_cam_t_full"] = pred_cam_t_full
@@ -221,7 +225,7 @@ def predict_with_bboxes(self, image, bboxes, is_rights, **kwargs):
                 wilor_output_i["hand_pose"] = np.concatenate(
                     (wilor_output_i["hand_pose"][:, :, 0:1], -wilor_output_i["hand_pose"][:, :, 1:3]),
                     axis=-1)
-            scaled_focal_length = self.FOCAL_LENGTH / self.IMAGE_SIZE * img_size.max()
+            scaled_focal_length = kwargs.get("focal_length", self.FOCAL_LENGTH / self.IMAGE_SIZE * img_size.max())
             pred_cam_t_full = utils.cam_crop_to_full(pred_cam, box_center[None], bbox_size, img_size[None],
                                                      scaled_focal_length)
             wilor_output_i["pred_cam_t_full"] = pred_cam_t_full