diff --git a/requirements.txt b/requirements.txt index fbfca46..dc73ece 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,11 @@ -torch>=2.0,<=2.5 +torch>=2.0 torchvision>=0.15.1 smplx==0.1.28 timm einops -ultralytics==8.1.34 +ultralytics>=8.3.4 opencv-python huggingface_hub scikit-image roma -chumpy @ git+https://github.com/mattloper/chumpy +chumpy @ git+https://github.com/mattloper/chumpy@4228d703b622e172e843438fe0fada102979361a diff --git a/wilor_mini/pipelines/wilor_hand_pose3d_estimation_pipeline.py b/wilor_mini/pipelines/wilor_hand_pose3d_estimation_pipeline.py index 056eedf..be0ea7b 100644 --- a/wilor_mini/pipelines/wilor_hand_pose3d_estimation_pipeline.py +++ b/wilor_mini/pipelines/wilor_hand_pose3d_estimation_pipeline.py @@ -86,7 +86,11 @@ def predict(self, image, **kwargs): hand_bbox = det.boxes.data.cpu().detach().squeeze().numpy() is_rights.append(det.boxes.cls.cpu().detach().squeeze().item()) bboxes.append(hand_bbox[:4].tolist()) - detect_rets.append({"hand_bbox": bboxes[-1], "is_right": is_rights[-1]}) + detect_rets.append({ + "hand_bbox": bboxes[-1], + "is_right": is_rights[-1], + "hand_conf": det.boxes.conf.cpu().detach().squeeze().item(), + }) if len(bboxes) == 0: self.logger.warn("No hand detected!") @@ -144,7 +148,7 @@ def predict(self, image, **kwargs): wilor_output_i["hand_pose"] = np.concatenate( (wilor_output_i["hand_pose"][:, :, 0:1], -wilor_output_i["hand_pose"][:, :, 1:3]), axis=-1) - scaled_focal_length = self.FOCAL_LENGTH / self.IMAGE_SIZE * img_size.max() + scaled_focal_length = kwargs.get("focal_length", self.FOCAL_LENGTH / self.IMAGE_SIZE * img_size.max()) pred_cam_t_full = utils.cam_crop_to_full(pred_cam, box_center[None], bbox_size, img_size[None], scaled_focal_length) wilor_output_i["pred_cam_t_full"] = pred_cam_t_full @@ -221,7 +225,7 @@ def predict_with_bboxes(self, image, bboxes, is_rights, **kwargs): wilor_output_i["hand_pose"] = np.concatenate( (wilor_output_i["hand_pose"][:, :, 0:1], -wilor_output_i["hand_pose"][:, :, 1:3]), axis=-1) - scaled_focal_length = self.FOCAL_LENGTH / self.IMAGE_SIZE * img_size.max() + scaled_focal_length = kwargs.get("focal_length", self.FOCAL_LENGTH / self.IMAGE_SIZE * img_size.max()) pred_cam_t_full = utils.cam_crop_to_full(pred_cam, box_center[None], bbox_size, img_size[None], scaled_focal_length) wilor_output_i["pred_cam_t_full"] = pred_cam_t_full