diff --git a/tico/quantization/evaluation/vlm_eval_utils.py b/tico/quantization/evaluation/vlm_eval_utils.py index be23104f..e11fc952 100644 --- a/tico/quantization/evaluation/vlm_eval_utils.py +++ b/tico/quantization/evaluation/vlm_eval_utils.py @@ -179,8 +179,22 @@ def get_item_coco(ex: dict[str, Any]) -> dict[str, Any]: """ return { "image": ex["image"], - "question": ex.get("question", ""), - "golds": _extract_golds(ex.get("answer")), + "question": ex["question"], + "id": ex["id"], + "image_id": ex["question_id"], + "file_name": ex["file_name"], + "golds": ex["answer"], + } + + +def get_item_llava_bench_in_the_wild(ex: dict[str, Any]) -> dict[str, Any]: + return { + "image": ex["image"], + "question": ex["question"], + "id": ex["question_id"], + "image_id": ex["question_id"], # unique evaluation key + "file_name": ex["image_id"], # original image filename + "golds": [ex["gpt_answer"]], } @@ -248,6 +262,13 @@ def get_item_alpaca(ex: dict[str, Any]) -> dict[str, Any]: ], "is_text_only": False, }, + "llava_bench": { + "default_split": "train", + "adapter": get_item_llava_bench_in_the_wild, + "candidates": [ + "lmms-lab/llava-bench-in-the-wild", + ], + }, "wikitext2": { "default_split": "test", "adapter": get_item_wikitext2, @@ -402,7 +423,9 @@ def generate_answer( image=image, question=question, return_tensors="pt", - max_seq_len=max_seq_len, + # length of (inputs + max_new_tokens) should not exceed model's max_seq_len + # because quantized model has precomputed static causal mask and RoPE for max_seq_len + max_seq_len=max_seq_len - max_new_tokens, # type: ignore[operator] ) inputs = move_inputs_to_device(inputs, device) @@ -560,6 +583,7 @@ def compute_bleu_scores( def get_coco_scores_on_dataset( model, processor, + dataset_name: str, ds: Iterable[dict[str, Any]], device: str | torch.device, max_new_tokens: int = 30, @@ -620,25 +644,51 @@ def get_coco_scores_on_dataset( images: list[CocoImage] = [] annotations: list[CocoAnnotation] = [] + if "coco" in dataset_name.lower(): + get_item = get_item_coco + elif "llava_bench" in dataset_name.lower(): + get_item = get_item_llava_bench_in_the_wild + else: + raise ValueError(f"Invalid dataset_name={dataset_name}") + for i, ex in enumerate(ds, 1): - image: Any = ex["image"] - question: str = ex["question"] - id: int = ex["id"] - image_id: str = ex["question_id"] - file_name: str = ex["file_name"] - gold_answers: list[str] = ex["answer"] - - # Generate caption - pred = generate_answer( - model=model, - processor=processor, - image=image, - question=question, - device=device, - max_new_tokens=max_new_tokens, - temperature=temperature, - max_seq_len=max_seq_len, - ) + sample: dict[str, Any] = get_item(ex) + + image: Any = sample["image"] + question: str = sample["question"] + id: int = sample["id"] + image_id: str = sample["image_id"] + file_name: str = sample["file_name"] + gold_answers: list[str] = sample["golds"] + + try: + pred = generate_answer( + model=model, + processor=processor, + image=image, + question=question, + device=device, + max_new_tokens=max_new_tokens, + temperature=temperature, + max_seq_len=max_seq_len, + ) + except (ValueError, RuntimeError) as error: + message = str(error).lower() + if not any( + marker in message + for marker in ( + "too long", + "max_position_embeddings", + "maximum context length", + "sequence length", + "truncation", + ) + ): + raise + + print("[WARNING] The prompt was too long. Skipping.") + print(f"{type(error).__name__}: {error}") + continue # Store result result: CocoResult = {"image_id": image_id, "caption": pred} @@ -668,7 +718,11 @@ def get_coco_scores_on_dataset( print("golds[:10]:", [repr(x) for x in gold_answers[:10]]) print("-" * 60) - assert results + if not results: + raise RuntimeError( + "No evaluation results were collected. " + "All samples may have been skipped due to prompt length errors." + ) assert images assert annotations diff --git a/tico/quantization/recipes/evaluation/vlm.py b/tico/quantization/recipes/evaluation/vlm.py index da75d4ce..4d5da93c 100644 --- a/tico/quantization/recipes/evaluation/vlm.py +++ b/tico/quantization/recipes/evaluation/vlm.py @@ -74,6 +74,7 @@ def evaluate_coco( return get_coco_scores_on_dataset( model=model, processor=processor, + dataset_name="coco", ds=dataset, device=device, max_seq_len=max_seq_len, diff --git a/tico/quantization/wrapq/examples/quantize_qwen3_vl_with_gptq.py b/tico/quantization/wrapq/examples/quantize_qwen3_vl_with_gptq.py index 660fe938..2b8b4195 100644 --- a/tico/quantization/wrapq/examples/quantize_qwen3_vl_with_gptq.py +++ b/tico/quantization/wrapq/examples/quantize_qwen3_vl_with_gptq.py @@ -582,40 +582,39 @@ def evaluate_model( return results -def evaluate_model_coco( +def evaluate_model_with_coco_score( model, processor, device: str, + dataset_name: str, nsamples: int = 50, max_seq_len: Optional[int] = None, ): """ - Evaluate a model on the mini COCO captioning benchmark. + Evaluate a model on the provided dataset with COCO score Args: model: Model to evaluate. processor: Hugging Face processor. device: Target device string. + dataset_name: Dataset name for evaluation nsamples: Number of evaluation samples. -1 means full dataset. max_seq_len: Optional maximum text sequence length. Returns: COCO metric dictionary. """ - with ( - io.StringIO() as buffer, - contextlib.redirect_stdout(buffer), - contextlib.redirect_stderr(buffer), - ): - ds, _ = get_dataset("coco", n=nsamples) - result = get_coco_scores_on_dataset( - model=model, - processor=processor, - ds=ds, - device=device, - max_seq_len=max_seq_len, - ) - return result + + ds, _ = get_dataset(dataset_name, n=nsamples) + result = get_coco_scores_on_dataset( + model=model, + processor=processor, + dataset_name=dataset_name, + ds=ds, + device=device, + max_seq_len=max_seq_len, + ) + return result def move_batch_to_device( @@ -1171,10 +1170,24 @@ def evaluate_original_model(model, processor, args): if "coco" in args.eval_tasks: print("\n=== COCO Evaluation (Original Model) ===") - results = evaluate_model_coco( + results = evaluate_model_with_coco_score( + model=model, + processor=processor, + device=args.device, + dataset_name="coco", + nsamples=args.nsamples_for_evaluation, + max_seq_len=args.max_seq_len, + ) + for metric, value in results.items(): + print(f"{metric:<10} {value:.3f}") + + if "llava_bench" in args.eval_tasks: + print("\n=== Llava Bench Evaluation (Original Model) ===") + results = evaluate_model_with_coco_score( model=model, processor=processor, device=args.device, + dataset_name="llava_bench", nsamples=args.nsamples_for_evaluation, max_seq_len=args.max_seq_len, ) @@ -1265,10 +1278,24 @@ def evaluate_quantized_model(model, processor, args, original_results=None) -> N if "coco" in args.eval_tasks: print("\n=== COCO Evaluation (Quantized Model) ===") - results = evaluate_model_coco( + results = evaluate_model_with_coco_score( + model=model, + processor=processor, + device=args.device, + dataset_name="coco", + nsamples=args.nsamples_for_evaluation, + max_seq_len=args.max_seq_len, + ) + for metric, value in results.items(): + print(f"{metric:<10} {value:.3f}") + + if "llava_bench" in args.eval_tasks: + print("\n=== Llava Bench Evaluation (Quantized Model) ===") + results = evaluate_model_with_coco_score( model=model, processor=processor, device=args.device, + dataset_name="llava_bench", nsamples=args.nsamples_for_evaluation, max_seq_len=args.max_seq_len, )