Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 60 additions & 21 deletions tico/quantization/evaluation/vlm_eval_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,22 @@ def get_item_coco(ex: dict[str, Any]) -> dict[str, Any]:
"""
return {
"image": ex["image"],
"question": ex.get("question", ""),
"golds": _extract_golds(ex.get("answer")),
"question": ex["question"],
"id": ex["id"],
"image_id": ex["question_id"],
"file_name": ex["file_name"],
"golds": ex["answer"],
}


def get_item_llava_bench_in_the_wild(ex: dict[str, Any]) -> dict[str, Any]:
return {
"image": ex["image"],
"question": ex["question"],
"id": ex["question_id"],
"image_id": ex["image_id"],
"file_name": ex["image_id"],
"golds": [ex["gpt_answer"]],
Comment on lines +190 to +197
Copy link
Copy Markdown
Contributor

@mhs4670go mhs4670go May 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to the dataset card, I think this currently might compute llava_bench scores at the wrong granularity.

get_item_llava_bench_in_the_wild() maps image_id to ex["image_id"], which is the image filename such as 001.jpg. However, LLaVA Bench can have multiple questions for the same image, each with a different question_id. Later, get_coco_scores_on_dataset() rebuilds predictions as a dict keyed by image_id, so multiple predictions for the same image overwrite each other:

res[img_id] = [caption]

As a result, only the last prediction for an image is kept, while the references may contain multiple GPT answers for different questions on that same image. That means the metric compares one question’s prediction against answers from multiple different questions, which can distort the score.

I think the evaluation key should be unique per QA sample, not per image file. For example:

def get_item_llava_bench_in_the_wild(ex: dict[str, Any]) -> dict[str, Any]:
    return {
        "image": ex["image"],
        "question": ex["question"],
        "id": ex["question_id"],
        "image_id": ex["question_id"],  # unique evaluation key
        "file_name": ex["image_id"],    # original image filename, if needed
        "golds": [ex["gpt_answer"]],
    }

}


Expand All @@ -202,6 +216,13 @@ def get_item_coco(ex: dict[str, Any]) -> dict[str, Any]:
"lmms-lab/COCO-Caption2017",
],
},
"llava_bench": {
"default_split": "train",
"adapter": get_item_llava_bench_in_the_wild,
"candidates": [
"lmms-lab/llava-bench-in-the-wild",
],
},
"wikitext2": {
"default_split": "test",
"adapter": None, # Text-only dataset, no adapter needed
Expand Down Expand Up @@ -428,6 +449,7 @@ def compute_bleu_scores(
def get_coco_scores_on_dataset(
model,
processor,
dataset_name: str,
ds: Iterable[dict[str, Any]],
device: str | torch.device,
max_new_tokens: int = 30,
Expand Down Expand Up @@ -488,25 +510,38 @@ def get_coco_scores_on_dataset(
images: list[CocoImage] = []
annotations: list[CocoAnnotation] = []

if "coco" in dataset_name.lower():
get_item = get_item_coco
elif "llava_bench" in dataset_name.lower():
get_item = get_item_llava_bench_in_the_wild
else:
raise ValueError(f"Invalid dataset_name={dataset_name}")

for i, ex in enumerate(ds, 1):
image: Any = ex["image"]
question: str = ex["question"]
id: int = ex["id"]
image_id: str = ex["question_id"]
file_name: str = ex["file_name"]
gold_answers: list[str] = ex["answer"]

# Generate caption
pred = generate_answer(
model=model,
processor=processor,
image=image,
question=question,
device=device,
max_new_tokens=max_new_tokens,
temperature=temperature,
max_seq_len=max_seq_len,
)
sample: dict[str, Any] = get_item(ex)

image: Any = sample["image"]
question: str = sample["question"]
id: int = sample["id"]
image_id: str = sample["image_id"]
file_name: str = sample["file_name"]
gold_answers: list[str] = sample["golds"]

try:
pred = generate_answer(
model=model,
processor=processor,
image=image,
question=question,
device=device,
max_new_tokens=max_new_tokens,
temperature=temperature,
max_seq_len=max_seq_len,
)
except (ValueError, RuntimeError) as error:
print(f"[WARNING] The prompt was too long. Skipping.")
print(f"{error}")
continue
Comment on lines +541 to +544
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current error handling catches both ValueError and RuntimeError and treats all of them as "prompt too long". This can silently hide real evaluation failures such as CUDA OOM, tensor/device mismatches, shape mismatches, or model/processor incompatibility. Could we make the skip path more conservative, so that only errors that actually indicate an over-long prompt are skipped and everything else is re-raised?

        except (ValueError, RuntimeError) as error:
            message = str(error).lower()
            if not any(
                marker in message
                for marker in (
                    "too long",
                    "max_position_embeddings",
                    "maximum context length",
                    "sequence length",
                )
            ):
                raise

            print("[WARNING] The prompt was too long. Skipping.")
            print(f"{type(error).__name__}: {error}")
            continue

Also, if every sample is skipped, returning {} makes the evaluation look successful but metric-less; raising RuntimeError would be safer.

     if not results:
-        print(
-            "[WARNING] No evaluation results were collected (all samples were skipped)."
-        )
-        return {}
+        raise RuntimeError(
+            "No evaluation results were collected. "
+            "All samples may have been skipped due to prompt length errors."
+        )


# Store result
result: CocoResult = {"image_id": image_id, "caption": pred}
Expand Down Expand Up @@ -536,7 +571,11 @@ def get_coco_scores_on_dataset(
print("golds[:10]:", [repr(x) for x in gold_answers[:10]])
print("-" * 60)

assert results
if not results:
print(
"[WARNING] No evaluation results were collected (all samples were skipped)."
)
return {}
assert images
assert annotations

Expand Down
63 changes: 63 additions & 0 deletions tico/quantization/wrapq/examples/quantize_qwen3_vl_with_gptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,40 @@ def evaluate_model(


def evaluate_model_coco(
    model,
    processor,
    device: str,
    dataset_name: str,
    nsamples: int = 50,
    max_seq_len: Optional[int] = None,
):
    """
    Evaluate a model with the COCO-style scoring routine on a named dataset.

    The dataset is loaded through `get_dataset` and then handed, together
    with the model and processor, to `get_coco_scores_on_dataset`.

    Args:
        model: Model to evaluate.
        processor: Hugging Face processor.
        device: Target device string.
        dataset_name: Dataset identifier; forwarded unchanged to both
            `get_dataset` and `get_coco_scores_on_dataset`.
        nsamples: Number of evaluation samples. -1 means full dataset.
        max_seq_len: Optional maximum text sequence length.

    Returns:
        The metric dictionary produced by `get_coco_scores_on_dataset`.
    """
    # `get_dataset` returns a pair; only the dataset itself is used here.
    eval_samples, _unused = get_dataset(dataset_name, n=nsamples)

    return get_coco_scores_on_dataset(
        dataset_name=dataset_name,
        ds=eval_samples,
        model=model,
        processor=processor,
        device=device,
        max_seq_len=max_seq_len,
    )


def evaluate_model_llava_bench(
model,
processor,
device: str,
Expand Down Expand Up @@ -595,6 +629,7 @@ def evaluate_model_coco(
result = get_coco_scores_on_dataset(
model=model,
processor=processor,
dataset_name="llava_bench",
ds=ds,
device=device,
max_seq_len=max_seq_len,
Expand Down Expand Up @@ -1159,6 +1194,20 @@ def evaluate_original_model(model, processor, args):
model=model,
processor=processor,
device=args.device,
dataset_name="coco",
nsamples=args.nsamples_for_evaluation,
max_seq_len=args.max_seq_len,
)
for metric, value in results.items():
print(f"{metric:<10} {value:.3f}")

if "llava_bench" in args.eval_tasks:
print("\n=== Llava Bench Evaluation (Original Model) ===")
results = evaluate_model_coco(
model=model,
processor=processor,
device=args.device,
dataset_name="llava_bench",
Comment on lines +1206 to +1210
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
results = evaluate_model_coco(
model=model,
processor=processor,
device=args.device,
dataset_name="llava_bench",
results = evaluate_model_llava_bench(
model=model,
processor=processor,
device=args.device,

nsamples=args.nsamples_for_evaluation,
max_seq_len=args.max_seq_len,
)
Expand Down Expand Up @@ -1252,6 +1301,20 @@ def evaluate_quantized_model(model, processor, args, original_results=None) -> N
model=model,
processor=processor,
device=args.device,
dataset_name="coco",
nsamples=args.nsamples_for_evaluation,
max_seq_len=args.max_seq_len,
)
for metric, value in results.items():
print(f"{metric:<10} {value:.3f}")

if "llava_bench" in args.eval_tasks:
print("\n=== Llama Bench Evaluation (Original Model) ===")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
print("\n=== Llama Bench Evaluation (Original Model) ===")
print("\n=== Llava Bench Evaluation (Quantized Model) ===")

results = evaluate_model_coco(
model=model,
processor=processor,
device=args.device,
dataset_name="llava_bench",
Comment on lines +1313 to +1317
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
results = evaluate_model_coco(
model=model,
processor=processor,
device=args.device,
dataset_name="llava_bench",
results = evaluate_model_llava_bench(
model=model,
processor=processor,
device=args.device,

nsamples=args.nsamples_for_evaluation,
max_seq_len=args.max_seq_len,
)
Expand Down
Loading