-
Notifications
You must be signed in to change notification settings - Fork 26
[quantization] Support Evaluation of Qwen3-VL With llava-bench-in-the-wild Dataset #718
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -179,8 +179,22 @@ def get_item_coco(ex: dict[str, Any]) -> dict[str, Any]: | |
| """ | ||
| return { | ||
| "image": ex["image"], | ||
| "question": ex.get("question", ""), | ||
| "golds": _extract_golds(ex.get("answer")), | ||
| "question": ex["question"], | ||
| "id": ex["id"], | ||
| "image_id": ex["question_id"], | ||
| "file_name": ex["file_name"], | ||
| "golds": ex["answer"], | ||
| } | ||
|
|
||
|
|
||
| def get_item_llava_bench_in_the_wild(ex: dict[str, Any]) -> dict[str, Any]: | ||
| return { | ||
| "image": ex["image"], | ||
| "question": ex["question"], | ||
| "id": ex["question_id"], | ||
| "image_id": ex["image_id"], | ||
| "file_name": ex["image_id"], | ||
| "golds": [ex["gpt_answer"]], | ||
| } | ||
|
|
||
|
|
||
|
|
@@ -202,6 +216,13 @@ def get_item_coco(ex: dict[str, Any]) -> dict[str, Any]: | |
| "lmms-lab/COCO-Caption2017", | ||
| ], | ||
| }, | ||
| "llava_bench": { | ||
| "default_split": "train", | ||
| "adapter": get_item_llava_bench_in_the_wild, | ||
| "candidates": [ | ||
| "lmms-lab/llava-bench-in-the-wild", | ||
| ], | ||
| }, | ||
| "wikitext2": { | ||
| "default_split": "test", | ||
| "adapter": None, # Text-only dataset, no adapter needed | ||
|
|
@@ -428,6 +449,7 @@ def compute_bleu_scores( | |
| def get_coco_scores_on_dataset( | ||
| model, | ||
| processor, | ||
| dataset_name: str, | ||
| ds: Iterable[dict[str, Any]], | ||
| device: str | torch.device, | ||
| max_new_tokens: int = 30, | ||
|
|
@@ -488,25 +510,38 @@ def get_coco_scores_on_dataset( | |
| images: list[CocoImage] = [] | ||
| annotations: list[CocoAnnotation] = [] | ||
|
|
||
| if "coco" in dataset_name.lower(): | ||
| get_item = get_item_coco | ||
| elif "llava_bench" in dataset_name.lower(): | ||
| get_item = get_item_llava_bench_in_the_wild | ||
| else: | ||
| raise ValueError(f"Invalid dataset_name={dataset_name}") | ||
|
|
||
| for i, ex in enumerate(ds, 1): | ||
| image: Any = ex["image"] | ||
| question: str = ex["question"] | ||
| id: int = ex["id"] | ||
| image_id: str = ex["question_id"] | ||
| file_name: str = ex["file_name"] | ||
| gold_answers: list[str] = ex["answer"] | ||
|
|
||
| # Generate caption | ||
| pred = generate_answer( | ||
| model=model, | ||
| processor=processor, | ||
| image=image, | ||
| question=question, | ||
| device=device, | ||
| max_new_tokens=max_new_tokens, | ||
| temperature=temperature, | ||
| max_seq_len=max_seq_len, | ||
| ) | ||
| sample: dict[str, Any] = get_item(ex) | ||
|
|
||
| image: Any = sample["image"] | ||
| question: str = sample["question"] | ||
| id: int = sample["id"] | ||
| image_id: str = sample["image_id"] | ||
| file_name: str = sample["file_name"] | ||
| gold_answers: list[str] = sample["golds"] | ||
|
|
||
| try: | ||
| pred = generate_answer( | ||
| model=model, | ||
| processor=processor, | ||
| image=image, | ||
| question=question, | ||
| device=device, | ||
| max_new_tokens=max_new_tokens, | ||
| temperature=temperature, | ||
| max_seq_len=max_seq_len, | ||
| ) | ||
| except (ValueError, RuntimeError) as error: | ||
| print(f"[WARNING] The prompt was too long. Skipping.") | ||
| print(f"{error}") | ||
| continue | ||
|
Comment on lines
+541
to
+544
Contributor
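There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Current error handling catches both `ValueError` and `RuntimeError` and reports every one of them as a too-long prompt, so unrelated failures would be skipped as well. Consider re-raising anything that is not a prompt-length error:
except (ValueError, RuntimeError) as error: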
message = str(error).lower()
if not any(
marker in message
for marker in (
"too long",
"max_position_embeddings",
"maximum context length",
"sequence length",
)
):
raise
print("[WARNING] The prompt was too long. Skipping.")
print(f"{type(error).__name__}: {error}")
    continue
Also, if every sample is skipped, returning an empty dict makes the failure easy to miss; consider raising instead:
if not results:
- print(
- "[WARNING] No evaluation results were collected (all samples were skipped)."
- )
- return {}
+ raise RuntimeError(
+ "No evaluation results were collected. "
+ "All samples may have been skipped due to prompt length errors."
+ ) |
||
|
|
||
| # Store result | ||
| result: CocoResult = {"image_id": image_id, "caption": pred} | ||
|
|
@@ -536,7 +571,11 @@ def get_coco_scores_on_dataset( | |
| print("golds[:10]:", [repr(x) for x in gold_answers[:10]]) | ||
| print("-" * 60) | ||
|
|
||
| assert results | ||
| if not results: | ||
| print( | ||
| "[WARNING] No evaluation results were collected (all samples were skipped)." | ||
| ) | ||
| return {} | ||
| assert images | ||
| assert annotations | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -567,6 +567,40 @@ def evaluate_model( | |||||||||||||||||||
|
|
||||||||||||||||||||
|
|
||||||||||||||||||||
| def evaluate_model_coco( | ||||||||||||||||||||
| model, | ||||||||||||||||||||
| processor, | ||||||||||||||||||||
| device: str, | ||||||||||||||||||||
| dataset_name: str, | ||||||||||||||||||||
| nsamples: int = 50, | ||||||||||||||||||||
| max_seq_len: Optional[int] = None, | ||||||||||||||||||||
| ): | ||||||||||||||||||||
| """ | ||||||||||||||||||||
| Evaluate a model on the mini COCO captioning benchmark. | ||||||||||||||||||||
|
|
||||||||||||||||||||
| Args: | ||||||||||||||||||||
| model: Model to evaluate. | ||||||||||||||||||||
| processor: Hugging Face processor. | ||||||||||||||||||||
| device: Target device string. | ||||||||||||||||||||
| nsamples: Number of evaluation samples. -1 means full dataset. | ||||||||||||||||||||
| max_seq_len: Optional maximum text sequence length. | ||||||||||||||||||||
|
|
||||||||||||||||||||
| Returns: | ||||||||||||||||||||
| COCO metric dictionary. | ||||||||||||||||||||
| """ | ||||||||||||||||||||
|
|
||||||||||||||||||||
| ds, _ = get_dataset(dataset_name, n=nsamples) | ||||||||||||||||||||
| result = get_coco_scores_on_dataset( | ||||||||||||||||||||
| model=model, | ||||||||||||||||||||
| processor=processor, | ||||||||||||||||||||
| dataset_name=dataset_name, | ||||||||||||||||||||
| ds=ds, | ||||||||||||||||||||
| device=device, | ||||||||||||||||||||
| max_seq_len=max_seq_len, | ||||||||||||||||||||
| ) | ||||||||||||||||||||
| return result | ||||||||||||||||||||
|
|
||||||||||||||||||||
|
|
||||||||||||||||||||
| def evaluate_model_llava_bench( | ||||||||||||||||||||
| model, | ||||||||||||||||||||
| processor, | ||||||||||||||||||||
| device: str, | ||||||||||||||||||||
|
|
@@ -595,6 +629,7 @@ def evaluate_model_coco( | |||||||||||||||||||
| result = get_coco_scores_on_dataset( | ||||||||||||||||||||
| model=model, | ||||||||||||||||||||
| processor=processor, | ||||||||||||||||||||
| dataset_name="llava_bench", | ||||||||||||||||||||
| ds=ds, | ||||||||||||||||||||
| device=device, | ||||||||||||||||||||
| max_seq_len=max_seq_len, | ||||||||||||||||||||
|
|
@@ -1159,6 +1194,20 @@ def evaluate_original_model(model, processor, args): | |||||||||||||||||||
| model=model, | ||||||||||||||||||||
| processor=processor, | ||||||||||||||||||||
| device=args.device, | ||||||||||||||||||||
| dataset_name="coco", | ||||||||||||||||||||
| nsamples=args.nsamples_for_evaluation, | ||||||||||||||||||||
| max_seq_len=args.max_seq_len, | ||||||||||||||||||||
| ) | ||||||||||||||||||||
| for metric, value in results.items(): | ||||||||||||||||||||
| print(f"{metric:<10} {value:.3f}") | ||||||||||||||||||||
|
|
||||||||||||||||||||
| if "llava_bench" in args.eval_tasks: | ||||||||||||||||||||
| print("\n=== Llava Bench Evaluation (Original Model) ===") | ||||||||||||||||||||
| results = evaluate_model_coco( | ||||||||||||||||||||
| model=model, | ||||||||||||||||||||
| processor=processor, | ||||||||||||||||||||
| device=args.device, | ||||||||||||||||||||
| dataset_name="llava_bench", | ||||||||||||||||||||
|
Comment on lines
+1206
to
+1210
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||
| nsamples=args.nsamples_for_evaluation, | ||||||||||||||||||||
| max_seq_len=args.max_seq_len, | ||||||||||||||||||||
| ) | ||||||||||||||||||||
|
|
@@ -1252,6 +1301,20 @@ def evaluate_quantized_model(model, processor, args, original_results=None) -> N | |||||||||||||||||||
| model=model, | ||||||||||||||||||||
| processor=processor, | ||||||||||||||||||||
| device=args.device, | ||||||||||||||||||||
| dataset_name="coco", | ||||||||||||||||||||
| nsamples=args.nsamples_for_evaluation, | ||||||||||||||||||||
| max_seq_len=args.max_seq_len, | ||||||||||||||||||||
| ) | ||||||||||||||||||||
| for metric, value in results.items(): | ||||||||||||||||||||
| print(f"{metric:<10} {value:.3f}") | ||||||||||||||||||||
|
|
||||||||||||||||||||
| if "llava_bench" in args.eval_tasks: | ||||||||||||||||||||
| print("\n=== Llama Bench Evaluation (Original Model) ===") | ||||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||
| results = evaluate_model_coco( | ||||||||||||||||||||
| model=model, | ||||||||||||||||||||
| processor=processor, | ||||||||||||||||||||
| device=args.device, | ||||||||||||||||||||
| dataset_name="llava_bench", | ||||||||||||||||||||
|
Comment on lines
+1313
to
+1317
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||
| nsamples=args.nsamples_for_evaluation, | ||||||||||||||||||||
| max_seq_len=args.max_seq_len, | ||||||||||||||||||||
| ) | ||||||||||||||||||||
|
|
||||||||||||||||||||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
According to the dataset card, I think this currently might compute `llava_bench` scores at the wrong granularity. `get_item_llava_bench_in_the_wild()` maps `image_id` to `ex["image_id"]`, which is the image filename such as `001.jpg`. However, LLaVA Bench can have multiple questions for the same image, each with a different `question_id`. Later, `get_coco_scores_on_dataset()` rebuilds predictions as a dict keyed by `image_id`, so multiple predictions for the same image overwrite each other.
As a result, only the last prediction for an image is kept, while the references may contain multiple GPT answers for different questions on that same image. That means the metric compares one question's prediction against answers from multiple different questions, which can distort the score.
I think the evaluation key should be unique per QA sample, not per image file. For example: