feat(evaluation): add VLMMetrics (#545)
Conversation
begumcig
left a comment
There was a problem hiding this comment.
Thank you so much David! I have asked some questions because I am not really familiar with litellm and the requirements using this specific framework brings. I think the metric classes themselves look really good, just needs some tweaks here and there! And really apologies for the delay in reviewing 💞
09e3f67 to
a045a38
Compare
Not up to standards ⛔🔴 Issues
|
| Category | Results |
|---|---|
| UnusedCode | 1 medium |
| Documentation | 62 minor |
| Security | 11 high |
| CodeStyle | 2 minor |
| Complexity | 3 critical 21 medium |
🟢 Metrics 538 complexity · 52 duplication
Metric Results Complexity 538 Duplication 52
TIP This summary will be updated as you push new changes. Give us feedback
begumcig
left a comment
There was a problem hiding this comment.
Hey David, great updates on these! I left some specific comments below, but the general theme is that there’s a bit of a mismatch between the current implementation and the original research methodologies.
The VLM infrastructure you’ve built is a great foundation, but we’re currently missing some of the logic that actually defines these benchmarks. Without those steps, our results might be hard to compare with the official papers. Let me know what you think!!
33d9135 to
8de57ad
Compare
|
@cursor review |
There was a problem hiding this comment.
Cursor Bugbot has reviewed your changes and found 3 potential issues.
Bugbot Autofix prepared fixes for all 3 issues found in the latest run.
- ✅ Fixed:
`aggregation` parameter stored but never applied in scoring — QAAccuracyMetric now validates and applies
`aggregation`, using strict binary scoring for `all_or_nothing` instead of always averaging per-question scores.
- QAAccuracyMetric now validates and applies
- ✅ Fixed: Chinese language heuristic misclassifies EN-only rows
- The OneIG language heuristic now only infers Chinese from prompt-only rows when actual CJK characters are present, preventing EN prompt-only rows from being routed to
`_zh` Q_D files.
- The OneIG language heuristic now only infers Chinese from prompt-only rows when actual CJK characters are present, preventing EN prompt-only rows from being routed to
- ✅ Fixed: Text score uses unnormalized distance favoring short texts
- TextScoreMetric now stores normalized Levenshtein distance by dividing by ground-truth character length so the reported mean matches character error rate behavior across varying text lengths.
Or push these changes by commenting:
@cursor push 6e88aefdac
Preview (6e88aefdac)
diff --git a/src/pruna/data/datasets/prompt.py b/src/pruna/data/datasets/prompt.py
--- a/src/pruna/data/datasets/prompt.py
+++ b/src/pruna/data/datasets/prompt.py
@@ -139,11 +139,15 @@
lang = row.get("language") or row.get("lang")
if isinstance(lang, str) and lang.lower() in {"zh", "zh-cn", "zh_cn", "chinese", "cn"}:
return True
- if row.get("prompt_zh"):
+ if row.get("prompt_zh") or row.get("prompt_cn"):
return True
prompt = row.get("prompt")
prompt_en = row.get("prompt_en")
- return bool(prompt and not (isinstance(prompt_en, str) and prompt_en.strip()))
+ if not (isinstance(prompt, str) and prompt.strip()):
+ return False
+ if isinstance(prompt_en, str) and prompt_en.strip():
+ return False
+ return any("\u4e00" <= ch <= "\u9fff" for ch in prompt)
def _oneig_qd_prefix(row: dict) -> str:
diff --git a/src/pruna/evaluation/metrics/metric_qa_accuracy.py b/src/pruna/evaluation/metrics/metric_qa_accuracy.py
--- a/src/pruna/evaluation/metrics/metric_qa_accuracy.py
+++ b/src/pruna/evaluation/metrics/metric_qa_accuracy.py
@@ -105,6 +105,11 @@
self.call_type = get_call_type_for_single_metric(call_type, self.default_call_type)
self.add_state("scores", [])
self.aggregation = kwargs.pop("aggregation", "mean")
+ if self.aggregation not in {"mean", "all_or_nothing"}:
+ raise ValueError(
+ "qa_accuracy aggregation must be one of {'mean', 'all_or_nothing'}. "
+ f"Got: {self.aggregation!r}."
+ )
def _extract_questions(self, gt: Any, n: int) -> List[List[str]]:
if isinstance(gt, (list, tuple)) and len(gt) >= n:
@@ -151,7 +156,10 @@
["Yes"] * len(questions),
response_format=self.response_format,
)
- score = float(np.mean(scores))
+ if self.aggregation == "all_or_nothing":
+ score = float(all(float(s) == 1.0 for s in scores))
+ else:
+ score = float(np.mean(scores))
self.scores.append(score)
def compute(self) -> MetricResult:
diff --git a/src/pruna/evaluation/metrics/metric_text_score.py b/src/pruna/evaluation/metrics/metric_text_score.py
--- a/src/pruna/evaluation/metrics/metric_text_score.py
+++ b/src/pruna/evaluation/metrics/metric_text_score.py
@@ -172,7 +172,7 @@
@MetricRegistry.register("text_score")
class TextScoreMetric(_BaseVLMOCRTextMetric):
"""
- OCR then mean Levenshtein distance to ground truth (lower is better).
+ OCR then mean normalized Levenshtein distance (character error rate, lower is better).
Registry: ``ocr_levenshtein`` (descriptive) and ``text_score`` (legacy).
@@ -240,7 +240,8 @@
def _accumulate_sample(self, text_gt: str, ocr_text: str) -> None:
norm_gt = normalize_text_simple(text_gt)
norm_ocr = normalize_text_simple(ocr_text)
- self.scores.append(levenshtein(norm_ocr, norm_gt))
+ gt_len = max(len(norm_gt), 1)
+ self.scores.append(float(levenshtein(norm_ocr, norm_gt) / gt_len))
def _compute_result_value(self) -> float:
if not self.scores:
diff --git a/tests/data/test_oneig_loader.py b/tests/data/test_oneig_loader.py
--- a/tests/data/test_oneig_loader.py
+++ b/tests/data/test_oneig_loader.py
@@ -34,6 +34,18 @@
assert prompt_mod._oneig_qd_prefix(row) == "anime_zh"
+def test_oneig_qd_prefix_prompt_only_en_row_stays_en() -> None:
+ """Prompt-only EN rows must not be misclassified as Chinese."""
+ row = {
+ "category": "General_Object",
+ "id": "001",
+ "prompt": "a red apple on a table",
+ "prompt_en": "",
+ "class": "None",
+ }
+ assert prompt_mod._oneig_qd_prefix(row) == "object"
+
+
def test_to_oneig_record_multilingualism_fills_questions() -> None:
"""Synthetic Multilingualism row resolves Q_D from merged index."""
qb = {"multilingualism_zh_000": {"questions": {"1": "现场是不是颁奖典礼?"}, "dependencies": {"1": [0]}}}
diff --git a/tests/evaluation/test_vlm_metrics.py b/tests/evaluation/test_vlm_metrics.py
--- a/tests/evaluation/test_vlm_metrics.py
+++ b/tests/evaluation/test_vlm_metrics.py
@@ -146,6 +146,22 @@
@pytest.mark.cpu
+def test_qa_accuracy_aggregation_modes() -> None:
+ mock_vlm = MagicMock(spec=BaseVLM)
+ mock_vlm.score.return_value = [1.0, 0.0]
+ images = _dummy_image(batch=1)
+ aux = [{"questions": {"1": "Q1", "2": "Q2"}}]
+
+ mean_metric = QAAccuracyMetric(vlm=mock_vlm, vlm_type="litellm", device="cpu", aggregation="mean")
+ mean_metric.update(["a prompt"], aux, images)
+ assert mean_metric.compute().result == pytest.approx(0.5)
+
+ strict_metric = QAAccuracyMetric(vlm=mock_vlm, vlm_type="litellm", device="cpu", aggregation="all_or_nothing")
+ strict_metric.update(["a prompt"], aux, images)
+ assert strict_metric.compute().result == pytest.approx(0.0)
+
+
+@pytest.mark.cpu
def test_get_vlm_returns_custom() -> None:
custom = MagicMock(spec=BaseVLM)
out = get_vlm(vlm=custom, vlm_type="litellm", model_name="gpt-4o")
@@ -183,6 +199,19 @@
@pytest.mark.cpu
+def test_text_score_uses_normalized_edit_distance() -> None:
+ mock_vlm = MagicMock(spec=BaseVLM)
+ mock_vlm.generate.side_effect = [["abxde"], ["ax"]]
+ metric = TextScoreMetric(vlm=mock_vlm, vlm_type="litellm", device="cpu")
+
+ metric.update(["p1"], ["abcde"], _dummy_image(batch=1))
+ metric.update(["p2"], ["ab"], _dummy_image(batch=1))
+
+ assert metric.scores == pytest.approx([0.2, 0.5])
+ assert metric.compute().result == pytest.approx(0.35)
+
+
+@pytest.mark.cpu
def test_text_score_registry_aliases() -> None:
from pruna.evaluation.metrics.registry import MetricRegistry

This Bugbot Autofix run was free. To enable autofix for future PRs, go to the Cursor dashboard.
Comment @cursor review or bugbot run to trigger another review on this PR
Reviewed by Cursor Bugbot for commit 7435679. Configure here.
|
The code change (>5K) is too large for me :( Could you point me to some place where my review can be particularly useful ? I can definitely dedicate some time to review some files, but not all 😓 |
|
Hi @llcnt, if you have time. You can take a look at the LLM2CLIP implementation :) |
llcnt
left a comment
There was a problem hiding this comment.
Thanks for all the work!!
I was not able to look at all the contribution, but the llm2clip looks good to me:)
Other general comments:
- I like the vlm split in between local and API call, but what motivates the choices of litellm? Where should one define the according API key ? We already have several (openai and replicate) API keys for external v/llm calls (in pbench for eg.). Could we unify all this ? Or at least make some documentation to explain what to use, where the user should define his/her API key, etc.?;
- following the comment above I would say it could be super useful if we can have some documentation attached (eg. what vlm should I use for a given metric?, a tutorial in pruna docs on local only and/or API based evaluation)
b8d9322 to
3414ad8
Compare
Hi, I decided to use Litellm because it offers a simpler approach to lightweight validation, avoiding the need for a large local VLM, which is normally required for evaluation that aligns with human judgment. In pbench, the required environment variables are explicitly listed in the benchmarks and currently rely only on OpenAI for VLMs as the judge (similar to the current implementation shown here). For Pruna OSS, I'll make sure to add documentation or examples in the docstring, since we normally do not add explicit metric usage documentation. |
1e72685 to
4f90d9c
Compare
Thanks for adding explanations and details in the main comment:) |
ff19c02 to
e1f9a87
Compare
…m_response` in test
…umentation - Deleted AlignmentScoreMetric class and its associated tests. - Updated documentation to reflect the removal of the alignment score metric. - Adjusted import statements and assertions in relevant files to ensure consistency. - Enhanced clarity in API key handling within the user manual.
…cies - Removed the `vlm_benchmark_helpers.py` file as it was no longer needed. - Updated `pyproject.toml` to remove the `mine-replicate` dependency. - Refined documentation and code comments for clarity in VLM metrics handling. - Adjusted test cases to reflect the removal of the benchmark helpers.
- Introduced `vlm_benchmark_helpers.py` to facilitate VLM-backed benchmark jobs and random image generation. - Added utility functions for processing prompts and auxiliary data in various metrics. - Updated `metric_img_edit_score.py`, `metric_vie_score.py`, `metric_vqa.py`, and `metric_vlm_base.py` to utilize new helper functions for improved clarity and functionality. - Implemented padding for VIEScore sub-scores to ensure consistent output lengths. - Enhanced documentation for new and modified functions to improve usability.
… functionality - Updated import paths for `StatefulVLMMeanScoresMetric` and related functions to reflect the new structure. - Added `StatefulVLMMeanScoresMetric` to the `__all__` exports in `__init__.py` for better accessibility. - Removed the deprecated `metric_vlm_base.py` and `viescore_prompts.py` files to streamline the codebase. - Enhanced the `VieScoreMetric` class with new prompt-building functions for improved clarity and functionality. - Updated relevant metric files to utilize the new import structure and functions.
- Deleted the `vlm_benchmark_helpers.py` file as it is no longer needed in the codebase. - Adjusted related documentation and test cases to reflect this removal. - Streamlined the VLM metrics handling by eliminating unnecessary helper functions.
…ctions - Updated docstrings for `batch_to_device` and `LLM2Vec` class methods to provide detailed parameter and return type descriptions. - Improved clarity in the documentation for methods such as `prepare_for_tokenization`, `tokenize`, and `forward`, ensuring consistency and better understanding of their functionalities. - Adjusted the `check_docstrings_content` function in tests to streamline validation checks.
- Updated import statements in `metric_img_edit_score.py`, `metric_qa_accuracy.py`, `metric_vie_score.py`, and `metric_vqa.py` to consolidate VLM-related imports. - Enhanced code clarity by grouping related imports together, improving maintainability across metric files.
e1f9a87 to
12dd9a4
Compare
|
This PR has been inactive for 10 days and is now marked as stale. |
begumcig
left a comment
There was a problem hiding this comment.
Thank you very much David, left some comments on the datasets parts for now!
| category : OneIGCategory | list[OneIGCategory] | None | ||
| Filter by dataset category (Anime_Stylization, Portrait, etc.) or class (fauvism, | ||
| watercolor, etc.). If None, returns all subsets. | ||
| reasoning_language : str, optional |
There was a problem hiding this comment.
Do we support different languages only for reasoning set? maybe we should change the name of this parameter
| records = [_to_oneig_record(dict(row), questions_by_key) for row in ds_raw] | ||
| reasoning_gt_en, reasoning_gt_zh = _fetch_oneig_reasoning_gt() | ||
|
|
||
| ds_en = load_dataset("OneIG-Bench/OneIG-Bench", "OneIG-Bench")["train"] # type: ignore[index] |
There was a problem hiding this comment.
I am a bit confused, if we are taking the language as a parameter why are we loading the dataset for both languages?
| qd_prefix = _oneig_qd_prefix(row) | ||
| lookup_key = f"{qd_prefix}_{prompt_id}" if qd_prefix else "" | ||
| q_info = questions_by_key.get(lookup_key, {}) | ||
| text = row.get("prompt") or row.get("prompt_en") or row.get("prompt_cn") or "" |
There was a problem hiding this comment.
I am probably missing something, if we are taking the language as an input from the user, why are we trying the both language options here?
| if is_text_rendering and text: | ||
| import re as _re | ||
|
|
||
| quoted = _re.findall(r'"([^"]+)"', text) |
There was a problem hiding this comment.
Can you explain the logic a bit here? So if the text has quotes we use it, if not we make the prompt the class of the prompt?
| "subset": "Text_Rendering" if row_category in ("Text_Rendering", "Text Rendering") else row_category, | ||
| "text_content": row_class if row_class != "None" else None, | ||
| "text": text, | ||
| "subset": "Text_Rendering" if is_text_rendering else row_category, |
There was a problem hiding this comment.
if i am not wrong there are many categories in oneig that are two words (Anime Stylization, Knowledge Reasoning, etc.) Why do we do the "_" underscore or spaces check only for one category? Is it necessary?
| if category is None: | ||
| return True | ||
| categories = [category] if not isinstance(category, list) else category | ||
| return "Multilingualism" in categories |
There was a problem hiding this comment.
I think all of the other categories also have their chinese counterparts, why do we only return true for the multilingualism case?
| fraction: float = 1.0, | ||
| train_sample_size: int | None = None, | ||
| test_sample_size: int | None = None, | ||
| reasoning_language: str = "EN", |
There was a problem hiding this comment.
doesn't multilingualism only exist for the chinese dataset?
| instructions: dict = json.loads(response_instructions.text) | ||
| judge_prompts: dict = json.loads(response_judge_prompts.text) | ||
|
|
||
| image_folder = ensure_imgedit_benchmark_images_extracted() |
There was a problem hiding this comment.
we call this function here, but then on 599 we call load_imgedit_source_image_bytes function which then again calls this function, what's the reason for this?


Benchmark integration (
`openai/gpt-4o`, one batch per job). Each row loads one test batch, generates an image with Replicate (
prunaai/p-image(txt2img),prunaai/p-image-edit(edit)): text-to-image when there is no source image, or image edit whensource_image_bytesis present in the batch. Scores useopenai/gpt-4o(litellm) on CPU (shared VLM). Pred and optional source PNGs are saved undermedia/next to this report; images use./media/<file>.png(relative to this folder) for preview.00__GEditBench__vie_scoreGEditBench)vie_score0.9486832980505138(vie_score, higher_is_better=True)edit (source image + prompt)Change the background to a city street.Input prompt(s) (dataset / metric batch)
Auxiliary context (non-image fields)
category:background_changeSource image (dataset, edit input)
PNG on disk:
./media/00__GEditBench__vie_score_source.pngModel output (pred)
PNG on disk:
./media/00__GEditBench__vie_score_pred.pngJSON record (inputs, pred summary, metric)
{ "benchmark_lookup_key": "GEditBench", "benchmark_name": "GEditBench", "metric_name": "vie_score", "dataset_name": "GEditBench", "vlm_type": "litellm", "model_name": "openai/gpt-4o", "device": "cpu", "inputs": { "prompts": [ "Change the background to a city street." ], "auxiliary_0": { "category": "background_change", "source_image_bytes": { "bytes_len": 471052 } } }, "pred": { "shape": [ 1, 3, 1184, 896 ], "dtype": "torch.float32", "note": "replicate prunaai/p-image / prunaai/p-image-edit" }, "metric_result": { "name": "vie_score", "result": 0.9486832980505138, "higher_is_better": true, "metric_units": null }, "replicate": { "replicate_model_txt2img": "prunaai/p-image", "replicate_model_edit": "prunaai/p-image-edit", "replicate_model": "prunaai/p-image-edit", "output_url": "https://replicate.delivery/xezq/eNqOMpfW7btHzEtTjaHa9jKl9zXz5x0jsqYIrG4UCtXVIlbWA/output_603804.jpeg", "edit": true, "prompt_used": "Change the background to a city street." } }01__GenAIBench__vqaGenAIBench)vqa0.9999987517298831(vqa, higher_is_better=True)txt2imgA baker pulling freshly baked bread out of an oven in a bakery.Input prompt(s) (dataset / metric batch)
Input image: none (text-to-image from prompt only).
Model output (pred)
PNG on disk:
./media/01__GenAIBench__vqa_pred.pngJSON record (inputs, pred summary, metric)
{ "benchmark_lookup_key": "GenAIBench", "benchmark_name": "GenAI Bench", "metric_name": "vqa", "dataset_name": "GenAIBench", "vlm_type": "litellm", "model_name": "openai/gpt-4o", "device": "cpu", "inputs": { "prompts": [ "A baker pulling freshly baked bread out of an oven in a bakery." ], "auxiliary_0": { "Index": 0, "Tags": { "advanced": [], "basic": [ "Attribute", "Scene", "Spatial Relation", "Action Relation" ] }, "HumanRatings": { "DALLE_3": [ 5, 5, 5 ], "DeepFloyd_I_XL_v1": [ 4, 3, 3 ], "Midjourney_6": [ 3, 3, 3 ], "SDXL_2_1": [ 3, 2, 2 ], "SDXL_Base": [ 3, 3, 2 ], "SDXL_Turbo": [ 2, 4, 3 ] }, "DALLE_3": "<PIL.PngImagePlugin.PngImageFile image mode=RGB size=1024x1024 at 0x149748D90>", "DeepFloyd_I_XL_v1": "<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=256x256 at 0x14A041510>", "Midjourney_6": "<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x1024 at 0x14973CA10>", "SDXL_2_1": "<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=768x768 at 0x14973F250>", "SDXL_Base": "<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x1024 at 0x14973CB90>", "SDXL_Turbo": "<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512 at 0x14A059D10>" } }, "pred": { "shape": [ 1, 3, 1024, 1024 ], "dtype": "torch.float32", "note": "replicate prunaai/p-image / prunaai/p-image-edit" }, "metric_result": { "name": "vqa", "result": 0.9999987517298831, "higher_is_better": true, "metric_units": null }, "replicate": { "replicate_model_txt2img": "prunaai/p-image", "replicate_model_edit": "prunaai/p-image-edit", "replicate_model": "prunaai/p-image", "output_url": "https://replicate.delivery/xezq/suLfzmeuWBi1V0hWT3KoMGicDu5xQJcYNEtlendkN4aMRK3sA/output_884390.jpeg", "edit": false, "prompt_used": "A baker pulling freshly baked bread out of an oven in a bakery." } }02__GenEval__qa_accuracyGenEval)qa_accuracy1.0(qa_accuracy, higher_is_better=True)txt2imga photo of a benchInput prompt(s) (dataset / metric batch)
Input image: none (text-to-image from prompt only).
Model output (pred)
PNG on disk:
./media/02__GenEval__qa_accuracy_pred.pngJSON record (inputs, pred summary, metric)
{ "benchmark_lookup_key": "GenEval", "benchmark_name": "GenEval", "metric_name": "qa_accuracy", "dataset_name": "GenEval", "vlm_type": "litellm", "model_name": "openai/gpt-4o", "device": "cpu", "inputs": { "prompts": [ "a photo of a bench" ], "auxiliary_0": { "tag": "single_object", "questions": [ "Does the image contain exactly 1 bench(s)?" ] } }, "pred": { "shape": [ 1, 3, 1024, 1024 ], "dtype": "torch.float32", "note": "replicate prunaai/p-image / prunaai/p-image-edit" }, "metric_result": { "name": "qa_accuracy", "result": 1.0, "higher_is_better": true, "metric_units": "accuracy" }, "replicate": { "replicate_model_txt2img": "prunaai/p-image", "replicate_model_edit": "prunaai/p-image-edit", "replicate_model": "prunaai/p-image", "output_url": "https://replicate.delivery/xezq/xqMc6xJWth5yFdO1ENsRszGxQPeJwW6pnU0e61QrTUaqIlbWA/output_876077.jpeg", "edit": false, "prompt_used": "a photo of a bench" } }03__ImgEdit__img_edit_scoreImgEdit)img_edit_score1.0(img_edit_score, higher_is_better=True)edit (source image + prompt)Replace the giant boot in the image with a large coffee mug.Input prompt(s) (dataset / metric batch)
Auxiliary context (non-image fields)
category:replaceimage_id:daily object/000276684.jpgSource image (dataset, edit input)
PNG on disk:
./media/03__ImgEdit__img_edit_score_source.pngModel output (pred)
PNG on disk:
./media/03__ImgEdit__img_edit_score_pred.pngJSON record (inputs, pred summary, metric)
{ "benchmark_lookup_key": "ImgEdit", "benchmark_name": "ImgEdit", "metric_name": "img_edit_score", "dataset_name": "ImgEdit", "vlm_type": "litellm", "model_name": "openai/gpt-4o", "device": "cpu", "inputs": { "prompts": [ "Replace the giant boot in the image with a large coffee mug." ], "auxiliary_0": { "category": "replace", "image_id": "daily object/000276684.jpg", "judge_prompt": "\nYou are a data rater specializing in grading image replacement edits. You will be given two images (before and after editing) and the corresponding editing instructions. Your task is to evaluate the replacement editing effect on a 5-point scale from three perspectives:\n\nPrompt Compliance\n1 Target not replaced, or an unrelated object edited.\n2 Only part of the target replaced, or wrong class/des\u2026", "source_image_bytes": { "bytes_len": 99889 } } }, "pred": { "shape": [ 1, 3, 1024, 1024 ], "dtype": "torch.float32", "note": "replicate prunaai/p-image / prunaai/p-image-edit" }, "metric_result": { "name": "img_edit_score", "result": 1.0, "higher_is_better": true, "metric_units": null }, "replicate": { "replicate_model_txt2img": "prunaai/p-image", "replicate_model_edit": "prunaai/p-image-edit", "replicate_model": "prunaai/p-image-edit", "output_url": "https://replicate.delivery/xezq/7ebbTpmrlr0XBC5zeisofUlYs4fR9nH61DawqpGojDUGjUuZB/output_264671.jpeg", "edit": true, "prompt_used": "Replace the giant boot in the image with a large coffee mug." } }04__LongTextBench__text_scoreLongTextBench)text_score0.7261904761904762(text_score, higher_is_better=True)txt2imgA lively and bustling urban street scene captured at daytime, featuring a colorful assortment of vibrant storefronts lining the busy sidewalk, with diverse groups of pedestrians leisurely strolling past shop windows, chatting or hurrying along on daily errands. 
Prominently displayed above an inviting, trendy coffee shop is a strikingly large, colorful billboard that clearly reads "FRESH BREWS, EVERY MORNING, EVERY DAY", set in bold, stylish, and modern typography that immediately captures attent…Input prompt(s) (dataset / metric batch)
Auxiliary context (non-image fields)
category:signInput image: none (text-to-image from prompt only).
Model output (pred)
PNG on disk:
./media/04__LongTextBench__text_score_pred.pngJSON record (inputs, pred summary, metric)
{ "benchmark_lookup_key": "LongTextBench", "benchmark_name": "Long Text Bench", "metric_name": "text_score", "dataset_name": "LongTextBench", "vlm_type": "litellm", "model_name": "openai/gpt-4o", "device": "cpu", "inputs": { "prompts": [ "A lively and bustling urban street scene captured at daytime, featuring a colorful assortment of vibrant storefronts lining the busy sidewalk, with diverse groups of pedestrians leisurely strolling past shop windows, chatting or hurrying along on daily errands. Prominently displayed above an inviting, trendy coffee shop is a strikingly large, colorful billboard that clearly reads \"FRESH BREWS, EVERY MORNING, EVERY DAY\", set in bold, stylish, and modern typography that immediately captures attent\u2026" ], "auxiliary_0": { "category": "sign", "length": "short", "text_content": [ "FRESH BREWS, EVERY MORNING, EVERY DAY", "Locally Roasted Since 1998", "CENTRAL PARK - 2 BLOCKS" ], "text_length": 15, "prompt_id": 0 } }, "pred": { "shape": [ 1, 3, 1024, 1024 ], "dtype": "torch.float32", "note": "replicate prunaai/p-image / prunaai/p-image-edit" }, "metric_result": { "name": "text_score", "result": 0.7261904761904762, "higher_is_better": true, "metric_units": null }, "replicate": { "replicate_model_txt2img": "prunaai/p-image", "replicate_model_edit": "prunaai/p-image-edit", "replicate_model": "prunaai/p-image", "output_url": "https://replicate.delivery/xezq/DQkMz7XRsF7NDB30t4RM6VGzS2CEr6WFF3pms2lGukCOS5mF/output_278123.jpeg", "edit": false, "prompt_used": "A lively and bustling urban street scene captured at daytime, featuring a colorful assortment of vibrant storefronts lining the busy sidewalk, with diverse groups of pedestrians leisurely strolling past shop windows, chatting or hurrying along on daily errands. 
Prominently displayed above an inviting, trendy coffee shop is a strikingly large, colorful billboard that clearly reads \"FRESH BREWS, EVERY MORNING, EVERY DAY\", set in bold, stylish, and modern typography that immediately captures attention. Elegantly positioned just below this main statement, the tagline in soft cursive font reads \"Locally Roasted Since 1998\", emphasizing tradition and authenticity. Nearby stands a classic directional street sign, neatly pointing pedestrians toward a calm oasis amid city life, clearly labeled \"CENTRAL PARK - 2 BLOCKS\", guiding visitors with its distinctly readable font. All signage is brightly lit, eye-catching, and clearly contrasts against the detailed backdrop of dynamic city buildings, enhancing the lively urban atmosphere." } }05__OneIGAnimeStylization__oneig_alignmentOneIGAnimeStylization)oneig_alignment0.7(oneig_alignment, higher_is_better=True)txt2img4boys, 5girls, multiple boys, multiple girls, nun, anime, white background, arm up, hand up, hands up, heart hands, looking at another, looking at viewer, blush, happy tears, light blush, light smile, wrinkled skin, horns, mustache, no mouth, open mouth, sweatdrop, blue eyes, red eyes, heart-shaped pupils, symbol-shaped pupils, eyepatch, blonde hair, brown hair, facial hair, grey hair, long hair, red hair, short hair, swept bangs, twintails, bare legs, tears, black dress, black jacket, collared …Input prompt(s) (dataset / metric batch)
Auxiliary context (non-image fields)
category:Anime_Stylizationsubset:Anime_Stylizationquestions: 21 keyed slots (full list in JSON)Input image: none (text-to-image from prompt only).
Model output (pred)
PNG on disk:
./media/05__OneIGAnimeStylization__oneig_alignment_pred.pngJSON record (inputs, pred summary, metric)
{ "benchmark_lookup_key": "OneIGAnimeStylization", "benchmark_name": "OneIG Anime Stylization", "metric_name": "oneig_alignment", "dataset_name": "OneIGAnimeStylization", "vlm_type": "litellm", "model_name": "openai/gpt-4o", "device": "cpu", "inputs": { "prompts": [ "4boys, 5girls, multiple boys, multiple girls, nun, anime, white background, arm up, hand up, hands up, heart hands, looking at another, looking at viewer, blush, happy tears, light blush, light smile, wrinkled skin, horns, mustache, no mouth, open mouth, sweatdrop, blue eyes, red eyes, heart-shaped pupils, symbol-shaped pupils, eyepatch, blonde hair, brown hair, facial hair, grey hair, long hair, red hair, short hair, swept bangs, twintails, bare legs, tears, black dress, black jacket, collared \u2026" ], "auxiliary_0": { "subset": "Anime_Stylization", "text_content": null, "category": "Anime_Stylization", "class": "None", "questions": { "1": "Are there boys?", "10": "Are the arms up?", "11": "Is the hand up?", "12": "Are the hands up?", "13": "Are the people looking at one another?", "14": "Are the people looking at the viewer?", "15": "Do the people have a blush?", "16": "Do the people have happy tears?", "17": "Is there a light blush on the people?", "18": "Is there a light smile on the people?", "19": "Is the skin wrinkled?", "2": "Are there four boys?", "20": "Do the people have horns?", "21": null, "3": "Are there girls?", "4": "Are there five girls?", "5": "Is there a nun?", "6": "Is this anime?", "7": "Is the background white?", "8": "Are the boys in the background?", "9": "Are the girls in the background?" 
}, "dependencies": { "1": [ 0 ], "10": [ 0 ], "11": [ 0 ], "12": [ 0 ], "13": [ 0 ], "14": [ 0 ], "15": [ 0 ], "16": [ 0 ], "17": [ 15 ], "18": [ 15 ], "19": [ 0 ], "2": [ 1 ], "20": [ 0 ], "21": null, "3": [ 0 ], "4": [ 3 ], "5": [ 0 ], "6": [ 0 ], "7": [ 0 ], "8": [ 1 ], "9": [ 3 ] }, "reasoning_gt_answer": null } }, "pred": { "shape": [ 1, 3, 1024, 1024 ], "dtype": "torch.float32", "note": "replicate prunaai/p-image / prunaai/p-image-edit" }, "metric_result": { "name": "oneig_alignment", "result": 0.7, "higher_is_better": true, "metric_units": "alignment" }, "replicate": { "replicate_model_txt2img": "prunaai/p-image", "replicate_model_edit": "prunaai/p-image-edit", "replicate_model": "prunaai/p-image", "output_url": "https://replicate.delivery/xezq/H4VKp1AEsG7bAd7qXXYM4a9QA4KiUtDf6DELppssjfRBJlbWA/output_624970.jpeg", "edit": false, "prompt_used": "4boys, 5girls, multiple boys, multiple girls, nun, anime, white background, arm up, hand up, hands up, heart hands, looking at another, looking at viewer, blush, happy tears, light blush, light smile, wrinkled skin, horns, mustache, no mouth, open mouth, sweatdrop, blue eyes, red eyes, heart-shaped pupils, symbol-shaped pupils, eyepatch, blonde hair, brown hair, facial hair, grey hair, long hair, red hair, short hair, swept bangs, twintails, bare legs, tears, black dress, black jacket, collared shirt, dress, gloves, hat, jacket, off-shoulder dress, pants, red shirt, shirt, top hat, veil, white dress, white gloves, white jacket, white pants, white shirt, white veil, bare shoulders, black necktie, blue ribbon, jewelry, long sleeves, necktie, off shoulder, ribbon, finger counting, absurdres, chain necklace, highres, necklace, :3, beak, bonnet, burning, chain, confetti, everyone, fur trim, heart, middle w, old, old man, pendant, sanpaku, teardrop, w, 4-finger heart hands" } }06__OneIGGeneralObject__oneig_alignmentOneIGGeneralObject)oneig_alignment1.0(oneig_alignment, higher_is_better=True)txt2imgImagine a charming and 
magical boho bedroom for a toddler girl. The room features a cozy, low bed with a frame made of natural wood branches, creating the illusion of sleeping in a forest den. The bedding is soft and inviting, adorned with patterns of woodland creatures such as foxes, deer, and owls, set against green and earthy tones. The walls are painted in a soothing moss green, enhancing the forest-like atmosphere. One wall boasts a large mural of an enchanted forest with whimsical creature…Input prompt(s) (dataset / metric batch)
Auxiliary context (non-image fields)
category:General_Objectsubset:General_Objectquestions: 21 keyed slots (full list in JSON)Input image: none (text-to-image from prompt only).
Model output (pred)
PNG on disk:
./media/06__OneIGGeneralObject__oneig_alignment_pred.pngJSON record (inputs, pred summary, metric)
{ "benchmark_lookup_key": "OneIGGeneralObject", "benchmark_name": "OneIG General Object", "metric_name": "oneig_alignment", "dataset_name": "OneIGGeneralObject", "vlm_type": "litellm", "model_name": "openai/gpt-4o", "device": "cpu", "inputs": { "prompts": [ "Imagine a charming and magical boho bedroom for a toddler girl. The room features a cozy, low bed with a frame made of natural wood branches, creating the illusion of sleeping in a forest den. The bedding is soft and inviting, adorned with patterns of woodland creatures such as foxes, deer, and owls, set against green and earthy tones. The walls are painted in a soothing moss green, enhancing the forest-like atmosphere. One wall boasts a large mural of an enchanted forest with whimsical creature\u2026" ], "auxiliary_0": { "subset": "General_Object", "text_content": null, "category": "General_Object", "class": "None", "questions": { "1": "Is there a bedroom?", "10": "Are the walls moss green?", "11": "Is there a wall mural?", "12": "Does the wall mural have a pattern of an enchanted forest and whimsical creatures?", "13": "Is there a canopy of greenery and fairy lights?", "14": "Is the canopy hanging above the bed?", "15": "Is there a play area?", "16": "Is there a teepee tent in the play area?", "17": "Is the teepee tent made from natural fabrics with leaf and vine patterns?", "18": "Is there a wooden bookshelf resembling tree trunks?", "19": "Is there a rug with leaf and acorn patterns?", "2": "Is the bedroom boho style?", "20": "Is the play area a cozy retreat?", "21": null, "3": "Is the bedroom for a toddler girl?", "4": "Is the bed cozy and low?", "5": "Is the bed frame made of natural wood branches?", "6": "Is the bed in the room?", "7": "Is the bedding soft and inviting?", "8": "Does the bedding have a pattern of woodland creatures?", "9": "Are the bedding colors green and earthy tones?" 
}, "dependencies": { "1": [ 0 ], "10": [ 0 ], "11": [ 1 ], "12": [ 11 ], "13": [ 1 ], "14": [ 13 ], "15": [ 1 ], "16": [ 15 ], "17": [ 16 ], "18": [ 1 ], "19": [ 1 ], "2": [ 1 ], "20": [ 15 ], "21": null, "3": [ 1 ], "4": [ 0 ], "5": [ 0 ], "6": [ 1 ], "7": [ 0 ], "8": [ 0 ], "9": [ 0 ] }, "reasoning_gt_answer": null } }, "pred": { "shape": [ 1, 3, 1024, 1024 ], "dtype": "torch.float32", "note": "replicate prunaai/p-image / prunaai/p-image-edit" }, "metric_result": { "name": "oneig_alignment", "result": 1.0, "higher_is_better": true, "metric_units": "alignment" }, "replicate": { "replicate_model_txt2img": "prunaai/p-image", "replicate_model_edit": "prunaai/p-image-edit", "replicate_model": "prunaai/p-image", "output_url": "https://replicate.delivery/xezq/TRpfx23UfKpWxEFOv6nzUXuxUcJejPbVmDetKiou51aXnUuZB/output_640175.jpeg", "edit": false, "prompt_used": "Imagine a charming and magical boho bedroom for a toddler girl. The room features a cozy, low bed with a frame made of natural wood branches, creating the illusion of sleeping in a forest den. The bedding is soft and inviting, adorned with patterns of woodland creatures such as foxes, deer, and owls, set against green and earthy tones. The walls are painted in a soothing moss green, enhancing the forest-like atmosphere. One wall boasts a large mural of an enchanted forest with whimsical creatures peeking from behind trees and bushes. Above the bed, a canopy of greenery and small fairy lights hangs, simulating the lush canopy of a forest and offering a magical light source. A dedicated play area includes a teepee tent made from natural fabrics with leaf and vine patterns. Inside, a plush mat and forest animal toys make it a cozy retreat. A small wooden bookshelf, resembling tree trunks, contains books about nature and fairy tales. A textured rug with patterns of leaves and acorns covers the floor, adding warmth for play." 
} }07__OneIGMultilingualism__oneig_alignmentOneIGMultilingualism)oneig_alignment0.8(oneig_alignment, higher_is_better=True)txt2img热闹的颁奖典礼现场,聚光灯耀眼,舞台上的杨紫身着一袭华丽晚礼服,面带自信微笑,手持奖杯,台下粉丝们挥舞着灯牌欢呼雀跃,背景的大屏幕闪烁着她的经典影视画面。Input prompt(s) (dataset / metric batch)
Auxiliary context (non-image fields)
category:Multilingualismsubset:Multilingualismquestions: 21 keyed slots (full list in JSON)Input image: none (text-to-image from prompt only).
Model output (pred)
PNG on disk:
./media/07__OneIGMultilingualism__oneig_alignment_pred.pngJSON record (inputs, pred summary, metric)
{ "benchmark_lookup_key": "OneIGMultilingualism", "benchmark_name": "OneIG Multilingualism", "metric_name": "oneig_alignment", "dataset_name": "OneIGMultilingualism", "vlm_type": "litellm", "model_name": "openai/gpt-4o", "device": "cpu", "inputs": { "prompts": [ "\u70ed\u95f9\u7684\u9881\u5956\u5178\u793c\u73b0\u573a\uff0c\u805a\u5149\u706f\u8000\u773c\uff0c\u821e\u53f0\u4e0a\u7684\u6768\u7d2b\u8eab\u7740\u4e00\u88ad\u534e\u4e3d\u665a\u793c\u670d\uff0c\u9762\u5e26\u81ea\u4fe1\u5fae\u7b11\uff0c\u624b\u6301\u5956\u676f\uff0c\u53f0\u4e0b\u7c89\u4e1d\u4eec\u6325\u821e\u7740\u706f\u724c\u6b22\u547c\u96c0\u8dc3\uff0c\u80cc\u666f\u7684\u5927\u5c4f\u5e55\u95ea\u70c1\u7740\u5979\u7684\u7ecf\u5178\u5f71\u89c6\u753b\u9762\u3002" ], "auxiliary_0": { "subset": "Multilingualism", "text_content": null, "category": "Multilingualism", "class": "None", "questions": { "1": "\u73b0\u573a\u662f\u4e0d\u662f\u9881\u5956\u5178\u793c\uff1f", "10": "\u9881\u5956\u5178\u793c\u73b0\u573a\u662f\u5426\u70ed\u95f9\uff1f", "11": "\u5973\u661f\u662f\u5426\u8eab\u7a7f\u665a\u793c\u670d\uff1f", "12": "\u665a\u793c\u670d\u662f\u5426\u534e\u4e3d\uff1f", "13": "\u5973\u661f\u662f\u5426\u9762\u5e26\u81ea\u4fe1\u7684\u5fae\u7b11\uff1f", "14": "\u5973\u661f\u662f\u5426\u624b\u6301\u5956\u676f\uff1f", "15": "\u7c89\u4e1d\u4eec\u662f\u5426\u6325\u821e\u706f\u724c\uff1f", "16": "\u7c89\u4e1d\u4eec\u662f\u5426\u6b22\u547c\u96c0\u8dc3\uff1f", "17": "\u5927\u5c4f\u5e55\u7684\u80cc\u666f\u662f\u5426\u95ea\u70c1\uff1f", "18": "\u5927\u5c4f\u5e55\u662f\u5426\u663e\u793a\u5973\u661f\u7684\u7ecf\u5178\u5f71\u89c6\u753b\u9762\uff1f", "19": "\u5973\u661f\u662f\u5426\u5728\u821e\u53f0\u4e0a\uff1f", "2": "\u662f\u4e0d\u662f\u6709\u805a\u5149\u706f\uff1f", "20": "\u7c89\u4e1d\u4eec\u662f\u5426\u5728\u821e\u53f0\u4e0b\uff1f", "21": null, "3": "\u662f\u4e0d\u662f\u6709\u821e\u53f0\uff1f", "4": "\u821e\u53f0\u4e0a\u662f\u5426\u6709\u5973\u661f\uff1f", "5": "\u5973\u661f\u662f\u5426\u7a7f\u7740\u665a\u793c\u670d\uff1f", "6": 
"\u662f\u5426\u6709\u5956\u676f\uff1f", "7": "\u7c89\u4e1d\u4eec\u662f\u5426\u6325\u821e\u706f\u724c\uff1f", "8": "\u662f\u4e0d\u662f\u6709\u7c89\u4e1d\u4eec\uff1f", "9": "\u662f\u4e0d\u662f\u6709\u5927\u5c4f\u5e55\uff1f" }, "dependencies": { "1": [ 0 ], "10": [ 1 ], "11": [ 4 ], "12": [ 5 ], "13": [ 4 ], "14": [ 4, 6 ], "15": [ 8, 7 ], "16": [ 8 ], "17": [ 9 ], "18": [ 9 ], "19": [ 3, 4 ], "2": [ 0 ], "20": [ 3, 8 ], "21": null, "3": [ 0 ], "4": [ 0 ], "5": [ 4 ], "6": [ 4 ], "7": [ 8 ], "8": [ 0 ], "9": [ 0 ] }, "reasoning_gt_answer": null } }, "pred": { "shape": [ 1, 3, 1024, 1024 ], "dtype": "torch.float32", "note": "replicate prunaai/p-image / prunaai/p-image-edit" }, "metric_result": { "name": "oneig_alignment", "result": 0.8, "higher_is_better": true, "metric_units": "alignment" }, "replicate": { "replicate_model_txt2img": "prunaai/p-image", "replicate_model_edit": "prunaai/p-image-edit", "replicate_model": "prunaai/p-image", "output_url": "https://replicate.delivery/xezq/eruy8eJERci18U2h54amfggiYboSQf1CcEgWqV8YmwBcrUuZB/output_709574.jpeg", "edit": false, "prompt_used": "\u70ed\u95f9\u7684\u9881\u5956\u5178\u793c\u73b0\u573a\uff0c\u805a\u5149\u706f\u8000\u773c\uff0c\u821e\u53f0\u4e0a\u7684\u6768\u7d2b\u8eab\u7740\u4e00\u88ad\u534e\u4e3d\u665a\u793c\u670d\uff0c\u9762\u5e26\u81ea\u4fe1\u5fae\u7b11\uff0c\u624b\u6301\u5956\u676f\uff0c\u53f0\u4e0b\u7c89\u4e1d\u4eec\u6325\u821e\u7740\u706f\u724c\u6b22\u547c\u96c0\u8dc3\uff0c\u80cc\u666f\u7684\u5927\u5c4f\u5e55\u95ea\u70c1\u7740\u5979\u7684\u7ecf\u5178\u5f71\u89c6\u753b\u9762\u3002" } }08__OneIGPortrait__oneig_alignmentOneIGPortrait)oneig_alignment1.0(oneig_alignment, higher_is_better=True)txt2imgTwo individuals sit closely on a charming park bench beneath a vivid, sunlit sky. The photograph should capture genuine people, emphasizing a realistic and authentic style.Input prompt(s) (dataset / metric batch)
Auxiliary context (non-image fields)
category:Portraitsubset:Portraitquestions: 21 keyed slots (full list in JSON)Input image: none (text-to-image from prompt only).
Model output (pred)
PNG on disk:
./media/08__OneIGPortrait__oneig_alignment_pred.pngJSON record (inputs, pred summary, metric)
{ "benchmark_lookup_key": "OneIGPortrait", "benchmark_name": "OneIG Portrait", "metric_name": "oneig_alignment", "dataset_name": "OneIGPortrait", "vlm_type": "litellm", "model_name": "openai/gpt-4o", "device": "cpu", "inputs": { "prompts": [ "Two individuals sit closely on a charming park bench beneath a vivid, sunlit sky. The photograph should capture genuine people, emphasizing a realistic and authentic style." ], "auxiliary_0": { "subset": "Portrait", "text_content": null, "category": "Portrait", "class": "None", "questions": { "1": "Are there individuals?", "10": null, "11": null, "12": null, "13": null, "14": null, "15": null, "16": null, "17": null, "18": null, "19": null, "2": "Are there two individuals?", "20": null, "21": null, "3": "Is there a bench?", "4": "Is the bench a park bench?", "5": "Is there a sky?", "6": "Are the individuals sitting closely on the bench?", "7": "Is the sky vivid and sunlit?", "8": "Is this a photograph?", "9": "Is the photograph realistic and authentic?" }, "dependencies": { "1": [ 0 ], "10": null, "11": null, "12": null, "13": null, "14": null, "15": null, "16": null, "17": null, "18": null, "19": null, "2": [ 1 ], "20": null, "21": null, "3": [ 0 ], "4": [ 3 ], "5": [ 0 ], "6": [ 1, 3 ], "7": [ 5 ], "8": [ 0 ], "9": [ 8 ] }, "reasoning_gt_answer": null } }, "pred": { "shape": [ 1, 3, 1024, 1024 ], "dtype": "torch.float32", "note": "replicate prunaai/p-image / prunaai/p-image-edit" }, "metric_result": { "name": "oneig_alignment", "result": 1.0, "higher_is_better": true, "metric_units": "alignment" }, "replicate": { "replicate_model_txt2img": "prunaai/p-image", "replicate_model_edit": "prunaai/p-image-edit", "replicate_model": "prunaai/p-image", "output_url": "https://replicate.delivery/xezq/dOl8Dc47zarcG1fmHEu77TmvY4Gky81yvwSGdnwQVpw6lyNLA/output_46542.jpeg", "edit": false, "prompt_used": "Two individuals sit closely on a charming park bench beneath a vivid, sunlit sky. 
The photograph should capture genuine people, emphasizing a realistic and authentic style." } }09__OneIGTextRendering__oneig_text_scoreOneIGTextRendering)oneig_text_score0.9925(oneig_text_score, higher_is_better=True)txt2imgA bold presentation visualizing The Impact of 5G Technology with a pastel-colored layout with modern design cues. The title reads "The Impact of 5G Technology" and is complemented by a paragraph stating "5G networks are enabling faster connectivity, supporting innovations in autonomous vehicles, remote surgeries, and smart devices.". A visual chart labeled "5G Coverage Expansion" includes categories like "Urban Areas", "Suburban", and "Rural". Decorative icons such as a 5G icon, a satellite dish…Input prompt(s) (dataset / metric batch)
Auxiliary context (non-image fields)
category:Text_Renderingsubset:Text_Renderingtext_content(long / auxiliary text excerpt):The Impact of 5G Technology 5G networks are enabling faster connectivity, supporting innovations in autonomous vehicles, remote surgeries, and smart devices. 5G Coverage Expansion Urban Areas Suburban Rural Telecom Industry Report, 2025questions: 21 keyed slots (full list in JSON)Input image: none (text-to-image from prompt only).
Model output (pred)
PNG on disk:
./media/09__OneIGTextRendering__oneig_text_score_pred.pngJSON record (inputs, pred summary, metric)
{ "benchmark_lookup_key": "OneIGTextRendering", "benchmark_name": "OneIG Text Rendering", "metric_name": "oneig_text_score", "dataset_name": "OneIGTextRendering", "vlm_type": "litellm", "model_name": "openai/gpt-4o", "device": "cpu", "inputs": { "prompts": [ "A bold presentation visualizing The Impact of 5G Technology with a pastel-colored layout with modern design cues. The title reads \"The Impact of 5G Technology\" and is complemented by a paragraph stating \"5G networks are enabling faster connectivity, supporting innovations in autonomous vehicles, remote surgeries, and smart devices.\". A visual chart labeled \"5G Coverage Expansion\" includes categories like \"Urban Areas\", \"Suburban\", and \"Rural\". Decorative icons such as a 5G icon, a satellite dish\u2026" ], "auxiliary_0": { "subset": "Text_Rendering", "text_content": "The Impact of 5G Technology 5G networks are enabling faster connectivity, supporting innovations in autonomous vehicles, remote surgeries, and smart devices. 
5G Coverage Expansion Urban Areas Suburban Rural Telecom Industry Report, 2025", "category": "Text_Rendering", "class": "PPT generation", "questions": { "1": null, "10": null, "11": null, "12": null, "13": null, "14": null, "15": null, "16": null, "17": null, "18": null, "19": null, "2": null, "20": null, "21": null, "3": null, "4": null, "5": null, "6": null, "7": null, "8": null, "9": null }, "dependencies": { "1": null, "10": null, "11": null, "12": null, "13": null, "14": null, "15": null, "16": null, "17": null, "18": null, "19": null, "2": null, "20": null, "21": null, "3": null, "4": null, "5": null, "6": null, "7": null, "8": null, "9": null }, "reasoning_gt_answer": null } }, "pred": { "shape": [ 1, 3, 1024, 1024 ], "dtype": "torch.float32", "note": "replicate prunaai/p-image / prunaai/p-image-edit" }, "metric_result": { "name": "oneig_text_score", "result": 0.9925, "higher_is_better": true, "metric_units": null }, "replicate": { "replicate_model_txt2img": "prunaai/p-image", "replicate_model_edit": "prunaai/p-image-edit", "replicate_model": "prunaai/p-image", "output_url": "https://replicate.delivery/xezq/Syk9RbsXixLBONJ17qlF7JvpRAZhOydzeSOzMB5ZN1kLmyNLA/output_174964.jpeg", "edit": false, "prompt_used": "A bold presentation visualizing The Impact of 5G Technology with a pastel-colored layout with modern design cues. The title reads \"The Impact of 5G Technology\" and is complemented by a paragraph stating \"5G networks are enabling faster connectivity, supporting innovations in autonomous vehicles, remote surgeries, and smart devices.\". A visual chart labeled \"5G Coverage Expansion\" includes categories like \"Urban Areas\", \"Suburban\", and \"Rural\". Decorative icons such as a 5G icon, a satellite dish, and a smartphone add context. The slide concludes with a footer note: \"Telecom Industry Report, 2025\"." } }data__DPGDPG)txt2imgInput prompt(s) (first dataset sample)
Auxiliary context (non-image fields)
category:entityModel output (pred)
PNG on disk:
./media/data__DPG_pred.pngdata__HPSHPS)txt2imgInput prompt(s) (first dataset sample)
Auxiliary context (non-image fields)
category:animeModel output (pred)
PNG on disk:
./media/data__HPS_pred.pngdata__PartiPromptsPartiPrompts)txt2imgInput prompt(s) (first dataset sample)
Model output (pred)
PNG on disk:
./media/data__PartiPrompts_pred.png