Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/gimbench/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ def _add_scierc_eval_args(parser):

def _add_cv_eval_args(parser):
parser.add_argument("--use_outlines", action="store_true", help="Whether to use outlines in CV evaluation")
parser.add_argument("--use_uie", action="store_true", help="Whether to use Traditional UIE model in CV evaluation")
parser.add_argument(
Comment on lines 203 to 206
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

--use_outlines and --use_uie both select the CV extractor, but they can be enabled together. Since conduct_eval silently prioritizes UIE when both are set, it’d be clearer to make these flags mutually exclusive (argparse mutually-exclusive group) or validate and error when both are provided.

Copilot uses AI. Check for mistakes.
"--judge_model_name",
type=str,
Expand Down
42 changes: 41 additions & 1 deletion src/gimbench/cv/evaluators.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,47 @@ def _extract_fields(self, cv_content: str) -> dict[str, str]:
raise ValueError(f"Expected dict but got {type(extraction).__name__}: {extraction}")


class UIEEvaluator(CVEvaluator):
def __init__(self, args: Namespace, dataset: Dataset):
    """Build the evaluator and load the PaddleNLP information-extraction pipeline.

    Args:
        args: Parsed CLI arguments. ``args.model_name`` selects the model;
            a falsy value falls back to ``"uie-base"``.
        dataset: Dataset forwarded unchanged to the parent evaluator.

    Raises:
        ImportError: If ``paddlenlp`` cannot be imported.
    """
    super().__init__(args, dataset)

    # Guard only the import: an ImportError raised later while constructing
    # the pipeline should surface with its own message instead of being
    # rewritten as "paddlenlp is not installed".
    try:
        from paddlenlp import Taskflow
    except ImportError as err:
        raise ImportError("Please install paddlenlp to use the UIEEvaluator. (pip install paddlenlp)") from err

    model_name = args.model_name or "uie-base"
    # Names containing "PP-UIE" are given the "paddlenlp/" prefix unless they already have it.
    if "PP-UIE" in model_name and not model_name.startswith("paddlenlp/"):
        model_name = f"paddlenlp/{model_name}"

    # PP-UIE-7B or other LLM-based UIE might require different task names,
    # but usually "information_extraction" covers UIE.
    # NOTE(review): precision is fixed to "bfloat16" for every model, including the
    # "uie-base" default — confirm that all supported checkpoints accept it.
    self.model = Taskflow("information_extraction", schema=CV_FIELDS, model=model_name, precision="bfloat16")

def _extract_fields(self, cv_content: str) -> dict[str, str]:
    """Extract every field in ``CV_FIELDS`` from a CV using the loaded model.

    Args:
        cv_content: CV text, passed to the model as-is; no chunking or
            truncation is performed here.

    Returns:
        A dict with one entry per name in ``CV_FIELDS``. Each value is the
        ``"text"`` of every match for that field joined with ``", "``, or
        ``""`` when the field is missing from the model output. If the model
        call or the parsing of its output raises, every field is ``""``.
    """
    try:
        results = self.model(cv_content)
        # Only the first element of a non-empty list result is consulted; anything
        # else is treated as "nothing extracted". That element is expected to map
        # field name -> list of match dicts carrying a "text" key.
        first_result = results[0] if isinstance(results, list) and results else {}
        extraction = {
            field: ", ".join(item["text"] for item in first_result[field]) if field in first_result else ""
            for field in CV_FIELDS
        }
    except Exception as e:
        # logger.exception keeps the traceback so extraction failures can be diagnosed from the logs.
        # NOTE(review): the failure is swallowed, so callers receive an ordinary all-empty
        # extraction rather than an error — confirm this best-effort behaviour is intended.
        logger.exception(f"PaddleNLP UIE generation failed: {e}")
        extraction = dict.fromkeys(CV_FIELDS, "")
Comment on lines +270 to +272
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This error path logs with logger.error(...), which drops the stack trace. If you keep handling exceptions here, prefer logger.exception(...) (or include exc_info=True) so UIE failures are debuggable in logs.

Copilot uses AI. Check for mistakes.
return extraction
Comment on lines +270 to +273
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

UIEEvaluator._extract_fields catches all exceptions and returns an all-empty extraction. This means failures won’t propagate to _evaluate_item, so error_msg stays empty and the run will count these as normal (incorrect) items instead of errors (and won’t be excluded by _filter_non_error_items). Consider letting the exception propagate (or re-raising after logging) so extraction failures are tracked consistently with other evaluators.

Copilot uses AI. Check for mistakes.


def conduct_eval(args: Namespace, ds: Dataset):
    """Pick the CV evaluator requested by ``args``, run it, and dump the result.

    Selection order: ``args.use_uie`` (treated as off when the attribute is
    absent) -> ``UIEEvaluator``; otherwise ``args.use_outlines`` ->
    ``OutlinesEvaluator``; otherwise ``GIMEvaluator``.

    Args:
        args: Parsed CLI arguments carrying the evaluator-selection flags.
        ds: Dataset handed to the chosen evaluator.
    """
    use_uie = getattr(args, "use_uie", False)
    if use_uie and args.use_outlines:
        # Both flags select an extractor; state which one wins instead of deciding silently.
        logger.warning("Both --use_uie and --use_outlines are set; using the UIE evaluator.")

    # Build exactly one evaluator: constructing one just to discard it would load a model for nothing.
    if use_uie:
        evaluator = UIEEvaluator(args, ds)
    elif args.use_outlines:
        evaluator = OutlinesEvaluator(args, ds)
    else:
        evaluator = GIMEvaluator(args, ds)
Comment on lines 276 to +280
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If both use_uie and use_outlines are set, UIE wins due to this conditional, but that precedence isn’t communicated to the user. Either enforce mutual exclusivity during arg parsing/validation, or emit a clear warning/error here to avoid surprising evaluator selection.

Copilot uses AI. Check for mistakes.
result = evaluator.evaluate()
result.dump()
Loading