-
Notifications
You must be signed in to change notification settings - Fork 0
For rebuttal, add UIE model support in CV evaluation and corresponding argument #100
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -236,7 +236,47 @@ def _extract_fields(self, cv_content: str) -> dict[str, str]: | |
| raise ValueError(f"Expected dict but got {type(extraction).__name__}: {extraction}") | ||
|
|
||
|
|
||
| class UIEEvaluator(CVEvaluator): | ||
| def __init__(self, args: Namespace, dataset: Dataset): | ||
| super().__init__(args, dataset) | ||
| try: | ||
| from paddlenlp import Taskflow | ||
|
|
||
| model_name = args.model_name or "uie-base" | ||
| if "PP-UIE" in model_name and not model_name.startswith("paddlenlp/"): | ||
| model_name = f"paddlenlp/{model_name}" | ||
| # PP-UIE-7B or other LLM-based UIE might require different task names, | ||
| # but usually "information_extraction" covers UIE. | ||
| self.model = Taskflow("information_extraction", schema=CV_FIELDS, model=model_name, precision="bfloat16") | ||
| except ImportError: | ||
| raise ImportError("Please install paddlenlp to use the UIEEvaluator. (pip install paddlenlp)") | ||
|
|
||
| def _extract_fields(self, cv_content: str) -> dict[str, str]: | ||
| try: | ||
| # We might want to chunk or limit cv_content length depending on the model's context window. | ||
| # PP-UIE-7B handles longer contexts but passing the whole CV might still be long. | ||
| # Taskflow can handle it or truncate internally. | ||
| results = self.model(cv_content) | ||
| extraction = {} | ||
| for field in CV_FIELDS: | ||
| # UIE returns a list of dictionaries. The first dictionary contains the field if extracted. | ||
| if results and isinstance(results, list) and len(results) > 0 and field in results[0]: | ||
| # Collect the text of every matched entity for this field; they are joined with ", " below | ||
| # Taskflow usually returns [{'text': 'xxx', ...}, ...] for each field | ||
| extracted_texts = [item["text"] for item in results[0][field]] | ||
| extraction[field] = ", ".join(extracted_texts) if extracted_texts else "" | ||
| else: | ||
| extraction[field] = "" | ||
| except Exception as e: | ||
| logger.error(f"PaddleNLP UIE generation failed: {e}") | ||
| extraction = dict.fromkeys(CV_FIELDS, "") | ||
|
Comment on lines
+270
to
+272
|
||
| return extraction | ||
|
Comment on lines
+270
to
+273
|
||
|
|
||
|
|
||
| def conduct_eval(args: Namespace, ds: Dataset): | ||
| evaluator = OutlinesEvaluator(args, ds) if args.use_outlines else GIMEvaluator(args, ds) | ||
| if hasattr(args, "use_uie") and args.use_uie: | ||
| evaluator = UIEEvaluator(args, ds) | ||
| else: | ||
| evaluator = OutlinesEvaluator(args, ds) if args.use_outlines else GIMEvaluator(args, ds) | ||
|
Comment on lines
276
to
+280
|
||
| result = evaluator.evaluate() | ||
| result.dump() | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`--use_outlines` and `--use_uie` both select the CV extractor, but they can be enabled together. Since `conduct_eval` silently prioritizes UIE when both are set, it’d be clearer to make these flags mutually exclusive (argparse mutually-exclusive group) or validate and error when both are provided.