Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions data/label_update/origin_data/origin_one_two_label.json

Large diffs are not rendered by default.

44 changes: 44 additions & 0 deletions data/label_update/origin_data/part_data.json

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions examples/corpus_index_label.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Corpus Embedding and Indexing Demo

# MCP Server
servers:
retriever: servers/retriever

# MCP Client Pipeline
pipeline:
- retriever.retriever_init
- retriever.retriever_embed
- retriever.retriever_index_label
10 changes: 10 additions & 0 deletions examples/corpus_label_data_index.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Corpus Label Data Index

# MCP Servers
servers:
corpus: servers/corpus

# MCP Client Pipeline
pipeline:
- corpus.chunk_documents

28 changes: 28 additions & 0 deletions examples/corpus_label_new_data_update.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Corpus Label Build And Update Demo (merges new data with existing labeled data)

# MCP Servers
servers:
corpus: servers/corpus
prompt: servers/prompt
generation: servers/generation

# MCP Client Pipeline
pipeline:
- corpus.load_onelabel_data
- prompt.onelabel_classify
- generation.generation_init
- generation.generate
- corpus.load_twolabel_data
- prompt.twolabel_classify
- generation.generate
- corpus.twolabel_result_pro
- prompt.twolabel_new_classify
- generation.generate
- corpus.twolabel_new_result_pro
- prompt.twolabel_merge
- generation.generate
- corpus.twolabel_merge_result_pro
- prompt.twolabel_desc
- generation.generate
- corpus.twolabel_desc_result_pro
- corpus.merge_new_old_data_pro
27 changes: 27 additions & 0 deletions examples/corpus_label_update.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Corpus Label Build And Update Demo

# MCP Servers
servers:
corpus: servers/corpus
prompt: servers/prompt
generation: servers/generation

# MCP Client Pipeline
pipeline:
- corpus.load_onelabel_data
- prompt.onelabel_classify
- generation.generation_init
- generation.generate
- corpus.load_twolabel_data
- prompt.twolabel_classify
- generation.generate
- corpus.twolabel_result_pro
- prompt.twolabel_new_classify
- generation.generate
- corpus.twolabel_new_result_pro
- prompt.twolabel_merge
- generation.generate
- corpus.twolabel_merge_result_pro
- prompt.twolabel_desc
- generation.generate
- corpus.twolabel_desc_result_pro
59 changes: 59 additions & 0 deletions examples/webnote_label.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# WebNote Demo

# MCP Servers
servers:
benchmark: servers/benchmark
corpus: servers/corpus
generation: servers/generation
retriever: servers/retriever
prompt: servers/prompt
router: servers/router
custom: servers/custom
evaluation: servers/evaluation

# MCP Client Pipeline
pipeline:
- benchmark.get_data
- corpus.load_one_label
- retriever.retriever_init
- generation.generation_init
- prompt.webnote_gen_plan
- generation.generate:
output:
ans_ls: plan_ls
- prompt.webnote_init_page
- generation.generate:
output:
ans_ls: page_ls
- loop:
times: 5
steps:
- branch:
router:
- router.webnote_check_page
branches:
incomplete:
- prompt.webnote_gen_subq
- generation.generate:
output:
ans_ls: subq_ls
- prompt.query_label_classify
- generation.generate:
input:
prompt_ls: subq_ls
output:
ans_ls: onelabel_list
- retriever.retriever_search_by_label:
input:
query_list: subq_ls
output:
ret_psg: psg_ls
- prompt.webnote_fill_page
- generation.generate:
output:
ans_ls: page_ls
complete: []
- prompt.webnote_gen_answer
- generation.generate
- custom.output_extract_from_boxed
- evaluation.evaluate
1 change: 1 addition & 0 deletions prompt/label_classify.jinja
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Please help me categorize this article:"{{content}}" into one of these {{label_len}} tags:"{{label_str}}" separated by ";", and only output this single tag result. Thank you.
1 change: 1 addition & 0 deletions prompt/label_desc.jinja
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Please generate a description for the secondary tag:"{{prompt_twolabel}}" based on the primary tag:"{{prompt_onelabel}}" and its description:"{{onelabel_desc}}". Only output the description result. Thank you.
1 change: 1 addition & 0 deletions prompt/label_merge.jinja
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Please help me merge these {{tag_len}} tags:"{{tag_str}}" separated by ";" into no more than 10 non-overlapping tags. Only output the original tags and their corresponding new tags in a one-to-one JSON format, and don't output redundant information. Thank you.
1 change: 1 addition & 0 deletions prompt/label_new_classify.jinja
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Please help assign a new tag to this article:"{{content}}" based on the primary category:"{{one_label}}" and its description:"{{one_label_desc}}". Please output only the single new tag and don't output redundant information. Thank you.
11 changes: 11 additions & 0 deletions prompt/label_one_two_classify.jinja
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
You are a Text Label Classification Expert.
Please perform label classification on the text based on the following {{label_num}} labels and their descriptions.

Labels and their descriptions:
{{label_description}}

Input:
Text: {{text}}

Output:
Please output only one label, and do not output anything else. Thank you.
11 changes: 9 additions & 2 deletions servers/corpus/parameter.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,21 @@ parse_file_path: data/UltraRAG.pdf
text_corpus_save_path: corpora/text.jsonl
image_corpus_save_path: corpora/image.jsonl

# label update
input_file_path: data/label_update/origin_data/part_data.json
input_one_two_label_path: data/label_update/origin_data/origin_one_two_label.json
input_old_data_file_path: data/label_update/update_data/update_onelabel_data_old.json
output_onelabel_file_path: data/label_update/update_data/update_onelabel_data.json
output_generate_twolabel_file_path: data/label_update/update_data/update_generate_twolabel_data.json

# mineru
mineru_dir: corpora/
mineru_extra_params:
source: modelscope

# chunking parameters
raw_chunk_path: corpora/text.jsonl
chunk_path: corpora/chunks.jsonl
raw_chunk_path: data/label_update/update_data/update_onelabel_data_old.json
chunk_path: data/label_update/update_data/update_onelabel_data_old_chunks.json
use_title: false
chunk_backend: token # choices=["token", "sentence", "recursive"]
chunk_backend_configs:
Expand Down
Loading