Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 33 additions & 8 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,18 +103,33 @@ async def check_title_appearance_in_start_concurrent(structure, page_list, model

def toc_detector_single_page(content, model=None):
prompt = f"""
Your job is to detect if there is a table of content provided in the given text.
Your job is to detect whether the given text is a table of contents.

A table of contents is a directory: a list of references that point to content
located ELSEWHERE in the document (typically section titles paired with page
or section numbers). The entries are pointers; the actual content they refer
to is on other pages.

Pages that contain the document's substantive content with numbered sections —
such as policies, regulations, rules, statutes, contracts, ordinances, or
articles — are NOT tables of contents, even when their visual structure
(numbered headings, indented sub-items) resembles one. If each numbered item
is followed on this same page by its own substantive body text, the page is
content, not a TOC.

When the input is a single self-contained page that reads as the document
itself rather than a directory pointing elsewhere, answer "no".

Given text: {content}

return the following JSON format:
{{
"thinking": <why do you think there is a table of content in the given text>
"thinking": <why do you think there is a table of contents in the given text>
"toc_detected": "<yes or no>",
}}

Directly return the final JSON structure. Do not output anything else.
Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents."""
Please note: abstract, summary, notation list, figure list, table list, etc. are not tables of contents."""

response = llm_completion(model=model, prompt=prompt)
# print('response', response)
Expand Down Expand Up @@ -698,6 +713,16 @@ def check_toc(page_list, opt=None):
if len(toc_page_list) == 0:
print('no toc found')
return {'toc_content': None, 'toc_page_list': [], 'page_index_given_in_toc': 'no'}
# A real table of contents points to content located beyond it. If
# find_toc_pages classified every available page (typically: a single-page
# document where toc_detector_single_page misfired on numbered policy /
# rule / statute content), then start_page_index = toc_page_list[-1] + 1
# would be >= len(page_list) and process_toc_with_page_numbers would build
# main_content="" and silently drop the entire document. Fall back to the
# no-toc path so the page itself is processed as content.
if toc_page_list[-1] + 1 >= len(page_list):
print('toc covers the entire document (likely misclassification); falling back to no-toc')
return {'toc_content': None, 'toc_page_list': [], 'page_index_given_in_toc': 'no'}
else:
print('toc found')
toc_json = toc_extractor(page_list, toc_page_list, opt.model)
Expand All @@ -707,17 +732,17 @@ def check_toc(page_list, opt=None):
return {'toc_content': toc_json['toc_content'], 'toc_page_list': toc_page_list, 'page_index_given_in_toc': 'yes'}
else:
current_start_index = toc_page_list[-1] + 1
while (toc_json['page_index_given_in_toc'] == 'no' and
current_start_index < len(page_list) and

while (toc_json['page_index_given_in_toc'] == 'no' and
current_start_index < len(page_list) and
current_start_index < opt.toc_check_page_num):

additional_toc_pages = find_toc_pages(
start_page_index=current_start_index,
page_list=page_list,
opt=opt
)

if len(additional_toc_pages) == 0:
break

Expand Down