Skip to content
34 changes: 17 additions & 17 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ async def check_title_appearance(item, page_list, start_index=1, model=None):
}}
Directly return the final JSON structure. Do not output anything else."""

response = await ChatGPT_API_async(model=model, prompt=prompt)
response = await OpenAI_API_async(model=model, prompt=prompt)
response = extract_json(response)
if 'answer' in response:
answer = response['answer']
Expand Down Expand Up @@ -64,7 +64,7 @@ async def check_title_appearance_in_start(title, page_text, model=None, logger=N
}}
Directly return the final JSON structure. Do not output anything else."""

response = await ChatGPT_API_async(model=model, prompt=prompt)
response = await OpenAI_API_async(model=model, prompt=prompt)
response = extract_json(response)
if logger:
logger.info(f"Response: {response}")
Expand Down Expand Up @@ -116,7 +116,7 @@ def toc_detector_single_page(content, model=None):
Directly return the final JSON structure. Do not output anything else.
Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents."""

response = ChatGPT_API(model=model, prompt=prompt)
response = OpenAI_API(model=model, prompt=prompt)
# print('response', response)
json_content = extract_json(response)
return json_content['toc_detected']
Expand All @@ -135,7 +135,7 @@ def check_if_toc_extraction_is_complete(content, toc, model=None):
Directly return the final JSON structure. Do not output anything else."""

prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc
response = ChatGPT_API(model=model, prompt=prompt)
response = OpenAI_API(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['completed']

Expand All @@ -153,7 +153,7 @@ def check_if_toc_transformation_is_complete(content, toc, model=None):
Directly return the final JSON structure. Do not output anything else."""

prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc
response = ChatGPT_API(model=model, prompt=prompt)
response = OpenAI_API(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['completed']

Expand All @@ -165,7 +165,7 @@ def extract_toc_content(content, model=None):
Directly return the full table of contents content. Do not output anything else."""

response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
response, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt)

if_complete = check_if_toc_transformation_is_complete(content, response, model)
if if_complete == "yes" and finish_reason == "finished":
Expand All @@ -176,7 +176,7 @@ def extract_toc_content(content, model=None):
{"role": "assistant", "content": response},
]
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
new_response, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
response = response + new_response
if_complete = check_if_toc_transformation_is_complete(content, response, model)

Expand All @@ -186,7 +186,7 @@ def extract_toc_content(content, model=None):
{"role": "assistant", "content": response},
]
prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
new_response, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
response = response + new_response
if_complete = check_if_toc_transformation_is_complete(content, response, model)

Expand All @@ -212,7 +212,7 @@ def detect_page_index(toc_content, model=None):
}}
Directly return the final JSON structure. Do not output anything else."""

response = ChatGPT_API(model=model, prompt=prompt)
response = OpenAI_API(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['page_index_given_in_toc']

Expand Down Expand Up @@ -261,7 +261,7 @@ def toc_index_extractor(toc, content, model=None):
Directly return the final JSON structure. Do not output anything else."""

prompt = toc_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content
response = ChatGPT_API(model=model, prompt=prompt)
response = OpenAI_API(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content

Expand Down Expand Up @@ -289,7 +289,7 @@ def toc_transformer(toc_content, model=None):
Directly return the final JSON structure, do not output anything else. """

prompt = init_prompt + '\n Given table of contents\n:' + toc_content
last_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
last_complete, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt)
if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)
if if_complete == "yes" and finish_reason == "finished":
last_complete = extract_json(last_complete)
Expand All @@ -313,7 +313,7 @@ def toc_transformer(toc_content, model=None):
Please continue the json structure, directly output the remaining part of the json structure."""

new_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
new_complete, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt)

if new_complete.startswith('```json'):
new_complete = get_json_content(new_complete)
Expand Down Expand Up @@ -474,7 +474,7 @@ def add_page_number_to_toc(part, structure, model=None):
Directly return the final JSON structure. Do not output anything else."""

prompt = fill_prompt_seq + f"\n\nCurrent Partial Document:\n{part}\n\nGiven Structure\n{json.dumps(structure, indent=2)}\n"
current_json_raw = ChatGPT_API(model=model, prompt=prompt)
current_json_raw = OpenAI_API(model=model, prompt=prompt)
json_result = extract_json(current_json_raw)

for item in json_result:
Expand Down Expand Up @@ -524,7 +524,7 @@ def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"):
Directly return the additional part of the final JSON structure. Do not output anything else."""

prompt = prompt + '\nGiven text\n:' + part + '\nPrevious tree structure\n:' + json.dumps(toc_content, indent=2)
response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
response, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt)
if finish_reason == 'finished':
return extract_json(response)
else:
Expand Down Expand Up @@ -558,7 +558,7 @@ def generate_toc_init(part, model=None):
Directly return the final JSON structure. Do not output anything else."""

prompt = prompt + '\nGiven text\n:' + part
response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
response, finish_reason = OpenAI_API_with_finish_reason(model=model, prompt=prompt)

if finish_reason == 'finished':
return extract_json(response)
Expand Down Expand Up @@ -743,7 +743,7 @@ def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20
Directly return the final JSON structure. Do not output anything else."""

prompt = toc_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content
response = ChatGPT_API(model=model, prompt=prompt)
response = OpenAI_API(model=model, prompt=prompt)
json_content = extract_json(response)
return convert_physical_index_to_int(json_content['physical_index'])

Expand Down Expand Up @@ -1141,4 +1141,4 @@ def validate_and_truncate_physical_indices(toc_with_page_number, page_list_lengt
if truncated_items:
print(f"Truncated {len(truncated_items)} TOC items that exceeded document length")

return toc_with_page_number
return toc_with_page_number
6 changes: 4 additions & 2 deletions pageindex/page_index_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,13 +300,15 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
if __name__ == "__main__":
import os
import json
from dotenv import load_dotenv
load_dotenv()

# MD_NAME = 'Detect-Order-Construct'
MD_NAME = 'cognitive-load'
MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'tests/markdowns/', f'{MD_NAME}.md')


MODEL="gpt-4.1"
MODEL = os.getenv('OPENAI_MODEL', 'gpt-4.1')
IF_THINNING=False
THINNING_THRESHOLD=5000
SUMMARY_TOKEN_THRESHOLD=200
Expand Down Expand Up @@ -336,4 +338,4 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(tree_structure, f, indent=2, ensure_ascii=False)

print(f"\nTree structure saved to: {output_path}")
print(f"\nTree structure saved to: {output_path}")
22 changes: 12 additions & 10 deletions pageindex/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
from pathlib import Path
from types import SimpleNamespace as config

CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
# API key read from the OPENAI_API_KEY environment variable when this module
# is imported (None if unset). It is bound as the default `api_key` of the
# OpenAI_API* helpers, so changing the environment later has no effect on
# that default.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Model name from OPENAI_MODEL, falling back to "gpt-4o-2024-11-20".
# NOTE(review): no use of this constant is visible in this view — confirm
# which callers are expected to read it.
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-2024-11-20")
# Endpoint passed as `base_url` to the sync and async OpenAI clients;
# defaults to the public OpenAI API when OPENAI_BASE_URL is unset.
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")

def count_tokens(text, model=None):
if not text:
Expand All @@ -26,9 +28,9 @@ def count_tokens(text, model=None):
tokens = enc.encode(text)
return len(tokens)

def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
def OpenAI_API_with_finish_reason(model, prompt, api_key=OPENAI_API_KEY, chat_history=None):
max_retries = 10
client = openai.OpenAI(api_key=api_key)
client = openai.OpenAI(api_key=api_key, base_url=OPENAI_BASE_URL)
for i in range(max_retries):
try:
if chat_history:
Expand Down Expand Up @@ -58,9 +60,9 @@ def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_



def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
def OpenAI_API(model, prompt, api_key=OPENAI_API_KEY, chat_history=None):
max_retries = 10
client = openai.OpenAI(api_key=api_key)
client = openai.OpenAI(api_key=api_key, base_url=OPENAI_BASE_URL)
for i in range(max_retries):
try:
if chat_history:
Expand All @@ -86,12 +88,12 @@ def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
return "Error"


async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY):
async def OpenAI_API_async(model, prompt, api_key=OPENAI_API_KEY):
max_retries = 10
messages = [{"role": "user", "content": prompt}]
for i in range(max_retries):
try:
async with openai.AsyncOpenAI(api_key=api_key) as client:
async with openai.AsyncOpenAI(api_key=api_key, base_url=OPENAI_BASE_URL) as client:
response = await client.chat.completions.create(
model=model,
messages=messages,
Expand Down Expand Up @@ -609,7 +611,7 @@ async def generate_node_summary(node, model=None):

Directly return the description, do not include any other text.
"""
response = await ChatGPT_API_async(model, prompt)
response = await OpenAI_API_async(model, prompt)
return response


Expand Down Expand Up @@ -654,7 +656,7 @@ def generate_doc_description(structure, model=None):

Directly return the description, do not include any other text.
"""
response = ChatGPT_API(model, prompt)
response = OpenAI_API(model, prompt)
return response


Expand Down Expand Up @@ -709,4 +711,4 @@ def load(self, user_opt=None) -> config:

self._validate_keys(user_dict)
merged = {**self._default_dict, **user_dict}
return config(**merged)
return config(**merged)
6 changes: 4 additions & 2 deletions run_pageindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@
import json
from pageindex import *
from pageindex.page_index_md import md_to_tree
from dotenv import load_dotenv
load_dotenv()

if __name__ == "__main__":
# Set up argument parser
parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure')
parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
parser.add_argument('--md_path', type=str, help='Path to the Markdown file')

parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
parser.add_argument('--model', type=str, default=os.getenv('OPENAI_MODEL', 'gpt-4o-2024-11-20'), help='Model to use')

parser.add_argument('--toc-check-pages', type=int, default=20,
help='Number of pages to check for table of contents (PDF only)')
Expand Down Expand Up @@ -130,4 +132,4 @@
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)

print(f'Tree structure saved to: {output_file}')
print(f'Tree structure saved to: {output_file}')