Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,8 @@ def toc_detector_single_page(content, model=None):

response = llm_completion(model=model, prompt=prompt)
# print('response', response)
json_content = extract_json(response)
return json_content['toc_detected']
json_content = extract_json(response)
return json_content.get('toc_detected', 'no')


def check_if_toc_extraction_is_complete(content, toc, model=None):
Expand All @@ -137,7 +137,7 @@ def check_if_toc_extraction_is_complete(content, toc, model=None):
prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc
response = llm_completion(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['completed']
return json_content.get('completed', 'no')


def check_if_toc_transformation_is_complete(content, toc, model=None):
Expand All @@ -155,7 +155,7 @@ def check_if_toc_transformation_is_complete(content, toc, model=None):
prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc
response = llm_completion(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['completed']
return json_content.get('completed', 'no')

def extract_toc_content(content, model=None):
prompt = f"""
Expand Down Expand Up @@ -217,7 +217,7 @@ def detect_page_index(toc_content, model=None):

response = llm_completion(model=model, prompt=prompt)
json_content = extract_json(response)
return json_content['page_index_given_in_toc']
return json_content.get('page_index_given_in_toc', 'no')

def toc_extractor(page_list, toc_page_list, model):
def transform_dots_to_colon(text):
Expand Down Expand Up @@ -324,8 +324,8 @@ def toc_transformer(toc_content, model=None):
new_complete, finish_reason = llm_completion(model=model, prompt=prompt, return_finish_reason=True)

if new_complete.startswith('```json'):
new_complete = get_json_content(new_complete)
last_complete = last_complete+new_complete
new_complete = get_json_content(new_complete)
last_complete = last_complete + new_complete

if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)

Expand Down Expand Up @@ -683,8 +683,9 @@ def process_none_page_numbers(toc_items, page_list, start_index=1, model=None):

item_copy = copy.deepcopy(item)
del item_copy['page']
result = add_page_number_to_toc(page_contents, item_copy, model)
if isinstance(result[0]['physical_index'], str) and result[0]['physical_index'].startswith('<physical_index'):
page_contents_text = ''.join(page_contents)
result = add_page_number_to_toc(page_contents_text, [item_copy], model)
if result and isinstance(result[0].get('physical_index'), str) and result[0]['physical_index'].startswith('<physical_index'):
item['physical_index'] = int(result[0]['physical_index'].split('_')[-1].rstrip('>').strip())
del item['page']

Expand Down
2 changes: 1 addition & 1 deletion pageindex/page_index_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
try:
from .utils import *
except:
except ImportError:
from utils import *

async def get_node_summary(node, summary_token_threshold=200, model=None):
Expand Down
28 changes: 12 additions & 16 deletions pageindex/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ def llm_completion(model, prompt, chat_history=None, return_finish_reason=False)
print('************* Retrying *************')
logging.error(f"Error: {e}")
if i < max_retries - 1:
time.sleep(1)
wait = min(2 ** i, 60) # exponential backoff, capped at 60s
time.sleep(wait)
else:
logging.error('Max retries reached for prompt: ' + prompt)
if return_finish_reason:
Expand All @@ -76,7 +77,8 @@ async def llm_acompletion(model, prompt):
print('************* Retrying *************')
logging.error(f"Error: {e}")
if i < max_retries - 1:
await asyncio.sleep(1)
wait = min(2 ** i, 60) # exponential backoff, capped at 60s
await asyncio.sleep(wait)
else:
logging.error('Max retries reached for prompt: ' + prompt)
return ""
Expand Down Expand Up @@ -172,7 +174,7 @@ def structure_to_list(structure):

def get_leaf_nodes(structure):
if isinstance(structure, dict):
if not structure['nodes']:
if not structure.get('nodes'):
structure_node = copy.deepcopy(structure)
structure_node.pop('nodes', None)
return [structure_node]
Expand Down Expand Up @@ -284,23 +286,17 @@ class JsonLogger:
def __init__(self, file_path):
# Extract PDF name for logger name
pdf_name = get_pdf_name(file_path)

current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
self.filename = f"{pdf_name}_{current_time}.json"
# Use .jsonl extension to reflect the newline-delimited format
self.filename = f"{pdf_name}_{current_time}.jsonl"
os.makedirs("./logs", exist_ok=True)
# Initialize empty list to store all messages
self.log_data = []

def log(self, level, message, **kwargs):
if isinstance(message, dict):
self.log_data.append(message)
else:
self.log_data.append({'message': message})
# Add new message to the log data

# Write entire log data to file
with open(self._filepath(), "w") as f:
json.dump(self.log_data, f, indent=2)
entry = message if isinstance(message, dict) else {'message': message}
# Append a single JSON line — O(1) regardless of how many entries exist
with open(self._filepath(), "a", encoding="utf-8") as f:
f.write(json.dumps(entry, ensure_ascii=False) + "\n")

def info(self, message, **kwargs):
self.log("INFO", message, **kwargs)
Expand Down
2 changes: 1 addition & 1 deletion run_pageindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@
}

# Load config with defaults from config.yaml
opt = config_loader.load(user_opt)
opt = config_loader.load({k: v for k, v in user_opt.items() if v is not None})

toc_with_page_number = asyncio.run(md_to_tree(
md_path=args.md_path,
Expand Down