1 change: 1 addition & 0 deletions pageindex/config.yaml
@@ -1,4 +1,5 @@
 model: "gpt-4o-2024-11-20"
+tokenizer: ""
 toc_check_page_num: 20
 max_page_num_each_node: 10
 max_token_num_each_node: 20000
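Note on the new key: an empty tokenizer string is falsy, so downstream code falls back to resolving the encoding from the model name (see the count_tokens change in pageindex/utils.py below). A one-line sketch of that selection logic:

    enc = tiktoken.get_encoding(tokenizer) if tokenizer else tiktoken.encoding_for_model(model)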
20 changes: 11 additions & 9 deletions pageindex/page_index.py
@@ -565,13 +565,13 @@ def generate_toc_init(part, model=None):
     else:
         raise Exception(f'finish reason: {finish_reason}')

-def process_no_toc(page_list, start_index=1, model=None, logger=None):
+def process_no_toc(page_list, start_index=1, model=None, logger=None, tokenizer=None):
     page_contents=[]
     token_lengths=[]
     for page_index in range(start_index, start_index+len(page_list)):
         page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n"
         page_contents.append(page_text)
-        token_lengths.append(count_tokens(page_text, model))
+        token_lengths.append(count_tokens(page_text, model, tokenizer=tokenizer))
     group_texts = page_list_to_group_text(page_contents, token_lengths)
     logger.info(f'len(group_texts): {len(group_texts)}')

@@ -586,16 +586,16 @@ def process_no_toc(page_list, start_index=1, model=None, logger=None):

     return toc_with_page_number

-def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_index=1, model=None, logger=None):
+def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_index=1, model=None, logger=None, tokenizer=None):
     page_contents=[]
     token_lengths=[]
     toc_content = toc_transformer(toc_content, model)
     logger.info(f'toc_transformer: {toc_content}')
     for page_index in range(start_index, start_index+len(page_list)):
         page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n"
         page_contents.append(page_text)
-        token_lengths.append(count_tokens(page_text, model))
+        token_lengths.append(count_tokens(page_text, model, tokenizer=tokenizer))

     group_texts = page_list_to_group_text(page_contents, token_lengths)
     logger.info(f'len(group_texts): {len(group_texts)}')

@@ -632,6 +632,8 @@ def process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_che
     logger.info(f'matching_pairs: {matching_pairs}')

     offset = calculate_page_offset(matching_pairs)
+    if offset is None:
+        offset = 0
     logger.info(f'offset: {offset}')

     toc_with_page_number = add_page_offset_to_toc_json(toc_with_page_number, offset)
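A note on the fallback: calculate_page_offset evidently returns None when it cannot derive a consistent offset from the matching pairs, and defaulting to 0 assumes the TOC's printed page numbers already coincide with physical page indices. An illustration of the assumed semantics (values made up):

    # If the TOC lists "Chapter 1 ... page 3" but the heading actually sits on
    # physical page 5, then offset = 5 - 3 = 2, i.e.
    # physical_index = toc_page_number + offset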
@@ -955,9 +957,9 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N
     if mode == 'process_toc_with_page_numbers':
         toc_with_page_number = process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=opt.toc_check_page_num, model=opt.model, logger=logger)
     elif mode == 'process_toc_no_page_numbers':
-        toc_with_page_number = process_toc_no_page_numbers(toc_content, toc_page_list, page_list, model=opt.model, logger=logger)
+        toc_with_page_number = process_toc_no_page_numbers(toc_content, toc_page_list, page_list, model=opt.model, logger=logger, tokenizer=opt.tokenizer)
     else:
-        toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger)
+        toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger, tokenizer=opt.tokenizer)

     toc_with_page_number = [item for item in toc_with_page_number if item.get('physical_index') is not None]

@@ -1066,7 +1068,7 @@ def page_index_main(doc, opt=None):
         raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.")

     print('Parsing PDF...')
-    page_list = get_page_tokens(doc)
+    page_list = get_page_tokens(doc, model=opt.model, tokenizer=opt.tokenizer)

     logger.info({'total_page_number': len(page_list)})
     logger.info({'total_token': sum([page[1] for page in page_list])})
@@ -1100,7 +1102,7 @@ async def page_index_builder():
     return asyncio.run(page_index_builder())


-def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
+def page_index(doc, model=None, tokenizer=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
                if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None):

     user_opt = {
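With tokenizer threaded through the PDF pipeline, the public entry point accepts an explicit encoding. A hedged usage sketch (the import path is assumed, not shown in this diff):

    from pageindex.page_index import page_index

    # Any tiktoken encoding name works here, e.g. for a model tiktoken cannot map:
    tree = page_index("doc.pdf", model="my-local-llm", tokenizer="o200k_base")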
26 changes: 13 additions & 13 deletions pageindex/page_index_md.py
@@ -7,18 +7,18 @@
 except:
     from utils import *

-async def get_node_summary(node, summary_token_threshold=200, model=None):
+async def get_node_summary(node, summary_token_threshold=200, model=None, tokenizer=None):
     node_text = node.get('text')
-    num_tokens = count_tokens(node_text, model=model)
+    num_tokens = count_tokens(node_text, model=model, tokenizer=tokenizer)
     if num_tokens < summary_token_threshold:
         return node_text
     else:
         return await generate_node_summary(node, model=model)


-async def generate_summaries_for_structure_md(structure, summary_token_threshold, model=None):
+async def generate_summaries_for_structure_md(structure, summary_token_threshold, model=None, tokenizer=None):
     nodes = structure_to_list(structure)
-    tasks = [get_node_summary(node, summary_token_threshold=summary_token_threshold, model=model) for node in nodes]
+    tasks = [get_node_summary(node, summary_token_threshold=summary_token_threshold, model=model, tokenizer=tokenizer) for node in nodes]
     summaries = await asyncio.gather(*tasks)

     for node, summary in zip(nodes, summaries):
@@ -86,7 +86,7 @@ def extract_node_text_content(node_list, markdown_lines):
         node['text'] = '\n'.join(markdown_lines[start_line:end_line]).strip()
     return all_nodes

-def update_node_list_with_text_token_count(node_list, model=None):
+def update_node_list_with_text_token_count(node_list, model=None, tokenizer=None):

     def find_all_children(parent_index, parent_level, node_list):
         """Find all direct and indirect children of a parent node"""
@@ -127,12 +127,12 @@ def find_all_children(parent_index, parent_level, node_list):
             total_text += '\n' + child_text

         # Calculate token count for combined text
-        result_list[i]['text_token_count'] = count_tokens(total_text, model=model)
+        result_list[i]['text_token_count'] = count_tokens(total_text, model=model, tokenizer=tokenizer)

     return result_list


-def tree_thinning_for_index(node_list, min_node_token=None, model=None):
+def tree_thinning_for_index(node_list, min_node_token=None, model=None, tokenizer=None):
     def find_all_children(parent_index, parent_level, node_list):
         children_indices = []

@@ -179,7 +179,7 @@ def find_all_children(parent_index, parent_level, node_list):

         result_list[i]['text'] = merged_text

-        result_list[i]['text_token_count'] = count_tokens(merged_text, model=model)
+        result_list[i]['text_token_count'] = count_tokens(merged_text, model=model, tokenizer=tokenizer)

     for index in sorted(nodes_to_remove, reverse=True):
         result_list.pop(index)
@@ -240,7 +240,7 @@ def clean_tree_for_output(tree_nodes):
     return cleaned_nodes


-async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes'):
+async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, tokenizer=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes'):
     with open(md_path, 'r', encoding='utf-8') as f:
         markdown_content = f.read()

@@ -251,9 +251,9 @@
     nodes_with_content = extract_node_text_content(node_list, markdown_lines)

     if if_thinning:
-        nodes_with_content = update_node_list_with_text_token_count(nodes_with_content, model=model)
+        nodes_with_content = update_node_list_with_text_token_count(nodes_with_content, model=model, tokenizer=tokenizer)
         print(f"Thinning nodes...")
-        nodes_with_content = tree_thinning_for_index(nodes_with_content, min_token_threshold, model=model)
+        nodes_with_content = tree_thinning_for_index(nodes_with_content, min_token_threshold, model=model, tokenizer=tokenizer)

     print(f"Building tree from nodes...")
     tree_structure = build_tree_from_nodes(nodes_with_content)
@@ -268,7 +268,7 @@
         tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])

         print(f"Generating summaries for each node...")
-        tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model)
+        tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model, tokenizer=tokenizer)

     if if_add_node_text == 'no':
         # Remove text after summary generation if not requested
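The markdown pipeline gets the same override. A hedged usage sketch (md_to_tree is async, so it needs an event loop; the import path is assumed):

    import asyncio
    from pageindex.page_index_md import md_to_tree

    tree = asyncio.run(md_to_tree("doc.md", model="gpt-4o-2024-11-20", tokenizer="o200k_base"))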
14 changes: 10 additions & 4 deletions pageindex/utils.py
@@ -19,10 +19,13 @@

 CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")

-def count_tokens(text, model=None):
+def count_tokens(text, model=None, tokenizer=None):
     if not text:
         return 0
-    enc = tiktoken.encoding_for_model(model)
+    if tokenizer:
+        enc = tiktoken.get_encoding(tokenizer)
+    else:
+        enc = tiktoken.encoding_for_model(model)
     tokens = enc.encode(text)
     return len(tokens)
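This is the heart of the change: tiktoken.encoding_for_model raises KeyError for model names it does not recognize, so an explicit encoding name lets token counting work for non-OpenAI or self-hosted models. A sketch of both paths (assumes tiktoken is installed):

    # Auto-detect from a known model name (gpt-4o resolves to o200k_base):
    n1 = count_tokens("hello world", model="gpt-4o-2024-11-20")

    # Explicit encoding for a model tiktoken cannot map; without the override,
    # tiktoken.encoding_for_model("my-local-llm") would raise KeyError:
    n2 = count_tokens("hello world", model="my-local-llm", tokenizer="o200k_base")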

@@ -410,8 +413,11 @@ def add_preface_if_needed(data):



-def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
-    enc = tiktoken.encoding_for_model(model)
+def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2", tokenizer=None):
+    if tokenizer:
+        enc = tiktoken.get_encoding(tokenizer)
+    else:
+        enc = tiktoken.encoding_for_model(model)
     if pdf_parser == "PyPDF2":
         pdf_reader = PyPDF2.PdfReader(pdf_path)
         page_list = []
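Judging from page_index_main above (sum([page[1] for page in page_list])), get_page_tokens returns one (page_text, token_count) pair per page. A hedged usage sketch with the new override:

    pages = get_page_tokens("doc.pdf", model="gpt-4o-2024-11-20", tokenizer="o200k_base")
    total_tokens = sum(tok for _, tok in pages)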
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,3 +4,4 @@ PyPDF2==3.0.1
 python-dotenv==1.1.0
 tiktoken==0.11.0
 pyyaml==6.0.2
+pycryptodome==3.15.0
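Presumably pycryptodome is added so PyPDF2 can open AES-encrypted PDFs, which otherwise fail with a DependencyError. A sketch of the case it unblocks (file name illustrative):

    import PyPDF2

    reader = PyPDF2.PdfReader("encrypted.pdf")
    if reader.is_encrypted:
        reader.decrypt("")  # AES handling needs pycryptodome under the hood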
4 changes: 4 additions & 0 deletions run_pageindex.py
@@ -11,6 +11,7 @@
     parser.add_argument('--md_path', type=str, help='Path to the Markdown file')

     parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
+    parser.add_argument('--tokenizer', type=str, default='', help='Tiktoken encoding name (e.g. o200k_base). Empty = auto-detect from model.')

     parser.add_argument('--toc-check-pages', type=int, default=20,
                         help='Number of pages to check for table of contents (PDF only)')
@@ -54,6 +55,7 @@
     # Configure options
     opt = config(
         model=args.model,
+        tokenizer=args.tokenizer,
         toc_check_page_num=args.toc_check_pages,
         max_page_num_each_node=args.max_pages_per_node,
         max_token_num_each_node=args.max_tokens_per_node,
@@ -98,6 +100,7 @@
     # Create options dict with user args
     user_opt = {
         'model': args.model,
+        'tokenizer': args.tokenizer,
         'if_add_node_summary': args.if_add_node_summary,
         'if_add_doc_description': args.if_add_doc_description,
         'if_add_node_text': args.if_add_node_text,
@@ -114,6 +117,7 @@
         if_add_node_summary=opt.if_add_node_summary,
         summary_token_threshold=args.summary_token_threshold,
         model=opt.model,
+        tokenizer=opt.tokenizer,
         if_add_doc_description=opt.if_add_doc_description,
         if_add_node_text=opt.if_add_node_text,
         if_add_node_id=opt.if_add_node_id
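End to end, the new flag can be exercised from the command line. A hedged example (the --pdf_path flag is assumed from the existing CLI; any tiktoken encoding name such as cl100k_base or o200k_base should work):

    python run_pageindex.py --pdf_path ./doc.pdf --model my-local-llm --tokenizer o200k_base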