Skip to content
16 changes: 6 additions & 10 deletions pageindex/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@
import concurrent.futures
from pathlib import Path

import PyPDF2

from .page_index import page_index
from .page_index_md import md_to_tree
from .retrieve import get_document, get_document_structure, get_page_content
from .utils import ConfigLoader, remove_fields
from .utils import ConfigLoader, read_pdf_pages, remove_fields

META_INDEX = "_meta.json"

Expand All @@ -32,7 +30,8 @@ class PageIndexClient:

For agent-based QA, see examples/agentic_vectorless_rag_demo.py.
"""
def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, workspace: str = None):
def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None,
workspace: str = None):
if api_key:
os.environ["OPENAI_API_KEY"] = api_key
elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"):
Expand Down Expand Up @@ -74,14 +73,11 @@ def index(self, file_path: str, mode: str = "auto") -> str:
if_add_node_summary='yes',
if_add_node_text='yes',
if_add_node_id='yes',
if_add_doc_description='yes'
if_add_doc_description='yes',
)
# Extract per-page text so queries don't need the original PDF
pages = []
with open(file_path, 'rb') as f:
pdf_reader = PyPDF2.PdfReader(f)
for i, page in enumerate(pdf_reader.pages, 1):
pages.append({'page': i, 'content': page.extract_text() or ''})
page_texts = read_pdf_pages(file_path)
pages = [{'page': i, 'content': text} for i, text in enumerate(page_texts, 1)]

self.documents[doc_id] = {
'id': doc_id,
Expand Down
6 changes: 3 additions & 3 deletions pageindex/page_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1065,9 +1065,9 @@ async def tree_parser(page_list, opt, doc=None, logger=None):

def page_index_main(doc, opt=None):
logger = JsonLogger(doc)

is_valid_pdf = (
(isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or
(isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or
isinstance(doc, BytesIO)
)
if not is_valid_pdf:
Expand Down Expand Up @@ -1112,7 +1112,7 @@ async def page_index_builder():

def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None):

user_opt = {
arg: value for arg, value in locals().items()
if arg != "doc" and value is not None
Expand Down
21 changes: 9 additions & 12 deletions pageindex/retrieve.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import json
import PyPDF2

try:
from .utils import get_number_of_pages, remove_fields
from .utils import get_number_of_pages, read_pdf_pages, remove_fields
except ImportError:
from utils import get_number_of_pages, remove_fields
from utils import get_number_of_pages, read_pdf_pages, remove_fields


# ── Helpers ──────────────────────────────────────────────────────────────────
Expand Down Expand Up @@ -42,15 +41,13 @@ def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
{'page': p, 'content': page_map[p]}
for p in page_nums if p in page_map
]
path = doc_info['path']
with open(path, 'rb') as f:
pdf_reader = PyPDF2.PdfReader(f)
total = len(pdf_reader.pages)
valid_pages = [p for p in page_nums if 1 <= p <= total]
return [
{'page': p, 'content': pdf_reader.pages[p - 1].extract_text() or ''}
for p in valid_pages
]
all_pages = read_pdf_pages(doc_info['path'])
total = len(all_pages)
valid_pages = [p for p in page_nums if 1 <= p <= total]
return [
{'page': p, 'content': all_pages[p - 1]}
for p in valid_pages
]


def _get_md_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
Expand Down
82 changes: 58 additions & 24 deletions pageindex/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,30 +384,64 @@ def add_preface_if_needed(data):



def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"):
if pdf_parser == "PyPDF2":
pdf_reader = PyPDF2.PdfReader(pdf_path)
page_list = []
for page_num in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_num]
page_text = page.extract_text()
token_length = litellm.token_counter(model=model, text=page_text)
page_list.append((page_text, token_length))
return page_list
elif pdf_parser == "PyMuPDF":
if isinstance(pdf_path, BytesIO):
pdf_stream = pdf_path
doc = pymupdf.open(stream=pdf_stream, filetype="pdf")
elif isinstance(pdf_path, str) and os.path.isfile(pdf_path) and pdf_path.lower().endswith(".pdf"):
doc = pymupdf.open(pdf_path)
page_list = []
for page in doc:
page_text = page.get_text()
token_length = litellm.token_counter(model=model, text=page_text)
page_list.append((page_text, token_length))
return page_list
else:
raise ValueError(f"Unsupported PDF parser: {pdf_parser}")
SUPPORTED_PDF_PARSERS = ("PyPDF2", "pypdfium2", "PyMuPDF")

# Module-level setting. Override by mutating this attribute or setting
# PAGEINDEX_PDF_PARSER in the environment before import.
DEFAULT_PDF_PARSER = os.getenv("PAGEINDEX_PDF_PARSER") or SUPPORTED_PDF_PARSERS[0]

# Lower-cased name -> canonical name, so e.g. "pymupdf" selects "PyMuPDF".
_PDF_PARSER_ALIASES = {name.lower(): name for name in SUPPORTED_PDF_PARSERS}


def _resolve_pdf_parser(parser=None):
    """Return the canonical parser name for *parser*, or for the module default.

    Raises:
        ValueError: If the requested name is not in ``SUPPORTED_PDF_PARSERS``.
    """
    # Read the global at call time (not import time) so that runtime
    # overrides of DEFAULT_PDF_PARSER take effect.
    requested = DEFAULT_PDF_PARSER if parser is None else parser
    canonical = _PDF_PARSER_ALIASES.get(str(requested).lower())
    if canonical is None:
        setting = "DEFAULT_PDF_PARSER" if parser is None else "parser"
        raise ValueError(
            f"Unsupported {setting}={requested!r}. Choose from {SUPPORTED_PDF_PARSERS}."
        )
    return canonical


def read_pdf_pages(doc, parser=None):
    """Return the text of every page of a PDF as a list of strings.

    Args:
        doc: Path to a PDF file, or a ``BytesIO`` holding the PDF bytes.
        parser: Optional parser name from ``SUPPORTED_PDF_PARSERS`` (matched
            case-insensitively). ``None`` uses the current value of
            ``DEFAULT_PDF_PARSER``.

    Returns:
        One string per page, in document order.

    Raises:
        ValueError: If the selected parser is not supported.
        ImportError: If ``pypdfium2`` is selected but is not installed.
    """
    # Validate first so a bad parser name fails before any file is opened.
    name = _resolve_pdf_parser(parser)

    if name == "PyPDF2":
        reader = PyPDF2.PdfReader(doc)
        # Normalise a falsy extract_text() result to an empty string.
        return [(page.extract_text() or "") for page in reader.pages]

    if name == "pypdfium2":
        # Optional dependency: imported lazily so the other parsers work
        # without it.
        try:
            import pypdfium2 as pdfium
        except ImportError as e:
            raise ImportError(
                "DEFAULT_PDF_PARSER='pypdfium2' requires the optional dependency. "
                "Install it with: pip install pypdfium2"
            ) from e
        source = doc.getvalue() if isinstance(doc, BytesIO) else str(doc)
        pdf = pdfium.PdfDocument(source)
        try:
            pages = []
            for i in range(len(pdf)):
                page = pdf[i]
                # Nested try/finally: the page is closed even when
                # get_textpage() itself raises.
                try:
                    textpage = page.get_textpage()
                    try:
                        text = (textpage.get_text_bounded() or "").replace("\r\n", "\n")
                    finally:
                        textpage.close()
                finally:
                    page.close()
                pages.append(text)
            return pages
        finally:
            pdf.close()

    # Only "PyMuPDF" remains after validation.
    if isinstance(doc, BytesIO):
        pdf_doc = pymupdf.open(stream=doc, filetype="pdf")
    else:
        pdf_doc = pymupdf.open(str(doc))
    try:
        return [page.get_text() for page in pdf_doc]
    finally:
        pdf_doc.close()


def get_page_tokens(pdf_path, model=None, pdf_parser=None):
    """Return ``(page_text, token_count)`` tuples for every page of a PDF.

    Args:
        pdf_path: Path to a PDF file, or a ``BytesIO`` holding the PDF bytes.
        model: Model name passed to ``litellm.token_counter``.
        pdf_parser: Optional parser name; ``None`` uses ``DEFAULT_PDF_PARSER``.
            Kept as a keyword so callers that pass ``pdf_parser=...`` continue
            to work.

    Returns:
        A list with one ``(text, token_count)`` tuple per page, in order.

    Raises:
        ValueError: If the selected parser is not supported.
    """
    pages = read_pdf_pages(pdf_path, parser=pdf_parser)
    return [(text, litellm.token_counter(model=model, text=text)) for text in pages]



Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
litellm==1.83.7
# openai-agents # optional: required for examples/agentic_vectorless_rag_demo.py
pymupdf==1.26.4
# pypdfium2 # optional: enables the pypdfium2 PDF parser, selected with --pdf-parser pypdfium2 or PAGEINDEX_PDF_PARSER=pypdfium2 (cleaner text, faster, Apache 2.0)
PyPDF2==3.0.1
python-dotenv==1.2.2
pyyaml==6.0.2
15 changes: 10 additions & 5 deletions run_pageindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import os
import json
from pageindex import *
import pageindex.utils as pageindex_utils
Comment thread
github-code-quality[bot] marked this conversation as resolved.
Fixed
from pageindex.page_index_md import md_to_tree
from pageindex.utils import ConfigLoader

if __name__ == "__main__":
# Set up argument parser
Expand All @@ -28,7 +28,9 @@
help='Whether to add doc description to the doc')
parser.add_argument('--if-add-node-text', type=str, default=None,
help='Whether to add text to the node')

parser.add_argument('--pdf-parser', type=str, default=None,
help='PDF text extractor: PyPDF2 (default), pypdfium2 (requires `pip install pypdfium2`), or PyMuPDF')

# Markdown specific arguments
parser.add_argument('--if-thinning', type=str, default='no',
help='Whether to apply tree thinning for markdown (markdown only)')
Expand Down Expand Up @@ -62,7 +64,11 @@
'if_add_doc_description': args.if_add_doc_description,
'if_add_node_text': args.if_add_node_text,
}
opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None})
opt = pageindex_utils.ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None})

# CLI flag overrides the module-level default (and env var PAGEINDEX_PDF_PARSER).
if args.pdf_parser:
pageindex_utils.DEFAULT_PDF_PARSER = args.pdf_parser

# Process the PDF
toc_with_page_number = page_index_main(args.pdf_path, opt)
Expand Down Expand Up @@ -93,8 +99,7 @@
import asyncio

# Use ConfigLoader to get consistent defaults (matching PDF behavior)
from pageindex.utils import ConfigLoader
config_loader = ConfigLoader()
config_loader = pageindex_utils.ConfigLoader()

# Create options dict with user args
user_opt = {
Expand Down
Loading