Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pageindex/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .page_index import *
from .page_index_md import md_to_tree
from .page_index_txt import txt_to_tree
from .retrieve import get_document, get_document_structure, get_page_content
from .client import PageIndexClient
31 changes: 30 additions & 1 deletion pageindex/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from .page_index import page_index
from .page_index_md import md_to_tree
from .page_index_txt import txt_to_tree
from .retrieve import get_document, get_document_structure, get_page_content
from .utils import ConfigLoader, remove_fields

Expand Down Expand Up @@ -65,6 +66,7 @@ def index(self, file_path: str, mode: str = "auto") -> str:

is_pdf = ext == '.pdf'
is_md = ext in ['.md', '.markdown']
is_txt = ext == '.txt'

if mode == "pdf" or (mode == "auto" and is_pdf):
print(f"Indexing PDF: {file_path}")
Expand Down Expand Up @@ -121,6 +123,33 @@ def index(self, file_path: str, mode: str = "auto") -> str:
'line_count': result.get('line_count', 0),
'structure': result['structure'],
}

elif mode == "txt" or (mode == "auto" and is_txt):
print(f"Indexing Text: {file_path}")
coro = txt_to_tree(
txt_path=file_path,
if_add_node_summary='yes',
summary_token_threshold=200,
model=self.model,
if_add_doc_description='yes',
if_add_node_text='yes',
if_add_node_id='yes'
)
try:
asyncio.get_running_loop()
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
result = pool.submit(asyncio.run, coro).result()
except RuntimeError:
result = asyncio.run(coro)
self.documents[doc_id] = {
'id': doc_id,
'type': 'txt',
'path': file_path,
'doc_name': result.get('doc_name', ''),
'doc_description': result.get('doc_description', ''),
'line_count': result.get('line_count', 0),
'structure': result['structure'],
}
else:
raise ValueError(f"Unsupported file format for: {file_path}")

Expand All @@ -140,7 +169,7 @@ def _make_meta_entry(doc: dict) -> dict:
}
if doc.get('type') == 'pdf':
entry['page_count'] = doc.get('page_count')
elif doc.get('type') == 'md':
elif doc.get('type') in ('md', 'txt'):
entry['line_count'] = doc.get('line_count')
return entry

Expand Down
61 changes: 61 additions & 0 deletions pageindex/page_index_txt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import os
try:
from .utils import *
except:
from utils import *

from .page_index_md import generate_summaries_for_structure_md


def _read_text_file(txt_path):
    """Read a text file and return its entire contents as a string.

    The file is first decoded as UTF-8. The ``utf-8-sig`` codec is used so
    that a leading byte-order mark, if present, is dropped instead of ending
    up as a stray ``\\ufeff`` character at the start of the text; files
    without a BOM decode exactly as they would with plain ``utf-8``.

    If the bytes are not valid UTF-8, the file is re-read as Latin-1.

    Args:
        txt_path: Path of the file to read.

    Returns:
        The decoded file contents.
    """
    try:
        with open(txt_path, 'r', encoding='utf-8-sig') as f:
            return f.read()
    except UnicodeDecodeError:
        # Latin-1 maps every byte value to a character, so this retry does
        # not raise a decode error.
        with open(txt_path, 'r', encoding='latin-1') as f:
            return f.read()


async def txt_to_tree(txt_path, if_add_node_summary='no', summary_token_threshold=200, model=None, if_add_doc_description='no', if_add_node_text='yes', if_add_node_id='yes'):
    """Wrap a plain text file in a one-node tree structure.

    The whole file becomes a single root node titled after the file's base
    name (extension removed), starting at line 1 and with no child nodes.

    Args:
        txt_path: Path of the text file to load.
        if_add_node_summary: ``'yes'`` to run summary generation on the tree.
        summary_token_threshold: Forwarded to the summary generator.
        model: Forwarded to the summary and description generators.
        if_add_doc_description: ``'yes'`` to add a document description; only
            consulted when summaries are generated.
        if_add_node_text: Controls whether the ``text`` field is kept when
            the structure is formatted.
        if_add_node_id: ``'yes'`` to pass the tree to ``write_node_id``.

    Returns:
        A dict with ``doc_name``, ``line_count`` (newline characters in the
        file plus one) and ``structure``, plus ``doc_description`` when both
        summaries and a description were requested.
    """
    def field_order(with_text):
        # Build a new list on every call so each format_structure call
        # receives its own ``order`` argument.
        fields = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary']
        if with_text:
            fields.append('text')
        fields.append('nodes')
        return fields

    content = _read_text_file(txt_path)
    base_name = os.path.basename(txt_path)
    doc_name = os.path.splitext(base_name)[0]
    line_count = content.count('\n') + 1

    root = {
        'title': doc_name,
        'node_id': '0001',
        'text': content,
        'line_num': 1,
        'nodes': [],
    }
    tree_structure = [root]

    if if_add_node_id == 'yes':
        write_node_id(tree_structure)

    if if_add_node_summary != 'yes':
        # No summaries requested: format once according to the text flag.
        include_text = if_add_node_text == 'yes'
        tree_structure = format_structure(tree_structure, order=field_order(include_text))
        return {
            'doc_name': doc_name,
            'line_count': line_count,
            'structure': tree_structure,
        }

    # The text field is kept while summaries are generated and only removed
    # afterwards when the caller asked for it to be left out.
    tree_structure = format_structure(tree_structure, order=field_order(True))
    tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model)

    if if_add_node_text == 'no':
        tree_structure = format_structure(tree_structure, order=field_order(False))

    result = {'doc_name': doc_name}
    if if_add_doc_description == 'yes':
        clean_structure = create_clean_structure_for_description(tree_structure)
        result['doc_description'] = generate_doc_description(clean_structure, model=model)
    result['line_count'] = line_count
    result['structure'] = tree_structure
    return result
55 changes: 51 additions & 4 deletions run_pageindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
import json
from pageindex import *
from pageindex.page_index_md import md_to_tree
from pageindex.page_index_txt import txt_to_tree
from pageindex.utils import ConfigLoader

if __name__ == "__main__":
# Set up argument parser
parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure')
parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
parser.add_argument('--md_path', type=str, help='Path to the Markdown file')
parser.add_argument('--txt_path', type=str, help='Path to the plain text file')

parser.add_argument('--model', type=str, default=None, help='Model to use (overrides config.yaml)')

Expand Down Expand Up @@ -39,10 +41,11 @@
args = parser.parse_args()

# Validate that exactly one file type is specified
if not args.pdf_path and not args.md_path:
raise ValueError("Either --pdf_path or --md_path must be specified")
if args.pdf_path and args.md_path:
raise ValueError("Only one of --pdf_path or --md_path can be specified")
specified = [p for p in (args.pdf_path, args.md_path, args.txt_path) if p]
if not specified:
raise ValueError("Either --pdf_path, --md_path, or --txt_path must be specified")
if len(specified) > 1:
raise ValueError("Only one of --pdf_path, --md_path, or --txt_path can be specified")

if args.pdf_path:
# Validate PDF file
Expand Down Expand Up @@ -131,4 +134,48 @@
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)

print(f'Tree structure saved to: {output_file}')

elif args.txt_path:
if not args.txt_path.lower().endswith('.txt'):
raise ValueError("Text file must have .txt extension")
if not os.path.isfile(args.txt_path):
raise ValueError(f"Text file not found: {args.txt_path}")

print('Processing text file...')

import asyncio
from pageindex.utils import ConfigLoader
config_loader = ConfigLoader()

user_opt = {
'model': args.model,
'if_add_node_summary': args.if_add_node_summary,
'if_add_doc_description': args.if_add_doc_description,
'if_add_node_text': args.if_add_node_text,
'if_add_node_id': args.if_add_node_id
}

opt = config_loader.load(user_opt)

toc_with_page_number = asyncio.run(txt_to_tree(
txt_path=args.txt_path,
if_add_node_summary=opt.if_add_node_summary,
summary_token_threshold=args.summary_token_threshold,
model=opt.model,
if_add_doc_description=opt.if_add_doc_description,
if_add_node_text=opt.if_add_node_text,
if_add_node_id=opt.if_add_node_id
))

print('Parsing done, saving to file...')

txt_name = os.path.splitext(os.path.basename(args.txt_path))[0]
output_dir = './results'
output_file = f'{output_dir}/{txt_name}_structure.json'
os.makedirs(output_dir, exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as f:
json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)

print(f'Tree structure saved to: {output_file}')
68 changes: 68 additions & 0 deletions tests/test_page_index_txt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import asyncio
import os
import tempfile

import pytest

from pageindex.page_index_txt import txt_to_tree


def _run(coro):
    """Run ``coro`` to completion with ``asyncio.run`` and return its result."""
    outcome = asyncio.run(coro)
    return outcome


def _write_tmp(content, suffix=".txt", encoding="utf-8"):
    """Write ``content`` to a new temporary file and return the file's path.

    The file is created with ``tempfile.mkstemp`` using the given ``suffix``
    and written in text mode with the given ``encoding``. It is not deleted
    here; the caller is responsible for removing it.
    """
    handle, tmp_path = tempfile.mkstemp(suffix=suffix)
    # Wrap the descriptor returned by mkstemp; leaving the with-block closes it.
    with os.fdopen(handle, "w", encoding=encoding) as out:
        out.write(content)
    return tmp_path


def test_txt_to_tree_parses_plain_text_into_single_node():
    """A plain text file yields a one-element structure holding its text."""
    tmp_file = _write_tmp("Hello world.\nThis is a plain text document.\n")
    try:
        tree = _run(
            txt_to_tree(
                txt_path=tmp_file,
                if_add_node_summary="no",
                if_add_doc_description="no",
            )
        )
    finally:
        # Remove the temporary file whether or not parsing succeeded.
        os.unlink(tmp_file)

    expected_name = os.path.splitext(os.path.basename(tmp_file))[0]
    assert tree["doc_name"] == expected_name

    structure = tree["structure"]
    assert isinstance(structure, list)
    assert len(structure) == 1

    node_text = structure[0]["text"]
    assert node_text.startswith("Hello world.")
    assert "This is a plain text document." in node_text


def test_txt_to_tree_preserves_utf8_content():
    """Non-ASCII UTF-8 characters appear unchanged in the node text."""
    tmp_file = _write_tmp("héllo wörld — 你好\n", encoding="utf-8")
    try:
        tree = _run(
            txt_to_tree(
                txt_path=tmp_file,
                if_add_node_summary="no",
                if_add_doc_description="no",
            )
        )
    finally:
        # Remove the temporary file whether or not parsing succeeded.
        os.unlink(tmp_file)

    assert "héllo wörld" in tree["structure"][0]["text"]
    assert "你好" in tree["structure"][0]["text"]


def test_txt_to_tree_includes_line_count():
    """Three newline-terminated lines are reported as a line count of 4."""
    tmp_file = _write_tmp("line1\nline2\nline3\n")
    try:
        tree = _run(
            txt_to_tree(
                txt_path=tmp_file,
                if_add_node_summary="no",
                if_add_doc_description="no",
            )
        )
    finally:
        # Remove the temporary file whether or not parsing succeeded.
        os.unlink(tmp_file)

    reported = tree["line_count"]
    assert reported == 4


def test_txt_to_tree_exposed_from_package():
    """The package-level ``txt_to_tree`` is the same object as the module's."""
    from pageindex import txt_to_tree as package_level
    same_object = package_level is txt_to_tree
    assert same_object


def test_client_dispatches_txt_extension():
    """The client module's source mentions ``txt_to_tree`` and ``.txt``."""
    import inspect
    from pageindex import client
    client_source = inspect.getsource(client)
    # Textual check only: this inspects the source, it does not run the client.
    for needle in ("txt_to_tree", ".txt"):
        assert needle in client_source