Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ build = [
]
dev = [
"pytest>=7.4.2,<8.0.0",
"pypdf>=5.0.0,<6.0.0",
"black[jupyter]>=24.4.2,<26.0.0",
"python-semantic-release>=7.32.2,<8.0.0",
"pre-commit>=3.7.1,<4.0.0",
Expand Down
59 changes: 48 additions & 11 deletions src/parse/pdf_decoders/document.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ namespace pdflib
// Decode a single page and return the page decoder directly
page_decoder_ptr decode_page(int page_number,
const decode_config& config);

// New: Direct access to page decoders (typed API)
bool has_page_decoder(int page_number);
page_decoder_ptr get_page_decoder(int page_number);
Expand Down Expand Up @@ -181,27 +181,64 @@ namespace pdflib

bool pdf_decoder<DOCUMENT>::process_document_components()
{
LOG_S(INFO) << __FUNCTION__;

utils::timer timer;

if(qpdf_root.hasKey("/Pages"))
{
qpdf_pages = qpdf_root.getKey("/Pages");

int _number_of_pages = -1;
if(qpdf_pages.hasKey("/Count"))
{
number_of_pages = qpdf_pages.getKey("/Count").getIntValue();
_number_of_pages = qpdf_pages.getKey("/Count").getIntValue();
//LOG_S(WARNING) << "`/Count` (before): " << _number_of_pages;
}
else

// Be aware that `qpdf_document.getAllPages()` does some normalization, which is why `/Count` is read above, before this call
number_of_pages = 0;
for(QPDFObjectHandle page : qpdf_document.getAllPages())
{
number_of_pages += 1;
}
LOG_S(INFO) << "#-pages (from `qpdf_document.getAllPages()`): " << number_of_pages;

if(number_of_pages!=_number_of_pages and qpdf_pages.hasKey("/Count"))
{
LOG_S(WARNING) << "filename: " << filename << " has no `/Count`";
number_of_pages = 0;
for(QPDFObjectHandle page : qpdf_document.getAllPages())
{
number_of_pages += 1;
}
LOG_S(WARNING) << "`/Count` before (=" << _number_of_pages << ") != "
<< " len(`/Pages`) (=" << number_of_pages << ")";
}

/*
if(qpdf_pages.hasKey("/Count"))
{
int __number_of_pages = qpdf_pages.getKey("/Count").getIntValue();
LOG_S(WARNING) << "`/Count` (after): " << __number_of_pages;

LOG_S(INFO) << "#-pages: " << number_of_pages;
if(_number_of_pages!=__number_of_pages)
{
LOG_S(WARNING) << "`/Count` before (=" << _number_of_pages << ") != "
<< "`/Count` after (=" << __number_of_pages << ")";
}

if(number_of_pages!=_number_of_pages)
{
LOG_S(WARNING) << "`/Count` before (=" << _number_of_pages << ") != "
<< " len(`/Pages`) (=" << number_of_pages << ")";
}

if(number_of_pages!=__number_of_pages)
{
LOG_S(WARNING) << "`/Count` after (=" << __number_of_pages << ") != "
<< " len(`/Pages`) (=" << number_of_pages << ")";
}
}
else
{
LOG_S(WARNING) << "filename: " << filename << " has no `/Count`";
}
*/
}
else
{
Expand Down Expand Up @@ -434,7 +471,7 @@ namespace pdflib
if(config.do_thread_safe)
{
// creates its own QPDF document
page_decoder = std::make_shared<pdf_decoder<PAGE>>(buffer, password, page_number);
page_decoder = std::make_shared<pdf_decoder<PAGE>>(buffer, password, page_number);
}
else
{
Expand Down
2 changes: 1 addition & 1 deletion src/parse/pdf_resources/page_font/base_fonts.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ namespace pdflib
{
if(initialized)
{
LOG_S(WARNING) << "skipping base_fonts::initialise, already initialized ...";
LOG_S(INFO) << "skipping base_fonts::initialise, already initialized ...";
return;
}

Expand Down
2 changes: 1 addition & 1 deletion src/parse/pdf_resources/page_font/encodings.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ namespace pdflib
{
if(initialized)
{
LOG_S(WARNING) << "skipping font_encodings::initialise, already initialized ...";
LOG_S(INFO) << "skipping font_encodings::initialise, already initialized ...";
return;
}

Expand Down
2 changes: 1 addition & 1 deletion src/parse/pdf_resources/page_font/font_cids.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ namespace pdflib
{
if(initialized)
{
LOG_S(WARNING) << "skipping font_cids::initialise, already initialized ...";
LOG_S(INFO) << "skipping font_cids::initialise, already initialized ...";
return;
}

Expand Down
7 changes: 4 additions & 3 deletions src/parse/pdf_resources/page_font/glyphs.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,15 @@ namespace pdflib
LOG_S(ERROR) << "could not find a glyph with name=" << key;
unknown_glyphs.insert(key);

return "glyph["+key+"]";
// FIXME: we should not return a placeholder string for an unknown glyph, especially if the decode_config does not allow it!
return "GLYPH["+key+"]";
}

void font_glyphs::initialise(std::string dirname)
{
if(initialized)
{
LOG_S(WARNING) << "skipping font_glyphs::initialise, already initialized ...";
LOG_S(INFO) << "skipping font_glyphs::initialise, already initialized ...";
return;
}

Expand Down Expand Up @@ -215,7 +216,7 @@ namespace pdflib

void font_glyphs::read_file_uni(std::string filename)
{
LOG_S(WARNING) << __FUNCTION__ << ": " << filename;
LOG_S(INFO) << __FUNCTION__ << ": " << filename;

std::ifstream file(filename.c_str());

Expand Down
4 changes: 2 additions & 2 deletions src/pybind/docling_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ namespace docling
pdf_resources_dir(resource_utils::get_resources_dir(true).string()),
key2doc({})
{
LOG_S(WARNING) << "pdf_resources_dir: " << pdf_resources_dir;
LOG_S(INFO) << "pdf_resources_dir: " << pdf_resources_dir;

auto RESOURCE_DIR_KEY = pdflib::pdf_resource<pdflib::PAGE_FONT>::RESOURCE_DIR_KEY;

Expand All @@ -95,7 +95,7 @@ namespace docling
{
set_loglevel_with_label(level);

LOG_S(WARNING) << "pdf_resources_dir: " << pdf_resources_dir;
LOG_S(INFO) << "pdf_resources_dir: " << pdf_resources_dir;

auto RESOURCE_DIR_KEY = pdflib::pdf_resource<pdflib::PAGE_FONT>::RESOURCE_DIR_KEY;

Expand Down
Binary file added tests/data/cases/case_18.pdf
Binary file not shown.
Loading
Loading