Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 49 additions & 18 deletions src/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@

EDGES = [] # vector graphics from PyMuPDF
CHARS = [] # text characters from PyMuPDF
TEXTPAGE = None
TEXTPAGE = None # textpage for cell text extraction
TEXT_BOLD = mupdf.FZ_STEXT_BOLD
TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT
FLAGS = (
Expand All @@ -110,6 +110,29 @@
white_spaces = set(string.whitespace) # for checking white space only cells


def rect_in_rect(inner, outer):
    """Check whether rectangle 'inner' is fully inside rectangle 'outer'.

    Both arguments are read by index as (x0, y0, x1, y1), so any indexable
    rectangle-like value (tuple, list, ...) is accepted.
    Borders count as inside: a rectangle is contained in itself.

    Returns:
        True if 'inner' does not extend beyond 'outer' on any side,
        otherwise False.
    """
    # bool() guarantees a plain boolean regardless of the coordinate type
    return bool(
        inner[0] >= outer[0]
        and inner[1] >= outer[1]
        and inner[2] <= outer[2]
        and inner[3] <= outer[3]
    )


def chars_in_rect(CHARS, rect):
    """Check whether any of the chars in CHARS are inside rectangle 'rect'.

    Args:
        CHARS: iterable of character dicts; only the keys "x0", "y0",
            "x1" and "y1" are read.
        rect: rectangle-like value indexed as (x0, y0, x1, y1).

    Returns:
        True if at least one character box lies completely within 'rect'
        (borders included), otherwise False. An empty CHARS yields False.
    """
    return any(
        rect[0] <= c["x0"]
        and c["x1"] <= rect[2]
        and rect[1] <= c["y0"]
        and rect[3] >= c["y1"]
        for c in CHARS
    )


def _iou(r1, r2):
"""Compute intersection over union of two rectangles."""
ix = max(0, min(r1[2], r2[2]) - max(r1[0], r2[0]))
Expand All @@ -126,7 +149,7 @@ def intersects_words_h(bbox, y, word_rects) -> bool:
"""Check whether any of the words in bbox are cut through by
horizontal line y.
"""
return any(r.y0 < y < r.y1 for r in word_rects if r in bbox)
return any(r.y0 < y < r.y1 for r in word_rects if rect_in_rect(r, bbox))


def get_table_dict_from_rect(textpage, rect):
Expand Down Expand Up @@ -182,7 +205,9 @@ def make_table_from_bbox(textpage, word_rects, rect):
for i in range(len(nypos) - 1):
row_box = pymupdf.Rect(bbox.x0, nypos[i], bbox.x1, nypos[i + 1])
# Sub-select words in this row and sort them by left coordinate
row_words = sorted([r for r in word_rects if r in row_box], key=lambda r: r.x0)
row_words = sorted(
[r for r in word_rects if rect_in_rect(r, row_box)], key=lambda r: r.x0
)
# Sub-select x values that do not cut through words
this_xpos = [x for x in nxpos if not any(r.x0 < x < r.x1 for r in row_words)]
for j in range(len(this_xpos) - 1):
Expand Down Expand Up @@ -1496,6 +1521,7 @@ def __init__(self, bbox, cells, names, above):
class Table:
def __init__(self, page, cells):
self.page = page
self.textpage = None
self.cells = cells
self.header = self._get_header() # PyMuPDF extension

Expand Down Expand Up @@ -1588,7 +1614,7 @@ def to_markdown(self, clean=False, fill_empty=True):
for j, cell in enumerate(row):
if cell is not None:
cells[i][j] = extract_cells(
TEXTPAGE, cell_boxes[i][j], markdown=True
self.textpage, cell_boxes[i][j], markdown=True
)

if fill_empty: # fill "None" cells where possible
Expand Down Expand Up @@ -1721,12 +1747,11 @@ def row_has_bold(bbox):

Returns True if any spans are bold else False.
"""
blocks = page.get_text("dict", flags=pymupdf.TEXTFLAGS_TEXT, clip=bbox)[
"blocks"
]
spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]

return any(s["flags"] & pymupdf.TEXT_FONT_BOLD for s in spans)
return any(
c["bold"]
for c in CHARS
if rect_in_rect((c["x0"], c["y0"], c["x1"], c["y1"]), bbox)
)

try:
row = self.rows[0]
Expand Down Expand Up @@ -2008,6 +2033,7 @@ class TableFinder:

def __init__(self, page, settings=None):
self.page = weakref.proxy(page)
self.textpage = None
self.settings = TableSettings.resolve(settings)
self.edges = self.get_edges()
self.intersections = edges_to_intersections(
Expand Down Expand Up @@ -2152,7 +2178,6 @@ def __getitem__(self, i):
# -----------------------------------------------------------------------------
def make_chars(page, clip=None):
"""Extract text as "rawdict" to fill CHARS."""
global TEXTPAGE
page_number = page.number + 1
page_height = page.rect.height
ctm = page.transformation_matrix
Expand All @@ -2171,6 +2196,9 @@ def make_chars(page, clip=None):
for span in sorted(line["spans"], key=lambda s: s["bbox"][0]):
fontname = span["font"]
fontsize = span["size"]
span_bold = bool(
span["flags"] & pymupdf.TEXT_FONT_BOLD or span["char_flags"] & 8
)
color = pymupdf.sRGB_to_pdf(span["color"])
for char in sorted(span["chars"], key=lambda c: c["bbox"][0]):
bbox = pymupdf.Rect(char["bbox"])
Expand All @@ -2194,6 +2222,7 @@ def make_chars(page, clip=None):
"size": fontsize if upright else bbox.y1 - bbox.y0,
"stroking_color": color,
"stroking_pattern": None,
"bold": span_bold,
"text": text,
"top": bbox.y0,
"upright": upright,
Expand All @@ -2204,6 +2233,7 @@ def make_chars(page, clip=None):
"y1": bbox_ctm.y1,
}
CHARS.append(char_dict)
return TEXTPAGE


# ------------------------------------------------------------------------
Expand Down Expand Up @@ -2303,7 +2333,7 @@ def clean_graphics(npaths=None):
repeat = True # keep checking the rest

# move rect 0 over to result list if there is some text in it
if not white_spaces.issuperset(page.get_textbox(prect0, textpage=TEXTPAGE)):
if chars_in_rect(CHARS, prect0):
# contains text, so accept it as a table bbox candidate
new_rects.append(prect0)
del prects[0] # remove from rect list
Expand Down Expand Up @@ -2586,9 +2616,9 @@ def find_tables(
paths=None, # accept vector graphics as parameter
):
pymupdf._warn_layout_once()
global CHARS, EDGES
CHARS = []
EDGES = []
CHARS.clear()
EDGES.clear()
TEXTPAGE = None
old_small = bool(pymupdf.TOOLS.set_small_glyph_heights()) # save old value
pymupdf.TOOLS.set_small_glyph_heights(True) # we need minimum bboxes
if page.rotation != 0:
Expand Down Expand Up @@ -2656,7 +2686,7 @@ def find_tables(
tset = TableSettings.resolve(settings=settings)
page.table_settings = tset

make_chars(page, clip=clip) # create character list of page
TEXTPAGE = make_chars(page, clip=clip) # create character list of page
make_edges(
page,
clip=clip,
Expand All @@ -2667,7 +2697,7 @@ def find_tables(
) # create lines and curves

tbf = TableFinder(page, settings=tset)

tbf.textpage = TEXTPAGE # store textpage for later use
if boxes:
# only keep Finder tables that match a layout box
tbf.tables = [
Expand All @@ -2693,5 +2723,6 @@ def find_tables(
if old_xref is not None:
page = page_rotation_reset(page, old_xref, old_rot, old_mediabox)
pymupdf.TOOLS.unset_quad_corrections(old_quad_corrections)

for table in tbf.tables:
table.textpage = TEXTPAGE
return tbf
7 changes: 1 addition & 6 deletions tests/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,12 +184,7 @@ def test_2979():
), f"{pymupdf.TOOLS.set_small_glyph_heights()=}"

wt = pymupdf.TOOLS.mupdf_warnings()
if pymupdf.mupdf_version_tuple >= (1, 26, 8):
assert (
wt
== "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times...\nActualtext with no position. Text may be lost or mispositioned.\n... repeated 96 times..."
)
elif pymupdf.mupdf_version_tuple >= (1, 26, 0):
if pymupdf.mupdf_version_tuple >= (1, 26, 0):
assert (
wt
== "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times..."
Expand Down