pymupdf · JorjMcKie · Dec 18, 2025 · Dec 18, 2025 · Dec 18, 2025
diff --git a/src/table.py b/src/table.py
@@ -89,7 +89,7 @@
 
 EDGES = []  # vector graphics from PyMuPDF
 CHARS = []  # text characters from PyMuPDF
-TEXTPAGE = None
+TEXTPAGE = None  # textpage for cell text extraction
 TEXT_BOLD = mupdf.FZ_STEXT_BOLD
 TEXT_STRIKEOUT = mupdf.FZ_STEXT_STRIKEOUT
 FLAGS = (
@@ -110,6 +110,29 @@
 white_spaces = set(string.whitespace)  # for checking white space only cells
 
 
+def rect_in_rect(inner, outer):
+    """Check whether rectangle 'inner' is fully inside rectangle 'outer'."""
+    return (
+        1  # truthy placeholder so every bound check below starts with "and"
+        and inner[0] >= outer[0]  # x0: inner does not start before outer
+        and inner[1] >= outer[1]  # y0: inner does not start before outer
+        and inner[2] <= outer[2]  # x1: inner does not extend beyond outer
+        and inner[3] <= outer[3]  # y1: bounds are inclusive, touching edges pass
+    )
+
+
+def chars_in_rect(CHARS, rect):  # NOTE: parameter shadows the module-level CHARS
+    """Check whether any of the chars in CHARS are fully inside rectangle 'rect'."""
+    return any(
+        1  # truthy placeholder so every bound check below starts with "and"
+        and rect[0] <= c["x0"]  # char box starts at or after rect's x0
+        and c["x1"] <= rect[2]  # char box ends at or before rect's x1
+        and rect[1] <= c["y0"]  # char box starts at or after rect's y0
+        and rect[3] >= c["y1"]  # char box ends at or before rect's y1
+        for c in CHARS  # each c is a dict with "x0", "y0", "x1", "y1" keys
+    )
+
+
 def _iou(r1, r2):
     """Compute intersection over union of two rectangles."""
     ix = max(0, min(r1[2], r2[2]) - max(r1[0], r2[0]))
@@ -126,7 +149,7 @@ def intersects_words_h(bbox, y, word_rects) -> bool:
     """Check whether any of the words in bbox are cut through by
     horizontal line y.
     """
-    return any(r.y0 < y < r.y1 for r in word_rects if r in bbox)
+    return any(r.y0 < y < r.y1 for r in word_rects if rect_in_rect(r, bbox))
 
 
 def get_table_dict_from_rect(textpage, rect):
@@ -182,7 +205,9 @@ def make_table_from_bbox(textpage, word_rects, rect):
     for i in range(len(nypos) - 1):
         row_box = pymupdf.Rect(bbox.x0, nypos[i], bbox.x1, nypos[i + 1])
         # Sub-select words in this row and sort them by left coordinate
-        row_words = sorted([r for r in word_rects if r in row_box], key=lambda r: r.x0)
+        row_words = sorted(
+            [r for r in word_rects if rect_in_rect(r, row_box)], key=lambda r: r.x0
+        )
         # Sub-select x values that do not cut through words
         this_xpos = [x for x in nxpos if not any(r.x0 < x < r.x1 for r in row_words)]
         for j in range(len(this_xpos) - 1):
@@ -1496,6 +1521,7 @@ def __init__(self, bbox, cells, names, above):
 class Table:
     def __init__(self, page, cells):
         self.page = page
+        self.textpage = None
         self.cells = cells
         self.header = self._get_header()  # PyMuPDF extension
 
@@ -1588,7 +1614,7 @@ def to_markdown(self, clean=False, fill_empty=True):
             for j, cell in enumerate(row):
                 if cell is not None:
                     cells[i][j] = extract_cells(
-                        TEXTPAGE, cell_boxes[i][j], markdown=True
+                        self.textpage, cell_boxes[i][j], markdown=True
                     )
 
         if fill_empty:  # fill "None" cells where possible
@@ -1721,12 +1747,11 @@ def row_has_bold(bbox):
 
             Returns True if any spans are bold else False.
             """
-            blocks = page.get_text("dict", flags=pymupdf.TEXTFLAGS_TEXT, clip=bbox)[
-                "blocks"
-            ]
-            spans = [s for b in blocks for l in b["lines"] for s in l["spans"]]
-
-            return any(s["flags"] & pymupdf.TEXT_FONT_BOLD for s in spans)
+            return any(
+                c["bold"]
+                for c in CHARS
+                if rect_in_rect((c["x0"], c["y0"], c["x1"], c["y1"]), bbox)
+            )
 
         try:
             row = self.rows[0]
@@ -2008,6 +2033,7 @@ class TableFinder:
 
     def __init__(self, page, settings=None):
         self.page = weakref.proxy(page)
+        self.textpage = None
         self.settings = TableSettings.resolve(settings)
         self.edges = self.get_edges()
         self.intersections = edges_to_intersections(
@@ -2152,7 +2178,6 @@ def __getitem__(self, i):
 # -----------------------------------------------------------------------------
 def make_chars(page, clip=None):
     """Extract text as "rawdict" to fill CHARS."""
-    global TEXTPAGE
     page_number = page.number + 1
     page_height = page.rect.height
     ctm = page.transformation_matrix
@@ -2171,6 +2196,9 @@ def make_chars(page, clip=None):
             for span in sorted(line["spans"], key=lambda s: s["bbox"][0]):
                 fontname = span["font"]
                 fontsize = span["size"]
+                span_bold = bool(
+                    span["flags"] & pymupdf.TEXT_FONT_BOLD or span["char_flags"] & 8
+                )
                 color = pymupdf.sRGB_to_pdf(span["color"])
                 for char in sorted(span["chars"], key=lambda c: c["bbox"][0]):
                     bbox = pymupdf.Rect(char["bbox"])
@@ -2194,6 +2222,7 @@ def make_chars(page, clip=None):
                         "size": fontsize if upright else bbox.y1 - bbox.y0,
                         "stroking_color": color,
                         "stroking_pattern": None,
+                        "bold": span_bold,
                         "text": text,
                         "top": bbox.y0,
                         "upright": upright,
@@ -2204,6 +2233,7 @@ def make_chars(page, clip=None):
                         "y1": bbox_ctm.y1,
                     }
                     CHARS.append(char_dict)
+    return TEXTPAGE
 
 
 # ------------------------------------------------------------------------
@@ -2303,7 +2333,7 @@ def clean_graphics(npaths=None):
                         repeat = True  # keep checking the rest
 
             # move rect 0 over to result list if there is some text in it
-            if not white_spaces.issuperset(page.get_textbox(prect0, textpage=TEXTPAGE)):
+            if chars_in_rect(CHARS, prect0):
                 # contains text, so accept it as a table bbox candidate
                 new_rects.append(prect0)
             del prects[0]  # remove from rect list
@@ -2586,9 +2616,9 @@ def find_tables(
     paths=None,  # accept vector graphics as parameter
 ):
     pymupdf._warn_layout_once()
-    global CHARS, EDGES
-    CHARS = []
-    EDGES = []
+    CHARS.clear()
+    EDGES.clear()
+    TEXTPAGE = None
     old_small = bool(pymupdf.TOOLS.set_small_glyph_heights())  # save old value
     pymupdf.TOOLS.set_small_glyph_heights(True)  # we need minimum bboxes
     if page.rotation != 0:
@@ -2656,7 +2686,7 @@ def find_tables(
         tset = TableSettings.resolve(settings=settings)
         page.table_settings = tset
 
-        make_chars(page, clip=clip)  # create character list of page
+        TEXTPAGE = make_chars(page, clip=clip)  # create character list of page
         make_edges(
             page,
             clip=clip,
@@ -2667,7 +2697,7 @@ def find_tables(
         )  # create lines and curves
 
         tbf = TableFinder(page, settings=tset)
-
+        tbf.textpage = TEXTPAGE  # store textpage for later use
         if boxes:
             # only keep Finder tables that match a layout box
             tbf.tables = [
@@ -2693,5 +2723,6 @@ def find_tables(
         if old_xref is not None:
             page = page_rotation_reset(page, old_xref, old_rot, old_mediabox)
         pymupdf.TOOLS.unset_quad_corrections(old_quad_corrections)
-
+    for table in tbf.tables:
+        table.textpage = TEXTPAGE
     return tbf
diff --git a/tests/test_tables.py b/tests/test_tables.py
@@ -184,12 +184,7 @@ def test_2979():
     ), f"{pymupdf.TOOLS.set_small_glyph_heights()=}"
 
     wt = pymupdf.TOOLS.mupdf_warnings()
-    if pymupdf.mupdf_version_tuple >= (1, 26, 8):
-        assert (
-            wt
-            == "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times...\nActualtext with no position. Text may be lost or mispositioned.\n... repeated 96 times..."
-        )
-    elif pymupdf.mupdf_version_tuple >= (1, 26, 0):
+    if pymupdf.mupdf_version_tuple >= (1, 26, 0):
         assert (
             wt
             == "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times..."