Merge pull request #285 from pymupdf/v0.0.25

JorjMcKie · web-flow · commit 8208d6e528c2 · 2025-06-13T13:07:43.000-04:00
Version 0.0.25
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,5 +1,24 @@
 # Change Log
 
+## Changes in version 0.0.25
+
+### Fixes:
+
+* [282](https://github.com/pymupdf/RAG/issues/282) - Content Duplication with the latest version
+* [281](https://github.com/pymupdf/RAG/issues/281) - Latest version of pymupdf4llm.to_markdown returns empty text for some PDFs.
+* [280](https://github.com/pymupdf/RAG/issues/280) - Cannot extract text when ignore_images=False, can extract otherwise.
+* [278](https://github.com/pymupdf/RAG/issues/278) - Title words are fragmented
+* [249](https://github.com/pymupdf/RAG/issues/249) - Title duplication problem in markdown format
+* [202](https://github.com/pymupdf/RAG/issues/202) - BAD RECT ISSUE
+
+### Other Changes:
+
+* The table module in package PyMuPDF has been modified: Its method `to_markdown()` will now output markdown-styled cell text. Previously, table cells were extracted as plain text only.
+
+* The class `TocHeaders` is now a top-level import and can now be directly used.
+
+* Text written with a `Type 3` font will now always be considered. Previously, this text was always treated as invisible and was hence suppressed.
+
 ## Changes in version 0.0.24
 
 ### Fixes:
diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py
@@ -13,11 +13,11 @@
     "Programming Language :: Python :: 3",
     "Topic :: Utilities",
 ]
-requires = ["pymupdf4llm==0.0.24"]
+requires = ["pymupdf4llm==0.0.25"]
 
 setuptools.setup(
     name="pdf4llm",
-    version="0.0.24",
+    version="0.0.25",
     author="Artifex",
     author_email="support@artifex.com",
     description="PyMuPDF Utilities for LLM/RAG",
diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py
@@ -1,6 +1,6 @@
-from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
+from .helpers.pymupdf_rag import IdentifyHeaders, TocHeaders, to_markdown
 
-__version__ = "0.0.24"
+__version__ = "0.0.25"
 version = __version__
 version_tuple = tuple(map(int, version.split(".")))
 
diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
@@ -77,6 +77,7 @@ def sanitize_spans(line):
         # sort ascending horizontally
         line.sort(key=lambda s: s["bbox"].x0)
         # join spans, delete duplicates
+        # underline differences are being ignored
         for i in range(len(line) - 1, 0, -1):  # iterate back to front
             s0 = line[i - 1]  # preceding span
             s1 = line[i]  # this span
@@ -86,9 +87,9 @@ def sanitize_spans(line):
             delta = s1["size"] * 0.1
             if s0["bbox"].x1 + delta < s1["bbox"].x0 or (
                 s0["flags"],
-                s0["char_flags"],
+                s0["char_flags"] & ~2,
                 s0["size"],
-            ) != (s1["flags"], s1["char_flags"], s1["size"]):
+            ) != (s1["flags"], s1["char_flags"] & ~2, s1["size"]):
                 continue  # no joining
             # We need to join bbox and text of two consecutive spans
             # On occasion, spans may also be duplicated.
@@ -116,8 +117,8 @@ def sanitize_spans(line):
                 sbbox = pymupdf.Rect(s["bbox"])  # span bbox as a Rect
                 if is_white(s["text"]):  # ignore white text
                     continue
-                # ignore invisible text
-                if s["alpha"] == 0 and ignore_invisible:
+                # Ignore invisible text. Type 3 font text is never invisible.
+                if s["font"] != "Unnamed-T3" and s["alpha"] == 0 and ignore_invisible:
                     continue
                 if abs(sbbox & clip) < abs(sbbox) * 0.8:  # if not in clip
                     continue
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -171,10 +171,10 @@ class TocHeaders:
     full document to identify font sizes, it uses the document's Table Of
     Contents (TOC) to identify headers on pages.
     Like IdentifyHeaders, this also is no guarantee to find headers, but it
-    is a good change for appropriately build documents. In such cases, this
-    method can be very much faster and more accurate, because we can use the
-    hierarchy level of TOC items directly to ientify the header level.
-    Examples where this approach works very well are the Adobe PDF documents.
+    represents a good chance for appropriately built documents. In such cases,
+    this method can be very much faster and more accurate, because we can
+    directly use the hierarchy level of TOC items to identify the header level.
+    Examples where this works very well are the Adobe PDF documents.
     """
 
     def __init__(self, doc: str):
@@ -195,14 +195,15 @@ def get_header_id(self, span: dict, page=None) -> str:
         Given a text span from a "dict"/"rawdict" extraction, determine the
         markdown header prefix string of 0 to n concatenated '#' characters.
         """
-        if page is None:
+        if not page:
             return ""
         # check if this page has TOC entries with an actual title
         my_toc = [t for t in self.TOC if t[1] and t[-1] == page.number + 1]
-        if not my_toc:
+        if not my_toc:  # no TOC items present on this page
             return ""
-        # check if the span matches a TOC entry
-        text = span["text"].strip()
+        # Check if the span matches a TOC entry. This must be done in the
+        # most forgiving way: exact matches are rare animals.
+        text = span["text"].strip()  # remove leading and trailing whitespace
         for t in my_toc:
             title = t[1].strip()  # title of TOC entry
             lvl = t[0]  # level of TOC entry
@@ -321,6 +322,7 @@ def to_markdown(
     extract_words=False,
     show_progress=False,
     use_glyphs=False,
+    ignore_alpha=False,
 ) -> str:
     """Process the document and return the text of the selected pages.
 
@@ -341,9 +343,10 @@ def to_markdown(
         table_strategy: choose table detection strategy
         graphics_limit: (int) if vector graphics count exceeds this, ignore all.
         ignore_code: (bool) suppress code-like formatting (mono-space fonts)
-        extract_words: (bool) include "words"-like output in page chunks
-        show_progress: (bool) print progress as each page is processed.
-        use_glyphs: (bool) replace the Invalid Unicode by glyph numbers.
+        extract_words: (bool, False) include "words"-like output in page chunks
+        show_progress: (bool, False) print progress as each page is processed.
+        use_glyphs: (bool, False) replace the Invalid Unicode by glyph numbers.
+        ignore_alpha: (bool, False) if True, ignore the alpha value, i.e. also accept text with alpha = 0 (transparent / invisible).
 
     """
     if write_images is False and embed_images is False and force_text is False:
@@ -372,6 +375,8 @@ def to_markdown(
     FONTSIZE_LIMIT = fontsize_limit
     IGNORE_IMAGES = ignore_images
     IGNORE_GRAPHICS = ignore_graphics
+    if doc.is_form_pdf or doc.has_annots():
+        doc.bake()
 
     # for reflowable documents allow making 1 page for the whole document
     if doc.is_reflowable:
@@ -394,7 +399,7 @@ def to_markdown(
         margins = (0, margins[0], 0, margins[1])
     if len(margins) != 4:
         raise ValueError("margins must be one, two or four floats")
-    elif not all([hasattr(m, "__float__") for m in margins]):
+    elif not all(hasattr(m, "__float__") for m in margins):
         raise ValueError("margin values must be floats")
 
     # If "hdr_info" is not an object with a method "get_header_id", scan the
@@ -587,44 +592,28 @@ def write_text(
             # make text string for the full line
             text = " ".join([s["text"] for s in spans])
 
-            # if line is a header, this will return multiple "#" characters,
-            # otherwise an empty string
-            hdr_string = max_header_id(spans, page=parms.page)  # a header?
-
             # full line strikeout?
             all_strikeout = all([s["char_flags"] & 1 for s in spans])
             # full line italic?
             all_italic = all([s["flags"] & 2 for s in spans])
             # full line bold?
-            all_bold = all([s["flags"] & 16 or s["char_flags"] & 8 for s in spans])
-
+            all_bold = all([(s["flags"] & 16) or (s["char_flags"] & 8) for s in spans])
             # full line mono-spaced?
-            if not IGNORE_CODE:
-                all_mono = all([s["flags"] & 8 for s in spans])
-            else:
-                all_mono = False
+            all_mono = all([s["flags"] & 8 for s in spans])
 
-            if all_mono and not hdr_string:
-                if not code:  # if not already in code output mode:
-                    out_string += "```\n"  # switch on "code" mode
-                    code = True
-                # compute approx. distance from left - assuming a width
-                # of 0.5*fontsize.
-                delta = int((lrect.x0 - clip.x0) / (spans[0]["size"] * 0.5))
-                indent = " " * delta
-
-                out_string += indent + text + "\n"
-                continue  # done with this line
+            # if line is a header, this will return multiple "#" characters,
+            # otherwise an empty string
+            hdr_string = max_header_id(spans, page=parms.page)  # a header?
 
             if hdr_string:  # if a header line skip the rest
                 if all_mono:
                     text = "`" + text + "`"
-                if all_strikeout:
-                    text = "~~" + text + "~~"
                 if all_italic:
-                    text = "*" + text + "*"
+                    text = "_" + text + "_"
                 if all_bold:
                     text = "**" + text + "**"
+                if all_strikeout:
+                    text = "~~" + text + "~~"
                 if hdr_string != prev_hdr_string:
                     out_string += hdr_string + text + "\n"
                 else:
@@ -637,6 +626,23 @@ def write_text(
 
             prev_hdr_string = hdr_string
 
+            # start or extend a code block
+            if all_mono and not IGNORE_CODE:
+                if not code:  # if not already in code output mode:
+                    out_string += "```\n"  # switch on "code" mode
+                    code = True
+                # compute approx. distance from left - assuming a width
+                # of 0.5*fontsize.
+                delta = int((lrect.x0 - clip.x0) / (spans[0]["size"] * 0.5))
+                indent = " " * delta
+
+                out_string += indent + text + "\n"
+                continue  # done with this line
+
+            if code and not all_mono:
+                out_string += "```\n"  # switch off code mode
+                code = False
+
             span0 = spans[0]
             bno = span0["block"]  # block number of line
             if bno != prev_bno:
@@ -660,30 +666,30 @@ def write_text(
 
             for i, s in enumerate(spans):  # iterate spans of the line
                 # decode font properties
-                mono = s["flags"] & 8 and IGNORE_CODE is False
+                mono = s["flags"] & 8
                 bold = s["flags"] & 16 or s["char_flags"] & 8
                 italic = s["flags"] & 2
                 strikeout = s["char_flags"] & 1
 
-                if mono:
-                    # this is text in some monospaced font
-                    out_string += f"`{s['text'].strip()}` "
-                    continue
+                # if mono:
+                #     # this is text in some monospaced font
+                #     out_string += f"`{s['text'].strip()}` "
+                #     continue
 
                 prefix = ""
                 suffix = ""
+                if mono:
+                    prefix = "`" + prefix
+                    suffix += "`"
                 if bold:
                     prefix = "**" + prefix
                     suffix += "**"
                 if italic:
-                    prefix = "*" + prefix
-                    suffix += "*"
+                    prefix = "_" + prefix
+                    suffix += "_"
                 if strikeout:
                     prefix = "~~" + prefix
                     suffix += "~~"
-                if mono:
-                    prefix = "`" + prefix
-                    suffix += "`"
 
                 # convert intersecting link to markdown syntax
                 ltext = resolve_links(parms.links, s)
@@ -831,9 +837,12 @@ def page_is_ocr(page):
 
         For this to be true, all text must be written as "ignore-text".
         """
-        text_types = set([b[0] for b in page.get_bboxlog() if "text" in b[0]])
-        if text_types == {"ignore-text"}:
-            return True
+        try:
+            text_types = set([b[0] for b in page.get_bboxlog() if "text" in b[0]])
+            if text_types == {"ignore-text"}:
+                return True
+        except:
+            pass
         return False
 
     def get_bg_color(page):
@@ -934,7 +943,9 @@ def get_page_output(
         parms.graphics = []
         parms.words = []
         parms.line_rects = []
-        parms.accept_invisible = page_is_ocr(page)  # accept invisible text
+        parms.accept_invisible = (
+            page_is_ocr(page) or ignore_alpha
+        )  # accept invisible text
 
         # determine background color
         parms.bg_color = get_bg_color(page)
@@ -958,6 +969,8 @@ def get_page_output(
             img_info = []
         for i in range(len(img_info)):
             img_info[i]["bbox"] = pymupdf.Rect(img_info[i]["bbox"])
+
+        # filter out images that are too small or outside the clip
         img_info = [
             i
             for i in img_info
@@ -967,8 +980,19 @@ def get_page_output(
             and i["bbox"].width > 3
             and i["bbox"].height > 3
         ]
+
         # sort descending by image area size
         img_info.sort(key=lambda i: abs(i["bbox"]), reverse=True)
+
+        # subset of images truly inside the clip
+        sane = [i for i in img_info if parms.clip not in i["bbox"].irect]
+        if len(sane) < len(img_info):  # found some
+            img_info = sane  # use those images instead
+            # output full page image
+            name = save_image(parms, parms.clip, "full")
+            if name:
+                parms.md_string += GRAPHICS_TEXT % name
+
         img_info = img_info[:30]  # only accept the largest up to 30 images
         # run from back to front (= small to large)
         for i in range(len(img_info) - 1, 0, -1):
@@ -1152,7 +1176,7 @@ def get_page_output(
         0
         | mupdf.FZ_STEXT_CLIP
         | mupdf.FZ_STEXT_ACCURATE_BBOXES
-        | mupdf.FZ_STEXT_IGNORE_ACTUALTEXT
+        # | mupdf.FZ_STEXT_IGNORE_ACTUALTEXT
         | 32768  # mupdf.FZ_STEXT_COLLECT_STYLES
     )
     # optionally replace 0xFFFD by glyph number
@@ -1253,7 +1277,7 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
     import time
 
     try:
-        filename = "sample_document.pdf"
+        filename = sys.argv[1]
     except IndexError:
         print(f"Usage:\npython {os.path.basename(__file__)} input.pdf")
         sys.exit()
@@ -1284,11 +1308,6 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
     md_string = to_markdown(
         doc,
         pages=pages,
-        # write_images=True,
-        force_text=True,
-        ignore_images=True,
-        ignore_graphics=True,
-        table_strategy=None,
     )
     FILENAME = doc.name
     # output to a text file with extension ".md"
diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py
@@ -13,11 +13,11 @@
     "Programming Language :: Python :: 3",
     "Topic :: Utilities",
 ]
-requires = ["pymupdf>=1.25.5"]
+requires = ["pymupdf>=1.26.1"]
 
 setuptools.setup(
     name="pymupdf4llm",
-    version="0.0.24",
+    version="0.0.25",
     author="Artifex",
     author_email="support@artifex.com",
     description="PyMuPDF Utilities for LLM/RAG",