Merge pull request #291 from pymupdf/v0.0.26

JorjMcKie · web-flow · commit 7bd58fa164ea · 2025-07-02T19:02:12.000-04:00
Version 0.0.26
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,5 +1,30 @@
 # Change Log
 
+## Changes in version 0.0.26
+
+### Fixes:
+
+* [282](https://github.com/pymupdf/RAG/issues/282) - Content Duplication with the latest version
+* [281](https://github.com/pymupdf/RAG/issues/281) - Latest version of pymupdf4llm.to_markdown returns empty text for some PDFs.
+* [280](https://github.com/pymupdf/RAG/issues/280) - Cannot extract text when ignore_images=False, can extract otherwise.
+* [278](https://github.com/pymupdf/RAG/issues/278) - Title words are fragmented
+* [249](https://github.com/pymupdf/RAG/issues/249) - Title duplication problem in markdown format
+* [202](https://github.com/pymupdf/RAG/issues/202) - BAD RECT ISSUE
+
+### Other Changes:
+
+* The table module in package PyMuPDF has been modified: Its method `to_markdown()` will now output markdown-styled cell text. Previously, table cells were extracted as plain text only.
+
+* The class `TocHeaders` is now a top-level import and can be used directly.
+
+* Method `to_markdown` has a new parameter `detect_bg_color` (default `True`). When enabled, the page's background color is guessed and, if detection is successful, vectors having this fill color are ignored. Setting it to `False` skips the detection, so "fill" vectors are always considered in vector graphics detection.
+
+* Text written with a `Type 3` font will now always be considered. Previously, this text was always treated as invisible and was hence suppressed.
+
+* The package now contains the license file GNU Affero GPL 3.0 to ease distribution (see LICENSE). It also clarifies that PyMuPDF4LLM is dual licensed under GNU AGPL 3.0 and individual commercial licenses.
+
+* There is a new file `versions_file.py` which contains version information. This is used to ensure the presence of a minimum PyMuPDF version at import time.
+
 ## Changes in version 0.0.25
 
 ### Fixes:
diff --git a/pdf4llm/LICENSE b/pdf4llm/LICENSE
diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py
@@ -13,19 +13,20 @@
     "Programming Language :: Python :: 3",
     "Topic :: Utilities",
 ]
-requires = ["pymupdf4llm==0.0.25"]
+requires = ["pymupdf4llm==0.0.26"]
 
 setuptools.setup(
     name="pdf4llm",
-    version="0.0.25",
+    version="0.0.26",
     author="Artifex",
     author_email="support@artifex.com",
     description="PyMuPDF Utilities for LLM/RAG",
     packages=setuptools.find_packages(),
     long_description=readme,
     long_description_content_type="text/markdown",
     install_requires=requires,
-    license="GNU AFFERO GPL 3.0",
+    python_requires=">=3.9",
+    license="Dual Licensed - GNU AFFERO GPL 3.0 or Artifex Commercial License",
     url="https://github.com/pymupdf/RAG",
     classifiers=classifiers,
     package_data={
diff --git a/pymupdf4llm/LICENSE b/pymupdf4llm/LICENSE
diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py
@@ -1,7 +1,12 @@
+import pymupdf
 from .helpers.pymupdf_rag import IdentifyHeaders, TocHeaders, to_markdown
+from .versions_file import MINIMUM_PYMUPDF_VERSION, VERSION
 
-__version__ = "0.0.25"
-version = __version__
+if tuple(map(int, pymupdf.__version__.split("."))) < MINIMUM_PYMUPDF_VERSION:
+    raise ImportError(f"Requires PyMuPDF v. {MINIMUM_PYMUPDF_VERSION}, but you have {pymupdf.__version__}")
+
+__version__ = VERSION
+version = VERSION
 version_tuple = tuple(map(int, version.split(".")))
 
 
diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
@@ -18,6 +18,7 @@
 import pymupdf
 
 WHITE = set(string.whitespace)
+TYPE3_FONT_NAME = "Unnamed-T3"
 
 
 def is_white(text):
@@ -118,7 +119,11 @@ def sanitize_spans(line):
                 if is_white(s["text"]):  # ignore white text
                     continue
                 # Ignore invisible text. Type 3 font text is never invisible.
-                if s["font"] != "Unnamed-T3" and s["alpha"] == 0 and ignore_invisible:
+                if (
+                    s["font"] != TYPE3_FONT_NAME
+                    and s["alpha"] == 0
+                    and ignore_invisible
+                ):
                     continue
                 if abs(sbbox & clip) < abs(sbbox) * 0.8:  # if not in clip
                     continue
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -268,6 +268,10 @@ def is_significant(box, paths):
     nbox = box + (d, d, -d, -d)  # nbox covers 90% of box interior
     # paths contained in, but not equal to box:
     my_paths = [p for p in paths if p["rect"] in box and p["rect"] != box]
+    widths = set(round(p["rect"].width) for p in my_paths) | {round(box.width)}
+    heights = set(round(p["rect"].height) for p in my_paths) | {round(box.height)}
+    if len(widths) == 1 or len(heights) == 1:
+        return False  # all paths are horizontal or vertical lines / rectangles
     for p in my_paths:
         rect = p["rect"]
         if (
@@ -305,6 +309,7 @@ def to_markdown(
     embed_images=False,
     ignore_images=False,
     ignore_graphics=False,
+    detect_bg_color=True,
     image_path="",
     image_format="png",
     image_size_limit=0.05,
@@ -375,6 +380,7 @@ def to_markdown(
     FONTSIZE_LIMIT = fontsize_limit
     IGNORE_IMAGES = ignore_images
     IGNORE_GRAPHICS = ignore_graphics
+    DETECT_BG_COLOR = detect_bg_color
     if doc.is_form_pdf or doc.has_annots():
         doc.bake()
 
@@ -588,9 +594,14 @@ def write_text(
                         parms.written_images.append(i)
 
             parms.line_rects.append(lrect)
-
+            # if line rect is far away from the previous one, add a line break
+            if (
+                len(parms.line_rects) > 1
+                and lrect.y1 - parms.line_rects[-2].y1 > lrect.height * 1.5
+            ):
+                out_string += "\n"
             # make text string for the full line
-            text = " ".join([s["text"] for s in spans])
+            text = " ".join([s["text"] for s in spans]).strip()
 
             # full line strikeout?
             all_strikeout = all([s["char_flags"] & 1 for s in spans])
@@ -671,11 +682,6 @@ def write_text(
                 italic = s["flags"] & 2
                 strikeout = s["char_flags"] & 1
 
-                # if mono:
-                #     # this is text in some monospaced font
-                #     out_string += f"`{s['text'].strip()}` "
-                #     continue
-
                 prefix = ""
                 suffix = ""
                 if mono:
@@ -713,7 +719,7 @@ def write_text(
         if code:
             out_string += "```\n"  # switch of code mode
             code = False
-
+        out_string += "\n\n"
         return (
             out_string.replace(" \n", "\n").replace("  ", " ").replace("\n\n\n", "\n\n")
         )
@@ -948,7 +954,7 @@ def get_page_output(
         )  # accept invisible text
 
         # determine background color
-        parms.bg_color = get_bg_color(page)
+        parms.bg_color = get_bg_color(page) if DETECT_BG_COLOR else None
 
         left, top, right, bottom = margins
         parms.clip = page.rect + (left, top, -right, -bottom)
@@ -985,7 +991,9 @@ def get_page_output(
         img_info.sort(key=lambda i: abs(i["bbox"]), reverse=True)
 
         # subset of images truly inside the clip
-        sane = [i for i in img_info if parms.clip not in i["bbox"].irect]
+        if img_info:
+            img_max_size = abs(parms.clip) * 0.9
+            sane = [i for i in img_info if abs(i["bbox"] & parms.clip) < img_max_size]
         if len(sane) < len(img_info):  # found some
             img_info = sane  # use those images instead
             # output full page image
diff --git a/pymupdf4llm/pymupdf4llm/versions_file.py b/pymupdf4llm/pymupdf4llm/versions_file.py
@@ -0,0 +1,2 @@
+MINIMUM_PYMUPDF_VERSION = (1, 26, 3)
+VERSION = '0.0.26'
diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py
@@ -1,6 +1,6 @@
 import os
-
 import setuptools
+from pathlib import Path
 
 setup_py_cwd = os.path.dirname(__file__)
 with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
@@ -13,19 +13,27 @@
     "Programming Language :: Python :: 3",
     "Topic :: Utilities",
 ]
-requires = ["pymupdf>=1.26.1"]
+
+version = "0.0.26"
+requires = ["pymupdf>=1.26.3"]
+
+text = requires[0].split("=")[1]
+text = tuple(map(int, text.split(".")))
+text = f"MINIMUM_PYMUPDF_VERSION = {text}\nVERSION = '{version}'\n"
+Path("pymupdf4llm/versions_file.py").write_text(text)
 
 setuptools.setup(
     name="pymupdf4llm",
-    version="0.0.25",
+    version=version,
     author="Artifex",
     author_email="support@artifex.com",
     description="PyMuPDF Utilities for LLM/RAG",
     packages=setuptools.find_packages(),
     long_description=readme,
     long_description_content_type="text/markdown",
     install_requires=requires,
-    license="GNU AFFERO GPL 3.0",
+    python_requires=">=3.9",
+    license="Dual Licensed - GNU AFFERO GPL 3.0 or Artifex Commercial License",
     url="https://github.com/pymupdf/RAG",
     classifiers=classifiers,
     package_data={

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+MINIMUM_PYMUPDF_VERSION = (1, 26, 3)`
	`2`	`+VERSION = '0.0.26'`