Skip to content

Commit 7bd58fa

Browse files
authored
Merge pull request #291 from pymupdf/v0.0.26
Version 0.0.26
2 parents 8208d6e + 8005a4c commit 7bd58fa

File tree

9 files changed

+1396
-20
lines changed

9 files changed

+1396
-20
lines changed

CHANGES.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,30 @@
11
# Change Log
22

3+
## Changes in version 0.0.26
4+
5+
### Fixes:
6+
7+
* [282](https://github.com/pymupdf/RAG/issues/282) - Content Duplication with the latest version
8+
* [281](https://github.com/pymupdf/RAG/issues/281) - Latest version of pymupdf4llm.to_markdown returns empty text for some PDFs.
9+
* [280](https://github.com/pymupdf/RAG/issues/280) - Cannot extract text when ignore_images=False, can extract otherwise.
10+
* [278](https://github.com/pymupdf/RAG/issues/278) - Title words are fragmented
11+
* [249](https://github.com/pymupdf/RAG/issues/249) - Title duplication problem in markdown format
12+
* [202](https://github.com/pymupdf/RAG/issues/202) - BAD RECT ISSUE
13+
14+
### Other Changes:
15+
16+
* The table module in package PyMuPDF has been modified: Its method `to_markdown()` will now output markdown-styled cell text. Previously, table cells were extracted as plain text only.
17+
18+
* The class `TocHeaders` is now a top-level import and can be used directly.
19+
20+
* Method `to_markdown` has a new parameter `detect_bg_color=True` which controls whether the page's background color is guessed. If enabled (the default) and detection is successful, vectors having this fill color are ignored. Setting it to `False` causes "fill" vectors to always be considered in vector graphics detection.
21+
22+
* Text written with a `Type 3` font will now always be considered. Previously, this text was always treated as invisible and was hence suppressed.
23+
24+
* The package now contains the license file GNU Affero GPL 3.0 to ease distribution (see LICENSE). It also clarifies that PyMuPDF4LLM is dual licensed under GNU AGPL 3.0 and individual commercial licenses.
25+
26+
* There is a new file `versions_file.py` which contains version information. This is used to ensure the presence of a minimum PyMuPDF version at import time.
27+
328
## Changes in version 0.0.25
429

530
### Fixes:

pdf4llm/LICENSE

Lines changed: 661 additions & 0 deletions
Large diffs are not rendered by default.

pdf4llm/setup.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,20 @@
1313
"Programming Language :: Python :: 3",
1414
"Topic :: Utilities",
1515
]
16-
requires = ["pymupdf4llm==0.0.25"]
16+
requires = ["pymupdf4llm==0.0.26"]
1717

1818
setuptools.setup(
1919
name="pdf4llm",
20-
version="0.0.25",
20+
version="0.0.26",
2121
author="Artifex",
2222
author_email="[email protected]",
2323
description="PyMuPDF Utilities for LLM/RAG",
2424
packages=setuptools.find_packages(),
2525
long_description=readme,
2626
long_description_content_type="text/markdown",
2727
install_requires=requires,
28-
license="GNU AFFERO GPL 3.0",
28+
python_requires=">=3.9",
29+
license="Dual Licensed - GNU AFFERO GPL 3.0 or Artifex Commercial License",
2930
url="https://github.com/pymupdf/RAG",
3031
classifiers=classifiers,
3132
package_data={

pymupdf4llm/LICENSE

Lines changed: 661 additions & 0 deletions
Large diffs are not rendered by default.

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
1+
import pymupdf
12
from .helpers.pymupdf_rag import IdentifyHeaders, TocHeaders, to_markdown
3+
from .versions_file import MINIMUM_PYMUPDF_VERSION, VERSION
24

3-
__version__ = "0.0.25"
4-
version = __version__
5+
if tuple(map(int, pymupdf.__version__.split("."))) < MINIMUM_PYMUPDF_VERSION:
6+
raise ImportError(f"Requires PyMuPDF v. {MINIMUM_PYMUPDF_VERSION}, but you have {pymupdf.__version__}")
7+
8+
__version__ = VERSION
9+
version = VERSION
510
version_tuple = tuple(map(int, version.split(".")))
611

712

pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import pymupdf
1919

2020
WHITE = set(string.whitespace)
21+
TYPE3_FONT_NAME = "Unnamed-T3"
2122

2223

2324
def is_white(text):
@@ -118,7 +119,11 @@ def sanitize_spans(line):
118119
if is_white(s["text"]): # ignore white text
119120
continue
120121
# Ignore invisible text. Type 3 font text is never invisible.
121-
if s["font"] != "Unnamed-T3" and s["alpha"] == 0 and ignore_invisible:
122+
if (
123+
s["font"] != TYPE3_FONT_NAME
124+
and s["alpha"] == 0
125+
and ignore_invisible
126+
):
122127
continue
123128
if abs(sbbox & clip) < abs(sbbox) * 0.8: # if not in clip
124129
continue

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,10 @@ def is_significant(box, paths):
268268
nbox = box + (d, d, -d, -d) # nbox covers 90% of box interior
269269
# paths contained in, but not equal to box:
270270
my_paths = [p for p in paths if p["rect"] in box and p["rect"] != box]
271+
widths = set(round(p["rect"].width) for p in my_paths) | {round(box.width)}
272+
heights = set(round(p["rect"].height) for p in my_paths) | {round(box.height)}
273+
if len(widths) == 1 or len(heights) == 1:
274+
return False # all paths are horizontal or vertical lines / rectangles
271275
for p in my_paths:
272276
rect = p["rect"]
273277
if (
@@ -305,6 +309,7 @@ def to_markdown(
305309
embed_images=False,
306310
ignore_images=False,
307311
ignore_graphics=False,
312+
detect_bg_color=True,
308313
image_path="",
309314
image_format="png",
310315
image_size_limit=0.05,
@@ -375,6 +380,7 @@ def to_markdown(
375380
FONTSIZE_LIMIT = fontsize_limit
376381
IGNORE_IMAGES = ignore_images
377382
IGNORE_GRAPHICS = ignore_graphics
383+
DETECT_BG_COLOR = detect_bg_color
378384
if doc.is_form_pdf or doc.has_annots():
379385
doc.bake()
380386

@@ -588,9 +594,14 @@ def write_text(
588594
parms.written_images.append(i)
589595

590596
parms.line_rects.append(lrect)
591-
597+
# if line rect is far away from the previous one, add a line break
598+
if (
599+
len(parms.line_rects) > 1
600+
and lrect.y1 - parms.line_rects[-2].y1 > lrect.height * 1.5
601+
):
602+
out_string += "\n"
592603
# make text string for the full line
593-
text = " ".join([s["text"] for s in spans])
604+
text = " ".join([s["text"] for s in spans]).strip()
594605

595606
# full line strikeout?
596607
all_strikeout = all([s["char_flags"] & 1 for s in spans])
@@ -671,11 +682,6 @@ def write_text(
671682
italic = s["flags"] & 2
672683
strikeout = s["char_flags"] & 1
673684

674-
# if mono:
675-
# # this is text in some monospaced font
676-
# out_string += f"`{s['text'].strip()}` "
677-
# continue
678-
679685
prefix = ""
680686
suffix = ""
681687
if mono:
@@ -713,7 +719,7 @@ def write_text(
713719
if code:
714720
out_string += "```\n" # switch of code mode
715721
code = False
716-
722+
out_string += "\n\n"
717723
return (
718724
out_string.replace(" \n", "\n").replace(" ", " ").replace("\n\n\n", "\n\n")
719725
)
@@ -948,7 +954,7 @@ def get_page_output(
948954
) # accept invisible text
949955

950956
# determine background color
951-
parms.bg_color = get_bg_color(page)
957+
parms.bg_color = get_bg_color(page) if DETECT_BG_COLOR else None
952958

953959
left, top, right, bottom = margins
954960
parms.clip = page.rect + (left, top, -right, -bottom)
@@ -985,7 +991,9 @@ def get_page_output(
985991
img_info.sort(key=lambda i: abs(i["bbox"]), reverse=True)
986992

987993
# subset of images truly inside the clip
988-
sane = [i for i in img_info if parms.clip not in i["bbox"].irect]
994+
if img_info:
995+
img_max_size = abs(parms.clip) * 0.9
996+
sane = [i for i in img_info if abs(i["bbox"] & parms.clip) < img_max_size]
989997
if len(sane) < len(img_info): # found some
990998
img_info = sane # use those images instead
991999
# output full page image
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
MINIMUM_PYMUPDF_VERSION = (1, 26, 3)
2+
VERSION = '0.0.26'

pymupdf4llm/setup.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import os
2-
32
import setuptools
3+
from pathlib import Path
44

55
setup_py_cwd = os.path.dirname(__file__)
66
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
@@ -13,19 +13,27 @@
1313
"Programming Language :: Python :: 3",
1414
"Topic :: Utilities",
1515
]
16-
requires = ["pymupdf>=1.26.1"]
16+
17+
version = "0.0.26"
18+
requires = ["pymupdf>=1.26.3"]
19+
20+
text = requires[0].split("=")[1]
21+
text = tuple(map(int, text.split(".")))
22+
text = f"MINIMUM_PYMUPDF_VERSION = {text}\nVERSION = '{version}'\n"
23+
Path("pymupdf4llm/versions_file.py").write_text(text)
1724

1825
setuptools.setup(
1926
name="pymupdf4llm",
20-
version="0.0.25",
27+
version=version,
2128
author="Artifex",
2229
author_email="[email protected]",
2330
description="PyMuPDF Utilities for LLM/RAG",
2431
packages=setuptools.find_packages(),
2532
long_description=readme,
2633
long_description_content_type="text/markdown",
2734
install_requires=requires,
28-
license="GNU AFFERO GPL 3.0",
35+
python_requires=">=3.9",
36+
license="Dual Licensed - GNU AFFERO GPL 3.0 or Artifex Commercial License",
2937
url="https://github.com/pymupdf/RAG",
3038
classifiers=classifiers,
3139
package_data={

0 commit comments

Comments
 (0)