Skip to content

Commit 8208d6e

Browse files
authored
Merge pull request #285 from pymupdf/v0.0.25
Version 0.0.25
2 parents 1c796f4 + 8e00856 commit 8208d6e

File tree

6 files changed

+106
-67
lines changed

6 files changed

+106
-67
lines changed

CHANGES.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,24 @@
11
# Change Log
22

3+
## Changes in version 0.0.25
4+
5+
### Fixes:
6+
7+
* [282](https://github.com/pymupdf/RAG/issues/282) - Content Duplication with the latest version
8+
* [281](https://github.com/pymupdf/RAG/issues/281) - Latest version of pymupdf4llm.to_markdown returns empty text for some PDFs.
9+
* [280](https://github.com/pymupdf/RAG/issues/280) - Cannot extract text when ignore_images=False, can extract otherwise.
10+
* [278](https://github.com/pymupdf/RAG/issues/278) - Title words are fragmented
11+
* [249](https://github.com/pymupdf/RAG/issues/249) - Title duplication problem in markdown format
12+
* [202](https://github.com/pymupdf/RAG/issues/202) - BAD RECT ISSUE
13+
14+
### Other Changes:
15+
16+
* The table module in package PyMuPDF has been modified: its method `to_markdown()` will now output markdown-styled cell text. Previously, table cells were extracted as plain text only.
17+
18+
* The class `TocHeaders` is now a top-level import and can now be directly used.
19+
20+
* Text written with a `Type 3` font will now always be considered. Previously, this text was always treated as invisible and was hence suppressed.
21+
322
## Changes in version 0.0.24
423

524
### Fixes:

pdf4llm/setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@
1313
"Programming Language :: Python :: 3",
1414
"Topic :: Utilities",
1515
]
16-
requires = ["pymupdf4llm==0.0.24"]
16+
requires = ["pymupdf4llm==0.0.25"]
1717

1818
setuptools.setup(
1919
name="pdf4llm",
20-
version="0.0.24",
20+
version="0.0.25",
2121
author="Artifex",
2222
author_email="[email protected]",
2323
description="PyMuPDF Utilities for LLM/RAG",

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
1+
from .helpers.pymupdf_rag import IdentifyHeaders, TocHeaders, to_markdown
22

3-
__version__ = "0.0.24"
3+
__version__ = "0.0.25"
44
version = __version__
55
version_tuple = tuple(map(int, version.split(".")))
66

pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ def sanitize_spans(line):
7777
# sort ascending horizontally
7878
line.sort(key=lambda s: s["bbox"].x0)
7979
# join spans, delete duplicates
80+
# underline differences are being ignored
8081
for i in range(len(line) - 1, 0, -1): # iterate back to front
8182
s0 = line[i - 1] # preceding span
8283
s1 = line[i] # this span
@@ -86,9 +87,9 @@ def sanitize_spans(line):
8687
delta = s1["size"] * 0.1
8788
if s0["bbox"].x1 + delta < s1["bbox"].x0 or (
8889
s0["flags"],
89-
s0["char_flags"],
90+
s0["char_flags"] & ~2,
9091
s0["size"],
91-
) != (s1["flags"], s1["char_flags"], s1["size"]):
92+
) != (s1["flags"], s1["char_flags"] & ~2, s1["size"]):
9293
continue # no joining
9394
# We need to join bbox and text of two consecutive spans
9495
# On occasion, spans may also be duplicated.
@@ -116,8 +117,8 @@ def sanitize_spans(line):
116117
sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect
117118
if is_white(s["text"]): # ignore white text
118119
continue
119-
# ignore invisible text
120-
if s["alpha"] == 0 and ignore_invisible:
120+
# Ignore invisible text. Type 3 font text is never invisible.
121+
if s["font"] != "Unnamed-T3" and s["alpha"] == 0 and ignore_invisible:
121122
continue
122123
if abs(sbbox & clip) < abs(sbbox) * 0.8: # if not in clip
123124
continue

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 76 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -171,10 +171,10 @@ class TocHeaders:
171171
full document to identify font sizes, it uses the document's Table Of
172172
Contents (TOC) to identify headers on pages.
173173
Like IdentifyHeaders, this also is no guarantee to find headers, but it
174-
is a good change for appropriately build documents. In such cases, this
175-
method can be very much faster and more accurate, because we can use the
176-
hierarchy level of TOC items directly to ientify the header level.
177-
Examples where this approach works very well are the Adobe PDF documents.
174+
represents a good chance for appropriately built documents. In such cases,
175+
this method can be very much faster and more accurate, because we can
176+
directly use the hierarchy level of TOC items to identify the header level.
177+
Examples where this works very well are the Adobe PDF documents.
178178
"""
179179

180180
def __init__(self, doc: str):
@@ -195,14 +195,15 @@ def get_header_id(self, span: dict, page=None) -> str:
195195
Given a text span from a "dict"/"rawdict" extraction, determine the
196196
markdown header prefix string of 0 to n concatenated '#' characters.
197197
"""
198-
if page is None:
198+
if not page:
199199
return ""
200200
# check if this page has TOC entries with an actual title
201201
my_toc = [t for t in self.TOC if t[1] and t[-1] == page.number + 1]
202-
if not my_toc:
202+
if not my_toc: # no TOC items present on this page
203203
return ""
204-
# check if the span matches a TOC entry
205-
text = span["text"].strip()
204+
# Check if the span matches a TOC entry. This must be done in the
205+
# most forgiving way: exact matches are rare animals.
206+
text = span["text"].strip() # remove leading and trailing whitespace
206207
for t in my_toc:
207208
title = t[1].strip() # title of TOC entry
208209
lvl = t[0] # level of TOC entry
@@ -321,6 +322,7 @@ def to_markdown(
321322
extract_words=False,
322323
show_progress=False,
323324
use_glyphs=False,
325+
ignore_alpha=False,
324326
) -> str:
325327
"""Process the document and return the text of the selected pages.
326328
@@ -341,9 +343,10 @@ def to_markdown(
341343
table_strategy: choose table detection strategy
342344
graphics_limit: (int) if vector graphics count exceeds this, ignore all.
343345
ignore_code: (bool) suppress code-like formatting (mono-space fonts)
344-
extract_words: (bool) include "words"-like output in page chunks
345-
show_progress: (bool) print progress as each page is processed.
346-
use_glyphs: (bool) replace the Invalid Unicode by glyph numbers.
346+
extract_words: (bool, False) include "words"-like output in page chunks
347+
show_progress: (bool, False) print progress as each page is processed.
348+
use_glyphs: (bool, False) replace the Invalid Unicode by glyph numbers.
349+
ignore_alpha: (bool, False) if True, also accept text with alpha = 0 (transparent) instead of suppressing it.
347350
348351
"""
349352
if write_images is False and embed_images is False and force_text is False:
@@ -372,6 +375,8 @@ def to_markdown(
372375
FONTSIZE_LIMIT = fontsize_limit
373376
IGNORE_IMAGES = ignore_images
374377
IGNORE_GRAPHICS = ignore_graphics
378+
if doc.is_form_pdf or doc.has_annots():
379+
doc.bake()
375380

376381
# for reflowable documents allow making 1 page for the whole document
377382
if doc.is_reflowable:
@@ -394,7 +399,7 @@ def to_markdown(
394399
margins = (0, margins[0], 0, margins[1])
395400
if len(margins) != 4:
396401
raise ValueError("margins must be one, two or four floats")
397-
elif not all([hasattr(m, "__float__") for m in margins]):
402+
elif not all(hasattr(m, "__float__") for m in margins):
398403
raise ValueError("margin values must be floats")
399404

400405
# If "hdr_info" is not an object with a method "get_header_id", scan the
@@ -587,44 +592,28 @@ def write_text(
587592
# make text string for the full line
588593
text = " ".join([s["text"] for s in spans])
589594

590-
# if line is a header, this will return multiple "#" characters,
591-
# otherwise an empty string
592-
hdr_string = max_header_id(spans, page=parms.page) # a header?
593-
594595
# full line strikeout?
595596
all_strikeout = all([s["char_flags"] & 1 for s in spans])
596597
# full line italic?
597598
all_italic = all([s["flags"] & 2 for s in spans])
598599
# full line bold?
599-
all_bold = all([s["flags"] & 16 or s["char_flags"] & 8 for s in spans])
600-
600+
all_bold = all([(s["flags"] & 16) or (s["char_flags"] & 8) for s in spans])
601601
# full line mono-spaced?
602-
if not IGNORE_CODE:
603-
all_mono = all([s["flags"] & 8 for s in spans])
604-
else:
605-
all_mono = False
602+
all_mono = all([s["flags"] & 8 for s in spans])
606603

607-
if all_mono and not hdr_string:
608-
if not code: # if not already in code output mode:
609-
out_string += "```\n" # switch on "code" mode
610-
code = True
611-
# compute approx. distance from left - assuming a width
612-
# of 0.5*fontsize.
613-
delta = int((lrect.x0 - clip.x0) / (spans[0]["size"] * 0.5))
614-
indent = " " * delta
615-
616-
out_string += indent + text + "\n"
617-
continue # done with this line
604+
# if line is a header, this will return multiple "#" characters,
605+
# otherwise an empty string
606+
hdr_string = max_header_id(spans, page=parms.page) # a header?
618607

619608
if hdr_string: # if a header line skip the rest
620609
if all_mono:
621610
text = "`" + text + "`"
622-
if all_strikeout:
623-
text = "~~" + text + "~~"
624611
if all_italic:
625-
text = "*" + text + "*"
612+
text = "_" + text + "_"
626613
if all_bold:
627614
text = "**" + text + "**"
615+
if all_strikeout:
616+
text = "~~" + text + "~~"
628617
if hdr_string != prev_hdr_string:
629618
out_string += hdr_string + text + "\n"
630619
else:
@@ -637,6 +626,23 @@ def write_text(
637626

638627
prev_hdr_string = hdr_string
639628

629+
# start or extend a code block
630+
if all_mono and not IGNORE_CODE:
631+
if not code: # if not already in code output mode:
632+
out_string += "```\n" # switch on "code" mode
633+
code = True
634+
# compute approx. distance from left - assuming a width
635+
# of 0.5*fontsize.
636+
delta = int((lrect.x0 - clip.x0) / (spans[0]["size"] * 0.5))
637+
indent = " " * delta
638+
639+
out_string += indent + text + "\n"
640+
continue # done with this line
641+
642+
if code and not all_mono:
643+
out_string += "```\n" # switch off code mode
644+
code = False
645+
640646
span0 = spans[0]
641647
bno = span0["block"] # block number of line
642648
if bno != prev_bno:
@@ -660,30 +666,30 @@ def write_text(
660666

661667
for i, s in enumerate(spans): # iterate spans of the line
662668
# decode font properties
663-
mono = s["flags"] & 8 and IGNORE_CODE is False
669+
mono = s["flags"] & 8
664670
bold = s["flags"] & 16 or s["char_flags"] & 8
665671
italic = s["flags"] & 2
666672
strikeout = s["char_flags"] & 1
667673

668-
if mono:
669-
# this is text in some monospaced font
670-
out_string += f"`{s['text'].strip()}` "
671-
continue
674+
# if mono:
675+
# # this is text in some monospaced font
676+
# out_string += f"`{s['text'].strip()}` "
677+
# continue
672678

673679
prefix = ""
674680
suffix = ""
681+
if mono:
682+
prefix = "`" + prefix
683+
suffix += "`"
675684
if bold:
676685
prefix = "**" + prefix
677686
suffix += "**"
678687
if italic:
679-
prefix = "*" + prefix
680-
suffix += "*"
688+
prefix = "_" + prefix
689+
suffix += "_"
681690
if strikeout:
682691
prefix = "~~" + prefix
683692
suffix += "~~"
684-
if mono:
685-
prefix = "`" + prefix
686-
suffix += "`"
687693

688694
# convert intersecting link to markdown syntax
689695
ltext = resolve_links(parms.links, s)
@@ -831,9 +837,12 @@ def page_is_ocr(page):
831837
832838
For this to be true, all text must be written as "ignore-text".
833839
"""
834-
text_types = set([b[0] for b in page.get_bboxlog() if "text" in b[0]])
835-
if text_types == {"ignore-text"}:
836-
return True
840+
try:
841+
text_types = set([b[0] for b in page.get_bboxlog() if "text" in b[0]])
842+
if text_types == {"ignore-text"}:
843+
return True
844+
except:
845+
pass
837846
return False
838847

839848
def get_bg_color(page):
@@ -934,7 +943,9 @@ def get_page_output(
934943
parms.graphics = []
935944
parms.words = []
936945
parms.line_rects = []
937-
parms.accept_invisible = page_is_ocr(page) # accept invisible text
946+
parms.accept_invisible = (
947+
page_is_ocr(page) or ignore_alpha
948+
) # accept invisible text
938949

939950
# determine background color
940951
parms.bg_color = get_bg_color(page)
@@ -958,6 +969,8 @@ def get_page_output(
958969
img_info = []
959970
for i in range(len(img_info)):
960971
img_info[i]["bbox"] = pymupdf.Rect(img_info[i]["bbox"])
972+
973+
# filter out images that are too small or outside the clip
961974
img_info = [
962975
i
963976
for i in img_info
@@ -967,8 +980,19 @@ def get_page_output(
967980
and i["bbox"].width > 3
968981
and i["bbox"].height > 3
969982
]
983+
970984
# sort descending by image area size
971985
img_info.sort(key=lambda i: abs(i["bbox"]), reverse=True)
986+
987+
# subset of images that do not cover the entire clip
988+
sane = [i for i in img_info if parms.clip not in i["bbox"].irect]
989+
if len(sane) < len(img_info): # found some
990+
img_info = sane # use those images instead
991+
# output full page image
992+
name = save_image(parms, parms.clip, "full")
993+
if name:
994+
parms.md_string += GRAPHICS_TEXT % name
995+
972996
img_info = img_info[:30] # only accept the largest up to 30 images
973997
# run from back to front (= small to large)
974998
for i in range(len(img_info) - 1, 0, -1):
@@ -1152,7 +1176,7 @@ def get_page_output(
11521176
0
11531177
| mupdf.FZ_STEXT_CLIP
11541178
| mupdf.FZ_STEXT_ACCURATE_BBOXES
1155-
| mupdf.FZ_STEXT_IGNORE_ACTUALTEXT
1179+
# | mupdf.FZ_STEXT_IGNORE_ACTUALTEXT
11561180
| 32768 # mupdf.FZ_STEXT_COLLECT_STYLES
11571181
)
11581182
# optionally replace 0xFFFD by glyph number
@@ -1253,7 +1277,7 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
12531277
import time
12541278

12551279
try:
1256-
filename = "sample_document.pdf"
1280+
filename = sys.argv[1]
12571281
except IndexError:
12581282
print(f"Usage:\npython {os.path.basename(__file__)} input.pdf")
12591283
sys.exit()
@@ -1284,11 +1308,6 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
12841308
md_string = to_markdown(
12851309
doc,
12861310
pages=pages,
1287-
# write_images=True,
1288-
force_text=True,
1289-
ignore_images=True,
1290-
ignore_graphics=True,
1291-
table_strategy=None,
12921311
)
12931312
FILENAME = doc.name
12941313
# output to a text file with extension ".md"

pymupdf4llm/setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@
1313
"Programming Language :: Python :: 3",
1414
"Topic :: Utilities",
1515
]
16-
requires = ["pymupdf>=1.25.5"]
16+
requires = ["pymupdf>=1.26.1"]
1717

1818
setuptools.setup(
1919
name="pymupdf4llm",
20-
version="0.0.24",
20+
version="0.0.25",
2121
author="Artifex",
2222
author_email="[email protected]",
2323
description="PyMuPDF Utilities for LLM/RAG",

0 commit comments

Comments
 (0)