@@ -171,10 +171,10 @@ class TocHeaders:
171
171
full document to identify font sizes, it uses the document's Table Of
172
172
Contents (TOC) to identify headers on pages.
173
173
Like IdentifyHeaders, this also is no guarantee to find headers, but it
174
- is a good change for appropriately build documents. In such cases, this
175
- method can be very much faster and more accurate, because we can use the
176
- hierarchy level of TOC items directly to ientify the header level.
177
- Examples where this approach works very well are the Adobe PDF documents.
174
+ represents a good chance for appropriately built documents. In such cases,
175
+ this method can be very much faster and more accurate, because we can
176
+ directly use the hierarchy level of TOC items to identify the header level.
177
+ Examples where this works very well are the Adobe PDF documents.
178
178
"""
179
179
180
180
def __init__ (self , doc : str ):
@@ -195,14 +195,15 @@ def get_header_id(self, span: dict, page=None) -> str:
195
195
Given a text span from a "dict"/"rawdict" extraction, determine the
196
196
markdown header prefix string of 0 to n concatenated '#' characters.
197
197
"""
198
- if page is None :
198
+ if not page :
199
199
return ""
200
200
# check if this page has TOC entries with an actual title
201
201
my_toc = [t for t in self .TOC if t [1 ] and t [- 1 ] == page .number + 1 ]
202
- if not my_toc :
202
+ if not my_toc : # no TOC items present on this page
203
203
return ""
204
- # check if the span matches a TOC entry
205
- text = span ["text" ].strip ()
204
+ # Check if the span matches a TOC entry. This must be done in the
205
+ # most forgiving way: exact matches are rare animals.
206
+ text = span ["text" ].strip () # remove leading and trailing whitespace
206
207
for t in my_toc :
207
208
title = t [1 ].strip () # title of TOC entry
208
209
lvl = t [0 ] # level of TOC entry
@@ -321,6 +322,7 @@ def to_markdown(
321
322
extract_words = False ,
322
323
show_progress = False ,
323
324
use_glyphs = False ,
325
+ ignore_alpha = False ,
324
326
) -> str :
325
327
"""Process the document and return the text of the selected pages.
326
328
@@ -341,9 +343,10 @@ def to_markdown(
341
343
table_strategy: choose table detection strategy
342
344
graphics_limit: (int) if vector graphics count exceeds this, ignore all.
343
345
ignore_code: (bool) suppress code-like formatting (mono-space fonts)
344
- extract_words: (bool) include "words"-like output in page chunks
345
- show_progress: (bool) print progress as each page is processed.
346
- use_glyphs: (bool) replace the Invalid Unicode by glyph numbers.
346
+ extract_words: (bool, False) include "words"-like output in page chunks
347
+ show_progress: (bool, False) print progress as each page is processed.
348
+ use_glyphs: (bool, False) replace the Invalid Unicode by glyph numbers.
349
+ ignore_alpha: (bool, False) ignore the alpha value, i.e. also accept text with alpha = 0 (transparent).
347
350
348
351
"""
349
352
if write_images is False and embed_images is False and force_text is False :
@@ -372,6 +375,8 @@ def to_markdown(
372
375
FONTSIZE_LIMIT = fontsize_limit
373
376
IGNORE_IMAGES = ignore_images
374
377
IGNORE_GRAPHICS = ignore_graphics
378
+ if doc .is_form_pdf or doc .has_annots ():
379
+ doc .bake ()
375
380
376
381
# for reflowable documents allow making 1 page for the whole document
377
382
if doc .is_reflowable :
@@ -394,7 +399,7 @@ def to_markdown(
394
399
margins = (0 , margins [0 ], 0 , margins [1 ])
395
400
if len (margins ) != 4 :
396
401
raise ValueError ("margins must be one, two or four floats" )
397
- elif not all ([ hasattr (m , "__float__" ) for m in margins ] ):
402
+ elif not all (hasattr (m , "__float__" ) for m in margins ):
398
403
raise ValueError ("margin values must be floats" )
399
404
400
405
# If "hdr_info" is not an object with a method "get_header_id", scan the
@@ -587,44 +592,28 @@ def write_text(
587
592
# make text string for the full line
588
593
text = " " .join ([s ["text" ] for s in spans ])
589
594
590
- # if line is a header, this will return multiple "#" characters,
591
- # otherwise an empty string
592
- hdr_string = max_header_id (spans , page = parms .page ) # a header?
593
-
594
595
# full line strikeout?
595
596
all_strikeout = all ([s ["char_flags" ] & 1 for s in spans ])
596
597
# full line italic?
597
598
all_italic = all ([s ["flags" ] & 2 for s in spans ])
598
599
# full line bold?
599
- all_bold = all ([s ["flags" ] & 16 or s ["char_flags" ] & 8 for s in spans ])
600
-
600
+ all_bold = all ([(s ["flags" ] & 16 ) or (s ["char_flags" ] & 8 ) for s in spans ])
601
601
# full line mono-spaced?
602
- if not IGNORE_CODE :
603
- all_mono = all ([s ["flags" ] & 8 for s in spans ])
604
- else :
605
- all_mono = False
602
+ all_mono = all ([s ["flags" ] & 8 for s in spans ])
606
603
607
- if all_mono and not hdr_string :
608
- if not code : # if not already in code output mode:
609
- out_string += "```\n " # switch on "code" mode
610
- code = True
611
- # compute approx. distance from left - assuming a width
612
- # of 0.5*fontsize.
613
- delta = int ((lrect .x0 - clip .x0 ) / (spans [0 ]["size" ] * 0.5 ))
614
- indent = " " * delta
615
-
616
- out_string += indent + text + "\n "
617
- continue # done with this line
604
+ # if line is a header, this will return multiple "#" characters,
605
+ # otherwise an empty string
606
+ hdr_string = max_header_id (spans , page = parms .page ) # a header?
618
607
619
608
if hdr_string : # if a header line skip the rest
620
609
if all_mono :
621
610
text = "`" + text + "`"
622
- if all_strikeout :
623
- text = "~~" + text + "~~"
624
611
if all_italic :
625
- text = "* " + text + "* "
612
+ text = "_ " + text + "_ "
626
613
if all_bold :
627
614
text = "**" + text + "**"
615
+ if all_strikeout :
616
+ text = "~~" + text + "~~"
628
617
if hdr_string != prev_hdr_string :
629
618
out_string += hdr_string + text + "\n "
630
619
else :
@@ -637,6 +626,23 @@ def write_text(
637
626
638
627
prev_hdr_string = hdr_string
639
628
629
+ # start or extend a code block
630
+ if all_mono and not IGNORE_CODE :
631
+ if not code : # if not already in code output mode:
632
+ out_string += "```\n " # switch on "code" mode
633
+ code = True
634
+ # compute approx. distance from left - assuming a width
635
+ # of 0.5*fontsize.
636
+ delta = int ((lrect .x0 - clip .x0 ) / (spans [0 ]["size" ] * 0.5 ))
637
+ indent = " " * delta
638
+
639
+ out_string += indent + text + "\n "
640
+ continue # done with this line
641
+
642
+ if code and not all_mono :
643
+ out_string += "```\n " # switch off code mode
644
+ code = False
645
+
640
646
span0 = spans [0 ]
641
647
bno = span0 ["block" ] # block number of line
642
648
if bno != prev_bno :
@@ -660,30 +666,30 @@ def write_text(
660
666
661
667
for i , s in enumerate (spans ): # iterate spans of the line
662
668
# decode font properties
663
- mono = s ["flags" ] & 8 and IGNORE_CODE is False
669
+ mono = s ["flags" ] & 8
664
670
bold = s ["flags" ] & 16 or s ["char_flags" ] & 8
665
671
italic = s ["flags" ] & 2
666
672
strikeout = s ["char_flags" ] & 1
667
673
668
- if mono :
669
- # this is text in some monospaced font
670
- out_string += f"`{ s ['text' ].strip ()} ` "
671
- continue
674
+ # if mono:
675
+ # # this is text in some monospaced font
676
+ # out_string += f"`{s['text'].strip()}` "
677
+ # continue
672
678
673
679
prefix = ""
674
680
suffix = ""
681
+ if mono :
682
+ prefix = "`" + prefix
683
+ suffix += "`"
675
684
if bold :
676
685
prefix = "**" + prefix
677
686
suffix += "**"
678
687
if italic :
679
- prefix = "* " + prefix
680
- suffix += "* "
688
+ prefix = "_ " + prefix
689
+ suffix += "_ "
681
690
if strikeout :
682
691
prefix = "~~" + prefix
683
692
suffix += "~~"
684
- if mono :
685
- prefix = "`" + prefix
686
- suffix += "`"
687
693
688
694
# convert intersecting link to markdown syntax
689
695
ltext = resolve_links (parms .links , s )
@@ -831,9 +837,12 @@ def page_is_ocr(page):
831
837
832
838
For this to be true, all text must be written as "ignore-text".
833
839
"""
834
- text_types = set ([b [0 ] for b in page .get_bboxlog () if "text" in b [0 ]])
835
- if text_types == {"ignore-text" }:
836
- return True
840
+ try :
841
+ text_types = set ([b [0 ] for b in page .get_bboxlog () if "text" in b [0 ]])
842
+ if text_types == {"ignore-text" }:
843
+ return True
844
+ except :
845
+ pass
837
846
return False
838
847
839
848
def get_bg_color (page ):
@@ -934,7 +943,9 @@ def get_page_output(
934
943
parms .graphics = []
935
944
parms .words = []
936
945
parms .line_rects = []
937
- parms .accept_invisible = page_is_ocr (page ) # accept invisible text
946
+ parms .accept_invisible = (
947
+ page_is_ocr (page ) or ignore_alpha
948
+ ) # accept invisible text
938
949
939
950
# determine background color
940
951
parms .bg_color = get_bg_color (page )
@@ -958,6 +969,8 @@ def get_page_output(
958
969
img_info = []
959
970
for i in range (len (img_info )):
960
971
img_info [i ]["bbox" ] = pymupdf .Rect (img_info [i ]["bbox" ])
972
+
973
+ # filter out images that are too small or outside the clip
961
974
img_info = [
962
975
i
963
976
for i in img_info
@@ -967,8 +980,19 @@ def get_page_output(
967
980
and i ["bbox" ].width > 3
968
981
and i ["bbox" ].height > 3
969
982
]
983
+
970
984
# sort descending by image area size
971
985
img_info .sort (key = lambda i : abs (i ["bbox" ]), reverse = True )
986
+
987
+ # subset of images truly inside the clip
988
+ sane = [i for i in img_info if parms .clip not in i ["bbox" ].irect ]
989
+ if len (sane ) < len (img_info ): # found some
990
+ img_info = sane # use those images instead
991
+ # output full page image
992
+ name = save_image (parms , parms .clip , "full" )
993
+ if name :
994
+ parms .md_string += GRAPHICS_TEXT % name
995
+
972
996
img_info = img_info [:30 ] # only accept the largest up to 30 images
973
997
# run from back to front (= small to large)
974
998
for i in range (len (img_info ) - 1 , 0 , - 1 ):
@@ -1152,7 +1176,7 @@ def get_page_output(
1152
1176
0
1153
1177
| mupdf .FZ_STEXT_CLIP
1154
1178
| mupdf .FZ_STEXT_ACCURATE_BBOXES
1155
- | mupdf .FZ_STEXT_IGNORE_ACTUALTEXT
1179
+ # | mupdf.FZ_STEXT_IGNORE_ACTUALTEXT
1156
1180
| 32768 # mupdf.FZ_STEXT_COLLECT_STYLES
1157
1181
)
1158
1182
# optionally replace 0xFFFD by glyph number
@@ -1253,7 +1277,7 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
1253
1277
import time
1254
1278
1255
1279
try :
1256
- filename = "sample_document.pdf"
1280
+ filename = sys . argv [ 1 ]
1257
1281
except IndexError :
1258
1282
print (f"Usage:\n python { os .path .basename (__file__ )} input.pdf" )
1259
1283
sys .exit ()
@@ -1284,11 +1308,6 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
1284
1308
md_string = to_markdown (
1285
1309
doc ,
1286
1310
pages = pages ,
1287
- # write_images=True,
1288
- force_text = True ,
1289
- ignore_images = True ,
1290
- ignore_graphics = True ,
1291
- table_strategy = None ,
1292
1311
)
1293
1312
FILENAME = doc .name
1294
1313
# output to a text file with extension ".md"
0 commit comments