Skip to content

Commit fd56d5e

Browse files
committed
Correctly ignoring fill-only vector graphics
For the table detection strategy "lines_strict", paths of type "f" (fill-only, i.e. without borders) must be excluded. This check is currently in the wrong place: it happens too late. This fix moves the exclusion to the point where the vector graphics are extracted.
1 parent d183c2f commit fd56d5e

File tree

1 file changed

+18
-10
lines changed

1 file changed

+18
-10
lines changed

src/table.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1886,8 +1886,24 @@ def clean_graphics():
18861886

18871887
parea = abs(page.rect) * 0.8 # area of the full page (80%)
18881888

1889-
# exclude graphics that are too large
1890-
paths = [p for p in page.get_drawings() if abs(p["rect"]) < parea]
1889+
# exclude irrelevant graphics
1890+
paths = []
1891+
for p in page.get_drawings():
1892+
if abs(p["rect"]) >= parea:
1893+
continue
1894+
if "s" in p["type"]:
1895+
paths.append(p)
1896+
continue
1897+
if (
1898+
p["rect"].width > 3
1899+
and p["rect"].height > 3
1900+
and (
1901+
tset.vertical_strategy == "lines_strict"
1902+
or tset.horizontal_strategy == "lines_strict"
1903+
)
1904+
):
1905+
continue
1906+
paths.append(p)
18911907

18921908
# make a list of vector graphics rectangles (IRects are sufficient)
18931909
prects = sorted([p["rect"] for p in paths], key=lambda r: (r.y1, r.x0))
@@ -1998,14 +2014,6 @@ def make_line(p, p1, p2, clip):
19982014
return line_dict
19992015

20002016
for p in paths:
2001-
if p["type"] == "f" and p["fill"] == (1, 1, 1):
2002-
continue
2003-
if p["type"] == "f" and p["rect"].width > 3 and p["rect"].height > 3:
2004-
if (
2005-
tset.vertical_strategy == "lines_strict"
2006-
or tset.horizontal_strategy == "lines_strict"
2007-
):
2008-
continue
20092017
items = p["items"] # items in this path
20102018

20112019
# if 'closePath', add a line from last to first point

0 commit comments

Comments
 (0)