Commit aab3060
Post Processing Made Easy (#31)
* split_merged_rows functionality
* To fix decimal and thousands separator values
* split_merged_columns, fix_date_format functionalities
* validations added
* Easy Naming
* added `server_response` attribute to the session
* move unnecessary variable initialization
* Added Google Colab Contents
* Handle empty tables
* Save tables to multiple sheets of a single excel file
* standardized params naming
* Functionality to save Tables & Text output to local
* Version Update
* Updated Tutorial v2.1.0
1 parent 55866b2 commit aab3060

File tree

5 files changed: +1126, -466 lines

ExtractTable/__init__.py

Lines changed: 43 additions & 6 deletions
@@ -49,14 +49,14 @@ def _make_request(self, method, host: urlparse, params: dict = None, data: dict
         """
         tmp = self.__dict__.copy()
         for _type, _obj in tmp.items():
-            if _type not in ("api_key", "_session"):
+            if _type not in ("api_key", "_session", "input_filename"):
                 self.__delattr__(_type)

         host = host if not host.startswith("http") else host.split("/")[2]
         url = urlparse.urlunparse(('https', host, '', '', '', ''))
         self.ServerResponse = self._session.request(method, url, params=params, data=data, **kwargs)
         ValidateResponse(resp=self.ServerResponse, show_warn=self._WARNINGS)
-
+        self.server_response = self.ServerResponse.json()
         return self.ServerResponse.json()

     def check_usage(self) -> dict:
@@ -150,11 +150,13 @@ def process_file(
         """
         # Raise a warning if unknown format is requested
         if output_format not in self._OUTPUT_FORMATS:
-            default_format = "dict"
-            warn_msg = f"Found: {output_format} as output_format; Allowed only {self._OUTPUT_FORMATS}. " \
-                       f"Assigned default format: {default_format}"
+            warn_msg = f"Found: '{output_format}' as output_format; Allowed formats are {self._OUTPUT_FORMATS}. " \
+                       f"Assigned to default format: {self._DEFAULT}"
             warnings.warn(warn_msg)

+        # To use the reference when saving the output
+        self.__setattr__('input_filename', os.path.basename(filepath))
+
         try:
             with PrepareInput(filepath, pages=pages) as infile:
                 with open(infile.filepath, 'rb') as fp:
@@ -168,5 +170,40 @@ def process_file(
             for _type, _obj in trigger_resp.items():
                 self.__setattr__(_type, _obj)

-        result = ConvertTo(data=trigger_resp, fmt=output_format, indexing=indexing).output
+        result = ConvertTo(server_response=trigger_resp, output_format=output_format, indexing=indexing).output
         return result
+
+    def save_output(self, output_folder: os.PathLike = "", output_format: str = "csv"):
+        """
+        Save the objects of session data to a user preferred location or a default folder
+        :param output_folder: user preferred output location; defaults to a tmp directory
+        :param output_format: needed only for tables; CSV or XLSX
+        :return: location of the output
+        """
+        input_fname = self.input_filename.rsplit('.', 1)[0]
+
+        output_format = output_format.lower()
+        if output_format not in ("csv", "xlsx"):
+            output_format = "csv"
+            warnings.warn("Invalid 'output_format' given. Defaulted to 'csv'")
+
+        table_outputs_path = ConvertTo(server_response=self.server_response, output_format=output_format).output
+
+        if output_folder:
+            if not os.path.exists(output_folder):
+                output_folder = os.path.split(table_outputs_path[0])[0]
+                warnings.warn(f"Your output_folder does not exist. Saving the outputs to {output_folder}")
+            else:
+                for each_tbl_path in table_outputs_path:
+                    os.replace(each_tbl_path, os.path.join(output_folder, input_fname + os.path.basename(each_tbl_path)))
+
+        else:
+            output_folder = os.path.split(table_outputs_path[0])[0]
+
+        for each_page in self.server_response.get("Lines", []):
+            page_txt_fname = os.path.join(output_folder, f"{input_fname}_Page_{str(each_page['Page'])}.txt")
+            page_txt = [each_line['Line'] for each_line in each_page['LinesArray']]
+            with open(page_txt_fname, "w", encoding="utf-8") as ofile:
+                ofile.write("\n".join(page_txt))
+
+        return output_folder
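To see the new pieces end to end, here is a minimal usage sketch, assuming a session created with your API key as in the README; the key placeholder and file name are illustrative, not part of the library:

```python
# Minimal sketch of the new flow in this commit: process a file, then persist
# the outputs. "YOUR_API_KEY" and "invoice.pdf" are placeholders.
from ExtractTable import ExtractTable

et_sess = ExtractTable(api_key="YOUR_API_KEY")
tables = et_sess.process_file(filepath="invoice.pdf", output_format="df")

# save_output writes table files (CSV or XLSX) plus per-page text files and
# returns the folder it used; it falls back to a tmp directory when the
# given folder is missing or not supplied.
saved_to = et_sess.save_output(output_folder="outputs", output_format="xlsx")
print(saved_to)
```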

ExtractTable/__version__.py

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-VERSION = (2, 0, 2)
+VERSION = (2, 1, 0)
 PRERELEASE = None  # "alpha", "beta" or "rc"
 REVISION = None

@@ -13,7 +13,7 @@ def generate_version():


 __title__ = "ExtractTable"
-__description__ = "Extract tabular data from images and scanned PDFs. Easily convert image to table, convert pdf to table"
+__description__ = "Extract table data from images and scanned PDFs. Easily convert image to excel, convert pdf to table"
 __url__ = "https://github.com/ExtractTable/ExtractTable-py"
 __version__ = generate_version()
 __author__ = "Saradhi"

ExtractTable/common.py

Lines changed: 201 additions & 12 deletions
@@ -2,40 +2,42 @@
 Preprocess the output received from server and interface as a final result to the client
 """
 import os
+import re
 import tempfile
 import warnings
 import collections
+from statistics import mode
+from typing import List

 import pandas as pd


 class ConvertTo:
-    """Convert tabular JSON to an user requested output format"""
-    FORMATS = {"df", "dataframe", "json", "csv", "dict"}
+    FORMATS = {"df", "dataframe", "json", "csv", "dict", "xlsx", "excel"}
     DEFAULT = "df"

-    def __init__(self, data: dict, fmt: str = DEFAULT, indexing: bool = False):
+    def __init__(self, server_response: dict, output_format: str = DEFAULT, indexing: bool = False, table_obj="TableJson"):
         """
-
-        :param data: Tabular JSON data from server
-        :param fmt: format to be converted into
+        Convert the server response to a user requested output format on Tables
+        :param server_response: Tabular JSON data from server
+        :param output_format: format to be converted into
         :param indexing: row & column index consideration in the output
         """
-        self.data = data
-        self.output = self._converter(fmt.lower(), indexing=indexing)
+        self.server_response = server_response
+        self.output = self._converter(output_format.lower(), indexing=indexing, table_obj=table_obj)

-    def _converter(self, fmt: str, indexing: bool = False) -> list:
+    def _converter(self, fmt: str, indexing: bool = False, table_obj="TableJson") -> list:
         """
         Actual conversion takes place here using Pandas
         :param fmt: format to be converted into
         :param indexing: row index consideration in the output
         :return: list of tables converted into the requested output format
         """
         dfs = []
-        for table in self.data.get("Tables", []):
-            tmp = {int(k): v for k, v in table["TableJson"].items()}
+        for table in self.server_response.get("Tables", []):
+            tmp = {int(k): v for k, v in table[table_obj].items()}
             # To convert column indices to int to maintain the table order with more than 9 columns
-            cols = [str(x) for x in sorted([int(x) for x in tmp[0]])]
+            cols = [str(x) for x in sorted([int(x) for x in tmp[0]])] if tmp else None
             # To convert row indices to int and maintain the table order with more than 9 rows
             tmp = collections.OrderedDict(sorted(tmp.items()))
             dfs.append(pd.DataFrame.from_dict(tmp, orient="index", columns=cols))
@@ -52,9 +54,196 @@ def _converter(self, fmt: str, indexing: bool = False) -> list:
                 df.to_csv(csv_name, index=indexing, header=indexing)
                 output_location.append(csv_name)
             return output_location
+        elif fmt in ("xlsx", "excel"):
+            output_excel_location = os.path.join(tempfile.mkdtemp(), f"_tables_{len(dfs)}.xlsx")
+            if len(dfs) >= 10:
+                warnings.warn(f"There are {len(dfs)} tables extracted. Consider changing the output_format to 'csv' instead")
+            with pd.ExcelWriter(output_excel_location) as writer:
+                for n, df in enumerate(dfs):
+                    df.to_excel(writer, f'table_{n+1}')
+            return [output_excel_location]
         elif fmt == "json":
             return [df.to_json() for df in dfs]
         else:
             warn_msg = f"Supported output formats {self.FORMATS} only. Assigned to default: {self.DEFAULT}"
             warnings.warn(warn_msg)
             return dfs
+
+
+class MakeCorrections:
+    def __init__(self, et_resp: dict = None, dataframes: List[pd.DataFrame] = None):
+        """
+        To apply post processing techniques on the output
+        :param et_resp: ExtractTable response
+        :param dataframes: user preferred dataframe(s).
+            Default assumes all dataframes from the ExtractTable response, `et_resp`.
+            If both `et_resp` and `dataframes` are provided, the latter is considered for the processing
+        """
+        if et_resp:
+            self.dataframes = ConvertTo(server_response=et_resp).output
+
+        if not et_resp:
+            try:
+                self.dataframes = self.__isacceptable__(dataframes)
+            except ValueError:
+                raise ValueError("Either ExtractTable response or your preferred list of pandas dataframes is required")
+
+    @staticmethod
+    def __isacceptable__(dfs) -> List[pd.DataFrame]:
+        """Validate the `dataframes` param"""
+        if type(dfs) is list:
+            if all([type(df) is pd.DataFrame for df in dfs]):
+                return dfs
+        elif type(dfs) is pd.DataFrame:
+            return [dfs]
+        raise ValueError("Dataframes should be a list of dataframes or a dataframe")
+
+    def split_merged_rows(self) -> List[pd.DataFrame]:
+        """
+        To split the merged rows into possible multiple rows
+        :return: reformatted list of dataframes
+        """
+        for df_idx, each_df in enumerate(self.dataframes):
+            reformat = []
+            for row in each_df.to_numpy():
+                row = list(row)
+
+                # looks like the line separator is " "
+                separators = [col.strip().count(" ") for col in row]
+                # Statistical mode to assume the number of rows merged
+                mode_ = mode(separators)
+
+                if mode_:
+                    # split the merged rows inside the col
+                    tmp = [col.strip().split(' ', mode_) for col in row]
+                    for idx in range(len(tmp[0])):
+                        tmp_ = []
+                        for x in range(len(tmp)):
+                            try:
+                                val = tmp[x][idx]
+                            except IndexError:
+                                val = ""
+                            tmp_.append(val)
+                        reformat.append(tmp_)
+                else:
+                    reformat.append(row)
+
+            self.dataframes[df_idx] = pd.DataFrame(reformat)
+
+        return self.dataframes
+
+    def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool = False) -> List[pd.DataFrame]:
+        """
+        To split the merged columns into possible multiple columns
+        :param columns_idx: user preferred column indices.
+            Default loops through all columns to find numeric or decimal columns
+        :param force_split: to force the split through the columns
+        :return: reformatted list of dataframes
+        """
+        # TODO: Should we consider delimiter_pattern for the split?
+        for df_idx, df in enumerate(self.dataframes):
+            if not columns_idx:
+                columns_idx = df.columns
+
+            columns_idx = [str(x) for x in columns_idx]
+            reformat = []
+            for col_idx in columns_idx:
+                tmp = df[col_idx].str.split(expand=True)
+
+                if not any([not any(tmp.isna().any()), force_split]) or tmp.shape[-1] == 1:
+                    reformat.append(df[col_idx].tolist())
+                # If the user wanted force_split, or the split columns have all cell values,
+                # then proceed with the split
+                else:
+                    reformat.extend([tmp[each].tolist() for each in tmp.columns])
+
+            self.dataframes[df_idx] = pd.DataFrame(reformat).T
+
+        return self.dataframes
+
+    def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: str = ".", thousands_separator: str = ",", decimal_position: int = 2) -> List[pd.DataFrame]:
+        """
+        To fix decimal and thousands separator values. Often commas are detected as periods
+        :param columns_idx: user preferred column indices.
+            Default loops through all columns to find numeric or decimal columns
+        :param decimal_separator: preferred decimal separator
+        :param thousands_separator: preferred thousands separator
+        :param decimal_position: preferred decimal position
+        :return: corrected list of dataframes
+        """
+        # TODO: Should we consider only bad confidence values?
+        reg_ = f"[{decimal_separator}{thousands_separator}]"
+        if decimal_position > 0:
+            thou_regex = reg_ + '(?=.*' + reg_ + ')'
+        else:
+            thou_regex = reg_
+        decimal_position = int(decimal_position)
+
+        for df_idx, df in enumerate(self.dataframes):
+            if not columns_idx:
+                columns_idx = df.columns
+            columns_idx = [str(x) for x in columns_idx]
+
+            for col_idx in columns_idx:
+                digits = df[col_idx].str.count(pat=r'\d').sum()
+                chars = df[col_idx].str.count(pat=r'[\w]').sum()
+
+                if digits/chars < 0.75:
+                    # To infer a numeric or float column,
+                    # check if the column contains more digits than characters
+                    continue
+
+                df[col_idx] = df[col_idx].str.strip()
+                df[col_idx].replace(regex={r'%s' % thou_regex: thousands_separator}, inplace=True)
+
+                # To correct the decimal position
+                if not decimal_position > 0:
+                    continue
+
+                for i, _ in enumerate(df[col_idx]):
+                    if not len(df[col_idx][i]) > decimal_position:
+                        # needs a length of at least decimal_position
+                        continue
+                    elif df[col_idx][i][-(decimal_position+1)] == decimal_separator:
+                        # nothing to do if the decimal separator is already in place
+                        continue
+
+                    # If the character at the decimal position is not alphanumeric
+                    if re.search(r'\W+', df[col_idx][i][-(decimal_position+1)]):
+                        digits = len(re.findall(r'\d', df[col_idx][i]))
+                        if digits/len(df[col_idx][i]) >= 0.5:
+                            df[col_idx][i] = df[col_idx][i][:-(decimal_position+1)] + decimal_separator + df[col_idx][i][-decimal_position:]
+
+            self.dataframes[df_idx] = df
+        return self.dataframes
+
+    def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
+        """
+        To fix the date format of a column
+        Eg: 12|12|2020 as 12/12/2020
+        :param columns_idx: user preferred column indices.
+            Default loops through all columns to find date columns
+        :param delimiter: "/" or "-" or whatever else you prefer
+        :return: corrected list of dataframes
+        """
+        date_regex = r'(\d{2}(\d{2})?)(\W)(\d{2}|[A-Za-z]{3,9})(\W)(\d{2}(\d{2})?)\b'
+        for df_idx, df in enumerate(self.dataframes):
+            if not columns_idx:
+                columns_idx = df.columns
+            columns_idx = [str(x) for x in columns_idx]
+
+            for col_idx in columns_idx:
+                dates = df[col_idx].str.count(pat=date_regex).sum()
+
+                if not (dates >= len(df) * 0.75):
+                    # To infer a date column, check that at least 75% of the
+                    # cells match the date pattern
+                    continue
+
+                df[col_idx] = df[col_idx].str.strip()
+                df[col_idx].replace(regex={date_regex: r'\1%s\4%s\6' % (delimiter, delimiter)}, inplace=True)
+
+            self.dataframes[df_idx] = df
+
+        return self.dataframes
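For orientation, a short sketch of chaining the new `MakeCorrections` helpers, assuming `et_sess` is a session that has already processed a file as above; the variable names are illustrative:

```python
# Sketch: post-process extracted tables with the new MakeCorrections class.
# et_sess.server_response is the dict stored by a prior process_file call.
from ExtractTable.common import MakeCorrections

mc = MakeCorrections(et_resp=et_sess.server_response)
dfs = mc.split_merged_rows()              # unstack rows merged into one cell
dfs = mc.split_merged_columns()           # split space-merged columns
dfs = mc.fix_decimal_format()             # normalize "." and "," separators
dfs = mc.fix_date_format(delimiter="/")   # e.g. 12|12|2020 -> 12/12/2020
```

Each method mutates and returns `self.dataframes`, so the corrections compound across calls on the same instance.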

README.md

Lines changed: 27 additions & 3 deletions
@@ -32,9 +32,33 @@ table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables, output_f
 ```

 ## Detailed Library Usage
-[example-code.ipynb](example-code.ipynb)
-
-<a href="https://colab.research.google.com/github/ExtractTable/ExtractTable-py/blob/master/example-code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
+The tutorial available at <a href="https://colab.research.google.com/github/ExtractTable/ExtractTable-py/blob/master/example-code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a> takes you through
+
+```Markup
+1. Installation
+2. Import and check version
+3. Create Session & Validate API Key
+    3.1 Create Session with your API Key
+    3.2 Validate the Key and check the plan usage
+    3.3 Check Usage Details
+4. Trigger the extraction process
+    4.1 Accepted Input Types
+    4.2 Process an IMAGE Input
+    4.3 Process a PDF Input
+    4.4 Output options
+    4.5 Explore session objects
+5. Explore the Output
+    5.1 Output Structure
+    5.2 Output Details
+6. Make Corrections
+    6.1 Split Merged Rows
+    6.2 Split Merged Columns
+    6.3 Fix Decimal Format
+    6.4 Fix Date Format
+7. Helpful Code Snippets
+    7.1 Get text data
+    7.2 Table output to Excel
+```

 ### Woahh, as simple as that ?!

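As a pointer for item 7.1 in the tutorial outline above, a hedged sketch of pulling the text output straight from the session, based on the `Lines` structure that `save_output` consumes in this commit; `et_sess` is assumed to hold an already-processed session:

```python
# Sketch for "7.1 Get text data": walk the Lines structure of the stored
# server response (the same structure save_output writes to .txt files).
for each_page in et_sess.server_response.get("Lines", []):
    page_text = "\n".join(each_line['Line'] for each_line in each_page['LinesArray'])
    print(f"--- Page {each_page['Page']} ---")
    print(page_text)
```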