 Preprocess the output received from the server and present it as the final result to the client
 """
 import os
+import re
 import tempfile
 import warnings
 import collections
+from statistics import mode
+from typing import List
 
 import pandas as pd
 
 
 class ConvertTo:
-    """Convert tabular JSON to a user-requested output format"""
-    FORMATS = {"df", "dataframe", "json", "csv", "dict"}
+    FORMATS = {"df", "dataframe", "json", "csv", "dict", "xlsx", "excel"}
     DEFAULT = "df"
 
-    def __init__(self, data: dict, fmt: str = DEFAULT, indexing: bool = False):
+    def __init__(self, server_response: dict, output_format: str = DEFAULT, indexing: bool = False, table_obj="TableJson"):
         """
-
-        :param data: Tabular JSON data from server
-        :param fmt: format to be converted into
+        Convert the server response to a user-requested output format on Tables
+        :param server_response: Tabular JSON data from server
+        :param output_format: format to be converted into
         :param indexing: row & column index consideration in the output
         """
-        self.data = data
-        self.output = self._converter(fmt.lower(), indexing=indexing)
+        self.server_response = server_response
+        self.output = self._converter(output_format.lower(), indexing=indexing, table_obj=table_obj)
 
-    def _converter(self, fmt: str, indexing: bool = False) -> list:
+    def _converter(self, fmt: str, indexing: bool = False, table_obj="TableJson") -> list:
         """
         Actual conversion takes place here using Pandas
         :param fmt: format to be converted into
         :param indexing: row index consideration in the output
         :return: list of tables converted into the requested output format
         """
         dfs = []
-        for table in self.data.get("Tables", []):
-            tmp = {int(k): v for k, v in table["TableJson"].items()}
+        for table in self.server_response.get("Tables", []):
+            tmp = {int(k): v for k, v in table[table_obj].items()}
             # To convert column indices to int to maintain the table order with more than 9 columns
-            cols = [str(x) for x in sorted([int(x) for x in tmp[0]])]
+            cols = [str(x) for x in sorted([int(x) for x in tmp[0]])] if tmp else None
             # To convert row indices to int and maintain the table order with more than 9 rows
             tmp = collections.OrderedDict(sorted(tmp.items()))
             dfs.append(pd.DataFrame.from_dict(tmp, orient="index", columns=cols))
@@ -52,9 +54,196 @@ def _converter(self, fmt: str, indexing: bool = False) -> list:
                 df.to_csv(csv_name, index=indexing, header=indexing)
                 output_location.append(csv_name)
             return output_location
+        elif fmt in ("xlsx", "excel"):
+            output_excel_location = os.path.join(tempfile.mkdtemp(), f"_tables_{len(dfs)}.xlsx")
+            if len(dfs) >= 10:
+                warnings.warn(f"There are {len(dfs)} tables extracted. Consider changing the output_format to 'csv' instead")
+            with pd.ExcelWriter(output_excel_location) as writer:
+                # One sheet per table; the context manager saves the workbook on exit
+                for n, df in enumerate(dfs):
+                    df.to_excel(writer, f'table_{n + 1}')
+            return [output_excel_location]
         elif fmt == "json":
             return [df.to_json() for df in dfs]
         else:
             warn_msg = f"Supported output formats {self.FORMATS} only. Assigned to default: {self.DEFAULT}"
             warnings.warn(warn_msg)
         return dfs
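+
+    # A minimal usage sketch (hypothetical payload; a real server response may
+    # carry more fields than the bare "Tables"/"TableJson" shape assumed here):
+    #
+    #   resp = {"Tables": [{"TableJson": {"0": {"0": "Name", "1": "Age"},
+    #                                     "1": {"0": "Alice", "1": "30"}}}]}
+    #   dfs = ConvertTo(server_response=resp, output_format="df").output
+    #   dfs[0]  # -> 2x2 pandas DataFrame, rows and columns in numeric order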
+
+
+class MakeCorrections:
+    def __init__(self, et_resp: dict = None, dataframes: List[pd.DataFrame] = None):
+        """
+        To apply post-processing techniques on the output
+        :param et_resp: ExtractTable response
+        :param dataframes: user preferred dataframe(s).
+            Default assumes all dataframes from the ExtractTable response, `et_resp`.
+            If both `et_resp` and `dataframes` are provided, the latter is considered for the processing
+        """
+        if et_resp:
+            self.dataframes = ConvertTo(server_response=et_resp).output
+
+        if not et_resp:
+            try:
+                self.dataframes = self.__isacceptable__(dataframes)
+            except ValueError:
+                raise ValueError("Either ExtractTable response or your preferred list of pandas dataframes is required")
+
+    @staticmethod
+    def __isacceptable__(dfs) -> List[pd.DataFrame]:
+        """Validate the `dataframes` param"""
+        if type(dfs) is list:
+            if all([type(df) is pd.DataFrame for df in dfs]):
+                return dfs
+        elif type(dfs) is pd.DataFrame:
+            return [dfs]
+        raise ValueError("Dataframes should be a list of dataframes or a dataframe")
+
+    def split_merged_rows(self) -> List[pd.DataFrame]:
+        """
+        To split the merged rows into possible multiple rows
+        :return: reformatted list of dataframes
+        """
+        for df_idx, each_df in enumerate(self.dataframes):
+            reformat = []
+            for row in each_df.to_numpy():
+                row = list(row)
+
+                # looks like the line separator is "\n"
+                separators = [col.strip().count("\n") for col in row]
+                # Statistical mode to assume the number of rows merged
+                mode_ = mode(separators)
+
+                if mode_:
+                    # split the merged rows inside the col
+                    tmp = [col.strip().split('\n', mode_) for col in row]
+                    for idx in range(len(tmp[0])):
+                        tmp_ = []
+                        for x in range(len(tmp)):
+                            try:
+                                val = tmp[x][idx]
+                            except IndexError:
+                                val = ""
+                            tmp_.append(val)
+                        reformat.append(tmp_)
+                else:
+                    reformat.append(row)
+
+            self.dataframes[df_idx] = pd.DataFrame(reformat)
+
+        return self.dataframes
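+
+    # Hypothetical example for split_merged_rows(): a row whose cells were
+    # OCR-merged as ["Alice\nBob", "30\n25"] (modal "\n" count of 1) is split
+    # into two rows, ["Alice", "30"] and ["Bob", "25"].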
+
+    def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool = False) -> List[pd.DataFrame]:
+        """
+        To split the merged columns into possible multiple columns
+        :param columns_idx: user preferred column indices.
+            Default loops through all columns to find numeric or decimal columns
+        :param force_split: To force the split through the columns
+        :return: reformatted list of dataframes
+        """
+        # TODO: Should we consider delimiter_pattern for the split?
+        for df_idx, df in enumerate(self.dataframes):
+            if not columns_idx:
+                columns_idx = df.columns
+
+            columns_idx = [str(x) for x in columns_idx]
+            reformat = []
+            for col_idx in columns_idx:
+                tmp = df[col_idx].str.split(expand=True)
+
+                if not any([not any(tmp.isna().any()), force_split]) or tmp.shape[-1] == 1:
+                    reformat.append(df[col_idx].tolist())
+                # If the user wanted force_split or the split columns have all cell values,
+                # then proceed with the split
+                else:
+                    reformat.extend([tmp[each].tolist() for each in tmp.columns])
+
+            self.dataframes[df_idx] = pd.DataFrame(reformat).T
+
+        return self.dataframes
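+
+    # Hypothetical example for split_merged_columns(): a column whose every
+    # cell holds two space-separated values, e.g. "12 34", expands into two
+    # columns ("12", "34"); ragged splits stay merged unless force_split=True.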
+
+    def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: str = ".", thousands_separator: str = ",", decimal_position: int = 2) -> List[pd.DataFrame]:
+        """
+        To fix decimal and thousands separator values. Often commas are detected as periods
+        :param columns_idx: user preferred column indices.
+            Default loops through all columns to find numeric or decimal columns
+        :param decimal_separator: preferred decimal separator
+        :param thousands_separator: preferred thousands separator
+        :param decimal_position: preferred decimal position
+        :return: corrected list of dataframes
+        """
+        # TODO: Should we consider only bad confidence values?
+        reg_ = f"[{decimal_separator}{thousands_separator}]"
+        if decimal_position > 0:
+            thou_regex = reg_ + '(?=.*' + reg_ + ')'
+        else:
+            thou_regex = reg_
+        decimal_position = int(decimal_position)
+
+        for df_idx, df in enumerate(self.dataframes):
+            if not columns_idx:
+                columns_idx = df.columns
+            columns_idx = [str(x) for x in columns_idx]
+
+            for col_idx in columns_idx:
+                digits = df[col_idx].str.count(pat=r'\d').sum()
+                chars = df[col_idx].str.count(pat=r'[\w]').sum()
+
+                if digits / chars < 0.75:
+                    # To infer a numeric or float column:
+                    # skip unless most of the word characters are digits
+                    continue
+
+                df[col_idx] = df[col_idx].str.strip()
+                df[col_idx].replace(regex={r'%s' % thou_regex: thousands_separator}, inplace=True)
+
+                # To correct the decimal position
+                if not decimal_position > 0:
+                    continue
+
+                for i, _ in enumerate(df[col_idx]):
+                    if not len(df[col_idx][i]) > decimal_position:
+                        # needs a length of at least decimal_position
+                        continue
+                    elif df[col_idx][i][-(decimal_position + 1)] == decimal_separator:
+                        # nothing to do if the decimal separator is already in place
+                        continue
+
+                    # If the character at the decimal position is not alphanumeric
+                    if re.search(r'\W+', df[col_idx][i][-(decimal_position + 1)]):
+                        digits = len(re.findall(r'\d', df[col_idx][i]))
+                        if digits / len(df[col_idx][i]) >= 0.5:
+                            df[col_idx][i] = df[col_idx][i][:-(decimal_position + 1)] + decimal_separator + df[col_idx][i][-decimal_position:]
+
+            self.dataframes[df_idx] = df
+        return self.dataframes
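+
+    # Hypothetical example for fix_decimal_format(): with the defaults
+    # (decimal_separator=".", thousands_separator=",", decimal_position=2),
+    # a numeric cell misread as "1.234.56" comes out as "1,234.56".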
+
+    def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
+        """
+        To fix date formats of the column
+        Eg: 12|12|2020 as 12/12/2020
+        :param columns_idx: user preferred column indices.
+            Default loops through all columns to find date columns
+        :param delimiter: "/" or "-" or whatever else you prefer
+        :return: corrected list of dataframes
+        """
+        date_regex = r'(\d{2}(\d{2})?)(\W)(\d{2}|[A-Za-z]{3,9})(\W)(\d{2}(\d{2})?)\b'
+        for df_idx, df in enumerate(self.dataframes):
+            if not columns_idx:
+                columns_idx = df.columns
+            columns_idx = [str(x) for x in columns_idx]
+
+            for col_idx in columns_idx:
+                dates = df[col_idx].str.count(pat=date_regex).sum()
+
+                if not (dates >= len(df) * 0.75):
+                    # To infer a date column:
+                    # check if most of the cells in the column match the date pattern
+                    continue
+
+                df[col_idx] = df[col_idx].str.strip()
+                df[col_idx].replace(regex={date_regex: r'\1%s\4%s\6' % (delimiter, delimiter)}, inplace=True)
+
+            self.dataframes[df_idx] = df
+
+        return self.dataframes
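+
+    # A hypothetical end-to-end sketch chaining the corrections on an
+    # ExtractTable response `et_resp`:
+    #
+    #   mc = MakeCorrections(et_resp=et_resp)
+    #   mc.split_merged_rows()
+    #   mc.fix_decimal_format()
+    #   dfs = mc.fix_date_format(delimiter="-")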