Post Processing Made Easy (#31)
* split_merged_rows functionality

* To fix decimal and thousands separator values

* split_merged_columns, fix_date_format functionalities

* validations added

* Easy Naming

* added `server_response` attribute to the session

* move unnecessary variable initialization

* Added Google Colab Contents

* Handle empty tables

* Save tables to multiple sheets of a single excel file

* standardized params naming

* Functionality to save Tables & Text output to local

* Version Update

* Updated Tutorial v2.1.0
akshowhini authored Aug 27, 2020
1 parent 55866b2 commit aab3060
Showing 5 changed files with 1,126 additions and 466 deletions.
49 changes: 43 additions & 6 deletions ExtractTable/__init__.py
@@ -49,14 +49,14 @@ def _make_request(self, method, host: urlparse, params: dict = None, data: dict
"""
tmp = self.__dict__.copy()
for _type, _obj in tmp.items():
-if _type not in ("api_key", "_session"):
+if _type not in ("api_key", "_session", "input_filename"):
self.__delattr__(_type)

host = host if not host.startswith("http") else host.split("/")[2]
url = urlparse.urlunparse(('https', host, '', '', '', ''))
self.ServerResponse = self._session.request(method, url, params=params, data=data, **kwargs)
ValidateResponse(resp=self.ServerResponse, show_warn=self._WARNINGS)

self.server_response = self.ServerResponse.json()
return self.ServerResponse.json()

def check_usage(self) -> dict:
@@ -150,11 +150,13 @@ def process_file(
"""
# Raise a warning if unknown format is requested
if output_format not in self._OUTPUT_FORMATS:
-default_format = "dict"
-warn_msg = f"Found: {output_format} as output_format; Allowed only {self._OUTPUT_FORMATS}. " \
-           f"Assigned default format: {default_format}"
+warn_msg = f"Found: '{output_format}' as output_format; Allowed formats are {self._OUTPUT_FORMATS}. " \
+           f"Assigned to default format: {self._DEFAULT}"
warnings.warn(warn_msg)

# To use the reference when saving the output
self.__setattr__('input_filename', os.path.basename(filepath))

try:
with PrepareInput(filepath, pages=pages) as infile:
with open(infile.filepath, 'rb') as fp:
@@ -168,5 +170,40 @@
for _type, _obj in trigger_resp.items():
self.__setattr__(_type, _obj)

-result = ConvertTo(data=trigger_resp, fmt=output_format, indexing=indexing).output
+result = ConvertTo(server_response=trigger_resp, output_format=output_format, indexing=indexing).output
return result

def save_output(self, output_folder: os.PathLike = "", output_format: str = "csv"):
"""
Save the objects of session data to user preferred location or a default folder
:param output_folder: user preferred output location; default tmp directory
:param output_format: needed only for tables; either CSV or XLSX
:return: location of the output
"""
input_fname = self.input_filename.rsplit('.', 1)[0]

output_format = output_format.lower()
if output_format not in ("csv", "xlsx"):
output_format = "csv"
warnings.warn("Invalid 'output_format' given. Defaulted to 'csv'")

table_outputs_path = ConvertTo(server_response=self.server_response, output_format=output_format).output

if output_folder:
if not os.path.exists(output_folder):
output_folder = os.path.split(table_outputs_path[0])[0]
warnings.warn(f"Your output_folder not exists. Saving the outputs to {output_folder}")
else:
for each_tbl_path in table_outputs_path:
os.replace(each_tbl_path, os.path.join(output_folder, input_fname+os.path.basename(each_tbl_path)))

else:
output_folder = os.path.split(table_outputs_path[0])[0]

for each_page in self.server_response.get("Lines", []):
page_txt_fname = os.path.join(output_folder, f"{input_fname}_Page_{str(each_page['Page'])}.txt")
page_txt = [each_line['Line'] for each_line in each_page['LinesArray']]
with open(page_txt_fname, "w", encoding="utf-8") as ofile:
ofile.write("\n".join(page_txt))

return output_folder
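A quick usage sketch of the new `save_output` flow; the API key and file names below are placeholders, not part of the library:

```python
from ExtractTable import ExtractTable

et_sess = ExtractTable(api_key="YOUR_API_KEY")                     # placeholder key
et_sess.process_file(filepath="invoice.pdf", output_format="df")  # placeholder file

# "extracts" must already exist; otherwise save_output warns and
# falls back to a temp directory. It returns the folder used either way.
saved_to = et_sess.save_output(output_folder="extracts", output_format="xlsx")
print(saved_to)
```

Tables are written with the input file name as a prefix, and any recognised text lines are saved as one `<input>_Page_<n>.txt` file per page.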
4 changes: 2 additions & 2 deletions ExtractTable/__version__.py
@@ -1,4 +1,4 @@
-VERSION = (2, 0, 2)
+VERSION = (2, 1, 0)
PRERELEASE = None # "alpha", "beta" or "rc"
REVISION = None

@@ -13,7 +13,7 @@ def generate_version():


__title__ = "ExtractTable"
__description__ = "Extract tabular data from images and scanned PDFs. Easily convert image to table, convert pdf to table"
__description__ = "Extract table data from images and scanned PDFs. Easily convert image to excel, convert pdf to table"
__url__ = "https://github.com/ExtractTable/ExtractTable-py"
__version__ = generate_version()
__author__ = "Saradhi"
213 changes: 201 additions & 12 deletions ExtractTable/common.py
@@ -2,40 +2,42 @@
Preprocess the output received from the server and interface it as the final result to the client
"""
import os
import re
import tempfile
import warnings
import collections
from statistics import mode
from typing import List

import pandas as pd


class ConvertTo:
"""Convert tabular JSON to an user requested output format"""
FORMATS = {"df", "dataframe", "json", "csv", "dict"}
FORMATS = {"df", "dataframe", "json", "csv", "dict", "xlsx", "excel"}
DEFAULT = "df"

-def __init__(self, data: dict, fmt: str = DEFAULT, indexing: bool = False):
+def __init__(self, server_response: dict, output_format: str = DEFAULT, indexing: bool = False, table_obj="TableJson"):
"""
-:param data: Tabular JSON data from server
-:param fmt: format to be converted into
+Convert the server response to a user-requested output format on Tables
+:param server_response: Tabular JSON data from server
+:param output_format: format to be converted into
:param indexing: row & column index consideration in the output
"""
-self.data = data
-self.output = self._converter(fmt.lower(), indexing=indexing)
+self.server_response = server_response
+self.output = self._converter(output_format.lower(), indexing=indexing, table_obj=table_obj)

-def _converter(self, fmt: str, indexing: bool = False) -> list:
+def _converter(self, fmt: str, indexing: bool = False, table_obj="TableJson") -> list:
"""
Actual conversion takes place here using Pandas
:param fmt: format to be converted into
:param indexing: row index consideration in the output
:return: list of tables converted into the requested output format
"""
dfs = []
-for table in self.data.get("Tables", []):
-tmp = {int(k): v for k, v in table["TableJson"].items()}
+for table in self.server_response.get("Tables", []):
+tmp = {int(k): v for k, v in table[table_obj].items()}
# To convert column indices to int to maintain the table order with more than 9 columns
-cols = [str(x) for x in sorted([int(x) for x in tmp[0]])]
+cols = [str(x) for x in sorted([int(x) for x in tmp[0]])] if tmp else None
# To convert row indices to int and maintain the table order with more than 9 rows
tmp = collections.OrderedDict(sorted(tmp.items()))
dfs.append(pd.DataFrame.from_dict(tmp, orient="index", columns=cols))
@@ -52,9 +54,196 @@ def _converter(self, fmt: str, indexing: bool = False) -> list:
df.to_csv(csv_name, index=indexing, header=indexing)
output_location.append(csv_name)
return output_location
elif fmt in ("xlsx", "excel"):
output_excel_location = os.path.join(tempfile.mkdtemp(), f"_tables_{len(dfs)}.xlsx")
if len(dfs) >= 10:
warnings.warn(f"There are {dfs} tables extracted. Consider to change the output_format to 'csv' instead")
with pd.ExcelWriter(output_excel_location) as writer:
for n, df in enumerate(dfs):
df.to_excel(writer, f'table_{n+1}')
writer.save()
return [output_excel_location]
elif fmt == "json":
return [df.to_json() for df in dfs]
else:
warn_msg = f"Supported output formats {self.FORMATS} only. Assigned to default: {self.DEFAULT}"
warnings.warn(warn_msg)
return dfs
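For reference, a minimal sketch of the reworked `ConvertTo` interface on a hand-made payload; the payload below is illustrative, not a real server reply:

```python
from ExtractTable.common import ConvertTo

# Illustrative 2x2 table in the server's TableJson layout:
# outer keys are row indices, inner keys are column indices
sample_response = {
    "Tables": [{
        "TableJson": {
            "0": {"0": "Item", "1": "Qty"},
            "1": {"0": "Apples", "1": "4"},
        }
    }]
}

dfs = ConvertTo(server_response=sample_response, output_format="df").output
print(dfs[0])
```

Passing `output_format="xlsx"` instead writes every table to one sheet of a single Excel file and returns its location in a one-element list.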


class MakeCorrections:
def __init__(self, et_resp: dict = None, dataframes: List[pd.DataFrame] = None):
"""
To apply post processing techniques on the output
:param et_resp: ExtractTable response
:param dataframes: user preferred dataframe(s).
Default assumes all dataframes from the extracttable response, `et_resp`.
If both `et_resp` and `dataframes` are provided, the latter is considered for the processing
"""
if et_resp:
self.dataframes = ConvertTo(server_response=et_resp).output

if not et_resp:
try:
self.dataframes = self.__isacceptable__(dataframes)
except ValueError:
raise ValueError("Either ExtractTable response or your preferred list of pandas dataframes is required")

@staticmethod
def __isacceptable__(dfs) -> List[pd.DataFrame]:
"""Validate the `dataframes` param"""
if type(dfs) is list:
if all([type(df) is pd.DataFrame for df in dfs]):
return dfs
elif type(dfs) is pd.DataFrame:
return [dfs]
raise ValueError("Dataframes should be list of dataframes or a dataframe")

def split_merged_rows(self) -> List[pd.DataFrame]:
"""
To split merged rows into multiple rows where possible
:return: reformatted list of dataframes
"""
for df_idx, each_df in enumerate(self.dataframes):
reformat = []
for row in each_df.to_numpy():
row = list(row)

# looks like line separator is " "
separators = [col.strip().count(" ") for col in row]
# Statistical mode to assume the number of rows merged
mode_ = mode(separators)

if mode_:
# split the merged rows inside the col
tmp = [col.strip().split(' ', mode_) for col in row]
for idx in range(len(tmp[0])):
tmp_ = []
for x in range(len(tmp)):
try:
val = tmp[x][idx]
except IndexError:
val = ""
tmp_.append(val)
reformat.append(tmp_)
else:
reformat.append(row)

self.dataframes[df_idx] = pd.DataFrame(reformat)

return self.dataframes
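A toy illustration of the intended behaviour, assuming the cells of two logical rows were merged with a space:

```python
import pandas as pd
from ExtractTable.common import MakeCorrections

# Row 0 holds two merged rows: every cell has one space, so the
# per-cell separator mode is 1 and the row is split in two
merged = pd.DataFrame([["Apples Oranges", "4 7"], ["Pears", "2"]])

fixed = MakeCorrections(dataframes=merged).split_merged_rows()
print(fixed[0])  # expected rows: Apples/4, Oranges/7, Pears/2
```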

def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool = False) -> List[pd.DataFrame]:
"""
To split merged columns into multiple columns where possible
:param columns_idx: user preferred columns indices.
Default loops through all columns to find numeric or decimal columns
:param force_split: to force the split on the given columns
:return: reformatted list of dataframes
"""
# TODO: Should we consider delimiter_pattern for the split?
for df_idx, df in enumerate(self.dataframes):
if not columns_idx:
columns_idx = df.columns

columns_idx = [str(x) for x in columns_idx]
reformat = []
for col_idx in columns_idx:
tmp = df[col_idx].str.split(expand=True)

if not any([not any(tmp.isna().any()), force_split]) or tmp.shape[-1] == 1:
reformat.append(df[col_idx].tolist())
# If user wanted force_split or the split columns have all cell values
# then proceed next
else:
reformat.extend([tmp[each].tolist() for each in tmp.columns])

self.dataframes[df_idx] = pd.DataFrame(reformat).T

return self.dataframes
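A matching sketch for the column case; note the string column label, since the method addresses columns as `str(index)`:

```python
import pandas as pd
from ExtractTable.common import MakeCorrections

# One OCR column that really holds two columns of data
df = pd.DataFrame({"0": ["Apples 4", "Pears 2"]})

fixed = MakeCorrections(dataframes=df).split_merged_columns()
print(fixed[0])  # expected: two columns after the whitespace split
```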

def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: str = ".", thousands_separator: str = ",", decimal_position: int = 2) -> List[pd.DataFrame]:
"""
To fix decimal and thousands separator values. Often commas are detected as periods
:param columns_idx: user preferred columns indices.
Default loops through all columns to find numeric or decimal columns
:param decimal_separator: preferred decimal separator
:param thousands_separator: preferred thousands separator
:param decimal_position: preferred decimal position
:return: corrected list of dataframes
"""
# TODO: Should we consider only bad confidence values?
reg_ = f"[{decimal_separator}{thousands_separator}]"
if decimal_position > 0:
thou_regex = reg_ + '(?=.*' + reg_ + ')'
else:
thou_regex = reg_
decimal_position = int(decimal_position)

for df_idx, df in enumerate(self.dataframes):
if not columns_idx:
columns_idx = df.columns
columns_idx = [str(x) for x in columns_idx]

for col_idx in columns_idx:
digits = df[col_idx].str.count(pat=r'\d').sum()
chars = df[col_idx].str.count(pat=r'[\w]').sum()

if not chars or digits/chars < 0.75:
# To infer a numeric or float column:
# skip unless at least 75% of the word characters are digits
continue

df[col_idx] = df[col_idx].str.strip()
df[col_idx].replace(regex={r'%s' % thou_regex: thousands_separator}, inplace=True)

# To correct decimal position
if not decimal_position > 0:
continue

for i, _ in enumerate(df[col_idx]):
if not len(df[col_idx][i]) > decimal_position:
# needs a length of at least decimal_position + 1
continue
elif df[col_idx][i][-(decimal_position+1)] == decimal_separator:
# nothing to do if decimal separator already in place
continue

# If the character at the decimal position is not alphanumeric
if re.search(r'\W+', df[col_idx][i][-(decimal_position+1)]):
digits = len(re.findall(r'\d', df[col_idx][i]))
if digits/len(df[col_idx][i]) >= 0.5:
df[col_idx][i] = df[col_idx][i][:-(decimal_position+1)] + decimal_separator + df[col_idx][i][-decimal_position:]

self.dataframes[df_idx] = df
return self.dataframes
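A small sketch with made-up values showing both corrections at once: a mis-read thousands separator and a mis-read decimal separator:

```python
import pandas as pd
from ExtractTable.common import MakeCorrections

# "1.234.56" needs its first '.' turned into a thousands ',';
# "9.876,54" additionally needs ',' fixed at the decimal position
df = pd.DataFrame({"0": ["1.234.56", "9.876,54"]})

fixed = MakeCorrections(dataframes=df).fix_decimal_format()
print(fixed[0])  # expected: "1,234.56" and "9,876.54"
```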

def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
"""
To fix date formats of the column
Eg: 12|12|2020 as 12/12/2020
:param columns_idx: user preferred columns indices.
Default loops through all columns to find Date Columns
:param delimiter: "/" or "-" or whatever else you prefer
:return: corrected list of dataframes
"""
date_regex = r'(\d{2}(\d{2})?)(\W)(\d{2}|[A-Za-z]{3,9})(\W)(\d{2}(\d{2})?)\b'
for df_idx, df in enumerate(self.dataframes):
if not columns_idx:
columns_idx = df.columns
columns_idx = [str(x) for x in columns_idx]

for col_idx in columns_idx:
dates = df[col_idx].str.count(pat=date_regex).sum()

if not (dates >= len(df) * 0.75):
# To infer a date column:
# skip unless at least 75% of the cells match the date pattern
continue

df[col_idx] = df[col_idx].str.strip()
df[col_idx].replace(regex={date_regex: r'\1%s\4%s\6' % (delimiter, delimiter)}, inplace=True)

self.dataframes[df_idx] = df

return self.dataframes
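And a sketch for the date fix, again with made-up cell values:

```python
import pandas as pd
from ExtractTable.common import MakeCorrections

# Both cells match the date pattern, so the column qualifies
# and the noisy delimiters are rewritten with '/'
df = pd.DataFrame({"0": ["12|12|2020", "01-02-2019"]})

fixed = MakeCorrections(dataframes=df).fix_date_format(delimiter="/")
print(fixed[0])  # expected: "12/12/2020" and "01/02/2019"
```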
30 changes: 27 additions & 3 deletions README.md
@@ -32,9 +32,33 @@ table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables, output_f
```

## Detailed Library Usage
-[example-code.ipynb](example-code.ipynb)

-<a href="https://colab.research.google.com/github/ExtractTable/ExtractTable-py/blob/master/example-code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
+The tutorial available at <a href="https://colab.research.google.com/github/ExtractTable/ExtractTable-py/blob/master/example-code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a> takes you through

```Markup
1. Installation
2. Import and check version
3. Create Session & Validate API Key
3.1 Create Session with your API Key
3.2 Validate the Key and check the plan usage
3.3 Check Usage Details
4. Trigger the extraction process
4.1 Accepted Input Types
4.2 Process an IMAGE Input
4.3 Process a PDF Input
4.4 Output options
4.5 Explore session objects
5. Explore the Output
5.1 Output Structure
5.2 Output Details
6. Make Corrections (see the sketch after this list)
6.1 Split Merged Rows
6.2 Split Merged Columns
6.3 Fix Decimal Format
6.4 Fix Date Format
7. Helpful Code Snippets
7.1 Get text data
7.2 Table output to Excel
```
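The post-processing steps of section 6 can also be driven straight from a session; a minimal sketch, where the API key and file path are placeholders:

```python
from ExtractTable import ExtractTable
from ExtractTable.common import MakeCorrections

et_sess = ExtractTable(api_key="YOUR_API_KEY")                   # placeholder key
et_sess.process_file(filepath="sample.pdf", output_format="df")  # placeholder file

# Post-process the tables held in the session's server response
corrected_dfs = MakeCorrections(et_resp=et_sess.server_response).split_merged_rows()
```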

### Woahh, as simple as that ?!

