Post Processing Made Easy (#31)
* split_merged_rows functionality

* To fix decimal and thousands separator values

* split_merged_columns, fix_date_format functionalities

* validations added

* Easy Naming

* added `server_response` attribute to the session

* move unnecessary variable initialization

* Added Google Colab Contents

* Handle empty tables

* Save tables to multiple sheets of a single excel file

* standardized params naming

* Functionality to save Tables & Text output to local

* Version Update

* Updated Tutorial v2.1.0
akshowhini authored Aug 27, 2020
1 parent 55866b2 commit aab3060
Showing 5 changed files with 1,126 additions and 466 deletions.
49 changes: 43 additions & 6 deletions ExtractTable/__init__.py
@@ -49,14 +49,14 @@ def _make_request(self, method, host: urlparse, params: dict = None, data: dict
"""
tmp = self.__dict__.copy()
for _type, _obj in tmp.items():
-if _type not in ("api_key", "_session"):
+if _type not in ("api_key", "_session", "input_filename"):
self.__delattr__(_type)

host = host if not host.startswith("http") else host.split("/")[2]
url = urlparse.urlunparse(('https', host, '', '', '', ''))
self.ServerResponse = self._session.request(method, url, params=params, data=data, **kwargs)
ValidateResponse(resp=self.ServerResponse, show_warn=self._WARNINGS)

self.server_response = self.ServerResponse.json()
return self.ServerResponse.json()

def check_usage(self) -> dict:
@@ -150,11 +150,13 @@ def process_file(
"""
# Raise a warning if unknown format is requested
if output_format not in self._OUTPUT_FORMATS:
-default_format = "dict"
-warn_msg = f"Found: {output_format} as output_format; Allowed only {self._OUTPUT_FORMATS}. " \
-           f"Assigned default format: {default_format}"
+warn_msg = f"Found: '{output_format}' as output_format; Allowed formats are {self._OUTPUT_FORMATS}. " \
+           f"Assigned to default format: {self._DEFAULT}"
warnings.warn(warn_msg)

# To use the reference when saving the output
self.__setattr__('input_filename', os.path.basename(filepath))

try:
with PrepareInput(filepath, pages=pages) as infile:
with open(infile.filepath, 'rb') as fp:
@@ -168,5 +170,40 @@
for _type, _obj in trigger_resp.items():
self.__setattr__(_type, _obj)

-result = ConvertTo(data=trigger_resp, fmt=output_format, indexing=indexing).output
+result = ConvertTo(server_response=trigger_resp, output_format=output_format, indexing=indexing).output
return result

def save_output(self, output_folder: os.PathLike = "", output_format: str = "csv"):
"""
Save the objects of session data to user preferred location or a default folder
:param output_folder: user preferred output location; default tmp directory
:param output_format: needed only for tables; either CSV or XLSX
:return: location of the output
"""
input_fname = self.input_filename.rsplit('.', 1)[0]

output_format = output_format.lower()
if output_format not in ("csv", "xlsx"):
output_format = "csv"
warnings.warn("Invalid 'output_format' given. Defaulted to 'csv'")

table_outputs_path = ConvertTo(server_response=self.server_response, output_format=output_format).output

if output_folder:
if not os.path.exists(output_folder):
output_folder = os.path.split(table_outputs_path[0])[0]
warnings.warn(f"Your output_folder not exists. Saving the outputs to {output_folder}")
else:
for each_tbl_path in table_outputs_path:
os.replace(each_tbl_path, os.path.join(output_folder, input_fname+os.path.basename(each_tbl_path)))

else:
output_folder = os.path.split(table_outputs_path[0])[0]

for each_page in self.server_response.get("Lines", []):
page_txt_fname = os.path.join(output_folder, f"{input_fname}_Page_{str(each_page['Page'])}.txt")
page_txt = [each_line['Line'] for each_line in each_page['LinesArray']]
with open(page_txt_fname, "w", encoding="utf-8") as ofile:
ofile.write("\n".join(page_txt))

return output_folder
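A quick usage sketch of the new `save_output` flow; the API key and file names below are placeholders, not part of the library:

```python
from ExtractTable import ExtractTable

et_sess = ExtractTable(api_key="YOUR_API_KEY")                     # placeholder key
et_sess.process_file(filepath="invoice.pdf", output_format="df")  # placeholder file

# "extracts" must already exist; otherwise save_output warns and
# falls back to a temp directory. It returns the folder used either way.
saved_to = et_sess.save_output(output_folder="extracts", output_format="xlsx")
print(saved_to)
```

Tables are written with the input file name as a prefix, and any recognised text lines are saved as one `<input>_Page_<n>.txt` file per page.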
4 changes: 2 additions & 2 deletions ExtractTable/__version__.py
@@ -1,4 +1,4 @@
-VERSION = (2, 0, 2)
+VERSION = (2, 1, 0)
PRERELEASE = None # "alpha", "beta" or "rc"
REVISION = None

@@ -13,7 +13,7 @@ def generate_version():


__title__ = "ExtractTable"
__description__ = "Extract tabular data from images and scanned PDFs. Easily convert image to table, convert pdf to table"
__description__ = "Extract table data from images and scanned PDFs. Easily convert image to excel, convert pdf to table"
__url__ = "https://github.com/ExtractTable/ExtractTable-py"
__version__ = generate_version()
__author__ = "Saradhi"
213 changes: 201 additions & 12 deletions ExtractTable/common.py
@@ -2,40 +2,42 @@
Preprocess the output received from the server and interface it as the final result to the client
"""
import os
import re
import tempfile
import warnings
import collections
from statistics import mode
from typing import List

import pandas as pd


class ConvertTo:
"""Convert tabular JSON to an user requested output format"""
FORMATS = {"df", "dataframe", "json", "csv", "dict"}
FORMATS = {"df", "dataframe", "json", "csv", "dict", "xlsx", "excel"}
DEFAULT = "df"

-def __init__(self, data: dict, fmt: str = DEFAULT, indexing: bool = False):
+def __init__(self, server_response: dict, output_format: str = DEFAULT, indexing: bool = False, table_obj="TableJson"):
"""
-:param data: Tabular JSON data from server
-:param fmt: format to be converted into
+Convert the server response to a user-requested output format on Tables
+:param server_response: Tabular JSON data from server
+:param output_format: format to be converted into
:param indexing: row & column index consideration in the output
"""
-self.data = data
-self.output = self._converter(fmt.lower(), indexing=indexing)
+self.server_response = server_response
+self.output = self._converter(output_format.lower(), indexing=indexing, table_obj=table_obj)

-def _converter(self, fmt: str, indexing: bool = False) -> list:
+def _converter(self, fmt: str, indexing: bool = False, table_obj="TableJson") -> list:
"""
Actual conversion takes place here using Pandas
:param fmt: format to be converted into
:param indexing: row index consideration in the output
:return: list of tables converted into the requested output format
"""
dfs = []
-for table in self.data.get("Tables", []):
-tmp = {int(k): v for k, v in table["TableJson"].items()}
+for table in self.server_response.get("Tables", []):
+tmp = {int(k): v for k, v in table[table_obj].items()}
# To convert column indices to int to maintain the table order with more than 9 columns
-cols = [str(x) for x in sorted([int(x) for x in tmp[0]])]
+cols = [str(x) for x in sorted([int(x) for x in tmp[0]])] if tmp else None
# To convert row indices to int and maintain the table order with more than 9 rows
tmp = collections.OrderedDict(sorted(tmp.items()))
dfs.append(pd.DataFrame.from_dict(tmp, orient="index", columns=cols))
@@ -52,9 +54,196 @@ def _converter(self, fmt: str, indexing: bool = False) -> list:
df.to_csv(csv_name, index=indexing, header=indexing)
output_location.append(csv_name)
return output_location
elif fmt in ("xlsx", "excel"):
output_excel_location = os.path.join(tempfile.mkdtemp(), f"_tables_{len(dfs)}.xlsx")
if len(dfs) >= 10:
warnings.warn(f"There are {dfs} tables extracted. Consider to change the output_format to 'csv' instead")
with pd.ExcelWriter(output_excel_location) as writer:
for n, df in enumerate(dfs):
df.to_excel(writer, f'table_{n+1}')
writer.save()
return [output_excel_location]
elif fmt == "json":
return [df.to_json() for df in dfs]
else:
warn_msg = f"Supported output formats {self.FORMATS} only. Assigned to default: {self.DEFAULT}"
warnings.warn(warn_msg)
return dfs
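For reference, a minimal sketch of the reworked `ConvertTo` interface on a hand-made payload; the payload below is illustrative, not a real server reply:

```python
from ExtractTable.common import ConvertTo

# Illustrative 2x2 table in the server's TableJson layout:
# outer keys are row indices, inner keys are column indices
sample_response = {
    "Tables": [{
        "TableJson": {
            "0": {"0": "Item", "1": "Qty"},
            "1": {"0": "Apples", "1": "4"},
        }
    }]
}

dfs = ConvertTo(server_response=sample_response, output_format="df").output
print(dfs[0])
```

Passing `output_format="xlsx"` instead writes every table to one sheet of a single Excel file and returns its location in a one-element list.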


class MakeCorrections:
def __init__(self, et_resp: dict = None, dataframes: List[pd.DataFrame] = None):
"""
To apply post processing techniques on the output
:param et_resp: ExtractTable response
:param dataframes: user preferred dataframe(s).
Default assumes all dataframes from the extracttable response, `et_resp`.
If both `et_resp` and `dataframes` are provided, the latter is considered for the processing
"""
if et_resp:
self.dataframes = ConvertTo(server_response=et_resp).output

if not et_resp:
try:
self.dataframes = self.__isacceptable__(dataframes)
except ValueError:
raise ValueError("Either ExtractTable response or your preferred list of pandas dataframes is required")

@staticmethod
def __isacceptable__(dfs) -> List[pd.DataFrame]:
"""Validate the `dataframes` param"""
if type(dfs) is list:
if all([type(df) is pd.DataFrame for df in dfs]):
return dfs
elif type(dfs) is pd.DataFrame:
return [dfs]
raise ValueError("Dataframes should be list of dataframes or a dataframe")

def split_merged_rows(self) -> List[pd.DataFrame]:
"""
To split merged rows into multiple rows where possible
:return: reformatted list of dataframes
"""
for df_idx, each_df in enumerate(self.dataframes):
reformat = []
for row in each_df.to_numpy():
row = list(row)

# looks like line separator is " "
separators = [col.strip().count(" ") for col in row]
# Statistical mode to assume the number of rows merged
mode_ = mode(separators)

if mode_:
# split the merged rows inside the col
tmp = [col.strip().split(' ', mode_) for col in row]
for idx in range(len(tmp[0])):
tmp_ = []
for x in range(len(tmp)):
try:
val = tmp[x][idx]
except IndexError:
val = ""
tmp_.append(val)
reformat.append(tmp_)
else:
reformat.append(row)

self.dataframes[df_idx] = pd.DataFrame(reformat)

return self.dataframes
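A toy illustration of the intended behaviour, assuming the cells of two logical rows were merged with a space:

```python
import pandas as pd
from ExtractTable.common import MakeCorrections

# Row 0 holds two merged rows: every cell has one space, so the
# per-cell separator mode is 1 and the row is split in two
merged = pd.DataFrame([["Apples Oranges", "4 7"], ["Pears", "2"]])

fixed = MakeCorrections(dataframes=merged).split_merged_rows()
print(fixed[0])  # expected rows: Apples/4, Oranges/7, Pears/2
```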

def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool = False) -> List[pd.DataFrame]:
"""
To split merged columns into multiple columns where possible
:param columns_idx: user preferred columns indices.
Default loops through all columns to find numeric or decimal columns
:param force_split: to force the split on the given columns
:return: reformatted list of dataframes
"""
# TODO: Should we consider delimiter_pattern for the split?
for df_idx, df in enumerate(self.dataframes):
if not columns_idx:
columns_idx = df.columns

columns_idx = [str(x) for x in columns_idx]
reformat = []
for col_idx in columns_idx:
tmp = df[col_idx].str.split(expand=True)

if not any([not any(tmp.isna().any()), force_split]) or tmp.shape[-1] == 1:
reformat.append(df[col_idx].tolist())
# If user wanted force_split or the split columns have all cell values
# then proceed next
else:
reformat.extend([tmp[each].tolist() for each in tmp.columns])

self.dataframes[df_idx] = pd.DataFrame(reformat).T

return self.dataframes
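A matching sketch for the column case; note the string column label, since the method addresses columns as `str(index)`:

```python
import pandas as pd
from ExtractTable.common import MakeCorrections

# One OCR column that really holds two columns of data
df = pd.DataFrame({"0": ["Apples 4", "Pears 2"]})

fixed = MakeCorrections(dataframes=df).split_merged_columns()
print(fixed[0])  # expected: two columns after the whitespace split
```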

def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: str = ".", thousands_separator: str = ",", decimal_position: int = 2) -> List[pd.DataFrame]:
"""
To fix decimal and thousands separator values. Often commas are detected as periods
:param columns_idx: user preferred columns indices.
Default loops through all columns to find numeric or decimal columns
:param decimal_separator: preferred decimal separator
:param thousands_separator: preferred thousands separator
:param decimal_position: preferred decimal position
:return: corrected list of dataframes
"""
# TODO: Should we consider only bad confidence values?
reg_ = f"[{decimal_separator}{thousands_separator}]"
if decimal_position > 0:
thou_regex = reg_ + '(?=.*' + reg_ + ')'
else:
thou_regex = reg_
decimal_position = int(decimal_position)

for df_idx, df in enumerate(self.dataframes):
if not columns_idx:
columns_idx = df.columns
columns_idx = [str(x) for x in columns_idx]

for col_idx in columns_idx:
digits = df[col_idx].str.count(pat=r'\d').sum()
chars = df[col_idx].str.count(pat=r'[\w]').sum()

if not chars or digits/chars < 0.75:
# To infer a numeric or float column:
# skip unless at least 75% of the word characters are digits
continue

df[col_idx] = df[col_idx].str.strip()
df[col_idx].replace(regex={r'%s' % thou_regex: thousands_separator}, inplace=True)

# To correct decimal position
if not decimal_position > 0:
continue

for i, _ in enumerate(df[col_idx]):
if not len(df[col_idx][i]) > decimal_position:
# needs a length of at least decimal_position + 1
continue
elif df[col_idx][i][-(decimal_position+1)] == decimal_separator:
# nothing to do if decimal separator already in place
continue

# If the character at the decimal position is not alphanumeric
if re.search(r'\W+', df[col_idx][i][-(decimal_position+1)]):
digits = len(re.findall(r'\d', df[col_idx][i]))
if digits/len(df[col_idx][i]) >= 0.5:
df[col_idx][i] = df[col_idx][i][:-(decimal_position+1)] + decimal_separator + df[col_idx][i][-decimal_position:]

self.dataframes[df_idx] = df
return self.dataframes
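A small sketch with made-up values showing both corrections at once: a mis-read thousands separator and a mis-read decimal separator:

```python
import pandas as pd
from ExtractTable.common import MakeCorrections

# "1.234.56" needs its first '.' turned into a thousands ',';
# "9.876,54" additionally needs ',' fixed at the decimal position
df = pd.DataFrame({"0": ["1.234.56", "9.876,54"]})

fixed = MakeCorrections(dataframes=df).fix_decimal_format()
print(fixed[0])  # expected: "1,234.56" and "9,876.54"
```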

def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
"""
To fix date formats of the column
Eg: 12|12|2020 as 12/12/2020
:param columns_idx: user preferred columns indices.
Default loops through all columns to find Date Columns
:param delimiter: "/" or "-" or whatever else you prefer
:return: corrected list of dataframes
"""
date_regex = r'(\d{2}(\d{2})?)(\W)(\d{2}|[A-Za-z]{3,9})(\W)(\d{2}(\d{2})?)\b'
for df_idx, df in enumerate(self.dataframes):
if not columns_idx:
columns_idx = df.columns
columns_idx = [str(x) for x in columns_idx]

for col_idx in columns_idx:
dates = df[col_idx].str.count(pat=date_regex).sum()

if not (dates >= len(df) * 0.75):
# To infer a date column:
# skip unless at least 75% of the cells match the date pattern
continue

df[col_idx] = df[col_idx].str.strip()
df[col_idx].replace(regex={date_regex: r'\1%s\4%s\6' % (delimiter, delimiter)}, inplace=True)

self.dataframes[df_idx] = df

return self.dataframes
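And a sketch for the date fix, again with made-up cell values:

```python
import pandas as pd
from ExtractTable.common import MakeCorrections

# Both cells match the date pattern, so the column qualifies
# and the noisy delimiters are rewritten with '/'
df = pd.DataFrame({"0": ["12|12|2020", "01-02-2019"]})

fixed = MakeCorrections(dataframes=df).fix_date_format(delimiter="/")
print(fixed[0])  # expected: "12/12/2020" and "01/02/2019"
```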
30 changes: 27 additions & 3 deletions README.md
@@ -32,9 +32,33 @@ table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables, output_f
```

## Detailed Library Usage
-[example-code.ipynb](example-code.ipynb)

-<a href="https://colab.research.google.com/github/ExtractTable/ExtractTable-py/blob/master/example-code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
+The tutorial available at <a href="https://colab.research.google.com/github/ExtractTable/ExtractTable-py/blob/master/example-code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a> takes you through

```Markup
1. Installation
2. Import and check version
3. Create Session & Validate API Key
3.1 Create Session with your API Key
3.2 Validate the Key and check the plan usage
3.3 Check Usage Details
4. Trigger the extraction process
4.1 Accepted Input Types
4.2 Process an IMAGE Input
4.3 Process a PDF Input
4.4 Output options
4.5 Explore session objects
5. Explore the Output
5.1 Output Structure
5.2 Output Details
6. Make Corrections (see the sketch after this list)
6.1 Split Merged Rows
6.2 Split Merged Columns
6.3 Fix Decimal Format
6.4 Fix Date Format
7. Helpful Code Snippets
7.1 Get text data
7.2 Table output to Excel
```
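The post-processing steps of section 6 can also be driven straight from a session; a minimal sketch, where the API key and file path are placeholders:

```python
from ExtractTable import ExtractTable
from ExtractTable.common import MakeCorrections

et_sess = ExtractTable(api_key="YOUR_API_KEY")                   # placeholder key
et_sess.process_file(filepath="sample.pdf", output_format="df")  # placeholder file

# Post-process the tables held in the session's server response
corrected_dfs = MakeCorrections(et_resp=et_sess.server_response).split_merged_rows()
```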

### Woahh, as simple as that ?!

