From aab30609fc698c36c60f2d9723e3b5b33a2a4521 Mon Sep 17 00:00:00 2001
From: akshowhini <33936764+akshowhini@users.noreply.github.com>
Date: Wed, 26 Aug 2020 21:22:48 -0400
Subject: [PATCH] Post Processing Made Easy (#31)
* split_merged_rows functionality
* To fix decimal and thousands separator values
* split_merged_columns, fix_date_format functionalities
* validations added
* Easy Naming
* added `server_response` attribute to the session
* move unnecessary variable initialization
* Added Google Colab Contents
* Handle empty tables
* Save tables to multiple sheets of a single excel file
* standardized params naming
* Functionality to save Tables & Text output to local
* Version Update
* Updated Tutorial v2.1.0
---
ExtractTable/__init__.py | 49 +-
ExtractTable/__version__.py | 4 +-
ExtractTable/common.py | 213 +++++-
README.md | 30 +-
example-code.ipynb | 1296 +++++++++++++++++++++++------------
5 files changed, 1126 insertions(+), 466 deletions(-)
diff --git a/ExtractTable/__init__.py b/ExtractTable/__init__.py
index 61cd650..1a965b8 100644
--- a/ExtractTable/__init__.py
+++ b/ExtractTable/__init__.py
@@ -49,14 +49,14 @@ def _make_request(self, method, host: urlparse, params: dict = None, data: dict
"""
tmp = self.__dict__.copy()
for _type, _obj in tmp.items():
- if _type not in ("api_key", "_session"):
+ if _type not in ("api_key", "_session", "input_filename"):
self.__delattr__(_type)
host = host if not host.startswith("http") else host.split("/")[2]
url = urlparse.urlunparse(('https', host, '', '', '', ''))
self.ServerResponse = self._session.request(method, url, params=params, data=data, **kwargs)
ValidateResponse(resp=self.ServerResponse, show_warn=self._WARNINGS)
-
+ self.server_response = self.ServerResponse.json()
return self.ServerResponse.json()
def check_usage(self) -> dict:
@@ -150,11 +150,13 @@ def process_file(
"""
# Raise a warning if unknown format is requested
if output_format not in self._OUTPUT_FORMATS:
- default_format = "dict"
- warn_msg = f"Found: {output_format} as output_format; Allowed only {self._OUTPUT_FORMATS}. " \
- f"Assigned default format: {default_format}"
+ warn_msg = f"Found: '{output_format}' as output_format; Allowed formats are {self._OUTPUT_FORMATS}. " \
+ f"Assigned to default format: {self._DEFAULT}"
warnings.warn(warn_msg)
+ # To use the reference when saving the output
+ self.__setattr__('input_filename', os.path.basename(filepath))
+
try:
with PrepareInput(filepath, pages=pages) as infile:
with open(infile.filepath, 'rb') as fp:
@@ -168,5 +170,40 @@ def process_file(
for _type, _obj in trigger_resp.items():
self.__setattr__(_type, _obj)
- result = ConvertTo(data=trigger_resp, fmt=output_format, indexing=indexing).output
+ result = ConvertTo(server_response=trigger_resp, output_format=output_format, indexing=indexing).output
return result
+
+ def save_output(self, output_folder: os.PathLike = "", output_format: str = "csv"):
+ """
+        Save the session output (tables and text) to a user-preferred location or a default folder
+        :param output_folder: user preferred output location; defaults to a tmp directory
+        :param output_format: applies only to the table data; "csv" or "xlsx"
+ :return: location of the output
+ """
+        input_fname = self.input_filename.rsplit('.', 1)[0]
+
+ output_format = output_format.lower()
+ if output_format not in ("csv", "xlsx"):
+ output_format = "csv"
+ warnings.warn("Invalid 'output_format' given. Defaulted to 'csv'")
+
+ table_outputs_path = ConvertTo(server_response=self.server_response, output_format=output_format).output
+
+ if output_folder:
+ if not os.path.exists(output_folder):
+ output_folder = os.path.split(table_outputs_path[0])[0]
+ warnings.warn(f"Your output_folder not exists. Saving the outputs to {output_folder}")
+ else:
+ for each_tbl_path in table_outputs_path:
+ os.replace(each_tbl_path, os.path.join(output_folder, input_fname+os.path.basename(each_tbl_path)))
+
+ else:
+ output_folder = os.path.split(table_outputs_path[0])[0]
+
+ for each_page in self.server_response.get("Lines", []):
+ page_txt_fname = os.path.join(output_folder, f"{input_fname}_Page_{str(each_page['Page'])}.txt")
+ page_txt = [each_line['Line'] for each_line in each_page['LinesArray']]
+ with open(page_txt_fname, "w", encoding="utf-8") as ofile:
+ ofile.write("\n".join(page_txt))
+
+ return output_folder
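+
+    # Usage sketch (assumes it is called right after a successful `process_file`,
+    # since it reads the latest session data; the file name below is a placeholder):
+    #   et_sess.process_file(filepath="invoice.pdf", pages="all")
+    #   saved_dir = et_sess.save_output(output_folder="outputs", output_format="xlsx")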
diff --git a/ExtractTable/__version__.py b/ExtractTable/__version__.py
index ed6e20d..8fdbebe 100644
--- a/ExtractTable/__version__.py
+++ b/ExtractTable/__version__.py
@@ -1,4 +1,4 @@
-VERSION = (2, 0, 2)
+VERSION = (2, 1, 0)
PRERELEASE = None # "alpha", "beta" or "rc"
REVISION = None
@@ -13,7 +13,7 @@ def generate_version():
__title__ = "ExtractTable"
-__description__ = "Extract tabular data from images and scanned PDFs. Easily convert image to table, convert pdf to table"
+__description__ = "Extract table data from images and scanned PDFs. Easily convert image to excel, convert pdf to table"
__url__ = "https://github.com/ExtractTable/ExtractTable-py"
__version__ = generate_version()
__author__ = "Saradhi"
diff --git a/ExtractTable/common.py b/ExtractTable/common.py
index 0b1b32a..b5c0787 100644
--- a/ExtractTable/common.py
+++ b/ExtractTable/common.py
@@ -2,29 +2,31 @@
Preprocess the output received from server and interface as a final result to the client
"""
import os
+import re
import tempfile
import warnings
import collections
+from statistics import mode
+from typing import List
import pandas as pd
class ConvertTo:
- """Convert tabular JSON to an user requested output format"""
- FORMATS = {"df", "dataframe", "json", "csv", "dict"}
+ FORMATS = {"df", "dataframe", "json", "csv", "dict", "xlsx", "excel"}
DEFAULT = "df"
- def __init__(self, data: dict, fmt: str = DEFAULT, indexing: bool = False):
+ def __init__(self, server_response: dict, output_format: str = DEFAULT, indexing: bool = False, table_obj="TableJson"):
"""
-
- :param data: Tabular JSON data from server
- :param fmt: format to be converted into
+        Convert the tables in the server response to a user-requested output format
+        :param server_response: Tabular JSON data from server
+        :param output_format: format to be converted into
+        :param table_obj: key of the table object to convert in the server response; defaults to "TableJson"
:param indexing: row & column index consideration in the output
"""
- self.data = data
- self.output = self._converter(fmt.lower(), indexing=indexing)
+ self.server_response = server_response
+ self.output = self._converter(output_format.lower(), indexing=indexing, table_obj=table_obj)
- def _converter(self, fmt: str, indexing: bool = False) -> list:
+ def _converter(self, fmt: str, indexing: bool = False, table_obj="TableJson") -> list:
"""
Actual conversion takes place here using Pandas
:param fmt: format to be converted into
@@ -32,10 +34,10 @@ def _converter(self, fmt: str, indexing: bool = False) -> list:
:return: list of tables from converted into the requested output format
"""
dfs = []
- for table in self.data.get("Tables", []):
- tmp = {int(k): v for k, v in table["TableJson"].items()}
+ for table in self.server_response.get("Tables", []):
+ tmp = {int(k): v for k, v in table[table_obj].items()}
# To convert column indices to int to maintain the table order with more than 9 columns
- cols = [str(x) for x in sorted([int(x) for x in tmp[0]])]
+ cols = [str(x) for x in sorted([int(x) for x in tmp[0]])] if tmp else None
# To convert row indices to int and maintain the table order with more than 9 rows
tmp = collections.OrderedDict(sorted(tmp.items()))
dfs.append(pd.DataFrame.from_dict(tmp, orient="index", columns=cols))
@@ -52,9 +54,196 @@ def _converter(self, fmt: str, indexing: bool = False) -> list:
df.to_csv(csv_name, index=indexing, header=indexing)
output_location.append(csv_name)
return output_location
+ elif fmt in ("xlsx", "excel"):
+ output_excel_location = os.path.join(tempfile.mkdtemp(), f"_tables_{len(dfs)}.xlsx")
+ if len(dfs) >= 10:
+ warnings.warn(f"There are {dfs} tables extracted. Consider to change the output_format to 'csv' instead")
+ with pd.ExcelWriter(output_excel_location) as writer:
+ for n, df in enumerate(dfs):
+ df.to_excel(writer, f'table_{n+1}')
+                # the `with` context manager saves the workbook on exit; no explicit save needed
+ return [output_excel_location]
elif fmt == "json":
return [df.to_json() for df in dfs]
else:
warn_msg = f"Supported output formats {self.FORMATS} only. Assigned to default: {self.DEFAULT}"
warnings.warn(warn_msg)
return dfs
+
+
+class MakeCorrections:
+ def __init__(self, et_resp: dict = None, dataframes: List[pd.DataFrame] = None):
+ """
+ To apply post processing techniques on the output
+ :param et_resp: ExtractTable response
+ :param dataframes: user preferred dataframe(s).
+ Default assumes all dataframes from the extracttable response, `et_resp`.
+            If both `et_resp` and `dataframes` are provided, `et_resp` takes precedence
+ """
+ if et_resp:
+            self.dataframes = ConvertTo(server_response=et_resp).output
+
+ if not et_resp:
+ try:
+ self.dataframes = self.__isacceptable__(dataframes)
+ except ValueError:
+ raise ValueError("Either ExtractTable response or your preferred list of pandas dataframes is required")
+
+ @staticmethod
+ def __isacceptable__(dfs) -> List[pd.DataFrame]:
+ """Validate the `dataframes` param"""
+ if type(dfs) is list:
+ if all([type(df) is pd.DataFrame for df in dfs]):
+ return dfs
+ elif type(dfs) is pd.DataFrame:
+ return [dfs]
+ raise ValueError("Dataframes should be list of dataframes or a dataframe")
+
+ def split_merged_rows(self) -> List[pd.DataFrame]:
+ """
+ To split the merged rows into possible multiple rows
+ :return: reformatted list of dataframes
+ """
+ for df_idx, each_df in enumerate(self.dataframes):
+ reformat = []
+ for row in each_df.to_numpy():
+ row = list(row)
+
+                # assume merged rows are separated by " " within a cell
+                separators = [col.strip().count(" ") for col in row]
+                # the statistical mode of the space counts estimates how many rows were merged
+                mode_ = mode(separators)
+
+ if mode_:
+ # split the merged rows inside the col
+ tmp = [col.strip().split(' ', mode_) for col in row]
+ for idx in range(len(tmp[0])):
+ tmp_ = []
+ for x in range(len(tmp)):
+ try:
+ val = tmp[x][idx]
+ except IndexError:
+ val = ""
+ tmp_.append(val)
+ reformat.append(tmp_)
+ else:
+ reformat.append(row)
+
+ self.dataframes[df_idx] = pd.DataFrame(reformat)
+
+ return self.dataframes
+
+ def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool = False) -> List[pd.DataFrame]:
+ """
+ To split the merged columns into possible multiple columns
+ :param columns_idx: user preferred columns indices.
+ Default loops through all columns to find numeric or decimal columns
+ :param force_split: To force split through the columns
+ :return: reformatted list of dataframes
+ """
+ # TODO: Should we consider delimiter_pattern for the split?
+ for df_idx, df in enumerate(self.dataframes):
+ if not columns_idx:
+ columns_idx = df.columns
+
+ columns_idx = [str(x) for x in columns_idx]
+ reformat = []
+ for col_idx in columns_idx:
+ tmp = df[col_idx].str.split(expand=True)
+
+                # Keep the column as-is when the split leaves missing cells
+                # (unless force_split is set) or when it yields a single column
+                if not any([not any(tmp.isna().any()), force_split]) or tmp.shape[-1] == 1:
+                    reformat.append(df[col_idx].tolist())
+                else:
+                    # force_split was requested, or every cell split cleanly
+                    reformat.extend([tmp[each].tolist() for each in tmp.columns])
+
+ self.dataframes[df_idx] = pd.DataFrame(reformat).T
+
+ return self.dataframes
+
+ def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: str = ".", thousands_separator: str = ",", decimal_position: int = 2) -> List[pd.DataFrame]:
+ """
+        To fix decimal and thousands separator values. Commas are often misdetected as periods and vice versa
+ :param columns_idx: user preferred columns indices.
+ Default loops through all columns to find numeric or decimal columns
+ :param decimal_separator: preferred decimal separator
+ :param thousands_separator: preferred thousands separator
+ :param decimal_position: preferred decimal position
+ :return: corrected list of dataframes
+ """
+ # TODO: Should we consider only bad confidence values?
+ reg_ = f"[{decimal_separator}{thousands_separator}]"
+ if decimal_position > 0:
+ thou_regex = reg_ + '(?=.*' + reg_ + ')'
+ else:
+ thou_regex = reg_
+ decimal_position = int(decimal_position)
+
+ for df_idx, df in enumerate(self.dataframes):
+ if not columns_idx:
+ columns_idx = df.columns
+ columns_idx = [str(x) for x in columns_idx]
+
+ for col_idx in columns_idx:
+ digits = df[col_idx].str.count(pat=r'\d').sum()
+ chars = df[col_idx].str.count(pat=r'[\w]').sum()
+
+                if not chars or digits/chars < 0.75:
+                    # Skip columns that are not predominantly numeric
+                    # (fewer than 75% of the word characters are digits)
+                    continue
+
+ df[col_idx] = df[col_idx].str.strip()
+ df[col_idx].replace(regex={r'%s' % thou_regex: thousands_separator}, inplace=True)
+
+ # To correct decimal position
+ if not decimal_position > 0:
+ continue
+
+ for i, _ in enumerate(df[col_idx]):
+ if not len(df[col_idx][i]) > decimal_position:
+                        # value too short to hold a separator at the decimal position
+ continue
+ elif df[col_idx][i][-(decimal_position+1)] == decimal_separator:
+ # nothing to do if decimal separator already in place
+ continue
+
+                    # If the character at the decimal position is not alphanumeric
+ if re.search(r'\W+', df[col_idx][i][-(decimal_position+1)]):
+ digits = len(re.findall(r'\d', df[col_idx][i]))
+ if digits/len(df[col_idx][i]) >= 0.5:
+ df[col_idx][i] = df[col_idx][i][:-(decimal_position+1)] + decimal_separator + df[col_idx][i][-decimal_position:]
+
+ self.dataframes[df_idx] = df
+ return self.dataframes
+
+ def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
+ """
+ To fix date formats of the column
+        Eg: "12|12|2020" to "12/12/2020"
+ :param columns_idx: user preferred columns indices.
+ Default loops through all columns to find Date Columns
+        :param delimiter: "/" or "-" or whatever else you prefer
+        :return: corrected list of dataframes
+ """
+ date_regex = r'(\d{2}(\d{2})?)(\W)(\d{2}|[A-Za-z]{3,9})(\W)(\d{2}(\d{2})?)\b'
+ for df_idx, df in enumerate(self.dataframes):
+ if not columns_idx:
+ columns_idx = df.columns
+ columns_idx = [str(x) for x in columns_idx]
+
+ for col_idx in columns_idx:
+ dates = df[col_idx].str.count(pat=date_regex).sum()
+
+ if not (dates >= len(df) * 0.75):
+                    # Skip columns where fewer than 75% of the cells match a date pattern
+ continue
+
+ df[col_idx] = df[col_idx].str.strip()
+ df[col_idx].replace(regex={date_regex: r'\1%s\4%s\6' % (delimiter, delimiter)}, inplace=True)
+
+ self.dataframes[df_idx] = df
+
+ return self.dataframes
diff --git a/README.md b/README.md
index 72079cf..0bfd772 100644
--- a/README.md
+++ b/README.md
@@ -32,9 +32,33 @@ table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables, output_f
```
## Detailed Library Usage
- [example-code.ipynb](example-code.ipynb)
-
-
+The tutorial available at [example-code.ipynb](example-code.ipynb) takes you through
+
+```Markup
+1. Installation
+2. Import and check version
+3. Create Session & Validate API Key
+ 3.1 Create Session with your API Key
+ 3.2 Validate the Key and check the plan usage
+ 3.3 Check Usage Details
+4. Trigger the extraction process
+ 4.1 Accepted Input Types
+ 4.2 Process an IMAGE Input
+ 4.3 Process a PDF Input
+ 4.4 Output options
+ 4.5 Explore session objects
+ 4.6 Save Table & Text to LOCAL
+5. Explore the Output
+ 5.1 Output Structure
+ 5.2 Output Details
+6. Make Corrections
+ 6.1 Split Merged Rows
+ 6.2 Split Merged Columns
+ 6.3 Fix Decimal Format
+ 6.4 Fix Date Format
+7. Helpful Code Snippets
+ 7.1 Get text data
+ 7.2 Table output to Excel
+8. Support & Contact
+```
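+
+A minimal sketch of the new v2.1.0 helpers (the API key and file locations below are placeholders; `output_folder` should be an existing folder):
+
+```python
+from ExtractTable import ExtractTable
+from ExtractTable.common import MakeCorrections
+
+et_sess = ExtractTable("YOUR_API_KEY")
+table_data = et_sess.process_file(filepath="sample.pdf", pages="all", output_format="df")
+
+# Save tables (csv/xlsx) and page-wise text locally
+saved_to = et_sess.save_output(output_folder="outputs", output_format="xlsx")
+
+# Post-process the extracted tables
+fixed_dfs = MakeCorrections(et_resp=et_sess.server_response).fix_decimal_format()
+```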
### Woahh, as simple as that ?!
diff --git a/example-code.ipynb b/example-code.ipynb
index 3446264..d18a325 100644
--- a/example-code.ipynb
+++ b/example-code.ipynb
@@ -1,449 +1,859 @@
{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "colab": {
- "name": "ExtractTable - Advanced Code Usage.ipynb",
- "provenance": [],
- "collapsed_sections": []
- },
- "kernelspec": {
- "name": "python3",
- "display_name": "Python 3"
- }
- },
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "collapsed": false,
- "id": "BnYb9aztB48u",
- "colab_type": "text"
- },
- "source": [
- ""
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "NhVhMrQ0ZdQr",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "!pip install ExtractTable"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "2aIaghfeZnQr",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "from ExtractTable import ExtractTable"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "LJL_ZyYzZsFY",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "api_key = YOUR_APIKEY_HERE"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "bwtpzTJxZHRi",
- "colab_type": "text"
- },
- "source": [
- "**Create Session** with your API Key"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "Bfw5GTNvZGv8",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "et_sess = ExtractTable(api_key)"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "On4_X8v3Zk3v",
- "colab_type": "text"
- },
- "source": [
- "**Validate** the Key and check the plan usage"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "a7EPvvvMZ0Ub",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "usage = et_sess.check_usage()"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "sovuclERjRqy",
- "colab_type": "text"
- },
- "source": [
- "*If there is no error encountered in the above cell, it means we have a valid API key. Now, get started by checking the usage and trigger the file for processing*"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "HT97IP8MZ9WF",
- "colab_type": "code",
- "outputId": "b5dfbc96-5ce8-4461-c988-6b17e58a1448",
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
"colab": {
- "base_uri": "https://localhost:8080/",
- "height": 34
+ "name": "ExtractTable Usage -2.1.0",
+ "provenance": [],
+ "collapsed_sections": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "pycharm": {
+ "stem_cell": {
+ "cell_type": "raw",
+ "source": [],
+ "metadata": {
+ "collapsed": false
+ }
+ }
}
- },
- "source": [
- "print(usage)"
- ],
- "execution_count": 0,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "{'credits': 500, 'queued': 0, 'used': 132}\n"
- ],
- "name": "stdout"
- }
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "-XqbBoB-i3pi",
- "colab_type": "text"
- },
- "source": [
- "**credits**: Total number credits attached to the API Key\n",
- "\n",
- "**queued** : Number of triggered jobs that are still processing in the queue\n",
- "\n",
- "**used** : Number of credits already used "
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "P_xzVgHmZ9sw",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "# filepath = \"image_path_or_image_url_with_tables\"\n",
- "# filepath = r'samples/BlurryImage.jpg'\n",
- "filepath = \"https://raw.githubusercontent.com/ExtractTable/ExtractTable-py/master/samples/QualityImage.jpg\""
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "oUnBFxYiZ1Ka",
- "colab_type": "text"
- },
- "source": [
- "**Trigger** the process to extract tabular data from the file"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "3H9jzk6wJ5-V",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables)"
- ],
- "execution_count": 0,
- "outputs": []
},
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "k98KTihPJwyO",
- "colab_type": "text"
- },
- "source": [
- "Note: To process a PDF, use **pages** params in the read_pdf function, as shown below\n",
- "```python \n",
- "table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables, pages=\"all\")\n",
- "```\n",
- "Below are the sample values ```pages``` accepts\n",
- "\n",
- "* pages = \"2\" - considers only 2nd page of the PDF\n",
- "* pages = \"1,3,5\" - considers only 1st, 3rd and 5th page of the PDF\n",
- "* pages = \"1, 3-5\" - considers 1st, 3rd, 4th and 5th page of the PDF\n",
- "* pages = \"all\" - considers complete PDF"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Th12lbKfJhu9",
- "colab_type": "text"
- },
- "source": [
- "> By default, the `process_file()` returns **only** the table data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "HNZ8ieKNH5db",
- "colab_type": "text"
- },
- "source": [
- "> **Explore** all objects of the latest file processing with `et_sess.__dict__.keys()` - Depends on the plan type of your API Key"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "aDUaDyX8IGmK",
- "colab_type": "code",
- "outputId": "e22422ca-b27d-405f-c263-a92898c010ea",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 34
- }
- },
- "source": [
- "et_sess.__dict__.keys()"
- ],
- "execution_count": 0,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "dict_keys(['api_key', '_session', 'ServerResponse', 'JobStatus', 'Lines', 'Pages', 'Tables'])"
- ]
- },
- "metadata": {
- "tags": []
- },
- "execution_count": 14
- }
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "QYwnBXwQJ9D9",
- "colab_type": "code",
- "outputId": "5fee31e6-a38b-43a4-9297-8f1a920ad87e",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 68
- }
- },
- "source": [
- "# Access the class objects as you want\n",
- "print(\"Number of pages processed in this job:\", et_sess.Pages)\n",
- "print(\"Number of tables found in this job:\", len(et_sess.Tables))\n",
- "# print(\"Number of lines in the first page of this job:\", len(et_sess.Lines[0]['LineArray']))\n",
- "\n",
- "# et_sess.Tables\n",
- "# et_sess.Lines\n"
- ],
- "execution_count": 0,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "Number of pages processed in this job: 1\n",
- "Number of tables found in this job: 1\n",
- "Number of lines in the first page of this job: 42\n"
- ],
- "name": "stdout"
- }
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "KbIJ9kpqFxRu",
- "colab_type": "text"
- },
- "source": [
- "> **Understand the output**: The response of a triggered job is a JSON object in the below format. Note that the response depends on the plan type of the API Key.\n",
- "\n",
- "```javascript\n",
- "{\n",
- " \"JobStatus\": , # Status of the triggered Process @ JOB-LEVEL\n",
- " \"Pages\": , # Number of pages processed in this request @ PAGE-LEVEL\n",
- " \"Tables\": [ # List of all tables found @ TABLE-LEVEL\n",
- " {\n",
- " \"Page\": , ## Page number in which this table is found\n",
- " \"CharacterConfidence\": , ## Accuracy of Characters recognized from the input-page\n",
- " \"LayoutConfidence\": , ## Accuracy of table layout's design decision\n",
- " \"TableJson\": , ## Table Cell Text in key-value format with index orientation - {row#: {col#: }}\n",
- " \"TableCoordinates\": , ## Top-left & Bottom-right Cell Coordinates - {row#: {col#: }}\n",
- " \"TableConfidence\": ## Cell level accuracy of detected characters - {row#: {col#: }}\n",
- " },\n",
- " {...} ## ... more \"Tables\" objects\n",
- " ],\n",
- " \"Lines\": [ # Pagewise Line details @ PAGE-LEVEL\n",
- " {\n",
- " \"Page\": , # Page number in which the lines are found\n",
- " \"CharacterConfidence\": , # Average Accuracy of all Characters recognized from the input-page\n",
- " \"LinesArray\": [\n",
- " # Ordered list of lines in this page @ LINE-LEVEL\n",
- " {\n",
- " \"Line\": , ## Detected text of the complete line\n",
- " \"WordsArray\": [\n",
- " ## Word level datails in this line @ WORD-LEVEL\n",
- " {\n",
- " \"Conf\": , ### Accuracy of recognized characters of the word\n",
- " \"Word\": , ### Detected text of the word\n",
- " \"Loc\": [x1, y1, x2, y2] ### Top-left & Bottom-right coordinates, w.r.t the input-page width-height dimensions\n",
- " },\n",
- " {...} ### More \"WordsArray\" objects\n",
- " ]\n",
- " },\n",
- " {...} ## More \"LinesArray\" objects\n",
- " ]\n",
- " },\n",
- " {...} # More Pagewise \"Lines\" details\n",
- " ]\n",
- "}\n",
- "```"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "XrXBfENfZ2AI",
- "colab_type": "code",
- "outputId": "6c12b493-b774-4687-f44e-1f1731f7ce43",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 170
- }
- },
- "source": [
- "table_data # Notice the default output is a pandas dataframe"
- ],
- "execution_count": 0,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "[ 0 1 ... 5 6\n",
- " 0 FLC Code Room Name ... W (m) Ceiling Height (m)\n",
- " 1 RGOOTO1 Indigenous Support Officer ... 7.3 2.7\n",
- " 2 RGOOTO2 Instrum. Music Room ... 7.3 2.7\n",
- " 3 RGOTO1A Verandah ... 1.7 3.0\n",
- " 4 RGOTO1B Eastern Stairs ... 1.7 N/A\n",
- " 5 RGOTO2B Western Stairs ... 1.0 N/A\n",
- " \n",
- " [6 rows x 7 columns]]"
- ]
- },
- "metadata": {
- "tags": []
- },
- "execution_count": 22
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false,
+ "id": "BnYb9aztB48u",
+ "colab_type": "text"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "935SwBV4Z-CH",
+ "colab_type": "text"
+ },
+ "source": [
+ "# 1. Installation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "NhVhMrQ0ZdQr",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "!pip install -U ExtractTable"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "jfsHIegraT2l",
+ "colab_type": "text"
+ },
+ "source": [
+ "# 2. Import and check version"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "2aIaghfeZnQr",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "from ExtractTable import ExtractTable"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "YLLCa6qQaaZu",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "print(ExtractTable.VERSION)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "VYERq5s9aiNy",
+ "colab_type": "text"
+ },
+ "source": [
+ "# 3. Create Session & Validate API Key\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "bwtpzTJxZHRi",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 3.1 **Create Session** with your API Key"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "LJL_ZyYzZsFY",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "api_key = YOUR_APIKEY_HERE"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "Bfw5GTNvZGv8",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "et_sess = ExtractTable(api_key)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "b5fQB7dGxLKf",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# et_sess.__dict__"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "On4_X8v3Zk3v",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 3.2 **Validate** the Key and check the plan usage"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "a7EPvvvMZ0Ub",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "usage = et_sess.check_usage()"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "sovuclERjRqy",
+ "colab_type": "text"
+ },
+ "source": [
+ "*If there is no error encountered in the above cell, it means we have a valid API key. Now, lets get started by checking the usage and trigger the file for processing*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "GJdjlTPKxcXF",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# et_sess.server_response"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "HT97IP8MZ9WF",
+ "colab_type": "code",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 34
+ },
+ "outputId": "bc872e57-c5cb-4db0-a034-56f03f275d4b"
+ },
+ "source": [
+ "print(usage)"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "{'credits': 100, 'queued': 0, 'used': 49}\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-XqbBoB-i3pi",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 3.3 Check Usage Details\n",
+ "\n",
+ "**credits**: Total number credits attached to the API Key\n",
+ "\n",
+ "**queued** : Number of triggered jobs that are still processing in the queue\n",
+ "\n",
+ "**used** : Number of credits already used "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ARRJaIgFcYoe",
+ "colab_type": "text"
+ },
+ "source": [
+ "# 4. Trigger the extraction process\n",
+ "\n",
+ "> Note: We will use the session, `et_sess`, created earlier in step 3.1, to save the session data and retrieve when needed"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6GpN9ho1chi6",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 4.1 Accepted Input Types\n",
+ "\n",
+ "**Allowed input formats** are:\n",
+ "- Image\n",
+ " - JPG/JPEG\n",
+ " - PNG\n",
+ "- PDF\n",
+ " - Text PDF\n",
+ " - Scan PDF\n",
+ " - Image PDF\n",
+ "\n",
+ "\n",
+ "**Input Location Options**\n",
+ "- Location can be a file from the local drive\n",
+ "- Accessible remote URL - *the file object will be locally downloaded and deleted once sent to the process*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "P_xzVgHmZ9sw",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# image_location = \"local_image_path_OR_remote_image_url_with_tables\"\n",
+ "# image_location = r'samples/BlurryImage.jpg'\n",
+ "image_location = \"https://raw.githubusercontent.com/ExtractTable/ExtractTable-py/master/samples/QualityImage.jpg\""
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "oUnBFxYiZ1Ka",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 4.2 Process an IMAGE Input\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "3H9jzk6wJ5-V",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "table_data = et_sess.process_file(filepath=image_location, output_format=\"df\")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "AoKuQBVQy3LN",
+ "colab_type": "code",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 170
+ },
+ "outputId": "032898c5-0a6b-41b7-a9fc-4107defe056f"
+ },
+ "source": [
+ "table_data"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "[ 0 1 ... 5 6\n",
+ " 0 FLC Code Room Name ... W (m) Ceiling Height (m)\n",
+ " 1 RGOOTO1 Indigenous Support Officer ... 7.3 2.7\n",
+ " 2 RGOOTO2 Instrum. Music Room ... 7.3 2.7\n",
+ " 3 RGOTO1A Verandah ... 1.7 3.0\n",
+ " 4 RGOTO1B Eastern Stairs ... 1.7 N/A\n",
+ " 5 RGOTO2B Western Stairs ... 1.0 N/A\n",
+ " \n",
+ " [6 rows x 7 columns]]"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 19
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "9uiDCtGpfTwF",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 4.3 Process a PDF Input"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "ehxmaPgthoCC",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# pdf_location = \"local_image_path_OR_remote_image_url_with_tables\"\n",
+ "# pdf_location = r'samples/BlurryImage.jpg'\n",
+ "pdf_location = \"https://raw.githubusercontent.com/ExtractTable/ExtractTable-py/master/samples/QualityImage.jpg\""
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "qdU1Au3LhiuD",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables, pages=\"all\", output_format=\"df\")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "k98KTihPJwyO",
+ "colab_type": "text"
+ },
+ "source": [
+ "Below are the sample values ```pages``` accepts **string** type\n",
+ "\n",
+ "\n",
+ "\n",
+ "| pages \t| Explanation \t|\n",
+ "|----------\t|-------------------------------------------------\t|\n",
+ "| \"1\" \t| [Default] considers only 1st page of the PDF \t|\n",
+ "| \"1,3,5\" \t| considers only 1st, 3rd and 5th page of the PDF \t|\n",
+ "| \"1, 3-5\" \t| considers 1st, 3rd, 4th and 5th page of the PDF \t|\n",
+ "| \"all\" \t| considers complete PDF \t|"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "e-KPdSNQBeR-",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 4.4 Table Output options\n",
+ "\n",
+ "> By default, the `process_file()` returns **only** the table data. Output depends on the `output_format` , explained below\n",
+ "\n",
+ "Explore the available options with `ExtractTable._OUTPUT_FORMATS`\n",
+ "\n",
+ "| output_format \t| Explanation \t|\n",
+ "|---------------\t|--------------------------------------------\t|\n",
+ "| \"df\" \t| [Default] Array of Pandas dataframes \t|\n",
+ "| \"dataframe\" \t| same as \"df\"; Array of Pandas dataframes \t|\n",
+ "| \"json\" \t| JSON data with index orientation \t|\n",
+ "| \"dict\" \t| Similar to JSON data but python dictionary \t|\n",
+ "| \"csv\" \t| Array of locally saved CSV file locations \t|\n",
+ "| \"xlsx\" \t| To save multiple tables as sheets into a single excel\t|\n",
+ "| \"excel\" | same as \"xlsx\"; output is an array of excel location\t|\n",
+ "\n",
+ "\n",
+ "Default output is an array of pandas dataframes, with which you can change to any other format like excel, html etc. Follow https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html\n"
+ ]
+ },
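+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "# A quick sketch of a non-default output_format (assumes `image_location` from 4.1):\n",
+        "# \"csv\" returns a list of locally saved CSV file paths instead of dataframes\n",
+        "csv_paths = et_sess.process_file(filepath=image_location, output_format=\"csv\")\n",
+        "print(csv_paths)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },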
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "nQ1as9mkCOyu",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 4.5 Explore session objects\n",
+ "\n",
+ "> **Explore** all objects of the latest file processing with `et_sess.__dict__.\n",
+ "keys()` - Depends on the plan type of your API Key"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "aDUaDyX8IGmK",
+ "colab_type": "code",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 34
+ },
+ "outputId": "b89cd4d2-72a1-4196-ad6e-2a05cc6f0448"
+ },
+ "source": [
+ "et_sess.__dict__.keys()"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "dict_keys(['api_key', '_session', 'ServerResponse', 'JobStatus', 'Lines', 'Pages', 'Tables'])"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 20
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "E0Z6RJa9DFlT",
+ "colab_type": "text"
+ },
+ "source": [
+ "Based on the API Key PLAN type, the et_sess contains below objects\n",
+ "\n",
+ "| Object \t| Explanation \t|\n",
+ "|-----------------\t|----------------------------------------------------------------------------------------\t|\n",
+ "| api_key \t| Your API Key \t|\n",
+ "| _session \t| Session data of the **latest** performed request/action \t|\n",
+ "| input_filename \t| Name of the processed input file |\n",
+ "| ServerResponse \t| Complete ServerResponse, along with response code and headers \t|\n",
+ "| server_response \t| complete server response content; equivalent to `ServerResponse.json()` \t|\n",
+ "| JobStatus \t| Job Status of the triggered process \t|\n",
+ "| Pages \t| Number of pages in the input; also number of credits consumed on the triggered process \t|\n",
+ "| Tables \t| Tabular Data in JSON format with index orientation; ordered table wise \t|\n",
+ "| Lines \t| Text Data in JSON format, ordered page wise \t|"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "MytCD36ja6KM",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 4.6 Save Table & Text to LOCAL\n",
+ "\n",
+ "```python\n",
+ "et_sess.save_output(output_folder, output_format=\"csv\")\n",
+ "```\n",
+ "`output_format` param is relavant only for the table data, with options \"csv\" or \"xlsx\"\n",
+ "\n",
+ "\n",
+ "> Note: As the `et_sess` contains the latest action performed, make sure this call is right after the `process_file()`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-CgU2PCJFQNr",
+ "colab_type": "text"
+ },
+ "source": [
+ "# 5. Explore the Output"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "JI-CLS9UF0iS",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 5.1 Output Structure\n",
+ "\n",
+ "> **Understand the output**: The response of a triggered job is a JSON object in the below format. \n",
+ "\n",
+ "Note that the response depends on the plan type of the API Key.\n",
+ "\n",
+ "\n",
+ "```javascript\n",
+ "{\n",
+ " \"JobStatus\": , # Status of the triggered Process @ JOB-LEVEL\n",
+ " \"Pages\": , # Number of pages processed in this request @ PAGE-LEVEL\n",
+ " \"Tables\": [ # List of all tables found @ TABLE-LEVEL\n",
+ " {\n",
+ " \"Page\": , ## Page number in which this table is found\n",
+ " \"CharacterConfidence\": , ## Accuracy of Characters recognized from the input-page\n",
+ " \"LayoutConfidence\": , ## Accuracy of table layout's design decision\n",
+ " \"TableJson\": , ## Table Cell Text in key-value format with index orientation - {row#: {col#: }}\n",
+ " \"TableCoordinates\": , ## Top-left & Bottom-right Cell Coordinates - {row#: {col#: }}\n",
+ " \"TableConfidence\": ## Cell level accuracy of detected characters - {row#: {col#: }}\n",
+ " },\n",
+ " {...} ## ... more \"Tables\" objects\n",
+ " ],\n",
+ " \"Lines\": [ # Pagewise Line details @ PAGE-LEVEL\n",
+ " {\n",
+ " \"Page\": , # Page number in which the lines are found\n",
+ " \"CharacterConfidence\": , # Average Accuracy of all Characters recognized from the input-page\n",
+ " \"LinesArray\": [\n",
+ " # Ordered list of lines in this page @ LINE-LEVEL\n",
+ " {\n",
+ " \"Line\": , ## Detected text of the complete line\n",
+ " \"WordsArray\": [\n",
+ " ## Word level datails in this line @ WORD-LEVEL\n",
+ " {\n",
+ " \"Conf\": , ### Accuracy of recognized characters of the word\n",
+ " \"Word\": , ### Detected text of the word\n",
+ " \"Loc\": [x1, y1, x2, y2] ### Top-left & Bottom-right coordinates, w.r.t the input-page width-height dimensions\n",
+ " },\n",
+ " {...} ### More \"WordsArray\" objects\n",
+ " ]\n",
+ " },\n",
+ " {...} ## More \"LinesArray\" objects\n",
+ " ]\n",
+ " },\n",
+ " {...} # More Pagewise \"Lines\" details\n",
+ " ]\n",
+ "}\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "BPY9KziZF6jR",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 5.2 Output Details\n",
+ "\n",
+ "Output objects are based on the API Key Plan type. Available plan types are \n",
+ "\n",
+ "**Purchased Plans**\n",
+ "* \"LITE\" - **only table data** in the output\n",
+ "* \"FULL\" - **table and text data** in the output\n",
+ "* \"EXTRA\" - **table, text data along with cell & word coordintates and character detection accuracy**\n",
+ "\n",
+ "**Promotional Plans**: Any plan other than Purchased plans are promotional\n",
+ "* \"free_trial\", \"camelotpro\" - these are promotional API Keys, gives only table data equivalent to \"LITE\" plan type\n",
+ "\n",
+ "\n",
+        "\n",
+ "Output objects detail below\n",
+ "\n",
+ "\n",
+ "\n",
+ "| Key Name \t| Parent \t| Type \t| Description \t| Availability \t|\n",
+ "|-\t|-\t|-\t|-\t|-\t|\n",
+ "| JobStatus \t| Job \t| String \t| Status of the triggered process \t| ALL Plans \t|\n",
+ "| Pages \t| Job \t| Integer \t| Number of pages processed in the request \t| ALL Plans \t|\n",
+ "| Tables \t| Job \t| Array \t| List of all tables found \t| ALL Plans \t|\n",
+ "| Tables[0].Page \t| Table \t| Integer \t| Page number in which the table is found \t| ALL Plans \t|\n",
+ "| Tables[0].CharacterConfidence \t| Table \t| Decimal \t| Accuracy of Characters recognized from the image \t| ALL Plans \t|\n",
+ "| Tables[0].LayoutConfidence \t| Table \t| Decimal \t| Accuracy of table layout's design decision \t| ALL Plans \t|\n",
+ "| Tables[0].TableJson \t| Table \t| Json/dict \t| Table Cell Text in key-value format with index orientation - {row#: {col#: }} \t| ALL Plans \t|\n",
+ "| Tables[0].TableCoordinates \t| Table \t| Json/dict \t| Top-left & Bottom-right Cell Coordinates - {row#: {col#: }} \t| EXTRA Plan \t|\n",
+ "| Tables[0].TableConfidence \t| Table \t| Json/dict \t| Cell level accuracy of detected characters - {row#: {col#: }} \t| EXTRA Plan \t|\n",
+ "| Lines \t| Job \t| Array \t| List of page-wise lines text \t| FULL, EXTRA\t|\n",
+ "| Lines[0].Page \t| Page \t| Integer \t| Page number in which the lines are found \t| Full Plan \t|\n",
+ "| Lines[0].CharacterConfidence \t| Page \t| Decimal \t| Average Accuracy of all Characters recognized from the input-page \t| Full Plan \t|\n",
+ "| Lines[0].LineArray \t| Page \t| Array \t| Ordered list of lines of the page \t| \t|\n",
+ "| Lines[0].LineArray[0].Line \t| Line \t| String \t| Detected text of the complete line \t| Full Plan \t|\n",
+ "| Lines[0].LineArray[0].WordsArray \t| Line \t| Array \t| Word level datails in this line \t| EXTRA Plan \t|\n",
+ "| Lines[0].LineArray[0].WordsArray[0].Conf \t| Word \t| Decimal \t| Accuracy of recognized characters of the word \t| EXTRA Plan \t|\n",
+ "| Lines[0].LineArray[0].WordsArray[0].Word \t| Word \t| String \t| Detected text of the word \t| EXTRA Plan \t|\n",
+ "| Lines[0].LineArray[0].WordsArray[0].Loc \t| Word \t| Array \t| Top-left & Bottom-right coordinates, w.r.t the input-page width-height dimensions \t| EXTRA Plan \t|"
+ ]
+ },
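+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "# A sketch to walk the structure above (assumes a completed job in `et_sess`;\n",
+        "# `Lines` is available only on FULL/EXTRA plans):\n",
+        "for tbl in et_sess.Tables:\n",
+        "    print(\"Page:\", tbl['Page'], \"| CharacterConfidence:\", tbl['CharacterConfidence'])\n",
+        "    print(\"First row:\", tbl['TableJson'].get('0', {}))"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },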
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4ysCj8_GSrd8",
+ "colab_type": "text"
+ },
+ "source": [
+ "# 6. Make Corrections\n",
+ "\n",
+ "> **Objective**: To ease corrections on the most common issues with the `MakeCorrections` module.\n",
+ "\n",
+ "**Details:** The service relies on OCR (Optical Character Recognition) for character detection and deep learning models to detect tabular structures on the input. There may be a chance for merged rows or columns or incorrect type detections on low-quality inputs with a complex table layout or tightly packed columns. With those in mind, we want to offer the built-in service at the client-side to give control and ease in making corrections on the output. \n",
+ "\n",
+ "\n",
+ "The module, `MakeCorrections`, currently supports below functionalities\n",
+ "\n",
+ "| Functionality \t| Explanation \t|\n",
+ "|----------------------\t|------------------------------------------------\t|\n",
+ "| Split Merged Rows \t| Works well on cell values with no spaces \t|\n",
+ "| Split Merged Columns \t| Works well on cell values with no spaces \t|\n",
+ "| Fix Decimal Format \t| To fix thousand and decimal separators \t|\n",
+ "| Fix Date Format \t| To handle and modify incorrect date separators \t|"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "MpH284nxX2KJ",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# First things first lets import the module and prepare for corrections\n",
+ "\n",
+ "from ExtractTable.common import MakeCorrections\n",
+ "\n",
+ "corrections = MakeCorrections(et_resp=et_sess.server_response)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "PKRntUqYXIEQ",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 6.1 Split Merged Rows"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "_MmSR0mLXS6x",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "corrected_table_dataframes = corrections.split_merged_rows()"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "34YLZL3nXIU6",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 6.2 Split Merged Columns\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "hzinneCWXVZ8",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "corrected_table_dataframes = corrections.split_merged_columns()"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "3XSBMo5rXIkC",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 6.3 Fix Decimal Format\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "hK9nWOSfXXUc",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "corrected_table_dataframes = corrections.fix_decimal_format(decimal_separator=\".\", thousands_separator=\",\")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "mok2QbmFXIz9",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 6.4 Fix Date Format\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "UFQNzPxUSqga",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "corrected_table_dataframes = corrections.fix_date_format(delimiter=\"/\")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "I5-CGT3oy7KG",
+ "colab_type": "text"
+ },
+ "source": [
+ "# 7. Helpful Code Snippets\n",
+ "\n",
+ "Extra code snippets that are useful to perform some actions on the output. Based on the frequently asked questions."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ACcH7oUpMfFp",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 7.1 Get text data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "_CVAYfnK_sTk",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# If your API Key supports \"Lines\"\n",
+ "\n",
+ "all_page_lines = []\n",
+ "for each_page in et_sess.Lines:\n",
+ " for each_line in each_page['LinesArray']:\n",
+ " all_page_lines.append(each_line['Line'])\n",
+ " \n",
+ "print(\"\\n\".join(all_page_lines))"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6c0vnCiXM_FM",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 7.2 All tables output to a single excel"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "VhOTglS-NIXN",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables, output_format=\"df\", pages=\"all\")\n",
+ " \n",
+ "import pandas as pd\n",
+ "accumulate_all_dfs = pd.DataFrame()\n",
+ "\n",
+ "for each_df in table_data:\n",
+ " accumulate_all_dfs = accumulate_all_dfs.append(each_df, ignore_index=True)\n",
+ " # print(each_df.shape, accumulate_all_dfs.shape)\n",
+ "\n",
+ "print(\"Shape of all tables accumulated together is\", accumulate_all_dfs.shape)\n",
+ "\n",
+ "\n",
+ "output_excel_location = \n",
+ "# Save the accumulated output to a single excel file\n",
+ "accumulate_all_dfs.to_excel(output_excel_location, index=False, header=False)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ba8vIKuXdAPT",
+ "colab_type": "text"
+ },
+ "source": [
+ "# 8. Support & Contact\n",
+ "\n",
+ "Please do not hesitate to approach our developer team at pydevs@extracttable.com for any assitance needed or to report a bug"
+ ]
}
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "ie9D1umMEv6D",
- "colab_type": "text"
- },
- "source": [
- "Default output is an array of pandas dataframes, with which you can change to any other format, follow https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "_CVAYfnK_sTk",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "# If your API Key supports \"Lines\" - Sample to get Lines\n",
- "\n",
- "all_page_lines = []\n",
- "for each_page in et_sess.Lines:\n",
- " for each_line in each_page['LinesArray']:\n",
- " all_page_lines.append(each_line['Line'])"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "TYLfNBQ6bL64",
- "colab_type": "text"
- },
- "source": [
- "Play with the result:\n",
- "- check the complete server response of the latest job with `et_sess.ServerResponse.json()`\n",
- "- check out list of available output formats of table `ExtractTable._OUTPUT_FORMATS`\n",
- "- Retrieve the result as long as the `JobId` is unexpired, usually stays for 24 hours\n",
- " - ```job_output = et_sess.get_result(job_id=JobID_HERE)```"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "collapsed": false,
- "id": "yRsqFlIvB4-D",
- "colab_type": "text"
- },
- "source": [
- "## Social Media\n",
- "Follow us on Social media for library updates and free credits.\n",
- "\n",
- "[![Image](https://cdn3.iconfinder.com/data/icons/socialnetworking/32/linkedin.png)](https://www.linkedin.com/company/extracttable)\n",
- " \n",
- "[![Image](https://abs.twimg.com/favicons/twitter.ico)](https://twitter.com/extracttable)"
- ]
- }
- ]
+ ]
}
\ No newline at end of file