From aab30609fc698c36c60f2d9723e3b5b33a2a4521 Mon Sep 17 00:00:00 2001
From: akshowhini <33936764+akshowhini@users.noreply.github.com>
Date: Wed, 26 Aug 2020 21:22:48 -0400
Subject: [PATCH] Post Processing Made Easy (#31)
* split_merged_rows functionality
* To fix decimal and thousands separator values
* split_merged_columns, fix_date_format functionalities
* validations added
* Easy Naming
* added `server_response` attribute to the session
* move unnecessary variable initialization
* Added Google Colab Contents
* Handle empty tables
* Save tables to multiple sheets of a single excel file
* standardized params naming
* Functionality to save Tables & Text output to local
* Version Update
* Updated Tutorial v2.1.0
---
ExtractTable/__init__.py | 49 +-
ExtractTable/__version__.py | 4 +-
ExtractTable/common.py | 213 +++++-
README.md | 30 +-
example-code.ipynb | 1296 +++++++++++++++++++++++------------
5 files changed, 1126 insertions(+), 466 deletions(-)
diff --git a/ExtractTable/__init__.py b/ExtractTable/__init__.py
index 61cd650..1a965b8 100644
--- a/ExtractTable/__init__.py
+++ b/ExtractTable/__init__.py
@@ -49,14 +49,14 @@ def _make_request(self, method, host: urlparse, params: dict = None, data: dict
"""
tmp = self.__dict__.copy()
for _type, _obj in tmp.items():
- if _type not in ("api_key", "_session"):
+ if _type not in ("api_key", "_session", "input_filename"):
self.__delattr__(_type)
host = host if not host.startswith("http") else host.split("/")[2]
url = urlparse.urlunparse(('https', host, '', '', '', ''))
self.ServerResponse = self._session.request(method, url, params=params, data=data, **kwargs)
ValidateResponse(resp=self.ServerResponse, show_warn=self._WARNINGS)
-
+ self.server_response = self.ServerResponse.json()
return self.ServerResponse.json()
def check_usage(self) -> dict:
@@ -150,11 +150,13 @@ def process_file(
"""
# Raise a warning if unknown format is requested
if output_format not in self._OUTPUT_FORMATS:
- default_format = "dict"
- warn_msg = f"Found: {output_format} as output_format; Allowed only {self._OUTPUT_FORMATS}. " \
- f"Assigned default format: {default_format}"
+ warn_msg = f"Found: '{output_format}' as output_format; Allowed formats are {self._OUTPUT_FORMATS}. " \
+ f"Assigned to default format: {self._DEFAULT}"
warnings.warn(warn_msg)
+ # To use the reference when saving the output
+ self.__setattr__('input_filename', os.path.basename(filepath))
+
try:
with PrepareInput(filepath, pages=pages) as infile:
with open(infile.filepath, 'rb') as fp:
@@ -168,5 +170,40 @@ def process_file(
for _type, _obj in trigger_resp.items():
self.__setattr__(_type, _obj)
- result = ConvertTo(data=trigger_resp, fmt=output_format, indexing=indexing).output
+ result = ConvertTo(server_response=trigger_resp, output_format=output_format, indexing=indexing).output
return result
+
+ def save_output(self, output_folder: os.PathLike = "", output_format: str = "csv"):
+ """
+        Save the session output (tables and text) to a user-preferred location or a default folder
+        :param output_folder: user preferred output location; defaults to a tmp directory
+        :param output_format: applies only to the table data; "csv" or "xlsx"
+ :return: location of the output
+ """
+        input_fname = self.input_filename.rsplit('.', 1)[0]
+
+ output_format = output_format.lower()
+ if output_format not in ("csv", "xlsx"):
+ output_format = "csv"
+ warnings.warn("Invalid 'output_format' given. Defaulted to 'csv'")
+
+ table_outputs_path = ConvertTo(server_response=self.server_response, output_format=output_format).output
+
+ if output_folder:
+ if not os.path.exists(output_folder):
+ output_folder = os.path.split(table_outputs_path[0])[0]
+ warnings.warn(f"Your output_folder not exists. Saving the outputs to {output_folder}")
+ else:
+ for each_tbl_path in table_outputs_path:
+ os.replace(each_tbl_path, os.path.join(output_folder, input_fname+os.path.basename(each_tbl_path)))
+
+ else:
+ output_folder = os.path.split(table_outputs_path[0])[0]
+
+ for each_page in self.server_response.get("Lines", []):
+ page_txt_fname = os.path.join(output_folder, f"{input_fname}_Page_{str(each_page['Page'])}.txt")
+ page_txt = [each_line['Line'] for each_line in each_page['LinesArray']]
+ with open(page_txt_fname, "w", encoding="utf-8") as ofile:
+ ofile.write("\n".join(page_txt))
+
+ return output_folder
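+
+    # Usage sketch (assumes it is called right after a successful `process_file`,
+    # since it reads the latest session data; the file name below is a placeholder):
+    #   et_sess.process_file(filepath="invoice.pdf", pages="all")
+    #   saved_dir = et_sess.save_output(output_folder="outputs", output_format="xlsx")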
diff --git a/ExtractTable/__version__.py b/ExtractTable/__version__.py
index ed6e20d..8fdbebe 100644
--- a/ExtractTable/__version__.py
+++ b/ExtractTable/__version__.py
@@ -1,4 +1,4 @@
-VERSION = (2, 0, 2)
+VERSION = (2, 1, 0)
PRERELEASE = None # "alpha", "beta" or "rc"
REVISION = None
@@ -13,7 +13,7 @@ def generate_version():
__title__ = "ExtractTable"
-__description__ = "Extract tabular data from images and scanned PDFs. Easily convert image to table, convert pdf to table"
+__description__ = "Extract table data from images and scanned PDFs. Easily convert image to excel, convert pdf to table"
__url__ = "https://github.com/ExtractTable/ExtractTable-py"
__version__ = generate_version()
__author__ = "Saradhi"
diff --git a/ExtractTable/common.py b/ExtractTable/common.py
index 0b1b32a..b5c0787 100644
--- a/ExtractTable/common.py
+++ b/ExtractTable/common.py
@@ -2,29 +2,31 @@
Preprocess the output received from server and interface as a final result to the client
"""
import os
+import re
import tempfile
import warnings
import collections
+from statistics import mode
+from typing import List
import pandas as pd
class ConvertTo:
- """Convert tabular JSON to an user requested output format"""
- FORMATS = {"df", "dataframe", "json", "csv", "dict"}
+ FORMATS = {"df", "dataframe", "json", "csv", "dict", "xlsx", "excel"}
DEFAULT = "df"
- def __init__(self, data: dict, fmt: str = DEFAULT, indexing: bool = False):
+ def __init__(self, server_response: dict, output_format: str = DEFAULT, indexing: bool = False, table_obj="TableJson"):
"""
-
- :param data: Tabular JSON data from server
- :param fmt: format to be converted into
+        Convert the tables in the server response to a user-requested output format
+        :param server_response: Tabular JSON data from server
+        :param output_format: format to be converted into
+        :param table_obj: key of the table object to convert in the server response; defaults to "TableJson"
:param indexing: row & column index consideration in the output
"""
- self.data = data
- self.output = self._converter(fmt.lower(), indexing=indexing)
+ self.server_response = server_response
+ self.output = self._converter(output_format.lower(), indexing=indexing, table_obj=table_obj)
- def _converter(self, fmt: str, indexing: bool = False) -> list:
+ def _converter(self, fmt: str, indexing: bool = False, table_obj="TableJson") -> list:
"""
Actual conversion takes place here using Pandas
:param fmt: format to be converted into
@@ -32,10 +34,10 @@ def _converter(self, fmt: str, indexing: bool = False) -> list:
:return: list of tables from converted into the requested output format
"""
dfs = []
- for table in self.data.get("Tables", []):
- tmp = {int(k): v for k, v in table["TableJson"].items()}
+ for table in self.server_response.get("Tables", []):
+ tmp = {int(k): v for k, v in table[table_obj].items()}
# To convert column indices to int to maintain the table order with more than 9 columns
- cols = [str(x) for x in sorted([int(x) for x in tmp[0]])]
+ cols = [str(x) for x in sorted([int(x) for x in tmp[0]])] if tmp else None
# To convert row indices to int and maintain the table order with more than 9 rows
tmp = collections.OrderedDict(sorted(tmp.items()))
dfs.append(pd.DataFrame.from_dict(tmp, orient="index", columns=cols))
@@ -52,9 +54,196 @@ def _converter(self, fmt: str, indexing: bool = False) -> list:
df.to_csv(csv_name, index=indexing, header=indexing)
output_location.append(csv_name)
return output_location
+ elif fmt in ("xlsx", "excel"):
+ output_excel_location = os.path.join(tempfile.mkdtemp(), f"_tables_{len(dfs)}.xlsx")
+ if len(dfs) >= 10:
+ warnings.warn(f"There are {dfs} tables extracted. Consider to change the output_format to 'csv' instead")
+ with pd.ExcelWriter(output_excel_location) as writer:
+ for n, df in enumerate(dfs):
+ df.to_excel(writer, f'table_{n+1}')
+                # the `with` context manager saves the workbook on exit; no explicit save needed
+ return [output_excel_location]
elif fmt == "json":
return [df.to_json() for df in dfs]
else:
warn_msg = f"Supported output formats {self.FORMATS} only. Assigned to default: {self.DEFAULT}"
warnings.warn(warn_msg)
return dfs
+
+
+class MakeCorrections:
+ def __init__(self, et_resp: dict = None, dataframes: List[pd.DataFrame] = None):
+ """
+ To apply post processing techniques on the output
+ :param et_resp: ExtractTable response
+ :param dataframes: user preferred dataframe(s).
+ Default assumes all dataframes from the extracttable response, `et_resp`.
+            If both `et_resp` and `dataframes` are provided, `et_resp` takes precedence
+ """
+ if et_resp:
+            self.dataframes = ConvertTo(server_response=et_resp).output
+
+ if not et_resp:
+ try:
+ self.dataframes = self.__isacceptable__(dataframes)
+ except ValueError:
+ raise ValueError("Either ExtractTable response or your preferred list of pandas dataframes is required")
+
+ @staticmethod
+ def __isacceptable__(dfs) -> List[pd.DataFrame]:
+ """Validate the `dataframes` param"""
+ if type(dfs) is list:
+ if all([type(df) is pd.DataFrame for df in dfs]):
+ return dfs
+ elif type(dfs) is pd.DataFrame:
+ return [dfs]
+ raise ValueError("Dataframes should be list of dataframes or a dataframe")
+
+ def split_merged_rows(self) -> List[pd.DataFrame]:
+ """
+ To split the merged rows into possible multiple rows
+ :return: reformatted list of dataframes
+ """
+ for df_idx, each_df in enumerate(self.dataframes):
+ reformat = []
+ for row in each_df.to_numpy():
+ row = list(row)
+
+                # assume merged rows are separated by " " within a cell
+                separators = [col.strip().count(" ") for col in row]
+                # the statistical mode of the space counts estimates how many rows were merged
+                mode_ = mode(separators)
+
+ if mode_:
+ # split the merged rows inside the col
+ tmp = [col.strip().split(' ', mode_) for col in row]
+ for idx in range(len(tmp[0])):
+ tmp_ = []
+ for x in range(len(tmp)):
+ try:
+ val = tmp[x][idx]
+ except IndexError:
+ val = ""
+ tmp_.append(val)
+ reformat.append(tmp_)
+ else:
+ reformat.append(row)
+
+ self.dataframes[df_idx] = pd.DataFrame(reformat)
+
+ return self.dataframes
+
+ def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool = False) -> List[pd.DataFrame]:
+ """
+ To split the merged columns into possible multiple columns
+ :param columns_idx: user preferred columns indices.
+ Default loops through all columns to find numeric or decimal columns
+ :param force_split: To force split through the columns
+ :return: reformatted list of dataframes
+ """
+ # TODO: Should we consider delimiter_pattern for the split?
+ for df_idx, df in enumerate(self.dataframes):
+ if not columns_idx:
+ columns_idx = df.columns
+
+ columns_idx = [str(x) for x in columns_idx]
+ reformat = []
+ for col_idx in columns_idx:
+ tmp = df[col_idx].str.split(expand=True)
+
+                # Keep the column as-is when the split leaves missing cells
+                # (unless force_split is set) or when it yields a single column
+                if not any([not any(tmp.isna().any()), force_split]) or tmp.shape[-1] == 1:
+                    reformat.append(df[col_idx].tolist())
+                else:
+                    # force_split was requested, or every cell split cleanly
+                    reformat.extend([tmp[each].tolist() for each in tmp.columns])
+
+ self.dataframes[df_idx] = pd.DataFrame(reformat).T
+
+ return self.dataframes
+
+ def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: str = ".", thousands_separator: str = ",", decimal_position: int = 2) -> List[pd.DataFrame]:
+ """
+        To fix decimal and thousands separator values. Commas are often misdetected as periods and vice versa
+ :param columns_idx: user preferred columns indices.
+ Default loops through all columns to find numeric or decimal columns
+ :param decimal_separator: preferred decimal separator
+ :param thousands_separator: preferred thousands separator
+ :param decimal_position: preferred decimal position
+ :return: corrected list of dataframes
+ """
+ # TODO: Should we consider only bad confidence values?
+ reg_ = f"[{decimal_separator}{thousands_separator}]"
+ if decimal_position > 0:
+ thou_regex = reg_ + '(?=.*' + reg_ + ')'
+ else:
+ thou_regex = reg_
+ decimal_position = int(decimal_position)
+
+ for df_idx, df in enumerate(self.dataframes):
+ if not columns_idx:
+ columns_idx = df.columns
+ columns_idx = [str(x) for x in columns_idx]
+
+ for col_idx in columns_idx:
+ digits = df[col_idx].str.count(pat=r'\d').sum()
+ chars = df[col_idx].str.count(pat=r'[\w]').sum()
+
+                if not chars or digits/chars < 0.75:
+                    # Skip columns that are not predominantly numeric
+                    # (fewer than 75% of the word characters are digits)
+                    continue
+
+ df[col_idx] = df[col_idx].str.strip()
+ df[col_idx].replace(regex={r'%s' % thou_regex: thousands_separator}, inplace=True)
+
+ # To correct decimal position
+ if not decimal_position > 0:
+ continue
+
+ for i, _ in enumerate(df[col_idx]):
+ if not len(df[col_idx][i]) > decimal_position:
+                        # value too short to hold a separator at the decimal position
+ continue
+ elif df[col_idx][i][-(decimal_position+1)] == decimal_separator:
+ # nothing to do if decimal separator already in place
+ continue
+
+                    # If the character at the decimal position is not alphanumeric
+ if re.search(r'\W+', df[col_idx][i][-(decimal_position+1)]):
+ digits = len(re.findall(r'\d', df[col_idx][i]))
+ if digits/len(df[col_idx][i]) >= 0.5:
+ df[col_idx][i] = df[col_idx][i][:-(decimal_position+1)] + decimal_separator + df[col_idx][i][-decimal_position:]
+
+ self.dataframes[df_idx] = df
+ return self.dataframes
+
+ def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
+ """
+ To fix date formats of the column
+        Eg: "12|12|2020" to "12/12/2020"
+ :param columns_idx: user preferred columns indices.
+ Default loops through all columns to find Date Columns
+        :param delimiter: "/" or "-" or whatever else you prefer
+        :return: corrected list of dataframes
+ """
+ date_regex = r'(\d{2}(\d{2})?)(\W)(\d{2}|[A-Za-z]{3,9})(\W)(\d{2}(\d{2})?)\b'
+ for df_idx, df in enumerate(self.dataframes):
+ if not columns_idx:
+ columns_idx = df.columns
+ columns_idx = [str(x) for x in columns_idx]
+
+ for col_idx in columns_idx:
+ dates = df[col_idx].str.count(pat=date_regex).sum()
+
+ if not (dates >= len(df) * 0.75):
+                    # Skip columns where fewer than 75% of the cells match a date pattern
+ continue
+
+ df[col_idx] = df[col_idx].str.strip()
+ df[col_idx].replace(regex={date_regex: r'\1%s\4%s\6' % (delimiter, delimiter)}, inplace=True)
+
+ self.dataframes[df_idx] = df
+
+ return self.dataframes
diff --git a/README.md b/README.md
index 72079cf..0bfd772 100644
--- a/README.md
+++ b/README.md
@@ -32,9 +32,33 @@ table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables, output_f
```
## Detailed Library Usage
- [example-code.ipynb](example-code.ipynb)
-
-
+The tutorial available at [example-code.ipynb](example-code.ipynb) takes you through
+
+```Markup
+1. Installation
+2. Import and check version
+3. Create Session & Validate API Key
+ 3.1 Create Session with your API Key
+ 3.2 Validate the Key and check the plan usage
+ 3.3 Check Usage Details
+4. Trigger the extraction process
+ 4.1 Accepted Input Types
+ 4.2 Process an IMAGE Input
+ 4.3 Process a PDF Input
+ 4.4 Output options
+ 4.5 Explore session objects
+ 4.6 Save Table & Text to LOCAL
+5. Explore the Output
+ 5.1 Output Structure
+ 5.2 Output Details
+6. Make Corrections
+ 6.1 Split Merged Rows
+ 6.2 Split Merged Columns
+ 6.3 Fix Decimal Format
+ 6.4 Fix Date Format
+7. Helpful Code Snippets
+ 7.1 Get text data
+ 7.2 Table output to Excel
+8. Support & Contact
+```
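+
+A minimal sketch of the new v2.1.0 helpers (the API key and file locations below are placeholders; `output_folder` should be an existing folder):
+
+```python
+from ExtractTable import ExtractTable
+from ExtractTable.common import MakeCorrections
+
+et_sess = ExtractTable("YOUR_API_KEY")
+table_data = et_sess.process_file(filepath="sample.pdf", pages="all", output_format="df")
+
+# Save tables (csv/xlsx) and page-wise text locally
+saved_to = et_sess.save_output(output_folder="outputs", output_format="xlsx")
+
+# Post-process the extracted tables
+fixed_dfs = MakeCorrections(et_resp=et_sess.server_response).fix_decimal_format()
+```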
### Woahh, as simple as that ?!
diff --git a/example-code.ipynb b/example-code.ipynb
index 3446264..d18a325 100644
--- a/example-code.ipynb
+++ b/example-code.ipynb
@@ -1,449 +1,859 @@
{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "colab": {
- "name": "ExtractTable - Advanced Code Usage.ipynb",
- "provenance": [],
- "collapsed_sections": []
- },
- "kernelspec": {
- "name": "python3",
- "display_name": "Python 3"
- }
- },
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "collapsed": false,
- "id": "BnYb9aztB48u",
- "colab_type": "text"
- },
- "source": [
- ""
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "NhVhMrQ0ZdQr",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "!pip install ExtractTable"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "2aIaghfeZnQr",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "from ExtractTable import ExtractTable"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "LJL_ZyYzZsFY",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "api_key = YOUR_APIKEY_HERE"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "bwtpzTJxZHRi",
- "colab_type": "text"
- },
- "source": [
- "**Create Session** with your API Key"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "Bfw5GTNvZGv8",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "et_sess = ExtractTable(api_key)"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "On4_X8v3Zk3v",
- "colab_type": "text"
- },
- "source": [
- "**Validate** the Key and check the plan usage"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "a7EPvvvMZ0Ub",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "usage = et_sess.check_usage()"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "sovuclERjRqy",
- "colab_type": "text"
- },
- "source": [
- "*If there is no error encountered in the above cell, it means we have a valid API key. Now, get started by checking the usage and trigger the file for processing*"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "HT97IP8MZ9WF",
- "colab_type": "code",
- "outputId": "b5dfbc96-5ce8-4461-c988-6b17e58a1448",
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
"colab": {
- "base_uri": "https://localhost:8080/",
- "height": 34
+ "name": "ExtractTable Usage -2.1.0",
+ "provenance": [],
+ "collapsed_sections": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "pycharm": {
+ "stem_cell": {
+ "cell_type": "raw",
+ "source": [],
+ "metadata": {
+ "collapsed": false
+ }
+ }
}
- },
- "source": [
- "print(usage)"
- ],
- "execution_count": 0,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "{'credits': 500, 'queued': 0, 'used': 132}\n"
- ],
- "name": "stdout"
- }
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "-XqbBoB-i3pi",
- "colab_type": "text"
- },
- "source": [
- "**credits**: Total number credits attached to the API Key\n",
- "\n",
- "**queued** : Number of triggered jobs that are still processing in the queue\n",
- "\n",
- "**used** : Number of credits already used "
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "P_xzVgHmZ9sw",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "# filepath = \"image_path_or_image_url_with_tables\"\n",
- "# filepath = r'samples/BlurryImage.jpg'\n",
- "filepath = \"https://raw.githubusercontent.com/ExtractTable/ExtractTable-py/master/samples/QualityImage.jpg\""
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "oUnBFxYiZ1Ka",
- "colab_type": "text"
- },
- "source": [
- "**Trigger** the process to extract tabular data from the file"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "3H9jzk6wJ5-V",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables)"
- ],
- "execution_count": 0,
- "outputs": []
},
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "k98KTihPJwyO",
- "colab_type": "text"
- },
- "source": [
- "Note: To process a PDF, use **pages** params in the read_pdf function, as shown below\n",
- "```python \n",
- "table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables, pages=\"all\")\n",
- "```\n",
- "Below are the sample values ```pages``` accepts\n",
- "\n",
- "* pages = \"2\" - considers only 2nd page of the PDF\n",
- "* pages = \"1,3,5\" - considers only 1st, 3rd and 5th page of the PDF\n",
- "* pages = \"1, 3-5\" - considers 1st, 3rd, 4th and 5th page of the PDF\n",
- "* pages = \"all\" - considers complete PDF"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Th12lbKfJhu9",
- "colab_type": "text"
- },
- "source": [
- "> By default, the `process_file()` returns **only** the table data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "HNZ8ieKNH5db",
- "colab_type": "text"
- },
- "source": [
- "> **Explore** all objects of the latest file processing with `et_sess.__dict__.keys()` - Depends on the plan type of your API Key"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "aDUaDyX8IGmK",
- "colab_type": "code",
- "outputId": "e22422ca-b27d-405f-c263-a92898c010ea",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 34
- }
- },
- "source": [
- "et_sess.__dict__.keys()"
- ],
- "execution_count": 0,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "dict_keys(['api_key', '_session', 'ServerResponse', 'JobStatus', 'Lines', 'Pages', 'Tables'])"
- ]
- },
- "metadata": {
- "tags": []
- },
- "execution_count": 14
- }
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "QYwnBXwQJ9D9",
- "colab_type": "code",
- "outputId": "5fee31e6-a38b-43a4-9297-8f1a920ad87e",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 68
- }
- },
- "source": [
- "# Access the class objects as you want\n",
- "print(\"Number of pages processed in this job:\", et_sess.Pages)\n",
- "print(\"Number of tables found in this job:\", len(et_sess.Tables))\n",
- "# print(\"Number of lines in the first page of this job:\", len(et_sess.Lines[0]['LineArray']))\n",
- "\n",
- "# et_sess.Tables\n",
- "# et_sess.Lines\n"
- ],
- "execution_count": 0,
- "outputs": [
- {
- "output_type": "stream",
- "text": [
- "Number of pages processed in this job: 1\n",
- "Number of tables found in this job: 1\n",
- "Number of lines in the first page of this job: 42\n"
- ],
- "name": "stdout"
- }
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "KbIJ9kpqFxRu",
- "colab_type": "text"
- },
- "source": [
- "> **Understand the output**: The response of a triggered job is a JSON object in the below format. Note that the response depends on the plan type of the API Key.\n",
- "\n",
- "```javascript\n",
- "{\n",
- " \"JobStatus\": , # Status of the triggered Process @ JOB-LEVEL\n",
- " \"Pages\": , # Number of pages processed in this request @ PAGE-LEVEL\n",
- " \"Tables\": [ # List of all tables found @ TABLE-LEVEL\n",
- " {\n",
- " \"Page\": , ## Page number in which this table is found\n",
- " \"CharacterConfidence\": , ## Accuracy of Characters recognized from the input-page\n",
- " \"LayoutConfidence\": , ## Accuracy of table layout's design decision\n",
- " \"TableJson\": , ## Table Cell Text in key-value format with index orientation - {row#: {col#: }}\n",
- " \"TableCoordinates\": , ## Top-left & Bottom-right Cell Coordinates - {row#: {col#: }}\n",
- " \"TableConfidence\": ## Cell level accuracy of detected characters - {row#: {col#: }}\n",
- " },\n",
- " {...} ## ... more \"Tables\" objects\n",
- " ],\n",
- " \"Lines\": [ # Pagewise Line details @ PAGE-LEVEL\n",
- " {\n",
- " \"Page\": , # Page number in which the lines are found\n",
- " \"CharacterConfidence\": , # Average Accuracy of all Characters recognized from the input-page\n",
- " \"LinesArray\": [\n",
- " # Ordered list of lines in this page @ LINE-LEVEL\n",
- " {\n",
- " \"Line\": , ## Detected text of the complete line\n",
- " \"WordsArray\": [\n",
- " ## Word level datails in this line @ WORD-LEVEL\n",
- " {\n",
- " \"Conf\": , ### Accuracy of recognized characters of the word\n",
- " \"Word\": , ### Detected text of the word\n",
- " \"Loc\": [x1, y1, x2, y2] ### Top-left & Bottom-right coordinates, w.r.t the input-page width-height dimensions\n",
- " },\n",
- " {...} ### More \"WordsArray\" objects\n",
- " ]\n",
- " },\n",
- " {...} ## More \"LinesArray\" objects\n",
- " ]\n",
- " },\n",
- " {...} # More Pagewise \"Lines\" details\n",
- " ]\n",
- "}\n",
- "```"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "XrXBfENfZ2AI",
- "colab_type": "code",
- "outputId": "6c12b493-b774-4687-f44e-1f1731f7ce43",
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 170
- }
- },
- "source": [
- "table_data # Notice the default output is a pandas dataframe"
- ],
- "execution_count": 0,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "[ 0 1 ... 5 6\n",
- " 0 FLC Code Room Name ... W (m) Ceiling Height (m)\n",
- " 1 RGOOTO1 Indigenous Support Officer ... 7.3 2.7\n",
- " 2 RGOOTO2 Instrum. Music Room ... 7.3 2.7\n",
- " 3 RGOTO1A Verandah ... 1.7 3.0\n",
- " 4 RGOTO1B Eastern Stairs ... 1.7 N/A\n",
- " 5 RGOTO2B Western Stairs ... 1.0 N/A\n",
- " \n",
- " [6 rows x 7 columns]]"
- ]
- },
- "metadata": {
- "tags": []
- },
- "execution_count": 22
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "collapsed": false,
+ "id": "BnYb9aztB48u",
+ "colab_type": "text"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "935SwBV4Z-CH",
+ "colab_type": "text"
+ },
+ "source": [
+ "# 1. Installation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "NhVhMrQ0ZdQr",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "!pip install -U ExtractTable"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "jfsHIegraT2l",
+ "colab_type": "text"
+ },
+ "source": [
+ "# 2. Import and check version"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "2aIaghfeZnQr",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "from ExtractTable import ExtractTable"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "YLLCa6qQaaZu",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "print(ExtractTable.VERSION)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "VYERq5s9aiNy",
+ "colab_type": "text"
+ },
+ "source": [
+ "# 3. Create Session & Validate API Key\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "bwtpzTJxZHRi",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 3.1 **Create Session** with your API Key"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "LJL_ZyYzZsFY",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "api_key = YOUR_APIKEY_HERE"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "Bfw5GTNvZGv8",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "et_sess = ExtractTable(api_key)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "b5fQB7dGxLKf",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# et_sess.__dict__"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "On4_X8v3Zk3v",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 3.2 **Validate** the Key and check the plan usage"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "a7EPvvvMZ0Ub",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "usage = et_sess.check_usage()"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "sovuclERjRqy",
+ "colab_type": "text"
+ },
+ "source": [
+ "*If there is no error encountered in the above cell, it means we have a valid API key. Now, lets get started by checking the usage and trigger the file for processing*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "GJdjlTPKxcXF",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# et_sess.server_response"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "HT97IP8MZ9WF",
+ "colab_type": "code",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 34
+ },
+ "outputId": "bc872e57-c5cb-4db0-a034-56f03f275d4b"
+ },
+ "source": [
+ "print(usage)"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "{'credits': 100, 'queued': 0, 'used': 49}\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-XqbBoB-i3pi",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 3.3 Check Usage Details\n",
+ "\n",
+ "**credits**: Total number credits attached to the API Key\n",
+ "\n",
+ "**queued** : Number of triggered jobs that are still processing in the queue\n",
+ "\n",
+ "**used** : Number of credits already used "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ARRJaIgFcYoe",
+ "colab_type": "text"
+ },
+ "source": [
+ "# 4. Trigger the extraction process\n",
+ "\n",
+ "> Note: We will use the session, `et_sess`, created earlier in step 3.1, to save the session data and retrieve when needed"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6GpN9ho1chi6",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 4.1 Accepted Input Types\n",
+ "\n",
+ "**Allowed input formats** are:\n",
+ "- Image\n",
+ " - JPG/JPEG\n",
+ " - PNG\n",
+ "- PDF\n",
+ " - Text PDF\n",
+ " - Scan PDF\n",
+ " - Image PDF\n",
+ "\n",
+ "\n",
+ "**Input Location Options**\n",
+ "- Location can be a file from the local drive\n",
+ "- Accessible remote URL - *the file object will be locally downloaded and deleted once sent to the process*"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "P_xzVgHmZ9sw",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# image_location = \"local_image_path_OR_remote_image_url_with_tables\"\n",
+ "# image_location = r'samples/BlurryImage.jpg'\n",
+ "image_location = \"https://raw.githubusercontent.com/ExtractTable/ExtractTable-py/master/samples/QualityImage.jpg\""
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "oUnBFxYiZ1Ka",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 4.2 Process an IMAGE Input\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "3H9jzk6wJ5-V",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "table_data = et_sess.process_file(filepath=image_location, output_format=\"df\")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "AoKuQBVQy3LN",
+ "colab_type": "code",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 170
+ },
+ "outputId": "032898c5-0a6b-41b7-a9fc-4107defe056f"
+ },
+ "source": [
+ "table_data"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "[ 0 1 ... 5 6\n",
+ " 0 FLC Code Room Name ... W (m) Ceiling Height (m)\n",
+ " 1 RGOOTO1 Indigenous Support Officer ... 7.3 2.7\n",
+ " 2 RGOOTO2 Instrum. Music Room ... 7.3 2.7\n",
+ " 3 RGOTO1A Verandah ... 1.7 3.0\n",
+ " 4 RGOTO1B Eastern Stairs ... 1.7 N/A\n",
+ " 5 RGOTO2B Western Stairs ... 1.0 N/A\n",
+ " \n",
+ " [6 rows x 7 columns]]"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 19
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "9uiDCtGpfTwF",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 4.3 Process a PDF Input"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "ehxmaPgthoCC",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# pdf_location = \"local_image_path_OR_remote_image_url_with_tables\"\n",
+ "# pdf_location = r'samples/BlurryImage.jpg'\n",
+ "pdf_location = \"https://raw.githubusercontent.com/ExtractTable/ExtractTable-py/master/samples/QualityImage.jpg\""
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "qdU1Au3LhiuD",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables, pages=\"all\", output_format=\"df\")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "k98KTihPJwyO",
+ "colab_type": "text"
+ },
+ "source": [
+ "Below are the sample values ```pages``` accepts **string** type\n",
+ "\n",
+ "\n",
+ "\n",
+ "| pages \t| Explanation \t|\n",
+ "|----------\t|-------------------------------------------------\t|\n",
+ "| \"1\" \t| [Default] considers only 1st page of the PDF \t|\n",
+ "| \"1,3,5\" \t| considers only 1st, 3rd and 5th page of the PDF \t|\n",
+ "| \"1, 3-5\" \t| considers 1st, 3rd, 4th and 5th page of the PDF \t|\n",
+ "| \"all\" \t| considers complete PDF \t|"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "e-KPdSNQBeR-",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 4.4 Table Output options\n",
+ "\n",
+ "> By default, the `process_file()` returns **only** the table data. Output depends on the `output_format` , explained below\n",
+ "\n",
+ "Explore the available options with `ExtractTable._OUTPUT_FORMATS`\n",
+ "\n",
+ "| output_format \t| Explanation \t|\n",
+ "|---------------\t|--------------------------------------------\t|\n",
+ "| \"df\" \t| [Default] Array of Pandas dataframes \t|\n",
+ "| \"dataframe\" \t| same as \"df\"; Array of Pandas dataframes \t|\n",
+ "| \"json\" \t| JSON data with index orientation \t|\n",
+ "| \"dict\" \t| Similar to JSON data but python dictionary \t|\n",
+ "| \"csv\" \t| Array of locally saved CSV file locations \t|\n",
+ "| \"xlsx\" \t| To save multiple tables as sheets into a single excel\t|\n",
+ "| \"excel\" | same as \"xlsx\"; output is an array of excel location\t|\n",
+ "\n",
+ "\n",
+ "Default output is an array of pandas dataframes, with which you can change to any other format like excel, html etc. Follow https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html\n"
+ ]
+ },
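+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "# A quick sketch of a non-default output_format (assumes `image_location` from 4.1):\n",
+        "# \"csv\" returns a list of locally saved CSV file paths instead of dataframes\n",
+        "csv_paths = et_sess.process_file(filepath=image_location, output_format=\"csv\")\n",
+        "print(csv_paths)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },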
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "nQ1as9mkCOyu",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 4.5 Explore session objects\n",
+ "\n",
+ "> **Explore** all objects of the latest file processing with `et_sess.__dict__.\n",
+ "keys()` - Depends on the plan type of your API Key"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "aDUaDyX8IGmK",
+ "colab_type": "code",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 34
+ },
+ "outputId": "b89cd4d2-72a1-4196-ad6e-2a05cc6f0448"
+ },
+ "source": [
+ "et_sess.__dict__.keys()"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "dict_keys(['api_key', '_session', 'ServerResponse', 'JobStatus', 'Lines', 'Pages', 'Tables'])"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 20
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "E0Z6RJa9DFlT",
+ "colab_type": "text"
+ },
+ "source": [
+ "Based on the API Key PLAN type, the et_sess contains below objects\n",
+ "\n",
+ "| Object \t| Explanation \t|\n",
+ "|-----------------\t|----------------------------------------------------------------------------------------\t|\n",
+ "| api_key \t| Your API Key \t|\n",
+ "| _session \t| Session data of the **latest** performed request/action \t|\n",
+ "| input_filename \t| Name of the processed input file |\n",
+ "| ServerResponse \t| Complete ServerResponse, along with response code and headers \t|\n",
+ "| server_response \t| complete server response content; equivalent to `ServerResponse.json()` \t|\n",
+ "| JobStatus \t| Job Status of the triggered process \t|\n",
+ "| Pages \t| Number of pages in the input; also number of credits consumed on the triggered process \t|\n",
+ "| Tables \t| Tabular Data in JSON format with index orientation; ordered table wise \t|\n",
+ "| Lines \t| Text Data in JSON format, ordered page wise \t|"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "MytCD36ja6KM",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 4.6 Save Table & Text to LOCAL\n",
+ "\n",
+ "```python\n",
+ "et_sess.save_output(output_folder, output_format=\"csv\")\n",
+ "```\n",
+ "`output_format` param is relavant only for the table data, with options \"csv\" or \"xlsx\"\n",
+ "\n",
+ "\n",
+ "> Note: As the `et_sess` contains the latest action performed, make sure this call is right after the `process_file()`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-CgU2PCJFQNr",
+ "colab_type": "text"
+ },
+ "source": [
+ "# 5. Explore the Output"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "JI-CLS9UF0iS",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 5.1 Output Structure\n",
+ "\n",
+ "> **Understand the output**: The response of a triggered job is a JSON object in the below format. \n",
+ "\n",
+ "Note that the response depends on the plan type of the API Key.\n",
+ "\n",
+ "\n",
+ "```javascript\n",
+ "{\n",
+ " \"JobStatus\": , # Status of the triggered Process @ JOB-LEVEL\n",
+ " \"Pages\": , # Number of pages processed in this request @ PAGE-LEVEL\n",
+ " \"Tables\": [ # List of all tables found @ TABLE-LEVEL\n",
+ " {\n",
+ " \"Page\": , ## Page number in which this table is found\n",
+ " \"CharacterConfidence\": , ## Accuracy of Characters recognized from the input-page\n",
+ " \"LayoutConfidence\": , ## Accuracy of table layout's design decision\n",
+ " \"TableJson\": , ## Table Cell Text in key-value format with index orientation - {row#: {col#: }}\n",
+ " \"TableCoordinates\": , ## Top-left & Bottom-right Cell Coordinates - {row#: {col#: }}\n",
+ " \"TableConfidence\": ## Cell level accuracy of detected characters - {row#: {col#: }}\n",
+ " },\n",
+ " {...} ## ... more \"Tables\" objects\n",
+ " ],\n",
+ " \"Lines\": [ # Pagewise Line details @ PAGE-LEVEL\n",
+ " {\n",
+ " \"Page\": , # Page number in which the lines are found\n",
+ " \"CharacterConfidence\": , # Average Accuracy of all Characters recognized from the input-page\n",
+ " \"LinesArray\": [\n",
+ " # Ordered list of lines in this page @ LINE-LEVEL\n",
+ " {\n",
+ " \"Line\": , ## Detected text of the complete line\n",
+ " \"WordsArray\": [\n",
+ " ## Word level datails in this line @ WORD-LEVEL\n",
+ " {\n",
+ " \"Conf\": , ### Accuracy of recognized characters of the word\n",
+ " \"Word\": , ### Detected text of the word\n",
+ " \"Loc\": [x1, y1, x2, y2] ### Top-left & Bottom-right coordinates, w.r.t the input-page width-height dimensions\n",
+ " },\n",
+ " {...} ### More \"WordsArray\" objects\n",
+ " ]\n",
+ " },\n",
+ " {...} ## More \"LinesArray\" objects\n",
+ " ]\n",
+ " },\n",
+ " {...} # More Pagewise \"Lines\" details\n",
+ " ]\n",
+ "}\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "BPY9KziZF6jR",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 5.2 Output Details\n",
+ "\n",
+ "Output objects are based on the API Key Plan type. Available plan types are \n",
+ "\n",
+ "**Purchased Plans**\n",
+ "* \"LITE\" - **only table data** in the output\n",
+ "* \"FULL\" - **table and text data** in the output\n",
+ "* \"EXTRA\" - **table, text data along with cell & word coordintates and character detection accuracy**\n",
+ "\n",
+ "**Promotional Plans**: Any plan other than Purchased plans are promotional\n",
+ "* \"free_trial\", \"camelotpro\" - these are promotional API Keys, gives only table data equivalent to \"LITE\" plan type\n",
+ "\n",
+ "\n",
+        "\n",
+ "Output objects detail below\n",
+ "\n",
+ "\n",
+ "\n",
+ "| Key Name \t| Parent \t| Type \t| Description \t| Availability \t|\n",
+ "|-\t|-\t|-\t|-\t|-\t|\n",
+ "| JobStatus \t| Job \t| String \t| Status of the triggered process \t| ALL Plans \t|\n",
+ "| Pages \t| Job \t| Integer \t| Number of pages processed in the request \t| ALL Plans \t|\n",
+ "| Tables \t| Job \t| Array \t| List of all tables found \t| ALL Plans \t|\n",
+ "| Tables[0].Page \t| Table \t| Integer \t| Page number in which the table is found \t| ALL Plans \t|\n",
+ "| Tables[0].CharacterConfidence \t| Table \t| Decimal \t| Accuracy of Characters recognized from the image \t| ALL Plans \t|\n",
+ "| Tables[0].LayoutConfidence \t| Table \t| Decimal \t| Accuracy of table layout's design decision \t| ALL Plans \t|\n",
+ "| Tables[0].TableJson \t| Table \t| Json/dict \t| Table Cell Text in key-value format with index orientation - {row#: {col#: }} \t| ALL Plans \t|\n",
+ "| Tables[0].TableCoordinates \t| Table \t| Json/dict \t| Top-left & Bottom-right Cell Coordinates - {row#: {col#: }} \t| EXTRA Plan \t|\n",
+ "| Tables[0].TableConfidence \t| Table \t| Json/dict \t| Cell level accuracy of detected characters - {row#: {col#: }} \t| EXTRA Plan \t|\n",
+ "| Lines \t| Job \t| Array \t| List of page-wise lines text \t| FULL, EXTRA\t|\n",
+ "| Lines[0].Page \t| Page \t| Integer \t| Page number in which the lines are found \t| Full Plan \t|\n",
+ "| Lines[0].CharacterConfidence \t| Page \t| Decimal \t| Average Accuracy of all Characters recognized from the input-page \t| Full Plan \t|\n",
+ "| Lines[0].LineArray \t| Page \t| Array \t| Ordered list of lines of the page \t| \t|\n",
+ "| Lines[0].LineArray[0].Line \t| Line \t| String \t| Detected text of the complete line \t| Full Plan \t|\n",
+ "| Lines[0].LineArray[0].WordsArray \t| Line \t| Array \t| Word level datails in this line \t| EXTRA Plan \t|\n",
+ "| Lines[0].LineArray[0].WordsArray[0].Conf \t| Word \t| Decimal \t| Accuracy of recognized characters of the word \t| EXTRA Plan \t|\n",
+ "| Lines[0].LineArray[0].WordsArray[0].Word \t| Word \t| String \t| Detected text of the word \t| EXTRA Plan \t|\n",
+ "| Lines[0].LineArray[0].WordsArray[0].Loc \t| Word \t| Array \t| Top-left & Bottom-right coordinates, w.r.t the input-page width-height dimensions \t| EXTRA Plan \t|"
+ ]
+ },
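+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "# A sketch to walk the structure above (assumes a completed job in `et_sess`;\n",
+        "# `Lines` is available only on FULL/EXTRA plans):\n",
+        "for tbl in et_sess.Tables:\n",
+        "    print(\"Page:\", tbl['Page'], \"| CharacterConfidence:\", tbl['CharacterConfidence'])\n",
+        "    print(\"First row:\", tbl['TableJson'].get('0', {}))"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },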
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4ysCj8_GSrd8",
+ "colab_type": "text"
+ },
+ "source": [
+ "# 6. Make Corrections\n",
+ "\n",
+ "> **Objective**: To ease corrections on the most common issues with the `MakeCorrections` module.\n",
+ "\n",
+ "**Details:** The service relies on OCR (Optical Character Recognition) for character detection and deep learning models to detect tabular structures on the input. There may be a chance for merged rows or columns or incorrect type detections on low-quality inputs with a complex table layout or tightly packed columns. With those in mind, we want to offer the built-in service at the client-side to give control and ease in making corrections on the output. \n",
+ "\n",
+ "\n",
+ "The module, `MakeCorrections`, currently supports below functionalities\n",
+ "\n",
+ "| Functionality \t| Explanation \t|\n",
+ "|----------------------\t|------------------------------------------------\t|\n",
+ "| Split Merged Rows \t| Works well on cell values with no spaces \t|\n",
+ "| Split Merged Columns \t| Works well on cell values with no spaces \t|\n",
+ "| Fix Decimal Format \t| To fix thousand and decimal separators \t|\n",
+ "| Fix Date Format \t| To handle and modify incorrect date separators \t|"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "MpH284nxX2KJ",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# First things first lets import the module and prepare for corrections\n",
+ "\n",
+ "from ExtractTable.common import MakeCorrections\n",
+ "\n",
+ "corrections = MakeCorrections(et_resp=et_sess.server_response)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "PKRntUqYXIEQ",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 6.1 Split Merged Rows"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "_MmSR0mLXS6x",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "corrected_table_dataframes = corrections.split_merged_rows()"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "34YLZL3nXIU6",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 6.2 Split Merged Columns\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "hzinneCWXVZ8",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "corrected_table_dataframes = corrections.split_merged_columns()"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "3XSBMo5rXIkC",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 6.3 Fix Decimal Format\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "hK9nWOSfXXUc",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "corrected_table_dataframes = corrections.fix_decimal_format(decimal_separator=\".\", thousands_separator=\",\")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "mok2QbmFXIz9",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 6.4 Fix Date Format\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "UFQNzPxUSqga",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "corrected_table_dataframes = corrections.fix_date_format(delimiter=\"/\")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "I5-CGT3oy7KG",
+ "colab_type": "text"
+ },
+ "source": [
+ "# 7. Helpful Code Snippets\n",
+ "\n",
+ "Extra code snippets that are useful to perform some actions on the output. Based on the frequently asked questions."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ACcH7oUpMfFp",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 7.1 Get text data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "_CVAYfnK_sTk",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "# If your API Key supports \"Lines\"\n",
+ "\n",
+ "all_page_lines = []\n",
+ "for each_page in et_sess.Lines:\n",
+ " for each_line in each_page['LinesArray']:\n",
+ " all_page_lines.append(each_line['Line'])\n",
+ " \n",
+ "print(\"\\n\".join(all_page_lines))"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6c0vnCiXM_FM",
+ "colab_type": "text"
+ },
+ "source": [
+ "## 7.2 All tables output to a single excel"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "VhOTglS-NIXN",
+ "colab_type": "code",
+ "colab": {}
+ },
+ "source": [
+ "table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables, output_format=\"df\", pages=\"all\")\n",
+ " \n",
+ "import pandas as pd\n",
+ "accumulate_all_dfs = pd.DataFrame()\n",
+ "\n",
+ "for each_df in table_data:\n",
+ " accumulate_all_dfs = accumulate_all_dfs.append(each_df, ignore_index=True)\n",
+ " # print(each_df.shape, accumulate_all_dfs.shape)\n",
+ "\n",
+ "print(\"Shape of all tables accumulated together is\", accumulate_all_dfs.shape)\n",
+ "\n",
+ "\n",
+ "output_excel_location = \n",
+ "# Save the accumulated output to a single excel file\n",
+ "accumulate_all_dfs.to_excel(output_excel_location, index=False, header=False)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ba8vIKuXdAPT",
+ "colab_type": "text"
+ },
+ "source": [
+ "# 8. Support & Contact\n",
+ "\n",
+ "Please do not hesitate to approach our developer team at pydevs@extracttable.com for any assitance needed or to report a bug"
+ ]
}
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "ie9D1umMEv6D",
- "colab_type": "text"
- },
- "source": [
- "Default output is an array of pandas dataframes, with which you can change to any other format, follow https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "_CVAYfnK_sTk",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "# If your API Key supports \"Lines\" - Sample to get Lines\n",
- "\n",
- "all_page_lines = []\n",
- "for each_page in et_sess.Lines:\n",
- " for each_line in each_page['LinesArray']:\n",
- " all_page_lines.append(each_line['Line'])"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "TYLfNBQ6bL64",
- "colab_type": "text"
- },
- "source": [
- "Play with the result:\n",
- "- check the complete server response of the latest job with `et_sess.ServerResponse.json()`\n",
- "- check out list of available output formats of table `ExtractTable._OUTPUT_FORMATS`\n",
- "- Retrieve the result as long as the `JobId` is unexpired, usually stays for 24 hours\n",
- " - ```job_output = et_sess.get_result(job_id=JobID_HERE)```"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "collapsed": false,
- "id": "yRsqFlIvB4-D",
- "colab_type": "text"
- },
- "source": [
- "## Social Media\n",
- "Follow us on Social media for library updates and free credits.\n",
- "\n",
- "[![Image](https://cdn3.iconfinder.com/data/icons/socialnetworking/32/linkedin.png)](https://www.linkedin.com/company/extracttable)\n",
- " \n",
- "[![Image](https://abs.twimg.com/favicons/twitter.ico)](https://twitter.com/extracttable)"
- ]
- }
- ]
+ ]
}
\ No newline at end of file