From aab30609fc698c36c60f2d9723e3b5b33a2a4521 Mon Sep 17 00:00:00 2001
From: akshowhini <33936764+akshowhini@users.noreply.github.com>
Date: Wed, 26 Aug 2020 21:22:48 -0400
Subject: [PATCH] Post Processing Made Easy (#31)

* split_merged_rows functionality
* To fix decimal and thousands separator values
* split_merged_columns, fix_date_format functionalities
* validations added
* Easy Naming
* added `server_response` attribute to the session
* move unnecessary variable initialization
* Added Google Colab Contents
* Handle empty tables
* Save tables to multiple sheets of a single excel file
* standardized params naming
* Functionality to save Tables & Text output to local
* Version Update
* Updated Tutorial v2.1.0
---
 ExtractTable/__init__.py    |   49 +-
 ExtractTable/__version__.py |    4 +-
 ExtractTable/common.py      |  213 +++++-
 README.md                   |   30 +-
 example-code.ipynb          | 1296 +++++++++++++++++++++++------------
 5 files changed, 1126 insertions(+), 466 deletions(-)

diff --git a/ExtractTable/__init__.py b/ExtractTable/__init__.py
index 61cd650..1a965b8 100644
--- a/ExtractTable/__init__.py
+++ b/ExtractTable/__init__.py
@@ -49,14 +49,14 @@ def _make_request(self, method, host: urlparse, params: dict = None, data: dict
         """
         tmp = self.__dict__.copy()
         for _type, _obj in tmp.items():
-            if _type not in ("api_key", "_session"):
+            if _type not in ("api_key", "_session", "input_filename"):
                 self.__delattr__(_type)
         host = host if not host.startswith("http") else host.split("/")[2]
         url = urlparse.urlunparse(('https', host, '', '', '', ''))
         self.ServerResponse = self._session.request(method, url, params=params, data=data, **kwargs)
         ValidateResponse(resp=self.ServerResponse, show_warn=self._WARNINGS)
-
+        self.server_response = self.ServerResponse.json()
         return self.ServerResponse.json()

     def check_usage(self) -> dict:
@@ -150,11 +150,13 @@ def process_file(
         """
         # Raise a warning if an unknown format is requested
         if output_format not in self._OUTPUT_FORMATS:
-            default_format = "dict"
-            warn_msg = f"Found: {output_format} as output_format; Allowed only {self._OUTPUT_FORMATS}. " \
-                       f"Assigned default format: {default_format}"
+            warn_msg = f"Found: '{output_format}' as output_format; allowed formats are {self._OUTPUT_FORMATS}. " \
+                       f"Assigned the default format: {self._DEFAULT}"
             warnings.warn(warn_msg)

+        # Keep a reference to the input name for use when saving the output
+        self.__setattr__('input_filename', os.path.basename(filepath))
+
         try:
             with PrepareInput(filepath, pages=pages) as infile:
                 with open(infile.filepath, 'rb') as fp:
@@ -168,5 +170,40 @@
         for _type, _obj in trigger_resp.items():
             self.__setattr__(_type, _obj)

-        result = ConvertTo(data=trigger_resp, fmt=output_format, indexing=indexing).output
+        result = ConvertTo(server_response=trigger_resp, output_format=output_format, indexing=indexing).output
         return result
+
+    def save_output(self, output_folder: os.PathLike = "", output_format: str = "csv"):
+        """
+        Save the session's output objects to a user-preferred location or a default temp folder
+        :param output_folder: user-preferred output location; defaults to a tmp directory
+        :param output_format: needed only for the table files; either "csv" or "xlsx"
+        :return: location of the saved output
+        """
+        # Strip only the extension, so dotted filenames stay intact
+        input_fname = self.input_filename.rsplit('.', 1)[0]
+
+        output_format = output_format.lower()
+        if output_format not in ("csv", "xlsx"):
+            output_format = "csv"
+            warnings.warn("Invalid 'output_format' given. Defaulted to 'csv'")
+
+        table_outputs_path = ConvertTo(server_response=self.server_response, output_format=output_format).output
+
+        if output_folder:
+            if not os.path.exists(output_folder):
+                output_folder = os.path.split(table_outputs_path[0])[0]
+                warnings.warn(f"The given output_folder does not exist. Saving the outputs to {output_folder}")
+            else:
+                for each_tbl_path in table_outputs_path:
+                    os.replace(each_tbl_path, os.path.join(output_folder, input_fname + os.path.basename(each_tbl_path)))
+        else:
+            output_folder = os.path.split(table_outputs_path[0])[0]
+
+        for each_page in self.server_response.get("Lines", []):
+            page_txt_fname = os.path.join(output_folder, f"{input_fname}_Page_{str(each_page['Page'])}.txt")
+            page_txt = [each_line['Line'] for each_line in each_page['LinesArray']]
+            with open(page_txt_fname, "w", encoding="utf-8") as ofile:
+                ofile.write("\n".join(page_txt))
+
+        return output_folder
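For context, a minimal usage sketch of the new `save_output()` hook, assuming a valid API key and an input that yields tables; the key, file name and folder below are placeholders:

```python
from ExtractTable import ExtractTable

et_sess = ExtractTable("YOUR_API_KEY")        # placeholder key
et_sess.process_file(filepath="invoice.pdf")  # placeholder input; sets input_filename & server_response

# Must run right after process_file(), since save_output() reads the
# latest session objects; tables land as CSV/XLSX, text as per-page .txt
saved_to = et_sess.save_output(output_folder="outputs", output_format="xlsx")
print("Saved under:", saved_to)
```

If `outputs` does not exist, the method warns and falls back to a temp directory, so the returned path is the reliable way to locate the files.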
Defaulted to 'csv'") + + table_outputs_path = ConvertTo(server_response=self.server_response, output_format=output_format).output + + if output_folder: + if not os.path.exists(output_folder): + output_folder = os.path.split(table_outputs_path[0])[0] + warnings.warn(f"Your output_folder not exists. Saving the outputs to {output_folder}") + else: + for each_tbl_path in table_outputs_path: + os.replace(each_tbl_path, os.path.join(output_folder, input_fname+os.path.basename(each_tbl_path))) + + else: + output_folder = os.path.split(table_outputs_path[0])[0] + + for each_page in self.server_response.get("Lines", []): + page_txt_fname = os.path.join(output_folder, f"{input_fname}_Page_{str(each_page['Page'])}.txt") + page_txt = [each_line['Line'] for each_line in each_page['LinesArray']] + with open(page_txt_fname, "w", encoding="utf-8") as ofile: + ofile.write("\n".join(page_txt)) + + return output_folder diff --git a/ExtractTable/__version__.py b/ExtractTable/__version__.py index ed6e20d..8fdbebe 100644 --- a/ExtractTable/__version__.py +++ b/ExtractTable/__version__.py @@ -1,4 +1,4 @@ -VERSION = (2, 0, 2) +VERSION = (2, 1, 0) PRERELEASE = None # "alpha", "beta" or "rc" REVISION = None @@ -13,7 +13,7 @@ def generate_version(): __title__ = "ExtractTable" -__description__ = "Extract tabular data from images and scanned PDFs. Easily convert image to table, convert pdf to table" +__description__ = "Extract table data from images and scanned PDFs. Easily convert image to excel, convert pdf to table" __url__ = "https://github.com/ExtractTable/ExtractTable-py" __version__ = generate_version() __author__ = "Saradhi" diff --git a/ExtractTable/common.py b/ExtractTable/common.py index 0b1b32a..b5c0787 100644 --- a/ExtractTable/common.py +++ b/ExtractTable/common.py @@ -2,29 +2,31 @@ Preprocess the output received from server and interface as a final result to the client """ import os +import re import tempfile import warnings import collections +from statistics import mode +from typing import List import pandas as pd class ConvertTo: - """Convert tabular JSON to an user requested output format""" - FORMATS = {"df", "dataframe", "json", "csv", "dict"} + FORMATS = {"df", "dataframe", "json", "csv", "dict", "xlsx", "excel"} DEFAULT = "df" - def __init__(self, data: dict, fmt: str = DEFAULT, indexing: bool = False): + def __init__(self, server_response: dict, output_format: str = DEFAULT, indexing: bool = False, table_obj="TableJson"): """ - - :param data: Tabular JSON data from server - :param fmt: format to be converted into + Convert the server response to an user requested output format on Tables + :param server_response: Tabular JSON data from server + :param output_format: format to be converted into :param indexing: row & column index consideration in the output """ - self.data = data - self.output = self._converter(fmt.lower(), indexing=indexing) + self.server_response = server_response + self.output = self._converter(output_format.lower(), indexing=indexing, table_obj=table_obj) - def _converter(self, fmt: str, indexing: bool = False) -> list: + def _converter(self, fmt: str, indexing: bool = False, table_obj="TableJson") -> list: """ Actual conversion takes place here using Pandas :param fmt: format to be converted into @@ -32,10 +34,10 @@ def _converter(self, fmt: str, indexing: bool = False) -> list: :return: list of tables from converted into the requested output format """ dfs = [] - for table in self.data.get("Tables", []): - tmp = {int(k): v for k, v in table["TableJson"].items()} + for table 
+
+
+class MakeCorrections:
+    def __init__(self, et_resp: dict = None, dataframes: List[pd.DataFrame] = None):
+        """
+        To apply post-processing techniques on the output
+        :param et_resp: ExtractTable response
+        :param dataframes: user-preferred dataframe(s).
+            Defaults to all dataframes from the ExtractTable response, `et_resp`.
+            If both `et_resp` and `dataframes` are provided, `et_resp` takes precedence
+        """
+        if et_resp:
+            self.dataframes = ConvertTo(server_response=et_resp).output
+
+        if not et_resp:
+            try:
+                self.dataframes = self.__isacceptable__(dataframes)
+            except ValueError:
+                raise ValueError("Either an ExtractTable response or your preferred list of pandas dataframes is required")
+
+    @staticmethod
+    def __isacceptable__(dfs) -> List[pd.DataFrame]:
+        """Validate the `dataframes` param"""
+        if type(dfs) is list:
+            if all([type(df) is pd.DataFrame for df in dfs]):
+                return dfs
+        elif type(dfs) is pd.DataFrame:
+            return [dfs]
+        raise ValueError("`dataframes` should be a list of dataframes or a single dataframe")
+
+    def split_merged_rows(self) -> List[pd.DataFrame]:
+        """
+        To split merged rows into multiple rows where possible
+        :return: reformatted list of dataframes
+        """
+        for df_idx, each_df in enumerate(self.dataframes):
+            reformat = []
+            for row in each_df.to_numpy():
+                row = list(row)
+
+                # The in-cell line separator appears to be " "
+                separators = [col.strip().count(" ") for col in row]
+                # Statistical mode to estimate the number of rows merged into one
+                mode_ = mode(separators)
+
+                if mode_:
+                    # Split the merged rows inside each cell
+                    tmp = [col.strip().split(' ', mode_) for col in row]
+                    for idx in range(len(tmp[0])):
+                        tmp_ = []
+                        for x in range(len(tmp)):
+                            try:
+                                val = tmp[x][idx]
+                            except IndexError:
+                                val = ""
+                            tmp_.append(val)
+                        reformat.append(tmp_)
+                else:
+                    reformat.append(row)
+
+            self.dataframes[df_idx] = pd.DataFrame(reformat)
+
+        return self.dataframes
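To make the mode-based row-splitting heuristic concrete, a small sketch with a fabricated dataframe where each cell holds two space-separated values, so the mode of the separator counts is 1 and every physical row splits into two:

```python
import pandas as pd
from ExtractTable.common import MakeCorrections

# Fabricated table: two logical rows OCR'd into one physical row per column
merged = pd.DataFrame([["Bolt Nut", "12 30"]])

fixed = MakeCorrections(dataframes=merged).split_merged_rows()
print(fixed[0])
# Approximate output:
#       0   1
# 0  Bolt  12
# 1   Nut  30
```

This also shows why the notebook later says the split "works well on cell values with no spaces": a genuine multi-word cell inflates its separator count and can skew the mode.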
+    def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool = False) -> List[pd.DataFrame]:
+        """
+        To split merged columns into multiple columns where possible
+        :param columns_idx: user-preferred column indices.
+            Defaults to looping through all columns to find numeric or decimal columns
+        :param force_split: to force the split through the given columns
+        :return: reformatted list of dataframes
+        """
+        # TODO: Should we consider a delimiter_pattern for the split?
+        for df_idx, df in enumerate(self.dataframes):
+            # Resolve per dataframe, so a None columns_idx does not leak the first table's columns
+            cols = [str(x) for x in (columns_idx if columns_idx else df.columns)]
+
+            reformat = []
+            for col_idx in cols:
+                tmp = df[col_idx].str.split(expand=True)
+
+                # Keep the column as-is unless force_split is requested or the
+                # split filled every cell (no NaNs), and the split actually produced columns
+                if not any([not any(tmp.isna().any()), force_split]) or tmp.shape[-1] == 1:
+                    reformat.append(df[col_idx].tolist())
+                else:
+                    reformat.extend([tmp[each].tolist() for each in tmp.columns])
+
+            self.dataframes[df_idx] = pd.DataFrame(reformat).T
+
+        return self.dataframes
+
+    def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: str = ".", thousands_separator: str = ",", decimal_position: int = 2) -> List[pd.DataFrame]:
+        """
+        To fix decimal and thousands separator values. Often commas are detected as periods, and vice versa
+        :param columns_idx: user-preferred column indices.
+            Defaults to looping through all columns to find numeric or decimal columns
+        :param decimal_separator: preferred decimal separator
+        :param thousands_separator: preferred thousands separator
+        :param decimal_position: preferred number of digits after the decimal separator
+        :return: corrected list of dataframes
+        """
+        # TODO: Should we consider only bad-confidence values?
+        reg_ = f"[{decimal_separator}{thousands_separator}]"
+        if decimal_position > 0:
+            thou_regex = reg_ + '(?=.*' + reg_ + ')'
+        else:
+            thou_regex = reg_
+        decimal_position = int(decimal_position)
+
+        for df_idx, df in enumerate(self.dataframes):
+            cols = [str(x) for x in (columns_idx if columns_idx else df.columns)]
+
+            for col_idx in cols:
+                digits = df[col_idx].str.count(pat=r'\d').sum()
+                chars = df[col_idx].str.count(pat=r'[\w]').sum()
+
+                if not chars or digits/chars < 0.75:
+                    # To infer a numeric or float column,
+                    # check whether the column is dominated by digits
+                    continue
+
+                df[col_idx] = df[col_idx].str.strip()
+                df[col_idx].replace(regex={r'%s' % thou_regex: thousands_separator}, inplace=True)
+
+                # To correct the decimal position
+                if not decimal_position > 0:
+                    continue
+
+                for i, _ in enumerate(df[col_idx]):
+                    if not len(df[col_idx][i]) > decimal_position:
+                        # needs a length of at least decimal_position
+                        continue
+                    elif df[col_idx][i][-(decimal_position+1)] == decimal_separator:
+                        # nothing to do if the decimal separator is already in place
+                        continue
+
+                    # If the character at the decimal position is not alphanumeric
+                    if re.search(r'\W+', df[col_idx][i][-(decimal_position+1)]):
+                        digits = len(re.findall(r'\d', df[col_idx][i]))
+                        if digits/len(df[col_idx][i]) >= 0.5:
+                            df[col_idx][i] = df[col_idx][i][:-(decimal_position+1)] + decimal_separator + df[col_idx][i][-decimal_position:]
+
+            self.dataframes[df_idx] = df
+        return self.dataframes
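The `thou_regex` lookahead above is the subtle part: it rewrites every separator except the last one into the thousands separator, and the positional pass then restores the decimal point. A quick sketch of just that regex, using the method's defaults:

```python
import re

decimal_separator, thousands_separator = ".", ","
reg_ = f"[{decimal_separator}{thousands_separator}]"
# Match any separator that still has another separator somewhere after it
thou_regex = reg_ + '(?=.*' + reg_ + ')'

# OCR read "1.234.56"; every separator but the last becomes a thousands separator
print(re.sub(thou_regex, thousands_separator, "1.234.56"))  # -> 1,234.56
```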
+    def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
+        """
+        To fix the date format of a column
+        Eg: 12|12|2020 as 12/12/2020
+        :param columns_idx: user-preferred column indices.
+            Defaults to looping through all columns to find date columns
+        :param delimiter: "/", "-" or whichever separator you prefer
+        :return: corrected list of dataframes
+        """
+        date_regex = r'(\d{2}(\d{2})?)(\W)(\d{2}|[A-Za-z]{3,9})(\W)(\d{2}(\d{2})?)\b'
+        for df_idx, df in enumerate(self.dataframes):
+            cols = [str(x) for x in (columns_idx if columns_idx else df.columns)]
+
+            for col_idx in cols:
+                dates = df[col_idx].str.count(pat=date_regex).sum()
+
+                if not (dates >= len(df) * 0.75):
+                    # To infer a date column, at least 75% of the cells
+                    # should match the date pattern
+                    continue
+
+                df[col_idx] = df[col_idx].str.strip()
+                df[col_idx].replace(regex={date_regex: r'\1%s\4%s\6' % (delimiter, delimiter)}, inplace=True)
+
+            self.dataframes[df_idx] = df
+
+        return self.dataframes
diff --git a/README.md b/README.md
index 72079cf..0bfd772 100644
--- a/README.md
+++ b/README.md
@@ -32,9 +32,33 @@ table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables, output_f
 ```
 
 ## Detailed Library Usage - [example-code.ipynb](example-code.ipynb)
-
-Open In Colab
+The tutorial available at Open In Colab takes you through
+
+```Markup
+1. Installation
+2. Import and check version
+3. Create Session & Validate API Key
+    3.1 Create Session with your API Key
+    3.2 Validate the Key and check the plan usage
+    3.3 Check Usage Details
+4. Trigger the extraction process
+    4.1 Accepted Input Types
+    4.2 Process an IMAGE Input
+    4.3 Process a PDF Input
+    4.4 Output options
+    4.5 Explore session objects
+5. Explore the Output
+    5.1 Output Structure
+    5.2 Output Details
+6. Make Corrections
+    6.1 Split Merged Rows
+    6.2 Split Merged Columns
+    6.3 Fix Decimal Format
+    6.4 Fix Date Format
+7. Helpful Code Snippets
+    7.1 Get text data
+    7.2 Table output to Excel
+```
 
 ### Woahh, as simple as that ?!
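Likewise, the date fix is easiest to verify in isolation. A short sketch of the substitution `fix_date_format` applies, with the regex copied from the patch:

```python
import re

date_regex = r'(\d{2}(\d{2})?)(\W)(\d{2}|[A-Za-z]{3,9})(\W)(\d{2}(\d{2})?)\b'
delimiter = "/"

# Mis-detected separators get rewritten to the preferred delimiter
print(re.sub(date_regex, r'\1%s\4%s\6' % (delimiter, delimiter), "12|12|2020"))   # -> 12/12/2020
print(re.sub(date_regex, r'\1%s\4%s\6' % (delimiter, delimiter), "01.Jan.2021"))  # -> 01/Jan/2021
```

Note the regex can only swap existing non-word separators; it cannot reinsert a separator the OCR dropped entirely.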
diff --git a/example-code.ipynb b/example-code.ipynb index 3446264..d18a325 100644 --- a/example-code.ipynb +++ b/example-code.ipynb @@ -1,449 +1,859 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "ExtractTable - Advanced Code Usage.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "collapsed": false, - "id": "BnYb9aztB48u", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "NhVhMrQ0ZdQr", - "colab_type": "code", - "colab": {} - }, - "source": [ - "!pip install ExtractTable" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "2aIaghfeZnQr", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from ExtractTable import ExtractTable" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "LJL_ZyYzZsFY", - "colab_type": "code", - "colab": {} - }, - "source": [ - "api_key = YOUR_APIKEY_HERE" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bwtpzTJxZHRi", - "colab_type": "text" - }, - "source": [ - "**Create Session** with your API Key" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Bfw5GTNvZGv8", - "colab_type": "code", - "colab": {} - }, - "source": [ - "et_sess = ExtractTable(api_key)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "On4_X8v3Zk3v", - "colab_type": "text" - }, - "source": [ - "**Validate** the Key and check the plan usage" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "a7EPvvvMZ0Ub", - "colab_type": "code", - "colab": {} - }, - "source": [ - "usage = et_sess.check_usage()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sovuclERjRqy", - "colab_type": "text" - }, - "source": [ - "*If there is no error encountered in the above cell, it means we have a valid API key. 
Now, get started by checking the usage and trigger the file for processing*" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "HT97IP8MZ9WF", - "colab_type": "code", - "outputId": "b5dfbc96-5ce8-4461-c988-6b17e58a1448", + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 + "name": "ExtractTable Usage -2.1.0", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } } - }, - "source": [ - "print(usage)" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "{'credits': 500, 'queued': 0, 'used': 132}\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-XqbBoB-i3pi", - "colab_type": "text" - }, - "source": [ - "**credits**: Total number credits attached to the API Key\n", - "\n", - "**queued** : Number of triggered jobs that are still processing in the queue\n", - "\n", - "**used** : Number of credits already used " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "P_xzVgHmZ9sw", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# filepath = \"image_path_or_image_url_with_tables\"\n", - "# filepath = r'samples/BlurryImage.jpg'\n", - "filepath = \"https://raw.githubusercontent.com/ExtractTable/ExtractTable-py/master/samples/QualityImage.jpg\"" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oUnBFxYiZ1Ka", - "colab_type": "text" - }, - "source": [ - "**Trigger** the process to extract tabular data from the file" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "3H9jzk6wJ5-V", - "colab_type": "code", - "colab": {} - }, - "source": [ - "table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables)" - ], - "execution_count": 0, - "outputs": [] }, - { - "cell_type": "markdown", - "metadata": { - "id": "k98KTihPJwyO", - "colab_type": "text" - }, - "source": [ - "Note: To process a PDF, use **pages** params in the read_pdf function, as shown below\n", - "```python \n", - "table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables, pages=\"all\")\n", - "```\n", - "Below are the sample values ```pages``` accepts\n", - "\n", - "* pages = \"2\" - considers only 2nd page of the PDF\n", - "* pages = \"1,3,5\" - considers only 1st, 3rd and 5th page of the PDF\n", - "* pages = \"1, 3-5\" - considers 1st, 3rd, 4th and 5th page of the PDF\n", - "* pages = \"all\" - considers complete PDF" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Th12lbKfJhu9", - "colab_type": "text" - }, - "source": [ - "> By default, the `process_file()` returns **only** the table data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HNZ8ieKNH5db", - "colab_type": "text" - }, - "source": [ - "> **Explore** all objects of the latest file processing with `et_sess.__dict__.keys()` - Depends on the plan type of your API Key" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "aDUaDyX8IGmK", - "colab_type": "code", - "outputId": "e22422ca-b27d-405f-c263-a92898c010ea", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - } - }, - "source": [ - "et_sess.__dict__.keys()" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "dict_keys(['api_key', 
'_session', 'ServerResponse', 'JobStatus', 'Lines', 'Pages', 'Tables'])" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 14 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "QYwnBXwQJ9D9", - "colab_type": "code", - "outputId": "5fee31e6-a38b-43a4-9297-8f1a920ad87e", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 68 - } - }, - "source": [ - "# Access the class objects as you want\n", - "print(\"Number of pages processed in this job:\", et_sess.Pages)\n", - "print(\"Number of tables found in this job:\", len(et_sess.Tables))\n", - "# print(\"Number of lines in the first page of this job:\", len(et_sess.Lines[0]['LineArray']))\n", - "\n", - "# et_sess.Tables\n", - "# et_sess.Lines\n" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Number of pages processed in this job: 1\n", - "Number of tables found in this job: 1\n", - "Number of lines in the first page of this job: 42\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KbIJ9kpqFxRu", - "colab_type": "text" - }, - "source": [ - "> **Understand the output**: The response of a triggered job is a JSON object in the below format. Note that the response depends on the plan type of the API Key.\n", - "\n", - "```javascript\n", - "{\n", - " \"JobStatus\": , # Status of the triggered Process @ JOB-LEVEL\n", - " \"Pages\": , # Number of pages processed in this request @ PAGE-LEVEL\n", - " \"Tables\": [ # List of all tables found @ TABLE-LEVEL\n", - " {\n", - " \"Page\": , ## Page number in which this table is found\n", - " \"CharacterConfidence\": , ## Accuracy of Characters recognized from the input-page\n", - " \"LayoutConfidence\": , ## Accuracy of table layout's design decision\n", - " \"TableJson\": , ## Table Cell Text in key-value format with index orientation - {row#: {col#: }}\n", - " \"TableCoordinates\": , ## Top-left & Bottom-right Cell Coordinates - {row#: {col#: }}\n", - " \"TableConfidence\": ## Cell level accuracy of detected characters - {row#: {col#: }}\n", - " },\n", - " {...} ## ... more \"Tables\" objects\n", - " ],\n", - " \"Lines\": [ # Pagewise Line details @ PAGE-LEVEL\n", - " {\n", - " \"Page\": , # Page number in which the lines are found\n", - " \"CharacterConfidence\": , # Average Accuracy of all Characters recognized from the input-page\n", - " \"LinesArray\": [\n", - " # Ordered list of lines in this page @ LINE-LEVEL\n", - " {\n", - " \"Line\": , ## Detected text of the complete line\n", - " \"WordsArray\": [\n", - " ## Word level datails in this line @ WORD-LEVEL\n", - " {\n", - " \"Conf\": , ### Accuracy of recognized characters of the word\n", - " \"Word\": , ### Detected text of the word\n", - " \"Loc\": [x1, y1, x2, y2] ### Top-left & Bottom-right coordinates, w.r.t the input-page width-height dimensions\n", - " },\n", - " {...} ### More \"WordsArray\" objects\n", - " ]\n", - " },\n", - " {...} ## More \"LinesArray\" objects\n", - " ]\n", - " },\n", - " {...} # More Pagewise \"Lines\" details\n", - " ]\n", - "}\n", - "```" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "XrXBfENfZ2AI", - "colab_type": "code", - "outputId": "6c12b493-b774-4687-f44e-1f1731f7ce43", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 170 - } - }, - "source": [ - "table_data # Notice the default output is a pandas dataframe" - ], - "execution_count": 0, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[ 0 1 ... 
5 6\n", - " 0 FLC Code Room Name ... W (m) Ceiling Height (m)\n", - " 1 RGOOTO1 Indigenous Support Officer ... 7.3 2.7\n", - " 2 RGOOTO2 Instrum. Music Room ... 7.3 2.7\n", - " 3 RGOTO1A Verandah ... 1.7 3.0\n", - " 4 RGOTO1B Eastern Stairs ... 1.7 N/A\n", - " 5 RGOTO2B Western Stairs ... 1.0 N/A\n", - " \n", - " [6 rows x 7 columns]]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 22 + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "id": "BnYb9aztB48u", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "935SwBV4Z-CH", + "colab_type": "text" + }, + "source": [ + "# 1. Installation" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "NhVhMrQ0ZdQr", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!pip install -U ExtractTable" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jfsHIegraT2l", + "colab_type": "text" + }, + "source": [ + "# 2. Import and check version" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "2aIaghfeZnQr", + "colab_type": "code", + "colab": {} + }, + "source": [ + "from ExtractTable import ExtractTable" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "YLLCa6qQaaZu", + "colab_type": "code", + "colab": {} + }, + "source": [ + "print(ExtractTable.VERSION)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VYERq5s9aiNy", + "colab_type": "text" + }, + "source": [ + "# 3. Create Session & Validate API Key\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bwtpzTJxZHRi", + "colab_type": "text" + }, + "source": [ + "## 3.1 **Create Session** with your API Key" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "LJL_ZyYzZsFY", + "colab_type": "code", + "colab": {} + }, + "source": [ + "api_key = YOUR_APIKEY_HERE" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Bfw5GTNvZGv8", + "colab_type": "code", + "colab": {} + }, + "source": [ + "et_sess = ExtractTable(api_key)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "b5fQB7dGxLKf", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# et_sess.__dict__" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "On4_X8v3Zk3v", + "colab_type": "text" + }, + "source": [ + "## 3.2 **Validate** the Key and check the plan usage" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "a7EPvvvMZ0Ub", + "colab_type": "code", + "colab": {} + }, + "source": [ + "usage = et_sess.check_usage()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sovuclERjRqy", + "colab_type": "text" + }, + "source": [ + "*If there is no error encountered in the above cell, it means we have a valid API key. 
Now, lets get started by checking the usage and trigger the file for processing*" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "GJdjlTPKxcXF", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# et_sess.server_response" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "HT97IP8MZ9WF", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "outputId": "bc872e57-c5cb-4db0-a034-56f03f275d4b" + }, + "source": [ + "print(usage)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "{'credits': 100, 'queued': 0, 'used': 49}\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-XqbBoB-i3pi", + "colab_type": "text" + }, + "source": [ + "## 3.3 Check Usage Details\n", + "\n", + "**credits**: Total number credits attached to the API Key\n", + "\n", + "**queued** : Number of triggered jobs that are still processing in the queue\n", + "\n", + "**used** : Number of credits already used " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ARRJaIgFcYoe", + "colab_type": "text" + }, + "source": [ + "# 4. Trigger the extraction process\n", + "\n", + "> Note: We will use the session, `et_sess`, created earlier in step 3.1, to save the session data and retrieve when needed" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6GpN9ho1chi6", + "colab_type": "text" + }, + "source": [ + "## 4.1 Accepted Input Types\n", + "\n", + "**Allowed input formats** are:\n", + "- Image\n", + " - JPG/JPEG\n", + " - PNG\n", + "- PDF\n", + " - Text PDF\n", + " - Scan PDF\n", + " - Image PDF\n", + "\n", + "\n", + "**Input Location Options**\n", + "- Location can be a file from the local drive\n", + "- Accessible remote URL - *the file object will be locally downloaded and deleted once sent to the process*" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "P_xzVgHmZ9sw", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# image_location = \"local_image_path_OR_remote_image_url_with_tables\"\n", + "# image_location = r'samples/BlurryImage.jpg'\n", + "image_location = \"https://raw.githubusercontent.com/ExtractTable/ExtractTable-py/master/samples/QualityImage.jpg\"" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oUnBFxYiZ1Ka", + "colab_type": "text" + }, + "source": [ + "## 4.2 Process an IMAGE Input\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "3H9jzk6wJ5-V", + "colab_type": "code", + "colab": {} + }, + "source": [ + "table_data = et_sess.process_file(filepath=image_location, output_format=\"df\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "AoKuQBVQy3LN", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 170 + }, + "outputId": "032898c5-0a6b-41b7-a9fc-4107defe056f" + }, + "source": [ + "table_data" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[ 0 1 ... 5 6\n", + " 0 FLC Code Room Name ... W (m) Ceiling Height (m)\n", + " 1 RGOOTO1 Indigenous Support Officer ... 7.3 2.7\n", + " 2 RGOOTO2 Instrum. Music Room ... 7.3 2.7\n", + " 3 RGOTO1A Verandah ... 1.7 3.0\n", + " 4 RGOTO1B Eastern Stairs ... 1.7 N/A\n", + " 5 RGOTO2B Western Stairs ... 
1.0 N/A\n",
              " \n",
              " [6 rows x 7 columns]]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 19
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9uiDCtGpfTwF",
        "colab_type": "text"
      },
      "source": [
        "## 4.3 Process a PDF Input"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ehxmaPgthoCC",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# pdf_location = \"local_pdf_path_OR_remote_pdf_url_with_tables\"\n",
        "pdf_location = \"Location_of_PDF_with_Tables\"  # point this at your PDF"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "qdU1Au3LhiuD",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "table_data = et_sess.process_file(filepath=pdf_location, pages=\"all\", output_format=\"df\")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "k98KTihPJwyO",
        "colab_type": "text"
      },
      "source": [
        "```pages``` accepts **string** values; below are the sample values\n",
        "\n",
        "| pages \t| Explanation \t|\n",
        "|----------\t|-------------------------------------------------\t|\n",
        "| \"1\" \t| [Default] considers only the 1st page of the PDF \t|\n",
        "| \"1,3,5\" \t| considers only the 1st, 3rd and 5th pages of the PDF \t|\n",
        "| \"1, 3-5\" \t| considers the 1st, 3rd, 4th and 5th pages of the PDF \t|\n",
        "| \"all\" \t| considers the complete PDF \t|"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "e-KPdSNQBeR-",
        "colab_type": "text"
      },
      "source": [
        "## 4.4 Table Output options\n",
        "\n",
        "> By default, `process_file()` returns **only** the table data. The shape of the output depends on `output_format`, explained below\n",
        "\n",
        "Explore the available options with `ExtractTable._OUTPUT_FORMATS`\n",
        "\n",
        "| output_format \t| Explanation \t|\n",
        "|---------------\t|--------------------------------------------\t|\n",
        "| \"df\" \t| [Default] Array of Pandas dataframes \t|\n",
        "| \"dataframe\" \t| same as \"df\"; Array of Pandas dataframes \t|\n",
        "| \"json\" \t| JSON data with index orientation \t|\n",
        "| \"dict\" \t| Similar to JSON data, but a python dictionary \t|\n",
        "| \"csv\" \t| Array of locally saved CSV file locations \t|\n",
        "| \"xlsx\" \t| Saves multiple tables as sheets of a single excel file \t|\n",
        "| \"excel\" \t| same as \"xlsx\"; output is an array with the excel file location \t|\n",
        "\n",
        "\n",
        "The default output is an array of pandas dataframes, which you can convert to any other format like excel, html etc. Follow https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html\n"
      ]
    },
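    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text"
      },
      "source": [
        "A minimal sketch comparing the same job across a few output formats. It assumes `et_sess` and `image_location` from the cells above; note that every `process_file()` call triggers a fresh job and consumes credits"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Same input, three output shapes; assumes a valid session `et_sess`\n",
        "dfs = et_sess.process_file(filepath=image_location, output_format=\"df\")     # list of DataFrames\n",
        "csvs = et_sess.process_file(filepath=image_location, output_format=\"csv\")   # list of CSV file paths\n",
        "dicts = et_sess.process_file(filepath=image_location, output_format=\"dict\") # list of python dicts\n",
        "print(type(dfs[0]), csvs[0], type(dicts[0]))"
      ],
      "execution_count": null,
      "outputs": []
    },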
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "nQ1as9mkCOyu",
        "colab_type": "text"
      },
      "source": [
        "## 4.5 Explore session objects\n",
        "\n",
        "> **Explore** all objects of the latest file processing with `et_sess.__dict__.keys()` - Depends on the plan type of your API Key"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "aDUaDyX8IGmK",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "b89cd4d2-72a1-4196-ad6e-2a05cc6f0448"
      },
      "source": [
        "et_sess.__dict__.keys()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "dict_keys(['api_key', '_session', 'ServerResponse', 'JobStatus', 'Lines', 'Pages', 'Tables'])"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 20
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "E0Z6RJa9DFlT",
        "colab_type": "text"
      },
      "source": [
        "Based on the API Key PLAN type, `et_sess` contains the below objects\n",
        "\n",
        "| Object \t| Explanation \t|\n",
        "|-----------------\t|----------------------------------------------------------------------------------------\t|\n",
        "| api_key \t| Your API Key \t|\n",
        "| _session \t| Session data of the **latest** performed request/action \t|\n",
        "| input_filename \t| Name of the processed input file \t|\n",
        "| ServerResponse \t| Complete ServerResponse, along with response code and headers \t|\n",
        "| server_response \t| Complete server response content; equivalent to `ServerResponse.json()` \t|\n",
        "| JobStatus \t| Job Status of the triggered process \t|\n",
        "| Pages \t| Number of pages in the input; also the number of credits consumed by the triggered process \t|\n",
        "| Tables \t| Tabular data in JSON format with index orientation; ordered table-wise \t|\n",
        "| Lines \t| Text data in JSON format, ordered page-wise \t|"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MytCD36ja6KM",
        "colab_type": "text"
      },
      "source": [
        "## 4.6 Save Table & Text to LOCAL\n",
        "\n",
        "```python\n",
        "et_sess.save_output(output_folder, output_format=\"csv\")\n",
        "```\n",
        "The `output_format` param is relevant only for the table data, with options \"csv\" or \"xlsx\"\n",
        "\n",
        "\n",
        "> Note: As `et_sess` holds only the latest performed action, make sure this call comes right after `process_file()`"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-CgU2PCJFQNr",
        "colab_type": "text"
      },
      "source": [
        "# 5. Explore the Output"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JI-CLS9UF0iS",
        "colab_type": "text"
      },
      "source": [
        "## 5.1 Output Structure\n",
        "\n",
        "> **Understand the output**: The response of a triggered job is a JSON object in the below format. 
\n", + "\n", + "Note that the response depends on the plan type of the API Key.\n", + "\n", + "\n", + "```javascript\n", + "{\n", + " \"JobStatus\": , # Status of the triggered Process @ JOB-LEVEL\n", + " \"Pages\": , # Number of pages processed in this request @ PAGE-LEVEL\n", + " \"Tables\": [ # List of all tables found @ TABLE-LEVEL\n", + " {\n", + " \"Page\": , ## Page number in which this table is found\n", + " \"CharacterConfidence\": , ## Accuracy of Characters recognized from the input-page\n", + " \"LayoutConfidence\": , ## Accuracy of table layout's design decision\n", + " \"TableJson\": , ## Table Cell Text in key-value format with index orientation - {row#: {col#: }}\n", + " \"TableCoordinates\": , ## Top-left & Bottom-right Cell Coordinates - {row#: {col#: }}\n", + " \"TableConfidence\": ## Cell level accuracy of detected characters - {row#: {col#: }}\n", + " },\n", + " {...} ## ... more \"Tables\" objects\n", + " ],\n", + " \"Lines\": [ # Pagewise Line details @ PAGE-LEVEL\n", + " {\n", + " \"Page\": , # Page number in which the lines are found\n", + " \"CharacterConfidence\": , # Average Accuracy of all Characters recognized from the input-page\n", + " \"LinesArray\": [\n", + " # Ordered list of lines in this page @ LINE-LEVEL\n", + " {\n", + " \"Line\": , ## Detected text of the complete line\n", + " \"WordsArray\": [\n", + " ## Word level datails in this line @ WORD-LEVEL\n", + " {\n", + " \"Conf\": , ### Accuracy of recognized characters of the word\n", + " \"Word\": , ### Detected text of the word\n", + " \"Loc\": [x1, y1, x2, y2] ### Top-left & Bottom-right coordinates, w.r.t the input-page width-height dimensions\n", + " },\n", + " {...} ### More \"WordsArray\" objects\n", + " ]\n", + " },\n", + " {...} ## More \"LinesArray\" objects\n", + " ]\n", + " },\n", + " {...} # More Pagewise \"Lines\" details\n", + " ]\n", + "}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BPY9KziZF6jR", + "colab_type": "text" + }, + "source": [ + "## 5.2 Output Details\n", + "\n", + "Output objects are based on the API Key Plan type. Available plan types are \n", + "\n", + "**Purchased Plans**\n", + "* \"LITE\" - **only table data** in the output\n", + "* \"FULL\" - **table and text data** in the output\n", + "* \"EXTRA\" - **table, text data along with cell & word coordintates and character detection accuracy**\n", + "\n", + "**Promotional Plans**: Any plan other than Purchased plans are promotional\n", + "* \"free_trial\", \"camelotpro\" - these are promotional API Keys, gives only table data equivalent to \"LITE\" plan type\n", + "\n", + "\n", + "
\n", + "Output objects detail below\n", + "\n", + "\n", + "\n", + "| Key Name \t| Parent \t| Type \t| Description \t| Availability \t|\n", + "|-\t|-\t|-\t|-\t|-\t|\n", + "| JobStatus \t| Job \t| String \t| Status of the triggered process \t| ALL Plans \t|\n", + "| Pages \t| Job \t| Integer \t| Number of pages processed in the request \t| ALL Plans \t|\n", + "| Tables \t| Job \t| Array \t| List of all tables found \t| ALL Plans \t|\n", + "| Tables[0].Page \t| Table \t| Integer \t| Page number in which the table is found \t| ALL Plans \t|\n", + "| Tables[0].CharacterConfidence \t| Table \t| Decimal \t| Accuracy of Characters recognized from the image \t| ALL Plans \t|\n", + "| Tables[0].LayoutConfidence \t| Table \t| Decimal \t| Accuracy of table layout's design decision \t| ALL Plans \t|\n", + "| Tables[0].TableJson \t| Table \t| Json/dict \t| Table Cell Text in key-value format with index orientation - {row#: {col#: }} \t| ALL Plans \t|\n", + "| Tables[0].TableCoordinates \t| Table \t| Json/dict \t| Top-left & Bottom-right Cell Coordinates - {row#: {col#: }} \t| EXTRA Plan \t|\n", + "| Tables[0].TableConfidence \t| Table \t| Json/dict \t| Cell level accuracy of detected characters - {row#: {col#: }} \t| EXTRA Plan \t|\n", + "| Lines \t| Job \t| Array \t| List of page-wise lines text \t| FULL, EXTRA\t|\n", + "| Lines[0].Page \t| Page \t| Integer \t| Page number in which the lines are found \t| Full Plan \t|\n", + "| Lines[0].CharacterConfidence \t| Page \t| Decimal \t| Average Accuracy of all Characters recognized from the input-page \t| Full Plan \t|\n", + "| Lines[0].LineArray \t| Page \t| Array \t| Ordered list of lines of the page \t| \t|\n", + "| Lines[0].LineArray[0].Line \t| Line \t| String \t| Detected text of the complete line \t| Full Plan \t|\n", + "| Lines[0].LineArray[0].WordsArray \t| Line \t| Array \t| Word level datails in this line \t| EXTRA Plan \t|\n", + "| Lines[0].LineArray[0].WordsArray[0].Conf \t| Word \t| Decimal \t| Accuracy of recognized characters of the word \t| EXTRA Plan \t|\n", + "| Lines[0].LineArray[0].WordsArray[0].Word \t| Word \t| String \t| Detected text of the word \t| EXTRA Plan \t|\n", + "| Lines[0].LineArray[0].WordsArray[0].Loc \t| Word \t| Array \t| Top-left & Bottom-right coordinates, w.r.t the input-page width-height dimensions \t| EXTRA Plan \t|" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4ysCj8_GSrd8", + "colab_type": "text" + }, + "source": [ + "# 6. Make Corrections\n", + "\n", + "> **Objective**: To ease corrections on the most common issues with the `MakeCorrections` module.\n", + "\n", + "**Details:** The service relies on OCR (Optical Character Recognition) for character detection and deep learning models to detect tabular structures on the input. There may be a chance for merged rows or columns or incorrect type detections on low-quality inputs with a complex table layout or tightly packed columns. With those in mind, we want to offer the built-in service at the client-side to give control and ease in making corrections on the output. 
\n", + "\n", + "\n", + "The module, `MakeCorrections`, currently supports below functionalities\n", + "\n", + "| Functionality \t| Explanation \t|\n", + "|----------------------\t|------------------------------------------------\t|\n", + "| Split Merged Rows \t| Works well on cell values with no spaces \t|\n", + "| Split Merged Columns \t| Works well on cell values with no spaces \t|\n", + "| Fix Decimal Format \t| To fix thousand and decimal separators \t|\n", + "| Fix Date Format \t| To handle and modify incorrect date separators \t|" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "MpH284nxX2KJ", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# First things first lets import the module and prepare for corrections\n", + "\n", + "from ExtractTable.common import MakeCorrections\n", + "\n", + "corrections = MakeCorrections(et_resp=et_sess.server_response)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PKRntUqYXIEQ", + "colab_type": "text" + }, + "source": [ + "## 6.1 Split Merged Rows" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_MmSR0mLXS6x", + "colab_type": "code", + "colab": {} + }, + "source": [ + "corrected_table_dataframes = corrections.split_merged_rows()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "34YLZL3nXIU6", + "colab_type": "text" + }, + "source": [ + "## 6.2 Split Merged Columns\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "hzinneCWXVZ8", + "colab_type": "code", + "colab": {} + }, + "source": [ + "corrected_table_dataframes = corrections.split_merged_columns()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3XSBMo5rXIkC", + "colab_type": "text" + }, + "source": [ + "## 6.3 Fix Decimal Format\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "hK9nWOSfXXUc", + "colab_type": "code", + "colab": {} + }, + "source": [ + "corrected_table_dataframes = corrections.fix_decimal_format(decimal_separator=\".\", thousands_separator=\",\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mok2QbmFXIz9", + "colab_type": "text" + }, + "source": [ + "## 6.4 Fix Date Format\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "UFQNzPxUSqga", + "colab_type": "code", + "colab": {} + }, + "source": [ + "corrected_table_dataframes = corrections.fix_date_format(delimiter=\"/\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "I5-CGT3oy7KG", + "colab_type": "text" + }, + "source": [ + "# 7. Helpful Code Snippets\n", + "\n", + "Extra code snippets that are useful to perform some actions on the output. Based on the frequently asked questions." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ACcH7oUpMfFp", + "colab_type": "text" + }, + "source": [ + "## 7.1 Get text data" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_CVAYfnK_sTk", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# If your API Key supports \"Lines\"\n", + "\n", + "all_page_lines = []\n", + "for each_page in et_sess.Lines:\n", + " for each_line in each_page['LinesArray']:\n", + " all_page_lines.append(each_line['Line'])\n", + " \n", + "print(\"\\n\".join(all_page_lines))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6c0vnCiXM_FM", + "colab_type": "text" + }, + "source": [ + "## 7.2 All tables output to a single excel" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "VhOTglS-NIXN", + "colab_type": "code", + "colab": {} + }, + "source": [ + "table_data = et_sess.process_file(filepath=Location_of_PDF_with_Tables, output_format=\"df\", pages=\"all\")\n", + " \n", + "import pandas as pd\n", + "accumulate_all_dfs = pd.DataFrame()\n", + "\n", + "for each_df in table_data:\n", + " accumulate_all_dfs = accumulate_all_dfs.append(each_df, ignore_index=True)\n", + " # print(each_df.shape, accumulate_all_dfs.shape)\n", + "\n", + "print(\"Shape of all tables accumulated together is\", accumulate_all_dfs.shape)\n", + "\n", + "\n", + "output_excel_location = \n", + "# Save the accumulated output to a single excel file\n", + "accumulate_all_dfs.to_excel(output_excel_location, index=False, header=False)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ba8vIKuXdAPT", + "colab_type": "text" + }, + "source": [ + "# 8. Support & Contact\n", + "\n", + "Please do not hesitate to approach our developer team at pydevs@extracttable.com for any assitance needed or to report a bug" + ] } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ie9D1umMEv6D", - "colab_type": "text" - }, - "source": [ - "Default output is an array of pandas dataframes, with which you can change to any other format, follow https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "_CVAYfnK_sTk", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# If your API Key supports \"Lines\" - Sample to get Lines\n", - "\n", - "all_page_lines = []\n", - "for each_page in et_sess.Lines:\n", - " for each_line in each_page['LinesArray']:\n", - " all_page_lines.append(each_line['Line'])" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TYLfNBQ6bL64", - "colab_type": "text" - }, - "source": [ - "Play with the result:\n", - "- check the complete server response of the latest job with `et_sess.ServerResponse.json()`\n", - "- check out list of available output formats of table `ExtractTable._OUTPUT_FORMATS`\n", - "- Retrieve the result as long as the `JobId` is unexpired, usually stays for 24 hours\n", - " - ```job_output = et_sess.get_result(job_id=JobID_HERE)```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": false, - "id": "yRsqFlIvB4-D", - "colab_type": "text" - }, - "source": [ - "## Social Media\n", - "Follow us on Social media for library updates and free credits.\n", - "\n", - "[![Image](https://cdn3.iconfinder.com/data/icons/socialnetworking/32/linkedin.png)](https://www.linkedin.com/company/extracttable)\n", - "    \n", - 
"[![Image](https://abs.twimg.com/favicons/twitter.ico)](https://twitter.com/extracttable)" - ] - } - ] + ] } \ No newline at end of file