From f71441b0d5e0034bf4bf19baff862ead4f1ee6ff Mon Sep 17 00:00:00 2001 From: John Lyu Date: Fri, 1 Sep 2023 15:16:01 +0800 Subject: [PATCH] Use Modern backend (#96) * add html2image and playwright backend * bump lowest version to 3.8 * ensure test output path * update readme * remove ChromeController dependency * increase waiting time in notebook convert --- .github/workflows/python-package.yml | 4 +- README.md | 75 +++++-- dataframe_image/_browser_pdf.py | 10 +- dataframe_image/_convert.py | 37 ++-- dataframe_image/_html2image.py | 59 ----- dataframe_image/_pandas_accessor.py | 81 ++++--- dataframe_image/converter/__init__.py | 2 + dataframe_image/converter/browser/__init__.py | 4 + dataframe_image/converter/browser/base.py | 202 ++++++++++++++++++ .../browser/chrome_converter.py} | 162 ++++---------- .../converter/browser/html2image_converter.py | 47 ++++ .../converter/browser/playwright_converter.py | 47 ++++ .../browser/selenium_converter.py} | 33 +-- .../browser}/static/download.html | 0 .../{ => converter/browser}/static/fail.html | 0 .../{ => converter/browser}/static/form.html | 0 .../{ => converter/browser}/static/style.css | 2 +- .../matplotlib_table.py} | 6 +- dataframe_image/logger.py | 3 + setup.py | 8 +- tests/__init__.py | 1 + tests/conftest.py | 13 ++ tests/delete_test_results.py | 19 -- tests/test_convert.py | 10 +- tests/test_df_image.py | 132 ++++++------ tests/test_output/README.md | 1 - 26 files changed, 582 insertions(+), 376 deletions(-) delete mode 100644 dataframe_image/_html2image.py create mode 100644 dataframe_image/converter/__init__.py create mode 100644 dataframe_image/converter/browser/__init__.py create mode 100644 dataframe_image/converter/browser/base.py rename dataframe_image/{_screenshot.py => converter/browser/chrome_converter.py} (53%) create mode 100644 dataframe_image/converter/browser/html2image_converter.py create mode 100644 dataframe_image/converter/browser/playwright_converter.py rename 
dataframe_image/{selenium_screenshot.py => converter/browser/selenium_converter.py} (72%) rename dataframe_image/{ => converter/browser}/static/download.html (100%) rename dataframe_image/{ => converter/browser}/static/fail.html (100%) rename dataframe_image/{ => converter/browser}/static/form.html (100%) rename dataframe_image/{ => converter/browser}/static/style.css (96%) rename dataframe_image/{_matplotlib_table.py => converter/matplotlib_table.py} (98%) create mode 100644 dataframe_image/logger.py create mode 100644 tests/conftest.py delete mode 100644 tests/delete_test_results.py delete mode 100644 tests/test_output/README.md diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index dc368d3..c7c6dc3 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -17,7 +17,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, windows-latest, macos-latest] - python-version: ["3.7", "3.8", "3.9", "3.10"] + python-version: ["3.8", "3.9", "3.10", "3.11"] include: - os: ubuntu-latest pippath: ~/.cache/pip @@ -104,7 +104,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip wheel - pip install pytest matplotlib selenium jupyter pandoc + pip install pytest matplotlib selenium jupyter pandoc playwright pip install . --upgrade - name: mac nbconvert patch fix # this is a tmp fix, related to https://github.com/jupyter/nbconvert/issues/1773 if: ${{ startsWith(matrix.os, 'macos') }} diff --git a/README.md b/README.md index ca38d0e..c788e2f 100644 --- a/README.md +++ b/README.md @@ -2,22 +2,10 @@ [![](https://img.shields.io/pypi/v/dataframe_image)](https://pypi.org/project/dataframe_image) [![PyPI - License](https://img.shields.io/pypi/l/dataframe_image)](LICENSE) +[![Python Version](https://img.shields.io/pypi/pyversions/dataframe_image)](https://pypi.org/project/dataframe_image) +A package to convert pandas DataFrames as images. 
-A package to convert Jupyter Notebooks to PDF and/or Markdown embedding pandas DataFrames as images. - -## Overview - -When converting Jupyter Notebooks to pdf using nbconvert, pandas DataFrames appear as either raw text or as simple LaTeX tables. The left side of the image below shows this representation. - -![png](https://github.com/dexplo/dataframe_image/raw/gh-pages/images/dataframe_image_compare.png) - -This package was first created to embed DataFrames into pdf and markdown documents as images so that they appear exactly as they do in Jupyter Notebooks, as seen from the right side of the image above. It has since added much more functionality. - -## Usage - -Upon installation, the option `DataFrame as Image (PDF or Markdown)` will appear in the menu `File -> Download as`. Clicking this option will open up a new browser tab with a short form to be completed. - -![png](https://github.com/dexplo/dataframe_image/raw/gh-pages/images/form.png) +It can also convert Jupyter Notebooks to PDF and/or Markdown, embedding DataFrames as images. ### Exporting individual DataFrames @@ -39,6 +27,21 @@ Here, an example of how exporting a DataFrame would look like in a notebook. ![png](https://github.com/dexplo/dataframe_image/raw/gh-pages/images/dfi_export.png) +### Export Jupyter Notebook + +When converting Jupyter Notebooks to pdf using nbconvert, pandas DataFrames appear as either raw text or as simple LaTeX tables. The left side of the image below shows this representation. + +![png](https://github.com/dexplo/dataframe_image/raw/gh-pages/images/dataframe_image_compare.png) + +This package was first created to embed DataFrames into pdf and markdown documents as images so that they appear exactly as they do in Jupyter Notebooks, as seen from the right side of the image above. It has since added much more functionality. + +#### Usage + +Upon installation, the option `DataFrame as Image (PDF or Markdown)` will appear in the menu `File -> Download as`. 
Clicking this option will open up a new browser tab with a short form to be completed. + +![png](https://github.com/dexplo/dataframe_image/raw/gh-pages/images/form.png) + + ## Installation Install with either: @@ -46,6 +49,48 @@ Install with either: * `pip install dataframe_image` * `conda install -c conda-forge dataframe_image` +## Configuration + +### table_conversion + +When converting a dataframe to an image, we provide two kinds of backend: browser or matplotlib. The default is browser, but you can change it by setting the `table_conversion` parameter to `'matplotlib'`. + +The major difference between these two backends is that the browser backend will render the dataframe as it is in the notebook, while the matplotlib backend can work without a browser, can export all image formats, e.g. `svg`, and will be extremely fast. But currently matplotlib can only simulate header and cells; `set_caption` will not work. + +```python +dfi.export(df.style.background_gradient(), "df_style.png", table_conversion="matplotlib") +``` + +#### Browser backend + +Currently we provide 4 different browser backend libraries: `playwright`, `html2image`, `selenium` and `chrome`. The default is `chrome`. + +`chrome` converts the image with your local Chromium-based browser via the command line. + +`html2image` is a backup method for `chrome`, which uses `html2image`. + +`playwright` is a much more stable method, but you have to install playwright first. + +`selenium` is a method that uses the `Firefox` driver. Sometimes Chrome makes breaking changes which break the methods above; `Firefox` is a good backup. It is not stable and is hard to install, but it can be installed in Google Colab. 
+ +### Other parameters + +```python +dfi.export( + obj: pd.DataFrame, + filename, + fontsize=14, + max_rows=None, + max_cols=None, + table_conversion: Literal[ + "chrome", "matplotlib", "html2image", "playwright", "selenium" + ] = "chrome", + chrome_path=None, + dpi=None, # enlarge your image,default is 100,set it larger will get a larger image + use_mathjax=False, # enable mathjax support, which means you can use latex in your dataframe +) +``` + ## PDF Conversion - LaTeX vs Chrome Browser By default, conversion to pdf happens via LaTeX, which you must have pre-installed on your machine. If you do not have the correct LaTeX installation, you'll need to select the Chrome Browser option to make the conversion. diff --git a/dataframe_image/_browser_pdf.py b/dataframe_image/_browser_pdf.py index bd0e38e..9b9ca2e 100644 --- a/dataframe_image/_browser_pdf.py +++ b/dataframe_image/_browser_pdf.py @@ -10,11 +10,10 @@ from tempfile import TemporaryDirectory, mkstemp import aiohttp -import ChromeController from nbconvert import TemplateExporter from nbconvert.exporters import Exporter, HTMLExporter -from ._screenshot import get_chrome_path +from .converter.browser.chrome_converter import get_chrome_path async def handler(ws, data, key=None): @@ -62,7 +61,7 @@ async def main(file_name, p): frameId = await handler(ws, data, "frameId") # second - enable page - # await asyncio.sleep(1) + await asyncio.sleep(1) data = {"id": 2, "method": "Page.enable"} await handler(ws, data) @@ -72,14 +71,16 @@ async def main(file_name, p): await handler(ws, data, "content") # fourth - get pdf + prev_len = 0 for _ in range(10): await asyncio.sleep(1) params = {"displayHeaderFooter": False, "printBackground": True} data = {"id": 4, "method": "Page.printToPDF", "params": params} pdf_data = await handler(ws, data, "data") pdf_data = base64.b64decode(pdf_data) - if len(pdf_data) > 1000: + if len(pdf_data) > 1000 and len(pdf_data) == prev_len: break + prev_len = len(pdf_data) else: raise 
TimeoutError("Could not get pdf data") return pdf_data @@ -131,6 +132,7 @@ def get_pdf_data(file_name): def get_pdf_data_chromecontroller(file_name): + import ChromeController additional_options = get_launch_args() # ChromeContext will shlex.split binary, so add quote to it with ChromeController.ChromeContext( diff --git a/dataframe_image/_convert.py b/dataframe_image/_convert.py index 2537048..74d2a4a 100644 --- a/dataframe_image/_convert.py +++ b/dataframe_image/_convert.py @@ -7,9 +7,9 @@ import tempfile import time import urllib.parse +import warnings from pathlib import Path from tempfile import TemporaryDirectory -import warnings import nbformat from nbconvert import MarkdownExporter, PDFExporter @@ -25,6 +25,7 @@ _logger = logging.getLogger(__name__) + class Converter: KINDS = ["pdf", "md"] DISPLAY_DATA_PRIORITY = [ @@ -190,26 +191,26 @@ def get_resources(self): if self.table_conversion == "html2image": pass elif self.table_conversion == "chrome": - from ._screenshot import Screenshot + from .converter.browser.chrome_converter import ChromeConverter - converter = Screenshot( + converter = ChromeConverter( center_df=self.center_df, max_rows=self.max_rows, max_cols=self.max_cols, chrome_path=self.chrome_path, ).run elif self.table_conversion == "selenium": - from .selenium_screenshot import SeleniumScreenshot + from .converter.browser.selenium_converter import SeleniumConverter - converter = SeleniumScreenshot( + converter = SeleniumConverter( center_df=self.center_df, max_rows=self.max_rows, max_cols=self.max_cols, ).run else: - from ._matplotlib_table import TableMaker + from .converter.matplotlib_table import MatplotlibTableConverter - converter = TableMaker(fontsize=22).run + converter = MatplotlibTableConverter(fontsize=22).run resources = { "metadata": {"path": str(self.nb_home), "name": self.document_name}, @@ -295,15 +296,15 @@ def to_pdf_latex(self): # get long path name of self.td temp_dir = Path(self.td.name).resolve() self.resources["temp_dir"] = 
temp_dir - print("TEMP_DIR", temp_dir) # TODO just for debug + print("TEMP_DIR", temp_dir) # TODO just for debug MarkdownHTTPPreprocessor().preprocess(self.nb, self.resources) for filename, image_data in self.resources["image_data_dict"].items(): fn_pieces = filename.split("_") cell_idx = int(fn_pieces[1]) ext = fn_pieces[-1].split(".")[-1] - new_filename = str(temp_dir / filename) - print(new_filename) # TODO just for debug + new_filename = str(temp_dir / filename) + print(new_filename) # TODO just for debug # extract first image from gif and use as png for latex pdf if ext == "gif": @@ -328,7 +329,9 @@ def to_pdf_latex(self): try: pdf_data, self.resources = pdf.from_notebook_node(self.nb, self.resources) except Exception as ex: - latex, _ = super(PDFExporter, pdf).from_notebook_node(self.nb, self.resources) + latex, _ = super(PDFExporter, pdf).from_notebook_node( + self.nb, self.resources + ) _logger.error("nbconvert failed to create PDF via latex \n\n{latex}") with open("notebook.tex", "w", encoding="utf-8") as f: f.write(latex) @@ -374,11 +377,13 @@ def convert(self): # Step 2: if exporting as pdf with browser, do this first # as it requires no other preprocessing if "pdf_browser" in self.to: - warnings.warn("to pdf_browser method is deprecated" - "We suggest using nbconvert, install it using `pip install nbconvert[webpdf]`" - "and then run" - "`jupyter nbconvert --to WebPDF --allow-chromium-download notebook.ipynb`" - , DeprecationWarning) + warnings.warn( + "to pdf_browser method is deprecated" + "We suggest using nbconvert, install it using `pip install nbconvert[webpdf]`" + "and then run" + "`jupyter nbconvert --to WebPDF --allow-chromium-download notebook.ipynb`", + DeprecationWarning, + ) self.to_pdf_browser() if "md" in self.to or "pdf_latex" in self.to: diff --git a/dataframe_image/_html2image.py b/dataframe_image/_html2image.py deleted file mode 100644 index 611e1f7..0000000 --- a/dataframe_image/_html2image.py +++ /dev/null @@ -1,59 +0,0 @@ -import 
base64 -from pathlib import Path - - -class Html2ImageConverter: - def __init__( - self, - center_df=True, - max_rows=None, - max_cols=None, - chrome_path=None, - fontsize=18, - encode_base64=True, - limit_crop=True, - device_scale_factor=1, - ): - self.center_df = center_df - self.max_rows = max_rows - self.max_cols = max_cols - self.chrome_path = chrome_path - self.fontsize = fontsize - self.encode_base64 = encode_base64 - self.limit_crop = limit_crop - self.device_scale_factor = device_scale_factor - - def get_css(self): - mod_dir = Path(__file__).resolve().parent - css_file = mod_dir / "static" / "style.css" - with open(css_file) as f: - css = "" - justify = "center" if self.center_df else "left" - css = css.format(fontsize=self.fontsize, justify=justify) - return css - - def run(self, html): - from html2image import Html2Image - - css = self.get_css() - # use folder under home directory to avoid permission issues - wd = Path.home() / ".cache" / "html2image" - wd.mkdir(parents=True, exist_ok=True) - hti = Html2Image( - browser_executable=self.chrome_path, output_path=wd, temp_path=str(wd) - ) - hti.browser.flags = [ - f"--force-device-scale-factor={self.device_scale_factor}", - "--disable-gpu", - "--hide-scrollbars", - ] - outpaths = hti.screenshot(html_str=html, css_str=css, size=(9000, 900)) - image_bytes = self.finalize_image(outpaths[0]) - return image_bytes - - def finalize_image(self, image_path) -> bytes: - with open(image_path, "rb") as f: - img_bytes = f.read() - if self.encode_base64: - img_bytes = base64.b64encode(img_bytes) - return img_bytes diff --git a/dataframe_image/_pandas_accessor.py b/dataframe_image/_pandas_accessor.py index 6badd08..cdccdb1 100644 --- a/dataframe_image/_pandas_accessor.py +++ b/dataframe_image/_pandas_accessor.py @@ -1,9 +1,17 @@ from pathlib import Path +from typing import Literal + import pandas as pd from pandas.io.formats.style import Styler +from PIL import Image -from ._screenshot import Screenshot -from .pd_html 
import styler2html +from dataframe_image.converter import ( + ChromeConverter, + Html2ImageConverter, + PlayWrightConverter, + SeleniumConverter, +) +from dataframe_image.pd_html import styler2html MAX_COLS = 30 MAX_ROWS = 100 @@ -24,7 +32,7 @@ def export( chrome_path=None, dpi=None, ): - return _export( + return export( self._df, filename, fontsize, @@ -36,30 +44,31 @@ def export( ) +BROWSER_CONVERTER_DICT = { + "chrome": ChromeConverter, + "selenium": SeleniumConverter, + "html2image": Html2ImageConverter, + "playwright": PlayWrightConverter, +} + + def export( - obj, + obj: pd.DataFrame, filename, fontsize=14, max_rows=None, max_cols=None, - table_conversion="chrome", + table_conversion: Literal[ + "chrome", "matplotlib", "html2image", "playwright", "selenium" + ] = "chrome", chrome_path=None, dpi=None, -): - return _export( - obj, filename, fontsize, max_rows, max_cols, table_conversion, chrome_path, dpi - ) - - -def _export( - obj: pd.DataFrame, filename, fontsize, max_rows, max_cols, table_conversion, chrome_path, dpi + use_mathjax=False, ): is_styler = isinstance(obj, Styler) df = obj.data if is_styler else obj - if table_conversion == "html2image": - from ._html2image import Html2ImageConverter - - converter = Html2ImageConverter( + if table_conversion in BROWSER_CONVERTER_DICT: + converter = BROWSER_CONVERTER_DICT[table_conversion]( max_rows=max_rows, max_cols=max_cols, chrome_path=chrome_path, @@ -67,37 +76,21 @@ def _export( encode_base64=False, limit_crop=False, device_scale_factor=(1 if dpi == None else dpi / 100.0), + use_mathjax=use_mathjax, ).run - elif table_conversion == "chrome": - converter = Screenshot( - max_rows=max_rows, - max_cols=max_cols, - chrome_path=chrome_path, - fontsize=fontsize, - encode_base64=False, - limit_crop=False, - device_scale_factor=(1 if dpi == None else dpi / 100.0), - ).run - elif table_conversion == "selenium": - from .selenium_screenshot import SeleniumScreenshot - - converter = SeleniumScreenshot( - 
max_rows=max_rows, - max_cols=max_cols, - fontsize=fontsize, - encode_base64=False, - limit_crop=False, - device_scale_factor=(1 if dpi == None else dpi / 100.0), - ).run - else: - from ._matplotlib_table import TableMaker + from .converter.matplotlib_table import MatplotlibTableConverter + # get extension from filename without dot extension = Path(filename).suffix if extension.startswith("."): extension = extension[1:] - converter = TableMaker( - fontsize=fontsize, encode_base64=False, for_document=False, savefig_dpi=dpi, format=extension + converter = MatplotlibTableConverter( + fontsize=fontsize, + encode_base64=False, + for_document=False, + savefig_dpi=dpi, + format=extension, ).run if df.shape[0] > MAX_ROWS and max_rows is None: @@ -144,7 +137,11 @@ def _export( else: html = obj.to_html(max_rows=max_rows, max_cols=max_cols, notebook=True) + pre_limit = Image.MAX_IMAGE_PIXELS + Image.MAX_IMAGE_PIXELS = None img_str = converter(html) + # swap back to original value + Image.MAX_IMAGE_PIXELS = pre_limit try: with open(filename, "wb") as f: diff --git a/dataframe_image/converter/__init__.py b/dataframe_image/converter/__init__.py new file mode 100644 index 0000000..d5a8ac6 --- /dev/null +++ b/dataframe_image/converter/__init__.py @@ -0,0 +1,2 @@ +from .browser import * +from .matplotlib_table import MatplotlibTableConverter diff --git a/dataframe_image/converter/browser/__init__.py b/dataframe_image/converter/browser/__init__.py new file mode 100644 index 0000000..81eaa15 --- /dev/null +++ b/dataframe_image/converter/browser/__init__.py @@ -0,0 +1,4 @@ +from .chrome_converter import ChromeConverter +from .html2image_converter import Html2ImageConverter +from .playwright_converter import PlayWrightConverter +from .selenium_converter import SeleniumConverter diff --git a/dataframe_image/converter/browser/base.py b/dataframe_image/converter/browser/base.py new file mode 100644 index 0000000..4080bca --- /dev/null +++ b/dataframe_image/converter/browser/base.py @@ 
-0,0 +1,202 @@ +import base64 +import io +import logging +import subprocess +from abc import ABC +from pathlib import Path +from tempfile import TemporaryDirectory + +import numpy as np +from PIL import Image, ImageOps + +from dataframe_image.pd_html import styler2html + +_logger = logging.getLogger(__name__) + + +class BrowserConverter(ABC): + MAX_IMAGE_SIZE = 65535 + + def __init__( + self, + center_df: bool = True, + max_rows: int = None, + max_cols: int = None, + chrome_path: str = None, + fontsize: int = 18, + encode_base64: bool = True, + limit_crop: bool = True, + device_scale_factor: int = 1, + use_mathjax: bool = False, + ): + """ + Initialize the Html2ImageConverter class. + + Args: + center_df (bool): Whether to center the dataframe. Default is True. + max_rows (int): Maximum number of rows. Default is None. + max_cols (int): Maximum number of columns. Default is None. + chrome_path (str): Path to the Chrome executable. Default is None. + fontsize (int): Font size. Default is 18. + encode_base64 (bool): Whether to encode the image in base64. Default is True. + limit_crop (bool): Whether to limit the crop. Default is True. + device_scale_factor (int): Device scale factor. Default is 1. + use_mathjax (bool): Whether to use MathJax for rendering. Default is False. + """ + self.center_df = center_df + self.max_rows = max_rows + self.max_cols = max_cols + self.chrome_path = chrome_path + self.fontsize = fontsize + self.encode_base64 = encode_base64 + self.limit_crop = limit_crop + self.device_scale_factor = device_scale_factor + self.use_mathjax = use_mathjax + + def get_css(self) -> str: + """ + Get the CSS for the HTML. + + Returns: + str: The CSS string. 
+ """ + mod_dir = Path(__file__).resolve().parent + css_file = mod_dir / "static" / "style.css" + with open(css_file) as f: + css = "" + justify = "center" if self.center_df else "left" + css = css.format(fontsize=self.fontsize, justify=justify) + if self.use_mathjax: + script = """ + + + """ + css += script + return css + + def should_enlarge(self, img: Image, ss_width: int, ss_height: int) -> tuple: + """ + Check if the image should be enlarged. + + Args: + img (Image): The image to check. + ss_width (int): The screenshot width. + ss_height (int): The screenshot height. + + Returns: + tuple: A tuple containing a boolean indicating whether to enlarge the image, and the new width and height. + """ + enlarge = False + im_ndarray = np.array(img) + img2d = im_ndarray.mean(axis=2) == 255 + + all_white_vert = img2d.all(axis=0) + # must be all white for 30 pixels in a row to trigger stop + if all_white_vert[-30:].sum() != 30: + ss_width = int(ss_width * 1.5) + enlarge = True + + all_white_horiz = img2d.all(axis=1) + if all_white_horiz[-30:].sum() != 30: + ss_height = int(ss_height * 1.5) + enlarge = True + + return enlarge, ss_width, ss_height + + def screenshot( + self, html: str, ss_width: int = 1920, ss_height: int = 1080 + ) -> Image: + """ + Take a screenshot of the HTML. + + Args: + html (str): The HTML to screenshot. + ss_width (int): The screenshot width. Default is 1920. + ss_height (int): The screenshot height. Default is 1080. + + Returns: + Image: The screenshot image. + """ + raise NotImplementedError + + def crop(self, im: Image) -> Image: + """ + Crop the image. + + Args: + im (Image): The image to crop. + + Returns: + Image: The cropped image. 
+ """ + # remove black + imrgb = im.convert("RGB") + imageBox = imrgb.getbbox() + im = im.crop(imageBox) + + # remove alpha channel + imrgb = im.convert("RGB") + # invert image (so that white is 0) + invert_im = ImageOps.invert(imrgb) + imageBox = invert_im.getbbox() + cropped = im.crop(imageBox) + return cropped + + def run(self, html: str) -> bytes: + """ + Run the converter on the HTML. + + Args: + html (str): The HTML to convert. + + Returns: + bytes: The converted image bytes. + """ + im = self.screenshot(html) + temp_img = self.crop(im) + image_bytes = self.finalize_image(temp_img) + return image_bytes + + def finalize_image(self, img: Image) -> bytes: + """ + Finalize the image. + + Args: + img (Image): The image to finalize. + + Returns: + bytes: The finalized image bytes. + """ + buffer = io.BytesIO() + img.save(buffer, format="png") + img_str = buffer.getvalue() + if self.encode_base64: + img_str = base64.b64encode(img_str).decode() + return img_str + + def repr_png_wrapper(self): + from pandas.io.formats.style import Styler + + ss = self + + def _repr_png_(self): + if isinstance(self, Styler): + html = styler2html(self) + else: + html = self.to_html( + max_rows=ss.max_rows, max_cols=ss.max_cols, notebook=True + ) + return ss.run(html) + + return _repr_png_ diff --git a/dataframe_image/_screenshot.py b/dataframe_image/converter/browser/chrome_converter.py similarity index 53% rename from dataframe_image/_screenshot.py rename to dataframe_image/converter/browser/chrome_converter.py index 3a8aae6..159f0dc 100644 --- a/dataframe_image/_screenshot.py +++ b/dataframe_image/converter/browser/chrome_converter.py @@ -1,6 +1,4 @@ -import base64 import io -import logging import os import platform import shutil @@ -8,14 +6,10 @@ from pathlib import Path from tempfile import TemporaryDirectory -import numpy as np -from PIL import Image, ImageOps +from PIL import Image -from .pd_html import styler2html - -_logger = logging.getLogger(__name__) - -MAX_IMAGE_SIZE = 65535 
+from dataframe_image.converter.browser.base import BrowserConverter +from dataframe_image.logger import logger def get_system(): @@ -80,39 +74,37 @@ def get_chrome_path(chrome_path=None): raise OSError("Cannot find chrome.exe on your windows machine") -class Screenshot: +class ChromeConverter(BrowserConverter): def __init__( self, - center_df=True, - max_rows=None, - max_cols=None, - chrome_path=None, - fontsize=18, - encode_base64=True, - limit_crop=True, - device_scale_factor=1, + center_df: bool = True, + max_rows: int = None, + max_cols: int = None, + chrome_path: str = None, + fontsize: int = 18, + encode_base64: bool = True, + limit_crop: bool = True, + device_scale_factor: int = 1, + use_mathjax: bool = False, ): - self.center_df = center_df - self.max_rows = max_rows - self.max_cols = max_cols + super().__init__( + center_df, + max_rows, + max_cols, + chrome_path, + fontsize, + encode_base64, + limit_crop, + device_scale_factor, + use_mathjax, + ) self.chrome_path = get_chrome_path(chrome_path) - self.fontsize = fontsize - self.encode_base64 = encode_base64 - self.limit_crop = limit_crop - self.device_scale_factor = device_scale_factor - def get_css(self): - mod_dir = Path(__file__).resolve().parent - css_file = mod_dir / "static" / "style.css" - with open(css_file) as f: - css = "" - justify = "center" if self.center_df else "left" - css = css.format(fontsize=self.fontsize, justify=justify) - return css - - def take_screenshot(self, ss_width=1400, ss_height=900): - html_css = self.get_css() + self.html - with TemporaryDirectory() as temp_dir: + def screenshot(self, html, ss_width=1400, ss_height=900) -> Image: + html_css = self.get_css() + html + # create temp dir under current user home dir + # snap version Chrome only allow to access files under home dir + with TemporaryDirectory(dir=Path.home()) as temp_dir: temp_html = Path(temp_dir) / "temp.html" temp_img = Path(temp_dir) / "temp.png" with open(temp_html, "w", encoding="utf-8") as f: @@ -143,89 
+135,23 @@ def take_screenshot(self, ss_width=1400, ss_height=900): str(temp_html), ] - self.generate_image_from_html(args) + subprocess.run( + executable=self.chrome_path, args=args, capture_output=True, check=True + ) with open(temp_img, "rb") as f: bio = io.BytesIO(f.read()) im = Image.open(bio) - return self.possibly_enlarge(im, ss_width, ss_height) - - def generate_image_from_html(self, args): - # print(self.chrome_path) - subprocess.run( - executable=self.chrome_path, args=args, capture_output=True, check=True - ) - - def possibly_enlarge(self, img, ss_width, ss_height): - enlarge = False - im_ndarray = np.array(img) - img2d = im_ndarray.mean(axis=2) == 255 - - all_white_vert = img2d.all(axis=0) - # must be all white for 30 pixels in a row to trigger stop - if all_white_vert[-30:].sum() != 30: - ss_width = int(ss_width * 1.5) - enlarge = True - - all_white_horiz = img2d.all(axis=1) - if all_white_horiz[-30:].sum() != 30: - ss_height = int(ss_height * 1.5) - enlarge = True - - if enlarge: - if ss_height < MAX_IMAGE_SIZE and ss_width < MAX_IMAGE_SIZE: - return self.take_screenshot(ss_width, ss_height) - else: - _logger.warning( - f"""Unable to enlarge image with Chrome, it is a known bug with version 111 and 112 - You could try to install an individual Chrome dev version and set chrome_path to it - or try 'df.dfi.export('df.png', table_conversion="selenium")'""" - ) - - return self.crop(img) - - def crop(self, im): - # remove black - imrgb = im.convert("RGB") - imageBox = imrgb.getbbox() - im = im.crop(imageBox) - - # remove alpha channel - imrgb = im.convert("RGB") - # invert image (so that white is 0) - invert_im = ImageOps.invert(imrgb) - imageBox = invert_im.getbbox() - cropped = im.crop(imageBox) - return cropped - - def finalize_image(self, img): - buffer = io.BytesIO() - img.save(buffer, format="png") - img_str = buffer.getvalue() - if self.encode_base64: - img_str = base64.b64encode(img_str).decode() - return img_str - - def run(self, html): - self.html 
= html - img = self.take_screenshot() - img_str = self.finalize_image(img) - return img_str - - def repr_png_wrapper(self): - from pandas.io.formats.style import Styler - - ss = self - - def _repr_png_(self): - if isinstance(self, Styler): - html = styler2html(self) - else: - html = self.to_html( - max_rows=ss.max_rows, max_cols=ss.max_cols, notebook=True - ) - return ss.run(html) - - return _repr_png_ + enlarge, ss_width, ss_height = self.should_enlarge(im, ss_width, ss_height) + if enlarge: + if ss_height < self.MAX_IMAGE_SIZE and ss_width < self.MAX_IMAGE_SIZE: + return self.screenshot(html, ss_width, ss_height) + else: + logger.warning( + f"""Unable to enlarge image with Chrome, it is a known bug with version 111 and 112 + You could try to install an individual Chrome dev version and set chrome_path to it + or try 'df.dfi.export('df.png', table_conversion="selenium")'""" + ) + return im def make_repr_png(center_df=True, max_rows=30, max_cols=10, chrome_path=None): @@ -253,5 +179,5 @@ def make_repr_png(center_df=True, max_rows=30, max_cols=10, chrome_path=None): Path to your machine's chrome executable. When `None`, it is automatically found. Use this when chrome is not automatically found. 
""" - ss = Screenshot(center_df, max_rows, max_cols, chrome_path) + ss = ChromeConverter(center_df, max_rows, max_cols, chrome_path) return ss.repr_png_wrapper() diff --git a/dataframe_image/converter/browser/html2image_converter.py b/dataframe_image/converter/browser/html2image_converter.py new file mode 100644 index 0000000..063ac8d --- /dev/null +++ b/dataframe_image/converter/browser/html2image_converter.py @@ -0,0 +1,47 @@ +import io +from pathlib import Path + +from PIL import Image + +from dataframe_image.logger import logger + +from .base import BrowserConverter + + +class Html2ImageConverter(BrowserConverter): + def screenshot( + self, html: str, ss_width: int = 1920, ss_height: int = 1080 + ) -> Image: + from html2image import Html2Image + + css = self.get_css() + # use folder under home directory to avoid permission issues + # snap version Chrome can only access files under home dir + wd = Path.home() / ".cache" / "html2image" + wd.mkdir(parents=True, exist_ok=True) + hti = Html2Image( + browser_executable=self.chrome_path, output_path=wd, temp_path=str(wd) + ) + hti.browser.flags = [ + f"--force-device-scale-factor={self.device_scale_factor}", + "--disable-gpu", + "--hide-scrollbars", + ] + outpaths = hti.screenshot( + html_str=html, css_str=css, size=(ss_width, ss_height) + ) + temp_img = outpaths[0] + with open(temp_img, "rb") as f: + bio = io.BytesIO(f.read()) + im = Image.open(bio) + enlarge, ss_width, ss_height = self.should_enlarge(im, ss_width, ss_height) + if enlarge: + if ss_height < self.MAX_IMAGE_SIZE and ss_width < self.MAX_IMAGE_SIZE: + return self.screenshot(html, ss_width, ss_height) + else: + logger.warning( + f"""Unable to enlarge image with Chrome, it is a known bug with version 111 and 112 + You could try to install an individual Chrome dev version and set chrome_path to it + or try 'df.dfi.export('df.png', table_conversion="selenium")'""" + ) + return im diff --git a/dataframe_image/converter/browser/playwright_converter.py 
b/dataframe_image/converter/browser/playwright_converter.py new file mode 100644 index 0000000..718de04 --- /dev/null +++ b/dataframe_image/converter/browser/playwright_converter.py @@ -0,0 +1,47 @@ +from io import BytesIO + +from PIL import Image + +from dataframe_image.logger import logger + +from .base import BrowserConverter + + +class PlayWrightConverter(BrowserConverter): + def screenshot(self, html): + try: + from playwright.sync_api import Error, sync_playwright + except ImportError as ex: + raise ImportError( + "Playwright is not installed. Install it with 'pip install playwright' and make sure you have a chromium browser installed." + ) from ex + with sync_playwright() as p: + channels = ["chrome", "msedge", None] + for c in channels: + try: + browser = p.chromium.launch(channel=c) + break + except Error: + pass + else: + raise Error( + "Could not find any chromium based browser. Make sure you have a chromium browser installed." + "Or install it by `playwright install chromium`" + ) + + context = browser.new_context(device_scale_factor=self.device_scale_factor) + page = context.new_page() + page.set_content(self.get_css() + html) + if self.use_mathjax: + mj = page.locator("mjx-container math") + try: + mj.wait_for(timeout=10000) + except Error: + logger.warning( + "MathJax did not render in time. Formula in dataframe may not be rendered correctly." 
+ ) + pass + page.wait_for_timeout(200) + screenshot_bytes = page.screenshot(full_page=True) + im = Image.open(BytesIO(screenshot_bytes)) + return im diff --git a/dataframe_image/selenium_screenshot.py b/dataframe_image/converter/browser/selenium_converter.py similarity index 72% rename from dataframe_image/selenium_screenshot.py rename to dataframe_image/converter/browser/selenium_converter.py index 17a5f06..d691128 100644 --- a/dataframe_image/selenium_screenshot.py +++ b/dataframe_image/converter/browser/selenium_converter.py @@ -3,40 +3,21 @@ from tempfile import TemporaryDirectory from PIL import Image -from selenium.webdriver.firefox.service import Service -from ._screenshot import Screenshot +from .base import BrowserConverter -_logger = logging.getLogger(__name__) - -class SeleniumScreenshot(Screenshot): - def __init__( - self, - center_df=True, - max_rows=None, - max_cols=None, - fontsize=18, - encode_base64=True, - limit_crop=True, - device_scale_factor=1, - ): - self.center_df = center_df - self.max_rows = max_rows - self.max_cols = max_cols - self.fontsize = fontsize - self.encode_base64 = encode_base64 - self.limit_crop = limit_crop - self.device_scale_factor = device_scale_factor - - def take_screenshot(self): +class SeleniumConverter(BrowserConverter): + def screenshot(self, html: str) -> Image: # by default Firefox will cleanup it's profile directory after closing # so we need to set ignore_cleanup_errors=True + temp_dir_obj = TemporaryDirectory(prefix="dataframe_image_") temp_dir = temp_dir_obj.name try: import selenium.common import selenium.webdriver + from selenium.webdriver.firefox.service import Service options = selenium.webdriver.FirefoxOptions() options.add_argument("--headless") @@ -57,7 +38,7 @@ def take_screenshot(self): temp_html = Path(temp_dir) / "temp.html" temp_img = Path(temp_dir) / "temp.png" with open(temp_html, "w", encoding="utf-8") as f: - f.write(self.get_css() + self.html) + f.write(self.get_css() + html) with 
selenium.webdriver.Firefox(options=options, service=service) as driver: driver.get(f"file://{str(temp_html)}") # selenium will do the rest @@ -77,4 +58,4 @@ def take_screenshot(self): temp_dir_obj.cleanup() except OSError: pass - return self.crop(img) + return img diff --git a/dataframe_image/static/download.html b/dataframe_image/converter/browser/static/download.html similarity index 100% rename from dataframe_image/static/download.html rename to dataframe_image/converter/browser/static/download.html diff --git a/dataframe_image/static/fail.html b/dataframe_image/converter/browser/static/fail.html similarity index 100% rename from dataframe_image/static/fail.html rename to dataframe_image/converter/browser/static/fail.html diff --git a/dataframe_image/static/form.html b/dataframe_image/converter/browser/static/form.html similarity index 100% rename from dataframe_image/static/form.html rename to dataframe_image/converter/browser/static/form.html diff --git a/dataframe_image/static/style.css b/dataframe_image/converter/browser/static/style.css similarity index 96% rename from dataframe_image/static/style.css rename to dataframe_image/converter/browser/static/style.css index c1fc92b..e956f90 100644 --- a/dataframe_image/static/style.css +++ b/dataframe_image/converter/browser/static/style.css @@ -1,7 +1,7 @@ table {{ background-color: transparent; font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; - border:none; + border: 1px solid #f5f5f5; border-collapse: collapse; border-spacing:0; color:black; diff --git a/dataframe_image/_matplotlib_table.py b/dataframe_image/converter/matplotlib_table.py similarity index 98% rename from dataframe_image/_matplotlib_table.py rename to dataframe_image/converter/matplotlib_table.py index acb9392..25e9ce9 100644 --- a/dataframe_image/_matplotlib_table.py +++ b/dataframe_image/converter/matplotlib_table.py @@ -13,7 +13,7 @@ from matplotlib.transforms import Bbox -class TableMaker: +class MatplotlibTableConverter: def 
__init__( self, fontsize=14, @@ -307,7 +307,9 @@ def print_table(self): end = self.figwidth - start bbox = Bbox([[start - 0.1, y * h], [end + 0.1, h]]) buffer = io.BytesIO() - self.fig.savefig(buffer, bbox_inches=bbox, dpi=self.savefig_dpi, format=self.format) + self.fig.savefig( + buffer, bbox_inches=bbox, dpi=self.savefig_dpi, format=self.format + ) img_str = buffer.getvalue() if self.encode_base64: img_str = base64.b64encode(img_str).decode() diff --git a/dataframe_image/logger.py b/dataframe_image/logger.py new file mode 100644 index 0000000..eea436a --- /dev/null +++ b/dataframe_image/logger.py @@ -0,0 +1,3 @@ +import logging + +logger = logging.getLogger(__name__) diff --git a/setup.py b/setup.py index 0056cf6..fcd1e0f 100644 --- a/setup.py +++ b/setup.py @@ -18,11 +18,14 @@ packages=setuptools.find_packages(), license="MIT", classifiers=[ - "Programming Language :: Python :: 3", + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ], - python_requires=">=3.6", + python_requires=">=3.8", install_requires=[ "pandas>=0.24", "nbconvert>=5", @@ -35,7 +38,6 @@ "beautifulsoup4", "cssutils", "html2image", - "ChromeController", ], include_package_data=True, entry_points={ diff --git a/tests/__init__.py b/tests/__init__.py index e69de29..1198752 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1 @@ +from . 
import conftest diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..8f70514 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,13 @@ +import pytest + + +@pytest.fixture(scope='session', autouse=True) +def ensure_output_dir(): + from pathlib import Path + + output_dir = Path("tests/test_output") + output_dir.mkdir(exist_ok=True, parents=True) + +@pytest.fixture() +def document_name(request): + return request.node.name.replace(" ", "_").replace("/", "_") diff --git a/tests/delete_test_results.py b/tests/delete_test_results.py deleted file mode 100644 index 427d487..0000000 --- a/tests/delete_test_results.py +++ /dev/null @@ -1,19 +0,0 @@ -import shutil -from pathlib import Path - - -def delete(p): - for file in p.iterdir(): - if file.suffix in (".pdf", ".md", ".png") or file.name.endswith( - "_dataframe_image.ipynb" - ): - if file.name != "README.md": - file.unlink() - - if file.name.endswith("_files"): - shutil.rmtree(file) - - -home = Path(__file__).parent -delete(home / "notebooks") -delete(home / "test_output") diff --git a/tests/test_convert.py b/tests/test_convert.py index 2a21806..1f0fb78 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -18,17 +18,12 @@ no_input = [True, False] -def tname_to_filename(test_name: str): - return test_name.replace(" ", "_").replace("/", "_") - - @pytest.mark.parametrize("filename", filenames) @pytest.mark.parametrize("use", uses) @pytest.mark.parametrize("execute", executes, ids=["executed", ""]) @pytest.mark.parametrize("no_input", no_input, ids=["no_input", ""]) class TestConvertPDF: - def test_to_pdf(self, request, filename, use, execute, no_input): - document_name = tname_to_filename(request.node.name) + def test_to_pdf(self, document_name, filename, use, execute, no_input): convert( filename, to="pdf", @@ -44,8 +39,7 @@ def test_to_pdf(self, request, filename, use, execute, no_input): @pytest.mark.parametrize("execute", executes, ids=["executed", ""]) 
@pytest.mark.parametrize("no_input", no_input, ids=["no_input", ""]) class TestConvertMD: - def test_to_md(self, request, filename, execute, no_input): - document_name = tname_to_filename(request.node.name) + def test_to_md(self, document_name, filename, execute, no_input): convert( filename, to="md", diff --git a/tests/test_df_image.py b/tests/test_df_image.py index d70e921..d021087 100644 --- a/tests/test_df_image.py +++ b/tests/test_df_image.py @@ -7,65 +7,77 @@ import dataframe_image as dfi -df = pd.read_csv("tests/notebooks/data/covid19.csv", parse_dates=["date"], index_col="date") +df = pd.read_csv( + "tests/notebooks/data/covid19.csv", parse_dates=["date"], index_col="date" +) test_dpi_values = [50, 100, 200, 400] -converters = ["chrome", "selenium", "matplotlib"] - - -class TestImage: - @pytest.mark.parametrize("dpi", test_dpi_values) - @pytest.mark.parametrize("converter", converters) - def test_df(self, converter, dpi): - df.tail(10).dfi.export( - f"tests/test_output/covid19_{converter}_dpi{dpi}.png", - table_conversion=converter, - dpi=dpi, - ) - - @pytest.mark.parametrize("dpi", test_dpi_values) - @pytest.mark.parametrize("converter", converters) - def test_styled(self, converter, dpi): - df.style.background_gradient().export_png( - f"tests/test_output/covid19_styled_{converter}_dpi{dpi}.png", - table_conversion=converter, - dpi=dpi, - ) - - @pytest.mark.parametrize("dpi", test_dpi_values) - @pytest.mark.parametrize("converter", converters) - def test_huge_df(self, converter, dpi): - df = pd.DataFrame(np.random.randint(0, 100, size=(300, 20))) - df.dfi.export( - f"tests/test_output/huge_{converter}_dpi{dpi}.png", - table_conversion=converter, - dpi=dpi, - max_rows=-1, - ) - - def test_svg(self): - dstyle = df.style.background_gradient() - dfi.export( - dstyle, f"tests/test_output/covid19_styled.svg", table_conversion="matplotlib", dpi=100 - ) - - def test_latex(self): - df_latex = pd.DataFrame(['$\int^0_1 3x^2 dx$']) - dfi.export(df_latex, 
f"tests/test_output/latex.png", table_conversion="chrome") - - @pytest.mark.parametrize("dpi", test_dpi_values) - @pytest.mark.parametrize("converter", converters) - def test_long_column_headers(self, converter, dpi): - column_headers = [ - "".join(random.choices(string.ascii_uppercase + string.ascii_lowercase, k=80)) - for _ in range(5) - ] - - df = pd.DataFrame(np.random.randint(0, 100, size=(5, 5)), columns=column_headers) - - df.dfi.export( - f"tests/test_output/long_column_{converter}_dpi{dpi}.png", - table_conversion=converter, - dpi=dpi, - max_rows=-1, - ) +converters = ["chrome", "selenium", "matplotlib", "html2image", "playwright"] + + +@pytest.mark.parametrize("dpi", test_dpi_values) +@pytest.mark.parametrize("converter", converters) +def test_df(document_name, converter, dpi): + df.tail(10).dfi.export( + f"tests/test_output/{document_name}.png", + table_conversion=converter, + dpi=dpi, + ) + + +@pytest.mark.parametrize("dpi", test_dpi_values) +@pytest.mark.parametrize("converter", converters) +def test_styled(document_name, converter, dpi): + df.style.background_gradient().export_png( + f"tests/test_output/{document_name}.png", + table_conversion=converter, + dpi=dpi, + ) + + +@pytest.mark.parametrize("dpi", test_dpi_values) +@pytest.mark.parametrize("converter", converters) +def test_huge_df(document_name, converter, dpi): + df = pd.DataFrame(np.random.randint(0, 100, size=(300, 20))) + df.dfi.export( + f"tests/test_output/{document_name}.png", + table_conversion=converter, + dpi=dpi, + max_rows=-1, + ) + + +def test_svg(document_name): + dstyle = df.style.background_gradient() + dfi.export( + dstyle, f"tests/test_output/{document_name}.svg", table_conversion="matplotlib" + ) + + +@pytest.mark.parametrize("converter", converters) +def test_latex(document_name, converter): + df_latex = pd.DataFrame(["$\int^0_1 3x^2 dx$"]) + dfi.export( + df_latex, + f"tests/test_output/{document_name}.png", + table_conversion=converter, + use_mathjax=True, + ) + + 
+@pytest.mark.parametrize("dpi", test_dpi_values) +@pytest.mark.parametrize("converter", converters) +def test_long_column_headers(document_name, converter, dpi): + column_headers = [ + "".join(random.choices(string.ascii_uppercase + string.ascii_lowercase, k=80)) + for _ in range(5) + ] + + df = pd.DataFrame(np.random.randint(0, 100, size=(5, 5)), columns=column_headers) + + df.dfi.export( + f"tests/test_output/{document_name}.png", + table_conversion=converter, + dpi=dpi, + max_rows=-1, + ) diff --git a/tests/test_output/README.md b/tests/test_output/README.md deleted file mode 100644 index 0e5a17c..0000000 --- a/tests/test_output/README.md +++ /dev/null @@ -1 +0,0 @@ -# Test results \ No newline at end of file