Merge pull request #777 from pandas-profiling/develop

v2.13.0
ydataai · May 8, 2021 · 02ed31a · 02ed31a
2 parents 662fdad + b770e0f
commit 02ed31a
Show file tree

Hide file tree

Showing 68 changed files with 470 additions and 787 deletions.
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -0,0 +1,7 @@
+version: 2
+updates:
+- package-ecosystem: pip
+  directory: "/"
+  schedule:
+    interval: daily
+  open-pull-requests-limit: 10
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,23 +1,20 @@
 repos:
 -   repo: https://github.com/psf/black
-    rev: 20.8b1
+    rev: 21.4b2
     hooks:
     - id: black
       language_version: python3.8
 -   repo: https://github.com/nbQA-dev/nbQA
-    rev: 0.7.0
+    rev: 0.8.0
     hooks:
     - id: nbqa-black
-      additional_dependencies: [ black==20.8b1 ]
       args: [--nbqa-mutate ]
     - id: nbqa-isort
-      additional_dependencies: [ isort==5.6.4 ]
       args: [ --nbqa-mutate, --profile=black, --project=pandas_profiling ]
     - id: nbqa-pyupgrade
-      additional_dependencies: [ pyupgrade==2.7.3 ]
       args: [ --nbqa-mutate, --py36-plus ]
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v2.12.0
+    rev: v2.14.0
     hooks:
     -   id: pyupgrade
         args: ['--py36-plus','--exit-zero-even-if-changed']
@@ -35,6 +32,13 @@ repos:
     rev: "3.9.1"
     hooks:
     -   id: flake8
-        args: [ "--select=E9,F63,F7,F82"] #,T001
-#        additional_dependencies:
-#          - flake8-print
+        args: [ "--ignore=E203,E501,W291,W503,SFS301,SIM106" ]
+        additional_dependencies:
+          - flake8-comprehensions
+          - flake8-sfs
+          - flake8-simplify
+          - flake8-eradicate
+          - flake8-print
+
+ci:
+  autoupdate_commit_msg: 'ci: pre-commit-config update'
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -18,6 +18,7 @@ include src/pandas_profiling/*.yaml
 
 # Exclude development, docs, testing and example code
 exclude .pre-commit-config.yaml
+exclude commitlint.config.js
 include Makefile make.bat
 exclude docs examples tests docsrc
 recursive-exclude docs *

diff --git a/README.md b/README.md
@@ -37,7 +37,7 @@ For each column the following statistics - if relevant for the column type - are
 
 ## Announcements
 
-**Version v2.11.0 released** featuring an exciting integration with Great Expectations that many of you requested (see details below).
+**Version v2.13.0 released** featuring configurable numeric precision and faster string type detection (see the changelog linked below).
 
 **Spark backend in progress**: We can happily announce that we're nearing v1 for the Spark backend for generating profile reports.
 Stay tuned.
@@ -51,10 +51,10 @@ It's extra exciting that GitHub **matches your contribution** for the first year
 
 Find more information here:
 
- - [Changelog v2.11.0](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/changelog.html#changelog-v2-11-0)
+ - [Changelog v2.13.0](https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/pages/changelog.html#changelog)
  - [Sponsor the project on GitHub](https://github.com/sponsors/sbrugman)
 
-_February 20, 2021 💘_
+_May 8, 2021 💘_
 
 ---
 

diff --git a/commitlint.config.js b/commitlint.config.js
@@ -0,0 +1,7 @@
+module.exports = {
+    extends: ['@commitlint/config-conventional'],
+    rules: {
+        'body-max-line-length': [2, 'always', 120],
+        'footer-max-line-length': [2, 'always', 120],
+    },
+};
diff --git a/docsrc/source/conf.py b/docsrc/source/conf.py
@@ -70,7 +70,6 @@ def _GetApiWrapperVersion():
 
 html_theme = "sphinx_rtd_theme"
 html_theme_options = {"style_nav_header_background": "#174C4F"}
-# html_logo = "_static/icon.png"
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,

diff --git a/docsrc/source/pages/changelog.rst b/docsrc/source/pages/changelog.rst
@@ -2,6 +2,8 @@
 Changelog
 =========
 
+.. include:: changelog/v2_13_0.rst
+
 .. include:: changelog/v2_12_0.rst
 
 .. include:: changelog/v2_11_0.rst

diff --git a/docsrc/source/pages/changelog/v2_13_0.rst b/docsrc/source/pages/changelog/v2_13_0.rst
@@ -3,28 +3,14 @@ Changelog v2.13.0
 
 🎉 Features
 ^^^^^^^^^^^
--
-
-🐛 Bug fixes
-^^^^^^^^^^^^
--
+- configurable numeric precision
 
 👷‍♂️ Internal Improvements
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
--
-
-📖 Documentation
-^^^^^^^^^^^^^^^^
--
-
-⚠️  Deprecated
-^^^^^^^^^^^^^^^^^
--
-
-🚨 Breaking changes
-^^^^^^^^^^^^^^^^^^^
--
+- string type detection performance optimization
+- various improvements to software quality (flake8, commitlint)
 
 ⬆️ Dependencies
 ^^^^^^^^^^^^^^^^^^
--
+- upgrade from ``visions`` 0.6.0 to 0.7.1
+- upgrade from ``coverage`` <5 to ~=5.5
diff --git a/examples/census/census.py b/examples/census/census.py
@@ -43,7 +43,9 @@
     profile = ProfileReport(df, title="Census Dataset", explorative=True)
 
     # show column definition
-    definitions = json.load(open(f"census_column_definition.json"))
+    with open("census_column_definition.json") as f:
+        definitions = json.load(f)
+
     profile.set_variable(
         "dataset",
         {

diff --git a/examples/features/images_cats_and_dogs.py b/examples/features/images_cats_and_dogs.py
@@ -1,7 +1,7 @@
 import kaggle
 import pandas as pd
 
-import pandas_profiling
+from pandas_profiling import ProfileReport
 from pandas_profiling.utils.paths import get_data_path
 
 # The dataset in this example is obtained using the `kaggle` api.
@@ -31,7 +31,8 @@
 df = pd.DataFrame(series)
 
 # Generate the profile report
-profile = df.profile_report(
+profile = ProfileReport(
+    df,
     title="Example of summarization of an image dataset (Kaggle Cat and Dog dataset)",
     # We will not need those
     samples=None,

diff --git a/examples/features/images_exif.py b/examples/features/images_exif.py
@@ -1,7 +1,7 @@
 import kaggle
 import pandas as pd
 
-import pandas_profiling
+from pandas_profiling import ProfileReport
 from pandas_profiling.utils.paths import get_data_path
 
 # The dataset in this example is obtained using the `kaggle` api.
@@ -31,7 +31,8 @@
 df = pd.DataFrame(series)
 
 # Generate the profile report
-profile = df.profile_report(
+profile = ProfileReport(
+    df,
     title="Example showcasing EXIF data (Kaggle 5 Celebrity Faces Dataset)",
     # Disable what's not in our focus
     duplicates=None,

diff --git a/examples/features/mask_sensitive.py b/examples/features/mask_sensitive.py
@@ -31,19 +31,19 @@
     report = ProfileReport(
         df.sample(frac=0.25),
         title="Masked data",
-        dataset=dict(
-            description="This profiling report was generated using a sample of 5% of the original dataset.",
-            copyright_holder="StataCorp LLC",
-            copyright_year="2020",
-            url="http://www.stata-press.com/data/r15/auto2.dta",
-        ),
+        dataset={
+            "description": "This profiling report was generated using a sample of 5% of the original dataset.",
+            "copyright_holder": "StataCorp LLC",
+            "copyright_year": "2020",
+            "url": "http://www.stata-press.com/data/r15/auto2.dta",
+        },
         sensitive=True,
-        sample=dict(
-            name="Mock data sample",
-            data=mock_data,
-            caption="Disclaimer: this is synthetic data generated based on the format of the data in this table.",
-        ),
-        vars=dict(cat=dict(words=True, characters=True)),
+        sample={
+            "name": "Mock data sample",
+            "data": mock_data,
+            "caption": "Disclaimer: this is synthetic data generated based on the format of the data in this table.",
+        },
+        vars={"cat": {"words": True, "characters": True}},
         interactions=None,
     )
     report.to_file(Path("masked_report.html"))
diff --git a/examples/features/urls.py b/examples/features/urls.py
@@ -15,6 +15,6 @@
     profile = ProfileReport(
         df,
         title="Website Inaccessibility Test Lists",
-        vars=dict(url=dict(active=True)),
+        vars={"url": {"active": True}},
     )
     profile.to_file(Path("./website_inaccessibility_report.html"))
diff --git a/examples/meteorites/meteorites.py b/examples/meteorites/meteorites.py
@@ -28,7 +28,7 @@
     # Example: Mixed with base types
     df["mixed"] = np.random.choice([1, "A"], df.shape[0])
 
-    # Example: Unhashable
+    # Example: unhashable column
     df["unhashable"] = [[1]] * df.shape[0]
 
     # Example: Highly correlated variables

diff --git a/requirements-test.txt b/requirements-test.txt
@@ -1,5 +1,5 @@
 pytest
-coverage<5
+coverage~=5.5
 codecov
 pytest-mypy
 pytest-cov

diff --git a/requirements.txt b/requirements.txt
@@ -4,7 +4,7 @@ pandas>=0.25.3,!=1.0.1,!=1.0.0,!=1.0.2,!=1.1.0
 matplotlib>=3.2.0
 confuse>=1.0.0
 jinja2>=2.11.1
-visions[type_image_path]==0.6.0
+visions[type_image_path]==0.7.1
 numpy>=1.16.0
 attrs>=19.3.0
 # Could be optional

diff --git a/setup.py b/setup.py
@@ -11,7 +11,7 @@
 with (source_root / "requirements.txt").open(encoding="utf8") as f:
     requirements = f.readlines()
 
-version = "2.12.0"
+version = "2.13.0"
 
 with (source_root / "src" / "pandas_profiling" / "version.py").open(
     "w", encoding="utf-8"

diff --git a/src/pandas_profiling/__init__.py b/src/pandas_profiling/__init__.py
@@ -9,3 +9,12 @@
 from pandas_profiling.version import __version__
 
 clear_config = ProfileReport.clear_config
+
+__all__ = [
+    "Config",
+    "config",
+    "pandas_decorator",
+    "ProfileReport",
+    "__version__",
+    "clear_config",
+]
diff --git a/src/pandas_profiling/config.py b/src/pandas_profiling/config.py
@@ -97,14 +97,14 @@ def _set_kwargs(self, reference, values: dict):
                 raise ValueError(f'Config parameter "{key}" does not exist.')
 
     _shorthands = {
-        "dataset": dict(
-            creator="",
-            author="",
-            description="",
-            copyright_holder="",
-            copyright_year="",
-            url="",
-        ),
+        "dataset": {
+            "creator": "",
+            "author": "",
+            "description": "",
+            "copyright_holder": "",
+            "copyright_year": "",
+            "url": "",
+        },
         "samples": {"head": 0, "tail": 0, "random": 0},
         "duplicates": {"head": 0},
         "interactions": {"targets": [], "continuous": False},

diff --git a/src/pandas_profiling/config_default.yaml b/src/pandas_profiling/config_default.yaml
@@ -109,6 +109,9 @@ interactions:
 # For categorical
 categorical_maximum_correlation_distinct: 100
 
+report:
+  precision: 10
+
 # Plot-specific settings
 plot:
 # Image format (svg or png)

diff --git a/src/pandas_profiling/config_minimal.yaml b/src/pandas_profiling/config_minimal.yaml
@@ -110,6 +110,9 @@ interactions:
 # For categorical
 categorical_maximum_correlation_distinct: 100
 
+report:
+  precision: 10
+
 # Plot-specific settings
 plot:
 # Image format (svg or png)

diff --git a/src/pandas_profiling/model/correlations.py b/src/pandas_profiling/model/correlations.py
@@ -122,9 +122,9 @@ def compute(df, summary) -> Optional[pd.DataFrame]:
 
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
-            import phik
+            from phik import phik_matrix
 
-            correlation = df[selcols].phik_matrix(interval_cols=intcols)
+            correlation = phik_matrix(df[selcols], interval_cols=list(intcols))
 
         return correlation
 

diff --git a/src/pandas_profiling/model/imghdr_patch.py b/src/pandas_profiling/model/imghdr_patch.py
@@ -16,7 +16,7 @@ def test_jpeg1(h, f):
 
 def test_jpeg2(h, f):
     """JPEG with small header"""
-    if len(h) >= 32 and 67 == h[5] and h[:32] == JPEG_MARK:
+    if len(h) >= 32 and h[5] == 67 and h[:32] == JPEG_MARK:
         return "jpeg"
 
 

diff --git a/src/pandas_profiling/model/summary.py b/src/pandas_profiling/model/summary.py
@@ -56,7 +56,7 @@ def sort_column_names(dct: Mapping, sort: str):
     if sort.startswith("asc"):
         dct = dict(sorted(dct.items(), key=lambda x: x[0].casefold()))
     elif sort.startswith("desc"):
-        dct = dict(reversed(sorted(dct.items(), key=lambda x: x[0].casefold())))
+        dct = dict(sorted(dct.items(), key=lambda x: x[0].casefold(), reverse=True))
     elif sort != "none":
         raise ValueError('"sort" should be "ascending", "descending" or "None".')
     return dct