Skip to content

Commit

Permalink
Merge branch 'develop' into docs/add_data_catalog_section
Browse files Browse the repository at this point in the history
  • Loading branch information
fabclmnt authored Jul 31, 2023
2 parents 13a3202 + 14ad4ac commit dcfba2e
Show file tree
Hide file tree
Showing 15 changed files with 257 additions and 8 deletions.
2 changes: 2 additions & 0 deletions src/ydata_profiling/compare_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,8 @@ def compare(

res["analysis"]["title"] = _compare_title(res["analysis"]["title"])
res["alerts"] = _create_placehoder_alerts(res["alerts"])
if not any(res["time_index_analysis"]):
res["time_index_analysis"] = None
profile = ProfileReport(None, config=_config)
profile._description_set = from_dict(data_class=BaseDescription, data=res)
return profile
9 changes: 9 additions & 0 deletions src/ydata_profiling/model/describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,15 @@
get_active_correlations,
)
from ydata_profiling.model.dataframe import check_dataframe, preprocess
from ydata_profiling.model.description import TimeIndexAnalysis
from ydata_profiling.model.duplicates import get_duplicates
from ydata_profiling.model.missing import get_missing_active, get_missing_diagram
from ydata_profiling.model.pairwise import get_scatter_plot, get_scatter_tasks
from ydata_profiling.model.sample import get_custom_sample, get_sample
from ydata_profiling.model.summarizer import BaseSummarizer
from ydata_profiling.model.summary import get_series_descriptions
from ydata_profiling.model.table import get_table_stats
from ydata_profiling.model.timeseries_index import get_time_index_description
from ydata_profiling.utils.progress_bar import progress
from ydata_profiling.version import __version__

Expand Down Expand Up @@ -158,6 +160,9 @@ def describe(
config, table_stats, series_description, correlations
)

if config.vars.timeseries.active:
tsindex_description = get_time_index_description(config, df, table_stats)

pbar.set_postfix_str("Get reproduction details")
package = {
"ydata_profiling_version": __version__,
Expand All @@ -170,9 +175,13 @@ def describe(
date_end = datetime.utcnow()

analysis = BaseAnalysis(config.title, date_start, date_end)
time_index_analysis = None
if config.vars.timeseries.active and tsindex_description:
time_index_analysis = TimeIndexAnalysis(**tsindex_description)

description = BaseDescription(
analysis=analysis,
time_index_analysis=time_index_analysis,
table=table_stats,
variables=series_description,
scatter=scatter_matrix,
Expand Down
41 changes: 40 additions & 1 deletion src/ydata_profiling/model/description.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Any, Dict, List, Union
from typing import Any, Dict, List, Optional, Union


@dataclass
Expand Down Expand Up @@ -38,12 +38,50 @@ def duration(self) -> Union[timedelta, List[timedelta]]:
raise ValueError()


@dataclass
class TimeIndexAnalysis:
    """Description of timeseries index analysis module of report.

    The constructor is generated by ``@dataclass``; it accepts the fields below
    in declaration order, with ``frequency`` being optional.

    Attributes:
        n_series (Union[int, List[int]]): Number of time series identified in the dataset.
        length (Union[int, List[int]]): Number of data points in the time series.
        start (Any): Starting point of the time series.
        end (Any): Ending point of the time series.
        period (Union[float, List[float]]): Average interval between data points in the time series.
        frequency (Union[Optional[str], List[Optional[str]]]): A string alias given to useful common time series frequencies, e.g. H - hours. Defaults to None.
    """

    # NOTE(review): the List[...] alternatives presumably hold one entry per
    # report when several reports are compared - confirm against the caller.
    n_series: Union[int, List[int]]
    length: Union[int, List[int]]
    start: Any
    end: Any
    period: Union[float, List[float]]
    # Declared as a field default (rather than in a hand-written __init__) so
    # that the default is also visible through dataclasses.fields().
    frequency: Union[Optional[str], List[Optional[str]]] = None


@dataclass
class BaseDescription:
"""Description of DataFrame.
Attributes:
analysis (BaseAnalysis): Base info about report. Title, start time and end time of description generating.
time_index_analysis (Optional[TimeIndexAnalysis]): Description of timeseries index analysis module of report.
table (Any): DataFrame statistic. Base information about DataFrame.
variables (Dict[str, Any]): Description of variables (columns) of DataFrame. Key is column name, value is description dictionary.
scatter (Any): Pairwise scatter for all variables. Plot interactions between variables.
Expand All @@ -56,6 +94,7 @@ class BaseDescription:
"""

analysis: BaseAnalysis
time_index_analysis: Optional[TimeIndexAnalysis]
table: Any
variables: Dict[str, Any]
scatter: Any
Expand Down
2 changes: 2 additions & 0 deletions src/ydata_profiling/model/pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
sample_pandas,
summary_pandas,
table_pandas,
timeseries_index_pandas,
)

__all__ = [
Expand All @@ -43,4 +44,5 @@
"sample_pandas",
"summary_pandas",
"table_pandas",
"timeseries_index_pandas",
]
39 changes: 39 additions & 0 deletions src/ydata_profiling/model/pandas/timeseries_index_pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Compute statistical description of datasets."""
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

from ydata_profiling.config import Settings
from ydata_profiling.model.timeseries_index import get_time_index_description


@get_time_index_description.register
def pandas_get_time_index_description(
    config: Settings,
    df: pd.DataFrame,
    table_stats: dict,
) -> dict:
    """Describe the index of a pandas DataFrame as a time series index.

    Args:
        config: Report settings (not read by this implementation).
        df: DataFrame whose index is analysed.
        table_stats: Table statistics; ``table_stats["n"]`` and
            ``table_stats["types"]`` are read.

    Returns:
        An empty dict when the index is neither numeric nor a
        ``pd.DatetimeIndex``. Otherwise a dict with the keys ``n_series``,
        ``length``, ``start``, ``end``, ``frequency`` and ``period``.

        ``period`` is the mean absolute difference between consecutive index
        values. For a datetime index it is expressed in units of the inferred
        frequency when that frequency is also a valid NumPy timedelta unit,
        and in seconds otherwise. It is NaN when the index has fewer than two
        entries.
    """
    index = df.index
    is_datetime = isinstance(index, pd.DatetimeIndex)
    if not (is_numeric_dtype(index) or is_datetime):
        return {}

    n_series = table_stats["types"].get("TimeSeries", 0)
    length = table_stats["n"]
    start = index.min()
    end = index.max()
    freq = index.inferred_freq if is_datetime else None

    if len(index) < 2:
        # No consecutive pair exists, so there is no interval to average.
        period = float("nan")
    else:
        mean_step = abs(np.diff(index)).mean()
        if is_datetime:
            # Fallback unit: used when no frequency could be inferred
            # (irregular index) or when the pandas alias is not a NumPy
            # timedelta unit (e.g. "W-SUN", "MS").
            period = mean_step / np.timedelta64(1, "s")
            if freq is not None:
                try:
                    period = mean_step.astype(f"timedelta64[{freq}]").astype(float)
                except (TypeError, ValueError):
                    # NumPy rejected the unit; keep the seconds-based value.
                    pass
        else:
            period = mean_step

    return {
        "n_series": n_series,
        "length": length,
        "start": start,
        "end": end,
        "frequency": freq,
        "period": period,
    }
2 changes: 2 additions & 0 deletions src/ydata_profiling/model/spark/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
sample_spark,
summary_spark,
table_spark,
timeseries_index_spark,
)

__all__ = [
Expand All @@ -31,4 +32,5 @@
"sample_spark",
"summary_spark",
"table_spark",
"timeseries_index_spark",
]
14 changes: 14 additions & 0 deletions src/ydata_profiling/model/spark/timeseries_index_spark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""Compute statistical description of datasets."""
from pyspark.sql import DataFrame

from ydata_profiling.config import Settings
from ydata_profiling.model.timeseries_index import get_time_index_description


@get_time_index_description.register
def spark_get_time_index_description(
    config: Settings,
    df: DataFrame,
    table_stats: dict,
) -> dict:
    """Return the time index description for a Spark DataFrame.

    This implementation computes nothing: none of the arguments are read and
    the result is always an empty dict.
    """
    no_description: dict = {}
    return no_description
16 changes: 16 additions & 0 deletions src/ydata_profiling/model/timeseries_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""Compute statistical description of datasets."""

from typing import Any

from multimethod import multimethod

from ydata_profiling.config import Settings


@multimethod
def get_time_index_description(config: Settings, df: Any, table_stats: dict) -> dict:
    """Generic entry point for describing the time index of a dataset.

    Declared as a ``multimethod``; this generic body only raises.

    Args:
        config: Report settings.
        df: The dataset to describe.
        table_stats: Table statistics of the dataset.

    Raises:
        NotImplementedError: Always, when this generic body is the one invoked.
    """
    raise NotImplementedError
10 changes: 6 additions & 4 deletions src/ydata_profiling/profile_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,11 +197,13 @@ def __initialize_dataframe(
df is not None
and isinstance(df, pd.DataFrame)
and report_config.vars.timeseries.active
and report_config.vars.timeseries.sortby
):
df = df.sort_values(by=report_config.vars.timeseries.sortby)
df = df.set_index(report_config.vars.timeseries.sortby, drop=False)
df.index.name = None
if report_config.vars.timeseries.sortby:
df = df.sort_values(by=report_config.vars.timeseries.sortby)
df = df.set_index(report_config.vars.timeseries.sortby, drop=False)
df.index.name = None
else:
df = df.sort_index()

return df

Expand Down
82 changes: 80 additions & 2 deletions src/ydata_profiling/report/structure/overview.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from typing import List
from datetime import datetime
from typing import Any, List
from urllib.parse import quote

from ydata_profiling.config import Settings
from ydata_profiling.model import BaseDescription
from ydata_profiling.model.alerts import AlertType
from ydata_profiling.model.description import TimeIndexAnalysis
from ydata_profiling.report.formatters import (
fmt,
fmt_bytesize,
Expand All @@ -13,8 +15,11 @@
fmt_timespan,
list_args,
)
from ydata_profiling.report.presentation.core import Alerts, Container, Table
from ydata_profiling.report.presentation.core import Alerts, Container
from ydata_profiling.report.presentation.core import Image as ImageWidget
from ydata_profiling.report.presentation.core import Table
from ydata_profiling.report.presentation.core.renderable import Renderable
from ydata_profiling.visualisation.plot import plot_overview_timeseries


def get_dataset_overview(config: Settings, summary: BaseDescription) -> Renderable:
Expand Down Expand Up @@ -266,6 +271,76 @@ def get_dataset_alerts(config: Settings, alerts: list) -> Alerts:
)


def get_timeseries_items(config: Settings, summary: BaseDescription) -> Container:
    """Build the "Time Series" overview container of the report.

    The container is a grid holding a statistics table (built from
    ``summary.time_index_analysis``) and a tabbed pair of overview plots
    (plain and scaled) built from ``summary.variables``.

    Args:
        config: Report settings; the HTML style and plot image format are read.
        summary: Dataset description; ``time_index_analysis`` must be a
            ``TimeIndexAnalysis`` instance.

    Returns:
        The assembled ``Container``.
    """

    def format_tsindex_limit(limit: Any) -> str:
        # Datetime bounds use a fixed timestamp layout; any other value is
        # rendered through the generic number formatter.
        return (
            limit.strftime("%Y-%m-%d %H:%M:%S")
            if isinstance(limit, datetime)
            else fmt_number(limit)
        )

    ts_analysis = summary.time_index_analysis
    assert isinstance(ts_analysis, TimeIndexAnalysis)

    labelled_values = (
        ("Number of series", fmt_number(ts_analysis.n_series)),
        ("Time series length", fmt_number(ts_analysis.length)),
        ("Starting point", format_tsindex_limit(ts_analysis.start)),
        ("Ending point", format_tsindex_limit(ts_analysis.end)),
        ("Period", fmt_number(ts_analysis.period)),
    )
    table_stats = [
        {"name": label, "value": value} for label, value in labelled_values
    ]
    # The frequency row is shown only when a frequency is available.
    if ts_analysis.frequency:
        table_stats.append({"name": "Frequency", "value": ts_analysis.frequency})

    ts_info = Table(table_stats, name="Timeseries statistics", style=config.html.style)

    plain_plot = ImageWidget(
        plot_overview_timeseries(config, summary.variables),
        image_format=config.plot.image_format,
        alt="ts_plot",
        name="preview",
        anchor_id="ts_plot_overview",
    )
    scaled_plot = ImageWidget(
        plot_overview_timeseries(config, summary.variables, scale=True),
        image_format=config.plot.image_format,
        alt="ts_plot_scaled",
        name="scaled",
        anchor_id="ts_plot_scaled_overview",
    )
    plot_tabs = Container(
        [plain_plot, scaled_plot],
        anchor_id="ts_plot_overview",
        name="",
        sequence_type="tabs",
    )

    return Container(
        [ts_info, plot_tabs],
        anchor_id="timeseries_overview",
        name="Time Series",
        sequence_type="grid",
    )


def get_dataset_items(config: Settings, summary: BaseDescription, alerts: list) -> list:
"""Returns the dataset overview (at the top of the report)
Expand Down Expand Up @@ -293,6 +368,9 @@ def get_dataset_items(config: Settings, summary: BaseDescription, alerts: list)
if len(column_details) > 0:
items.append(get_dataset_column_definitions(config, column_details))

if summary.time_index_analysis:
items.append(get_timeseries_items(config, summary))

if alerts:
items.append(get_dataset_alerts(config, alerts))

Expand Down
43 changes: 43 additions & 0 deletions src/ydata_profiling/visualisation/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,49 @@ def _format_ts_date_axis(
return axis


def _min_max_scale(series: pd.Series) -> pd.Series:
    """Rescale *series* linearly so its minimum maps to 0 and its maximum to 1."""
    lowest = series.min()
    return (series - lowest) / (series.max() - lowest)


@manage_matplotlib_context()
def plot_overview_timeseries(
    config: Settings,
    variables: Any,
    figsize: tuple = (6, 4),
    scale: bool = False,
) -> matplotlib.figure.Figure:
    """Draw every TimeSeries variable as a line on one shared axis.

    Args:
        config: Report settings.
        variables: Mapping of column name to variable description. Each
            description provides ``"type"`` and ``"series"``; when ``"type"``
            is a list (several reports being compared), ``"series"`` is
            iterated with one series per report.
        figsize: The size of the figure (width, height) in inches, default (6,4).
        scale: Scale series values between [0,1]. Defaults to False.

    Returns:
        The result of ``plot_360_n0sc0pe(config)`` for the current figure.
        NOTE(review): the annotated return type says ``Figure``; confirm what
        ``plot_360_n0sc0pe`` actually returns.
    """
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)

    # The shape of the first description tells whether this is a comparison;
    # an empty mapping simply yields an empty plot instead of StopIteration.
    first = next(iter(variables.values()), None)
    is_comparison = first is not None and isinstance(first["type"], list)

    if is_comparison:
        colors = create_comparison_color_list(config)
        line_styles = ["-", "--"]
        for col, data in variables.items():
            # Only plot columns detected as TimeSeries in every report.
            if not all(t == "TimeSeries" for t in data["type"]):
                continue
            for i, series in enumerate(data["series"]):
                if scale:
                    series = _min_max_scale(series)
                series.plot(
                    ax=ax,
                    label=col,
                    # Cycle so that a third report does not raise IndexError.
                    linestyle=line_styles[i % len(line_styles)],
                    color=colors[i],
                    alpha=0.65,
                )
    else:
        for col, data in variables.items():
            if data["type"] != "TimeSeries":
                continue
            series = data["series"]
            # Honour ``scale`` here too; previously it was silently ignored
            # for single reports, making the scaled plot equal the plain one.
            if scale:
                series = _min_max_scale(series)
            series.plot(ax=ax, label=col)

    plt.legend(loc="upper right")
    return plot_360_n0sc0pe(config)


def _plot_timeseries(
config: Settings,
series: Union[list, pd.Series],
Expand Down
Loading

0 comments on commit dcfba2e

Please sign in to comment.