diff --git a/src/ydata_profiling/compare_reports.py b/src/ydata_profiling/compare_reports.py index fb7472238..a208f3588 100644 --- a/src/ydata_profiling/compare_reports.py +++ b/src/ydata_profiling/compare_reports.py @@ -356,6 +356,8 @@ def compare( res["analysis"]["title"] = _compare_title(res["analysis"]["title"]) res["alerts"] = _create_placehoder_alerts(res["alerts"]) + if not any(res["time_index_analysis"]): + res["time_index_analysis"] = None profile = ProfileReport(None, config=_config) profile._description_set = from_dict(data_class=BaseDescription, data=res) return profile diff --git a/src/ydata_profiling/model/describe.py b/src/ydata_profiling/model/describe.py index 444c760bf..6a7afffe1 100644 --- a/src/ydata_profiling/model/describe.py +++ b/src/ydata_profiling/model/describe.py @@ -14,6 +14,7 @@ get_active_correlations, ) from ydata_profiling.model.dataframe import check_dataframe, preprocess +from ydata_profiling.model.description import TimeIndexAnalysis from ydata_profiling.model.duplicates import get_duplicates from ydata_profiling.model.missing import get_missing_active, get_missing_diagram from ydata_profiling.model.pairwise import get_scatter_plot, get_scatter_tasks @@ -21,6 +22,7 @@ from ydata_profiling.model.summarizer import BaseSummarizer from ydata_profiling.model.summary import get_series_descriptions from ydata_profiling.model.table import get_table_stats +from ydata_profiling.model.timeseries_index import get_time_index_description from ydata_profiling.utils.progress_bar import progress from ydata_profiling.version import __version__ @@ -158,6 +160,9 @@ def describe( config, table_stats, series_description, correlations ) + if config.vars.timeseries.active: + tsindex_description = get_time_index_description(config, df, table_stats) + pbar.set_postfix_str("Get reproduction details") package = { "ydata_profiling_version": __version__, @@ -170,9 +175,13 @@ def describe( date_end = datetime.utcnow() analysis = BaseAnalysis(config.title, date_start, date_end) + time_index_analysis = None + if config.vars.timeseries.active and tsindex_description: + time_index_analysis = TimeIndexAnalysis(**tsindex_description) description = BaseDescription( analysis=analysis, + time_index_analysis=time_index_analysis, table=table_stats, variables=series_description, scatter=scatter_matrix, diff --git a/src/ydata_profiling/model/description.py b/src/ydata_profiling/model/description.py index 5ed181e5d..bdf741103 100644 --- a/src/ydata_profiling/model/description.py +++ b/src/ydata_profiling/model/description.py @@ -1,6 +1,6 @@ from dataclasses import dataclass from datetime import datetime, timedelta -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union @dataclass @@ -38,12 +38,50 @@ def duration(self) -> Union[timedelta, List[timedelta]]: raise ValueError() +@dataclass +class TimeIndexAnalysis: + """Description of timeseries index analysis module of report. + + Attributes: + n_series (Union[int, List[int]): Number of time series identified in the dataset. + length (Union[int, List[int]): Number of data points in the time series. + start (Any): Starting point of the time series. + end (Any): Ending point of the time series. + period (Union[float, List[float]): Average interval between data points in the time series. + frequency (Union[Optional[str], List[Optional[str]]): A string alias given to useful common time series frequencies, e.g. H - hours. + """ + + n_series: Union[int, List[int]] + length: Union[int, List[int]] + start: Any + end: Any + period: Union[float, List[float]] + frequency: Union[Optional[str], List[Optional[str]]] + + def __init__( + self, + n_series: int, + length: int, + start: Any, + end: Any, + period: float, + frequency: Optional[str] = None, + ) -> None: + self.n_series = n_series + self.length = length + self.start = start + self.end = end + self.period = period + self.frequency = frequency + + @dataclass class BaseDescription: """Description of DataFrame. Attributes: analysis (BaseAnalysis): Base info about report. Title, start time and end time of description generating. + time_index_analysis (Optional[TimeIndexAnalysis]): Description of timeseries index analysis module of report. table (Any): DataFrame statistic. Base information about DataFrame. variables (Dict[str, Any]): Description of variables (columns) of DataFrame. Key is column name, value is description dictionary. scatter (Any): Pairwise scatter for all variables. Plot interactions between variables. @@ -56,6 +94,7 @@ class BaseDescription: """ analysis: BaseAnalysis + time_index_analysis: Optional[TimeIndexAnalysis] table: Any variables: Dict[str, Any] scatter: Any diff --git a/src/ydata_profiling/model/pandas/__init__.py b/src/ydata_profiling/model/pandas/__init__.py index b895f46e2..59ccf853c 100644 --- a/src/ydata_profiling/model/pandas/__init__.py +++ b/src/ydata_profiling/model/pandas/__init__.py @@ -19,6 +19,7 @@ sample_pandas, summary_pandas, table_pandas, + timeseries_index_pandas, ) __all__ = [ @@ -43,4 +44,5 @@ "sample_pandas", "summary_pandas", "table_pandas", + "timeseries_index_pandas", ] diff --git a/src/ydata_profiling/model/pandas/timeseries_index_pandas.py b/src/ydata_profiling/model/pandas/timeseries_index_pandas.py new file mode 100644 index 000000000..db35e1264 --- /dev/null +++ b/src/ydata_profiling/model/pandas/timeseries_index_pandas.py @@ -0,0 +1,39 @@ +"""Compute statistical description of datasets.""" +import numpy as np +import pandas as pd +from pandas.api.types import is_numeric_dtype + +from ydata_profiling.config import Settings +from ydata_profiling.model.timeseries_index import get_time_index_description + + +@get_time_index_description.register +def pandas_get_time_index_description( + config: Settings, + df: pd.DataFrame, + table_stats: dict, +) -> dict: + if not (is_numeric_dtype(df.index) or isinstance(df.index, pd.DatetimeIndex)): + return {} + + n_series = table_stats["types"].get("TimeSeries", 0) + length = table_stats["n"] + start = df.index.min() + end = df.index.max() + if isinstance(df.index, pd.DatetimeIndex): + freq = df.index.inferred_freq + delta = abs(np.diff(df.index)).mean() + delta = delta.astype(f"timedelta64[{df.index.inferred_freq}]") + period = delta.astype(float) + else: + freq = None + period = abs(np.diff(df.index)).mean() + + return { + "n_series": n_series, + "length": length, + "start": start, + "end": end, + "frequency": freq, + "period": period, + } diff --git a/src/ydata_profiling/model/spark/__init__.py b/src/ydata_profiling/model/spark/__init__.py index ee06d0271..854222a9a 100644 --- a/src/ydata_profiling/model/spark/__init__.py +++ b/src/ydata_profiling/model/spark/__init__.py @@ -13,6 +13,7 @@ sample_spark, summary_spark, table_spark, + timeseries_index_spark, ) __all__ = [ @@ -31,4 +32,5 @@ "sample_spark", "summary_spark", "table_spark", + "timeseries_index_spark", ] diff --git a/src/ydata_profiling/model/spark/timeseries_index_spark.py b/src/ydata_profiling/model/spark/timeseries_index_spark.py new file mode 100644 index 000000000..cdf3d88dd --- /dev/null +++ b/src/ydata_profiling/model/spark/timeseries_index_spark.py @@ -0,0 +1,14 @@ +"""Compute statistical description of datasets.""" +from pyspark.sql import DataFrame + +from ydata_profiling.config import Settings +from ydata_profiling.model.timeseries_index import get_time_index_description + + +@get_time_index_description.register +def spark_get_time_index_description( + config: Settings, + df: DataFrame, + table_stats: dict, +) -> dict: + return {} diff --git a/src/ydata_profiling/model/timeseries_index.py b/src/ydata_profiling/model/timeseries_index.py new file mode 100644 index 000000000..261569496 --- /dev/null +++ b/src/ydata_profiling/model/timeseries_index.py @@ -0,0 +1,16 @@ +"""Compute statistical description of datasets.""" + +from typing import Any + +from multimethod import multimethod + +from ydata_profiling.config import Settings + + +@multimethod +def get_time_index_description( + config: Settings, + df: Any, + table_stats: dict, +) -> dict: + raise NotImplementedError() diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py index 161010879..0be5c78a3 100644 --- a/src/ydata_profiling/profile_report.py +++ b/src/ydata_profiling/profile_report.py @@ -197,11 +197,13 @@ def __initialize_dataframe( df is not None and isinstance(df, pd.DataFrame) and report_config.vars.timeseries.active - and report_config.vars.timeseries.sortby ): - df = df.sort_values(by=report_config.vars.timeseries.sortby) - df = df.set_index(report_config.vars.timeseries.sortby, drop=False) - df.index.name = None + if report_config.vars.timeseries.sortby: + df = df.sort_values(by=report_config.vars.timeseries.sortby) + df = df.set_index(report_config.vars.timeseries.sortby, drop=False) + df.index.name = None + else: + df = df.sort_index() return df diff --git a/src/ydata_profiling/report/structure/overview.py b/src/ydata_profiling/report/structure/overview.py index 28defc8b5..d493e17b8 100644 --- a/src/ydata_profiling/report/structure/overview.py +++ b/src/ydata_profiling/report/structure/overview.py @@ -1,9 +1,11 @@ -from typing import List +from datetime import datetime +from typing import Any, List from urllib.parse import quote from ydata_profiling.config import Settings from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType +from ydata_profiling.model.description import TimeIndexAnalysis from ydata_profiling.report.formatters import ( fmt, fmt_bytesize, @@ -13,8 +15,11 @@ fmt_timespan, list_args, ) -from ydata_profiling.report.presentation.core import Alerts, Container, Table +from ydata_profiling.report.presentation.core import Alerts, Container +from ydata_profiling.report.presentation.core import Image as ImageWidget +from ydata_profiling.report.presentation.core import Table from ydata_profiling.report.presentation.core.renderable import Renderable +from ydata_profiling.visualisation.plot import plot_overview_timeseries def get_dataset_overview(config: Settings, summary: BaseDescription) -> Renderable: @@ -266,6 +271,76 @@ def get_dataset_alerts(config: Settings, alerts: list) -> Alerts: ) +def get_timeseries_items(config: Settings, summary: BaseDescription) -> Container: + def format_tsindex_limit(limit: Any) -> str: + if isinstance(limit, datetime): + return limit.strftime("%Y-%m-%d %H:%M:%S") + else: + return fmt_number(limit) + + assert isinstance(summary.time_index_analysis, TimeIndexAnalysis) + table_stats = [ + { + "name": "Number of series", + "value": fmt_number(summary.time_index_analysis.n_series), + }, + { + "name": "Time series length", + "value": fmt_number(summary.time_index_analysis.length), + }, + { + "name": "Starting point", + "value": format_tsindex_limit(summary.time_index_analysis.start), + }, + { + "name": "Ending point", + "value": format_tsindex_limit(summary.time_index_analysis.end), + }, + { + "name": "Period", + "value": fmt_number(summary.time_index_analysis.period), + }, + ] + + if summary.time_index_analysis.frequency: + table_stats.append( + { + "name": "Frequency", + "value": summary.time_index_analysis.frequency, + } + ) + + ts_info = Table(table_stats, name="Timeseries statistics", style=config.html.style) + + timeseries = ImageWidget( + plot_overview_timeseries(config, summary.variables), + image_format=config.plot.image_format, + alt="ts_plot", + name="preview", + anchor_id="ts_plot_overview", + ) + timeseries_scaled = ImageWidget( + plot_overview_timeseries(config, summary.variables, scale=True), + image_format=config.plot.image_format, + alt="ts_plot_scaled", + name="scaled", + anchor_id="ts_plot_scaled_overview", + ) + ts_tab = Container( + [timeseries, timeseries_scaled], + anchor_id="ts_plot_overview", + name="", + sequence_type="tabs", + ) + + return Container( + [ts_info, ts_tab], + anchor_id="timeseries_overview", + name="Time Series", + sequence_type="grid", + ) + + def get_dataset_items(config: Settings, summary: BaseDescription, alerts: list) -> list: """Returns the dataset overview (at the top of the report) @@ -293,6 +368,9 @@ def get_dataset_items(config: Settings, summary: BaseDescription, alerts: list) if len(column_details) > 0: items.append(get_dataset_column_definitions(config, column_details)) + if summary.time_index_analysis: + items.append(get_timeseries_items(config, summary)) + if alerts: items.append(get_dataset_alerts(config, alerts)) diff --git a/src/ydata_profiling/visualisation/plot.py b/src/ydata_profiling/visualisation/plot.py index f54ee9d9c..079690237 100644 --- a/src/ydata_profiling/visualisation/plot.py +++ b/src/ydata_profiling/visualisation/plot.py @@ -557,6 +557,49 @@ def _format_ts_date_axis( return axis +@manage_matplotlib_context() +def plot_overview_timeseries( + config: Settings, + variables: Any, + figsize: tuple = (6, 4), + scale: bool = False, +) -> matplotlib.figure.Figure: + """Plot an line plot from the data and return the AxesSubplot object. + Args: + variables: The data to plot. + figsize: The size of the figure (width, height) in inches, default (6,4). + scale: Scale series values between [0,1]. Defaults to False. + Returns: + The TimeSeries lineplot. + """ + fig = plt.figure(figsize=figsize) + ax = fig.add_subplot(111) + + col = next(iter(variables)) + if isinstance(variables[col]["type"], list): + colors = create_comparison_color_list(config) + line_styles = ["-", "--"] + for col, data in variables.items(): + if all(iter([t == "TimeSeries" for t in data["type"]])): + for i, series in enumerate(data["series"]): + if scale: + series = (series - series.min()) / (series.max() - series.min()) + series.plot( + ax=ax, + label=col, + linestyle=line_styles[i], + color=colors[i], + alpha=0.65, + ) + else: + for col, data in variables.items(): + if data["type"] == "TimeSeries": + data["series"].plot(ax=ax, label=col) + + plt.legend(loc="upper right") + return plot_360_n0sc0pe(config) + + def _plot_timeseries( config: Settings, series: Union[list, pd.Series], diff --git a/tests/backends/spark_backend/test_descriptions_spark.py b/tests/backends/spark_backend/test_descriptions_spark.py index 2b37923c4..6b13a5910 100644 --- a/tests/backends/spark_backend/test_descriptions_spark.py +++ b/tests/backends/spark_backend/test_descriptions_spark.py @@ -379,6 +379,7 @@ def test_describe_spark_df( assert { "analysis", + "time_index_analysis", "table", "variables", "scatter", diff --git a/tests/issues/test_issue545.py b/tests/issues/test_issue545.py index 3bad03e73..f63db309b 100644 --- a/tests/issues/test_issue545.py +++ b/tests/issues/test_issue545.py @@ -15,7 +15,7 @@ @pytest.mark.skipif( pandas_version_info() <= (1, 1, 0), reason="requires pandas 1.1.1 or higher" ) -def test_issue545(get_data_file): +def test_issue545(): file_name = Path(__file__).parents[0] / "data/sample_eda_df.pkl" sample_eda_df = pd.read_pickle(str(file_name)) diff --git a/tests/unit/test_describe.py b/tests/unit/test_describe.py index 10597e889..0eb10b7b7 100644 --- a/tests/unit/test_describe.py +++ b/tests/unit/test_describe.py @@ -546,6 +546,7 @@ def test_describe_df(column, describe_data, expected_results, summarizer): assert { "analysis", + "time_index_analysis", "table", "variables", "scatter", diff --git a/tests/unit/test_output.py b/tests/unit/test_output.py index 62ec2560a..5af538e60 100644 --- a/tests/unit/test_output.py +++ b/tests/unit/test_output.py @@ -17,6 +17,7 @@ def test_json(data): data = json.loads(report_json) assert set(data.keys()) == { "analysis", + "time_index_analysis", "correlations", "duplicates", "alerts",