signaux-faibles · vviers · May 18, 2021 · May 19, 2021 · Jun 15, 2021
diff --git a/notebooks/02-rapport_post_integration.ipynb b/notebooks/02-rapport_post_integration.ipynb
@@ -0,0 +1,321 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "directed-vessel",
+   "metadata": {},
+   "source": [
+    "# Automated Post-integration Report - Signaux Faibles\n",
+    "This notebook can be run after each new data integration by the [opensignauxfaibles](https://github.com/signaux-faibles/opensignauxfaibles) codebase."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "electoral-joining",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Columns to fetch; later cells iterate over this list in order.\n",
+    "VARIABLES = [\n",
+    "    \"financier_court_terme\", \"interets\", \"ca\",\n",
+    "    \"equilibre_financier\", \"endettement\", \"degre_immo_corporelle\",\n",
+    "    \"liquidite_reduite\", \"poids_bfr_exploitation\",\n",
+    "    \"productivite_capital_investi\", \"rentabilite_economique\", \"rentabilite_nette\",\n",
+    "    \"cotisation\", \"cotisation_moy12m\",\n",
+    "    \"montant_part_ouvriere\",\n",
+    "    \"montant_part_ouvriere_past_1\", \"montant_part_ouvriere_past_12\",\n",
+    "    \"montant_part_ouvriere_past_2\", \"montant_part_ouvriere_past_3\",\n",
+    "    \"montant_part_ouvriere_past_6\",\n",
+    "    \"montant_part_patronale\",\n",
+    "    \"montant_part_patronale_past_1\", \"montant_part_patronale_past_12\",\n",
+    "    \"montant_part_patronale_past_2\", \"montant_part_patronale_past_3\",\n",
+    "    \"montant_part_patronale_past_6\",\n",
+    "    \"ratio_dette\", \"ratio_dette_moy12m\",\n",
+    "    \"effectif\",\n",
+    "    \"apart_heures_consommees_cumulees\", \"apart_heures_consommees\",\n",
+    "    \"paydex_nb_jours\", \"paydex_nb_jours_past_12\",\n",
+    "]\n",
+    "# These columns are always requested.\n",
+    "VARIABLES.extend(\n",
+    "    [\"outcome\", \"periode\", \"siret\", \"siren\", \"time_til_outcome\", \"code_naf\"]\n",
+    ")\n",
+    "\n",
+    "# Current period.\n",
+    "LATEST_PERIODE = \"2021-02-01\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "athletic-adams",
+   "metadata": {},
+   "source": [
+    "## Fetch a random sample of the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "tutorial-congress",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%config Completer.use_jedi = False\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "mighty-feelings",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from predictsignauxfaibles.data import SFDataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "extra-panama",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build the dataset handle for the requested columns, capped at 100k rows.\n",
+    "dataset = SFDataset(fields=VARIABLES, sample_size=100_000)\n",
+    "# Trailing semicolon keeps the return value out of the cell output.\n",
+    "dataset.fetch_data();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "headed-aurora",
+   "metadata": {},
+   "source": [
+    "## Temporal Coverage and NA values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "comic-shift",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Parse `periode` as datetimes so later cells can group it by calendar frequency.\n",
+    "dataset.data[\"periode\"] = pd.to_datetime(dataset.data[\"periode\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "optional-corner",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Earliest and latest period present in the fetched sample.\n",
+    "periodes = dataset.data.periode\n",
+    "date_range = (periodes.min().date(), periodes.max().date())\n",
+    "print(f\"Data goes from {date_range[0]} to {date_range[1]}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "proof-horse",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Percentage of missing values per column, highest first.\n",
+    "na_counts = dataset.data.isna().sum()\n",
+    "na_rates_df = (\n",
+    "    (na_counts / len(dataset) * 100)\n",
+    "    .sort_values(ascending=False)\n",
+    "    .to_frame(name=\"NA rate\")\n",
+    ")\n",
+    "na_rates_df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "numerous-senate",
+   "metadata": {},
+   "source": [
+    "## Coverage over time for selected variables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "pretty-memorabilia",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "constitutional-audience",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def count_na_prop(series):\n",
+    "    \"\"\"Return the percentage (0-100) of missing values in ``series``.\n",
+    "\n",
+    "    Used as a groupby aggregation: it receives one column restricted to one\n",
+    "    group and returns a single float (NaN when the group is empty).\n",
+    "    \"\"\"\n",
+    "    return series.isna().mean() * 100\n",
+    "\n",
+    "\n",
+    "# One subplot per requested column, stacked vertically.\n",
+    "fig, axs = plt.subplots(len(VARIABLES), figsize=(10, 100))\n",
+    "fig.tight_layout()\n",
+    "for i, variable in enumerate(VARIABLES):\n",
+    "    # Monthly share of missing values for this column.\n",
+    "    grouped = dataset.data.groupby(pd.Grouper(key=\"periode\", freq=\"M\")).agg({variable: count_na_prop})\n",
+    "    axs[i].set_title(variable)\n",
+    "    # The aggregation yields percentages, so pin the axis to the full 0-100 range.\n",
+    "    axs[i].set_ylim([0, 100])\n",
+    "    axs[i].plot_date(grouped.index, grouped[variable], \"-\")\n",
+    "    axs[i].set(adjustable='box')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aboriginal-dominican",
+   "metadata": {},
+   "source": [
+    "## Average over time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "local-beijing",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pandas.api.types import is_numeric_dtype"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "purple-helicopter",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Only columns with a numeric dtype can be averaged.\n",
+    "VARIABLES_TO_AVERAGE = list(\n",
+    "    filter(lambda var: is_numeric_dtype(dataset.data[var]), VARIABLES)\n",
+    ")\n",
+    "\n",
+    "# One subplot per averaged column, stacked vertically.\n",
+    "fig, axs = plt.subplots(len(VARIABLES_TO_AVERAGE), figsize=(10, 100))\n",
+    "fig.tight_layout()\n",
+    "for i, variable in enumerate(VARIABLES_TO_AVERAGE):\n",
+    "    ax = axs[i]\n",
+    "    # Monthly mean of this column.\n",
+    "    grouped = dataset.data.groupby(pd.Grouper(key=\"periode\", freq=\"M\")).agg({variable: \"mean\"})\n",
+    "    ax.set_title(variable)\n",
+    "    # No fixed y-limits here: each column is plotted on its own scale.\n",
+    "    ax.plot_date(grouped.index, grouped[variable], \"-\")\n",
+    "    ax.set(adjustable='box')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "guided-launch",
+   "metadata": {},
+   "source": [
+    "## Codes NAF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "crude-wesley",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import seaborn as sns\n",
+    "\n",
+    "# Number of non-null `outcome` values per NAF code.\n",
+    "grouped = (\n",
+    "    dataset.data\n",
+    "    .groupby(\"code_naf\", as_index=False)\n",
+    "    .agg({\"outcome\": \"count\"})\n",
+    ")\n",
+    "sns.barplot(data=grouped, x=\"code_naf\", y=\"outcome\");"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "distinguished-router",
+   "metadata": {},
+   "source": [
+    "## Codes NAF over time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "satisfactory-selling",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Non-null `outcome` count per NAF code, binned by two-quarter periods.\n",
+    "grouped = (\n",
+    "    dataset.data\n",
+    "    .groupby([pd.Grouper(key=\"periode\", freq=\"2Q\"), \"code_naf\"])\n",
+    "    .agg({\"outcome\": \"count\"})\n",
+    "    .reset_index()\n",
+    ")\n",
+    "plt.figure(figsize=(15, 10))\n",
+    "sns.lineplot(data=grouped, x=\"periode\", y=\"outcome\", hue=\"code_naf\");"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "wanted-retrieval",
+   "metadata": {},
+   "source": [
+    "## Analysis for latest period only"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "structural-bridal",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rebinds `dataset`: same columns and row cap, with date_min set to LATEST_PERIODE.\n",
+    "dataset = SFDataset(fields=VARIABLES, date_min=LATEST_PERIODE, sample_size=100_000)\n",
+    "# Trailing semicolon keeps the return value out of the cell output.\n",
+    "dataset.fetch_data();"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "continuing-minnesota",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Percentage of missing values per column for the latest-period sample, highest first.\n",
+    "na_counts = dataset.data.isna().sum()\n",
+    "na_rates_df = (\n",
+    "    (na_counts / len(dataset) * 100)\n",
+    "    .sort_values(ascending=False)\n",
+    "    .to_frame(name=\"NA rate\")\n",
+    ")\n",
+    "na_rates_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "friendly-appearance",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}