Skip to content
This repository has been archived by the owner on Oct 4, 2021. It is now read-only.

feat: v0 rapport post integration #65

Draft
wants to merge 3 commits into
base: develop
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
321 changes: 321 additions & 0 deletions notebooks/02-rapport_post_integration.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,321 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "directed-vessel",
"metadata": {},
"source": [
"# Automated Post-integration Report - Signaux Faibles\n",
"This notebook is meant to be run after each new data integration performed by the [opensignauxfaibles](https://github.com/signaux-faibles/opensignauxfaibles) codebase."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "electoral-joining",
"metadata": {},
"outputs": [],
"source": [
"# Fields requested from SFDataset throughout this report.\n",
"VARIABLES = [\n",
"    \"financier_court_terme\",\n",
"    \"interets\",\n",
"    \"ca\",\n",
"    \"equilibre_financier\",\n",
"    \"endettement\",\n",
"    \"degre_immo_corporelle\",\n",
"    \"liquidite_reduite\",\n",
"    \"poids_bfr_exploitation\",\n",
"    \"productivite_capital_investi\",\n",
"    \"rentabilite_economique\",\n",
"    \"rentabilite_nette\",\n",
"    \"cotisation\",\n",
"    \"cotisation_moy12m\",\n",
"    \"montant_part_ouvriere\",\n",
"    \"montant_part_ouvriere_past_1\",\n",
"    \"montant_part_ouvriere_past_12\",\n",
"    \"montant_part_ouvriere_past_2\",\n",
"    \"montant_part_ouvriere_past_3\",\n",
"    \"montant_part_ouvriere_past_6\",\n",
"    \"montant_part_patronale\",\n",
"    \"montant_part_patronale_past_1\",\n",
"    \"montant_part_patronale_past_12\",\n",
"    \"montant_part_patronale_past_2\",\n",
"    \"montant_part_patronale_past_3\",\n",
"    \"montant_part_patronale_past_6\",\n",
"    \"ratio_dette\",\n",
"    \"ratio_dette_moy12m\",\n",
"    \"effectif\",\n",
"    \"apart_heures_consommees_cumulees\",\n",
"    \"apart_heures_consommees\",\n",
"    \"paydex_nb_jours\",\n",
"    \"paydex_nb_jours_past_12\",\n",
"]\n",
"# These fields are always queried, in addition to the list above.\n",
"VARIABLES.extend(\n",
"    [\"outcome\", \"periode\", \"siret\", \"siren\", \"time_til_outcome\", \"code_naf\"]\n",
")\n",
"\n",
"# Current period; passed as `date_min` when fetching the latest-period sample.\n",
"LATEST_PERIODE = \"2021-02-01\""
]
},
{
"cell_type": "markdown",
"id": "athletic-adams",
"metadata": {},
"source": [
"## Fetch a random sample of the data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "tutorial-congress",
"metadata": {},
"outputs": [],
"source": [
"%config Completer.use_jedi = False\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "mighty-feelings",
"metadata": {},
"outputs": [],
"source": [
"from predictsignauxfaibles.data import SFDataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "extra-panama",
"metadata": {},
"outputs": [],
"source": [
"# Request the VARIABLES fields with sample_size=100_000, then load the data.\n",
"# The trailing `;` keeps the return value of fetch_data() out of the cell output.\n",
"dataset = SFDataset(fields=VARIABLES, sample_size=100_000)\n",
"dataset.fetch_data();"
]
},
{
"cell_type": "markdown",
"id": "headed-aurora",
"metadata": {},
"source": [
"## Temporal Coverage and NA values"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "comic-shift",
"metadata": {},
"outputs": [],
"source": [
"# Parse `periode` as datetimes; later cells take its min/max and group on it by calendar frequency.\n",
"dataset.data[\"periode\"] = pd.to_datetime(dataset.data[\"periode\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "optional-corner",
"metadata": {},
"outputs": [],
"source": [
"# Earliest and latest `periode` present in the sample, as plain dates.\n",
"periodes = dataset.data[\"periode\"]\n",
"date_range = (periodes.min().date(), periodes.max().date())\n",
"print(f\"Data goes from {date_range[0]} to {date_range[1]}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "proof-horse",
"metadata": {},
"outputs": [],
"source": [
"# Missing values per column, scaled by len(dataset) and sorted from highest to lowest.\n",
"# NOTE(review): assumes len(dataset) equals the number of rows in dataset.data - confirm.\n",
"na_counts = dataset.data.isna().sum()\n",
"na_rates_df = (\n",
"    (na_counts / len(dataset) * 100)\n",
"    .sort_values(ascending=False)\n",
"    .to_frame(name=\"NA rate\")\n",
")\n",
"na_rates_df"
]
},
{
"cell_type": "markdown",
"id": "numerous-senate",
"metadata": {},
"source": [
"## Coverage over time for selected variables"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "pretty-memorabilia",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "constitutional-audience",
"metadata": {},
"outputs": [],
"source": [
"def count_na_prop(series):\n",
"    \"\"\"Return the percentage (0-100) of missing values in `series`.\n",
"\n",
"    This helper was referenced by the plotting loop but never defined or\n",
"    imported in the notebook, so the cell failed with a NameError on a fresh\n",
"    kernel. The 0-100 scale matches the y-axis limits set below.\n",
"    \"\"\"\n",
"    return series.isna().mean() * 100\n",
"\n",
"\n",
"# One subplot per requested field: monthly share of missing values.\n",
"fig, axs = plt.subplots(len(VARIABLES), figsize=(10, 100))\n",
"for i, variable in enumerate(VARIABLES):\n",
"    grouped = dataset.data.groupby(pd.Grouper(key=\"periode\", freq=\"M\")).agg({variable: count_na_prop})\n",
"    axs[i].set_title(variable)\n",
"    axs[i].set_ylim([0, 100])\n",
"    axs[i].plot_date(grouped.index, grouped[variable], \"-\")\n",
"    axs[i].set(adjustable=\"box\")\n",
"# Lay out once every title and curve exists so tight_layout can account for them.\n",
"fig.tight_layout()"
]
},
{
"cell_type": "markdown",
"id": "aboriginal-dominican",
"metadata": {},
"source": [
"## Average over time"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "local-beijing",
"metadata": {},
"outputs": [],
"source": [
"from pandas.api.types import is_numeric_dtype"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "purple-helicopter",
"metadata": {},
"outputs": [],
"source": [
"# Only numeric columns can be averaged.\n",
"VARIABLES_TO_AVERAGE = [var for var in VARIABLES if is_numeric_dtype(dataset.data[var])]\n",
"# squeeze=False keeps `axs` two-dimensional even when a single column qualifies,\n",
"# so indexing does not break on a bare Axes object.\n",
"fig, axs = plt.subplots(len(VARIABLES_TO_AVERAGE), figsize=(10, 100), squeeze=False)\n",
"# One subplot per numeric field: monthly mean.\n",
"for ax, variable in zip(axs[:, 0], VARIABLES_TO_AVERAGE):\n",
"    grouped = dataset.data.groupby(pd.Grouper(key=\"periode\", freq=\"M\")).agg({variable: \"mean\"})\n",
"    ax.set_title(variable)\n",
"    ax.plot_date(grouped.index, grouped[variable], \"-\")\n",
"    ax.set(adjustable=\"box\")\n",
"# Lay out after plotting so titles and tick labels are taken into account.\n",
"fig.tight_layout()"
]
},
{
"cell_type": "markdown",
"id": "guided-launch",
"metadata": {},
"source": [
"## NAF codes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "crude-wesley",
"metadata": {},
"outputs": [],
"source": [
"import seaborn as sns\n",
"\n",
"# Number of non-null `outcome` values per NAF code, drawn as a bar chart.\n",
"grouped = (\n",
"    dataset.data\n",
"    .groupby(\"code_naf\", as_index=False)\n",
"    .agg({\"outcome\": \"count\"})\n",
")\n",
"sns.barplot(x=\"code_naf\", y=\"outcome\", data=grouped);"
]
},
{
"cell_type": "markdown",
"id": "distinguished-router",
"metadata": {},
"source": [
"## NAF codes over time"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "satisfactory-selling",
"metadata": {},
"outputs": [],
"source": [
"# Non-null `outcome` counts per NAF code, binned with a two-quarter frequency.\n",
"grouped = (\n",
"    dataset.data\n",
"    .groupby([pd.Grouper(key=\"periode\", freq=\"2Q\"), \"code_naf\"])\n",
"    .agg({\"outcome\": \"count\"})\n",
"    .reset_index()\n",
")\n",
"plt.figure(figsize=(15, 10))\n",
"sns.lineplot(x=\"periode\", y=\"outcome\", hue=\"code_naf\", data=grouped);"
]
},
{
"cell_type": "markdown",
"id": "wanted-retrieval",
"metadata": {},
"source": [
"## Analysis for latest period only"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "structural-bridal",
"metadata": {},
"outputs": [],
"source": [
"# Same query as the full sample, with date_min set to LATEST_PERIODE.\n",
"# NOTE: this rebinds `dataset`; the full-history sample fetched earlier is no longer reachable.\n",
"dataset = SFDataset(fields=VARIABLES, date_min=LATEST_PERIODE, sample_size=100_000)\n",
"dataset.fetch_data();"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "continuing-minnesota",
"metadata": {},
"outputs": [],
"source": [
"# Missing values per column for the latest-period sample, highest first.\n",
"# NOTE(review): assumes len(dataset) equals the number of rows in dataset.data - confirm.\n",
"na_counts = dataset.data.isna().sum()\n",
"na_rates_df = (\n",
"    (na_counts / len(dataset) * 100)\n",
"    .sort_values(ascending=False)\n",
"    .to_frame(name=\"NA rate\")\n",
")\n",
"na_rates_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "friendly-appearance",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}