From b0e3fdcfc791b2576b898da1575fa4c4683e7d83 Mon Sep 17 00:00:00 2001
From: Marie Laurent <mlaurent@tellae.fr>
Date: Thu, 26 Sep 2024 09:50:11 +0200
Subject: [PATCH 1/6] feat : first analysis output tables

---
 analysis/synthesis/population.py | 79 ++++++++++++++++++++++++++++++++
 synthesis/output.py              | 11 +++++
 2 files changed, 90 insertions(+)
 create mode 100644 analysis/synthesis/population.py

diff --git a/analysis/synthesis/population.py b/analysis/synthesis/population.py
new file mode 100644
index 00000000..e2db75e0
--- /dev/null
+++ b/analysis/synthesis/population.py
@@ -0,0 +1,79 @@
+import itertools
+import numba
+
+import numpy as np
+import pandas as pd
+from  analysis.marginals import NUMBER_OF_VEHICLES_LABELS
+
+AGE_CLASS = [0, 10, 14, 17, 25, 50, 65, np.inf]
+NUMBER_OF_VEHICLES= [0,1,2,3,np.inf]
+def configure(context):
+
+    context.config("output_path")
+    context.config("output_prefix", "ile_de_france_")
+    context.stage("synthesis.population.trips")
+    context.stage("synthesis.population.enriched")
+
+    context.stage("data.census.filtered", alias = "census")
+    context.stage("data.hts.selected", alias = "hts")
+
+def execute(context):
+    path = context.config("output_path")
+    prefix = context.config("output_prefix")
+
+    df_person_eq = context.stage("synthesis.population.enriched")
+    df_trip_eq = context.stage("synthesis.population.trips")
+
+    df_census = context.stage("census")
+    _, df_hts_person, df_hts_trip = context.stage("hts")
+    # get age class
+    df_person_eq["age_class"] = pd.cut(df_person_eq["age"],AGE_CLASS,include_lowest=True)
+    df_census["age_class"] = pd.cut(df_census["age"],AGE_CLASS,include_lowest=True)
+    df_hts_person["age_class"] = pd.cut(df_hts_person["age"],AGE_CLASS,include_lowest=True)
+
+    # get vehicule class 
+    df_person_eq["vehicles_class"] = pd.cut(df_person_eq["number_of_vehicles"],NUMBER_OF_VEHICLES,right=True,labels=NUMBER_OF_VEHICLES_LABELS)
+    df_census["vehicles_class"] = pd.cut(df_census["number_of_vehicles"],NUMBER_OF_VEHICLES,right=True,labels=NUMBER_OF_VEHICLES_LABELS)
+
+
+    df_eq_depl = pd.merge(df_trip_eq,df_person_eq[["person_id","age_class"]],on=["person_id"])
+    df_hts_depl = pd.merge(df_hts_trip,df_hts_person[["person_id","age_class"]],on=["person_id"])
+    # Age purpose analysis
+    analysis_age_purpose = pd.pivot_table(df_eq_depl,"person_id",index="age_class",columns="following_purpose",aggfunc="count")
+    analysis_age_purpose.to_csv(f"{path}/{prefix}age_purpose.csv")
+
+    # Compare age volume
+    analysis_age_class = pd.concat([df_census.groupby("age_class")["person_id"].count(),df_person_eq.groupby("age_class")["person_id"].count()],axis=1).reset_index()
+    analysis_age_class.columns = ["Age class","INSEE","EQASIM"]
+    analysis_age_class.to_csv(f"{path}/{prefix}age.csv")
+
+    # Compare vehicule volume
+    analysis_vehicles_class = pd.concat([df_census.groupby("vehicles_class")["household_id"].nunique(),df_person_eq.groupby("vehicles_class")["household_id"].nunique()],axis=1).reset_index()
+    analysis_vehicles_class.columns = ["Number of vehicles class","INSEE","EQASIM"]
+    analysis_vehicles_class.to_csv(f"{path}/{prefix}vehicle.csv")
+    
+    # Compare license volume 
+    analysis_license_class = pd.concat([df_hts_person.groupby("has_license")["person_id"].count(),df_person_eq.groupby("has_license")["person_id"].count()],axis=1).reset_index()
+    analysis_vehicles_class.columns = ["Possession of license","HTS","EQASIM"]
+    analysis_vehicles_class["Possession of license"] = analysis_vehicles_class["Possession of license"] == 1
+    analysis_license_class.to_csv(f"{path}/{prefix}license.csv")
+
+    # Compare depl volume
+    analysis_depl = pd.concat([df_hts_depl.groupby("age_class")["person_id"].count(),df_eq_depl.groupby("age_class")["person_id"].count()],axis=1).reset_index()
+    analysis_depl.columns = ["Age class","HTS","EQASIM"]
+    analysis_depl.to_csv(f"{path}/{prefix}deplacement.csv")
+
+    # Compare dist
+    df_hts_trip["routed_distance"] = df_hts_trip["routed_distance"]/1000
+    df_hts_trip["distance_class"] = pd.cut(df_hts_trip["routed_distance"],list(np.arange(50))+[np.inf])
+    analysis_dist = df_hts_trip.groupby("distance_class")["person_id"].count()
+    return analysis_dist
+
+
+
+
+
+
+
+
+
diff --git a/synthesis/output.py b/synthesis/output.py
index 1c47962f..a761d1e6 100644
--- a/synthesis/output.py
+++ b/synthesis/output.py
@@ -8,6 +8,7 @@
 import numpy as np
 
 def configure(context):
+    context.stage("analysis.synthesis.population")
     context.stage("synthesis.population.enriched")
 
     context.stage("synthesis.population.activities")
@@ -260,3 +261,13 @@ def execute(context):
     if "geoparquet" in output_formats:
         path = "%s/%strips.geoparquet" % (output_path, output_prefix)
         df_spatial.to_parquet(path)
+    
+    # Execution analysis
+    df_spatial = df_spatial.to_crs("EPSG:2154")
+
+    df_spatial["distance"] = df_spatial.length
+    df_spatial["distance_class"] = pd.cut(df_spatial["distance"],list(np.arange(50))+[np.inf])
+    analysis_distance = context.stage("analysis.synthesis.population")
+    analysis_distance = pd.concat([analysis_distance,df_spatial.groupby("distance_class")["person_id"].count()],axis=1).reset_index()
+    analysis_distance.columns = ["Distance class","HTS","EQASIM"]
+    analysis_distance.to_csv(f"{output_path}/{output_prefix}distance.csv")
\ No newline at end of file

From f0177e08c9fcc8658dd9f109f22164923be03415 Mon Sep 17 00:00:00 2001
From: Marie Laurent <mlaurent@tellae.fr>
Date: Mon, 30 Sep 2024 16:21:16 +0200
Subject: [PATCH 2/6] feat: add proportion & improvement outputs

---
 analysis/synthesis/population.py | 43 +++++++++++++++++++++-----------
 synthesis/output.py              | 15 +++++++----
 2 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/analysis/synthesis/population.py b/analysis/synthesis/population.py
index e2db75e0..48be0d2f 100644
--- a/analysis/synthesis/population.py
+++ b/analysis/synthesis/population.py
@@ -7,10 +7,12 @@
 
 AGE_CLASS = [0, 10, 14, 17, 25, 50, 65, np.inf]
 NUMBER_OF_VEHICLES= [0,1,2,3,np.inf]
+NAME_AGE_CLASS = ["0-10","11-14","15-17","18-25","26-50","51-65","65+"]
 def configure(context):
 
     context.config("output_path")
     context.config("output_prefix", "ile_de_france_")
+    context.config("sampling_rate")
     context.stage("synthesis.population.trips")
     context.stage("synthesis.population.enriched")
 
@@ -20,16 +22,17 @@ def configure(context):
 def execute(context):
     path = context.config("output_path")
     prefix = context.config("output_prefix")
-
+    sampling_rate = context.config("sampling_rate")
     df_person_eq = context.stage("synthesis.population.enriched")
     df_trip_eq = context.stage("synthesis.population.trips")
 
     df_census = context.stage("census")
     _, df_hts_person, df_hts_trip = context.stage("hts")
+    df_hts_person["person_weight"] *=df_census["weight"].sum()/df_hts_person["person_weight"].sum()
     # get age class
-    df_person_eq["age_class"] = pd.cut(df_person_eq["age"],AGE_CLASS,include_lowest=True)
-    df_census["age_class"] = pd.cut(df_census["age"],AGE_CLASS,include_lowest=True)
-    df_hts_person["age_class"] = pd.cut(df_hts_person["age"],AGE_CLASS,include_lowest=True)
+    df_person_eq["age_class"] = pd.cut(df_person_eq["age"],AGE_CLASS,include_lowest=True,labels=NAME_AGE_CLASS)
+    df_census["age_class"] = pd.cut(df_census["age"],AGE_CLASS,include_lowest=True,labels=NAME_AGE_CLASS)
+    df_hts_person["age_class"] = pd.cut(df_hts_person["age"],AGE_CLASS,include_lowest=True,labels=NAME_AGE_CLASS)
 
     # get vehicule class 
     df_person_eq["vehicles_class"] = pd.cut(df_person_eq["number_of_vehicles"],NUMBER_OF_VEHICLES,right=True,labels=NUMBER_OF_VEHICLES_LABELS)
@@ -37,36 +40,48 @@ def execute(context):
 
 
     df_eq_depl = pd.merge(df_trip_eq,df_person_eq[["person_id","age_class"]],on=["person_id"])
-    df_hts_depl = pd.merge(df_hts_trip,df_hts_person[["person_id","age_class"]],on=["person_id"])
+    df_hts_depl = pd.merge(df_hts_trip,df_hts_person[["person_id","age_class","person_weight"]],on=["person_id"])
     # Age purpose analysis
     analysis_age_purpose = pd.pivot_table(df_eq_depl,"person_id",index="age_class",columns="following_purpose",aggfunc="count")
+    analysis_age_purpose = analysis_age_purpose/sampling_rate
     analysis_age_purpose.to_csv(f"{path}/{prefix}age_purpose.csv")
 
     # Compare age volume
-    analysis_age_class = pd.concat([df_census.groupby("age_class")["person_id"].count(),df_person_eq.groupby("age_class")["person_id"].count()],axis=1).reset_index()
+    analysis_age_class = pd.concat([df_census.groupby("age_class")["weight"].sum(),df_person_eq.groupby("age_class")["person_id"].count()],axis=1).reset_index()
     analysis_age_class.columns = ["Age class","INSEE","EQASIM"]
+    analysis_age_class["Proportion_INSEE"] = analysis_age_class["INSEE"] /df_census["weight"].sum()
+    analysis_age_class["Proportion_EQASIM"] = analysis_age_class["EQASIM"] /len(df_person_eq)
+    analysis_age_class["EQASIM"] = analysis_age_class["EQASIM"]/sampling_rate
     analysis_age_class.to_csv(f"{path}/{prefix}age.csv")
 
     # Compare vehicule volume
     analysis_vehicles_class = pd.concat([df_census.groupby("vehicles_class")["household_id"].nunique(),df_person_eq.groupby("vehicles_class")["household_id"].nunique()],axis=1).reset_index()
     analysis_vehicles_class.columns = ["Number of vehicles class","INSEE","EQASIM"]
-    analysis_vehicles_class.to_csv(f"{path}/{prefix}vehicle.csv")
+    analysis_vehicles_class["INSEE"] = analysis_vehicles_class["INSEE"] / df_census["household_id"].nunique() 
+    analysis_vehicles_class["EQASIM"] = analysis_vehicles_class["EQASIM"] / df_person_eq["household_id"].nunique() 
+    analysis_vehicles_class.to_csv(f"{path}/{prefix}nbr_vehicle.csv")
     
     # Compare license volume 
-    analysis_license_class = pd.concat([df_hts_person.groupby("has_license")["person_id"].count(),df_person_eq.groupby("has_license")["person_id"].count()],axis=1).reset_index()
-    analysis_vehicles_class.columns = ["Possession of license","HTS","EQASIM"]
-    analysis_vehicles_class["Possession of license"] = analysis_vehicles_class["Possession of license"] == 1
+    analysis_license_class = pd.concat([df_hts_person.groupby("has_license")["person_weight"].sum(),df_person_eq.groupby("has_license")["person_id"].count()],axis=1).reset_index()
+    analysis_license_class.columns = ["Possession of license","HTS","EQASIM"]
+    analysis_license_class["Proportion_HTS"] = analysis_license_class["HTS"] /df_hts_person["person_weight"].sum()
+    analysis_license_class["Proportion_EQASIM"] = analysis_license_class["EQASIM"] /len(df_person_eq)
+    analysis_license_class["EQASIM"] = analysis_license_class["EQASIM"]/sampling_rate
     analysis_license_class.to_csv(f"{path}/{prefix}license.csv")
 
     # Compare depl volume
-    analysis_depl = pd.concat([df_hts_depl.groupby("age_class")["person_id"].count(),df_eq_depl.groupby("age_class")["person_id"].count()],axis=1).reset_index()
+    analysis_depl = pd.concat([df_hts_depl.groupby("age_class")["person_weight"].sum(),df_eq_depl.groupby("age_class")["person_id"].count()],axis=1).reset_index()
     analysis_depl.columns = ["Age class","HTS","EQASIM"]
+    analysis_depl["Proportion_HTS"] = analysis_depl["HTS"] /df_hts_depl["person_weight"].sum()
+    analysis_depl["Proportion_EQASIM"] = analysis_depl["EQASIM"] /len(df_eq_depl)
+    analysis_depl["EQASIM"] = analysis_depl["EQASIM"]/sampling_rate
     analysis_depl.to_csv(f"{path}/{prefix}deplacement.csv")
 
     # Compare dist
-    df_hts_trip["routed_distance"] = df_hts_trip["routed_distance"]/1000
-    df_hts_trip["distance_class"] = pd.cut(df_hts_trip["routed_distance"],list(np.arange(50))+[np.inf])
-    analysis_dist = df_hts_trip.groupby("distance_class")["person_id"].count()
+    df_hts_depl["routed_distance"] = df_hts_depl["routed_distance"]/1000
+    df_hts_depl["distance_class"] = pd.cut(df_hts_depl["routed_distance"],list(np.arange(100))+[np.inf])
+    analysis_dist = df_hts_depl.groupby("distance_class")["person_weight"].sum()
+
     return analysis_dist
 
 
diff --git a/synthesis/output.py b/synthesis/output.py
index a761d1e6..7f9d64ef 100644
--- a/synthesis/output.py
+++ b/synthesis/output.py
@@ -8,7 +8,7 @@
 import numpy as np
 
 def configure(context):
-    context.stage("analysis.synthesis.population")
+
     context.stage("synthesis.population.enriched")
 
     context.stage("synthesis.population.activities")
@@ -17,13 +17,14 @@ def configure(context):
     context.stage("synthesis.vehicles.vehicles")
 
     context.stage("synthesis.population.spatial.locations")
-
+    context.stage("analysis.synthesis.population")
     context.stage("documentation.meta_output")
 
     context.config("output_path")
     context.config("output_prefix", "ile_de_france_")
     context.config("output_formats", ["csv", "gpkg"])
-    
+    context.config("sampling_rate")
+
     if context.config("mode_choice", False):
         context.stage("matsim.simulation.prepare")
 
@@ -263,11 +264,15 @@ def execute(context):
         df_spatial.to_parquet(path)
     
     # Execution analysis
+    SAMPLING_RATE =context.config("sampling_rate")
     df_spatial = df_spatial.to_crs("EPSG:2154")
 
-    df_spatial["distance"] = df_spatial.length
-    df_spatial["distance_class"] = pd.cut(df_spatial["distance"],list(np.arange(50))+[np.inf])
+    df_spatial["distance"] = df_spatial.length/1000
+    df_spatial["distance_class"] = pd.cut(df_spatial["distance"],list(np.arange(100))+[np.inf])
     analysis_distance = context.stage("analysis.synthesis.population")
     analysis_distance = pd.concat([analysis_distance,df_spatial.groupby("distance_class")["person_id"].count()],axis=1).reset_index()
     analysis_distance.columns = ["Distance class","HTS","EQASIM"]
+    analysis_distance["Proportion_HTS"] = analysis_distance["HTS"] / analysis_distance["HTS"].sum()
+    analysis_distance["Proportion_EQASIM"] = analysis_distance["EQASIM"] / len(df_spatial)
+    analysis_distance["EQASIM"] = analysis_distance["EQASIM"]/ SAMPLING_RATE
     analysis_distance.to_csv(f"{output_path}/{output_prefix}distance.csv")
\ No newline at end of file

From ac0e0a9e1075217be28c51b2c6267b2f95d40dc3 Mon Sep 17 00:00:00 2001
From: Marie Laurent <mlaurent@tellae.fr>
Date: Thu, 10 Oct 2024 09:37:20 +0200
Subject: [PATCH 3/6] fix : vehicle analysis & output folder

---
 analysis/grid/comparison_flow_volume.py | 13 +++++++--
 analysis/synthesis/population.py        | 38 ++++++++++++++-----------
 synthesis/output.py                     |  5 ++--
 3 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/analysis/grid/comparison_flow_volume.py b/analysis/grid/comparison_flow_volume.py
index b2506ea1..963491dd 100644
--- a/analysis/grid/comparison_flow_volume.py
+++ b/analysis/grid/comparison_flow_volume.py
@@ -1,8 +1,9 @@
 import pandas as pd
 import geopandas as gpd
+import os
 
 import plotly.express as px 
-
+ANALYSIS_FOLDER = "compare_flow_volume"
 
 SAMPLING_RATE = 0.05
 
@@ -84,6 +85,12 @@ def execute(context):
     df_grids = stat_grid(df_trips_comp,df_locations_comp,df_persons_comp,df_grid)
     point = df_grid.unary_union.centroid # a changé avec ploy_dep
     print("Printing grids...")
+
+    # check output folder existence
+    analysis_output_path = os.path.join(context.config("output_path"), ANALYSIS_FOLDER)
+    if not os.path.exists(analysis_output_path):
+        os.mkdir(analysis_output_path)    
+
     for prefix, figure in figures.items():
         df_select_age = df_stats[df_stats["age"].between(figure["min_age"],figure["max_age"])]
         df_select_age = df_select_age.dissolve(by=["id_carr_1km","following_purpose"],aggfunc="count").reset_index()
@@ -103,7 +110,7 @@ def execute(context):
                 df_select  = df_select[df_select["count"] != 0]
                 fig = px.choropleth_mapbox(df_select,geojson=df_select.geometry,locations=df_select.index,color="count", opacity= 0.7,color_continuous_scale='reds',
                                         mapbox_style = 'open-street-map',center=dict(lat= point.y,lon=point.x),title=f"Localisation flow distribution for {prefix} group with {purpose} purpose")
-                fig.write_html(f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html')
+                fig.write_html(f'{analysis_output_path}/{context.config("output_prefix")}{prefix}_{purpose}.html')
             else :
                 df_grids_select = gpd.sjoin(df_grids_select,df_grid,how='right',predicate="contains").fillna(0)
                 df_select = gpd.sjoin(df_select,df_grids_select.drop(columns=[ 'index_left']),how='right',predicate="contains").rename(columns={"count_left":"volume_studied_simu","count_right":"volume_compared_simu"}).fillna(0)
@@ -111,6 +118,6 @@ def execute(context):
                 df_select  = df_select[(df_select["volume_studied_simu"] != 0 )| (df_select["volume_compared_simu"] != 0)]
                 df_select["pourcentage_vol"] = df_select["volume_difference"] / df_select["volume_compared_simu"]
                 px.choropleth_mapbox(df_select,geojson=df_select.geometry,locations=df_select.index,color="volume_difference", opacity= 0.7,color_continuous_scale="picnic", color_continuous_midpoint= 0,hover_name="id_carr_1km_right", hover_data=["volume_studied_simu", "volume_compared_simu","pourcentage_vol"],
-                                        mapbox_style = 'open-street-map',center=dict(lat= point.y,lon=point.x),title=f"Comparison flow distribution with previous simulation for {prefix} group with {purpose} purpose").write_html(f'{context.config("output_path")}/{context.config("output_prefix")}{prefix}_{purpose}.html')
+                                        mapbox_style = 'open-street-map',center=dict(lat= point.y,lon=point.x),title=f"Comparison flow distribution with previous simulation for {prefix} group with {purpose} purpose").write_html(f'{analysis_output_path}/{context.config("output_prefix")}{prefix}_{purpose}.html')
 
             
\ No newline at end of file
diff --git a/analysis/synthesis/population.py b/analysis/synthesis/population.py
index 48be0d2f..a13b2945 100644
--- a/analysis/synthesis/population.py
+++ b/analysis/synthesis/population.py
@@ -1,6 +1,5 @@
-import itertools
-import numba
 
+import os
 import numpy as np
 import pandas as pd
 from  analysis.marginals import NUMBER_OF_VEHICLES_LABELS
@@ -8,6 +7,7 @@
 AGE_CLASS = [0, 10, 14, 17, 25, 50, 65, np.inf]
 NUMBER_OF_VEHICLES= [0,1,2,3,np.inf]
 NAME_AGE_CLASS = ["0-10","11-14","15-17","18-25","26-50","51-65","65+"]
+ANALYSIS_FOLDER = "analysis_population"
 def configure(context):
 
     context.config("output_path")
@@ -20,15 +20,21 @@ def configure(context):
     context.stage("data.hts.selected", alias = "hts")
 
 def execute(context):
-    path = context.config("output_path")
+
+    # check output folder existence
+    analysis_output_path = os.path.join(context.config("output_path"), ANALYSIS_FOLDER)
+    if not os.path.exists(analysis_output_path):
+        os.mkdir(analysis_output_path)
+    
     prefix = context.config("output_prefix")
     sampling_rate = context.config("sampling_rate")
     df_person_eq = context.stage("synthesis.population.enriched")
     df_trip_eq = context.stage("synthesis.population.trips")
 
     df_census = context.stage("census")
-    _, df_hts_person, df_hts_trip = context.stage("hts")
+    df_hts_households, df_hts_person, df_hts_trip = context.stage("hts")
     df_hts_person["person_weight"] *=df_census["weight"].sum()/df_hts_person["person_weight"].sum()
+    df_hts_households["household_weight"] *=df_census["weight"].sum()/df_hts_households["household_weight"].sum()
     # get age class
     df_person_eq["age_class"] = pd.cut(df_person_eq["age"],AGE_CLASS,include_lowest=True,labels=NAME_AGE_CLASS)
     df_census["age_class"] = pd.cut(df_census["age"],AGE_CLASS,include_lowest=True,labels=NAME_AGE_CLASS)
@@ -36,7 +42,7 @@ def execute(context):
 
     # get vehicule class 
     df_person_eq["vehicles_class"] = pd.cut(df_person_eq["number_of_vehicles"],NUMBER_OF_VEHICLES,right=True,labels=NUMBER_OF_VEHICLES_LABELS)
-    df_census["vehicles_class"] = pd.cut(df_census["number_of_vehicles"],NUMBER_OF_VEHICLES,right=True,labels=NUMBER_OF_VEHICLES_LABELS)
+    df_hts_households["vehicles_class"] = pd.cut(df_hts_households["number_of_vehicles"],NUMBER_OF_VEHICLES,right=True,labels=NUMBER_OF_VEHICLES_LABELS)
 
 
     df_eq_depl = pd.merge(df_trip_eq,df_person_eq[["person_id","age_class"]],on=["person_id"])
@@ -44,7 +50,7 @@ def execute(context):
     # Age purpose analysis
     analysis_age_purpose = pd.pivot_table(df_eq_depl,"person_id",index="age_class",columns="following_purpose",aggfunc="count")
     analysis_age_purpose = analysis_age_purpose/sampling_rate
-    analysis_age_purpose.to_csv(f"{path}/{prefix}age_purpose.csv")
+    analysis_age_purpose.to_csv(f"{analysis_output_path}/{prefix}age_purpose.csv")
 
     # Compare age volume
     analysis_age_class = pd.concat([df_census.groupby("age_class")["weight"].sum(),df_person_eq.groupby("age_class")["person_id"].count()],axis=1).reset_index()
@@ -52,14 +58,14 @@ def execute(context):
     analysis_age_class["Proportion_INSEE"] = analysis_age_class["INSEE"] /df_census["weight"].sum()
     analysis_age_class["Proportion_EQASIM"] = analysis_age_class["EQASIM"] /len(df_person_eq)
     analysis_age_class["EQASIM"] = analysis_age_class["EQASIM"]/sampling_rate
-    analysis_age_class.to_csv(f"{path}/{prefix}age.csv")
-
-    # Compare vehicule volume
-    analysis_vehicles_class = pd.concat([df_census.groupby("vehicles_class")["household_id"].nunique(),df_person_eq.groupby("vehicles_class")["household_id"].nunique()],axis=1).reset_index()
-    analysis_vehicles_class.columns = ["Number of vehicles class","INSEE","EQASIM"]
-    analysis_vehicles_class["INSEE"] = analysis_vehicles_class["INSEE"] / df_census["household_id"].nunique() 
-    analysis_vehicles_class["EQASIM"] = analysis_vehicles_class["EQASIM"] / df_person_eq["household_id"].nunique() 
-    analysis_vehicles_class.to_csv(f"{path}/{prefix}nbr_vehicle.csv")
+    analysis_age_class.to_csv(f"{analysis_output_path}/{prefix}age.csv")
+
+    # Compare vehicle volume
+    analysis_vehicles_class = pd.concat([df_hts_households.groupby("vehicles_class")["household_weight"].sum(),df_person_eq.groupby("vehicles_class")["household_id"].nunique()],axis=1).reset_index()
+    analysis_vehicles_class.columns = ["Number of vehicles class","HTS","EQASIM"]
+    analysis_vehicles_class["Proportion_HTS"] = analysis_vehicles_class["HTS"] / df_hts_households["household_weight"].sum() 
+    analysis_vehicles_class["Proportion_EQASIM"] = analysis_vehicles_class["EQASIM"] / df_person_eq["household_id"].nunique() 
+    analysis_vehicles_class.to_csv(f"{analysis_output_path}/{prefix}nbr_vehicle.csv")
     
     # Compare license volume 
     analysis_license_class = pd.concat([df_hts_person.groupby("has_license")["person_weight"].sum(),df_person_eq.groupby("has_license")["person_id"].count()],axis=1).reset_index()
@@ -67,7 +73,7 @@ def execute(context):
     analysis_license_class["Proportion_HTS"] = analysis_license_class["HTS"] /df_hts_person["person_weight"].sum()
     analysis_license_class["Proportion_EQASIM"] = analysis_license_class["EQASIM"] /len(df_person_eq)
     analysis_license_class["EQASIM"] = analysis_license_class["EQASIM"]/sampling_rate
-    analysis_license_class.to_csv(f"{path}/{prefix}license.csv")
+    analysis_license_class.to_csv(f"{analysis_output_path}/{prefix}license.csv")
 
     # Compare depl volume
     analysis_depl = pd.concat([df_hts_depl.groupby("age_class")["person_weight"].sum(),df_eq_depl.groupby("age_class")["person_id"].count()],axis=1).reset_index()
@@ -75,7 +81,7 @@ def execute(context):
     analysis_depl["Proportion_HTS"] = analysis_depl["HTS"] /df_hts_depl["person_weight"].sum()
     analysis_depl["Proportion_EQASIM"] = analysis_depl["EQASIM"] /len(df_eq_depl)
     analysis_depl["EQASIM"] = analysis_depl["EQASIM"]/sampling_rate
-    analysis_depl.to_csv(f"{path}/{prefix}deplacement.csv")
+    analysis_depl.to_csv(f"{analysis_output_path}/{prefix}deplacement.csv")
 
     # Compare dist
     df_hts_depl["routed_distance"] = df_hts_depl["routed_distance"]/1000
diff --git a/synthesis/output.py b/synthesis/output.py
index 2559f199..92affdd1 100644
--- a/synthesis/output.py
+++ b/synthesis/output.py
@@ -6,6 +6,7 @@
 import sqlite3
 import math
 import numpy as np
+from analysis.synthesis.population import ANALYSIS_FOLDER
 
 def configure(context):
 
@@ -274,7 +275,7 @@ def execute(context):
         path = "%s/%strips.geoparquet" % (output_path, output_prefix)
         df_spatial.to_parquet(path)
     
-    # Execution analysis
+    # Output population analysis
     SAMPLING_RATE =context.config("sampling_rate")
     df_spatial = df_spatial.to_crs("EPSG:2154")
 
@@ -286,4 +287,4 @@ def execute(context):
     analysis_distance["Proportion_HTS"] = analysis_distance["HTS"] / analysis_distance["HTS"].sum()
     analysis_distance["Proportion_EQASIM"] = analysis_distance["EQASIM"] / len(df_spatial)
     analysis_distance["EQASIM"] = analysis_distance["EQASIM"]/ SAMPLING_RATE
-    analysis_distance.to_csv(f"{output_path}/{output_prefix}distance.csv")
\ No newline at end of file
+    analysis_distance.to_csv(f"{output_path}/{ANALYSIS_FOLDER}/{output_prefix}distance.csv")
\ No newline at end of file

From 7a12e6aba343d765a8ec60be12a9e83a7d75acdf Mon Sep 17 00:00:00 2001
From: Marie Laurent <mlaurent@tellae.fr>
Date: Mon, 21 Oct 2024 15:44:12 +0200
Subject: [PATCH 4/6] fix : comments & changelog

---
 CHANGELOG.md                     |  1 +
 analysis/synthesis/population.py | 30 +++++++++++++++---------------
 synthesis/output.py              |  2 ++
 3 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 468795dc..b08962ba 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 **Under development**
 
+- feat: add population analysis output
 - feat: add municipality information to households and activities
 - chore: update to `eqasim-java` commit `ece4932`
 - feat: vehicles and vehicle types are now always generated
diff --git a/analysis/synthesis/population.py b/analysis/synthesis/population.py
index a13b2945..ecbbe1e9 100644
--- a/analysis/synthesis/population.py
+++ b/analysis/synthesis/population.py
@@ -45,10 +45,10 @@ def execute(context):
     df_hts_households["vehicles_class"] = pd.cut(df_hts_households["number_of_vehicles"],NUMBER_OF_VEHICLES,right=True,labels=NUMBER_OF_VEHICLES_LABELS)
 
 
-    df_eq_depl = pd.merge(df_trip_eq,df_person_eq[["person_id","age_class"]],on=["person_id"])
-    df_hts_depl = pd.merge(df_hts_trip,df_hts_person[["person_id","age_class","person_weight"]],on=["person_id"])
+    df_eq_travel = pd.merge(df_trip_eq,df_person_eq[["person_id","age_class"]],on=["person_id"])
+    df_hts_travel = pd.merge(df_hts_trip,df_hts_person[["person_id","age_class","person_weight"]],on=["person_id"])
     # Age purpose analysis
-    analysis_age_purpose = pd.pivot_table(df_eq_depl,"person_id",index="age_class",columns="following_purpose",aggfunc="count")
+    analysis_age_purpose = pd.pivot_table(df_eq_travel,"person_id",index="age_class",columns="following_purpose",aggfunc="count")
     analysis_age_purpose = analysis_age_purpose/sampling_rate
     analysis_age_purpose.to_csv(f"{analysis_output_path}/{prefix}age_purpose.csv")
 
@@ -75,18 +75,18 @@ def execute(context):
     analysis_license_class["EQASIM"] = analysis_license_class["EQASIM"]/sampling_rate
     analysis_license_class.to_csv(f"{analysis_output_path}/{prefix}license.csv")
 
-    # Compare depl volume
-    analysis_depl = pd.concat([df_hts_depl.groupby("age_class")["person_weight"].sum(),df_eq_depl.groupby("age_class")["person_id"].count()],axis=1).reset_index()
-    analysis_depl.columns = ["Age class","HTS","EQASIM"]
-    analysis_depl["Proportion_HTS"] = analysis_depl["HTS"] /df_hts_depl["person_weight"].sum()
-    analysis_depl["Proportion_EQASIM"] = analysis_depl["EQASIM"] /len(df_eq_depl)
-    analysis_depl["EQASIM"] = analysis_depl["EQASIM"]/sampling_rate
-    analysis_depl.to_csv(f"{analysis_output_path}/{prefix}deplacement.csv")
-
-    # Compare dist
-    df_hts_depl["routed_distance"] = df_hts_depl["routed_distance"]/1000
-    df_hts_depl["distance_class"] = pd.cut(df_hts_depl["routed_distance"],list(np.arange(100))+[np.inf])
-    analysis_dist = df_hts_depl.groupby("distance_class")["person_weight"].sum()
+    # Compare travel volume
+    analysis_travel = pd.concat([df_hts_travel.groupby("age_class")["person_weight"].sum(),df_eq_travel.groupby("age_class")["person_id"].count()],axis=1).reset_index()
+    analysis_travel.columns = ["Age class","HTS","EQASIM"]
+    analysis_travel["Proportion_HTS"] = analysis_travel["HTS"] /df_hts_travel["person_weight"].sum()
+    analysis_travel["Proportion_EQASIM"] = analysis_travel["EQASIM"] /len(df_eq_travel)
+    analysis_travel["EQASIM"] = analysis_travel["EQASIM"]/sampling_rate
+    analysis_travel.to_csv(f"{analysis_output_path}/{prefix}travel.csv")
+
+    # Compare distance
+    df_hts_travel["routed_distance"] = df_hts_travel["routed_distance"]/1000
+    df_hts_travel["distance_class"] = pd.cut(df_hts_travel["routed_distance"],list(np.arange(100))+[np.inf])
+    analysis_dist = df_hts_travel.groupby("distance_class")["person_weight"].sum()
 
     return analysis_dist
 
diff --git a/synthesis/output.py b/synthesis/output.py
index 92affdd1..810dff3b 100644
--- a/synthesis/output.py
+++ b/synthesis/output.py
@@ -281,6 +281,8 @@ def execute(context):
 
     df_spatial["distance"] = df_spatial.length/1000
     df_spatial["distance_class"] = pd.cut(df_spatial["distance"],list(np.arange(100))+[np.inf])
+
+    # Compare distance 
     analysis_distance = context.stage("analysis.synthesis.population")
     analysis_distance = pd.concat([analysis_distance,df_spatial.groupby("distance_class")["person_id"].count()],axis=1).reset_index()
     analysis_distance.columns = ["Distance class","HTS","EQASIM"]

From 9dd5df21c24f6de9c8b9b67a81f5bcecbe852a4c Mon Sep 17 00:00:00 2001
From: Marie Laurent <mlaurent@tellae.fr>
Date: Mon, 21 Oct 2024 16:44:13 +0200
Subject: [PATCH 5/6] fix: correction with egt

---
 analysis/synthesis/population.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/analysis/synthesis/population.py b/analysis/synthesis/population.py
index ecbbe1e9..a0e7728d 100644
--- a/analysis/synthesis/population.py
+++ b/analysis/synthesis/population.py
@@ -84,7 +84,7 @@ def execute(context):
     analysis_travel.to_csv(f"{analysis_output_path}/{prefix}travel.csv")
 
     # Compare distance
-    df_hts_travel["routed_distance"] = df_hts_travel["routed_distance"]/1000
+    df_hts_travel["routed_distance"] = df_hts_travel["routed_distance"]/1000 if "routed_distance" in  df_hts_travel.columns else df_hts_travel["euclidean_distance"]/1000
     df_hts_travel["distance_class"] = pd.cut(df_hts_travel["routed_distance"],list(np.arange(100))+[np.inf])
     analysis_dist = df_hts_travel.groupby("distance_class")["person_weight"].sum()
 

From a01f600e15583ca05ae29280cf3298d6f0a35214 Mon Sep 17 00:00:00 2001
From: Marie Laurent <mlaurent@tellae.fr>
Date: Wed, 23 Oct 2024 17:10:38 +0200
Subject: [PATCH 6/6] fix: separate analysis from data output & update docs

---
 analysis/synthesis/population.py | 35 ++++++++++++++++++++++++++++----
 docs/population.md               |  8 ++++++++
 synthesis/output.py              | 20 ++----------------
 3 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/analysis/synthesis/population.py b/analysis/synthesis/population.py
index a0e7728d..7a09c782 100644
--- a/analysis/synthesis/population.py
+++ b/analysis/synthesis/population.py
@@ -2,8 +2,9 @@
 import os
 import numpy as np
 import pandas as pd
+import geopandas as gpd
 from  analysis.marginals import NUMBER_OF_VEHICLES_LABELS
-
+from shapely import distance
 AGE_CLASS = [0, 10, 14, 17, 25, 50, 65, np.inf]
 NUMBER_OF_VEHICLES= [0,1,2,3,np.inf]
 NAME_AGE_CLASS = ["0-10","11-14","15-17","18-25","26-50","51-65","65+"]
@@ -13,8 +14,10 @@ def configure(context):
     context.config("output_path")
     context.config("output_prefix", "ile_de_france_")
     context.config("sampling_rate")
+
     context.stage("synthesis.population.trips")
     context.stage("synthesis.population.enriched")
+    context.stage("synthesis.population.spatial.locations")
 
     context.stage("data.census.filtered", alias = "census")
     context.stage("data.hts.selected", alias = "hts")
@@ -30,7 +33,12 @@ def execute(context):
     sampling_rate = context.config("sampling_rate")
     df_person_eq = context.stage("synthesis.population.enriched")
     df_trip_eq = context.stage("synthesis.population.trips")
-
+    df_location_eq = context.stage("synthesis.population.spatial.locations")[["person_id", "activity_index", "geometry"]]
+    
+    df_location_eq = df_location_eq.to_crs("EPSG:2154")
+    df_trip_eq["preceding_activity_index"] = df_trip_eq["trip_index"]
+    df_trip_eq["following_activity_index"] = df_trip_eq["trip_index"] + 1
+    
     df_census = context.stage("census")
     df_hts_households, df_hts_person, df_hts_trip = context.stage("hts")
     df_hts_person["person_weight"] *=df_census["weight"].sum()/df_hts_person["person_weight"].sum()
@@ -47,6 +55,8 @@ def execute(context):
 
     df_eq_travel = pd.merge(df_trip_eq,df_person_eq[["person_id","age_class"]],on=["person_id"])
     df_hts_travel = pd.merge(df_hts_trip,df_hts_person[["person_id","age_class","person_weight"]],on=["person_id"])
+
+    print("Generate tables ...")
     # Age purpose analysis
     analysis_age_purpose = pd.pivot_table(df_eq_travel,"person_id",index="age_class",columns="following_purpose",aggfunc="count")
     analysis_age_purpose = analysis_age_purpose/sampling_rate
@@ -86,9 +96,26 @@ def execute(context):
     # Compare distance
     df_hts_travel["routed_distance"] = df_hts_travel["routed_distance"]/1000 if "routed_distance" in  df_hts_travel.columns else df_hts_travel["euclidean_distance"]/1000
     df_hts_travel["distance_class"] = pd.cut(df_hts_travel["routed_distance"],list(np.arange(100))+[np.inf])
-    analysis_dist = df_hts_travel.groupby("distance_class")["person_weight"].sum()
 
-    return analysis_dist
+    df_spatial = pd.merge(df_trip_eq, df_location_eq.rename(columns = {
+        "activity_index": "preceding_activity_index",
+        "geometry": "preceding_geometry"
+    }), how = "left", on = ["person_id", "preceding_activity_index"])
+
+    df_spatial = pd.merge(df_spatial, df_location_eq.rename(columns = {
+        "activity_index": "following_activity_index",
+        "geometry": "following_geometry"
+    }), how = "left", on = ["person_id", "following_activity_index"])
+    df_spatial["distance"] = df_spatial.apply(lambda x:distance( x["preceding_geometry"],x["following_geometry"])/1000,axis=1)
+    df_spatial["distance_class"] = pd.cut(df_spatial["distance"],list(np.arange(100))+[np.inf])
+
+    analysis_distance = pd.concat([df_hts_travel.groupby("distance_class")["person_weight"].sum(),df_spatial.groupby("distance_class")["person_id"].count()],axis=1).reset_index()
+    analysis_distance.columns = ["Distance class","HTS","EQASIM"]
+    analysis_distance["Proportion_HTS"] = analysis_distance["HTS"] / analysis_distance["HTS"].sum()
+    analysis_distance["Proportion_EQASIM"] = analysis_distance["EQASIM"] / len(df_spatial)
+    analysis_distance["EQASIM"] = analysis_distance["EQASIM"]/ sampling_rate
+    analysis_distance.to_csv(f"{analysis_output_path}/{prefix}distance.csv")
+
 
 
 
diff --git a/docs/population.md b/docs/population.md
index bb479bf3..4bf64326 100644
--- a/docs/population.md
+++ b/docs/population.md
@@ -450,3 +450,11 @@ folder as: `{output_prefix}_{age group}_{trip pupose}.html`
 
 Note:
 With `analysis_from_file` at False, the last synthetic population is studied by default. Also if `output_prefix` and `comparison_file_prefix` refer to the same outputs, or `comparison_file_prefix` is not specified, then only a volume visualisation of this particular population is produced.
+
+
+### Comparing the population to source data
+
+Using the population pipeline in the `analysis` directory, you can generate several tables comparing the composition of the synthetic population to the source data. The tables currently generated compare: population volume by age range, household volume by number of vehicles, population volume with and without a driving license, trip volume by age range, and trip volume by length.
+In addition, a table of population volume by age range and trip purpose is created from the synthetic population only.
+
+To use this pipeline, you must already have created a synthetic population. Then open `config.yml` and add the `analysis.synthesis.population` stage to the `run` section.
\ No newline at end of file
diff --git a/synthesis/output.py b/synthesis/output.py
index 810dff3b..a1561b48 100644
--- a/synthesis/output.py
+++ b/synthesis/output.py
@@ -18,7 +18,7 @@ def configure(context):
     context.stage("synthesis.vehicles.vehicles")
 
     context.stage("synthesis.population.spatial.locations")
-    context.stage("analysis.synthesis.population")
+
     context.stage("documentation.meta_output")
 
     context.config("output_path")
@@ -273,20 +273,4 @@ def execute(context):
         clean_gpkg(path)
     if "geoparquet" in output_formats:
         path = "%s/%strips.geoparquet" % (output_path, output_prefix)
-        df_spatial.to_parquet(path)
-    
-    # Output population analysis
-    SAMPLING_RATE =context.config("sampling_rate")
-    df_spatial = df_spatial.to_crs("EPSG:2154")
-
-    df_spatial["distance"] = df_spatial.length/1000
-    df_spatial["distance_class"] = pd.cut(df_spatial["distance"],list(np.arange(100))+[np.inf])
-
-    # Compare distance 
-    analysis_distance = context.stage("analysis.synthesis.population")
-    analysis_distance = pd.concat([analysis_distance,df_spatial.groupby("distance_class")["person_id"].count()],axis=1).reset_index()
-    analysis_distance.columns = ["Distance class","HTS","EQASIM"]
-    analysis_distance["Proportion_HTS"] = analysis_distance["HTS"] / analysis_distance["HTS"].sum()
-    analysis_distance["Proportion_EQASIM"] = analysis_distance["EQASIM"] / len(df_spatial)
-    analysis_distance["EQASIM"] = analysis_distance["EQASIM"]/ SAMPLING_RATE
-    analysis_distance.to_csv(f"{output_path}/{ANALYSIS_FOLDER}/{output_prefix}distance.csv")
\ No newline at end of file
+        df_spatial.to_parquet(path)
\ No newline at end of file