-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path03_aggregation_buffers.py
502 lines (374 loc) · 21.9 KB
/
03_aggregation_buffers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
# ==================================================================================================================
# DISSERTATION
# SECTION 02: Python processing
# This script picks up at the end of 01B, and uses the output from the first set of QGIS process.
# This script is intended to be run in full as a single python file.
# Author: J Post
# ==================================================================================================================
# 0. IMPORT PACKAGES
import os
import glob
import subprocess
import sys
import json
import math
import time
import pandas as pd
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# from osgeo import gdal
# from pygeos import Geometry
from globals import * # Imports the filepaths defined in globals.py
print('Packages imported.\n')
# Start timer
start_time = time.time()
# ==================================================================================================================
# 1. LOAD DATA
# Read in the output files from script 01B:
# 1. WorldPop points
# 2. Landcover (cropland)
# 3. GHSL rural polygons
time_11s = time.time()
if sfmt == '.shp':
gdf_pop = gpd.read_file(pop_points)
gdf_crops = gpd.read_file(cropland_poly_dissolved)
gdf_ghsl = gpd.read_file(ghsl_poly_dissolved)
elif sfmt == '.feather':
gdf_pop = gpd.read_feather(pop_points)
gdf_crops = gpd.read_feather(cropland_poly_dissolved)
gdf_ghsl = gpd.read_feather(ghsl_poly_dissolved)
districts_shp = gpd.read_file(districts_filepath)
ag_workers = pd.read_csv(agworkers_filepath
, dtype = {'State code':str, 'District code':str}
)
print('Input data files loaded.\n')
timestamp(time_11s)
# ==================================================================================================================
# 2. JOIN WORLDPOP POINTS TO LAND COVER POLYGONS
# ===========
# 2.1 Join WorldPop to GHSL rural areas
time_21s = time.time()
# Join points to GHSL
pop_points_rural = gdf_pop.sjoin(gdf_ghsl, how = 'inner', predicate='within')
pop_points_rural = pop_points_rural.drop(columns='index_right')
# Export the filtered rural population
# pop_points_rural.to_feather(pop_points_rural_path)
print(f'Rural population points gdf created.\n')
timestamp(time_21s)
# pop_points_rural.plot(column='raster_value')
# plt.show()
# ===========
# 2.2 Join WorldPop to DynamicWorld cropland areas
time_22s = time.time()
# Join points to cropland
pop_joined_dw = gdf_pop.sjoin(gdf_crops, how='inner', predicate='within')
# pop_points_cropland = pop_joined_dw.loc[pop_joined_dw['cropland']==1] # remove here because now filtered in script 01C
pop_points_cropland = pop_joined_dw.drop(columns='index_right')
pop_points_cropland.to_feather(pop_points_cropland_path)
print(f'Cropland population points gdf created and exported to {sfmt}.\n')
timestamp(time_22s)
# pop_points_cropland.plot(column='raster_value')
# plt.show()
# ==================================================================================================================
# 3. CALCULATE DISTRICT LEVEL POPULATION
# Next steps:
# 1. Break down state counts (Karnataka) into districts -> Need to decide if this is necessary? What is the most accurate?
# 2. Sum point population estimates by district, to get a total district population estimate from WorldPop
# 3. Sum rural point population estimates by district, to get a rural district population estimate from WorldPop
# 4. Sum cropland point population estimates by district, to get a cropland population district estimate from WorldPop
# 5. Sum rural-crop point population estimates by district, to get final ADP estimate -> Will this be any different than value in Step (4)?
# ===========
# 3.1 Join census agworkers to district boundaries
time_31s = time.time()
ag_workers_jn = districts_shp.merge(ag_workers, how='left', left_on='pc11_d_id', right_on='District code', sort=True
# , indicator=True, validate='one_to_many' # Creates a '_merge' column, for quality checking
)
ag_workers_jn.head()
print('Census ADP counts joined to district boundaries.\n')
timestamp(time_31s)
# ===========
# 3.2 Join WorldPop points to district boundaries
time_32s = time.time()
# This joins the attributes of the points to the polygons they fall within
pop_jn_districts = gdf_pop.sjoin(districts_shp, how='inner', predicate='within')
# Dissolve points to calculate aggregated population for district
dissolve_df = pop_jn_districts[['raster_value', 'geometry', 'pc11_d_id', 'd_name']]
sum_pop_districts = dissolve_df.dissolve(by = 'pc11_d_id', as_index=False, aggfunc={'raster_value':'sum',
'd_name':'first'})
sum_pop_districts['raster_value'] = sum_pop_districts['raster_value'].round() # remove unnecessary decimals
# Export the worldpop points by district
sum_pop_districts.to_feather(sum_pop_districts_path)
print(f'Pop points joined to district boundaries and exported to {sfmt}.\n')
timestamp(time_32s)
# ===========
# 3.3 Join WorldPop RURAL points to district boundaries
time_33s = time.time()
# This joins the attributes of the points to the polygons they fall within
rupop_jn_districts = pop_points_rural.sjoin(districts_shp, how='inner', predicate='within')
# Dissolve points to calculate aggregated population for district
dissolve_df = rupop_jn_districts[['raster_value', 'geometry', 'pc11_d_id', 'd_name']]
sum_rupop_districts = dissolve_df.dissolve(by = 'pc11_d_id', as_index=False, aggfunc={'raster_value':'sum',
'd_name':'first'})
sum_rupop_districts['raster_value'] = sum_rupop_districts['raster_value'].round()
# Export the rural points by district
sum_rupop_districts.to_feather(sum_rupop_districts_path)
print(f'Rural pop points joined to district boundaries and export to {sfmt}.\n')
timestamp(time_33s)
# ===========
# 3.4 Join WorldPop CROPLAND points to district boundaries
time_34s = time.time()
# This joins the attributes of the points to the polygons they fall within
crpop_jn_districts = pop_points_cropland.sjoin(districts_shp, how='inner', predicate='within')
# Dissolve points to calculate aggregated population for district
dissolve_df = crpop_jn_districts[['raster_value', 'geometry', 'pc11_d_id', 'd_name']]
sum_crpop_districts = dissolve_df.dissolve(by = 'pc11_d_id', as_index=False, aggfunc={'raster_value':'sum',
'd_name':'first'})
sum_crpop_districts['raster_value'] = sum_crpop_districts['raster_value'].round()
# Export the cropland points by district
sum_crpop_districts.to_feather(sum_crpop_districts_path)
print(f'Cropland pop points joined to district boundaries and exported to {sfmt}.\n')
timestamp(time_34s)
# ==================================================================================================================
# 4. COLLATE OUTPUT INTO SINGLE GEODATAFRAME
# Input files:
# 1. Census population counts by district
# 2. Census agricultural workers by district >>> use combo of first 2 to calculate Ag Workers Density (per km2)
# 3. WorldPop (all points) aggregated count by district
# 4. WorldPop (rural points) aggregated count by district
# 5. WorldPop (cropland points) aggregated count by district
# 6. Cropland area by district
# 7. Rural area by district
masterdf = ag_workers_jn[['pc11_s_id', 'pc11_d_id', 'd_name', 'geometry', 'District code',
'Population', 'Area sq km', 'Population per sq km',
'ADP1', 'ADP2', 'ADP3', 'ADP4', 'Total workers', 'ADP5']]
join_worldpop_all = sum_pop_districts[['pc11_d_id', 'raster_value']]
join_worldpop_all = join_worldpop_all.rename(columns={'raster_value':'worldpop'})
masterdf = masterdf.merge(join_worldpop_all, how='left', on=['pc11_d_id'])
join_worldpop_rural = sum_rupop_districts[['pc11_d_id', 'raster_value']]
join_worldpop_rural = join_worldpop_rural.rename(columns={'raster_value':'worldpop_rural'})
masterdf = masterdf.merge(join_worldpop_rural, how='left', on=['pc11_d_id'])
join_worldpop_crop = sum_crpop_districts[['pc11_d_id', 'raster_value']]
join_worldpop_crop = join_worldpop_crop.rename(columns={'raster_value':'worldpop_crop'})
masterdf = masterdf.merge(join_worldpop_crop, how='left', on=['pc11_d_id'])
df_cropland_area = pd.read_csv(cropland_area_path, dtype = {'pc11_s_id':str, 'pc11_d_id':str})
masterdf = masterdf.merge(df_cropland_area, how='left', on=['pc11_d_id'])
df_rural_area = pd.read_csv(rural_area_path, dtype = {'pc11_s_id':str, 'pc11_d_id':str})
masterdf = masterdf.merge(df_rural_area, how='left', on=['pc11_d_id'], suffixes=(None,'_y'))
masterdf.drop(columns='district_area_y', inplace=True)
# ==================================================================================================================
# 5. CALCULATE DIFFERENCE IN POPULATION ESTIMATES
# Columns to compute:
# 0. Difference between Worldpop total population and Census total population
# 1. Difference between Worldpop cropland population and ADP1
# 2. Difference between Worldpop cropland population and ADP2
# 3. Difference between Worldpop cropland population and ADP3
# 4. Difference between Worldpop cropland population and ADP4
# 5. Difference between Worldpop cropland population and ADP5
# TODO: recalculate percentage changes to follow Sri Lanka method
# 1. Calculate the ADPc as a proportion of total population
# 2. Calculate the ADPa as a proportion of total population
# 3. Calculate the difference between [1] and [2]
# Calculate difference between Worldpop total population and Census total population
masterdf['d_poptotals'] = masterdf['worldpop'] - masterdf['Population']
masterdf['d_pc0'] = round(100 - masterdf['Population']/masterdf['worldpop']*100,2)
# Calculate rural population as a % of total population
masterdf['rural_pctotal'] = masterdf['worldpop_rural']/masterdf['worldpop']*100
# Calculate ADPa as a % of total population
masterdf['ADPa_pctotal'] = masterdf['worldpop_crop']/masterdf['Population']*100
# Calculate difference between Worldpop cropland population and ADP1
masterdf['ADPc1_pctotal'] = masterdf['ADP1']/masterdf['Population']*100
masterdf['d_pc1'] = masterdf['ADPc1_pctotal'] - masterdf['ADPa_pctotal']
# Calculate difference between Worldpop cropland population and ADP2
masterdf['ADPc2_pctotal'] = masterdf['ADP2']/masterdf['Population']*100
masterdf['d_pc2'] = masterdf['ADPc2_pctotal'] - masterdf['ADPa_pctotal']
# Calculate difference between Worldpop cropland population and ADP3
masterdf['ADPc3_pctotal'] = masterdf['ADP3']/masterdf['Population']*100
masterdf['d_pc3'] = masterdf['ADPc3_pctotal'] - masterdf['ADPa_pctotal']
# Calculate difference between Worldpop cropland population and ADP4
masterdf['ADPc4_pctotal'] = masterdf['ADP4']/masterdf['Population']*100
masterdf['d_pc4'] = masterdf['ADPc4_pctotal'] - masterdf['ADPa_pctotal']
# Calculate difference between Worldpop cropland population and ADP5
masterdf['ADPc5_pctotal'] = masterdf['ADP5']/masterdf['Population']*100
masterdf['d_pc5'] = masterdf['ADPc5_pctotal'] - masterdf['ADPa_pctotal']
# Calculate difference between ADPc5 and Rural population
# If negative, then the estimated ADP surpasses entire rural population of district
masterdf['d_rural_adp'] = masterdf['worldpop_rural'] - masterdf['ADP5']
dlist = ['d_poptotals', 'd_pc1', 'd_pc2', 'd_pc3','d_pc4', 'd_pc5']
masterdf.drop(columns='geometry', inplace=True)
# masterdf.columns
masterdf.head()
# Add column to flag districts that need a buffer
# Use defined function with conditional rules: categorise_buffer
# Apply conditional rows to each row of masterdf
if ADPcn == 'ADPc3':
masterdf['need_buffer'] = masterdf.apply(lambda row: categorise_buffer(row, 'd_pc3'), axis=1)
elif ADPcn == 'ADPc5':
masterdf['need_buffer'] = masterdf.apply(lambda row: categorise_buffer(row, 'd_pc5'), axis=1)
elif ADPcn == 'ADPc1':
masterdf['need_buffer'] = masterdf.apply(lambda row: categorise_buffer(row, 'd_pc1'), axis=1)
elif ADPcn == 'ADPc2':
masterdf['need_buffer'] = masterdf.apply(lambda row: categorise_buffer(row, 'd_pc2'), axis=1)
elif ADPcn == 'ADPc4':
masterdf['need_buffer'] = masterdf.apply(lambda row: categorise_buffer(row, 'd_pc4'), axis=1)
# Identify districts where the ADPc5 > worldpop_rural (more agricultural population than total rural population).
# or districts with no rural population (worldpop_rural = NaN)
masterdf["d_rural_adp"] = masterdf["d_rural_adp"].fillna(-999)
masterdf['need_buffer'][masterdf['d_rural_adp'] < 0] = 'ineligible'
masterdf['need_buffer'][masterdf['d_pc5'].isna()] = 'ineligible'
# Remove ineligible districts and save in separate csv
ineligibledf = masterdf[masterdf['need_buffer'] == 'ineligible']
ineg_list = ineligibledf.index.values.tolist()
masterdf = masterdf.drop(index=ineg_list)
# Export masterdf to csv
masterdf.to_csv(masterdf_path, index=False)
ineligibledf.to_csv(ineligibledf_path, index=False)
print('Master results file exported to csv.\n')
# ==================================================================================================================
# 6. BUFFER ITERATION
# =====================
# 6.1 Define a function to create a buffer and recalculate the ADP estimate
# Where:
# 1. district_shp = districts_shp
# 2. crops_shp = gdf_crops
# 3. rural_points = pop_points_rural
# 4. district_code = string format of district code
# 5. buffer_radius = set accordingly; default 50m
# 6. buffer_type = 'enlarge' or 'reduce'
def generate_buffer(districts_shp, crops_shp, rural_points, district_code, buffer_radius, buffer_type):
"""
Generate a buffer around cropland area in district
...
Arguments
---------
districts_shp : polygon of district boundaries
crops_shp : polygon of cropland in state (or district)
rural_points : vector grid of population points in rural areas
district_code : census designated identifier of district
buffer_radius : radius (in absolute m) of buffer to be generated
buffer_type : takes one of 'enlarge', 'subtract', 'unchanged'
Returns
-------
check_buffer : GeoDataFrame with 1 row, containing buffer details tied to district geometry
d_buffer_gdf : GeoDataFrame with 1 row, containing polygon of buffered zone
"""
time_buff = time.time()
# Define district boundaries
district_boundary = districts_shp.loc[districts_shp['pc11_d_id'] == district_code]
crop_by_district_boundary = gpd.overlay(crops_shp, district_boundary, how='intersection')
# Convert to a Geoseries
district_series = crop_by_district_boundary['geometry']
# Convert buffer metre input into degrees (WGS84)
if buffer_type == 'enlarge':
degrees = buffer_radius * (0.00001/1.11) # Transformation required if CRS is geographic (WGS84, EPSG:4326)
elif buffer_type == 'subtract':
buffer_radius = buffer_radius * -1 # negative buffer radius = reduction in size
degrees = buffer_radius * (0.00001/1.11)
elif buffer_type == 'unchanged':
degrees = 0 # No buffer required (type == 'unchanged')
print('Creating ' + str(buffer_radius) + 'm buffers on crop lands for district: ' + district_code)
# Calculate buffer
d_buffer = district_series.buffer(degrees)
d_buffer.name = 'geometry'
d_buffer_gdf = gpd.GeoDataFrame(d_buffer, crs="EPSG:4326", geometry='geometry')
d_buffer_gdf['pc11_d_id'] = district_code
# Join rural points to buffer zone
ru_points_buffer = rural_points.sjoin(d_buffer_gdf, how='inner', predicate='within')
ru_points_buffer = ru_points_buffer.drop(columns='index_right')
# Dissolve points to calculate aggregated population for district
sum_buffer_points = ru_points_buffer.dissolve(as_index=False, aggfunc={'raster_value':'sum'})
sum_buffer_points['raster_value'] = sum_buffer_points['raster_value'].round()
# Add district code and buffer radius to geodataframe
sum_buffer_points['pc11_d_id'] = district_code
sum_buffer_points['buffer_r'] = buffer_radius
print(f'Rural pop points joined to buffer area and new ADP calculated.')
# Merge census data from masterdf
if ADPcn == 'ADPc3':
ADPcn_pctotal = 'ADPc3_pctotal'
elif ADPcn == 'ADPc5':
ADPcn_pctotal = 'ADPc5_pctotal'
elif ADPcn == 'ADPc1':
ADPcn_pctotal = 'ADPc1_pctotal'
elif ADPcn == 'ADPc2':
ADPcn_pctotal = 'ADPc2_pctotal'
elif ADPcn == 'ADPc4':
ADPcn_pctotal = 'ADPc4_pctotal'
check_buffer = sum_buffer_points.merge(masterdf[['pc11_d_id', 'Population', 'crop_area_pc', 'rural_area_pc', ADPcn_pctotal, 'need_buffer']]
, how='left', on='pc11_d_id')
check_buffer['buffered_pctotal'] = check_buffer['raster_value']/check_buffer['Population']*100
check_buffer['d_bufferedpc'] = check_buffer[ADPcn_pctotal] - check_buffer['buffered_pctotal']
# Next apply conditional rows to each row of masterdf
check_buffer['revised_buffer'] = check_buffer.apply(lambda row: buffer_logic(row, 'need_buffer', 'd_bufferedpc'), axis=1)
timestamp(time_buff)
return check_buffer, d_buffer_gdf
# # TEST RUN
# sum_buffer_gdf = generate_buffer(districts_shp, gdf_crops, pop_points_rural, '583', 50, 'subtract')
# # TEST ZERO BUFFER
# sum_buffer_gdf, buffer_poly = generate_buffer(districts_shp, gdf_crops, pop_points_rural, '518', 50, 'unchanged')
# # TEST INPUT 'UNCHANGED'
# sum_buffer_gdf, buffer_poly = generate_buffer(districts_shp, gdf_crops, pop_points_rural, '582', -50, 'enlarge')
# =====================
# 6.2 Run function over set of districts and combine outputs into a single GeoDataframe
time_buffer = time.time()
# Create a dictionary of the districts
buffer_dict = dict(zip(masterdf['pc11_d_id'], masterdf['need_buffer']))
for i in ['518', '244']:
if i in buffer_dict:
del buffer_dict[i] # Manually remove Mumbai; cropland unaligned with rural grid, causing buffer error.
# TEST LOOP THROUGH DICTIONARY
# buffer_dict = {'518': 'unchanged', '520': 'enlarge', '521': 'enlarge'}
# Initialize a list to store individual GeoDataFrames
buffer_gdf_list = []
buffer_poly_list = []
# 2. Run for loop (loop through each district)
for key, value in buffer_dict.items():
buffer_radius = 50
iteration_count = 0
# Run initial buffer function
sum_buffer_gdf, buffer_poly = generate_buffer(districts_shp, gdf_crops, pop_points_rural, key, buffer_radius, value)
print('1st Run: District ' + key + ' value is ' + value + ' and result: ' + sum_buffer_gdf['revised_buffer'].item())
print('d_bufferedpc: ' + str(round(sum_buffer_gdf['d_bufferedpc'].item(),2)))
# 3. Run while loop (iterate over a single district until threshold is met)
# Iteration count is designed to present unseen errors from looping forever; buffer process will cut off after 5 runs
while (abs(sum_buffer_gdf['d_bufferedpc'].item()) > 5) and (iteration_count <= iteration_max):
if value in ['unchanged', 'ineligible']: # Ensures districts that are initially within threshold or are ineligible
break # do not run through the buffer iteration process
elif sum_buffer_gdf['revised_buffer'].item() in ['enlarge', 'subtract']:
buffer_radius = round(buffer_radius + (buffer_radius/2))
sum_buffer_gdf, buffer_poly = generate_buffer(districts_shp, gdf_crops, pop_points_rural, key, buffer_radius, value)
elif sum_buffer_gdf['revised_buffer'].item() == 'overenlarged':
buffer_radius = round(buffer_radius - (buffer_radius/2))
sum_buffer_gdf, buffer_poly = generate_buffer(districts_shp, gdf_crops, pop_points_rural, key, buffer_radius, 'enlarge')
elif sum_buffer_gdf['revised_buffer'].item() == 'oversubtracted':
buffer_radius = round(buffer_radius - (buffer_radius/2))
sum_buffer_gdf, buffer_poly = generate_buffer(districts_shp, gdf_crops, pop_points_rural, key, buffer_radius, 'subtract')
print('District ' + key + ' value is ' + value + ' and result: ' + sum_buffer_gdf['revised_buffer'].item())
print('d_bufferedpc: ' + str(round(sum_buffer_gdf['d_bufferedpc'].item(),2)))
iteration_count = iteration_count + 1
# Append the single-row GeoDataFrame to the list
buffer_gdf_list.append(sum_buffer_gdf)
buffer_poly_list.append(buffer_poly)
print('*** District ' + key + ' complete. ***\n Iterations: ' + str(iteration_count) + '\n')
timestamp(time_buffer)
# Concatenate all GeoDataFrames in the list
buffer_gdf = pd.concat(buffer_gdf_list)
buffer_poly_state = pd.concat(buffer_poly_list)
# Reset index
buffer_gdf.reset_index(drop=True, inplace=True)
buffer_poly_state.reset_index(drop=True, inplace=True)
# Display the final GeoDataFrame
print(buffer_gdf)
# Export buffer_gdf to .feather
buffer_gdf.to_feather(buffergdf_path)
# Export buffer_df to csv
buffer_df = pd.DataFrame(buffer_gdf.drop(columns = 'geometry'))
buffer_df.to_csv(bufferdf_path, index=False)
# Join buffer radius values to district polygons
buffer_map = districts_shp.merge(buffer_df, on='pc11_d_id')
# Export buffer map and buffer polygon to .shp (for QGIS mapping)
buffer_map.to_file(buffermap_path)
buffer_poly_state.to_file(buffer_poly_path)
print('\nScript complete.\n')
timestamp(start_time)