-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.qmd
309 lines (232 loc) · 11.8 KB
/
data.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
---
title: "Data"
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
```
```{r pacakges}
library(tidyverse)
library(usmap)
library(sf)
library(infer)
library(moderndive)
```
```{r Tidying and Wrangling Data}
##Tidying Data
#creating dfs from .csv files
police_locals <- read_csv("data/police-locals.csv")
agencies <- read_csv("data/fatal-police-shootings-agencies.csv")
shootings <- read_csv("data/fatal-police-shootings-data.csv")
#removing old `city` tag from data set that we created when decatenated the city names
police_locals <- police_locals |>
select(-city_old)
# creating `agencies` df with just police departments
agencies <- agencies |>
filter(grepl("department", tolower(name))) |>
filter(!grepl("county", tolower(name)))
#creating binned categorical account of if shooting victim was `armed`
shootings <- shootings |>
mutate(armed = case_when(is.na(armed_with) ~ "NO",
armed_with == "unarmed" ~ "NO",
armed_with == "unknown" ~ "NO",
armed_with == "undetermined" ~ "NO",
armed_with == "gun" ~ "YES",
armed_with == "knife" ~ "YES",
armed_with == "blunt_object" ~ "YES",
armed_with == "other" ~ "YES",
armed_with == "replica" ~ "YES",
armed_with == "vehicle" ~ "YES"))
#creating df with only agency `names`, `id`, and `state`
agencies_ids <- agencies |>
select(name, id, state)
#creating df with `city`, `agency`, and `state` info for each shooting
shooting_agencies <- shootings |>
select(city, agency_ids, state)
#changing `shooting` var in `shooting_agencies` df to numeric
shooting_agencies$agency_ids <- as.numeric(shootings$agency_ids)
#creating df with `city` and `state` info for each agency by joining `agencies_ids` and `shooting_agencies`
agencies_w_cities <- agencies_ids |>
left_join(shooting_agencies, by = c("id" = "agency_ids", "state" = "state")) |>
drop_na(city) |>
distinct(id, .keep_all = TRUE)
#creating df with census data for each agency by joining `agencies_w_cities` and `police_locals`
agencies_census <- agencies_w_cities |>
full_join(police_locals, by = c("city" = "city", "state" = "state")) |>
drop_na(police_force_size) |>
distinct(id, .keep_all = TRUE) |>
mutate(majority = if_else(all >= 0.5, "TRUE", "FALSE"))
#creating df of only shootings involving agencies within `agencies` df
shootings_case <- shootings |>
right_join(agencies_census, by = c("city" = "city", "state" = "state")) |>
select(-agency_ids) |>
rename(agency_ids = id.y, id = id.x, agency = name.y, victim = name.x) |>
select(-location_precision, -race_source)
```
```{r Counting Shootings}
#count shootings by agency
shootings_by_agency <- shootings_case |>
count(agency)
#find top 25 agencies with the most shootings
top_25_agencies <- shootings_by_agency |>
slice_max(n, n = 25)
# visulize top 25 agencies with the most shootings
ggplot(data = top_25_agencies,
mapping = aes(x = agency, y = n)) +
geom_col() +
theme(axis.text.x = element_text(angle = 75,
vjust = 1,
hjust = 1,
margin = margin(t = 5, b = 5)))
```
```{r shot_map}
#mapping Locations of Police-Involved Shootings between 2015 and 2023
#load geo-viz libraries
library(ggmap)
library(maps)
library(mapdata)
#create blank map
usa <- map_data("usa")
states <- map_data("state")
#add locations of shootings to maps
shot_map <- ggplot(data = states) +
geom_polygon(aes(x = long, y = lat, fill = group, group = group), color = "white") +
coord_fixed(1.3) +
guides(fill=FALSE) + # do this to leave off the color legend
geom_point(data = shootings_case, aes(x = longitude, y = latitude), color = "black", size = .2) +
geom_point(data = shootings_case, aes(x = longitude, y = latitude), color = "red", size = .1) +
labs(title = "Locations of Police-Involved Shootings between 2015 and 2023",
captions = "This is only includes cities where we have agency census data.",
x = "Longitude",
y = "Latitude")
```
```{r }
#creating df with total shootings per agency and census data
agencies_census <- agencies_census |>
left_join(shootings_by_agency, by = c("name" = "agency"))
#prelim visualization of relationship between percentage of officer residency and number of fatal shootings per agency
agencies_census |>
ggplot(mapping = aes(x = all, y = n)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE)
#creating visualization of comparison Shootings in Cities where a Majority/Minority of Officers Reside
p0 <- shootings_case |>
ggplot(aes(x = majority, fill = armed)) +
geom_bar() +
labs(title = "Shootings in Cities where a Majority of Officers Reside",
caption = "This is only includes shootings where we have agency census data.",
x = "Does a majority a of the total police force live in the city?",
y = "Number of fatal shootings",
fill = "Victim Armed?")
p0
#calculate mean number of shootings per agency in cities where a majority of officers reside in the city
majority_mean <- shootings_case |>
filter(majority == TRUE) |>
count(agency) |>
summarize(maj_mean = mean(n))
#calculate mean number of shootings per agency in cities where a minority of officers reside in the city
minority_mean <- shootings_case |>
filter(majority == FALSE) |>
count(agency) |>
summarize(min_mean = mean(n))
#calculate a difference in means between the `majority` and `minority`
diff_in_means <- majority_mean - minority_mean
diff_in_means
#tidy table
knitr::kable(head(diff_in_means))
```
```{r}
#fit single linear regression model for correlation between percentage of officer residency and number of fatal shootings per agency
fit <- lm(n ~ all, data = agencies_census)
fit
#tidy `fit`
p1 <- get_regression_table(fit)
knitr::kable(head(p1))
#add `armed` and `majority` to `shootings_by_agency` df
shootings_by_agency_census <- shootings_case |>
group_by(agency) |>
count(armed) |>
drop_na(n, armed) |>
right_join(agencies_census, by = c("agency" = "name")) |>
distinct(armed, .keep_all = TRUE)
shootings_by_agency_census <- shootings_by_agency_census |>
select(n.x, armed, all)
#fit multiple linear regression model for correlation between percentage of officer residency and victim armament and number of fatal shootings per agency
fit_multi <- lm(n.x ~ all + armed, data = shootings_by_agency_census)
fit_multi
#tidy `fit_multi`
p2 <- get_regression_table(fit_multi)
knitr::kable(head(p2))
#visualize polynomial relationship between percentage of officer residency and number of fatal shootings per agency
ggplot(data = shootings_by_agency_census, aes(x = all, y = n.x)) +
geom_jitter(width = 0.10, height = 0, alpha = 0.45) +
geom_smooth(method = "lm", formula = y ~ poly(x, 2), se = TRUE) +
labs(title = "Number of Shootings on a Scale of Police Force Residency",
x = "Percentage of the total police force that lives in the city",
y = "Number of fatal shootings in that city")
#visualize polynomial relationship between percentage of officer residency and victim armament and number of fatal shootings per agency
ggplot(data = shootings_by_agency_census, aes(x = all, y = n.x, color = armed)) +
geom_jitter(width = 0.10, height = 0, alpha = 0.45) +
geom_smooth(method = "lm", formula = y ~ poly(x, 2), se = TRUE) +
labs(title = "Number of Shootings on a Scale of Police Force Residency",
x = "Percentage of the total police force that lives in the city",
y = "Number of fatal shootings in that city",
color = "Victim Armed?")
```
The model equation for `fit` is:
\[ \text{Number of Fatal Shootings (n)} = 35.7782 - 0.5874 \times \text{Percentage of Officer Residency (all)} \]
Interpretation:
- The intercept, $35.7782$, is the estimated number of fatal shootings when the percentage of officer in-city residency (`all`) is $0$. For each one-unit increase in the percentage of officer residency, the number of fatal shootings is expected to decrease by $0.5874$ ($-0.5874$) units, assuming all other factors remain constant.
This model suggests that there is a negative association between the percentage of officer residency and the number of fatal shootings. However, it's important to interpret the results in the context of your data and consider potential confounding factors, like whether or not the victim was armed.
The model equation for `fit_multi` considering victim armament (`armed`) is:
\[ \text{# of Fatal Shootings (n.x)} = 4.117 + 1.211 \times \text{Percentage of Officer Residency (all)} + 24.921 \times \text{Armed (YES)} \]
- The intercept, $4.117$, is the estimated number of fatal shootings where the percentage of officer in-city residency (`all`) is $0$ and the victim was un-armed. For each one-unit increase in the percentage of in-city officer residency compared to the total force (`all`), we expect an increase of $1.211$ fatal shootings, assuming the victim's armament status (`armedYES`) remains constant.
- The coefficient for 'armedYES', $24.921$, indicates that the victim is armed (`armed` is `YES`), we expect an increase of $24.921$ fatal shootings compared to when the victim is not armed (`armed` is `No`), assuming the percentage of officer residency (`all`) remains constant.
In summary, the model suggests that the percentage of officer residency and whether the victim is armed are associated with the number of fatal shootings per agency even as we control for victim armament. However, as correlation does not imply causation, and other factors not included in the model may influence the outcomes.
```{r Hypothesis Testing for Diff in Mean Total Fatal Shootings between Residency Prop}
#generate null distribution
null_dist <- agencies_census |>
specify(n ~ majority) |>
hypothesize(null = "independence") |>
generate(reps = 1000, type = "permute") |>
calculate(stat = "diff in means", order = c("TRUE", "FALSE"))
#compute observed test statistic
test_stat <- agencies_census |>
specify(n ~ majority) |>
calculate(stat = "diff in means", order = c("TRUE", "FALSE"))
#visualize p-value
null_dist |>
visualize() +
shade_p_value(obs_stat = test_stat, direction = "less")
#compute p-value
null_dist |>
get_p_value(obs_stat = test_stat, direction = "less")
```
Inference for a Difference in Means
- $H_0$: The mean total number of fatal shootings per agencies does not differ based on if a majority of the officers live in the city or not.
- $H_A$: The mean total number of fatal shootings per agencies is fewer in cities where a majority of the officers live in the city then cities where they do not.
-- $H_0 : \mu_{maj} − \mu_{min} = 0$, or equivalently $H_0 : \mu_{maj} = \mu_{min}$ -- $H_A : \mu_{maj} − \mu_{min} < 0$, or equivalently $H_A : \mu_{maj} < \mu_{min}$
```{r Hypothesis Testing for Correlation between Total Fatal Shootings and Residency Prop}
#generate null distribution
null_dist_cor <- agencies_census |>
specify(n ~ white) |>
hypothesize(null = "independence") |>
generate(reps = 1000, type = "permute") |>
calculate(stat = "correlation")
#compute observed test statistic
test_stat_cor <- agencies_census |>
specify(n ~ white) |>
calculate(stat = "correlation")
test_stat_cor
#visualize p-value
null_dist_cor |>
visualize() +
shade_p_value(obs_stat = test_stat, direction = "two.sided")
#compute p-value
null_dist_cor |>
get_p_value(obs_stat = test_stat, direction = "two.sided")
```
Inference for a Correlation
- $H_O$: There is no relationship between percentage of the total police force that lives in the city they serve and number of fatal shootings.
- $H_A$: There is a relationship between percentage of the total police force that lives in the city they serve and number of fatal shootings.
- $H_0 : \rho = 0$
- $H_0 : \rho \neq 0$