diff --git a/10.visualize-gender.Rmd b/10.visualize-gender.Rmd index a005ea7..bef07f7 100644 --- a/10.visualize-gender.Rmd +++ b/10.visualize-gender.Rmd @@ -65,21 +65,13 @@ iscb_pubmed <- iscb_gender_df %>% values_to = "probabilities" ) %>% filter(!is.na(probabilities)) %>% - group_by(type, year, gender) %>% - mutate( - pmc_citations_year = mean(adjusted_citations), - weight = adjusted_citations / pmc_citations_year, - weighted_probs = probabilities * weight - # weight = 1 - ) + group_by(type, year, gender) iscb_pubmed_sum <- iscb_pubmed %>% summarise( # n = n(), - mean_prob = mean(weighted_probs), - # mean_prob = mean(probabilities, na.rm = T), - # sd_prob = sd(probabilities, na.rm = T), - se_prob = sqrt(var(probabilities) * sum(weight^2) / (sum(weight)^2)), + mean_prob = mean(probabilities, na.rm = T), + se_prob = sd(probabilities, na.rm = T), # n = mean(n), me_prob = alpha_threshold * se_prob, .groups = "drop" @@ -102,7 +94,7 @@ fig_1 <- iscb_pubmed_sum %>% # group_by(year, type, gender) %>% gender_breakdown("main", fct_rev(type)) fig_1 -ggsave("figs/gender_breakdown.png", fig_1, width = 5, height = 2.5) +ggsave("figs/gender_breakdown.png", fig_1, width = 5, height = 2.5, dpi = 600) ggsave("figs/gender_breakdown.svg", fig_1, width = 5, height = 2.5) ``` @@ -131,8 +123,8 @@ fig_1d <- iscb_pubmed %>% ) %>% group_by(type2, year, gender) %>% summarise( - mean_prob = mean(weighted_probs), - se_prob = sqrt(var(probabilities) * sum(weight^2) / (sum(weight)^2)), + mean_prob = mean(probabilities), + se_prob = sd(probabilities)/sqrt(n()), me_prob = alpha_threshold * se_prob, .groups = "drop" ) %>% @@ -173,7 +165,7 @@ iscb_pubmed_sum %>% ```{r echo = F} get_p <- function(inte, colu) { broom::tidy(inte) %>% - filter(term == "weighted_probs") %>% + filter(term == "probabilities") %>% pull(colu) %>% sprintf("%0.5g", .) } @@ -181,7 +173,7 @@ get_p <- function(inte, colu) { ```{r} iscb_lm <- iscb_pubmed %>% - filter(gender == "probability_female", !is.na(weighted_probs)) %>% + filter(gender == "probability_female", !is.na(probabilities)) %>% mutate(type = as.factor(type)) %>% mutate(type = type %>% relevel(ref = "Pubmed authors")) ``` @@ -189,33 +181,33 @@ iscb_lm <- iscb_pubmed %>% ```{r} scaled_iscb <- iscb_lm %>% filter(year(year) >= 2002) -# scaled_iscb$s_prob <- scale(scaled_iscb$weighted_probs, scale = F) +# scaled_iscb$s_prob <- scale(scaled_iscb$probabilities, scale = F) # scaled_iscb$s_year <- scale(scaled_iscb$year, scale = F) -main_lm <- glm(type ~ year + weighted_probs, +main_lm <- glm(type ~ year + probabilities, data = scaled_iscb, # %>% mutate(year = as.factor(year)) family = "binomial" ) broom::tidy(main_lm) inte_lm <- glm( - # type ~ scale(year, scale = F) * scale(weighted_probs, scale = F), + # type ~ scale(year, scale = F) * scale(probabilities, scale = F), # type ~ s_year * s_prob, - type ~ year * weighted_probs, + type ~ year * probabilities, data = scaled_iscb, # %>% mutate(year = as.factor(year)) family = "binomial" ) broom::tidy(inte_lm) anova(main_lm, inte_lm, test = "Chisq") # mean(scaled_iscb$year) -# mean(scaled_iscb$weighted_probs) +# mean(scaled_iscb$probabilities) ``` The two groups of scientists did not have a significant association with the gender predicted from fore names (_P_ = `r get_p(main_lm, 'p.value')`). Interaction terms do not predict `type` over and above the main effect of gender probability and year. ```{r include=FALSE, eval=FALSE} -# inte_lm <- glm(type ~ (year * weighted_probs), +# inte_lm <- glm(type ~ (year * probabilities), # data = iscb_lm, # family = 'binomial') ``` diff --git a/11.visualize-name-origins.Rmd b/11.visualize-name-origins.Rmd index 2b9de29..b2de730 100644 --- a/11.visualize-name-origins.Rmd +++ b/11.visualize-name-origins.Rmd @@ -96,17 +96,12 @@ iscb_pubmed_oth <- iscb_nat_df %>% values_to = "probabilities" ) %>% filter(!is.na(probabilities)) %>% - group_by(type, year, region) %>% - mutate( - pmc_citations_year = mean(adjusted_citations), - weight = adjusted_citations / pmc_citations_year, - weighted_probs = probabilities * weight - ) + group_by(type, year, region) iscb_pubmed_sum_oth <- iscb_pubmed_oth %>% summarise( - mean_prob = mean(weighted_probs), - se_prob = sqrt(var(probabilities) * sum(weight^2) / (sum(weight)^2)), + mean_prob = mean(probabilities), + se_prob = sd(probabilities)/sqrt(n()), me_prob = alpha_threshold * se_prob, .groups = "drop" ) @@ -127,7 +122,7 @@ for (conf in my_confs) { iscb_nat[[i]] <- iscb_pubmed_oth %>% filter(region != "OtherCategories", type != "Pubmed authors" & journal == conf) %>% group_by(type, year, region, journal) %>% - summarise(mean_prob = mean(weighted_probs), .groups = "drop") + summarise(mean_prob = mean(probabilities), .groups = "drop") } ``` @@ -169,7 +164,7 @@ fig_4b <- iscb_pubmed_sum_oth %>% fig_4 <- cowplot::plot_grid(fig_4a, fig_4b, labels = "AUTO", ncol = 1, rel_heights = c(1.3, 1)) fig_4 -ggsave("figs/region_breakdown.png", fig_4, width = 6.7, height = 5.5) +ggsave("figs/region_breakdown.png", fig_4, width = 6.7, height = 5.5, dpi = 600) ggsave("figs/region_breakdown.svg", fig_4, width = 6.7, height = 5.5) ``` @@ -185,7 +180,7 @@ iscb_lm <- iscb_pubmed_oth %>% type = as.factor(type) %>% relevel(ref = "Pubmed authors") ) main_lm <- function(regioni) { - glm(type ~ year + weighted_probs, + glm(type ~ year + probabilities, data = iscb_lm %>% filter(region == regioni, !is.na(probabilities), year(year) >= 2002), family = "binomial" @@ -193,9 +188,9 @@ main_lm <- function(regioni) { } inte_lm <- function(regioni) { - glm(type ~ year * weighted_probs, + glm(type ~ year * probabilities, data = iscb_lm %>% - filter(region == regioni, !is.na(weighted_probs), year(year) >= 2002), + filter(region == regioni, !is.na(probabilities), year(year) >= 2002), family = "binomial" ) } @@ -215,7 +210,7 @@ Interaction terms do not predict `type` over and above the main effect of name o ```{r echo = F} get_p <- function(i, colu) { broom::tidy(main_list[[i]]) %>% - filter(term == "weighted_probs") %>% + filter(term == "probabilities") %>% pull(colu) } @@ -326,21 +321,16 @@ iscb_pubmed_oth_lag <- iscb_nat_df %>% values_to = "probabilities" ) %>% filter(!is.na(probabilities), year(year) >= 2002) %>% - group_by(type, year, region) %>% - mutate( - pmc_citations_year = mean(adjusted_citations), - weight = adjusted_citations / pmc_citations_year, - weighted_probs = probabilities * weight - ) + group_by(type, year, region) iscb_lm_lag <- iscb_pubmed_oth_lag %>% ungroup() %>% mutate(type = as.factor(type) %>% relevel(ref = "Pubmed authors")) main_lm <- function(regioni) { - glm(type ~ year + weighted_probs, + glm(type ~ year + probabilities, data = iscb_lm_lag %>% - filter(region == regioni, !is.na(weighted_probs)), + filter(region == regioni, !is.na(probabilities)), family = "binomial" ) } diff --git a/12.analyze-affiliation.Rmd b/12.analyze-affiliation.Rmd index e700614..e85fc4f 100644 --- a/12.analyze-affiliation.Rmd +++ b/12.analyze-affiliation.Rmd @@ -293,7 +293,7 @@ enrichment_plot_right <- plot_obs_exp_right %>% enrichment_plot <- cowplot::plot_grid(enrichment_plot_left, enrichment_plot_right, rel_widths = c(1, 1.3)) enrichment_plot -ggsave('figs/enrichment-plot.png', enrichment_plot, width = 5.5, height = 3.5) +ggsave('figs/enrichment-plot.png', enrichment_plot, width = 5.5, height = 3.5, dpi = 600) ``` diff --git a/14.us-name-origin.Rmd b/14.us-name-origin.Rmd index 06ca206..ef737ed 100644 --- a/14.us-name-origin.Rmd +++ b/14.us-name-origin.Rmd @@ -70,17 +70,12 @@ iscb_pubmed_oth <- iscb_nat_df %>% values_to = "probabilities" ) %>% filter(!is.na(probabilities)) %>% - group_by(type, year, region) %>% - mutate( - pmc_citations_year = mean(adjusted_citations), - weight = adjusted_citations / pmc_citations_year, - weighted_probs = probabilities * weight - ) + group_by(type, year, region) iscb_pubmed_sum_oth <- iscb_pubmed_oth %>% summarise( - mean_prob = mean(weighted_probs), - se_prob = sqrt(var(probabilities) * sum(weight^2) / (sum(weight)^2)), + mean_prob = mean(probabilities), + se_prob = sd(probabilities)/sqrt(n()), me_prob = alpha_threshold * se_prob, .groups = "drop" ) @@ -119,7 +114,7 @@ fig_us_name_originb <- iscb_pubmed_sum_oth %>% fig_us_name_origin <- cowplot::plot_grid(fig_us_name_origina, fig_us_name_originb, labels = "AUTO", ncol = 1, rel_heights = c(1.3, 1)) fig_us_name_origin -ggsave("figs/us_name_origin.png", fig_us_name_origin, width = 6.5, height = 5.5) +ggsave("figs/us_name_origin.png", fig_us_name_origin, width = 6.5, height = 5.5, dpi = 600) ggsave("figs/us_name_origin.svg", fig_us_name_origin, width = 6.5, height = 5.5) ``` @@ -134,17 +129,17 @@ iscb_lm <- iscb_pubmed_oth %>% type = relevel(as.factor(type), ref = "Pubmed authors") ) main_lm <- function(regioni) { - glm(type ~ year + weighted_probs, + glm(type ~ year + probabilities, data = iscb_lm %>% - filter(region == regioni, !is.na(weighted_probs), year(year) >= 2002), + filter(region == regioni, !is.na(probabilities), year(year) >= 2002), family = "binomial" ) } inte_lm <- function(regioni) { - glm(type ~ weighted_probs * year, + glm(type ~ probabilities * year, data = iscb_lm %>% - filter(region == regioni, !is.na(weighted_probs), year(year) >= 2002), + filter(region == regioni, !is.na(probabilities), year(year) >= 2002), family = "binomial" ) } @@ -165,7 +160,7 @@ Interaction terms do not predict `type` over and above the main effect of name o ```{r echo = F} get_exp <- function(i, colu) { broom::tidy(main_list[[i]]) %>% - filter(term == "weighted_probs") %>% + filter(term == "probabilities") %>% pull(colu) } diff --git a/_output.yaml b/_output.yaml index ea2b6fe..0681b68 100644 --- a/_output.yaml +++ b/_output.yaml @@ -3,3 +3,4 @@ html_document: toc: true toc_float: true code_download: true + dpi: 600 diff --git a/docs/091.draw-roc.html b/docs/091.draw-roc.html index 524ce17..387df08 100644 --- a/docs/091.draw-roc.html +++ b/docs/091.draw-roc.html @@ -1741,35 +1741,22 @@
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
-## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
-## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
-## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
-## ✓ readr 1.4.0 ✓ forcats 0.5.0
-## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
-## x dplyr::filter() masks stats::filter()
-## x dplyr::lag() masks stats::lag()
-# still need to install caret for the calibration function because tidymodels's
-# probably hasn't published this yet
-library(caret)
## Loading required package: lattice
+library(tidyverse)
+# still need to install caret for the calibration function because tidymodels's
+# probably hasn't published this yet
+library(caret)
+
+source('utils/r-utils.R')
+theme_set(theme_bw())
roc_df <- read_tsv('https://raw.githubusercontent.com/greenelab/wiki-nationality-estimate/7c22d0a5f661ce5aeb785215095deda40973ff17/models/NamePrism_roc_curves.tsv') %>%
+ rename('region' = category) %>%
+ # recode_region_letter() %>%
+ recode_region() %>%
+ group_by(region) %>%
+ mutate(Sensitivity = tpr, Specificity = 1-fpr, dSens = c(abs(diff(1-tpr)), 0)) %>%
+ ungroup()
##
-## Attaching package: 'caret'
-## The following object is masked from 'package:purrr':
-##
-## lift
-
-roc_df <- read_tsv('https://raw.githubusercontent.com/greenelab/wiki-nationality-estimate/7c22d0a5f661ce5aeb785215095deda40973ff17/models/NamePrism_roc_curves.tsv') %>%
- rename('region' = category) %>%
- # recode_region_letter() %>%
- recode_region() %>%
- group_by(region) %>%
- mutate(Sensitivity = tpr, Specificity = 1-fpr, dSens = c(abs(diff(1-tpr)), 0)) %>%
- ungroup()
##
-## ── Column specification ────────────────────────────────────────────────────────
+## ── Column specification ─────────────────────────────────────────
## cols(
## fpr = col_double(),
## tpr = col_double(),
@@ -1780,38 +1767,38 @@ Name origin prediction method performance
## ℹ Unknown levels in `f`: OtherCategories
## ℹ Input `region` is `fct_recode(...)`.
## Warning: Unknown levels in `f`: OtherCategories
-auc_df <- roc_df %>%
- group_by(region) %>%
- # add_count() %>%
- summarise(auc = sum((1 - fpr) * dSens),
- n = n()) %>%
- arrange(desc(auc)) %>%
- mutate(auc_pct = 100 * auc,
- reg_auc = paste0(region, ', AUC = ', round(auc_pct, 1), '%'))
auc_df <- roc_df %>%
+ group_by(region) %>%
+ # add_count() %>%
+ summarise(auc = sum((1 - fpr) * dSens),
+ n = n()) %>%
+ arrange(desc(auc)) %>%
+ mutate(auc_pct = 100 * auc,
+ reg_auc = paste0(region, ', AUC = ', round(auc_pct, 1), '%'))
## `summarise()` ungrouping output (override with `.groups` argument)
-# region_levels <- c('Celtic English', 'European', 'East Asian', 'Hispanic', 'South Asian', 'Muslim', 'Israeli', 'African')
-region_levels <- paste(c('Celtic/English', 'European', 'East Asian', 'Hispanic', 'South Asian', 'Arabic', 'Hebrew', 'African', 'Nordic', 'Greek'), 'names')
-region_levels_let <- toupper(letters[1:8])
-region_cols <- c('#b3de69', '#fdb462', '#bc80bd', '#8dd3c7', '#fccde5', '#ffffb3', '#ccebc5', '#bebada', '#80b1d3', '#fb8072')
-
-fig_3a <- roc_df %>%
- left_join(auc_df, by = 'region') %>%
- ggplot(aes(x = Sensitivity, y = Specificity, color = fct_relevel(reg_auc, as.character(auc_df$reg_auc)))) +
- scale_color_manual(values = region_cols) +
- geom_step(size = 1, alpha = 0.8) +
- coord_fixed() +
- scale_x_reverse(breaks = seq(1, 0, -0.2), labels = scales::percent) +
- scale_y_continuous(breaks = seq(0, 1, 0.2), labels = scales::percent, limits = c(NA, 1.05)) +
- theme(legend.position = c(0.62, 0.42),
- legend.title = element_blank(),
- legend.text.align = 1,
- legend.text = element_text(size = 7),
- legend.margin = margin(-0.2, 0.2, 0.2, 0, unit='cm'))
predictions_df <- read_tsv('https://raw.githubusercontent.com/greenelab/wiki-nationality-estimate/7c22d0a5f661ce5aeb785215095deda40973ff17/data/NamePrism_results_test.tsv') %>%
- mutate(y_true = as.factor(truth)) %>%
- select(-truth)
# region_levels <- c('Celtic English', 'European', 'East Asian', 'Hispanic', 'South Asian', 'Muslim', 'Israeli', 'African')
+region_levels <- paste(c('Celtic/English', 'European', 'East Asian', 'Hispanic', 'South Asian', 'Arabic', 'Hebrew', 'African', 'Nordic', 'Greek'), 'names')
+region_levels_let <- toupper(letters[1:8])
+region_cols <- c('#b3de69', '#fdb462', '#bc80bd', '#8dd3c7', '#fccde5', '#ffffb3', '#ccebc5', '#bebada', '#80b1d3', '#fb8072')
+
+fig_3a <- roc_df %>%
+ left_join(auc_df, by = 'region') %>%
+ ggplot(aes(x = Sensitivity, y = Specificity, color = fct_relevel(reg_auc, as.character(auc_df$reg_auc)))) +
+ scale_color_manual(values = region_cols) +
+ geom_step(size = 1, alpha = 0.8) +
+ coord_fixed() +
+ scale_x_reverse(breaks = seq(1, 0, -0.2), labels = scales::percent) +
+ scale_y_continuous(breaks = seq(0, 1, 0.2), labels = scales::percent, limits = c(NA, 1.05)) +
+ theme(legend.position = c(0.62, 0.42),
+ legend.title = element_blank(),
+ legend.text.align = 1,
+ legend.text = element_text(size = 7),
+ legend.margin = margin(-0.2, 0.2, 0.2, 0, unit='cm'))
predictions_df <- read_tsv('https://raw.githubusercontent.com/greenelab/wiki-nationality-estimate/7c22d0a5f661ce5aeb785215095deda40973ff17/data/NamePrism_results_test.tsv') %>%
+ mutate(y_true = as.factor(truth)) %>%
+ select(-truth)
##
-## ── Column specification ────────────────────────────────────────────────────────
+## ── Column specification ─────────────────────────────────────────
## cols(
## African = col_double(),
## CelticEnglish = col_double(),
@@ -1825,83 +1812,79 @@ Name origin prediction method performance
## SouthAsian = col_double(),
## truth = col_character()
## )
-regs <- predictions_df %>% select(African:SouthAsian) %>% colnames()
-cal_dfs <- list()
-for (reg in regs) {
- pred_reg <- predictions_df %>%
- mutate(y_true_bin = as.factor((y_true == reg))) %>%
- rename(prob = reg) %>%
- select(y_true_bin, prob)
-
- cal_dfs[[reg]] <- calibration(y_true_bin ~ prob,
- data = pred_reg,
- cuts = 11,
- class = 'TRUE')$data %>%
- mutate(region = reg)
-}
## Note: Using an external vector in selections is ambiguous.
-## ℹ Use `all_of(reg)` instead of `reg` to silence this message.
-## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
-## This message is displayed once per session.
-
-## calibModelVar bin Percent Lower Upper Count midpoint
-## 1 prob [0,0.0909] 0.973038 0.9061138 1.043559 777 4.545455
-## 2 prob (0.0909,0.182] 12.715105 10.7555376 14.887108 133 13.636364
-## 3 prob (0.182,0.273] 20.620843 16.9791523 24.652952 93 22.727273
-## 4 prob (0.273,0.364] 29.924242 24.4643714 35.841394 79 31.818182
-## 5 prob (0.364,0.455] 35.897436 29.7515540 42.405681 84 40.909091
-## 6 prob (0.455,0.545] 38.536585 31.8402892 45.569554 79 50.000000
-## 7 prob (0.545,0.636] 45.637584 37.4635833 53.988516 68 59.090909
-## 8 prob (0.636,0.727] 56.953642 48.6544756 64.974492 86 68.181818
-## 9 prob (0.727,0.818] 61.421320 54.2394760 68.253900 121 77.272727
-## 10 prob (0.818,0.909] 71.764706 66.6571343 76.488532 244 86.363636
-## 11 prob (0.909,1] 97.209555 96.8524649 97.536348 8953 95.454545
-## region
-## 1 EastAsian
-## 2 EastAsian
-## 3 EastAsian
-## 4 EastAsian
-## 5 EastAsian
-## 6 EastAsian
-## 7 EastAsian
-## 8 EastAsian
-## 9 EastAsian
-## 10 EastAsian
-## 11 EastAsian
-fig_3b <- bind_rows(cal_dfs) %>%
- recode_region() %>%
- ggplot(aes(x = midpoint/100, y = Percent/100, color = fct_relevel(region, as.character(auc_df$region)))) +
- geom_abline(slope = 1, linetype = 2, alpha = 0.5) +
- scale_y_continuous(labels = scales::percent_format(accuracy = 20L), breaks = seq(0, 1, 0.2), limits = c(-0.005, 1.045)) +
- scale_x_continuous(labels = scales::percent_format(accuracy = 20L), breaks = seq(0, 1, 0.2), limits = c(0, 1)) +
- coord_fixed() +
- geom_point() +
- geom_line() +
- scale_color_manual(values = region_cols) +
- theme(legend.position = 'None') +
- labs(x = 'Predicted probability', y = 'Fraction of names')
regs <- predictions_df %>% select(African:SouthAsian) %>% colnames()
+cal_dfs <- list()
+for (reg in regs) {
+ pred_reg <- predictions_df %>%
+ mutate(y_true_bin = as.factor((y_true == reg))) %>%
+ rename(prob = reg) %>%
+ select(y_true_bin, prob)
+
+ cal_dfs[[reg]] <- calibration(y_true_bin ~ prob,
+ data = pred_reg,
+ cuts = 11,
+ class = 'TRUE')$data %>%
+ mutate(region = reg)
+}
+cal_dfs$EastAsian
## calibModelVar bin Percent Lower Upper
+## 1 prob [0,0.0909] 0.973038 0.9061138 1.043559
+## 2 prob (0.0909,0.182] 12.715105 10.7555376 14.887108
+## 3 prob (0.182,0.273] 20.620843 16.9791523 24.652952
+## 4 prob (0.273,0.364] 29.924242 24.4643714 35.841394
+## 5 prob (0.364,0.455] 35.897436 29.7515540 42.405681
+## 6 prob (0.455,0.545] 38.536585 31.8402892 45.569554
+## 7 prob (0.545,0.636] 45.637584 37.4635833 53.988516
+## 8 prob (0.636,0.727] 56.953642 48.6544756 64.974492
+## 9 prob (0.727,0.818] 61.421320 54.2394760 68.253900
+## 10 prob (0.818,0.909] 71.764706 66.6571343 76.488532
+## 11 prob (0.909,1] 97.209555 96.8524649 97.536348
+## Count midpoint region
+## 1 777 4.545455 EastAsian
+## 2 133 13.636364 EastAsian
+## 3 93 22.727273 EastAsian
+## 4 79 31.818182 EastAsian
+## 5 84 40.909091 EastAsian
+## 6 79 50.000000 EastAsian
+## 7 68 59.090909 EastAsian
+## 8 86 68.181818 EastAsian
+## 9 121 77.272727 EastAsian
+## 10 244 86.363636 EastAsian
+## 11 8953 95.454545 EastAsian
+fig_3b <- bind_rows(cal_dfs) %>%
+ recode_region() %>%
+ ggplot(aes(x = midpoint/100, y = Percent/100, color = fct_relevel(region, as.character(auc_df$region)))) +
+ geom_abline(slope = 1, linetype = 2, alpha = 0.5) +
+ scale_y_continuous(labels = scales::percent_format(accuracy = 20L), breaks = seq(0, 1, 0.2), limits = c(-0.005, 1.045)) +
+ scale_x_continuous(labels = scales::percent_format(accuracy = 20L), breaks = seq(0, 1, 0.2), limits = c(0, 1)) +
+ coord_fixed() +
+ geom_point() +
+ geom_line() +
+ scale_color_manual(values = region_cols) +
+ theme(legend.position = 'None') +
+ labs(x = 'Predicted probability', y = 'Fraction of names')
## Warning: Problem with `mutate()` input `region`.
## ℹ Unknown levels in `f`: OtherCategories
## ℹ Input `region` is `fct_recode(...)`.
## Warning: Unknown levels in `f`: OtherCategories
-n_obs <- sum(auc_df$n)
-short_regs <- auc_df$region %>%
- as.character() %>%
- gsub(' names', '', .)
-
-heat_dat <- predictions_df %>%
- group_by(y_true) %>%
- summarise_if(is.numeric, mean, na.rm = T) %>%
- ungroup() %>%
- pivot_longer(- y_true, names_to = 'region', values_to = 'pred_prob') %>%
- recode_region() %>%
- rename('reg_hat' = region, 'region' = y_true) %>%
- recode_region() %>%
- rename('y_true' = region, 'region' = reg_hat) %>%
- left_join(auc_df, by = 'region') %>%
- mutate(scale_pred_prob = log2((pred_prob)/(n/n_obs)),
- region = region %>% gsub(' names', '', .) %>% fct_relevel(short_regs),
- y_true = y_true %>% gsub(' names', '', .) %>% fct_relevel(short_regs))
n_obs <- sum(auc_df$n)
+short_regs <- auc_df$region %>%
+ as.character() %>%
+ gsub(' names', '', .)
+
+heat_dat <- predictions_df %>%
+ group_by(y_true) %>%
+ summarise_if(is.numeric, mean, na.rm = T) %>%
+ ungroup() %>%
+ pivot_longer(- y_true, names_to = 'region', values_to = 'pred_prob') %>%
+ recode_region() %>%
+ rename('reg_hat' = region, 'region' = y_true) %>%
+ recode_region() %>%
+ rename('y_true' = region, 'region' = reg_hat) %>%
+ left_join(auc_df, by = 'region') %>%
+ mutate(scale_pred_prob = log2((pred_prob)/(n/n_obs)),
+ region = region %>% gsub(' names', '', .) %>% fct_relevel(short_regs),
+ y_true = y_true %>% gsub(' names', '', .) %>% fct_relevel(short_regs))
## Warning: Problem with `mutate()` input `region`.
## ℹ Unknown levels in `f`: OtherCategories
## ℹ Input `region` is `fct_recode(...)`.
@@ -1910,31 +1893,31 @@ ## Warning: Unknown levels in `f`: OtherCategories
-fig_3c <- ggplot(heat_dat, aes(y_true, region,
- fill = scale_pred_prob)) +
- geom_tile() +
- scale_fill_gradientn(
- colours = c("#3CBC75FF","white","#440154FF"),
- values = scales::rescale(
- c(min(heat_dat$scale_pred_prob),
- 0,
- max(heat_dat$scale_pred_prob)))
- ) +
- coord_fixed() +
- labs(x = 'True region', y = 'Predicted region', fill = bquote(log[2]~'FC')) +
- theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
- legend.position = 'top',
- legend.key.height = unit(0.2, 'cm'),
- legend.title = element_text(vjust = 1),
- legend.margin = margin(0, 0,0, -1, unit='cm'),
- axis.title.x = element_text(margin = margin(t = 27, r = 0, b = 0, l = 0)),
- axis.title.y = element_text(margin = margin(t = 0, r = 15, b = 0, l = 0)))
-
-fig_3 <- cowplot::plot_grid(fig_3a, fig_3b, fig_3c, labels = 'AUTO', nrow = 1,
- rel_widths = c(2,2,1.6))
-fig_3
fig_3c <- ggplot(heat_dat, aes(y_true, region,
+ fill = scale_pred_prob)) +
+ geom_tile() +
+ scale_fill_gradientn(
+ colours = c("#3CBC75FF","white","#440154FF"),
+ values = scales::rescale(
+ c(min(heat_dat$scale_pred_prob),
+ 0,
+ max(heat_dat$scale_pred_prob)))
+ ) +
+ coord_fixed() +
+ labs(x = 'True region', y = 'Predicted region', fill = bquote(log[2]~'FC')) +
+ theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
+ legend.position = 'top',
+ legend.key.height = unit(0.2, 'cm'),
+ legend.title = element_text(vjust = 1),
+ legend.margin = margin(0, 0,0, -1, unit='cm'),
+ axis.title.x = element_text(margin = margin(t = 27, r = 0, b = 0, l = 0)),
+ axis.title.y = element_text(margin = margin(t = 0, r = 15, b = 0, l = 0)))
+
+fig_3 <- cowplot::plot_grid(fig_3a, fig_3b, fig_3c, labels = 'AUTO', nrow = 1,
+ rel_widths = c(2,2,1.6))
+fig_3
##
-## ── Column specification ────────────────────────────────────────────────────────
+## ── Column specification ─────────────────────────────────────────
## cols(
## fore_name = col_character(),
## last_name = col_character(),
@@ -4334,7 +4334,7 @@ General data read-in
publication_date = ymd(publication_date, truncated = 2)) %>%
filter(year(publication_date) < 2020)
##
-## ── Column specification ────────────────────────────────────────────────────────
+## ── Column specification ─────────────────────────────────────────
## cols(
## pmid = col_double(),
## pmcid = col_character(),
@@ -4358,7 +4358,7 @@ General data read-in
left_join(select(all_full_names, - full_name), by = c('fore_name', 'last_name')) %>%
filter(year(year) < 2020, conference != 'PSB') # remove PSB, exclude ISCB Fellows and ISMB speakers in 2020 for now
##
-## ── Column specification ────────────────────────────────────────────────────────
+## ── Column specification ─────────────────────────────────────────
## cols(
## year = col_double(),
## full_name = col_character(),
@@ -4371,9 +4371,10 @@ General data read-in
## )
## # A tibble: 0 x 11
-## # … with 11 variables: year <date>, full_name <chr>, fore_name <chr>,
-## # last_name <chr>, conference <chr>, source <chr>, affiliations <chr>,
-## # afflcountries <chr>, publication_date <date>, fore_name_simple <chr>,
+## # … with 11 variables: year <date>, full_name <chr>,
+## # fore_name <chr>, last_name <chr>, conference <chr>,
+## # source <chr>, affiliations <chr>, afflcountries <chr>,
+## # publication_date <date>, fore_name_simple <chr>,
## # last_name_simple <chr>
large_jours <- articles %>%
count(journal, sort = T) %>%
@@ -4385,7 +4386,7 @@ General data read-in
left_join(all_full_names, by = 'full_name')
## Warning: Missing column names filled in: 'X1' [1]
##
-## ── Column specification ────────────────────────────────────────────────────────
+## ── Column specification ─────────────────────────────────────────
## cols(
## X1 = col_character(),
## African = col_double(),
@@ -4406,8 +4407,8 @@ General data read-in
corr_authors %>%
count(year, name = 'Number of articles with last authors') %>%
DT::datatable(rownames = F)
-
-
+
+
If we set a threshold at least 200 articles a year, we should only consider articles from 1998 on.
corr_authors <- corr_authors %>%
add_count(year, name = 'n_aut_yr') %>%
diff --git a/docs/093.summary-stats.html b/docs/093.summary-stats.html
index 54e91c6..4db9784 100644
--- a/docs/093.summary-stats.html
+++ b/docs/093.summary-stats.html
@@ -4317,8 +4317,8 @@ Honorees
count(fore_name, last_name) %>%
arrange(desc(n)) %>%
DT::datatable()
-
-
+
+
Number of keynote speakers/fellows across years:
keynotes %>%
select(year, conference) %>%
@@ -4364,7 +4364,7 @@ Authors
Gender analysis
##
-## ── Column specification ────────────────────────────────────────────────────────
+## ── Column specification ─────────────────────────────────────────
## cols(
## fore_name_simple = col_character(),
## n_authors = col_double(),
@@ -4412,8 +4412,8 @@ Gender analysis
pull(pred.asi) %>%
mean(na.rm = T)
## [1] "Proceeding with surname-only predictions..."
-## Warning in merge_surnames(voter.file, impute.missing = impute.missing): 1305
-## surnames were not matched.
+## Warning in merge_surnames(voter.file, impute.missing =
+## impute.missing): 1305 surnames were not matched.
## [1] 0.8174599
Honorees that didn’t receive a gender prediction: Chung-I Wu.
In summary, the NA predictions mostly include initials only, hyphenated names and perhaps names with accent marks.
@@ -4492,8 +4492,8 @@ Name origin analysis
## ℹ Unknown levels in `f`: OtherCategories
## ℹ Input `region` is `fct_recode(...)`.
## Warning: Unknown levels in `f`: OtherCategories
-
-
+
+
## # A tibble: 2 x 3
## `!is.na(African)` `is.na(fore_name_simple.x)` n
diff --git a/docs/10.visualize-gender.html b/docs/10.visualize-gender.html
index 1f524c3..19daa4c 100644
--- a/docs/10.visualize-gender.html
+++ b/docs/10.visualize-gender.html
@@ -1682,7 +1682,7 @@ Load data
alpha_threshold <- qnorm(0.975)
gender_df <- read_tsv("data/gender/genderize.tsv")
##
-## ── Column specification ────────────────────────────────────────────────────────
+## ── Column specification ─────────────────────────────────────────
## cols(
## fore_name_simple = col_character(),
## n_authors = col_double(),
@@ -1730,21 +1730,13 @@ Prepare data frames for later analyses
values_to = "probabilities"
) %>%
filter(!is.na(probabilities)) %>%
- group_by(type, year, gender) %>%
- mutate(
- pmc_citations_year = mean(adjusted_citations),
- weight = adjusted_citations / pmc_citations_year,
- weighted_probs = probabilities * weight
- # weight = 1
- )
+ group_by(type, year, gender)
iscb_pubmed_sum <- iscb_pubmed %>%
summarise(
# n = n(),
- mean_prob = mean(weighted_probs),
- # mean_prob = mean(probabilities, na.rm = T),
- # sd_prob = sd(probabilities, na.rm = T),
- se_prob = sqrt(var(probabilities) * sum(weight^2) / (sum(weight)^2)),
+ mean_prob = mean(probabilities, na.rm = T),
+ se_prob = sd(probabilities, na.rm = T),
# n = mean(n),
me_prob = alpha_threshold * se_prob,
.groups = "drop"
@@ -1760,15 +1752,15 @@ Figure 2: ISCB Fellows and keynote speakers appear more evenly split between
# group_by(year, type, gender) %>%
gender_breakdown("main", fct_rev(type))
fig_1
-ggsave("figs/gender_breakdown.png", fig_1, width = 5, height = 2.5)
+
+ggsave("figs/gender_breakdown.png", fig_1, width = 5, height = 2.5, dpi = 600)
ggsave("figs/gender_breakdown.svg", fig_1, width = 5, height = 2.5)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## type prob_female_avg
## <chr> <dbl>
## 1 Keynote speakers/Fellows 0.303
-## 2 Pubmed authors 0.268
+## 2 Pubmed authors 0.277
## Scale for 'x' is already present. Adding another scale for 'x', which will
-## replace the existing scale.
+## Scale for 'x' is already present. Adding another scale for
+## 'x', which will replace the existing scale.
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
-## Warning: Removed 13171 rows containing non-finite values (stat_smooth).
-
+## Warning: Removed 13171 rows containing non-finite values
+## (stat_smooth).
+
iscb_lm <- iscb_pubmed %>%
- filter(gender == "probability_female", !is.na(weighted_probs)) %>%
+ filter(gender == "probability_female", !is.na(probabilities)) %>%
mutate(type = as.factor(type)) %>%
mutate(type = type %>% relevel(ref = "Pubmed authors"))
scaled_iscb <- iscb_lm %>%
filter(year(year) >= 2002)
-# scaled_iscb$s_prob <- scale(scaled_iscb$weighted_probs, scale = F)
+# scaled_iscb$s_prob <- scale(scaled_iscb$probabilities, scale = F)
# scaled_iscb$s_year <- scale(scaled_iscb$year, scale = F)
-main_lm <- glm(type ~ year + weighted_probs,
+main_lm <- glm(type ~ year + probabilities,
data = scaled_iscb, # %>% mutate(year = as.factor(year))
family = "binomial"
)
broom::tidy(main_lm)
## # A tibble: 3 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) -2.06 0.478 -4.30 1.67e- 5
-## 2 year -0.000271 0.0000320 -8.47 2.42e-17
-## 3 weighted_probs 0.155 0.0921 1.69 9.19e- 2
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) -2.06 0.478 -4.30 1.67e- 5
+## 2 year -0.000271 0.0000320 -8.47 2.46e-17
+## 3 probabilities 0.193 0.146 1.33 1.85e- 1
inte_lm <- glm(
- # type ~ scale(year, scale = F) * scale(weighted_probs, scale = F),
+ # type ~ scale(year, scale = F) * scale(probabilities, scale = F),
# type ~ s_year * s_prob,
- type ~ year * weighted_probs,
+ type ~ year * probabilities,
data = scaled_iscb, # %>% mutate(year = as.factor(year))
family = "binomial"
)
broom::tidy(inte_lm)
## # A tibble: 4 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) -1.91 0.523 -3.65 2.59e- 4
-## 2 year -0.000281 0.0000351 -7.99 1.30e-15
-## 3 weighted_probs -0.445 0.901 -0.494 6.21e- 1
-## 4 year:weighted_probs 0.0000402 0.0000593 0.679 4.97e- 1
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) -1.77 0.568 -3.11 1.85e- 3
+## 2 year -0.000291 0.0000383 -7.59 3.22e-14
+## 3 probabilities -0.992 1.29 -0.771 4.41e- 1
+## 4 year:probabilities 0.0000787 0.0000846 0.930 3.52e- 1
anova(main_lm, inte_lm, test = "Chisq")
## Analysis of Deviance Table
##
-## Model 1: type ~ year + weighted_probs
-## Model 2: type ~ year * weighted_probs
+## Model 1: type ~ year + probabilities
+## Model 2: type ~ year * probabilities
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
-## 1 153942 4582.3
-## 2 153941 4581.8 1 0.47034 0.4928
+## 1 153942 4582.8
+## 2 153941 4582.0 1 0.86975 0.351
# mean(scaled_iscb$year)
-# mean(scaled_iscb$weighted_probs)
-The two groups of scientists did not have a significant association with the gender predicted from fore names (P = 0.091898). Interaction terms do not predict type
over and above the main effect of gender probability and year.
The two groups of scientists did not have a significant association with the gender predicted from fore names (P = 0.18469). Interaction terms do not predict type
over and above the main effect of gender probability and year.
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
@@ -1878,55 +1871,73 @@ Hypothesis testing
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
-## [1] stats graphics grDevices utils datasets methods base
+## [1] stats graphics grDevices utils datasets methods
+## [7] base
##
## other attached packages:
-## [1] gdtools_0.2.2 wru_0.1-10 rnaturalearth_0.1.0
-## [4] lubridate_1.7.9.2 caret_6.0-86 lattice_0.20-41
-## [7] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2
-## [10] purrr_0.3.4 readr_1.4.0 tidyr_1.1.2
-## [13] tibble_3.0.4 ggplot2_3.3.2 tidyverse_1.3.0
+## [1] broom_0.7.2 DT_0.16 epitools_0.5-10.1
+## [4] gdtools_0.2.2 wru_0.1-10 rnaturalearth_0.1.0
+## [7] lubridate_1.7.9.2 caret_6.0-86 lattice_0.20-41
+## [10] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2
+## [13] purrr_0.3.4 readr_1.4.0 tidyr_1.1.2
+## [16] tibble_3.0.4 ggplot2_3.3.2 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
-## [1] colorspace_2.0-0 ellipsis_0.3.1 class_7.3-17
-## [4] rprojroot_1.3-2 fs_1.5.0 rstudioapi_0.12
-## [7] farver_2.0.3 remotes_2.2.0 DT_0.16
-## [10] prodlim_2019.11.13 fansi_0.4.1 xml2_1.3.2
-## [13] codetools_0.2-16 splines_4.0.3 knitr_1.30
-## [16] pkgload_1.1.0 jsonlite_1.7.1 pROC_1.16.2
-## [19] broom_0.7.2 dbplyr_2.0.0 rgeos_0.5-5
-## [22] compiler_4.0.3 httr_1.4.2 backports_1.2.0
-## [25] assertthat_0.2.1 Matrix_1.2-18 cli_2.1.0
-## [28] htmltools_0.5.0 prettyunits_1.1.1 tools_4.0.3
-## [31] gtable_0.3.0 glue_1.4.2 rnaturalearthdata_0.1.0
-## [34] reshape2_1.4.4 Rcpp_1.0.5 cellranger_1.1.0
-## [37] vctrs_0.3.4 svglite_1.2.3.2 nlme_3.1-149
-## [40] iterators_1.0.13 crosstalk_1.1.0.1 timeDate_3043.102
-## [43] gower_0.2.2 xfun_0.19 ps_1.4.0
-## [46] testthat_3.0.0 rvest_0.3.6 lifecycle_0.2.0
-## [49] devtools_2.3.2 MASS_7.3-53 scales_1.1.1
-## [52] ipred_0.9-9 hms_0.5.3 RColorBrewer_1.1-2
-## [55] yaml_2.2.1 curl_4.3 memoise_1.1.0
-## [58] rpart_4.1-15 stringi_1.5.3 desc_1.2.0
-## [61] foreach_1.5.1 e1071_1.7-4 pkgbuild_1.1.0
-## [64] lava_1.6.8.1 systemfonts_0.3.2 rlang_0.4.8
-## [67] pkgconfig_2.0.3 evaluate_0.14 sf_0.9-6
-## [70] recipes_0.1.15 htmlwidgets_1.5.2 labeling_0.4.2
-## [73] cowplot_1.1.0 tidyselect_1.1.0 processx_3.4.4
-## [76] plyr_1.8.6 magrittr_1.5 R6_2.5.0
-## [79] generics_0.1.0 DBI_1.1.0 mgcv_1.8-33
-## [82] pillar_1.4.6 haven_2.3.1 withr_2.3.0
-## [85] units_0.6-7 survival_3.2-7 sp_1.4-4
-## [88] nnet_7.3-14 modelr_0.1.8 crayon_1.3.4
-## [91] KernSmooth_2.23-17 utf8_1.1.4 rmarkdown_2.5
-## [94] usethis_1.6.3 grid_4.0.3 readxl_1.3.1
-## [97] data.table_1.13.2 callr_3.5.1 ModelMetrics_1.2.2.2
-## [100] reprex_0.3.0 digest_0.6.27 classInt_0.4-3
-## [103] stats4_4.0.3 munsell_0.5.0 viridisLite_0.3.0
-## [106] sessioninfo_1.1.1
+## [1] colorspace_2.0-0 ellipsis_0.3.1
+## [3] class_7.3-17 rprojroot_1.3-2
+## [5] fs_1.5.0 rstudioapi_0.12
+## [7] farver_2.0.3 remotes_2.2.0
+## [9] prodlim_2019.11.13 fansi_0.4.1
+## [11] xml2_1.3.2 codetools_0.2-16
+## [13] splines_4.0.3 knitr_1.30
+## [15] pkgload_1.1.0 jsonlite_1.7.1
+## [17] pROC_1.16.2 dbplyr_2.0.0
+## [19] rgeos_0.5-5 compiler_4.0.3
+## [21] httr_1.4.2 backports_1.2.0
+## [23] assertthat_0.2.1 Matrix_1.2-18
+## [25] cli_2.1.0 htmltools_0.5.0
+## [27] prettyunits_1.1.1 tools_4.0.3
+## [29] gtable_0.3.0 glue_1.4.2
+## [31] rnaturalearthdata_0.1.0 reshape2_1.4.4
+## [33] Rcpp_1.0.5 cellranger_1.1.0
+## [35] vctrs_0.3.4 svglite_1.2.3.2
+## [37] nlme_3.1-149 iterators_1.0.13
+## [39] crosstalk_1.1.0.1 timeDate_3043.102
+## [41] gower_0.2.2 xfun_0.19
+## [43] ps_1.4.0 testthat_3.0.0
+## [45] rvest_0.3.6 lifecycle_0.2.0
+## [47] devtools_2.3.2 MASS_7.3-53
+## [49] scales_1.1.1 ipred_0.9-9
+## [51] hms_0.5.3 RColorBrewer_1.1-2
+## [53] yaml_2.2.1 curl_4.3
+## [55] memoise_1.1.0 rpart_4.1-15
+## [57] stringi_1.5.3 desc_1.2.0
+## [59] foreach_1.5.1 e1071_1.7-4
+## [61] pkgbuild_1.1.0 lava_1.6.8.1
+## [63] systemfonts_0.3.2 rlang_0.4.8
+## [65] pkgconfig_2.0.3 evaluate_0.14
+## [67] sf_0.9-6 recipes_0.1.15
+## [69] htmlwidgets_1.5.2 labeling_0.4.2
+## [71] cowplot_1.1.0 tidyselect_1.1.0
+## [73] processx_3.4.4 plyr_1.8.6
+## [75] magrittr_1.5 R6_2.5.0
+## [77] generics_0.1.0 DBI_1.1.0
+## [79] mgcv_1.8-33 pillar_1.4.6
+## [81] haven_2.3.1 withr_2.3.0
+## [83] units_0.6-7 survival_3.2-7
+## [85] sp_1.4-4 nnet_7.3-14
+## [87] modelr_0.1.8 crayon_1.3.4
+## [89] KernSmooth_2.23-17 utf8_1.1.4
+## [91] rmarkdown_2.5 usethis_1.6.3
+## [93] grid_4.0.3 readxl_1.3.1
+## [95] data.table_1.13.2 callr_3.5.1
+## [97] ModelMetrics_1.2.2.2 reprex_0.3.0
+## [99] digest_0.6.27 classInt_0.4-3
+## [101] stats4_4.0.3 munsell_0.5.0
+## [103] viridisLite_0.3.0 sessioninfo_1.1.1
##
-## ── Column specification ────────────────────────────────────────────────────────
+## ── Column specification ─────────────────────────────────────────
## cols(
## Country = col_character(),
## Region = col_character()
@@ -1747,17 +1747,12 @@ Descriptive statistics
values_to = "probabilities"
) %>%
filter(!is.na(probabilities)) %>%
- group_by(type, year, region) %>%
- mutate(
- pmc_citations_year = mean(adjusted_citations),
- weight = adjusted_citations / pmc_citations_year,
- weighted_probs = probabilities * weight
- )
+ group_by(type, year, region)
iscb_pubmed_sum_oth <- iscb_pubmed_oth %>%
summarise(
- mean_prob = mean(weighted_probs),
- se_prob = sqrt(var(probabilities) * sum(weight^2) / (sum(weight)^2)),
+ mean_prob = mean(probabilities),
+ se_prob = sd(probabilities)/sqrt(n()),
me_prob = alpha_threshold * se_prob,
.groups = "drop"
)
@@ -1777,7 +1772,7 @@ By conference keynotes/fellows
iscb_nat[[i]] <- iscb_pubmed_oth %>%
filter(region != "OtherCategories", type != "Pubmed authors" & journal == conf) %>%
group_by(type, year, region, journal) %>%
- summarise(mean_prob = mean(weighted_probs), .groups = "drop")
+ summarise(mean_prob = mean(probabilities), .groups = "drop")
}
save(my_world, iscb_pubmed_oth, iscb_nat, file = "Rdata/iscb-pubmed_nat.Rdata")
@@ -1815,8 +1810,8 @@ ## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
fig_4
-
-ggsave("figs/region_breakdown.png", fig_4, width = 6.7, height = 5.5)
+
+ggsave("figs/region_breakdown.png", fig_4, width = 6.7, height = 5.5, dpi = 600)
ggsave("figs/region_breakdown.svg", fig_4, width = 6.7, height = 5.5)
@@ -1830,7 +1825,7 @@ Hypothesis testing
type = as.factor(type) %>% relevel(ref = "Pubmed authors")
)
main_lm <- function(regioni) {
- glm(type ~ year + weighted_probs,
+ glm(type ~ year + probabilities,
data = iscb_lm %>%
filter(region == regioni, !is.na(probabilities), year(year) >= 2002),
family = "binomial"
@@ -1838,122 +1833,120 @@ Hypothesis testing
}
inte_lm <- function(regioni) {
- glm(type ~ year * weighted_probs,
+ glm(type ~ year * probabilities,
data = iscb_lm %>%
- filter(region == regioni, !is.na(weighted_probs), year(year) >= 2002),
+ filter(region == regioni, !is.na(probabilities), year(year) >= 2002),
family = "binomial"
)
}
-main_list <- lapply(large_regions, main_lm)
-## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
-names(main_list) <- large_regions
+main_list <- lapply(large_regions, main_lm)
+names(main_list) <- large_regions
lapply(main_list, broom::tidy)
## $CelticEnglish
## # A tibble: 3 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) -2.17 0.481 -4.52 6.06e- 6
-## 2 year -0.000268 0.0000320 -8.37 5.64e-17
-## 3 weighted_probs 0.194 0.0561 3.46 5.46e- 4
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) -2.62 0.489 -5.36 8.47e- 8
+## 2 year -0.000250 0.0000321 -7.78 7.40e-15
+## 3 probabilities 0.869 0.139 6.26 3.97e-10
##
## $EastAsian
## # A tibble: 3 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) -2.30 0.480 -4.80 1.58e- 6
-## 2 year -0.000242 0.0000321 -7.54 4.82e-14
-## 3 weighted_probs -1.67 0.286 -5.82 6.02e- 9
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) -2.29 0.479 -4.77 1.81e- 6
+## 2 year -0.000241 0.0000320 -7.51 6.02e-14
+## 3 probabilities -1.75 0.250 -7.00 2.51e-12
##
## $European
## # A tibble: 3 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) -2.08 0.480 -4.32 1.53e- 5
-## 2 year -0.000272 0.0000319 -8.53 1.46e-17
-## 3 weighted_probs 0.0713 0.0822 0.867 3.86e- 1
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) -2.15 0.484 -4.45 8.72e- 6
+## 2 year -0.000271 0.0000320 -8.46 2.62e-17
+## 3 probabilities 0.222 0.137 1.62 1.05e- 1
##
## $OtherCategories
## # A tibble: 3 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) -2.06 0.479 -4.30 1.73e- 5
-## 2 year -0.000273 0.0000319 -8.57 1.03e-17
-## 3 weighted_probs 0.0724 0.0948 0.763 4.45e- 1
-inte_list <- lapply(large_regions, inte_lm)
-## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
-lapply(inte_list, broom::tidy)
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) -2.07 0.479 -4.33 1.52e- 5
+## 2 year -0.000274 0.0000319 -8.58 9.22e-18
+## 3 probabilities 0.159 0.151 1.05 2.95e- 1
+inte_list <- lapply(large_regions, inte_lm)
+lapply(inte_list, broom::tidy)
## [[1]]
## # A tibble: 4 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) -2.11 0.507 -4.17 3.06e- 5
-## 2 year -0.000272 0.0000338 -8.05 8.48e-16
-## 3 weighted_probs 0.00264 0.525 0.00502 9.96e- 1
-## 4 year:weighted_probs 0.0000131 0.0000353 0.370 7.11e- 1
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) -2.48 0.627 -3.96 7.57e- 5
+## 2 year -0.000259 0.0000414 -6.26 3.96e-10
+## 3 probabilities 0.451 1.19 0.380 7.04e- 1
+## 4 year:probabilities 0.0000283 0.0000796 0.355 7.23e- 1
##
## [[2]]
## # A tibble: 4 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) -2.53 0.506 -5.01 5.54e- 7
-## 2 year -0.000227 0.0000338 -6.71 1.94e-11
-## 3 weighted_probs 1.96 2.45 0.800 4.24e- 1
-## 4 year:weighted_probs -0.000239 0.000164 -1.46 1.44e- 1
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) -2.45 0.500 -4.90 9.36e- 7
+## 2 year -0.000229 0.0000334 -6.87 6.37e-12
+## 3 probabilities 0.853 2.19 0.389 6.97e- 1
+## 4 year:probabilities -0.000172 0.000146 -1.18 2.38e- 1
##
## [[3]]
## # A tibble: 4 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) -1.87 0.538 -3.47 5.20e- 4
-## 2 year -0.000286 0.0000358 -8.01 1.19e-15
-## 3 weighted_probs -0.587 0.779 -0.753 4.51e- 1
-## 4 year:weighted_probs 0.0000441 0.0000511 0.862 3.89e- 1
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) -1.66 0.614 -2.71 6.75e- 3
+## 2 year -0.000303 0.0000410 -7.39 1.44e-13
+## 3 probabilities -1.28 1.20 -1.07 2.85e- 1
+## 4 year:probabilities 0.000101 0.0000796 1.27 2.05e- 1
##
## [[4]]
## # A tibble: 4 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) -1.80 0.527 -3.42 6.25e- 4
-## 2 year -0.000290 0.0000351 -8.29 1.17e-16
-## 3 weighted_probs -0.880 0.849 -1.04 3.00e- 1
-## 4 year:weighted_probs 0.0000628 0.0000544 1.15 2.48e- 1
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) -1.55 0.598 -2.60 9.42e- 3
+## 2 year -0.000309 0.0000401 -7.70 1.31e-14
+## 3 probabilities -1.76 1.35 -1.30 1.92e- 1
+## 4 year:probabilities 0.000127 0.0000882 1.44 1.50e- 1
for (i in 1:4) {
print(anova(main_list[[i]], inte_list[[i]], test = "Chisq"))
}
## Analysis of Deviance Table
##
-## Model 1: type ~ year + weighted_probs
-## Model 2: type ~ year * weighted_probs
+## Model 1: type ~ year + probabilities
+## Model 2: type ~ year * probabilities
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
-## 1 163886 4627.2
-## 2 163885 4627.1 1 0.14187 0.7064
+## 1 163886 4599.0
+## 2 163885 4598.9 1 0.12594 0.7227
## Analysis of Deviance Table
##
-## Model 1: type ~ year + weighted_probs
-## Model 2: type ~ year * weighted_probs
+## Model 1: type ~ year + probabilities
+## Model 2: type ~ year * probabilities
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
-## 1 163886 4576.6
-## 2 163885 4574.6 1 2.0829 0.149
+## 1 163886 4554.1
+## 2 163885 4552.7 1 1.3867 0.239
## Analysis of Deviance Table
##
-## Model 1: type ~ year + weighted_probs
-## Model 2: type ~ year * weighted_probs
+## Model 1: type ~ year + probabilities
+## Model 2: type ~ year * probabilities
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
-## 1 163886 4634.1
-## 2 163885 4633.4 1 0.74565 0.3879
+## 1 163886 4632.2
+## 2 163885 4630.6 1 1.607 0.2049
## Analysis of Deviance Table
##
-## Model 1: type ~ year + weighted_probs
-## Model 2: type ~ year * weighted_probs
+## Model 1: type ~ year + probabilities
+## Model 2: type ~ year * probabilities
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
-## 1 163886 4634.3
-## 2 163885 4633.1 1 1.2103 0.2713
+## 1 163886 4633.7
+## 2 163885 4631.6 1 2.0781 0.1494
Interaction terms do not predict type
over and above the main effect of name origin probability and year (p > 0.01).
A Celtic/English name has 1.2141832 the odds of being selected as an honoree, significantly higher compared to other names (\(\beta_\textrm{Celtic/English} =\) 0.19407, P = 0.00054646). An East Asian name has 0.1889996 the odds of being selected as an honoree, significantly lower than to other names (\(\beta_\textrm{East Asian} =\) -1.666, P = 6.0164e-09). The two groups of scientists did not have a significant association with names predicted to be European (P = 0.38583) or in Other categories (P = 0.44544).
+A Celtic/English name has 2.3850497 the odds of being selected as an honoree, significantly higher compared to other names (\(\beta_\textrm{Celtic/English} =\) 0.86922, P = 3.9713e-10). An East Asian name has 0.1731846 the odds of being selected as an honoree, significantly lower than to other names (\(\beta_\textrm{East Asian} =\) -1.7534, P = 2.5132e-12). The two groups of scientists did not have a significant association with names predicted to be European (P = 0.10527) or in Other categories (P = 0.29469).
## $CelticEnglish
## # A tibble: 3 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) 29.1 1.18 24.7 2.46e-134
-## 2 year -0.00210 0.0000749 -28.0 2.09e-172
-## 3 weighted_probs 0.0199 0.102 0.196 8.45e- 1
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) 29.0 1.19 24.5 4.51e-132
+## 2 year -0.00209 0.0000750 -27.9 5.63e-171
+## 3 probabilities 0.130 0.170 0.767 4.43e- 1
##
## $EastAsian
## # A tibble: 3 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) 29.0 1.18 24.6 5.89e-134
-## 2 year -0.00209 0.0000749 -27.8 1.41e-170
-## 3 weighted_probs -0.708 0.292 -2.43 1.52e- 2
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) 28.9 1.18 24.6 3.99e-133
+## 2 year -0.00208 0.0000749 -27.7 2.99e-169
+## 3 probabilities -1.01 0.280 -3.60 3.20e- 4
##
## $European
## # A tibble: 3 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) 29.1 1.18 24.7 4.84e-135
-## 2 year -0.00210 0.0000748 -28.0 7.61e-173
-## 3 weighted_probs 0.00832 0.106 0.0788 9.37e- 1
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) 29.1 1.18 24.7 1.06e-134
+## 2 year -0.00210 0.0000748 -28.0 8.91e-173
+## 3 probabilities 0.0595 0.166 0.358 7.20e- 1
##
## $OtherCategories
## # A tibble: 3 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) 29.1 1.18 24.7 3.41e-135
-## 2 year -0.00210 0.0000748 -28.1 3.25e-173
-## 3 weighted_probs 0.170 0.106 1.60 1.09e- 1
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) 29.1 1.18 24.8 2.82e-135
+## 2 year -0.00210 0.0000748 -28.1 7.58e-174
+## 3 probabilities 0.432 0.183 2.36 1.80e- 2
Adapted from epitools::riskratio()
.
ggsave('figs/enrichment-plot.png', enrichment_plot, width = 5.5, height = 3.5)
+ggsave('figs/enrichment-plot.png', enrichment_plot, width = 5.5, height = 3.5, dpi = 600)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
@@ -4452,56 +4452,74 @@ Log enrichment figure
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
-## [1] stats graphics grDevices utils datasets methods base
+## [1] stats graphics grDevices utils datasets methods
+## [7] base
##
## other attached packages:
-## [1] DT_0.16 epitools_0.5-10.1 gdtools_0.2.2
-## [4] wru_0.1-10 rnaturalearth_0.1.0 lubridate_1.7.9.2
-## [7] caret_6.0-86 lattice_0.20-41 forcats_0.5.0
-## [10] stringr_1.4.0 dplyr_1.0.2 purrr_0.3.4
-## [13] readr_1.4.0 tidyr_1.1.2 tibble_3.0.4
-## [16] ggplot2_3.3.2 tidyverse_1.3.0
+## [1] broom_0.7.2 DT_0.16 epitools_0.5-10.1
+## [4] gdtools_0.2.2 wru_0.1-10 rnaturalearth_0.1.0
+## [7] lubridate_1.7.9.2 caret_6.0-86 lattice_0.20-41
+## [10] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2
+## [13] purrr_0.3.4 readr_1.4.0 tidyr_1.1.2
+## [16] tibble_3.0.4 ggplot2_3.3.2 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
-## [1] colorspace_2.0-0 ellipsis_0.3.1 class_7.3-17
-## [4] rprojroot_1.3-2 fs_1.5.0 rstudioapi_0.12
-## [7] farver_2.0.3 remotes_2.2.0 prodlim_2019.11.13
-## [10] fansi_0.4.1 xml2_1.3.2 codetools_0.2-16
-## [13] splines_4.0.3 knitr_1.30 pkgload_1.1.0
-## [16] jsonlite_1.7.1 pROC_1.16.2 broom_0.7.2
-## [19] dbplyr_2.0.0 rgeos_0.5-5 compiler_4.0.3
-## [22] httr_1.4.2 backports_1.2.0 assertthat_0.2.1
-## [25] Matrix_1.2-18 cli_2.1.0 htmltools_0.5.0
-## [28] prettyunits_1.1.1 tools_4.0.3 gtable_0.3.0
-## [31] glue_1.4.2 rnaturalearthdata_0.1.0 reshape2_1.4.4
-## [34] Rcpp_1.0.5 cellranger_1.1.0 vctrs_0.3.4
-## [37] svglite_1.2.3.2 nlme_3.1-149 iterators_1.0.13
-## [40] crosstalk_1.1.0.1 timeDate_3043.102 gower_0.2.2
-## [43] xfun_0.19 ps_1.4.0 testthat_3.0.0
-## [46] rvest_0.3.6 lifecycle_0.2.0 devtools_2.3.2
-## [49] MASS_7.3-53 scales_1.1.1 ipred_0.9-9
-## [52] hms_0.5.3 RColorBrewer_1.1-2 yaml_2.2.1
-## [55] curl_4.3 memoise_1.1.0 rpart_4.1-15
-## [58] stringi_1.5.3 desc_1.2.0 foreach_1.5.1
-## [61] e1071_1.7-4 pkgbuild_1.1.0 lava_1.6.8.1
-## [64] systemfonts_0.3.2 rlang_0.4.8 pkgconfig_2.0.3
-## [67] evaluate_0.14 sf_0.9-6 recipes_0.1.15
-## [70] htmlwidgets_1.5.2 labeling_0.4.2 cowplot_1.1.0
-## [73] tidyselect_1.1.0 processx_3.4.4 plyr_1.8.6
-## [76] magrittr_1.5 R6_2.5.0 generics_0.1.0
-## [79] DBI_1.1.0 mgcv_1.8-33 pillar_1.4.6
-## [82] haven_2.3.1 withr_2.3.0 units_0.6-7
-## [85] survival_3.2-7 sp_1.4-4 nnet_7.3-14
-## [88] modelr_0.1.8 crayon_1.3.4 KernSmooth_2.23-17
-## [91] utf8_1.1.4 rmarkdown_2.5 usethis_1.6.3
-## [94] grid_4.0.3 readxl_1.3.1 data.table_1.13.2
-## [97] callr_3.5.1 ModelMetrics_1.2.2.2 reprex_0.3.0
-## [100] digest_0.6.27 classInt_0.4-3 stats4_4.0.3
-## [103] munsell_0.5.0 viridisLite_0.3.0 sessioninfo_1.1.1
+## [1] colorspace_2.0-0 ellipsis_0.3.1
+## [3] class_7.3-17 rprojroot_1.3-2
+## [5] fs_1.5.0 rstudioapi_0.12
+## [7] farver_2.0.3 remotes_2.2.0
+## [9] prodlim_2019.11.13 fansi_0.4.1
+## [11] xml2_1.3.2 codetools_0.2-16
+## [13] splines_4.0.3 knitr_1.30
+## [15] pkgload_1.1.0 jsonlite_1.7.1
+## [17] pROC_1.16.2 dbplyr_2.0.0
+## [19] rgeos_0.5-5 compiler_4.0.3
+## [21] httr_1.4.2 backports_1.2.0
+## [23] assertthat_0.2.1 Matrix_1.2-18
+## [25] cli_2.1.0 htmltools_0.5.0
+## [27] prettyunits_1.1.1 tools_4.0.3
+## [29] gtable_0.3.0 glue_1.4.2
+## [31] rnaturalearthdata_0.1.0 reshape2_1.4.4
+## [33] Rcpp_1.0.5 cellranger_1.1.0
+## [35] vctrs_0.3.4 svglite_1.2.3.2
+## [37] nlme_3.1-149 iterators_1.0.13
+## [39] crosstalk_1.1.0.1 timeDate_3043.102
+## [41] gower_0.2.2 xfun_0.19
+## [43] ps_1.4.0 testthat_3.0.0
+## [45] rvest_0.3.6 lifecycle_0.2.0
+## [47] devtools_2.3.2 MASS_7.3-53
+## [49] scales_1.1.1 ipred_0.9-9
+## [51] hms_0.5.3 RColorBrewer_1.1-2
+## [53] yaml_2.2.1 curl_4.3
+## [55] memoise_1.1.0 rpart_4.1-15
+## [57] stringi_1.5.3 desc_1.2.0
+## [59] foreach_1.5.1 e1071_1.7-4
+## [61] pkgbuild_1.1.0 lava_1.6.8.1
+## [63] systemfonts_0.3.2 rlang_0.4.8
+## [65] pkgconfig_2.0.3 evaluate_0.14
+## [67] sf_0.9-6 recipes_0.1.15
+## [69] htmlwidgets_1.5.2 labeling_0.4.2
+## [71] cowplot_1.1.0 tidyselect_1.1.0
+## [73] processx_3.4.4 plyr_1.8.6
+## [75] magrittr_1.5 R6_2.5.0
+## [77] generics_0.1.0 DBI_1.1.0
+## [79] mgcv_1.8-33 pillar_1.4.6
+## [81] haven_2.3.1 withr_2.3.0
+## [83] units_0.6-7 survival_3.2-7
+## [85] sp_1.4-4 nnet_7.3-14
+## [87] modelr_0.1.8 crayon_1.3.4
+## [89] KernSmooth_2.23-17 utf8_1.1.4
+## [91] rmarkdown_2.5 usethis_1.6.3
+## [93] grid_4.0.3 readxl_1.3.1
+## [95] data.table_1.13.2 callr_3.5.1
+## [97] ModelMetrics_1.2.2.2 reprex_0.3.0
+## [99] digest_0.6.27 classInt_0.4-3
+## [101] stats4_4.0.3 munsell_0.5.0
+## [103] viridisLite_0.3.0 sessioninfo_1.1.1
## [1] "Proceeding with surname-only predictions..."
-## Warning in merge_surnames(voter.file, impute.missing = impute.missing): 5166
-## surnames were not matched.
+## Warning in merge_surnames(voter.file, impute.missing =
+## impute.missing): 5166 surnames were not matched.
pubmed_us_race <- pubmed_race_pmids %>%
group_by(pmid, journal, publication_date, year, adjusted_citations) %>%
summarise_at(vars(contains('pred.')), mean, na.rm = T, .groups = 'drop') %>%
@@ -1685,8 +1685,8 @@ Race/ethnicity predictions
rename('surname' = last_name_simple) %>%
predict_race(surname.only = T, impute.missing = F)
## [1] "Proceeding with surname-only predictions..."
-## Warning in merge_surnames(voter.file, impute.missing = impute.missing): 100
-## surnames were not matched.
+## Warning in merge_surnames(voter.file, impute.missing =
+## impute.missing): 100 surnames were not matched.
my_jours <- unique(pubmed_us_race$journal)
my_confs <- unique(iscb_us_race$conference)
n_jours <- length(my_jours)
@@ -1795,10 +1795,14 @@ Hypothesis testing
## -0.8113 -0.5414 -0.0942 0.2651 16.6148
##
## Coefficients:
-## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) 0.612841 0.004475 136.937 <2e-16 ***
-## year -0.041522 0.004554 -9.118 <2e-16 ***
-## typeKeynote speakers/Fellows 0.083082 0.038991 2.131 0.0331 *
+## Estimate Std. Error t value
+## (Intercept) 0.612841 0.004475 136.937
+## year -0.041522 0.004554 -9.118
+## typeKeynote speakers/Fellows 0.083082 0.038991 2.131
+## Pr(>|t|)
+## (Intercept) <2e-16 ***
+## year <2e-16 ***
+## typeKeynote speakers/Fellows 0.0331 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
@@ -1820,10 +1824,14 @@ Hypothesis testing
## -0.2870 -0.2596 -0.2303 0.1112 6.2454
##
## Coefficients:
-## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) 0.247463 0.003178 77.861 < 2e-16 ***
-## year 0.043328 0.003234 13.397 < 2e-16 ***
-## typeKeynote speakers/Fellows -0.099391 0.027691 -3.589 0.000332 ***
+## Estimate Std. Error t value
+## (Intercept) 0.247463 0.003178 77.861
+## year 0.043328 0.003234 13.397
+## typeKeynote speakers/Fellows -0.099391 0.027691 -3.589
+## Pr(>|t|)
+## (Intercept) < 2e-16 ***
+## year < 2e-16 ***
+## typeKeynote speakers/Fellows 0.000332 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
@@ -1845,10 +1853,14 @@ Hypothesis testing
## -0.1573 -0.1143 -0.0837 0.0159 9.6671
##
## Coefficients:
-## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) 0.139696 0.001617 86.395 <2e-16 ***
-## year -0.001807 0.001645 -1.098 0.272
-## typeKeynote speakers/Fellows 0.016309 0.014088 1.158 0.247
+## Estimate Std. Error t value
+## (Intercept) 0.139696 0.001617 86.395
+## year -0.001807 0.001645 -1.098
+## typeKeynote speakers/Fellows 0.016309 0.014088 1.158
+## Pr(>|t|)
+## (Intercept) <2e-16 ***
+## year 0.272
+## typeKeynote speakers/Fellows 0.247
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
diff --git a/docs/14.us-name-origin.html b/docs/14.us-name-origin.html
index c71358d..c5d0a57 100644
--- a/docs/14.us-name-origin.html
+++ b/docs/14.us-name-origin.html
@@ -1720,17 +1720,12 @@ Organize data
values_to = "probabilities"
) %>%
filter(!is.na(probabilities)) %>%
- group_by(type, year, region) %>%
- mutate(
- pmc_citations_year = mean(adjusted_citations),
- weight = adjusted_citations / pmc_citations_year,
- weighted_probs = probabilities * weight
- )
+ group_by(type, year, region)
iscb_pubmed_sum_oth <- iscb_pubmed_oth %>%
summarise(
- mean_prob = mean(weighted_probs),
- se_prob = sqrt(var(probabilities) * sum(weight^2) / (sum(weight)^2)),
+ mean_prob = mean(probabilities),
+ se_prob = sd(probabilities)/sqrt(n()),
me_prob = alpha_threshold * se_prob,
.groups = "drop"
)
@@ -1768,8 +1763,8 @@ Figures for paper
fig_us_name_origin <- cowplot::plot_grid(fig_us_name_origina, fig_us_name_originb, labels = "AUTO", ncol = 1, rel_heights = c(1.3, 1))
## `geom_smooth()` using formula 'y ~ s(x, bs = "cs")'
fig_us_name_origin
-
-ggsave("figs/us_name_origin.png", fig_us_name_origin, width = 6.5, height = 5.5)
+
+ggsave("figs/us_name_origin.png", fig_us_name_origin, width = 6.5, height = 5.5, dpi = 600)
ggsave("figs/us_name_origin.svg", fig_us_name_origin, width = 6.5, height = 5.5)
@@ -1782,17 +1777,17 @@ Hypothesis testing
type = relevel(as.factor(type), ref = "Pubmed authors")
)
main_lm <- function(regioni) {
- glm(type ~ year + weighted_probs,
+ glm(type ~ year + probabilities,
data = iscb_lm %>%
- filter(region == regioni, !is.na(weighted_probs), year(year) >= 2002),
+ filter(region == regioni, !is.na(probabilities), year(year) >= 2002),
family = "binomial"
)
}
inte_lm <- function(regioni) {
- glm(type ~ weighted_probs * year,
+ glm(type ~ probabilities * year,
data = iscb_lm %>%
- filter(region == regioni, !is.na(weighted_probs), year(year) >= 2002),
+ filter(region == regioni, !is.na(probabilities), year(year) >= 2002),
family = "binomial"
)
}
@@ -1803,108 +1798,108 @@ Hypothesis testing
lapply(main_list, broom::tidy)
## $CelticEnglish
## # A tibble: 3 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) 4.91 0.523 9.39 6.14e-21
-## 2 year -0.000616 0.0000346 -17.8 6.39e-71
-## 3 weighted_probs 0.0434 0.0837 0.519 6.04e- 1
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) 4.76 0.535 8.89 5.86e-19
+## 2 year -0.000611 0.0000348 -17.6 3.87e-69
+## 3 probabilities 0.269 0.185 1.46 1.46e- 1
##
## $EastAsian
## # A tibble: 3 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) 4.75 0.520 9.13 6.68e-20
-## 2 year -0.000595 0.0000347 -17.2 5.24e-66
-## 3 weighted_probs -1.77 0.501 -3.52 4.28e- 4
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) 4.73 0.519 9.12 7.71e-20
+## 2 year -0.000592 0.0000346 -17.1 9.97e-66
+## 3 probabilities -1.89 0.455 -4.14 3.43e- 5
##
## $European
## # A tibble: 3 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) 4.88 0.521 9.36 7.82e-21
-## 2 year -0.000616 0.0000345 -17.8 3.48e-71
-## 3 weighted_probs 0.173 0.109 1.58 1.14e- 1
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) 4.78 0.524 9.12 7.75e-20
+## 2 year -0.000614 0.0000345 -17.8 6.78e-71
+## 3 probabilities 0.446 0.194 2.30 2.16e- 2
##
## $OtherCategories
## # A tibble: 3 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) 4.94 0.519 9.52 1.79e-21
-## 2 year -0.000618 0.0000345 -17.9 1.17e-71
-## 3 weighted_probs 0.0518 0.136 0.381 7.03e- 1
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) 4.93 0.520 9.48 2.58e-21
+## 2 year -0.000618 0.0000345 -17.9 1.18e-71
+## 3 probabilities 0.0957 0.212 0.451 6.52e- 1
inte_list <- lapply(large_regions, inte_lm)
lapply(inte_list, broom::tidy)
## [[1]]
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) 5.01 0.563 8.89 5.91e-19
-## 2 weighted_probs -0.228 0.591 -0.386 7.00e- 1
-## 3 year -0.000623 0.0000377 -16.5 2.45e-61
-## 4 weighted_probs:year 0.0000201 0.0000425 0.473 6.36e- 1
+## 1 (Intercept) 4.81 0.730 6.60 4.16e-11
+## 2 probabilities 0.128 1.32 0.0969 9.23e- 1
+## 3 year -0.000615 0.0000479 -12.8 9.81e-38
+## 4 probabilities:year 0.00000953 0.0000883 0.108 9.14e- 1
##
## [[2]]
## # A tibble: 4 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) 4.70 0.540 8.72 2.89e-18
-## 2 weighted_probs -0.524 4.08 -0.128 8.98e- 1
-## 3 year -0.000592 0.0000359 -16.5 6.46e-61
-## 4 weighted_probs:year -0.0000794 0.000261 -0.304 7.61e- 1
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) 4.71 0.536 8.78 1.57e-18
+## 2 probabilities -1.29 3.81 -0.339 7.34e- 1
+## 3 year -0.000591 0.0000357 -16.5 1.68e-61
+## 4 probabilities:year -0.0000380 0.000243 -0.157 8.76e- 1
##
## [[3]]
## # A tibble: 4 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) 5.07 0.573 8.85 9.11e-19
-## 2 weighted_probs -0.462 0.821 -0.563 5.73e- 1
-## 3 year -0.000629 0.0000382 -16.5 7.11e-61
-## 4 weighted_probs:year 0.0000437 0.0000549 0.796 4.26e- 1
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) 5.17 0.674 7.67 1.77e-14
+## 2 probabilities -0.809 1.40 -0.577 5.64e- 1
+## 3 year -0.000640 0.0000449 -14.3 3.42e-46
+## 4 probabilities:year 0.0000840 0.0000926 0.906 3.65e- 1
##
## [[4]]
## # A tibble: 4 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) 5.04 0.587 8.58 9.33e-18
-## 2 weighted_probs -0.320 1.04 -0.308 7.58e- 1
-## 3 year -0.000625 0.0000393 -15.9 6.63e-57
-## 4 weighted_probs:year 0.0000251 0.0000690 0.364 7.16e- 1
+## term estimate std.error statistic p.value
+## <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 (Intercept) 5.22 0.670 7.79 6.76e-15
+## 2 probabilities -0.964 1.59 -0.605 5.45e- 1
+## 3 year -0.000637 0.0000447 -14.2 4.57e-46
+## 4 probabilities:year 0.0000701 0.000104 0.673 5.01e- 1
for (i in 1:4) {
print(anova(main_list[[i]], inte_list[[i]], test = "Chisq"))
}
## Analysis of Deviance Table
##
-## Model 1: type ~ year + weighted_probs
-## Model 2: type ~ weighted_probs * year
+## Model 1: type ~ year + probabilities
+## Model 2: type ~ probabilities * year
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
-## 1 26398 2066.7
-## 2 26397 2066.5 1 0.2267 0.634
+## 1 26398 2064.9
+## 2 26397 2064.9 1 0.011635 0.9141
## Analysis of Deviance Table
##
-## Model 1: type ~ year + weighted_probs
-## Model 2: type ~ weighted_probs * year
+## Model 1: type ~ year + probabilities
+## Model 2: type ~ probabilities * year
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
-## 1 26398 2043.8
-## 2 26397 2043.7 1 0.089951 0.7642
+## 1 26398 2036.7
+## 2 26397 2036.7 1 0.024177 0.8764
## Analysis of Deviance Table
##
-## Model 1: type ~ year + weighted_probs
-## Model 2: type ~ weighted_probs * year
+## Model 1: type ~ year + probabilities
+## Model 2: type ~ probabilities * year
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
-## 1 26398 2064.9
-## 2 26397 2064.2 1 0.63868 0.4242
+## 1 26398 2061.9
+## 2 26397 2061.1 1 0.82361 0.3641
## Analysis of Deviance Table
##
-## Model 1: type ~ year + weighted_probs
-## Model 2: type ~ weighted_probs * year
+## Model 1: type ~ year + probabilities
+## Model 2: type ~ probabilities * year
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 26398 2066.8
-## 2 26397 2066.7 1 0.13195 0.7164
+## 2 26397 2066.3 1 0.45537 0.4998
Interaction terms do not predict type
over and above the main effect of name origin probability and year (p > 0.01).
Conclusion
-An East Asian name has 0.1709525 the odds of being selected as an honoree, significantly lower compared to other names (\(\beta_\textrm{East Asian} =\) -1.7664, P = 0.00042758). The two groups of scientists did not have a significant association with names predicted to be Celtic/English (P = 0.60355), European (P = 0.11373), or in Other categories (P = 0.70348).
+An East Asian name has 0.1516018 the odds of being selected as an honoree, significantly lower compared to other names (\(\beta_\textrm{East Asian} =\) -1.8865, P = 3.4282e-05). The two groups of scientists did not have a significant association with names predicted to be Celtic/English (P = 0.14566), European (P = 0.021596), or in Other categories (P = 0.65199).
Supplement
@@ -1948,7 +1943,8 @@ Supplementary Figure S7
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
-## [1] stats graphics grDevices utils datasets methods base
+## [1] stats graphics grDevices utils datasets methods
+## [7] base
##
## other attached packages:
## [1] broom_0.7.2 DT_0.16 epitools_0.5-10.1
@@ -1959,45 +1955,62 @@ Supplementary Figure S7
## [16] tibble_3.0.4 ggplot2_3.3.2 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
-## [1] colorspace_2.0-0 ellipsis_0.3.1 class_7.3-17
-## [4] rprojroot_1.3-2 fs_1.5.0 rstudioapi_0.12
-## [7] farver_2.0.3 remotes_2.2.0 prodlim_2019.11.13
-## [10] fansi_0.4.1 xml2_1.3.2 codetools_0.2-16
-## [13] splines_4.0.3 knitr_1.30 pkgload_1.1.0
-## [16] jsonlite_1.7.1 pROC_1.16.2 dbplyr_2.0.0
-## [19] rgeos_0.5-5 compiler_4.0.3 httr_1.4.2
-## [22] backports_1.2.0 assertthat_0.2.1 Matrix_1.2-18
-## [25] cli_2.1.0 htmltools_0.5.0 prettyunits_1.1.1
-## [28] tools_4.0.3 gtable_0.3.0 glue_1.4.2
-## [31] rnaturalearthdata_0.1.0 reshape2_1.4.4 Rcpp_1.0.5
-## [34] cellranger_1.1.0 vctrs_0.3.4 svglite_1.2.3.2
-## [37] nlme_3.1-149 iterators_1.0.13 crosstalk_1.1.0.1
-## [40] timeDate_3043.102 gower_0.2.2 xfun_0.19
-## [43] ps_1.4.0 testthat_3.0.0 rvest_0.3.6
-## [46] lifecycle_0.2.0 devtools_2.3.2 MASS_7.3-53
-## [49] scales_1.1.1 ipred_0.9-9 hms_0.5.3
-## [52] RColorBrewer_1.1-2 yaml_2.2.1 curl_4.3
-## [55] memoise_1.1.0 rpart_4.1-15 stringi_1.5.3
-## [58] desc_1.2.0 foreach_1.5.1 e1071_1.7-4
-## [61] pkgbuild_1.1.0 lava_1.6.8.1 systemfonts_0.3.2
-## [64] rlang_0.4.8 pkgconfig_2.0.3 evaluate_0.14
-## [67] sf_0.9-6 recipes_0.1.15 htmlwidgets_1.5.2
-## [70] labeling_0.4.2 cowplot_1.1.0 tidyselect_1.1.0
-## [73] processx_3.4.4 plyr_1.8.6 magrittr_1.5
-## [76] R6_2.5.0 generics_0.1.0 DBI_1.1.0
-## [79] mgcv_1.8-33 pillar_1.4.6 haven_2.3.1
-## [82] withr_2.3.0 units_0.6-7 survival_3.2-7
-## [85] sp_1.4-4 nnet_7.3-14 modelr_0.1.8
-## [88] crayon_1.3.4 KernSmooth_2.23-17 utf8_1.1.4
-## [91] rmarkdown_2.5 usethis_1.6.3 grid_4.0.3
-## [94] readxl_1.3.1 data.table_1.13.2 callr_3.5.1
-## [97] ModelMetrics_1.2.2.2 reprex_0.3.0 digest_0.6.27
-## [100] classInt_0.4-3 stats4_4.0.3 munsell_0.5.0
+## [1] colorspace_2.0-0 ellipsis_0.3.1
+## [3] class_7.3-17 rprojroot_1.3-2
+## [5] fs_1.5.0 rstudioapi_0.12
+## [7] farver_2.0.3 remotes_2.2.0
+## [9] prodlim_2019.11.13 fansi_0.4.1
+## [11] xml2_1.3.2 codetools_0.2-16
+## [13] splines_4.0.3 knitr_1.30
+## [15] pkgload_1.1.0 jsonlite_1.7.1
+## [17] pROC_1.16.2 dbplyr_2.0.0
+## [19] rgeos_0.5-5 compiler_4.0.3
+## [21] httr_1.4.2 backports_1.2.0
+## [23] assertthat_0.2.1 Matrix_1.2-18
+## [25] cli_2.1.0 htmltools_0.5.0
+## [27] prettyunits_1.1.1 tools_4.0.3
+## [29] gtable_0.3.0 glue_1.4.2
+## [31] rnaturalearthdata_0.1.0 reshape2_1.4.4
+## [33] Rcpp_1.0.5 cellranger_1.1.0
+## [35] vctrs_0.3.4 svglite_1.2.3.2
+## [37] nlme_3.1-149 iterators_1.0.13
+## [39] crosstalk_1.1.0.1 timeDate_3043.102
+## [41] gower_0.2.2 xfun_0.19
+## [43] ps_1.4.0 testthat_3.0.0
+## [45] rvest_0.3.6 lifecycle_0.2.0
+## [47] devtools_2.3.2 MASS_7.3-53
+## [49] scales_1.1.1 ipred_0.9-9
+## [51] hms_0.5.3 RColorBrewer_1.1-2
+## [53] yaml_2.2.1 curl_4.3
+## [55] memoise_1.1.0 rpart_4.1-15
+## [57] stringi_1.5.3 desc_1.2.0
+## [59] foreach_1.5.1 e1071_1.7-4
+## [61] pkgbuild_1.1.0 lava_1.6.8.1
+## [63] systemfonts_0.3.2 rlang_0.4.8
+## [65] pkgconfig_2.0.3 evaluate_0.14
+## [67] sf_0.9-6 recipes_0.1.15
+## [69] htmlwidgets_1.5.2 labeling_0.4.2
+## [71] cowplot_1.1.0 tidyselect_1.1.0
+## [73] processx_3.4.4 plyr_1.8.6
+## [75] magrittr_1.5 R6_2.5.0
+## [77] generics_0.1.0 DBI_1.1.0
+## [79] mgcv_1.8-33 pillar_1.4.6
+## [81] haven_2.3.1 withr_2.3.0
+## [83] units_0.6-7 survival_3.2-7
+## [85] sp_1.4-4 nnet_7.3-14
+## [87] modelr_0.1.8 crayon_1.3.4
+## [89] KernSmooth_2.23-17 utf8_1.1.4
+## [91] rmarkdown_2.5 usethis_1.6.3
+## [93] grid_4.0.3 readxl_1.3.1
+## [95] data.table_1.13.2 callr_3.5.1
+## [97] ModelMetrics_1.2.2.2 reprex_0.3.0
+## [99] digest_0.6.27 classInt_0.4-3
+## [101] stats4_4.0.3 munsell_0.5.0
## [103] viridisLite_0.3.0 sessioninfo_1.1.1
-LS0tCnRpdGxlOiAiUmVwcmVzZW50YXRpb24gYW5hbHlzaXMgb2YgbmFtZSBvcmlnaW4gaW4gdGhlIFVTIgotLS0KCmBgYHtyIHNldHVwLCBpbmNsdWRlPUZBTFNFfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShsdWJyaWRhdGUpCnNvdXJjZSgidXRpbHMvci11dGlscy5SIikKdGhlbWVfc2V0KHRoZW1lX2J3KCkgKyB0aGVtZShsZWdlbmQudGl0bGUgPSBlbGVtZW50X2JsYW5rKCkpKQpgYGAKCk9ubHkga2VlcCBhcnRpY2xlcyBmcm9tIDIwMDIgYmVjYXVzZSBmZXcgYXV0aG9ycyBoYWQgbmF0aW9uYWxpdHkgcHJlZGljdGlvbnMgYmVmb3JlIDIwMDIgKG1vc3RseSBkdWUgdG8gbWlzc2luZyBtZXRhZGF0YSkuClNlZSBbMDkzLnN1bW1hcnktc3RhdHNdKGRvY3MvMDkzLnN1bW1hcnktc3RhdHMuaHRtbCkgZm9yIG1vcmUgZGV0YWlscy4KCmBgYHtyfQpsb2FkKCJSZGF0YS9yYXdzLlJkYXRhIikKCmFscGhhX3RocmVzaG9sZCA8LSBxbm9ybSgwLjk3NSkKCnB1Ym1lZF9uYXRfZGYgPC0gY29ycl9hdXRob3JzICU+JQogIGZpbHRlcih5ZWFyKHllYXIpID49IDIwMDIpICU+JQogIHNlcGFyYXRlX3Jvd3MoY291bnRyaWVzLCBzZXAgPSAiLCIpICU+JQogIGZpbHRlcihjb3VudHJpZXMgPT0gIlVTIikgJT4lCiAgbGVmdF9qb2luKG5hdGlvbmFsaXplX2RmLCBieSA9IGMoImZvcmVfbmFtZSIsICJsYXN0X25hbWUiKSkgJT4lCiAgZ3JvdXBfYnkocG1pZCwgam91cm5hbCwgcHVibGljYXRpb25fZGF0ZSwgeWVhciwgYWRqdXN0ZWRfY2l0YXRpb25zKSAlPiUKICBzdW1tYXJpc2VfYXQodmFycyhBZnJpY2FuOlNvdXRoQXNpYW4pLCBtZWFuLCBuYS5ybSA9IFQpICU+JQogIHVuZ3JvdXAoKQoKaXNjYl9uYXRfZGYgPC0ga2V5bm90ZXMgJT4lCiAgc2VwYXJhdGVfcm93cyhhZmZsY291bnRyaWVzLCBzZXAgPSAiXFx8IikgJT4lCiAgZmlsdGVyKGFmZmxjb3VudHJpZXMgPT0gIlVuaXRlZCBTdGF0ZXMiKSAlPiUKICBsZWZ0X2pvaW4obmF0aW9uYWxpemVfZGYsIGJ5ID0gYygiZm9yZV9uYW1lIiwgImxhc3RfbmFtZSIpKQoKc3RhcnRfeWVhciA8LSAxOTkyCmVuZF95ZWFyIDwtIDIwMTkKbl95ZWFycyA8LSBlbmRfeWVhciAtIHN0YXJ0X3llYXIKbXlfam91cnMgPC0gdW5pcXVlKHB1Ym1lZF9uYXRfZGYkam91cm5hbCkKbXlfY29uZnMgPC0gdW5pcXVlKGlzY2JfbmF0X2RmJGNvbmZlcmVuY2UpCm5fam91cnMgPC0gbGVuZ3RoKG15X2pvdXJzKQpuX2NvbmZzIDwtIGxlbmd0aChteV9jb25mcykKcmVnaW9uX2xldmVscyA8LSBwYXN0ZShjKCJDZWx0aWMvRW5nbGlzaCIsICJFdXJvcGVhbiIsICJFYXN0IEFzaWFuIiwgIkhpc3BhbmljIiwgIlNvdXRoIEFzaWFuIiwgIkFyYWJpYyIsICJIZWJyZXciLCAiQWZyaWNhbiIsICJOb3JkaWMiLCAiR3JlZWsiKSwgIm5hbWVzIikKCnJlZ2lvbl9jb2xzIDwtIGMoIiNmZmZmYjMiLCAiI2ZjY2RlNSIsICIjYjNkZTY5IiwgIiNmZGI0NjIiLCAiIzgwYjFkMyIsICIjOGRkM2M3IiwgIiNiZWJhZGEiLCAiI2ZiODA3MiIsICIjYmM4MGJkIiwgIiNjY2ViYzUiKQpgYGAKCiMjIE9yZ2FuaXplIGRhdGEKClByZXBhcmUgZGF0YSBmcmFtZXMgZm9yIGxhdGVyIGFuYWx5c2VzOgoKLSByYmluZCByZXN1bHRzIG9mIHJhY2UgcHJlZGljdGlvbnMgaW4gaXNjYiBhbmQgUHVibWVkCi0gcGl2b3QgbG9uZwotIGNvbXB1dGUgbWVhbiwgc2QsIG1hcmdpbmFsIGVycm9yCgpgYGB7cn0KaXNjYl9wdWJtZWRfb3RoIDwtIGlzY2JfbmF0X2RmICU+JQogIHJlbmFtZSgiam91cm5hbCIgPSBjb25mZXJlbmNlKSAlPiUKICBzZWxlY3QoeWVhciwgam91cm5hbCwgQWZyaWNhbjpTb3V0aEFzaWFuLCBwdWJsaWNhdGlvbl9kYXRlKSAlPiUKICBtdXRhdGUoCiAgICB0eXBlID0gIktleW5vdGUgc3BlYWtlcnMvRmVsbG93cyIsCiAgICBhZGp1c3RlZF9jaXRhdGlvbnMgPSAxCiAgKSAlPiUKICBiaW5kX3Jvd3MoCiAgICBwdWJtZWRfbmF0X2RmICU+JQogICAgICBzZWxlY3QoeWVhciwgam91cm5hbCwgQWZyaWNhbjpTb3V0aEFzaWFuLCBwdWJsaWNhdGlvbl9kYXRlLCBhZGp1c3RlZF9jaXRhdGlvbnMpICU+JQogICAgICBtdXRhdGUodHlwZSA9ICJQdWJtZWQgYXV0aG9ycyIpCiAgKSAlPiUKICBtdXRhdGUoT3RoZXJDYXRlZ29yaWVzID0gU291dGhBc2lhbiArIEhpc3BhbmljICsgSmV3aXNoICsgTXVzbGltICsgTm9yZGljICsgR3JlZWsgKyBBZnJpY2FuKSAlPiUKICBwaXZvdF9sb25nZXIoYyhBZnJpY2FuOlNvdXRoQXNpYW4sIE90aGVyQ2F0ZWdvcmllcyksCiAgICBuYW1lc190byA9ICJyZWdpb24iLAogICAgdmFsdWVzX3RvID0gInByb2JhYmlsaXRpZXMiCiAgKSAlPiUKICBmaWx0ZXIoIWlzLm5hKHByb2JhYmlsaXRpZXMpKSAlPiUKICBncm91cF9ieSh0eXBlLCB5ZWFyLCByZWdpb24pICU+JQogIG11dGF0ZSgKICAgIHBtY19jaXRhdGlvbnNfeWVhciA9IG1lYW4oYWRqdXN0ZWRfY2l0YXRpb25zKSwKICAgIHdlaWdodCA9IGFkanVzdGVkX2NpdGF0aW9ucyAvIHBtY19jaXRhdGlvbnNfeWVhciwKICAgIHdlaWdodGVkX3Byb2JzID0gcHJvYmFiaWxpdGllcyAqIHdlaWdodAogICkKCmlzY2JfcHVibWVkX3N1bV9vdGggPC0gaXNjYl9wdWJtZWRfb3RoICU+JQogIHN1bW1hcmlzZSgKICAgIG1lYW5fcHJvYiA9IG1lYW4od2VpZ2h0ZWRfcHJvYnMpLAogICAgc2VfcHJvYiA9IHNxcnQodmFyKHByb2JhYmlsaXRpZXMpICogc3VtKHdlaWdodF4yKSAvIChzdW0od2VpZ2h0KV4yKSksCiAgICBtZV9wcm9iID0gYWxwaGFfdGhyZXNob2xkICogc2VfcHJvYiwKICAgIC5ncm91cHMgPSAiZHJvcCIKICApCgppc2NiX3B1Ym1lZF9zdW0gPC0gaXNjYl9wdWJtZWRfc3VtX290aCAlPiUKICBmaWx0ZXIocmVnaW9uICE9ICJPdGhlckNhdGVnb3JpZXMiKQpgYGAKCiMjIEZpZ3VyZXMgZm9yIHBhcGVyCgpgYGB7ciBmaWcuaGVpZ2h0PTcsIGZpZy53aWR0aD05LCB3YXJuaW5nPUZBTFNFfQpmaWdfdXNfbmFtZV9vcmlnaW5hIDwtIGlzY2JfcHVibWVkX3N1bSAlPiUKICBmaWx0ZXIoeWVhciA8ICIyMDIwLTAxLTAxIikgJT4lCiAgcmVnaW9uX2JyZWFrZG93bigibWFpbiIsIHJlZ2lvbl9sZXZlbHMsIGZjdF9yZXYodHlwZSkpICsKICBndWlkZXMoZmlsbCA9IGd1aWRlX2xlZ2VuZChucm93ID0gMikpCgpsYXJnZV9yZWdpb25zIDwtIGMoIkNlbHRpY0VuZ2xpc2giLCAiRWFzdEFzaWFuIiwgIkV1cm9wZWFuIiwgIk90aGVyQ2F0ZWdvcmllcyIpCgojIyBNZWFuIGFuZCBzdGFuZGFyZCBkZXZpYXRpb24gb2YgcHJlZGljdGVkIHByb2JhYmlsaXRpZXM6CmZpZ191c19uYW1lX29yaWdpbmIgPC0gaXNjYl9wdWJtZWRfc3VtX290aCAlPiUKICBmaWx0ZXIocmVnaW9uICVpbiUgbGFyZ2VfcmVnaW9ucykgJT4lCiAgcmVjb2RlX3JlZ2lvbigpICU+JQogIGdhbV9hbmRfY2koCiAgICBkZjIgPSBpc2NiX3B1Ym1lZF9vdGggJT4lCiAgICAgIGZpbHRlcihyZWdpb24gJWluJSBsYXJnZV9yZWdpb25zKSAlPiUKICAgICAgcmVjb2RlX3JlZ2lvbigpLAogICAgc3RhcnRfeSA9IHN0YXJ0X3llYXIsIGVuZF95ID0gZW5kX3llYXIKICApICsKICB0aGVtZSgKICAgIGxlZ2VuZC5wb3NpdGlvbiA9IGMoMC44OCwgMC44MyksCiAgICBwYW5lbC5ncmlkLm1pbm9yID0gZWxlbWVudF9ibGFuaygpLAogICAgbGVnZW5kLm1hcmdpbiA9IG1hcmdpbigtMC41LCAwLCAwLCAwLCB1bml0ID0gImNtIiksCiAgICBsZWdlbmQudGV4dCA9IGVsZW1lbnRfdGV4dChzaXplID0gNikKICApICsKICBmYWNldF93cmFwKHZhcnMoZmN0X3JlbGV2ZWwocmVnaW9uLCBsYXJnZV9yZWdpb25zKSksIG5yb3cgPSAxKQoKZmlnX3VzX25hbWVfb3JpZ2luIDwtIGNvd3Bsb3Q6OnBsb3RfZ3JpZChmaWdfdXNfbmFtZV9vcmlnaW5hLCBmaWdfdXNfbmFtZV9vcmlnaW5iLCBsYWJlbHMgPSAiQVVUTyIsIG5jb2wgPSAxLCByZWxfaGVpZ2h0cyA9IGMoMS4zLCAxKSkKZmlnX3VzX25hbWVfb3JpZ2luCmdnc2F2ZSgiZmlncy91c19uYW1lX29yaWdpbi5wbmciLCBmaWdfdXNfbmFtZV9vcmlnaW4sIHdpZHRoID0gNi41LCBoZWlnaHQgPSA1LjUpCmdnc2F2ZSgiZmlncy91c19uYW1lX29yaWdpbi5zdmciLCBmaWdfdXNfbmFtZV9vcmlnaW4sIHdpZHRoID0gNi41LCBoZWlnaHQgPSA1LjUpCmBgYAoKIyMgSHlwb3RoZXNpcyB0ZXN0aW5nCgpgYGB7cn0KaXNjYl9sbSA8LSBpc2NiX3B1Ym1lZF9vdGggJT4lCiAgdW5ncm91cCgpICU+JQogIG11dGF0ZSgKICAgICMgeWVhciA9IGMoc2NhbGUoeWVhcikpLAogICAgIyB5ZWFyID0gYXMuZmFjdG9yKHllYXIpLAogICAgdHlwZSA9IHJlbGV2ZWwoYXMuZmFjdG9yKHR5cGUpLCByZWYgPSAiUHVibWVkIGF1dGhvcnMiKQogICkKbWFpbl9sbSA8LSBmdW5jdGlvbihyZWdpb25pKSB7CiAgZ2xtKHR5cGUgfiB5ZWFyICsgd2VpZ2h0ZWRfcHJvYnMsCiAgICBkYXRhID0gaXNjYl9sbSAlPiUKICAgICAgZmlsdGVyKHJlZ2lvbiA9PSByZWdpb25pLCAhaXMubmEod2VpZ2h0ZWRfcHJvYnMpLCB5ZWFyKHllYXIpID49IDIwMDIpLAogICAgZmFtaWx5ID0gImJpbm9taWFsIgogICkKfQoKaW50ZV9sbSA8LSBmdW5jdGlvbihyZWdpb25pKSB7CiAgZ2xtKHR5cGUgfiB3ZWlnaHRlZF9wcm9icyAqIHllYXIsCiAgICBkYXRhID0gaXNjYl9sbSAlPiUKICAgICAgZmlsdGVyKHJlZ2lvbiA9PSByZWdpb25pLCAhaXMubmEod2VpZ2h0ZWRfcHJvYnMpLCB5ZWFyKHllYXIpID49IDIwMDIpLAogICAgZmFtaWx5ID0gImJpbm9taWFsIgogICkKfQoKbWFpbl9saXN0IDwtIGxhcHBseShsYXJnZV9yZWdpb25zLCBtYWluX2xtKQoKbmFtZXMobWFpbl9saXN0KSA8LSBsYXJnZV9yZWdpb25zCmxhcHBseShtYWluX2xpc3QsIGJyb29tOjp0aWR5KQoKaW50ZV9saXN0IDwtIGxhcHBseShsYXJnZV9yZWdpb25zLCBpbnRlX2xtKQpsYXBwbHkoaW50ZV9saXN0LCBicm9vbTo6dGlkeSkKZm9yIChpIGluIDE6NCkgewogIHByaW50KGFub3ZhKG1haW5fbGlzdFtbaV1dLCBpbnRlX2xpc3RbW2ldXSwgdGVzdCA9ICJDaGlzcSIpKQp9CmBgYApJbnRlcmFjdGlvbiB0ZXJtcyBkbyBub3QgcHJlZGljdCBgdHlwZWAgb3ZlciBhbmQgYWJvdmUgdGhlIG1haW4gZWZmZWN0IG9mIG5hbWUgb3JpZ2luIHByb2JhYmlsaXR5IGFuZCB5ZWFyIChfcF8gPiAwLjAxKS4KCmBgYHtyIGVjaG8gPSBGfQpnZXRfZXhwIDwtIGZ1bmN0aW9uKGksIGNvbHUpIHsKICBicm9vbTo6dGlkeShtYWluX2xpc3RbW2ldXSkgJT4lCiAgICBmaWx0ZXIodGVybSA9PSAid2VpZ2h0ZWRfcHJvYnMiKSAlPiUKICAgIHB1bGwoY29sdSkKfQoKcHJpbnRfcCA8LSBmdW5jdGlvbih4KSBzcHJpbnRmKCIlMC41ZyIsIHgpCmBgYAoKIyMgQ29uY2x1c2lvbgoKQW4gRWFzdCBBc2lhbiBuYW1lIGhhcyBgciBleHAoZ2V0X2V4cCgyLCAnZXN0aW1hdGUnKSlgIHRoZSBvZGRzIG9mIGJlaW5nIHNlbGVjdGVkIGFzIGFuIGhvbm9yZWUsIHNpZ25pZmljYW50bHkgbG93ZXIgY29tcGFyZWQgdG8gb3RoZXIgbmFtZXMgKCRcYmV0YV9cdGV4dHJte0Vhc3QgQXNpYW59ID0kIGByIHByaW50X3AoZ2V0X2V4cCgyLCAnZXN0aW1hdGUnKSlgLCBfUF8gPSBgciBwcmludF9wKGdldF9leHAoMiwgJ3AudmFsdWUnKSlgKS4KVGhlIHR3byBncm91cHMgb2Ygc2NpZW50aXN0cyBkaWQgbm90IGhhdmUgYSBzaWduaWZpY2FudCBhc3NvY2lhdGlvbiB3aXRoIG5hbWVzIHByZWRpY3RlZCB0byBiZSBDZWx0aWMvRW5nbGlzaCAoX1BfID0gYHIgcHJpbnRfcChnZXRfZXhwKDEsICdwLnZhbHVlJykpYCksIEV1cm9wZWFuIChfUF8gPSBgciBwcmludF9wKGdldF9leHAoMywgJ3AudmFsdWUnKSlgKSwgb3IgaW4gT3RoZXIgY2F0ZWdvcmllcyAoX1BfID0gYHIgcHJpbnRfcChnZXRfZXhwKDQsICdwLnZhbHVlJykpYCkuCgojIyBTdXBwbGVtZW50CgojIyMgU3VwcGxlbWVudGFyeSBGaWd1cmUgUzcgeyNzdXBfZmlnX3M3fQpJdCdzIGRpZmZpY3VsdCB0byBjb21lIHRvIGEgY29uY2x1c2lvbiBmb3Igb3RoZXIgcmVnaW9ucyB3aXRoIHNvIGZldyBkYXRhIHBvaW50cyBhbmQgdGhlIGltcGVyZmVjdCBhY2N1cmFjeSBvZiBvdXIgcHJlZGljdGlvbi4KVGhlcmUgc2VlbXMgdG8gYmUgbGl0dGxlIGRpZmZlcmVuY2UgYmV0d2VlbiB0aGUgcHJvcG9ydGlvbiBvZiBrZXlub3RlIHNwZWFrZXJzIG9mIEFmcmljYW4sIEFyYWJpYywgU291dGggQXNpYW4gYW5kIEhpc3BhbmljIG9yaWdpbiB0aGFuIHRob3NlIGluIHRoZSBmaWVsZC4KSG93ZXZlciwganVzdCBiZWNhdXNlIGEgbmF0aW9uYWxpdHkgaXNuJ3QgdW5kZXJyZXByZXNlbnRlZCBhZ2FpbnN0IHRoZSBmaWVsZCBkb2Vzbid0IG1lYW4gc2NpZW50aXN0cyBmcm9tIHRoYXQgbmF0aW9uYWxpdHkgYXJlIGFwcHJvcHJpYXRlbHkgcmVwcmVzZW50ZWQuCgpgYGB7ciBmaWcuaGVpZ2h0PTYsIHdhcm5pbmc9RkFMU0V9CmRmMiA8LSBpc2NiX3B1Ym1lZF9vdGggJT4lCiAgZmlsdGVyKHJlZ2lvbiAhPSAiT3RoZXJDYXRlZ29yaWVzIikgJT4lCiAgcmVjb2RlX3JlZ2lvbigpCgpmaWdfczcgPC0gaXNjYl9wdWJtZWRfc3VtICU+JQogIHJlY29kZV9yZWdpb24oKSAlPiUKICBnYW1fYW5kX2NpKAogICAgZGYyID0gZGYyLAogICAgc3RhcnRfeSA9IHN0YXJ0X3llYXIsIGVuZF95ID0gZW5kX3llYXIKICApICsKICB0aGVtZShsZWdlbmQucG9zaXRpb24gPSBjKDAuOCwgMC4xKSkgKwogIGZhY2V0X3dyYXAodmFycyhmY3RfcmVsZXZlbChyZWdpb24sIHJlZ2lvbl9sZXZlbHMpKSwgbmNvbCA9IDMpCgpmaWdfczcKZ2dzYXZlKCJmaWdzL2ZpZ19zNy5wbmciLCBmaWdfczcsIHdpZHRoID0gNiwgaGVpZ2h0ID0gNikKZ2dzYXZlKCJmaWdzL2ZpZ19zNy5zdmciLCBmaWdfczcsIHdpZHRoID0gNiwgaGVpZ2h0ID0gNikKYGBgCgoKYGBge3J9CnNlc3Npb25JbmZvKCkKYGBgCg==
+LS0tCnRpdGxlOiAiUmVwcmVzZW50YXRpb24gYW5hbHlzaXMgb2YgbmFtZSBvcmlnaW4gaW4gdGhlIFVTIgotLS0KCmBgYHtyIHNldHVwLCBpbmNsdWRlPUZBTFNFfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShsdWJyaWRhdGUpCnNvdXJjZSgidXRpbHMvci11dGlscy5SIikKdGhlbWVfc2V0KHRoZW1lX2J3KCkgKyB0aGVtZShsZWdlbmQudGl0bGUgPSBlbGVtZW50X2JsYW5rKCkpKQpgYGAKCk9ubHkga2VlcCBhcnRpY2xlcyBmcm9tIDIwMDIgYmVjYXVzZSBmZXcgYXV0aG9ycyBoYWQgbmF0aW9uYWxpdHkgcHJlZGljdGlvbnMgYmVmb3JlIDIwMDIgKG1vc3RseSBkdWUgdG8gbWlzc2luZyBtZXRhZGF0YSkuClNlZSBbMDkzLnN1bW1hcnktc3RhdHNdKGRvY3MvMDkzLnN1bW1hcnktc3RhdHMuaHRtbCkgZm9yIG1vcmUgZGV0YWlscy4KCmBgYHtyfQpsb2FkKCJSZGF0YS9yYXdzLlJkYXRhIikKCmFscGhhX3RocmVzaG9sZCA8LSBxbm9ybSgwLjk3NSkKCnB1Ym1lZF9uYXRfZGYgPC0gY29ycl9hdXRob3JzICU+JQogIGZpbHRlcih5ZWFyKHllYXIpID49IDIwMDIpICU+JQogIHNlcGFyYXRlX3Jvd3MoY291bnRyaWVzLCBzZXAgPSAiLCIpICU+JQogIGZpbHRlcihjb3VudHJpZXMgPT0gIlVTIikgJT4lCiAgbGVmdF9qb2luKG5hdGlvbmFsaXplX2RmLCBieSA9IGMoImZvcmVfbmFtZSIsICJsYXN0X25hbWUiKSkgJT4lCiAgZ3JvdXBfYnkocG1pZCwgam91cm5hbCwgcHVibGljYXRpb25fZGF0ZSwgeWVhciwgYWRqdXN0ZWRfY2l0YXRpb25zKSAlPiUKICBzdW1tYXJpc2VfYXQodmFycyhBZnJpY2FuOlNvdXRoQXNpYW4pLCBtZWFuLCBuYS5ybSA9IFQpICU+JQogIHVuZ3JvdXAoKQoKaXNjYl9uYXRfZGYgPC0ga2V5bm90ZXMgJT4lCiAgc2VwYXJhdGVfcm93cyhhZmZsY291bnRyaWVzLCBzZXAgPSAiXFx8IikgJT4lCiAgZmlsdGVyKGFmZmxjb3VudHJpZXMgPT0gIlVuaXRlZCBTdGF0ZXMiKSAlPiUKICBsZWZ0X2pvaW4obmF0aW9uYWxpemVfZGYsIGJ5ID0gYygiZm9yZV9uYW1lIiwgImxhc3RfbmFtZSIpKQoKc3RhcnRfeWVhciA8LSAxOTkyCmVuZF95ZWFyIDwtIDIwMTkKbl95ZWFycyA8LSBlbmRfeWVhciAtIHN0YXJ0X3llYXIKbXlfam91cnMgPC0gdW5pcXVlKHB1Ym1lZF9uYXRfZGYkam91cm5hbCkKbXlfY29uZnMgPC0gdW5pcXVlKGlzY2JfbmF0X2RmJGNvbmZlcmVuY2UpCm5fam91cnMgPC0gbGVuZ3RoKG15X2pvdXJzKQpuX2NvbmZzIDwtIGxlbmd0aChteV9jb25mcykKcmVnaW9uX2xldmVscyA8LSBwYXN0ZShjKCJDZWx0aWMvRW5nbGlzaCIsICJFdXJvcGVhbiIsICJFYXN0IEFzaWFuIiwgIkhpc3BhbmljIiwgIlNvdXRoIEFzaWFuIiwgIkFyYWJpYyIsICJIZWJyZXciLCAiQWZyaWNhbiIsICJOb3JkaWMiLCAiR3JlZWsiKSwgIm5hbWVzIikKCnJlZ2lvbl9jb2xzIDwtIGMoIiNmZmZmYjMiLCAiI2ZjY2RlNSIsICIjYjNkZTY5IiwgIiNmZGI0NjIiLCAiIzgwYjFkMyIsICIjOGRkM2M3IiwgIiNiZWJhZGEiLCAiI2ZiODA3MiIsICIjYmM4MGJkIiwgIiNjY2ViYzUiKQpgYGAKCiMjIE9yZ2FuaXplIGRhdGEKClByZXBhcmUgZGF0YSBmcmFtZXMgZm9yIGxhdGVyIGFuYWx5c2VzOgoKLSByYmluZCByZXN1bHRzIG9mIHJhY2UgcHJlZGljdGlvbnMgaW4gaXNjYiBhbmQgUHVibWVkCi0gcGl2b3QgbG9uZwotIGNvbXB1dGUgbWVhbiwgc2QsIG1hcmdpbmFsIGVycm9yCgpgYGB7cn0KaXNjYl9wdWJtZWRfb3RoIDwtIGlzY2JfbmF0X2RmICU+JQogIHJlbmFtZSgiam91cm5hbCIgPSBjb25mZXJlbmNlKSAlPiUKICBzZWxlY3QoeWVhciwgam91cm5hbCwgQWZyaWNhbjpTb3V0aEFzaWFuLCBwdWJsaWNhdGlvbl9kYXRlKSAlPiUKICBtdXRhdGUoCiAgICB0eXBlID0gIktleW5vdGUgc3BlYWtlcnMvRmVsbG93cyIsCiAgICBhZGp1c3RlZF9jaXRhdGlvbnMgPSAxCiAgKSAlPiUKICBiaW5kX3Jvd3MoCiAgICBwdWJtZWRfbmF0X2RmICU+JQogICAgICBzZWxlY3QoeWVhciwgam91cm5hbCwgQWZyaWNhbjpTb3V0aEFzaWFuLCBwdWJsaWNhdGlvbl9kYXRlLCBhZGp1c3RlZF9jaXRhdGlvbnMpICU+JQogICAgICBtdXRhdGUodHlwZSA9ICJQdWJtZWQgYXV0aG9ycyIpCiAgKSAlPiUKICBtdXRhdGUoT3RoZXJDYXRlZ29yaWVzID0gU291dGhBc2lhbiArIEhpc3BhbmljICsgSmV3aXNoICsgTXVzbGltICsgTm9yZGljICsgR3JlZWsgKyBBZnJpY2FuKSAlPiUKICBwaXZvdF9sb25nZXIoYyhBZnJpY2FuOlNvdXRoQXNpYW4sIE90aGVyQ2F0ZWdvcmllcyksCiAgICBuYW1lc190byA9ICJyZWdpb24iLAogICAgdmFsdWVzX3RvID0gInByb2JhYmlsaXRpZXMiCiAgKSAlPiUKICBmaWx0ZXIoIWlzLm5hKHByb2JhYmlsaXRpZXMpKSAlPiUKICBncm91cF9ieSh0eXBlLCB5ZWFyLCByZWdpb24pCgppc2NiX3B1Ym1lZF9zdW1fb3RoIDwtIGlzY2JfcHVibWVkX290aCAlPiUKICBzdW1tYXJpc2UoCiAgICBtZWFuX3Byb2IgPSBtZWFuKHByb2JhYmlsaXRpZXMpLAogICAgc2VfcHJvYiA9IHNkKHByb2JhYmlsaXRpZXMpL3NxcnQobigpKSwKICAgIG1lX3Byb2IgPSBhbHBoYV90aHJlc2hvbGQgKiBzZV9wcm9iLAogICAgLmdyb3VwcyA9ICJkcm9wIgogICkKCmlzY2JfcHVibWVkX3N1bSA8LSBpc2NiX3B1Ym1lZF9zdW1fb3RoICU+JQogIGZpbHRlcihyZWdpb24gIT0gIk90aGVyQ2F0ZWdvcmllcyIpCmBgYAoKIyMgRmlndXJlcyBmb3IgcGFwZXIKCmBgYHtyIGZpZy5oZWlnaHQ9NywgZmlnLndpZHRoPTksIHdhcm5pbmc9RkFMU0V9CmZpZ191c19uYW1lX29yaWdpbmEgPC0gaXNjYl9wdWJtZWRfc3VtICU+JQogIGZpbHRlcih5ZWFyIDwgIjIwMjAtMDEtMDEiKSAlPiUKICByZWdpb25fYnJlYWtkb3duKCJtYWluIiwgcmVnaW9uX2xldmVscywgZmN0X3Jldih0eXBlKSkgKwogIGd1aWRlcyhmaWxsID0gZ3VpZGVfbGVnZW5kKG5yb3cgPSAyKSkKCmxhcmdlX3JlZ2lvbnMgPC0gYygiQ2VsdGljRW5nbGlzaCIsICJFYXN0QXNpYW4iLCAiRXVyb3BlYW4iLCAiT3RoZXJDYXRlZ29yaWVzIikKCiMjIE1lYW4gYW5kIHN0YW5kYXJkIGRldmlhdGlvbiBvZiBwcmVkaWN0ZWQgcHJvYmFiaWxpdGllczoKZmlnX3VzX25hbWVfb3JpZ2luYiA8LSBpc2NiX3B1Ym1lZF9zdW1fb3RoICU+JQogIGZpbHRlcihyZWdpb24gJWluJSBsYXJnZV9yZWdpb25zKSAlPiUKICByZWNvZGVfcmVnaW9uKCkgJT4lCiAgZ2FtX2FuZF9jaSgKICAgIGRmMiA9IGlzY2JfcHVibWVkX290aCAlPiUKICAgICAgZmlsdGVyKHJlZ2lvbiAlaW4lIGxhcmdlX3JlZ2lvbnMpICU+JQogICAgICByZWNvZGVfcmVnaW9uKCksCiAgICBzdGFydF95ID0gc3RhcnRfeWVhciwgZW5kX3kgPSBlbmRfeWVhcgogICkgKwogIHRoZW1lKAogICAgbGVnZW5kLnBvc2l0aW9uID0gYygwLjg4LCAwLjgzKSwKICAgIHBhbmVsLmdyaWQubWlub3IgPSBlbGVtZW50X2JsYW5rKCksCiAgICBsZWdlbmQubWFyZ2luID0gbWFyZ2luKC0wLjUsIDAsIDAsIDAsIHVuaXQgPSAiY20iKSwKICAgIGxlZ2VuZC50ZXh0ID0gZWxlbWVudF90ZXh0KHNpemUgPSA2KQogICkgKwogIGZhY2V0X3dyYXAodmFycyhmY3RfcmVsZXZlbChyZWdpb24sIGxhcmdlX3JlZ2lvbnMpKSwgbnJvdyA9IDEpCgpmaWdfdXNfbmFtZV9vcmlnaW4gPC0gY293cGxvdDo6cGxvdF9ncmlkKGZpZ191c19uYW1lX29yaWdpbmEsIGZpZ191c19uYW1lX29yaWdpbmIsIGxhYmVscyA9ICJBVVRPIiwgbmNvbCA9IDEsIHJlbF9oZWlnaHRzID0gYygxLjMsIDEpKQpmaWdfdXNfbmFtZV9vcmlnaW4KZ2dzYXZlKCJmaWdzL3VzX25hbWVfb3JpZ2luLnBuZyIsIGZpZ191c19uYW1lX29yaWdpbiwgd2lkdGggPSA2LjUsIGhlaWdodCA9IDUuNSwgZHBpID0gNjAwKQpnZ3NhdmUoImZpZ3MvdXNfbmFtZV9vcmlnaW4uc3ZnIiwgZmlnX3VzX25hbWVfb3JpZ2luLCB3aWR0aCA9IDYuNSwgaGVpZ2h0ID0gNS41KQpgYGAKCiMjIEh5cG90aGVzaXMgdGVzdGluZwoKYGBge3J9CmlzY2JfbG0gPC0gaXNjYl9wdWJtZWRfb3RoICU+JQogIHVuZ3JvdXAoKSAlPiUKICBtdXRhdGUoCiAgICAjIHllYXIgPSBjKHNjYWxlKHllYXIpKSwKICAgICMgeWVhciA9IGFzLmZhY3Rvcih5ZWFyKSwKICAgIHR5cGUgPSByZWxldmVsKGFzLmZhY3Rvcih0eXBlKSwgcmVmID0gIlB1Ym1lZCBhdXRob3JzIikKICApCm1haW5fbG0gPC0gZnVuY3Rpb24ocmVnaW9uaSkgewogIGdsbSh0eXBlIH4geWVhciArIHByb2JhYmlsaXRpZXMsCiAgICBkYXRhID0gaXNjYl9sbSAlPiUKICAgICAgZmlsdGVyKHJlZ2lvbiA9PSByZWdpb25pLCAhaXMubmEocHJvYmFiaWxpdGllcyksIHllYXIoeWVhcikgPj0gMjAwMiksCiAgICBmYW1pbHkgPSAiYmlub21pYWwiCiAgKQp9CgppbnRlX2xtIDwtIGZ1bmN0aW9uKHJlZ2lvbmkpIHsKICBnbG0odHlwZSB+IHByb2JhYmlsaXRpZXMgKiB5ZWFyLAogICAgZGF0YSA9IGlzY2JfbG0gJT4lCiAgICAgIGZpbHRlcihyZWdpb24gPT0gcmVnaW9uaSwgIWlzLm5hKHByb2JhYmlsaXRpZXMpLCB5ZWFyKHllYXIpID49IDIwMDIpLAogICAgZmFtaWx5ID0gImJpbm9taWFsIgogICkKfQoKbWFpbl9saXN0IDwtIGxhcHBseShsYXJnZV9yZWdpb25zLCBtYWluX2xtKQoKbmFtZXMobWFpbl9saXN0KSA8LSBsYXJnZV9yZWdpb25zCmxhcHBseShtYWluX2xpc3QsIGJyb29tOjp0aWR5KQoKaW50ZV9saXN0IDwtIGxhcHBseShsYXJnZV9yZWdpb25zLCBpbnRlX2xtKQpsYXBwbHkoaW50ZV9saXN0LCBicm9vbTo6dGlkeSkKZm9yIChpIGluIDE6NCkgewogIHByaW50KGFub3ZhKG1haW5fbGlzdFtbaV1dLCBpbnRlX2xpc3RbW2ldXSwgdGVzdCA9ICJDaGlzcSIpKQp9CmBgYApJbnRlcmFjdGlvbiB0ZXJtcyBkbyBub3QgcHJlZGljdCBgdHlwZWAgb3ZlciBhbmQgYWJvdmUgdGhlIG1haW4gZWZmZWN0IG9mIG5hbWUgb3JpZ2luIHByb2JhYmlsaXR5IGFuZCB5ZWFyIChfcF8gPiAwLjAxKS4KCmBgYHtyIGVjaG8gPSBGfQpnZXRfZXhwIDwtIGZ1bmN0aW9uKGksIGNvbHUpIHsKICBicm9vbTo6dGlkeShtYWluX2xpc3RbW2ldXSkgJT4lCiAgICBmaWx0ZXIodGVybSA9PSAicHJvYmFiaWxpdGllcyIpICU+JQogICAgcHVsbChjb2x1KQp9CgpwcmludF9wIDwtIGZ1bmN0aW9uKHgpIHNwcmludGYoIiUwLjVnIiwgeCkKYGBgCgojIyBDb25jbHVzaW9uCgpBbiBFYXN0IEFzaWFuIG5hbWUgaGFzIGByIGV4cChnZXRfZXhwKDIsICdlc3RpbWF0ZScpKWAgdGhlIG9kZHMgb2YgYmVpbmcgc2VsZWN0ZWQgYXMgYW4gaG9ub3JlZSwgc2lnbmlmaWNhbnRseSBsb3dlciBjb21wYXJlZCB0byBvdGhlciBuYW1lcyAoJFxiZXRhX1x0ZXh0cm17RWFzdCBBc2lhbn0gPSQgYHIgcHJpbnRfcChnZXRfZXhwKDIsICdlc3RpbWF0ZScpKWAsIF9QXyA9IGByIHByaW50X3AoZ2V0X2V4cCgyLCAncC52YWx1ZScpKWApLgpUaGUgdHdvIGdyb3VwcyBvZiBzY2llbnRpc3RzIGRpZCBub3QgaGF2ZSBhIHNpZ25pZmljYW50IGFzc29jaWF0aW9uIHdpdGggbmFtZXMgcHJlZGljdGVkIHRvIGJlIENlbHRpYy9FbmdsaXNoIChfUF8gPSBgciBwcmludF9wKGdldF9leHAoMSwgJ3AudmFsdWUnKSlgKSwgRXVyb3BlYW4gKF9QXyA9IGByIHByaW50X3AoZ2V0X2V4cCgzLCAncC52YWx1ZScpKWApLCBvciBpbiBPdGhlciBjYXRlZ29yaWVzIChfUF8gPSBgciBwcmludF9wKGdldF9leHAoNCwgJ3AudmFsdWUnKSlgKS4KCiMjIFN1cHBsZW1lbnQKCiMjIyBTdXBwbGVtZW50YXJ5IEZpZ3VyZSBTNyB7I3N1cF9maWdfczd9Ckl0J3MgZGlmZmljdWx0IHRvIGNvbWUgdG8gYSBjb25jbHVzaW9uIGZvciBvdGhlciByZWdpb25zIHdpdGggc28gZmV3IGRhdGEgcG9pbnRzIGFuZCB0aGUgaW1wZXJmZWN0IGFjY3VyYWN5IG9mIG91ciBwcmVkaWN0aW9uLgpUaGVyZSBzZWVtcyB0byBiZSBsaXR0bGUgZGlmZmVyZW5jZSBiZXR3ZWVuIHRoZSBwcm9wb3J0aW9uIG9mIGtleW5vdGUgc3BlYWtlcnMgb2YgQWZyaWNhbiwgQXJhYmljLCBTb3V0aCBBc2lhbiBhbmQgSGlzcGFuaWMgb3JpZ2luIHRoYW4gdGhvc2UgaW4gdGhlIGZpZWxkLgpIb3dldmVyLCBqdXN0IGJlY2F1c2UgYSBuYXRpb25hbGl0eSBpc24ndCB1bmRlcnJlcHJlc2VudGVkIGFnYWluc3QgdGhlIGZpZWxkIGRvZXNuJ3QgbWVhbiBzY2llbnRpc3RzIGZyb20gdGhhdCBuYXRpb25hbGl0eSBhcmUgYXBwcm9wcmlhdGVseSByZXByZXNlbnRlZC4KCmBgYHtyIGZpZy5oZWlnaHQ9Niwgd2FybmluZz1GQUxTRX0KZGYyIDwtIGlzY2JfcHVibWVkX290aCAlPiUKICBmaWx0ZXIocmVnaW9uICE9ICJPdGhlckNhdGVnb3JpZXMiKSAlPiUKICByZWNvZGVfcmVnaW9uKCkKCmZpZ19zNyA8LSBpc2NiX3B1Ym1lZF9zdW0gJT4lCiAgcmVjb2RlX3JlZ2lvbigpICU+JQogIGdhbV9hbmRfY2koCiAgICBkZjIgPSBkZjIsCiAgICBzdGFydF95ID0gc3RhcnRfeWVhciwgZW5kX3kgPSBlbmRfeWVhcgogICkgKwogIHRoZW1lKGxlZ2VuZC5wb3NpdGlvbiA9IGMoMC44LCAwLjEpKSArCiAgZmFjZXRfd3JhcCh2YXJzKGZjdF9yZWxldmVsKHJlZ2lvbiwgcmVnaW9uX2xldmVscykpLCBuY29sID0gMykKCmZpZ19zNwpnZ3NhdmUoImZpZ3MvZmlnX3M3LnBuZyIsIGZpZ19zNywgd2lkdGggPSA2LCBoZWlnaHQgPSA2KQpnZ3NhdmUoImZpZ3MvZmlnX3M3LnN2ZyIsIGZpZ19zNywgd2lkdGggPSA2LCBoZWlnaHQgPSA2KQpgYGAKCgpgYGB7cn0Kc2Vzc2lvbkluZm8oKQpgYGAK
diff --git a/docs/15.analyze-2020.html b/docs/15.analyze-2020.html
index 237c854..2211437 100644
--- a/docs/15.analyze-2020.html
+++ b/docs/15.analyze-2020.html
@@ -1693,8 +1693,10 @@ Gender
n_confs <- length(my_confs)
table(iscb_gender_df$afflcountries)
##
-## China Italy Japan United Kingdom United States
-## 1 1 1 1 13
+## China Italy Japan United Kingdom
+## 1 1 1 1
+## United States
+## 13
mean(iscb_gender_df$probability_male, na.rm = T)
## [1] 0.584375
Proportion of US affiliation: 76.47%. Mean probability of being male: 58.44%.
@@ -1803,7 +1805,8 @@