_targets.Rmd

---
title: "Target Markdown"
output: html_document
---

# Set up targets environment

Here we run the code needed to set up `targets`.

```{r setup, include = FALSE}
# knitr output style: merge source and output, prefix output lines with "#>"
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)

# anchor `here` to the current directory (set_here() drops a `.here` marker)
here::set_here()

# packages needed by the notebook itself
library(targets)
library(dplyr, quietly = TRUE)

# start from a clean slate: discard scripts left over from earlier knits
targets::tar_unscript()
```

# Dependencies

Here, we load packages and custom functions required in the pipeline.

```{r, engine="targets", example-globals, tar_globals = TRUE}
# Silence the tidyverse attach banner.
options(tidyverse.quiet = TRUE)

# Packages made available to every target in the pipeline.
# The order is kept as-is because attach order decides which package masks which.
tar_option_set(
  packages = c(
    "bookdown",
    "dplyr",
    "tidyr",
    "purrr",
    "readr",
    "quanteda",
    "igraph",
    "DiagrammeR",
    "tibble",
    "stringr",
    "topicmodels",
    "ggplot2",
    "scales",
    "ldatuning",
    "data.table",
    "tidytext",
    "tm",
    "RSQLite",
    "DBI",
    "MatchIt",
    "qs",
    "tidyverse",
    "psych",
    "arrow",
    "tarchetypes",
    "cobalt",
    "furrr"
  )
)

# Read the custom functions stored in the project's R/ folder.
# Only `.R` / `.r` scripts are sourced, so a stray non-R file in that folder
# (notes, data, editor backups) cannot break the pipeline. `invisible()`
# discards the list of source() results, which is of no interest here.
invisible(
  lapply(
    list.files(here::here("R"), pattern = "\\.[Rr]$", full.names = TRUE),
    source
  )
)
```

# Data import

Here we import all the data we work with. The data are stored in the cloud; the download links are read from environment variables.

```{r, engine="targets", data_import}
# Give slow downloads enough time: the limit is in seconds (3000 s = 50 min).
options(timeout = 3000)

list(
  # ---- Raw files: each URL comes from an environment variable ----
  tarchetypes::tar_download(
    name = ids_path,
    urls = Sys.getenv("link_to_supervisors"),
    paths = here::here("data", "raw", "supervisors.csv")
  ),
  tarchetypes::tar_download(
    name = gender_path,
    urls = Sys.getenv("link_to_gender"),
    paths = here::here("data", "raw", "names4api_tagged.tsv")
  ),
  # Arrow files are fetched with mode = "wb" (binary write).
  tarchetypes::tar_download(
    name = authors_arrow_path,
    urls = Sys.getenv("link_to_updated_authors"),
    paths = here::here("data", "raw", "authors_by_pubs.arrow"),
    mode = "wb"
  ),
  tarchetypes::tar_download(
    name = text_arrow_path,
    urls = Sys.getenv("link_to_updated_texts"),
    paths = here::here("data", "raw", "riv_text.arrow"),
    mode = "wb"
  ),
  tarchetypes::tar_download(
    name = cep_investigators_path,
    urls = Sys.getenv("link_to_cep_investigators"),
    paths = here::here("data", "raw", "cep_investigators.arrow"),
    mode = "wb"
  ),
  tarchetypes::tar_download(
    name = cep_details_path,
    urls = Sys.getenv("link_to_cep_details"),
    paths = here::here("data", "raw", "cep_details.arrow"),
    mode = "wb"
  ),
  tarchetypes::tar_download(
    name = riv_details_path,
    urls = Sys.getenv("link_to_riv_details"),
    paths = here::here("data", "raw", "riv_details.arrow"),
    mode = "wb"
  ),
  tarchetypes::tar_download(
    name = riv_authors_path,
    urls = Sys.getenv("link_to_riv_authors"),
    paths = here::here("data", "raw", "riv_authors.arrow"),
    mode = "wb"
  ),
  tarchetypes::tar_download(
    name = sup_path,
    urls = Sys.getenv("link_to_manual_sup_search"),
    paths = here::here("data", "raw", "manual_sup_search_6.csv")
  ),
  tarchetypes::tar_download(
    name = sup_path_second,
    urls = Sys.getenv("link_to_manual_sup_search_second"),
    paths = here::here("data", "raw", "manual_sup_search_second.csv")
  ),
  tarchetypes::tar_download(
    name = blacklist_path,
    urls = Sys.getenv("link_to_blacklist_control"),
    paths = here::here(
      "data", "raw", "blacklist_control_final_before_disamb.csv"
    )
  ),
  tarchetypes::tar_download(
    name = all_sup_path,
    urls = Sys.getenv("link_to_all_sup"),
    paths = here::here("data", "raw", "all_sup.csv")
  ),

  # ---- Read the downloaded files into R objects ----
  # Disabled local-file alternative to the `ids_path` download:
  # tar_target(ids_path, here::here("data", "raw", "supervisors_updated.csv"), format = "file"),
  tar_target(name = ids, command = read.csv(ids_path)),
  # Disabled local-file alternative to the `gender_path` download:
  # tar_target(gender_path, here::here("data", "raw", "names4api_tagged.tsv"), format = "file"),
  tar_target(name = gender, command = read.delim(gender_path)),
  tar_target(
    name = authors_arrow,
    command = arrow::read_feather(authors_arrow_path)
  ),
  tar_target(
    name = text_arrow,
    command = arrow::read_feather(text_arrow_path)
  ),
  tar_target(
    name = cep_investigators_arrow,
    command = arrow::read_feather(cep_investigators_path)
  ),
  tar_target(
    name = cep_details_arrow,
    command = arrow::read_feather(cep_details_path)
  ),
  tar_target(
    name = riv_details_arrow,
    command = arrow::read_feather(riv_details_path)
  ),
  tar_target(
    name = riv_authors_arrow,
    command = arrow::read_feather(riv_authors_path)
  ),

  # ---- SQLite database produced by transcribe_database() from the Arrow tables ----
  # Tracked as a file target (format = "file").
  tar_target(
    name = db_path,
    command = transcribe_database(
      path = here::here("data", "raw", "czech_sci_db.sqlite"),
      authors_arrow,
      cep_investigators_arrow,
      cep_details_arrow,
      riv_details_arrow,
      riv_authors_arrow,
      text_arrow
    ),
    format = "file"
  ),

  # ---- Manually collected control tables ----
  tar_target(name = sup_control, command = read.csv(sup_path)),
  tar_target(name = sup_control_second, command = read.csv(sup_path_second)),
  tar_target(name = blacklist_control, command = read.csv(blacklist_path)),
  tar_target(name = all_sup, command = read.csv(all_sup_path))
)
```

# Matching

Here we construct the control groups via Mahalanobis distance matching. There are two control groups, funded and unfunded, each with its own matching procedure; they are later combined into one dataset (target "matching") so that they are easier to work with.


```{r, engine="targets", matching}
list(
  # Input for both matching procedures.
  tar_target(
    name = matching_data,
    command = match_data_prep(db_path, ids, gender, blacklist_control)
  ),
  # Matched object for the unfunded control group.
  tar_target(
    name = matched_obj_unfunded,
    command = match_obj_unfunded(matching_data)
  ),
  # Matched object for the funded control group (also needs the database).
  tar_target(
    name = matched_obj_funded,
    command = match_obj_funded(db_path, matching_data)
  ),
  # Single dataset built from both matched objects.
  tar_target(
    name = matching,
    command = match_data(matched_obj_unfunded, matched_obj_funded)
  )
)
```

# Data preparation

Here we disambiguate the manually collected names of supervisors ("sup_complete", "ids_complete") and combine them into the final dataset, including the list of publications for each observation (in the target "all_authors").

Further, we prepare the data for the construction of topic models in the targets "all_pubs" and "random_pubs".


```{r, engine="targets", data_prep}
list(
  # Supervisor and researcher identifiers.
  tar_target(
    name = sup_complete,
    command = create_sup_complete(all_sup, sup_control, db_path, matching_data)
  ),
  tar_target(
    name = ids_complete,
    command = create_ids_complete(ids, db_path, matching_data)
  ),

  # Matched sample joined with the identifiers above.
  tar_target(
    name = final_data,
    command = connect_data(matching, db_path, ids_complete, sup_complete)
  ),
  tar_target(
    name = all_authors,
    command = read_all_authors(db_path, ids_complete, final_data)
  ),

  # Publication sets used as topic-model input.
  tar_target(
    name = all_pubs,
    command = get_all_pubs(db_path, final_data)
  ),
  tar_target(
    name = random_pubs,
    command = get_random_pubs(db_path)
  )
)
```

# Topic model

Here we construct a smaller topic model ("topic_model_original") based on 1200 publications randomly selected from the full database of all Czech publications. We then take all publications of the researchers in our sample (both the treatment group and the control groups, target "topic_model_input_addition") and match them onto the existing topic model ("topic_model"). This limits the endogeneity of the topic model compared with calculating it only from the publications of our sample researchers.


```{r, engine="targets", topic_model}
list(
  # Base model, built from the random publications.
  tar_target(
    name = topic_model_input,
    command = make_topic_model_input(random_pubs)
  ),
  tar_target(
    name = topic_number,
    command = find_topic_number(topic_model_input),
    format = "qs"
  ),
  tar_target(
    name = topic_model_original,
    command = make_topic_model(topic_model_input, topic_number),
    format = "qs"
  ),

  # Sample publications added onto the base model.
  tar_target(
    name = topic_model_input_addition,
    command = make_topic_model_input_addition(all_pubs)
  ),
  tar_target(
    name = topic_model,
    command = make_topic_model_addition(
      topic_model_input_addition,
      topic_model_original
    ),
    format = "qs"
  )
)
```

# Coauthorship network

Here we construct the coauthorship network for each observation in our sample; it is the precursor from which two parts of the independence indicator are calculated.

```{r, engine="targets", network}

list(
  # Network data derived from the author/publication table.
  tar_target(
    name = coauthorship,
    command = make_network(all_authors)
  ),
  # Graph representation built from that network data.
  tar_target(
    name = coauthorship_graph,
    command = make_graph(coauthorship)
  )
)

```

# Researcher independence indicator (RII) part 1 - eigenvector centrality

Here we calculate the first part of the independence indicator: the eigenvector centrality of the PhD supervisors in the author's network.

```{r, engine="targets", RII1}

list(
  # RII part 1, computed from the coauthorship graph.
  tar_target(
    name = eigen_centrality,
    command = comp_eigen_centr(coauthorship_graph)
  )
)

```

# Researcher independence indicator (RII) part 2 - clustering coefficient

Here we calculate the second part of the independence indicator: the clustering coefficient of the PhD supervisors in the author's network.

```{r, engine="targets", RII2}

list(
  # RII part 2; takes the output of the eigenvector-centrality step as input.
  tar_target(
    name = cluster,
    command = comp_clustering(eigen_centrality)
  )
)

```

# Researcher independence indicator (RII) part 3 - independent publications

Here we calculate the third part of the independence indicator: the ratio of publications without coauthorship with the author's PhD supervisor.

```{r, engine="targets", RII3}

list(
  # RII part 3; takes the output of the clustering step as input.
  tar_target(
    name = independent_pubs,
    command = comp_ind_pubs(cluster)
  )
)

```

# Researcher independence indicator (RII) part 4 - independent topics

Here we calculate the fourth part of the independence indicator: the ratio of topics that the author publishes on but their PhD supervisor does not.


```{r, engine="targets", RII4}

list(
  # RII part 4; combines the previous step with the topic model and the database.
  tar_target(
    name = independent_topics,
    command = comp_ind_topics(independent_pubs, topic_model, db_path)
  )
)

```


# Final indicator

Here we construct the final independence indicator using the parts calculated above.  

```{r, engine="targets", final}

list(
  # Disabled scratch target, kept for reference:
  # tar_target(testing,
  #            just_test(db_path),
  #            format = "qs")

  # Final indicator, built from the last RII step and the final dataset.
  tar_target(
    name = full_indicator,
    command = calc_full_indicator(independent_topics, final_data)
  )
)


```


# Attachments and additional analyses

Here we test to what extent PhD advisors can be identified algorithmically.

```{r, engine="targets", attachments}

list(
  # Disabled scratch target, kept for reference:
  # tar_target(testing,
  #            just_test(db_path),
  #            format = "qs")

  # Built by refine_data() from the matched sample, the database and the supervisor table.
  tar_target(
    name = sup_alg,
    command = refine_data(matching, db_path, sup_complete)
  )
)


```


# Article

Here is the paper presenting the results. It also contains the code for figures, hypothesis testing, etc.

```{targets article}
list(
  # Disabled analysis targets, kept for reference:
  # tar_target(results,
  #            analyse(full_indicator),
  #            format = "qs") ,
  # tar_target(sup_testing,
  #            sup_test(matching, db_path, ids, sup_control, ids_complete),
  #            format = "qs") ,

  # Render the article from its R Markdown source.
  tarchetypes::tar_render(
    name = article,
    path = here::here("article_formatted_doc.Rmd")
  )
)


```