knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
library(tidyverse) library(rcordis) library(readr) library(glue) library(janitor) library(conflicted) library(DBI) library(duckdb) library(dbplyr) library(here) conflict_prefer("filter", "dplyr") conflict_prefer("here", "here")
rcntmp <- "251538" idtmp <- "682999" org_raw <- "data-raw" %>% here(glue("fp{ 1:8 }-organizations.csv")) %>% read_csv2(col_types = cols(.default = "c")) %>% clean_names() # try generating proj raw as view from org raw proj_raw <- "data-raw" %>% here(glue("fp{ 1:8 }-projects.csv")) %>% read_csv2(col_types = cols(.default = "c")) %>% clean_names() ## Notes: # 1. ca. 300 recent h2020 projs are not (yet) in org_raw # proj_raw %>% # select(id, framework_programme, call) %>% # anti_join(org_raw, by = c("id" = "project_id")) %>% # count(framework_programme, call, sort = TRUE) # 2. project_id not all numeric # 3. all rcn are integers and unique # proj_raw %>% # mutate(rcn_integer = strtoi(rcn), # rcnistr = as.character(rcn_integer), # rcn_is_int = identical(rcn, rcnistr)) %>% # select(rcn, rcnistr, rcn_is_int) %>% # filter(duplicated(rcn)) # filter(!rcn_is_int) # 4. 20 % coordinators, 78 % participants, 1 % partners org_raw %>% count(role, sort = TRUE) %>% mutate(percent = round (100 * n / sum(n))) org_raw %>% mutate(ec_contribution = str_replace(ec_contribution, ",", "."), ec_contribution = as.numeric(ec_contribution)) %>% group_by(role) %>% summarise( avg = mean(ec_contribution, na.rm = TRUE), med = median(ec_contribution, na.rm = TRUE) ) proj_org <- org_raw %>% select(rcn = project_rcn, role, eur = ec_contribution, name, country, city, post = post_code, url = organization_url, tel = contact_telephone_number) %>% mutate( tel = str_replace(tel, "^[+]", "00"), telL = extract(tel, "\\d") ) # rep_raw <- "data-raw" %>% # here("fp8-reports.csv") %>% # read_csv2(col_types = cols(.default = "c")) %>% # clean_names() # check ec contributions from_org <- org_raw %>% mutate( contrib = str_replace(ec_contribution, ",", "."), contrib = parse_double(contrib)) %>% group_by(project_id) %>% summarise(total_contrib = sum(contrib, na.rm = TRUE)) %>% ungroup() %>% select(id = project_id, total_contrib) ## 22 cases don't match! proj_raw %>% select(id, ec_max_contribution, framework_programme, funding_scheme) %>% inner_join(from_org, by = "id") %>% mutate( ec_max_contribution = str_replace(ec_max_contribution, ",", "."), ec_max_contribution = as.numeric(ec_max_contribution), delta = abs(ec_max_contribution - total_contrib)) %>% filter(delta > .5) %>% arrange(desc(delta)) org_raw %>% count(name, sort = TRUE) ## 31,388 org_raw %>% count(name, country, sort = TRUE) ## 31,420 org_raw %>% count(name, city, country, sort = TRUE) %>% print(n = 20) ## 31,447 org_raw %>% select(name, city, country, url = organization_url) %>% count(name, city, country, url, sort = TRUE) %>% print(n = 20) # check unique rows: same acronym and coordinator, different ids and calls # check rcn uid [ok] con <- dbConnect(duckdb(), here("data-raw", "cordis.duckdb")) # con <- dbConnect(duckdb(), ":memory:") dbWriteTable(con, "cordis", cordis) dbListFields(con, "cordis") dbDisconnect(con) cordis <- bind_rows( fp7 %>% mutate(fp = "fp7"), h2020 %>% mutate(fp = "h2020")) %>% janitor::clean_names() %>% mutate( role = str_to_title(role), role = fct_relevel(role, "Coordinator", "Participant", "Partner"), name = str_to_title(name), name = str_remove_all(name, "/[‘’‚‛‹›‚]/"), name = str_remove_all(name, "/[“”«»„]/"), name = str_squish(name), city = str_to_title(city), city = str_remove_all(city, "/[‘’‚‛‹›‚]/"), city = str_remove_all(city, "/[“”«»„]/"), city = str_squish(city)) %>% rename( org_id = id, org_legal = name, org_city = city, org_post_code = post_code) roles <- cordis %>% distinct(role) %>% arrange(role) %>% mutate(role_id = row_number()) %>% select(role_id, everything()) cordis <- cordis %>% left_join(roles, by = "role") ## fingerprint org_names: refine <- cordis %>% select(org_id, org_legal, short_name, country, street, org_city, org_post_code, vat_number) %>% mutate( fingerprint = str_to_lower(org_legal)) refine <- refine %>% mutate( fingerprint = ifelse(identical(fingerprint, character(0)), NA, fingerprint), fingerprint_org = map_chr(fingerprint, str_c, collapse = "") ) ## try fixing missing PICs cordis %>% distinct(org_id, org_legal, org_city, org_post_code) %>% count(org_legal, org_id, sort = TRUE) tmp <- cordis %>% select(pic, project_id, ec_contribution, role_id) %>% unite("key", everything(), sep = "-", remove = FALSE) %>% filter(duplicated(key)) cordis %>% filter(is.na(id)) %>% count(name, sort = TRUE) participation <- cordis %>% distinct(pic = id, project_id, ec_contribution, role_id) glimpse(cordis) cordis %>% filter(project_id == "602289") %>% roles %>% enum_pk_candidates() # get data from the cordis ------------------------------------------------ # download.file("http://cordis.europa.eu/data/cordis-h2020organizations.xlsx", here("data", "cordis-h2020organizations.xlsx")) # download.file("http://cordis.europa.eu/data/cordis-fp7organizations.xlsx", here("data", "cordis-fp7organizations.xlsx"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.