knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(tidyverse)
library(rcordis)
library(readr)
library(glue)
library(janitor)
library(conflicted)
library(DBI)
library(duckdb)
library(dbplyr)
library(here)
conflict_prefer("filter", "dplyr")
conflict_prefer("here", "here")
rcntmp <- "251538"
idtmp <- "682999"


org_raw <- "data-raw" %>%
  here(glue("fp{ 1:8 }-organizations.csv")) %>%
  read_csv2(col_types = cols(.default = "c")) %>%
  clean_names()

# try generating proj raw as view from org raw
proj_raw <- "data-raw" %>%
  here(glue("fp{ 1:8 }-projects.csv")) %>%
  read_csv2(col_types = cols(.default = "c")) %>%
  clean_names()

## Notes:

# 1. ca. 300 recent h2020 projs are not (yet) in org_raw
# proj_raw %>%
#   select(id, framework_programme, call) %>%
#   anti_join(org_raw, by = c("id" = "project_id")) %>%
#   count(framework_programme, call, sort = TRUE)

# 2. project_id not all numeric

# 3. all rcn are integers and unique
# proj_raw %>%
#   mutate(rcn_integer = strtoi(rcn),
#          rcnistr = as.character(rcn_integer),
#          rcn_is_int = identical(rcn, rcnistr)) %>%
#   select(rcn, rcnistr, rcn_is_int) %>%
#   filter(duplicated(rcn))
#   filter(!rcn_is_int)

# 4. 20 % coordinators, 78 % participants, 1 % partners
org_raw %>% 
  count(role, sort = TRUE) %>%
  mutate(percent = round (100 * n / sum(n)))

org_raw %>%
  mutate(ec_contribution = str_replace(ec_contribution, ",", "."),
         ec_contribution = as.numeric(ec_contribution)) %>%
  group_by(role) %>%
  summarise(
    avg = mean(ec_contribution, na.rm = TRUE),
    med = median(ec_contribution, na.rm = TRUE)
  )

proj_org <- 
  org_raw %>%
  select(rcn = project_rcn, 
         role, 
         eur = ec_contribution,
         name, 
         country,
         city,
         post = post_code,
         url = organization_url,
         tel = contact_telephone_number) %>%
  mutate(
    tel = str_replace(tel, "^[+]", "00"),
    telL = extract(tel, "\\d")
  )

# rep_raw <- "data-raw" %>%
#   here("fp8-reports.csv") %>%
#   read_csv2(col_types = cols(.default = "c")) %>%
#   clean_names()



# check ec contributions
from_org <- org_raw %>% 
  mutate(
    contrib = str_replace(ec_contribution, ",", "."),
    contrib = parse_double(contrib)) %>%
  group_by(project_id) %>%
  summarise(total_contrib = sum(contrib, na.rm = TRUE)) %>%
  ungroup() %>%
  select(id = project_id, total_contrib)

## 22 cases don't match!
proj_raw %>%
  select(id, ec_max_contribution, framework_programme, funding_scheme) %>%
  inner_join(from_org, by = "id") %>%
  mutate(
    ec_max_contribution = str_replace(ec_max_contribution, ",", "."),
    ec_max_contribution = as.numeric(ec_max_contribution),
    delta = abs(ec_max_contribution - total_contrib)) %>%
  filter(delta > .5) %>%
  arrange(desc(delta))


org_raw %>% 
  count(name, sort = TRUE) ## 31,388
org_raw %>% 
  count(name, country, sort = TRUE) ## 31,420
org_raw %>% 
  count(name, city, country, sort = TRUE) %>%
  print(n = 20) ## 31,447
org_raw %>% 
  select(name, city, country, url = organization_url) %>%
  count(name, city, country, url, sort = TRUE) %>%
  print(n = 20)




# check unique rows: same acronym and coordinator, different ids and calls
# check rcn uid [ok]



con <- dbConnect(duckdb(), here("data-raw", "cordis.duckdb"))
# con <- dbConnect(duckdb(), ":memory:")
dbWriteTable(con, "cordis", cordis)
dbListFields(con, "cordis")
dbDisconnect(con)


cordis <- bind_rows(
  fp7 %>% mutate(fp = "fp7"), 
  h2020 %>% mutate(fp = "h2020")) %>%
  janitor::clean_names() %>%
  mutate(
    role = str_to_title(role),
    role = fct_relevel(role, "Coordinator", "Participant", "Partner"),
    name = str_to_title(name),
    name = str_remove_all(name, "/[‘’‚‛‹›‚]/"),
    name = str_remove_all(name, "/[“”«»„]/"),
    name = str_squish(name),
    city = str_to_title(city),
    city = str_remove_all(city, "/[‘’‚‛‹›‚]/"),
    city = str_remove_all(city, "/[“”«»„]/"),
    city = str_squish(city)) %>%
  rename(
    org_id = id,
    org_legal = name,
    org_city = city,
    org_post_code = post_code)

roles <- cordis %>%
  distinct(role) %>%
  arrange(role) %>%
  mutate(role_id = row_number()) %>%
  select(role_id, everything())

cordis <- cordis %>% left_join(roles, by = "role")

## fingerprint org_names:
refine <- cordis %>%
  select(org_id, org_legal, short_name, country, street, org_city, org_post_code, vat_number) %>%
  mutate(
    fingerprint = str_to_lower(org_legal))

refine <- refine %>%
  mutate(
    fingerprint = ifelse(identical(fingerprint, character(0)), NA, fingerprint),
    fingerprint_org = map_chr(fingerprint, str_c, collapse = "")
  )

## try fixing missing PICs

cordis %>%
  distinct(org_id, org_legal, org_city, org_post_code) %>%
  count(org_legal, org_id, sort = TRUE)

tmp <- cordis %>%
  select(pic, project_id, ec_contribution, role_id) %>%
  unite("key", everything(), sep = "-", remove = FALSE) %>%
  filter(duplicated(key))

cordis  %>% 
  filter(is.na(id)) %>%
  count(name, sort = TRUE)

participation <- cordis %>%
  distinct(pic = id, project_id, ec_contribution, role_id)

glimpse(cordis)

cordis %>% filter(project_id == "602289") %>%

roles %>%
  enum_pk_candidates()

# get data from the cordis ------------------------------------------------
# download.file("http://cordis.europa.eu/data/cordis-h2020organizations.xlsx", here("data", "cordis-h2020organizations.xlsx"))
# download.file("http://cordis.europa.eu/data/cordis-fp7organizations.xlsx", here("data", "cordis-fp7organizations.xlsx"))


zambujo/rcordis documentation built on June 14, 2020, 9:41 p.m.