In danimedi/residentado_peru: What the Package Does (One Line, Title Case)

knitr::opts_chunk$set(echo = TRUE)

# packages
library(readr)
library(dplyr)
library(tibble)
library(magrittr)
library(stringr)
library(purrr)

# source multiple functions in the code directory
source2 <- function(...) {
  funs <- unlist(list(...))
  scripts <- file.path("R", funs)
  purrr::walk(scripts, source)
}

write_complete_data <- function(df) {
  readr::write_csv(df, "peru_residentado.csv")
}

read_complete_data <- function() {
  readr::read_csv("peru_residentado.csv")
}

Extract and clean the data from PDF tables

source2("read_pdf_tables.R")
read_pdf_tables(path_input_pdf = "conareme/pdf", 
                path_output_csv = "conareme/identified")

Unify the data

source2("unify_data_sets.R")
files <- list.files("conareme/identified", full.names = TRUE, 
                     pattern = "postulantes[.]csv$")
years <- 2013:2020
applicants <- unify_data_sets(files, years)
# write_csv(applicants, "data/clean/identified/peru-postulantes-2013-2020.csv")

files <- list.files("conareme/identified", full.names = TRUE, 
                     pattern = "ingresantes[.]csv$")
years <- 2016:2020
incomings <- unify_data_sets(files, years)
# write_csv(incomings, "data/clean/identified/peru-ingresantes-2016-2020.csv")

# subset rows with codigo (year 2016)
incomings1 <- incomings %>% filter(year == "2016")
applicants1 <- applicants %>% filter(year %in% as.character(2013:2016))
x <- full_join(applicants1, incomings1, by = c("year", "Codigo"), 
               suffix = c("_postulantes", "_ingresantes"))

# subset rows without codigo
incomings2 <- incomings %>% filter(year != "2016")
applicants2 <- applicants %>% filter(!year %in% as.character(2013:2016))
y <- full_join(applicants2, incomings2, by = c("year", "No Doc."),
               suffix = c("_postulantes", "_ingresantes"))

# join both pieces
dat_joined <- bind_rows(x, y)

# check that important columns have the same values
check_cols <- function(df, cols) {
  df_filt <- df %>% select(starts_with(cols))
  lapply(seq(1, length(df_filt), 2), function(i) {
    col_name <- names(df_filt[i])
    j <- df_filt[[i]] != df_filt[[i+1]]
    problems <- tibble(
      indx = which(j),
      x = as.character(df_filt[[i]][indx]),
      y = as.character(df_filt[[i+1]][indx])
    )
    list(col_name, problems = problems)
  })
}
columns <- c("Apellido Paterno", "Apellido Materno", "Nombres", "Universidad",
             "Tipo", "Especialidad", "V11", "Modalidad", "Serum", "Bonif")
# check_cols(dat_joined, columns)

dat_final <- dat_joined %>% 
  select(
    year, Codigo, `No Doc.`, CMP, `Apellido Paterno_postulantes`, 
    `Apellido Materno_postulantes`, Nombres_postulantes, Universidad_postulantes,
    Universidad_ingresantes, Tipo_postulantes, 
    `Especialidad/SubEspecialidad_postulantes`,
    `Especialidad/SubEspecialidad_ingresantes`, Serum_postulantes, 
    Bonif._postulantes, V9, V11_postulantes, V11_ingresantes, V13,
    Modalidad_postulantes, Modalidad_ingresantes, Norma, O.M., untaje, SNCDS,
    `1er Niv.`, `5to Sup.`, `Prom Pre`, ENAM, Estado, `Esp. Previa`, Examen,
    `Factor A.`, `ota Fina`, Obs., rden, .xame, Ajuste, P.Final,
    ngres, Sede
  ) %>% rename(
    apellido_paterno = `Apellido Paterno_postulantes`, 
    apellido_materno = `Apellido Materno_postulantes`,
    num_doc = `No Doc.`, nombres = Nombres_postulantes, tipo = Tipo_postulantes,
    especialidad_subespecialidad_postulantes = `Especialidad/SubEspecialidad_postulantes`, especialidad_subespecialidad_ingresantes = `Especialidad/SubEspecialidad_ingresantes`,
    serum = Serum_postulantes, bonificacion = Bonif._postulantes,
    puntaje = untaje, primer_nivel = `1er Niv.`, quinto_superior = `5to Sup.`,
    promedio_pregrado = `Prom Pre`, especialidad_previa = `Esp. Previa`,
    nota_final = `ota Fina`, observaciones = Obs., orden = rden, examen2 = .xame,
    promedio_final = P.Final, ingreso = ngres
  )
# deal with the columns `ingreso` and `examen2` of the year 2016
dat_2016 <- dat_final %>% filter(year == "2016")
dat_2016$Examen <- dat_2016$examen2
dat_final[dat_final$year == "2016", ] <- dat_2016
dat_final <- dat_final %>% select(!c(ingreso, examen2))

After processing there were 67 rows with a lot of missing values, those were people with different IDs between the different data sets (postulantes vs. ingresantes).

# find the phantom data in the final data
# `i` evaluates the data that is present in the final data but it's not present in the applicants data
i <- !(paste0(
  dat_final$year, dat_final$apellido_paterno, dat_final$apellido_materno
) %in% paste0(
  applicants$year, applicants$`Apellido Paterno`, applicants$`Apellido Materno`
))
phantom_dat <- dat_final[i, ]

# reveal identity and join data
x <- incomings[incomings$`No Doc.` %in% phantom_dat$num_doc, ]
fixed_phantom <- left_join(x, applicants, 
                           by = c("year", "Apellido Paterno", "Apellido Materno"),
                           suffix = c("_ingresantes", "_postulantes"))
# remove empty columns
k <- map_lgl(fixed_phantom, ~ all(is.na(.x)))
fixed_phantom <- fixed_phantom[,!k]
# rename columns to join the rows
fixed_phantom <- fixed_phantom %>% 
  rename(
    apellido_paterno = `Apellido Paterno`, 
    apellido_materno = `Apellido Materno`, nombres1 = Nombres_postulantes,
    nombres2 = Nombres_ingresantes, tipo1 = Tipo_postulantes,
    tipo2 = Tipo_ingresantes,
    especialidad_subespecialidad_postulantes = `Especialidad/SubEspecialidad_postulantes`, especialidad_subespecialidad_ingresantes = `Especialidad/SubEspecialidad_ingresantes`,
    serum = Serum_postulantes,
    primer_nivel = `1er Niv.`, quinto_superior = `5to Sup.`,
    promedio_pregrado = `Prom Pre`,
    nota_final = `ota Fina`
  )
# deal with nombres2, tipo2, No Doc., Codigo
jump_cols <- function(dat, cols) {
  h <- is.na(dat[[cols[1]]]) & !is.na(dat[[cols[2]]])
  dat[h, cols[1]] <- dat[h, cols[2]]
  dat
}
fixed_phantom <- fixed_phantom %>% 
  jump_cols(c("nombres1", "nombres2")) %>% 
  jump_cols(c("tipo1", "tipo2")) %>% 
  jump_cols(c("No Doc._ingresantes", "No Doc._postulantes")) %>% 
  select(
    !c(nombres2, tipo2, `No Doc._postulantes`, No_ingresantes, Obser.,
       No_postulantes)
  ) %>% 
  rename(nombres = nombres1, tipo = tipo1, num_doc = `No Doc._ingresantes`,
         Codigo = Codigo_postulantes)
# be sure that there are no new columns
length(setdiff(names(fixed_phantom), names(dat_final))) == 0

# remove the phantom data and its conections in the main data set and add the new clean phantom data
# `i` evaluates the data that is present in the final data but it's not present in the applicants data
i <- !(paste0(
  dat_final$year, dat_final$apellido_paterno, dat_final$apellido_materno
) %in% paste0(
  applicants$year, applicants$`Apellido Paterno`, applicants$`Apellido Materno`
))
# `j` evaluates the data that is present in the final data and also in the phantom data
j <- paste0(
  dat_final$year, dat_final$apellido_paterno, dat_final$apellido_materno
) %in% paste0(
  fixed_phantom$year, fixed_phantom$apellido_paterno,
  fixed_phantom$apellido_materno
)
# remove both of them, the phantom data and its connections and add the new "clean" phantom data
l <- i | j
dat_final <- bind_rows(dat_final[!l, ], fixed_phantom)

# write_complete_data(dat_final)

Transform the data

Obtain gender from names

Gender was obtained from the first name, the sources were a list of names according to sex from Wikipedia (Nombres masculinos, Nombres femeninos) and a list from GitHub. The gender of the remaining names was assigned by hand if it was clear. This information was condensed into one data set with two columns: name and gender.

genders <- read_csv("data/names_gender/first_name-gender_dictionary.csv")
head(genders)
dim(genders)

Then we can obtain the gender in the "main" data set using the first names.

dat1 <- read_rds("data/conareme/identified/peru-COMPLETO-2013-2020.rds")
head(dat1)

# remove accents
str_to_english <- function(x) {
  x %>% 
    stringr::str_to_lower() %>% 
    stringi::stri_trans_general(id = "Latin-ASCII")
}

# obtain first name
first_names <- dat1$nombres %>% 
  str_extract("\\w+") %>% 
  str_to_english()

# create a column with the genders according with this first names
lookup <- setNames(genders$gender, genders$name)
x <- unname(lookup[first_names])
dat2 <- mutate(dat1, gender = x)

De-identification

An option for the de-identification could be "rescuing" a column to identify people, e.g. if the same person applies to residency in different years, it will have the same number in that column. The challenge is that there are few rows with both id number and code, so maybe there was a lost of information in the code before. Options to capture this information are matching the codes or using last/first-names.

At the moment there is no column for the identifier.

dat3 <- dat2 %>% select(year, gender, Universidad_postulantes:Sede)

Final "cleaning"

library(stringr)
# functions for transformations
transform_numeric <- function(x) {
  y1 <- str_extract(x, "\\d+[.]\\d+")
  y2 <- str_extract(x, "\\d+")
  ifelse(!is.na(y1), as.double(y1), as.integer(y2))
}

dat4 <- dat3 %>% mutate(
  year = as.integer(year),
  gender = ifelse(gender == "F", 1, 0),
  serum = transform_numeric(serum),
  bonificacion = as.double(bonificacion),
  puntaje = as.double(puntaje),
  SNCDS = transform_numeric(SNCDS),
  primer_nivel = transform_numeric(primer_nivel),
  quinto_superior = transform_numeric(quinto_superior),
  ENAM = transform_numeric(ENAM),
  Estado = ifelse(Estado == "Apto", 1, 0),
  especialidad_previa = ifelse(
    especialidad_previa == "NO", 0, 
    as.integer(str_extract(especialidad_previa, "\\d+"))
  ),
  Examen = transform_numeric(Examen),
  `Factor A.` = transform_numeric(`Factor A.`),
  nota_final = transform_numeric(nota_final),
  observaciones = ifelse(is.na(observaciones), 0, 1),
  orden = transform_numeric(orden),
  Ajuste = transform_numeric(Ajuste),
  promedio_final = transform_numeric(promedio_final),
)

# add a column to indicate if the person obtained or not the residency
dat4 <- dat4 %>% 
  mutate(ingreso = ifelse(is.na(especialidad_subespecialidad_ingresantes),
                          0, 1))

Fix "bad names" of universities

Some universities had problems with the names.

dat <- read_complete_data()
head(dat)

# detect low frequency universities
detect_low_freq <- function(df, cols) {
  df <- df[cols]
  map(df, function(col) {
    freqs <- table(col) %>% sort()
    tibble(value = names(freqs), n = unname(freqs))
  })
}
# detect_low_freq(dat, c("Universidad_postulantes", "Universidad_ingresantes"))

lookup_postulantes <- c(
  "AUSMP" = "USMP", "DITHURP" = "URP", "EMMAUNMSM" = "UNMSM", "ENASUPCH" = "UPCH",
  "GOUSMP" = "USMP", "HUNT" = "UNT", "HURP" = "URP", "IKUNSA" = "UNSA",
  "IUPCH" = "UPCH", "LESUNSA" = "UNSA", "LESUSMP" = "USMP", "LINDAUPAO" = "UPAO",
  "NUNCP" = "UNCP", "OSURP" = "URP", "OUNMSM" = "UNMSM", "OUNSA" = "UNSA",
  "RIOUPAO" = "UPAO", "ROSURP" = "URP", "RRESUNMSM" = "UNMSM", "RUPAO" = "UPAO",
  "SUNMSM" = "UNMSM", "SUPAO" = "UPAO", "SUPCH" = "UPCH", "AURP" = "URP",
  "EDESUNMSM" = "UNMSM", "HUPAO" = "UPAO", "IEUNSA" = "UNSA", "SURP" = "URP",
  "SUSMP" = "USMP", "SABINUANMSM" = "UNMSM", "ROSARUIORP" = "URP",
  "N FEURNMASNMDEZ" = "UNMSM"
)

lookup_ingresantes <- c(
  "OUSMP" = "USMP", "SURP" = "URP"
)

replace_lookups <- function(x, lookup) {
  ifelse(x %in% names(lookup), lookup[x], x)
}
dat$Universidad_postulantes <- replace_lookups(dat$Universidad_postulantes,
                                               lookup_postulantes)
dat$Universidad_ingresantes <- replace_lookups(dat$Universidad_ingresantes,
                                               lookup_ingresantes)

# detect_low_freq(dat, c("Universidad_postulantes", "Universidad_ingresantes"))

The processes below this line have not been finished

Clean modality

There are columns like V9, V11, V13 and modality that contain the same information, so this code checks and fix if the information is the same to avoid duplicates.

indx_modalidad_libre <- function(
  df, 
  cols_modalidad = c("V9", "V11_postulantes", "V11_ingresantes", "V13", 
                     "Modalidad_postulantes", "Modalidad_ingresantes")
) {
  df_filt <- df[cols_modalidad]
  vapply(seq_len(nrow(df_filt)), function(i) {
    row_vect <- unlist(df_filt[i, ])
    row_vect <- row_vect[!is.na(row_vect)]
    if (length(row_vect) > 0) {
      all(row_vect == "Libre")
    } else {
      FALSE
    }

  }, logical(1))
}

# split data
i <- indx_modalidad_libre(dat)
dat_libre <- dat[i, ]
dat_no_libre <- dat[!i, ]

Add the region based on the universities

universities <- read_csv("data/university_region/region_for_universities - spreadsheet.csv")
head(universities)

Use that information to create a column with the location/region/departamento.

Separate specialties by type

library(stringr)
dat <- read_complete_data()
specialties <- unique(dat$especialidad_subespecialidad_postulantes)
types_specialties <- list(
  covid_related = c(
    "MEDICINA DE EMERGENCIAS Y DESASTRES", 
    "MEDICINA DE ENFERMEDADES INFECCIOSA", 
    "ADMINISTRACION  Y GESTION EN SALUD",
    "ADMINISTRACION EN SALUD",
    "MEDICINA DE ENFERMEDADES INFECCIOSAS Y TROPIC",
    "ADMINISTRACION Y GESTION EN SALUD"
  ),
  surgical = str_subset(specialties, "(CIRUGIA|ORTOPEDIA Y TRAUMATOLOGIA)"),
  clinical = str_subset(specialties, "(CIRUGIA|ORTOPEDIA Y TRAUMATOLOGIA)", 
                        negate = TRUE)
)