knitr::opts_chunk$set(echo = TRUE) # packages library(readr) library(dplyr) library(tibble) library(magrittr) library(stringr) library(purrr) # source multiple functions in the code directory source2 <- function(...) { funs <- unlist(list(...)) scripts <- file.path("R", funs) purrr::walk(scripts, source) } write_complete_data <- function(df) { readr::write_csv(df, "peru_residentado.csv") } read_complete_data <- function() { readr::read_csv("peru_residentado.csv") }
source2("read_pdf_tables.R") read_pdf_tables(path_input_pdf = "conareme/pdf", path_output_csv = "conareme/identified")
source2("unify_data_sets.R") files <- list.files("conareme/identified", full.names = TRUE, pattern = "postulantes[.]csv$") years <- 2013:2020 applicants <- unify_data_sets(files, years) # write_csv(applicants, "data/clean/identified/peru-postulantes-2013-2020.csv")
files <- list.files("conareme/identified", full.names = TRUE, pattern = "ingresantes[.]csv$") years <- 2016:2020 incomings <- unify_data_sets(files, years) # write_csv(incomings, "data/clean/identified/peru-ingresantes-2016-2020.csv")
# subset rows with codigo (year 2016) incomings1 <- incomings %>% filter(year == "2016") applicants1 <- applicants %>% filter(year %in% as.character(2013:2016)) x <- full_join(applicants1, incomings1, by = c("year", "Codigo"), suffix = c("_postulantes", "_ingresantes")) # subset rows without codigo incomings2 <- incomings %>% filter(year != "2016") applicants2 <- applicants %>% filter(!year %in% as.character(2013:2016)) y <- full_join(applicants2, incomings2, by = c("year", "No Doc."), suffix = c("_postulantes", "_ingresantes")) # join both pieces dat_joined <- bind_rows(x, y) # check that important columns have the same values check_cols <- function(df, cols) { df_filt <- df %>% select(starts_with(cols)) lapply(seq(1, length(df_filt), 2), function(i) { col_name <- names(df_filt[i]) j <- df_filt[[i]] != df_filt[[i+1]] problems <- tibble( indx = which(j), x = as.character(df_filt[[i]][indx]), y = as.character(df_filt[[i+1]][indx]) ) list(col_name, problems = problems) }) } columns <- c("Apellido Paterno", "Apellido Materno", "Nombres", "Universidad", "Tipo", "Especialidad", "V11", "Modalidad", "Serum", "Bonif") # check_cols(dat_joined, columns) dat_final <- dat_joined %>% select( year, Codigo, `No Doc.`, CMP, `Apellido Paterno_postulantes`, `Apellido Materno_postulantes`, Nombres_postulantes, Universidad_postulantes, Universidad_ingresantes, Tipo_postulantes, `Especialidad/SubEspecialidad_postulantes`, `Especialidad/SubEspecialidad_ingresantes`, Serum_postulantes, Bonif._postulantes, V9, V11_postulantes, V11_ingresantes, V13, Modalidad_postulantes, Modalidad_ingresantes, Norma, O.M., untaje, SNCDS, `1er Niv.`, `5to Sup.`, `Prom Pre`, ENAM, Estado, `Esp. Previa`, Examen, `Factor A.`, `ota Fina`, Obs., rden, .xame, Ajuste, P.Final, ngres, Sede ) %>% rename( apellido_paterno = `Apellido Paterno_postulantes`, apellido_materno = `Apellido Materno_postulantes`, num_doc = `No Doc.`, nombres = Nombres_postulantes, tipo = Tipo_postulantes, especialidad_subespecialidad_postulantes = `Especialidad/SubEspecialidad_postulantes`, especialidad_subespecialidad_ingresantes = `Especialidad/SubEspecialidad_ingresantes`, serum = Serum_postulantes, bonificacion = Bonif._postulantes, puntaje = untaje, primer_nivel = `1er Niv.`, quinto_superior = `5to Sup.`, promedio_pregrado = `Prom Pre`, especialidad_previa = `Esp. Previa`, nota_final = `ota Fina`, observaciones = Obs., orden = rden, examen2 = .xame, promedio_final = P.Final, ingreso = ngres ) # deal with the columns `ingreso` and `examen2` of the year 2016 dat_2016 <- dat_final %>% filter(year == "2016") dat_2016$Examen <- dat_2016$examen2 dat_final[dat_final$year == "2016", ] <- dat_2016 dat_final <- dat_final %>% select(!c(ingreso, examen2))
After processing there were 67 rows with a lot of missing values, those were people with different IDs between the different data sets (postulantes vs. ingresantes).
# find the phantom data in the final data # `i` evaluates the data that is present in the final data but it's not present in the applicants data i <- !(paste0( dat_final$year, dat_final$apellido_paterno, dat_final$apellido_materno ) %in% paste0( applicants$year, applicants$`Apellido Paterno`, applicants$`Apellido Materno` )) phantom_dat <- dat_final[i, ] # reveal identity and join data x <- incomings[incomings$`No Doc.` %in% phantom_dat$num_doc, ] fixed_phantom <- left_join(x, applicants, by = c("year", "Apellido Paterno", "Apellido Materno"), suffix = c("_ingresantes", "_postulantes")) # remove empty columns k <- map_lgl(fixed_phantom, ~ all(is.na(.x))) fixed_phantom <- fixed_phantom[,!k] # rename columns to join the rows fixed_phantom <- fixed_phantom %>% rename( apellido_paterno = `Apellido Paterno`, apellido_materno = `Apellido Materno`, nombres1 = Nombres_postulantes, nombres2 = Nombres_ingresantes, tipo1 = Tipo_postulantes, tipo2 = Tipo_ingresantes, especialidad_subespecialidad_postulantes = `Especialidad/SubEspecialidad_postulantes`, especialidad_subespecialidad_ingresantes = `Especialidad/SubEspecialidad_ingresantes`, serum = Serum_postulantes, primer_nivel = `1er Niv.`, quinto_superior = `5to Sup.`, promedio_pregrado = `Prom Pre`, nota_final = `ota Fina` ) # deal with nombres2, tipo2, No Doc., Codigo jump_cols <- function(dat, cols) { h <- is.na(dat[[cols[1]]]) & !is.na(dat[[cols[2]]]) dat[h, cols[1]] <- dat[h, cols[2]] dat } fixed_phantom <- fixed_phantom %>% jump_cols(c("nombres1", "nombres2")) %>% jump_cols(c("tipo1", "tipo2")) %>% jump_cols(c("No Doc._ingresantes", "No Doc._postulantes")) %>% select( !c(nombres2, tipo2, `No Doc._postulantes`, No_ingresantes, Obser., No_postulantes) ) %>% rename(nombres = nombres1, tipo = tipo1, num_doc = `No Doc._ingresantes`, Codigo = Codigo_postulantes) # be sure that there are no new columns length(setdiff(names(fixed_phantom), names(dat_final))) == 0 # remove the phantom data and its conections in the main data set and add the new clean phantom data # `i` evaluates the data that is present in the final data but it's not present in the applicants data i <- !(paste0( dat_final$year, dat_final$apellido_paterno, dat_final$apellido_materno ) %in% paste0( applicants$year, applicants$`Apellido Paterno`, applicants$`Apellido Materno` )) # `j` evaluates the data that is present in the final data and also in the phantom data j <- paste0( dat_final$year, dat_final$apellido_paterno, dat_final$apellido_materno ) %in% paste0( fixed_phantom$year, fixed_phantom$apellido_paterno, fixed_phantom$apellido_materno ) # remove both of them, the phantom data and its connections and add the new "clean" phantom data l <- i | j dat_final <- bind_rows(dat_final[!l, ], fixed_phantom) # write_complete_data(dat_final)
Gender was obtained from the first name, the sources were a list of names according to sex from Wikipedia (Nombres masculinos, Nombres femeninos) and a list from GitHub. The gender of the remaining names was assigned by hand if it was clear. This information was condensed into one data set with two columns: name
and gender
.
genders <- read_csv("data/names_gender/first_name-gender_dictionary.csv") head(genders) dim(genders)
Then we can obtain the gender in the "main" data set using the first names.
dat1 <- read_rds("data/conareme/identified/peru-COMPLETO-2013-2020.rds") head(dat1)
# remove accents str_to_english <- function(x) { x %>% stringr::str_to_lower() %>% stringi::stri_trans_general(id = "Latin-ASCII") } # obtain first name first_names <- dat1$nombres %>% str_extract("\\w+") %>% str_to_english() # create a column with the genders according with this first names lookup <- setNames(genders$gender, genders$name) x <- unname(lookup[first_names]) dat2 <- mutate(dat1, gender = x)
An option for the de-identification could be "rescuing" a column to identify people, e.g. if the same person applies to residency in different years, it will have the same number in that column. The challenge is that there are few rows with both id number and code, so maybe there was a lost of information in the code before. Options to capture this information are matching the codes or using last/first-names.
At the moment there is no column for the identifier.
dat3 <- dat2 %>% select(year, gender, Universidad_postulantes:Sede)
library(stringr) # functions for transformations transform_numeric <- function(x) { y1 <- str_extract(x, "\\d+[.]\\d+") y2 <- str_extract(x, "\\d+") ifelse(!is.na(y1), as.double(y1), as.integer(y2)) }
dat4 <- dat3 %>% mutate( year = as.integer(year), gender = ifelse(gender == "F", 1, 0), serum = transform_numeric(serum), bonificacion = as.double(bonificacion), puntaje = as.double(puntaje), SNCDS = transform_numeric(SNCDS), primer_nivel = transform_numeric(primer_nivel), quinto_superior = transform_numeric(quinto_superior), ENAM = transform_numeric(ENAM), Estado = ifelse(Estado == "Apto", 1, 0), especialidad_previa = ifelse( especialidad_previa == "NO", 0, as.integer(str_extract(especialidad_previa, "\\d+")) ), Examen = transform_numeric(Examen), `Factor A.` = transform_numeric(`Factor A.`), nota_final = transform_numeric(nota_final), observaciones = ifelse(is.na(observaciones), 0, 1), orden = transform_numeric(orden), Ajuste = transform_numeric(Ajuste), promedio_final = transform_numeric(promedio_final), ) # add a column to indicate if the person obtained or not the residency dat4 <- dat4 %>% mutate(ingreso = ifelse(is.na(especialidad_subespecialidad_ingresantes), 0, 1))
Some universities had problems with the names.
dat <- read_complete_data() head(dat)
# detect low frequency universities detect_low_freq <- function(df, cols) { df <- df[cols] map(df, function(col) { freqs <- table(col) %>% sort() tibble(value = names(freqs), n = unname(freqs)) }) } # detect_low_freq(dat, c("Universidad_postulantes", "Universidad_ingresantes")) lookup_postulantes <- c( "AUSMP" = "USMP", "DITHURP" = "URP", "EMMAUNMSM" = "UNMSM", "ENASUPCH" = "UPCH", "GOUSMP" = "USMP", "HUNT" = "UNT", "HURP" = "URP", "IKUNSA" = "UNSA", "IUPCH" = "UPCH", "LESUNSA" = "UNSA", "LESUSMP" = "USMP", "LINDAUPAO" = "UPAO", "NUNCP" = "UNCP", "OSURP" = "URP", "OUNMSM" = "UNMSM", "OUNSA" = "UNSA", "RIOUPAO" = "UPAO", "ROSURP" = "URP", "RRESUNMSM" = "UNMSM", "RUPAO" = "UPAO", "SUNMSM" = "UNMSM", "SUPAO" = "UPAO", "SUPCH" = "UPCH", "AURP" = "URP", "EDESUNMSM" = "UNMSM", "HUPAO" = "UPAO", "IEUNSA" = "UNSA", "SURP" = "URP", "SUSMP" = "USMP", "SABINUANMSM" = "UNMSM", "ROSARUIORP" = "URP", "N FEURNMASNMDEZ" = "UNMSM" ) lookup_ingresantes <- c( "OUSMP" = "USMP", "SURP" = "URP" ) replace_lookups <- function(x, lookup) { ifelse(x %in% names(lookup), lookup[x], x) } dat$Universidad_postulantes <- replace_lookups(dat$Universidad_postulantes, lookup_postulantes) dat$Universidad_ingresantes <- replace_lookups(dat$Universidad_ingresantes, lookup_ingresantes) # detect_low_freq(dat, c("Universidad_postulantes", "Universidad_ingresantes"))
The processes below this line have not been finished
There are columns like V9, V11, V13 and modality that contain the same information, so this code checks and fix if the information is the same to avoid duplicates.
indx_modalidad_libre <- function( df, cols_modalidad = c("V9", "V11_postulantes", "V11_ingresantes", "V13", "Modalidad_postulantes", "Modalidad_ingresantes") ) { df_filt <- df[cols_modalidad] vapply(seq_len(nrow(df_filt)), function(i) { row_vect <- unlist(df_filt[i, ]) row_vect <- row_vect[!is.na(row_vect)] if (length(row_vect) > 0) { all(row_vect == "Libre") } else { FALSE } }, logical(1)) } # split data i <- indx_modalidad_libre(dat) dat_libre <- dat[i, ] dat_no_libre <- dat[!i, ]
universities <- read_csv("data/university_region/region_for_universities - spreadsheet.csv") head(universities)
Use that information to create a column with the location/region/departamento.
library(stringr) dat <- read_complete_data() specialties <- unique(dat$especialidad_subespecialidad_postulantes) types_specialties <- list( covid_related = c( "MEDICINA DE EMERGENCIAS Y DESASTRES", "MEDICINA DE ENFERMEDADES INFECCIOSA", "ADMINISTRACION Y GESTION EN SALUD", "ADMINISTRACION EN SALUD", "MEDICINA DE ENFERMEDADES INFECCIOSAS Y TROPIC", "ADMINISTRACION Y GESTION EN SALUD" ), surgical = str_subset(specialties, "(CIRUGIA|ORTOPEDIA Y TRAUMATOLOGIA)"), clinical = str_subset(specialties, "(CIRUGIA|ORTOPEDIA Y TRAUMATOLOGIA)", negate = TRUE) )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.