Demonstrates that georeferencing options vary in how systematically they include some kinds of events while excluding other kinds. Asks whether or not missingness of geographic information is truly random, or if certain kinds of events or places are being dropped.
rm(list=ls()); gc() # !diagnostics off library(MeasuringLandscape) library(tidyverse) dir_figures <- glue::glue(getwd(), "/../paper/figures/") gc() knitr::opts_knit$set(progress = TRUE, verbose = TRUE) knitr::opts_chunk$set(fig.width=12, fig.height=8, warning=FALSE, message=FALSE, cache=TRUE) options(width = 160)
#Load Events events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")) events_sf_text_coord_unique <- plyr::ddply(events_sf[,c('location_text','name_clean','name_cleaner','document_district_clean','map_coordinate_clean_latitude','map_coordinate_clean_longitude')], "location_text", transform, map_coordinate_has =sum(!is.na(map_coordinate_clean_latitude)) ) #Reload from scratch each time in case we subset sometehing weirdly georef_all_dt <- readRDS(system.file("extdata", "georef_all_dt_recomendations.Rds", package = "MeasuringLandscape")) table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner) #All events are in here table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner[!is.na(georef_all_dt$georef_b)]) #7,742 events with at least one gazetteer suggestion #Exclude all distance = 0 obs, those are self matches georef_all_dt <- subset(georef_all_dt, !is.na(name_cleaner) & # must have a name (is.na(distance_km) | distance_km!=0) ) #Can be either missing or not zero. Only thing we drop is zero because that's a self match
Proof of concept, show it works on original missingness
glue::glue("Military coordinates only") y <- !is.na(events_sf$map_coordinate_clean_latitude); table(y) pred_cords <- MeasuringLandscape:::predict_missingness_dv(y) auc_cords_dataset <- Metrics::auc(pred_cords$label, pred_cords$xb) recall_cords_dataset <- sum(pred_cords$label) glue::glue("Text only") y <- !is.na(events_sf$name_cleaner); table(y) pred_text <- MeasuringLandscape:::predict_missingness_dv(y) auc_text_dataset <- Metrics::auc(pred_text$label, pred_text$xb) recall_text_dataset <- sum(pred_text$label) glue::glue("Military or Text") y <- !is.na(events_sf$name_cleaner) | #Name isn't missing !is.na(events_sf$map_coordinate_clean_latitude); #or it has coordinates table(y) pred_cordstext <- MeasuringLandscape:::predict_missingness_dv(y) auc_cordstext_dataset <- Metrics::auc(pred_cordstext$label, pred_cordstext$xb) recall_cordstext_dataset <- sum(pred_cordstext$label)
#Hand Rule glue::glue("Hand Rule") data.table::setkey(georef_all_dt, handrule) georef_all_dt_handrule <- georef_all_dt[,.SD[1], by=list(event_hash) ] y <- events_sf$event_hash %in% georef_all_dt_handrule$event_hash #Which events received a imputed location under this rule table(y) pred_source_handrule <- MeasuringLandscape:::predict_missingness_dv(y) auc_handrule_dataset <- Metrics::auc(pred_source_handrule$label, pred_source_handrule$xb) recall_handrule_dataset <- sum(pred_source_handrule$label) #Ensemble Rule glue::glue("Ensemble Rule") data.table::setkey(georef_all_dt, rule_ensemble) georef_all_dt_ensemble <- georef_all_dt[,.SD[1], by=list(event_hash) ] y <- events_sf$event_hash %in% georef_all_dt_ensemble$event_hash #Which events received a imputed location under this rule table(y) pred_source_ensemble <- MeasuringLandscape:::predict_missingness_dv(y) auc_ensemble_dataset <- Metrics::auc(pred_source_ensemble$label, pred_source_ensemble$xb) recall_ensemble_dataset <- sum(pred_source_ensemble$label)
#fuzzy glue::glue("fuzzy") georef_all_dt_byfuzzy <- georef_all_dt[, list(distance_km_min=min(distance_km, na.rm=T) ), by=list(event_hash, fuzzy) ] summary(georef_all_dt_byfuzzy) pred_fuzzy_list <- list() q_all <- na.omit(unique(georef_all_dt_byfuzzy$fuzzy)) ; table(q_all) for( q in q_all ){ print(q) y <- events_sf$event_hash %in% georef_all_dt_byfuzzy[fuzzy==q #The right kind of fuzzy ]$event_hash pred_fuzzy_list[[as.character(q)]] <- MeasuringLandscape:::predict_missingness_dv(y) } auc_fuzzy <- sapply(pred_fuzzy_list, FUN=function(q) Metrics::auc(q$label, q$xb)) recall_fuzzy <- sapply(pred_fuzzy_list, FUN=function(q) sum(q$label) )
#Source dataset glue::glue("Source dataset") georef_all_dt_bysource <- georef_all_dt[,list(distance_km_min=min(distance_km, na.rm=T) ),by=list(event_hash, source_dataset)] ; dim(georef_all_dt_bysource) georef_all_dt_bysource[!is.finite(distance_km_min), distance_km_min:=NA] pred_source_dataset_list <- list() for( q in na.omit(unique(georef_all_dt_bysource$source_dataset)) ){ print(q) y <- events_sf$event_hash %in% georef_all_dt_bysource[source_dataset==q ]$event_hash pred_source_dataset_list[[as.character(q)]] <- MeasuringLandscape:::predict_missingness_dv(y) } auc_source_dataset <- sapply(pred_source_dataset_list, FUN=function(q) Metrics::auc(q$label, q$xb)) recall_source_dataset <- sapply(pred_source_dataset_list, FUN=function(q) sum(q$label) )
#geometry_type glue::glue("geometry_type") georef_all_dt_bygeometry_type <- georef_all_dt[,list(distance_km_min=min(distance_km, na.rm=T) ),by=list(event_hash, geometry_type)] georef_all_dt_bygeometry_type[!is.finite(distance_km_min), distance_km_min:=NA] pred_geometry_type_list <- list() for( q in na.omit(unique(georef_all_dt_bygeometry_type$geometry_type)) ){ print(q) y <- events_sf$event_hash %in% georef_all_dt_bygeometry_type[geometry_type==q ]$event_hash pred_geometry_type_list[[as.character(q)]] <- MeasuringLandscape:::predict_missingness_dv(y) } auc_geometry_type <- sapply(pred_geometry_type_list, FUN=function(q) Metrics::auc(q$label, q$xb)) recall_geometry_type <- sapply(pred_geometry_type_list, FUN=function(q) sum(q$label) )
#Self Reference glue::glue("Self Reference") georef_all_dt_byselfreference <- georef_all_dt[,list(distance_km_min=min(distance_km, na.rm=T) ),by=list(event_hash, SelfReference )] ; dim(georef_all_dt_bysource) georef_all_dt_byselfreference[!is.finite(distance_km_min), distance_km_min:=NA] pred_selfreference_list <- list() for( q in na.omit(unique(georef_all_dt_byselfreference$SelfReference)) ){ print(q) y <- events_sf$event_hash %in% georef_all_dt_byselfreference[SelfReference==q]$event_hash pred_selfreference_list[[as.character(q)]] <- MeasuringLandscape:::predict_missingness_dv(y) } auc_selfreference <- sapply(pred_selfreference_list, FUN=function(q) Metrics::auc(q$label, q$xb)) recall_selfreference <- sapply(pred_selfreference_list, FUN=function(q) sum(q$label) )
bias_dv_df <- data.table::rbindlist(list( cbind(auc=auc_cords_dataset, recall=recall_cords_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>% mutate(label="Mil Coords") %>% mutate(Type="Original Geo Info"), cbind(auc=auc_text_dataset, recall=recall_text_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>% mutate(label="Text Location") %>% mutate(Type="Original Geo Info"), cbind(auc=auc_cordstext_dataset, recall=recall_cordstext_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>% mutate(label="Mil Coords or Text Location") %>% mutate(Type="Original Geo Info"), cbind(auc=auc_cordstext_dataset, recall=recall_cordstext_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>% mutate(label="Hand Rule") %>% mutate(Type="Rule"), cbind(auc=auc_cordstext_dataset, recall=recall_cordstext_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>% mutate(label="Ensemble Rule") %>% mutate(Type="Rule"), cbind(auc=auc_selfreference, recall=recall_selfreference) %>% data.frame() %>% tibble::rownames_to_column("label") %>% mutate(label=ifelse(label, "Match to Other Events","No Match to Other Events")) %>% mutate(Type="Allow Match To Other Events"), cbind(auc=auc_fuzzy, recall=recall_fuzzy) %>% data.frame() %>% tibble::rownames_to_column("label") %>% mutate(label=ifelse(label, "Fuzzy","Exact")) %>% mutate(Type="Match Type"), cbind(auc=auc_source_dataset, recall=recall_source_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>% mutate(Type="Source Dataset"), cbind(auc=auc_geometry_type, recall=recall_geometry_type) %>% data.frame() %>% tibble::rownames_to_column("label") %>% mutate(Type="Geometry Type") )) saveRDS(bias_dv_df, file=glue::glue(getwd(), "/../inst/extdata/bias_dv_df.Rds") )
Plot
(Appendix Figure 9)
#bias_dv_df <- readRDS(system.file("extdata", "bias_dv_df.Rds", package = "MeasuringLandscape")) sentence_case <- function(x) stringr::str_to_sentence(tolower(gsub("_"," ",x))) #install.packages("extrafont"); #library(extrafont) #library(extrafont) #extrafont::font_import(prompt=F ) #capabilities() #windowsFonts() #sort(as.vector(unlist(windowsFonts()))) fonts <- c('Times New Roman', 'Calibri', 'Courier New', "Georgia", "Tunga", "Lucida Fax") #'serif','Helvetica','Bookman','Palatino') library(ggplot2) bias_dv_df[Type=="Allow Match To Other Events" & label=="No Match to Other Events", label:= "Self Ref.",] bias_dv_df[Type=="Allow Match To Other Events" & label=="Match to Other Events", label:= "No Self Ref.",] bias_dv_df[Type=="Allow Match To Other Events" , Type:= "Self Reference",] fontfaces <- factor(c("plain","bold","italic","bold.italic","plain","plain")) colours = c("Self Reference" = "#F8766D", "Geometry Type" = "#A3A500", "Match Type" = "#00BF7D", "Rule" = "#00B0F6", "Source Dataset"="#E76BF3", "Original Geo Info"="#53B400") #p_load(ggrepel, tools) p_bias_dv <- bias_dv_df %>% #filter(!(label %in% c('kenya_district1962','kenya_cadastral','kenya_cadastral_district',"LINESTRING"))) %>% #filter(term != "(Intercept)") %>% #mutate(label[Type=="Match Type"]=gsub("FALSE", "Exact", label[Type=="Match Type"])) %>% #mutate(label[Type=="Match Type"]=gsub("True", "Fuzzy", label[Type=="Match Type"])) %>% mutate(Type=sentence_case(Type), label=sentence_case(label) ) %>% ggplot(aes(x=auc, y=round(recall/nrow(events_sf),2), color=Type, label=label, family = fonts[as.numeric(as.factor(Type))], fontface= fontfaces[as.numeric(as.factor(Type))] )) + ggrepel::geom_text_repel(size=3) + theme_bw() + xlab(sentence_case("Predictability of Missingness of Imputed Locations, Area Under the Curve")) + ylab(sentence_case("Recovery Rate")) + theme( legend.position = c(0.9, 0.3), # c(0,0) bottom left, c(1,1) top-right. ) p_bias_dv #+ coord_cartesian(y="log") ggsave( filename = glue::glue(dir_figures, "p_bias_dv.pdf"), plot = p_bias_dv, width = 9, height = 6, device = cairo_pdf #have to use cairo to correctly embed the fonts )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.