Demonstrates that georeferencing options vary in how systematically they include some kinds of events while excluding other kinds. Asks whether or not missingness of geographic information is truly random, or if certain kinds of events or places are being dropped.

rm(list=ls()); gc()
# !diagnostics off
library(MeasuringLandscape)
library(tidyverse)

dir_figures <- glue::glue(getwd(), "/../paper/figures/")

gc()

knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8,  warning=FALSE, message=FALSE, cache=TRUE)
options(width = 160)
#Load Events
events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")) 

events_sf_text_coord_unique <- plyr::ddply(events_sf[,c('location_text','name_clean','name_cleaner','document_district_clean','map_coordinate_clean_latitude','map_coordinate_clean_longitude')],
                                     "location_text", transform,
      map_coordinate_has =sum(!is.na(map_coordinate_clean_latitude))
      )


#Reload from scratch each time in case we subset sometehing weirdly
georef_all_dt <- readRDS(system.file("extdata", "georef_all_dt_recomendations.Rds", package = "MeasuringLandscape")) 

table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner) #All events are in here
table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner[!is.na(georef_all_dt$georef_b)]) #7,742 events with at least one gazetteer suggestion

#Exclude all distance = 0 obs, those are self matches
georef_all_dt <- subset(georef_all_dt, 
                        !is.na(name_cleaner) & # must have a name
                        (is.na(distance_km) | distance_km!=0)  ) #Can be either missing or not zero. Only thing we drop is zero because that's a self match

Predict missing coordinates

Proof of concept, show it works on original missingness

glue::glue("Military coordinates only")
y <- !is.na(events_sf$map_coordinate_clean_latitude); table(y)
pred_cords <- MeasuringLandscape:::predict_missingness_dv(y)
auc_cords_dataset <- Metrics::auc(pred_cords$label, pred_cords$xb)
recall_cords_dataset <- sum(pred_cords$label)


glue::glue("Text only")
y <- !is.na(events_sf$name_cleaner); table(y)
pred_text <- MeasuringLandscape:::predict_missingness_dv(y)
auc_text_dataset <- Metrics::auc(pred_text$label, pred_text$xb)
recall_text_dataset <- sum(pred_text$label)


glue::glue("Military or Text")
y <- !is.na(events_sf$name_cleaner) | #Name isn't missing
     !is.na(events_sf$map_coordinate_clean_latitude); #or it has coordinates
table(y)
pred_cordstext <- MeasuringLandscape:::predict_missingness_dv(y)
auc_cordstext_dataset <- Metrics::auc(pred_cordstext$label, pred_cordstext$xb)
recall_cordstext_dataset <- sum(pred_cordstext$label)
#Hand Rule
glue::glue("Hand Rule")
data.table::setkey(georef_all_dt, handrule)
georef_all_dt_handrule <- georef_all_dt[,.SD[1], by=list(event_hash) ]
y <- events_sf$event_hash %in% georef_all_dt_handrule$event_hash #Which events received a imputed location under this rule
table(y)
pred_source_handrule <- MeasuringLandscape:::predict_missingness_dv(y)
auc_handrule_dataset <- Metrics::auc(pred_source_handrule$label, pred_source_handrule$xb)
recall_handrule_dataset <- sum(pred_source_handrule$label)

#Ensemble Rule
glue::glue("Ensemble Rule")
data.table::setkey(georef_all_dt, rule_ensemble)
georef_all_dt_ensemble <- georef_all_dt[,.SD[1], by=list(event_hash) ]
y <- events_sf$event_hash %in% georef_all_dt_ensemble$event_hash #Which events received a imputed location under this rule
table(y)
pred_source_ensemble <- MeasuringLandscape:::predict_missingness_dv(y)
auc_ensemble_dataset <- Metrics::auc(pred_source_ensemble$label, pred_source_ensemble$xb)
recall_ensemble_dataset <- sum(pred_source_ensemble$label)
#fuzzy
glue::glue("fuzzy")
georef_all_dt_byfuzzy <- georef_all_dt[,
                                       list(distance_km_min=min(distance_km, na.rm=T) ),
                                       by=list(event_hash, fuzzy)
                                       ]
summary(georef_all_dt_byfuzzy)
pred_fuzzy_list <- list()
q_all <- na.omit(unique(georef_all_dt_byfuzzy$fuzzy)) ; table(q_all)
for( q in q_all ){
  print(q)
  y <- events_sf$event_hash %in% 
      georef_all_dt_byfuzzy[fuzzy==q  #The right kind of fuzzy
                            ]$event_hash
  pred_fuzzy_list[[as.character(q)]] <- MeasuringLandscape:::predict_missingness_dv(y)
}
auc_fuzzy <- sapply(pred_fuzzy_list, FUN=function(q) Metrics::auc(q$label, q$xb))
recall_fuzzy <- sapply(pred_fuzzy_list, FUN=function(q) sum(q$label) )
#Source dataset
glue::glue("Source dataset")
georef_all_dt_bysource <- georef_all_dt[,list(distance_km_min=min(distance_km, na.rm=T) ),by=list(event_hash, source_dataset)] ; dim(georef_all_dt_bysource)
georef_all_dt_bysource[!is.finite(distance_km_min), distance_km_min:=NA]
pred_source_dataset_list <- list()
for( q in na.omit(unique(georef_all_dt_bysource$source_dataset)) ){
  print(q)
  y <- events_sf$event_hash %in% georef_all_dt_bysource[source_dataset==q ]$event_hash
  pred_source_dataset_list[[as.character(q)]] <- MeasuringLandscape:::predict_missingness_dv(y)
}
auc_source_dataset <- sapply(pred_source_dataset_list, FUN=function(q) Metrics::auc(q$label, q$xb))
recall_source_dataset <- sapply(pred_source_dataset_list, FUN=function(q) sum(q$label) )
#geometry_type
glue::glue("geometry_type")
georef_all_dt_bygeometry_type <- georef_all_dt[,list(distance_km_min=min(distance_km, na.rm=T) ),by=list(event_hash, geometry_type)] 
georef_all_dt_bygeometry_type[!is.finite(distance_km_min), distance_km_min:=NA]
pred_geometry_type_list <- list()
for( q in na.omit(unique(georef_all_dt_bygeometry_type$geometry_type)) ){
  print(q)
  y <- events_sf$event_hash %in% georef_all_dt_bygeometry_type[geometry_type==q ]$event_hash
  pred_geometry_type_list[[as.character(q)]] <- MeasuringLandscape:::predict_missingness_dv(y)
}
auc_geometry_type <- sapply(pred_geometry_type_list, FUN=function(q) Metrics::auc(q$label, q$xb))
recall_geometry_type <- sapply(pred_geometry_type_list, FUN=function(q) sum(q$label) )
#Self Reference
glue::glue("Self Reference")
georef_all_dt_byselfreference <- georef_all_dt[,list(distance_km_min=min(distance_km, na.rm=T) ),by=list(event_hash, SelfReference )] ; dim(georef_all_dt_bysource)
georef_all_dt_byselfreference[!is.finite(distance_km_min), distance_km_min:=NA]
pred_selfreference_list <- list()
for( q in na.omit(unique(georef_all_dt_byselfreference$SelfReference)) ){
  print(q)
  y <- events_sf$event_hash %in% georef_all_dt_byselfreference[SelfReference==q]$event_hash
  pred_selfreference_list[[as.character(q)]] <- MeasuringLandscape:::predict_missingness_dv(y)
}
auc_selfreference <- sapply(pred_selfreference_list, FUN=function(q) Metrics::auc(q$label, q$xb))
recall_selfreference <- sapply(pred_selfreference_list, FUN=function(q) sum(q$label) )
bias_dv_df <- data.table::rbindlist(list(
    cbind(auc=auc_cords_dataset, recall=recall_cords_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>% 
      mutate(label="Mil Coords") %>% mutate(Type="Original Geo Info"),

    cbind(auc=auc_text_dataset, recall=recall_text_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>% 
      mutate(label="Text Location") %>% mutate(Type="Original Geo Info"),
    cbind(auc=auc_cordstext_dataset, recall=recall_cordstext_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>% 
      mutate(label="Mil Coords or Text Location") %>% mutate(Type="Original Geo Info"),

    cbind(auc=auc_cordstext_dataset, recall=recall_cordstext_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>% 
      mutate(label="Hand Rule") %>% mutate(Type="Rule"),
    cbind(auc=auc_cordstext_dataset, recall=recall_cordstext_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>% 
      mutate(label="Ensemble Rule") %>% mutate(Type="Rule"),

    cbind(auc=auc_selfreference, recall=recall_selfreference) %>% data.frame() %>% 
      tibble::rownames_to_column("label") %>% mutate(label=ifelse(label, "Match to Other Events","No Match to Other Events")) %>% 
                        mutate(Type="Allow Match To Other Events"),

    cbind(auc=auc_fuzzy, recall=recall_fuzzy) %>% data.frame() %>% tibble::rownames_to_column("label") 
    %>% mutate(label=ifelse(label, "Fuzzy","Exact")) %>% mutate(Type="Match Type"),

    cbind(auc=auc_source_dataset, recall=recall_source_dataset) %>% data.frame() %>% 
      tibble::rownames_to_column("label") %>% mutate(Type="Source Dataset"),

    cbind(auc=auc_geometry_type, recall=recall_geometry_type) %>% data.frame() %>% 
      tibble::rownames_to_column("label") %>% mutate(Type="Geometry Type")
))

saveRDS(bias_dv_df,
      file=glue::glue(getwd(), "/../inst/extdata/bias_dv_df.Rds")
)

Plot

(Appendix Figure 9)

#bias_dv_df <- readRDS(system.file("extdata", "bias_dv_df.Rds", package = "MeasuringLandscape"))

sentence_case <- function(x) stringr::str_to_sentence(tolower(gsub("_"," ",x)))

#install.packages("extrafont");
#library(extrafont)
#library(extrafont)
#extrafont::font_import(prompt=F )
#capabilities()
#windowsFonts()
#sort(as.vector(unlist(windowsFonts())))

fonts <- c('Times New Roman',
           'Calibri',
           'Courier New',
           "Georgia",
           "Tunga",
           "Lucida Fax")
           #'serif','Helvetica','Bookman','Palatino')

library(ggplot2)


bias_dv_df[Type=="Allow Match To Other Events" & label=="No Match to Other Events", label:= "Self Ref.",]
bias_dv_df[Type=="Allow Match To Other Events" & label=="Match to Other Events", label:= "No Self Ref.",]
bias_dv_df[Type=="Allow Match To Other Events" , Type:= "Self Reference",]


fontfaces <- factor(c("plain","bold","italic","bold.italic","plain","plain"))

colours = c("Self Reference" = "#F8766D",
            "Geometry Type" = "#A3A500",
            "Match Type" = "#00BF7D",
            "Rule" = "#00B0F6",
            "Source Dataset"="#E76BF3",
            "Original Geo Info"="#53B400") 

#p_load(ggrepel, tools)
p_bias_dv <- bias_dv_df %>% 
        #filter(!(label %in% c('kenya_district1962','kenya_cadastral','kenya_cadastral_district',"LINESTRING"))) %>%
        #filter(term != "(Intercept)") %>%
        #mutate(label[Type=="Match Type"]=gsub("FALSE", "Exact", label[Type=="Match Type"])) %>%
        #mutate(label[Type=="Match Type"]=gsub("True", "Fuzzy", label[Type=="Match Type"])) %>%
        mutate(Type=sentence_case(Type),
               label=sentence_case(label)
               ) %>%
  ggplot(aes(x=auc,
             y=round(recall/nrow(events_sf),2),
             color=Type,
             label=label,
             family = fonts[as.numeric(as.factor(Type))],
             fontface= fontfaces[as.numeric(as.factor(Type))]
             ))  + 
  ggrepel::geom_text_repel(size=3) +
  theme_bw() +
  xlab(sentence_case("Predictability of Missingness of Imputed Locations, Area Under the Curve")) +
  ylab(sentence_case("Recovery Rate")) +
  theme(
    legend.position = c(0.9, 0.3), # c(0,0) bottom left, c(1,1) top-right.
  )
p_bias_dv
 #+ coord_cartesian(y="log")


ggsave(
  filename = glue::glue(dir_figures, "p_bias_dv.pdf"),
  plot = p_bias_dv, width = 9, height = 6,
  device = cairo_pdf #have to use cairo to correctly embed the fonts

) 


rexdouglass/MeasuringLandscape documentation built on May 13, 2019, 6:16 p.m.