Demonstrates that georeferencing options vary terms of recall (how many event locations they recover) and accuracy (how far away their imputed locations tend to be from the true location)
rm(list=ls()); gc() # !diagnostics off library(MeasuringLandscape) library(tidyverse) dir_figures <- glue::glue(getwd(), "/../paper/figures/") gc() knitr::opts_knit$set(progress = TRUE, verbose = TRUE) knitr::opts_chunk$set(fig.width=12, fig.height=8, warning=FALSE, message=FALSE, cache=TRUE) options(width = 160)
Load the events file
#Load Events events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")) print(dim(events_sf)) events_sf_text_coord_unique <- plyr::ddply(events_sf[,c('location_text','name_clean','name_cleaner','document_district_clean','map_coordinate_clean_latitude','map_coordinate_clean_longitude')], "location_text", transform, map_coordinate_has =sum(!is.na(map_coordinate_clean_latitude)) )
Load the suggestions file
#Reload from scratch each time in case we subset sometehing weirdly # georef_all_dt_recommendations <- readRDS(system.file("extdata", "georef_all_dt_recomendations.Rds", package = "MeasuringLandscape")) georef_all_dt <- readRDS(system.file("extdata", "georef_all_dt_recomendations.Rds", package = "MeasuringLandscape")) glue::glue("Size of the Suggestions dataset ", nrow(georef_all_dt)) table(!is.na(events_sf$name_cleaner)) #8,638 events have at least some text table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner) #8,242 events have a name in our suggestions list table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner[!is.na(georef_all_dt$georef_b)]) #7,532 events with at least one gazetteer suggestion
georef_all_dt %>% ggplot(aes(x=distance_km)) + geom_histogram(bins = 100) georef_all_dt %>% summary(distance_km) table(georef_all_dt$distance_km==0) #These are identical matches, same event on the left hand and right hand sides #Exclude all distance = 0 obs, those are self matches georef_all_dt <- subset(georef_all_dt, !is.na(name_cleaner) & # must have a name (is.na(distance_km) | distance_km!=0) ) #Can be either missing or not zero. Only thing we drop is zero because that's a self match table(georef_all_dt$distance_km==0) #These are identical matches, same event on the left hand and right hand sides
This section calculates the recall rate and geographic accuracy of gazetteer suggestions. Recall is the number of events that receive at least one suggestion under a given set of rules. Accuracy is the distance between an observed location with a military coordinate and an imputed location for the same event.
Accuracy is measured differently for hand rule/ensemble and every other specific decision. Hand rule/ensemble accuracy is the distance observed for the best match selected by that process.
Accuracy for the remainder of decisions is the BEST CASE accuracy for any suggestion matching that rule. For example, if the decision is "point" geometry type, then accuracy would be the smallest observed distance of every single suggestion with a point type.
Hand Rules
print("Hand Rule Orderings") table(georef_all_dt$handrule) #Hand Rule data.table::setkey(georef_all_dt, event_hash, handrule) #Sort by event and then hand rule georef_all_dt_handrule <- georef_all_dt[,.SD[1], by=list(event_hash) ] #Take the smallest hand rule score, lower is better georef_all_dt_handrule_summary <- georef_all_dt_handrule[,list( rule="Hand Rule", distance_rmse=sqrt(mean(distance_km^2, na.rm=T)), distance_mse=mean(distance_km^2, na.rm=T), distance_me=mean(distance_km, na.rm=T), distance_median=median(distance_km, na.rm=T), source_dataset_count=.N ) ] print("Distance for Hand Rule") georef_all_dt_handrule_summary
Ensemble Rule
summary(georef_all_dt$rule_ensemble) georef_all_dt %>% ggplot(aes(x=rule_ensemble)) + geom_histogram(bins=100) #Ensemble Rule data.table::setkey(georef_all_dt, rule_ensemble) georef_all_dt_ensemble <- georef_all_dt[,.SD[1], by=list(event_hash) ] #Take the smallest estimated distance, lower is better georef_all_dt_ensemble_summary <- georef_all_dt_ensemble[,list( rule="Ensemble Rule", distance_rmse=sqrt(mean(distance_km^2, na.rm=T)) , distance_mse=mean(distance_km^2, na.rm=T), distance_me=mean(distance_km, na.rm=T), distance_median=median(distance_km, na.rm=T), source_dataset_count=.N ) ] print("Distance for Ensemble") georef_all_dt_ensemble_summary rbind(georef_all_dt_handrule %>% mutate(label="hand"), georef_all_dt_ensemble %>% mutate(label="ensemble")) %>% ggplot(aes(x=distance_km, fill=label)) + geom_histogram()
Now calculate the minimum observed distance for any suggestion matching a specific decision.
#Minimum distance by event-source georef_all_dt_bysource <- georef_all_dt[, list(distance_min=min(distance_km, na.rm=T) ), by=list(event_hash, source_dataset)] ; dim(georef_all_dt_bysource) georef_all_dt_bysource[!is.finite(distance_min), distance_min:=NA] #Minimum distance and recall by source georef_all_dt_bysource_summary <- georef_all_dt_bysource[, list( distance_rmse=sqrt(mean(distance_min^2, na.rm=T)) , source_dataset_count=.N), by=list(source_dataset) ] georef_all_dt_byfuzzy <- georef_all_dt[ ,list(distance_min=min(distance_km, na.rm=T) ), by=list(event_hash, fuzzy)] ; georef_all_dt_byfuzzy[!is.finite(distance_min), distance_min:=NA] georef_all_dt_byfuzzy_summary <- georef_all_dt_byfuzzy[, list(distance_rmse=sqrt(mean(distance_min^2, na.rm=T)) , source_dataset_count=.N), by=list(fuzzy) ] georef_all_dt_byselfreference <- georef_all_dt[,list(distance_min=min(distance_km, na.rm=T) ), by=list(event_hash, SelfReference)] ; dim(georef_all_dt_bysource) georef_all_dt_byselfreference[!is.finite(distance_min), distance_min:=NA] georef_all_dt_byselfreference_summary <- georef_all_dt_byselfreference[,list( distance_rmse=sqrt(mean(distance_min^2, na.rm=T)) , source_dataset_count=.N), by=list(SelfReference) ] georef_all_dt_bygeometry_type <- georef_all_dt[,list(distance_min=min(distance_km, na.rm=T) ),by=list(event_hash, geometry_type )] ; dim(georef_all_dt_bysource) georef_all_dt_bygeometry_type[!is.finite(distance_min), distance_min:=NA] georef_all_dt_bygeometry_type <- georef_all_dt_bygeometry_type[,list( distance_rmse=sqrt(mean(distance_min^2, na.rm=T)) , source_dataset_count=.N), by=list(geometry_type) ] georef_all_dt_milcoords_summary <- data.frame(rule="Mil. Coord", distance_rmse=0, source_dataset_count=sum(!is.na(events_sf$map_coordinate_clean_latitude)) )
recall_accuracy <- data.table::rbindlist(list( #georef_all_dt_milcoords_summary %>% mutate(label="Mil. Coord") %>% mutate(Type="Rule"), georef_all_dt_handrule_summary %>% mutate(label="Hand Rule") %>% mutate(Type="Rule"), georef_all_dt_ensemble_summary %>% mutate(label="Ensemble Rule") %>% mutate(Type="Rule"), georef_all_dt_bysource_summary %>% mutate(label=source_dataset) %>% mutate(Type="Source Gazetteer"), georef_all_dt_byfuzzy_summary %>% mutate(label=ifelse(fuzzy, "Fuzzy","Exact")) %>% mutate(Type="Match Type"), georef_all_dt_byselfreference_summary %>% mutate(label=ifelse(SelfReference, "Match To Other Events","No Match To Other Events")) %>% mutate(Type="Allow Match To Other Events"), georef_all_dt_bygeometry_type %>% mutate(label=geometry_type) %>% mutate(Type="Geometry Type") ),fill=T) %>% mutate(recall_rate=source_dataset_count/10469)
Combine and plot the results for every specific decision and the ensembles.
(Figure 2, Comparison of recall and accuracy by georeferencing method)
sentence_case <- function(x) stringr::str_to_sentence(tolower(gsub("_"," ",x))) #The first time you run this, you'll need to install these additional fonts #install.packages("extrafont"); #library(extrafont) #library(extrafont) #extrafont::font_import(prompt=F ) #capabilities() #windowsFonts() #sort(as.vector(unlist(windowsFonts()))) fonts <- c('Times New Roman', 'Calibri', 'Courier New', "Georgia", "Tunga", "Lucida Fax") #'serif','Helvetica','Bookman','Palatino') #library(ggplot2) fontfaces <- factor(c("plain","bold","italic","bold.italic","plain","plain")) colours = c("Self reference" = "#F8766D", "Geometry type" = "#A3A500", "Match type" = "#00BF7D", "Rule" = "#00B0F6", "Source dataset"="#E76BF3", "Original geo info"="#53B400") #p_load(ggrepel, tools) p_RecallBias <- recall_accuracy %>% filter(!label %in% "tgn") %>% #exclude TGN for being an outlier mutate( label=sentence_case(label), Type=sentence_case(Type) ) %>% ggplot( aes(x=distance_rmse, y=source_dataset_count / 10469, color=Type, label=label, family = fonts[as.numeric(as.factor(Type))], fontface= fontfaces[as.numeric(as.factor(Type))] ) ) + ggrepel::geom_text_repel(size=3) + theme_bw() + xlab(stringr::str_to_sentence("Root Mean Squared Distance from Observed Coordinate (km)")) + ylab(stringr::str_to_sentence("Recovery Rate")) + theme( legend.position = c(0.2, 0.3), # c(0,0) bottom left, c(1,1) top-right. ) p_RecallBias #+ coord_cartesian(y="log") ggsave( filename = glue::glue(dir_figures, "RecallBias.pdf"), plot = p_RecallBias, width = 9, height = 6, device = cairo_pdf #have to use cairo to correctly embed the fonts )
In terms of accuracy, we find wide variation between decisions. In brief: Ensemble Rule > Hand Rule Exact > Fuzzy Point > Multipolygon > Polygon > Linestring Match to Other Events > No Match to Other Events events > nga > livestock_points" > historical > geonames > bing> kenya_cadastral_district > livestock_boundaries > gadm > kenya_district1962 > wikidata > google > kenya_cadastral > openstreetmap > tgn
The hand rules and supervised ensemble represent a compromise between high recovery rate and best case accuracy of each individual decision.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.