Demonstrates that georeferencing options vary terms of recall (how many event locations they recover) and accuracy (how far away their imputed locations tend to be from the true location)

rm(list=ls()); gc()
# !diagnostics off
library(MeasuringLandscape)
library(tidyverse)

dir_figures <- glue::glue(getwd(), "/../paper/figures/")

gc()

knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8,  warning=FALSE, message=FALSE, cache=TRUE)
options(width = 160)

Load Files

Load the events file

#Load Events
events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")) 
print(dim(events_sf))
events_sf_text_coord_unique <- plyr::ddply(events_sf[,c('location_text','name_clean','name_cleaner','document_district_clean','map_coordinate_clean_latitude','map_coordinate_clean_longitude')],
                                     "location_text", transform,
      map_coordinate_has =sum(!is.na(map_coordinate_clean_latitude))
      )

Load the suggestions file

#Reload from scratch each time in case we subset sometehing weirdly
# georef_all_dt_recommendations <- readRDS(system.file("extdata", "georef_all_dt_recomendations.Rds", package = "MeasuringLandscape")) 
georef_all_dt <- readRDS(system.file("extdata", "georef_all_dt_recomendations.Rds", package = "MeasuringLandscape")) 
glue::glue("Size of the Suggestions dataset ", nrow(georef_all_dt))

table(!is.na(events_sf$name_cleaner)) #8,638 events have at least some text
table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner) #8,242 events have a name in our suggestions list
table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner[!is.na(georef_all_dt$georef_b)]) #7,532 events with at least one gazetteer suggestion
georef_all_dt %>% ggplot(aes(x=distance_km)) + geom_histogram(bins = 100)

georef_all_dt %>% summary(distance_km)

table(georef_all_dt$distance_km==0) #These are identical matches, same event on the left hand and right hand sides

#Exclude all distance = 0 obs, those are self matches
georef_all_dt <- subset(georef_all_dt, 
                        !is.na(name_cleaner) & # must have a name
                        (is.na(distance_km) | distance_km!=0)  ) #Can be either missing or not zero. Only thing we drop is zero because that's a self match

table(georef_all_dt$distance_km==0) #These are identical matches, same event on the left hand and right hand sides

Calculate Recall Rate and Distance for Ensembles

This section calculates the recall rate and geographic accuracy of gazetteer suggestions. Recall is the number of events that receive at least one suggestion under a given set of rules. Accuracy is the distance between an observed location with a military coordinate and an imputed location for the same event.

Accuracy is measured differently for hand rule/ensemble and every other specific decision. Hand rule/ensemble accuracy is the distance observed for the best match selected by that process.

Accuracy for the remainder of decisions is the BEST CASE accuracy for any suggestion matching that rule. For example, if the decision is "point" geometry type, then accuracy would be the smallest observed distance of every single suggestion with a point type.

Hand Rules

print("Hand Rule Orderings")
table(georef_all_dt$handrule)

#Hand Rule
data.table::setkey(georef_all_dt, event_hash, handrule) #Sort by event and then hand rule
georef_all_dt_handrule <- georef_all_dt[,.SD[1], by=list(event_hash) ] #Take the smallest hand rule score, lower is better
georef_all_dt_handrule_summary <- georef_all_dt_handrule[,list(
                                               rule="Hand Rule",
                                              distance_rmse=sqrt(mean(distance_km^2, na.rm=T)), 
                                              distance_mse=mean(distance_km^2, na.rm=T),
                                              distance_me=mean(distance_km, na.rm=T),
                                              distance_median=median(distance_km, na.rm=T),
                                              source_dataset_count=.N
                                              )
                                        ]

print("Distance for Hand Rule")
georef_all_dt_handrule_summary

Ensemble Rule

summary(georef_all_dt$rule_ensemble)

georef_all_dt %>% ggplot(aes(x=rule_ensemble)) + geom_histogram(bins=100)

#Ensemble Rule
data.table::setkey(georef_all_dt, rule_ensemble)
georef_all_dt_ensemble <- georef_all_dt[,.SD[1], by=list(event_hash) ] #Take the smallest estimated distance, lower is better
georef_all_dt_ensemble_summary <- georef_all_dt_ensemble[,list(
                                               rule="Ensemble Rule",
                                              distance_rmse=sqrt(mean(distance_km^2, na.rm=T)) ,
                                              distance_mse=mean(distance_km^2, na.rm=T),
                                              distance_me=mean(distance_km, na.rm=T),
                                              distance_median=median(distance_km, na.rm=T),
                                              source_dataset_count=.N
                                              )
                                        ]

print("Distance for Ensemble")
georef_all_dt_ensemble_summary

rbind(georef_all_dt_handrule %>% mutate(label="hand"),
      georef_all_dt_ensemble %>% mutate(label="ensemble")) %>% 
      ggplot(aes(x=distance_km, fill=label)) + geom_histogram()

Calculate Recall Rate and Distance for Each Decision

Now calculate the minimum observed distance for any suggestion matching a specific decision.

#Minimum distance by event-source
georef_all_dt_bysource <- georef_all_dt[,
                                        list(distance_min=min(distance_km, na.rm=T) ),
                                        by=list(event_hash, source_dataset)] ; dim(georef_all_dt_bysource)
georef_all_dt_bysource[!is.finite(distance_min), distance_min:=NA]

#Minimum distance and recall by source
georef_all_dt_bysource_summary <- georef_all_dt_bysource[,
                                                    list(
                                                      distance_rmse=sqrt(mean(distance_min^2, na.rm=T)) ,
                                                      source_dataset_count=.N),
                                                      by=list(source_dataset)
                                                ]

georef_all_dt_byfuzzy <- georef_all_dt[
                                      ,list(distance_min=min(distance_km, na.rm=T) ),
                                      by=list(event_hash, fuzzy)] ;
georef_all_dt_byfuzzy[!is.finite(distance_min), distance_min:=NA]


georef_all_dt_byfuzzy_summary <- georef_all_dt_byfuzzy[,
                                                       list(distance_rmse=sqrt(mean(distance_min^2, na.rm=T)) ,
                                                      source_dataset_count=.N),
                                                      by=list(fuzzy) ]

georef_all_dt_byselfreference <- georef_all_dt[,list(distance_min=min(distance_km, na.rm=T) ),
                                               by=list(event_hash, SelfReference)] ; dim(georef_all_dt_bysource)

georef_all_dt_byselfreference[!is.finite(distance_min), distance_min:=NA]
georef_all_dt_byselfreference_summary <- georef_all_dt_byselfreference[,list(
                                                      distance_rmse=sqrt(mean(distance_min^2, na.rm=T)) ,
                                                      source_dataset_count=.N),
                                                      by=list(SelfReference)
                                                ]


georef_all_dt_bygeometry_type <- georef_all_dt[,list(distance_min=min(distance_km, na.rm=T) ),by=list(event_hash, geometry_type
                                                                                            )] ; dim(georef_all_dt_bysource)
georef_all_dt_bygeometry_type[!is.finite(distance_min), distance_min:=NA]
georef_all_dt_bygeometry_type <- georef_all_dt_bygeometry_type[,list(
                                                      distance_rmse=sqrt(mean(distance_min^2, na.rm=T)) ,
                                                      source_dataset_count=.N),
                                                      by=list(geometry_type)
                                                ]

georef_all_dt_milcoords_summary <- data.frame(rule="Mil. Coord",
                                              distance_rmse=0,
                                              source_dataset_count=sum(!is.na(events_sf$map_coordinate_clean_latitude))
)

Summarize

recall_accuracy <- data.table::rbindlist(list(
  #georef_all_dt_milcoords_summary %>% mutate(label="Mil. Coord") %>% mutate(Type="Rule"),
  georef_all_dt_handrule_summary %>% mutate(label="Hand Rule") %>% mutate(Type="Rule"),
  georef_all_dt_ensemble_summary %>% mutate(label="Ensemble Rule") %>% mutate(Type="Rule"),
  georef_all_dt_bysource_summary %>% mutate(label=source_dataset) %>% mutate(Type="Source Gazetteer"),
  georef_all_dt_byfuzzy_summary %>% mutate(label=ifelse(fuzzy, "Fuzzy","Exact")) %>% mutate(Type="Match Type"),
  georef_all_dt_byselfreference_summary %>% mutate(label=ifelse(SelfReference, "Match To Other Events","No Match To Other Events")) %>%
    mutate(Type="Allow Match To Other Events"),
  georef_all_dt_bygeometry_type %>% mutate(label=geometry_type) %>% mutate(Type="Geometry Type")
),fill=T) %>% mutate(recall_rate=source_dataset_count/10469)

Combine and plot the results for every specific decision and the ensembles.

(Figure 2, Comparison of recall and accuracy by georeferencing method)

sentence_case <- function(x) stringr::str_to_sentence(tolower(gsub("_"," ",x)))

#The first time you run this, you'll need to install these additional fonts
#install.packages("extrafont");
#library(extrafont)
#library(extrafont)
#extrafont::font_import(prompt=F )
#capabilities()
#windowsFonts()
#sort(as.vector(unlist(windowsFonts())))

fonts <- c('Times New Roman',
           'Calibri',
           'Courier New',
           "Georgia",
           "Tunga",
           "Lucida Fax")
           #'serif','Helvetica','Bookman','Palatino')

#library(ggplot2)



fontfaces <- factor(c("plain","bold","italic","bold.italic","plain","plain"))

colours = c("Self reference" = "#F8766D",
            "Geometry type" = "#A3A500",
            "Match type" = "#00BF7D",
            "Rule" = "#00B0F6",
            "Source dataset"="#E76BF3",
            "Original geo info"="#53B400") 

#p_load(ggrepel, tools)
p_RecallBias <- recall_accuracy %>% 
               filter(!label %in% "tgn") %>% #exclude TGN for being an outlier
               mutate(
                      label=sentence_case(label),
                      Type=sentence_case(Type)
                      ) %>%
       ggplot(
       aes(x=distance_rmse,
           y=source_dataset_count / 10469,
             color=Type,
             label=label,
             family = fonts[as.numeric(as.factor(Type))],
             fontface= fontfaces[as.numeric(as.factor(Type))]
             )
)  + 
  ggrepel::geom_text_repel(size=3) +
  theme_bw() +
  xlab(stringr::str_to_sentence("Root Mean Squared Distance from Observed Coordinate (km)")) +
  ylab(stringr::str_to_sentence("Recovery Rate")) +
  theme(
    legend.position = c(0.2, 0.3), # c(0,0) bottom left, c(1,1) top-right.
  )
p_RecallBias
 #+ coord_cartesian(y="log")

ggsave(
  filename = glue::glue(dir_figures, "RecallBias.pdf"),
  plot = p_RecallBias, width = 9, height = 6,
    device = cairo_pdf #have to use cairo to correctly embed the fonts

)

Interpretation

In terms of accuracy, we find wide variation between decisions. In brief: Ensemble Rule > Hand Rule Exact > Fuzzy Point > Multipolygon > Polygon > Linestring Match to Other Events > No Match to Other Events events > nga > livestock_points" > historical > geonames > bing> kenya_cadastral_district > livestock_boundaries > gadm > kenya_district1962 > wikidata > google > kenya_cadastral > openstreetmap > tgn

The hand rules and supervised ensemble represent a compromise between high recovery rate and best case accuracy of each individual decision.



rexdouglass/MeasuringLandscape documentation built on May 13, 2019, 6:16 p.m.