Given a location string, and multiple possible matches to real-world places, this provides some logic for how to rank some matches as better than others.

First, using simple hand rules of what kind of match to prefer over others.

Second, with a supervised model that attempts to predict which match will be geographically closest to the true location (fewest kilometers away from the right answer).

rm(list=ls()); gc()
# Hiding output and warnings
# !diagnostics off
library(MeasuringLandscape)
library(tidyverse)

dir_figures <- glue::glue(getwd(), "/../paper/figures/")

gc()

knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8,  warning=FALSE, message=FALSE, cache=TRUE)
options(width = 160)
georef_all_dt <- readRDS(paste0(here::here(), "/inst/extdata/georef_all_dt.Rds"))

Hand Rules

#Calculate some features
dim(georef_all_dt)
georef_all_dt[!is.finite(distance_km),distance_km:=NA] 
georef_all_dt[,SelfReference:=source_dataset=="events"]
georef_all_dt[,fuzzy:=name_cleaner!=georef_b]
#georef_all_dt <- subset(georef_all_dt,  distance_km=!0) #This excludes all self references , we need that here but not necessarily elsewhere


georef_all_dt <- georef_all_dt %>% mutate(handrule1 = dplyr::recode(as.character(fuzzy), "TRUE" = 2, "FALSE" = 1)) 
georef_all_dt <- georef_all_dt %>% mutate(handrule2 = dplyr::recode(as.character(SelfReference), "TRUE" = 1, "FALSE" = 2)) 
table(georef_all_dt$geometry_type)
georef_all_dt <- georef_all_dt %>% mutate(handrule3 = dplyr::recode(as.character(geometry_type), "POINT" = 1, "MULTIPOLYGON"=2, "POLYGON" = 3,"LINESTRING"=4)) 

table(georef_all_dt$source_dataset, useNA="always") #why are their missing sources? A: Because there are events labels with no matching coordinates at all
georef_all_dt <- georef_all_dt %>% 
  mutate(handrule4 = dplyr::recode(as.character(source_dataset),
                                   "events" = 1,
                                   "historical"=2,
                                   "nga"=3,
                                   "geonames" = 4,
                                   "gadm"=5,
                                   "livestock_points"=6,
                                   "bing"=7,
                                   "livestock_boundaries"=8,
                                   "wikidata"=9,
                                   "tgn"=10,
                                   "kenya_cadastral_district"=11,
                                   "kenya_district1962"=12,                                                 
                                   "google"=13,
                                   "openstreetmap"=14,
                                   "kenya_cadastral"=15
  )) %>%
  data.table::as.data.table()
table(georef_all_dt$source_dataset,
      georef_all_dt$handrule4, useNA="always")

georef_all_dt$handrule <- paste(stringr::str_pad(georef_all_dt$handrule1, 2, pad = "0"),
                                stringr::str_pad(georef_all_dt$handrule2,2, pad = "0"),
                                stringr::str_pad(georef_all_dt$handrule3,2, pad = "0"),
                                stringr::str_pad(georef_all_dt$handrule4, 2, pad = "0"),
                                sep="_")

data.table::setkey(georef_all_dt, handrule )
temp <- subset(georef_all_dt,event_hash=="be37b40f")


saveRDS(georef_all_dt,
        file=glue::glue(getwd(), "/../inst/extdata/georef_all_dt.Rds")
)

Note, the immediately following code chunk is no longer used or maintained. It has been set to eval=F so it will not run automatically. A user running code by hand for replication should skip this chunk.

vars_x_string <- c(
    "Jaro",
    "Optimal_String_Alignment"    ,
    "Levenshtein",
    "Damerau_Levenshtein"    ,
     "Longest_Common_Substring"     ,
    "q_gram_1",
    "q_gram_2",
    "q_gram_3",
    "q_gram_4",
    "q_gram_5",
    'Cosine_1',
    'Cosine_2',
    'Cosine_3',
    'Cosine_4',
    'Cosine_5',
    "Jaccard"              ,
     "First_Mistmatch"         ,
    "a_nchar"     ,
    "b_nchar"   ,
    "ab_nchar_diff"       ,
    "dJaro",
    "dOptimal_String_Alignment"      ,
    "dLevenshtein"     ,
    "dDamerau_Levenshtein"  ,
    "dLongest_Common_Substring",
    "dq_gram",
    "dCosine",
    "dJaccard"
)


georef_all_dt$a <- georef_all_dt$name_cleaner
georef_all_dt$b <- georef_all_dt$georef_b
georef_all_dt <- MeasuringLandscape:::toponym_add_distances_dt(georef_all_dt)

dpredict<-xgb.DMatrix(data= as.matrix(georef_all_dt[,vars_x_string, with=F]), missing = NA)
georef_all_dt$toponym_xb_model_prediction <- predict( toponym_xb_model, dpredict )
georef_all_dt$toponym_xb_model_prediction  <- 1/(1 + exp(-georef_all_dt$toponym_xb_model_prediction ))

Select features

vars_y <- "distance_km"
vars_x <- c("source_dataset","geometry_type","SelfReference","fuzzy",
            "document_district_clean",
            "document_unit_type",
            "document_date_best_year",
            "toponym_xb_model_prediction",
            "feature_code")

vars_all <- c(vars_x,vars_y)
xy_all <- georef_all_dt[,vars_all,with=F]
xy_all <- as.data.frame(xy_all)
dim(xy_all)

#p_load(xgboost,dummies)

condition <- !is.na(xy_all$distance_km) & xy_all$distance_km!=0

x_all <- xy_all[,vars_x]
x_all <- dummies::dummy.data.frame(x_all,
                                   all=T,
                                   dummy.classes=c('character','factor','ordered') ) #Have to specify the dummy columns, because getOption("dummy.classes") is failing to be found in this namespace
x_train <- x_all[condition,]

label <- log(xy_all$distance_km[condition])

Develop training and test splits

ids <- georef_all_dt$name_cleaner[condition]
ids_unique <- sample(sort(unique(ids)))
chunks <- split(ids_unique, ceiling(seq_along(ids_unique)/ (length(ids_unique)/5)))
fold1 <- which(ids %in% chunks[[1]])
fold2 <- which(ids %in% chunks[[2]])
fold3 <- which(ids %in% chunks[[3]])
fold4 <- which(ids %in% chunks[[4]])
fold5 <- which(ids %in% chunks[[5]])
folds <- list(fold1,fold2,fold3,fold4,fold5)
#label= label #
condition <- !is.na(label)
dtrain <- xgboost::xgb.DMatrix(data=as.matrix( x_train ), label = label, missing = NA )
dpredict <- xgboost::xgb.DMatrix(data=as.matrix( x_all ), missing = NA )

param <- list("objective" ="reg:linear", #"objective" = logregobj,
              #"scale_pos_weight" = sumwneg / sumwpos,
              "eta" = 0.3,
              "max_depth" = 6,
              "eval_metric" = "rmse",
              "silent" = 1,
              "nthread" = 48,
              'maximize'=T)

#choose folds by name so we're not cheating
#First cross validation to get credible out of sample accuracy
xb <- xgboost::xgb.cv(params=param,
             data=dtrain,
             nrounds = 100,
             early_stopping_rounds=10, 
             #nfold=5,
             folds=folds,
             prediction=T)
#Then fit a single model for variable importance
xb2 <- xgboost::xgb.train(params=param,
                 data=dtrain,
                 nrounds = 100,
                 #early_stopping_rounds=10, 
                 #nfold=5,
                 prediction=T)


#predictions_train <- predict(xb2, dtrain)
predictions_all <- predict(xb2, dpredict) #predict over everything 
#Replace with cross validation predictions
condition <- !is.na(xy_all$distance_km) & xy_all$distance_km!=0 #what was used to subset to train above
cross_valid_predictions <-  xb$pred
predictions_all[condition] <- cross_valid_predictions #but for the subset we trained on, only use predictions from out of sample folds.

georef_all_dt$rule_ensemble <- exp(predictions_all)


data.table::setkey(georef_all_dt, handrule )
temp <- subset(georef_all_dt,event_hash=="be37b40f")

saveRDS(georef_all_dt,
        file=glue::glue(getwd(), "/../inst/extdata/georef_all_dt_recomendations.Rds")
)

Evaluate accuracy (Appendix Figure 7)

#plot(georef_all_dt$rule_ensemble, georef_all_dt$distance_km)
p_ensemble_risiduals <- georef_all_dt %>% ggplot(aes(x=log(distance_km), y=log(rule_ensemble) )) + 
    geom_point(alpha=.01) + 
    geom_smooth(span=.2) + 
    theme_bw() +
    ggtitle("Observed versus predicted Distance to Match (Ensemble Residuals)") +
    xlab("Observed distance between event and gazeteer suggestion (log km)") +
    ylab("Predicted distance between event and gazeteer suggestion (log km) (out of sample)")
p_ensemble_risiduals

ggsave(
  filename = glue::glue(dir_figures, "p_ensemble_risiduals.png"), #save as a png because there are too many points and it's crashing the pdf reader
  plot = p_ensemble_risiduals,
  width = 10,
  height = 8#,
  #device = cairo_pdf #have to use cairo to correctly embed the fonts
)

Variable importance (Appendix Figure 8)

importance_importance <- xgboost::xgb.importance(feature_names=colnames(dtrain), model = xb2) #won't calculate on cv

pdf(file=glue::glue(dir_figures, "p_variable_importance_supervised_ensemble.pdf"), width=5.5, height=6)
xgboost::xgb.plot.importance(importance_importance %>% 
                      head(50) #Top 50 features
                    )
#autoplot(uauc1)
dev.off()
xgboost::xgb.plot.importance(importance_importance %>% 
                      head(50) #Top 50 features
                    )

Compare

#Rank correlation of only .41 between the hand rules and the estimated ensemble
georef_all_dt$handrule_numeric <- as.numeric(as.factor(georef_all_dt$handrule))

cor(log(georef_all_dt$distance_km),
    georef_all_dt$rule_ensemble,
    method="spearman", use="pairwise.complete")


cor(log(georef_all_dt$distance_km),
    georef_all_dt$handrule_numeric,
    method="spearman", use="pairwise.complete")


cor(georef_all_dt$handrule_numeric,
    georef_all_dt$rule_ensemble, method="spearman")


rexdouglass/MeasuringLandscape documentation built on May 13, 2019, 6:16 p.m.