Fuzzy Matcher Stage 2

The fuzzy matcher predicts the likelihood that two toponyms are the same place even though their spellings might be different. It has two stages.

This file develops Stage 2 of the fuzzy toponym matcher. Its job is to refine the prediction for the subset of plausible matches and provide an actual probability of a match.

rm(list=ls()); gc()
library(MeasuringLandscape)
library(tidyverse)
dir_figures <- glue::glue(getwd(), "/../paper/figures/")

knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8,  warning=FALSE, message=FALSE, cache=TRUE)
options(width = 160)

It is based on an events dataset built and cleaned in the file "01 Prep Events Counts".

#Load Events
events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")) 
events_dt <- data.table::as.data.table(events_sf) #%>% unique() 
dim(events_sf)

unlist(strsplit(events_sf$name_cleaner,"")) %>% janitor::tabyl() %>% janitor::adorn_crosstab(., denom = "col", show_n = T, digits = 1, show_totals = T)

events_sf_text_coord_unique <- plyr::ddply(events_sf[,c('location_text','name_clean','name_cleaner','document_district_clean',
                                                  'map_coordinate_clean_latitude','map_coordinate_clean_longitude')],
                                     "location_text", transform,
      map_coordinate_has =sum(!is.na(map_coordinate_clean_latitude))
      )

#Load Gazeteers
flatfiles_sf_roi <- readRDS(system.file("extdata", "flatfiles_sf_roi.Rdata", package = "MeasuringLandscape")) 
dim(flatfiles_sf_roi)
flatfiles_dt <- data.table::as.data.table(flatfiles_sf_roi)
data.table::setkey(flatfiles_dt,place_hash)

Summary statistics

table( !is.na(sf::st_coordinates(events_sf)[,1] ) ,
       !is.na(events_sf$name_cleaner) )

Produce a dataset for hand labeling

This takes every event with a coordinate and looks up the 10 nearest points in the gazetteers. It saves this as a csv file that a human can label as a toponym match or mismatch.

#Should only need to do this once
#MeasuringLandscape:::create_toponym_dataset_forlabeling()

Choose Features

vars_id <- c("name_cleaner_a","name_cleaner_b",'test','extranegative')
vars_weights <- c("weights")
vars_y <- c("rex_match")

vars_x_string <- c(#"exact_match",               
"Jaro",
"Optimal_String_Alignment"    ,
"Levenshtein",
"Damerau_Levenshtein"    ,
"Longest_Common_Substring"     ,
"q_gram_1",
"q_gram_2",
"q_gram_3",
"q_gram_4",
"q_gram_5",
'Cosine_1',
'Cosine_2',
'Cosine_3',
'Cosine_4',
'Cosine_5',
"Jaccard"              ,
 "First_Mistmatch"         ,
"a_nchar"     ,
"b_nchar"   ,
"ab_nchar_diff"       ,             
"dJaro",
"dOptimal_String_Alignment"      ,
"dLevenshtein"     ,
"dDamerau_Levenshtein"  ,           
"dLongest_Common_Substring",
"dq_gram",
"dCosine",
"dJaccard"
# "OM",
# "OMloc",
# "OMslen"
# ,"OMspell",
# "TWED",
# "LCS",                 
# "LCP",
# "RLCP",
# "NMS",
# "NMSMST",                 
# "SVRspell",
# "CHI2"
) 

vars_x_stem <- paste0(vars_x_string, "_stemmed")

vars_x_string_corpus <- c(

"corpus_mention_count_a",
"corpus_mention_year_min_a",
"corpus_mention_year_median_a",
"corpus_mention_year_mean_a",
"corpus_mention_year_max_a",

"corpus_mention_count_b",
"corpus_mention_year_min_b",
"corpus_mention_year_median_b",
"corpus_mention_year_mean_b",
"corpus_mention_year_max_b",

"gazeteer_mentions_count_a",
"gazeteer_mentions_count_b",
"gazeteer_stem_mentions_count_a",
"gazeteer_stem_mentions_count_b"

#"postfix_has_a"
#"postfix_has_b" 

#"ngram_a", #Don't run, it's just a word count and it's missing if it's never found
#"ngram_b"
)


vars_x_stem_corpus <- paste0(vars_x_string_corpus, "_stemmed")
vars_x_everything <-  c(
                        vars_x_string,
                        vars_x_stem#,
                        #vars_x_string_corpus, #excluding corpus features
                        #vars_x_stem_corpus  #excluding corpus features
                        )     
vars_x_onlystring <- c(vars_x_string)
vars_x_stringcorpus <-  c(vars_x_string,  vars_x_string_corpus)     
vars_x_string_and_stem <- c(vars_x_string, vars_x_stem)

Create Training Test Split

handlabeled <- MeasuringLandscape:::toponym_training_dataset_load()

fromscratch=F
if(fromscratch){    

    toponym_training_dataset <- create_toponym_split_training_test( 
                                                           handlabeled=handlabeled,
                                                           vars_id=vars_id,
                                                           vars_weights=vars_weights,
                                                           vars_y=vars_y,
                                                           vars_x=vars_x_everything,
                                                           neg_count=0, #you lose so many to cosine distance
                                                           fromscratch=T
                                                         )

    saveRDS(toponym_training_dataset,
            file=glue(getwd(), "/../inst/extdata/toponym_training_dataset.Rds")
            )
}
toponym_training_dataset <- readRDS(system.file("extdata", "toponym_training_dataset.Rds", package = "MeasuringLandscape"))

Summarize training data

glue::glue("Handlabeled Training Dataset")
dim(handlabeled)
table(handlabeled$rex_match)


glue::glue("All Data Resampled with Features \n")
dim(toponym_training_dataset$xy_all)
table(toponym_training_dataset$xy_all$rex_match)


glue::glue("Training Split \n")
dim(toponym_training_dataset$xy_train)
table(toponym_training_dataset$xy_train$rex_match)


glue::glue("Test Split \n")
dim(toponym_training_dataset$xy_test)
table(toponym_training_dataset$xy_test$rex_match)

Fit XGBoost models predicting match

require(methods)
global_test <- toponym_training_dataset[['xy_all']] %>% filter(test %in% T  & !is.na(rex_match) )

sumwneg=sum(toponym_training_dataset[['xy_train']]$rex_match==0)
sumwpos=sum(toponym_training_dataset[['xy_train']]$rex_match==1)
param2 <- list("objective" = MeasuringLandscape:::logregobj, #"objective" ="binary:logistic", #
              "scale_pos_weight" = sumwneg / sumwpos,
              "eta" = 0.3,
              "max_depth" = 6,
              "eval_metric" = "auc",
              #"eval_metric" = area_under_pr_curve_metric,
              #"eval_metric" = "ams@0.15",
              "silent" = 1,
              "nthread" = 48,
              'maximize'=T)

toponym_xb_everything2 <- MeasuringLandscape:::train_an_xb(toponym_training_dataset[['xy_train']],
                                     toponym_training_dataset[['xy_test']] ,
                                     vars_x=vars_x_everything,
                                     param=param2,
                                     use_weights=F,
                                     extract_features=F )
xgb.save(toponym_xb_everything2,
       glue::glue(getwd(), "/../inst/extdata/toponym_xb_everything2.bin")
         ) #Have to save separately for some reason

toponym_xb_onlystring <- MeasuringLandscape:::train_an_xb(toponym_training_dataset[['xy_train']],
                                     toponym_training_dataset[['xy_test']] ,
                                     vars_x=vars_x_onlystring,
                                     param=param2,
                                     use_weights=F,
                                     extract_features=F,
                                     missing=NA)
xgb.save(toponym_xb_onlystring,
       glue::glue(getwd(), "/../inst/extdata/toponym_xb_onlystring.bin")
         ) #Have to save separately for some reason



toponym_xb_string_and_stem <- MeasuringLandscape:::train_an_xb(toponym_training_dataset[['xy_train']],
                                     toponym_training_dataset[['xy_test']] ,
                                     vars_x=vars_x_string_and_stem,
                                     param=param2,
                                     use_weights=F,
                                     extract_features=F,
                                     missing=NA)

xgb.save(toponym_xb_string_and_stem,
       glue::glue(getwd(), "/../inst/extdata/toponym_xb_string_and_stem.bin")
         ) #Have to save separately for some reason

(Appendix Figure 5, AUC and Precision Recall curves for toponym fuzzy matcher stage 2, predicting likelihood of a match.)

dtest<-xgb.DMatrix(data= as.matrix(as.data.frame(global_test)[,vars_x_everything]),
                   label=as.numeric(global_test$rex_match),
                   weight = global_test$weights,
                   missing = NA)

global_test$toponym_xb_everything2 <- predict(toponym_xb_everything2, dtest )
global_test$toponym_xb_everything2 <- 1/(1 + exp(-global_test$toponym_xb_everything2))

#global_test$toponym_xb_onlystring <- predict(toponym_xb_onlystring, dtest )
#global_test$toponym_xb_onlystring <- 1/(1 + exp(-global_test$toponym_xb_onlystring))


global_test$toponym_xb_string_and_stem <- predict(toponym_xb_string_and_stem, dtest )
global_test$toponym_xb_string_and_stem <- 1/(1 + exp(-global_test$toponym_xb_string_and_stem))

#p_load(Metrics)
#p_load(MLmetrics)
tocompare <- c(#"toponym_xb_everything2",
               #"toponym_xb_onlystring"#,
               #"toponym_xb_stringcorpus"#,
               "toponym_xb_string_and_stem"
               ) # "toponym_xb_everything",, "toponym_xb_everything2_extra","toponym_xb_onlystring_extra","toponym_xb_everything2_extra_large", "toponym_xb_onlystring_extra_large", "toponym_xb_everything2_uber_large" , "toponym_xb_onlystring_uber_large"
print("ce")
sapply(tocompare, function(q) {  Metrics::ce(actual=global_test$rex_match, predicted=global_test[,q]>.5)  })
print("F1_Score")
sapply(tocompare, function(q) {  MLmetrics::F1_Score(global_test$rex_match, as.numeric( global_test[,q]>.5) )  })
print("logloss")
sapply(tocompare, function(q) {  Metrics::logLoss(actual=global_test$rex_match, predicted=global_test[,q])  })
print("confusion")
sapply(tocompare, function(q) {  table(global_test$rex_match,  global_test[,q]>.5)  })


#p_load(precrec)
msmdat1 <- precrec::evalmod( precrec::mmdata(scores=list(
                                #global_test$toponym_xb_everything,
                                #global_test$toponym_rf_everything,
                                #global_test$toponym_xb_everything_extracted,
                                #global_test$toponym_xb_everything2, #highest PRC
                                #global_test$toponym_xb_everything3,
                                #global_test$toponym_xb_everything4,
                                global_test$toponym_xb_string_and_stem#,
                                #global_test$toponym_xb_onlystring#,
                                #global_test$toponym_xb_stringcorpus#,
                                #global_test$toponym_xb_onlystring_tiny
                                #global_test$toponym_xb_everything2_extra,
                                #global_test$toponym_xb_onlystring_extra,
                                #global_test$toponym_xb_everything2_extra_large,
                                #global_test$toponym_xb_onlystring_extra_large,
                                #global_test$toponym_xb_everything2_uber_large,
                                #global_test$toponym_xb_onlystring_uber_large
                                ),
                           labels=as.numeric(as.data.frame(global_test)$rex_match),
                           modnames = c(#"toponym_xb_everything",
                                        #"toponym_rf_everything",
                                        #"toponym_xb_everything_features",
                                        #"toponym_xb_everything2",
                                        #"toponym_xb_everything3",
                                        #"toponym_xb_everything4",
                                        "toponym_xb_string_and_stem"#,
                                        #"toponym_xb_onlystring"#,
                                        #"toponym_xb_stringcorpus"#,
                                        #"toponym_xb_onlystring_tiny"
                                        #"toponym_xb_everything2_extra",
                                        #"toponym_xb_onlystring_extra",
                                        #"toponym_xb_everything2_extra_large", 
                                        #"toponym_xb_onlystring_extra_large",
                                        #"toponym_xb_everything2_uber_large" ,
                                        #"toponym_xb_onlystring_uber_large"
                                        )
                           ) )

msmdat1

uauc1 <- precrec::evalmod(scores = global_test$toponym_xb_string_and_stem,
                 labels=as.numeric(as.data.frame(global_test)$rex_match)#,
                 #mode="aucroc"
                 )

plot(uauc1)

#dev.off()
pdf(file=glue::glue(dir_figures, "p_auc_stage2.pdf"), width=5.5)
plot(uauc1)
#autoplot(uauc1)
dev.off()
autoplot(uauc1)

Evaluate the model (Appendix Figure 6, AUC and Precision Recall curves for toponym fuzzy matcher stage 2, predicting likelihood of a match.)

#p_load(DiagrammeR)
#xgb.plot.tree(model = xb)
#xgb.plot.tree(model = toponym_xb_everything2, trees = 0, show_node_id = TRUE, render = TRUE)

importance_importance <- xgb.importance(feature_names=vars_x_string_and_stem, model = toponym_xb_string_and_stem)
xgb.plot.importance(importance_importance)

pdf(file=glue::glue(dir_figures, "p_variable_importance_stage2.pdf"), width=5.5, height=6)
xgb.plot.importance(importance_importance)
#autoplot(uauc1)
dev.off()

Analysis of Residuals

global_test$toponym_xb_everything2_correct <- (global_test$toponym_xb_everything2>.5) == global_test$rex_match


#global_test$toponym_xb_onlystring_uber_large_correct <- (global_test$toponym_xb_onlystring_uber_large>.5)==global_test$rex_match
#table(global_test$toponym_xb_onlystring_uber_large_correct)


rexdouglass/MeasuringLandscape documentation built on May 13, 2019, 6:16 p.m.