The fuzzy matcher predicts the likelihood that two toponyms are the same place even though their spellings might be different. It has two stages.
This file develops Stage 2 of the fuzzy toponym matcher. Its job is to refine the prediction for the subset of plausible matches and provide an actual probability of a match.
rm(list=ls()); gc() library(MeasuringLandscape) library(tidyverse) dir_figures <- glue::glue(getwd(), "/../paper/figures/") knitr::opts_knit$set(progress = TRUE, verbose = TRUE) knitr::opts_chunk$set(fig.width=12, fig.height=8, warning=FALSE, message=FALSE, cache=TRUE) options(width = 160)
It is based on an events dataset built and cleaned in the file "01 Prep Events Counts".
#Load Events events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")) events_dt <- data.table::as.data.table(events_sf) #%>% unique() dim(events_sf) unlist(strsplit(events_sf$name_cleaner,"")) %>% janitor::tabyl() %>% janitor::adorn_crosstab(., denom = "col", show_n = T, digits = 1, show_totals = T) events_sf_text_coord_unique <- plyr::ddply(events_sf[,c('location_text','name_clean','name_cleaner','document_district_clean', 'map_coordinate_clean_latitude','map_coordinate_clean_longitude')], "location_text", transform, map_coordinate_has =sum(!is.na(map_coordinate_clean_latitude)) ) #Load Gazeteers flatfiles_sf_roi <- readRDS(system.file("extdata", "flatfiles_sf_roi.Rdata", package = "MeasuringLandscape")) dim(flatfiles_sf_roi) flatfiles_dt <- data.table::as.data.table(flatfiles_sf_roi) data.table::setkey(flatfiles_dt,place_hash)
Summary statistics
table( !is.na(sf::st_coordinates(events_sf)[,1] ) , !is.na(events_sf$name_cleaner) )
This takes every event with a coordinate and looks up the 10 nearest points in the gazetteers. It saves this as a csv file that a human can label as a toponym match or mismatch.
#Should only need to do this once #MeasuringLandscape:::create_toponym_dataset_forlabeling()
vars_id <- c("name_cleaner_a","name_cleaner_b",'test','extranegative') vars_weights <- c("weights") vars_y <- c("rex_match") vars_x_string <- c(#"exact_match", "Jaro", "Optimal_String_Alignment" , "Levenshtein", "Damerau_Levenshtein" , "Longest_Common_Substring" , "q_gram_1", "q_gram_2", "q_gram_3", "q_gram_4", "q_gram_5", 'Cosine_1', 'Cosine_2', 'Cosine_3', 'Cosine_4', 'Cosine_5', "Jaccard" , "First_Mistmatch" , "a_nchar" , "b_nchar" , "ab_nchar_diff" , "dJaro", "dOptimal_String_Alignment" , "dLevenshtein" , "dDamerau_Levenshtein" , "dLongest_Common_Substring", "dq_gram", "dCosine", "dJaccard" # "OM", # "OMloc", # "OMslen" # ,"OMspell", # "TWED", # "LCS", # "LCP", # "RLCP", # "NMS", # "NMSMST", # "SVRspell", # "CHI2" ) vars_x_stem <- paste0(vars_x_string, "_stemmed") vars_x_string_corpus <- c( "corpus_mention_count_a", "corpus_mention_year_min_a", "corpus_mention_year_median_a", "corpus_mention_year_mean_a", "corpus_mention_year_max_a", "corpus_mention_count_b", "corpus_mention_year_min_b", "corpus_mention_year_median_b", "corpus_mention_year_mean_b", "corpus_mention_year_max_b", "gazeteer_mentions_count_a", "gazeteer_mentions_count_b", "gazeteer_stem_mentions_count_a", "gazeteer_stem_mentions_count_b" #"postfix_has_a" #"postfix_has_b" #"ngram_a", #Don't run, it's just a word count and it's missing if it's never found #"ngram_b" ) vars_x_stem_corpus <- paste0(vars_x_string_corpus, "_stemmed") vars_x_everything <- c( vars_x_string, vars_x_stem#, #vars_x_string_corpus, #excluding corpus features #vars_x_stem_corpus #excluding corpus features ) vars_x_onlystring <- c(vars_x_string) vars_x_stringcorpus <- c(vars_x_string, vars_x_string_corpus) vars_x_string_and_stem <- c(vars_x_string, vars_x_stem)
handlabeled <- MeasuringLandscape:::toponym_training_dataset_load() fromscratch=F if(fromscratch){ toponym_training_dataset <- create_toponym_split_training_test( handlabeled=handlabeled, vars_id=vars_id, vars_weights=vars_weights, vars_y=vars_y, vars_x=vars_x_everything, neg_count=0, #you lose so many to cosine distance fromscratch=T ) saveRDS(toponym_training_dataset, file=glue(getwd(), "/../inst/extdata/toponym_training_dataset.Rds") ) } toponym_training_dataset <- readRDS(system.file("extdata", "toponym_training_dataset.Rds", package = "MeasuringLandscape"))
glue::glue("Handlabeled Training Dataset") dim(handlabeled) table(handlabeled$rex_match) glue::glue("All Data Resampled with Features \n") dim(toponym_training_dataset$xy_all) table(toponym_training_dataset$xy_all$rex_match) glue::glue("Training Split \n") dim(toponym_training_dataset$xy_train) table(toponym_training_dataset$xy_train$rex_match) glue::glue("Test Split \n") dim(toponym_training_dataset$xy_test) table(toponym_training_dataset$xy_test$rex_match)
require(methods) global_test <- toponym_training_dataset[['xy_all']] %>% filter(test %in% T & !is.na(rex_match) ) sumwneg=sum(toponym_training_dataset[['xy_train']]$rex_match==0) sumwpos=sum(toponym_training_dataset[['xy_train']]$rex_match==1) param2 <- list("objective" = MeasuringLandscape:::logregobj, #"objective" ="binary:logistic", # "scale_pos_weight" = sumwneg / sumwpos, "eta" = 0.3, "max_depth" = 6, "eval_metric" = "auc", #"eval_metric" = area_under_pr_curve_metric, #"eval_metric" = "ams@0.15", "silent" = 1, "nthread" = 48, 'maximize'=T) toponym_xb_everything2 <- MeasuringLandscape:::train_an_xb(toponym_training_dataset[['xy_train']], toponym_training_dataset[['xy_test']] , vars_x=vars_x_everything, param=param2, use_weights=F, extract_features=F ) xgb.save(toponym_xb_everything2, glue::glue(getwd(), "/../inst/extdata/toponym_xb_everything2.bin") ) #Have to save separately for some reason toponym_xb_onlystring <- MeasuringLandscape:::train_an_xb(toponym_training_dataset[['xy_train']], toponym_training_dataset[['xy_test']] , vars_x=vars_x_onlystring, param=param2, use_weights=F, extract_features=F, missing=NA) xgb.save(toponym_xb_onlystring, glue::glue(getwd(), "/../inst/extdata/toponym_xb_onlystring.bin") ) #Have to save separately for some reason toponym_xb_string_and_stem <- MeasuringLandscape:::train_an_xb(toponym_training_dataset[['xy_train']], toponym_training_dataset[['xy_test']] , vars_x=vars_x_string_and_stem, param=param2, use_weights=F, extract_features=F, missing=NA) xgb.save(toponym_xb_string_and_stem, glue::glue(getwd(), "/../inst/extdata/toponym_xb_string_and_stem.bin") ) #Have to save separately for some reason
(Appendix Figure 5, AUC and Precision Recall curves for toponym fuzzy matcher stage 2, predicting likelihood of a match.)
dtest<-xgb.DMatrix(data= as.matrix(as.data.frame(global_test)[,vars_x_everything]), label=as.numeric(global_test$rex_match), weight = global_test$weights, missing = NA) global_test$toponym_xb_everything2 <- predict(toponym_xb_everything2, dtest ) global_test$toponym_xb_everything2 <- 1/(1 + exp(-global_test$toponym_xb_everything2)) #global_test$toponym_xb_onlystring <- predict(toponym_xb_onlystring, dtest ) #global_test$toponym_xb_onlystring <- 1/(1 + exp(-global_test$toponym_xb_onlystring)) global_test$toponym_xb_string_and_stem <- predict(toponym_xb_string_and_stem, dtest ) global_test$toponym_xb_string_and_stem <- 1/(1 + exp(-global_test$toponym_xb_string_and_stem)) #p_load(Metrics) #p_load(MLmetrics) tocompare <- c(#"toponym_xb_everything2", #"toponym_xb_onlystring"#, #"toponym_xb_stringcorpus"#, "toponym_xb_string_and_stem" ) # "toponym_xb_everything",, "toponym_xb_everything2_extra","toponym_xb_onlystring_extra","toponym_xb_everything2_extra_large", "toponym_xb_onlystring_extra_large", "toponym_xb_everything2_uber_large" , "toponym_xb_onlystring_uber_large" print("ce") sapply(tocompare, function(q) { Metrics::ce(actual=global_test$rex_match, predicted=global_test[,q]>.5) }) print("F1_Score") sapply(tocompare, function(q) { MLmetrics::F1_Score(global_test$rex_match, as.numeric( global_test[,q]>.5) ) }) print("logloss") sapply(tocompare, function(q) { Metrics::logLoss(actual=global_test$rex_match, predicted=global_test[,q]) }) print("confusion") sapply(tocompare, function(q) { table(global_test$rex_match, global_test[,q]>.5) }) #p_load(precrec) msmdat1 <- precrec::evalmod( precrec::mmdata(scores=list( #global_test$toponym_xb_everything, #global_test$toponym_rf_everything, #global_test$toponym_xb_everything_extracted, #global_test$toponym_xb_everything2, #highest PRC #global_test$toponym_xb_everything3, #global_test$toponym_xb_everything4, global_test$toponym_xb_string_and_stem#, #global_test$toponym_xb_onlystring#, #global_test$toponym_xb_stringcorpus#, #global_test$toponym_xb_onlystring_tiny #global_test$toponym_xb_everything2_extra, #global_test$toponym_xb_onlystring_extra, #global_test$toponym_xb_everything2_extra_large, #global_test$toponym_xb_onlystring_extra_large, #global_test$toponym_xb_everything2_uber_large, #global_test$toponym_xb_onlystring_uber_large ), labels=as.numeric(as.data.frame(global_test)$rex_match), modnames = c(#"toponym_xb_everything", #"toponym_rf_everything", #"toponym_xb_everything_features", #"toponym_xb_everything2", #"toponym_xb_everything3", #"toponym_xb_everything4", "toponym_xb_string_and_stem"#, #"toponym_xb_onlystring"#, #"toponym_xb_stringcorpus"#, #"toponym_xb_onlystring_tiny" #"toponym_xb_everything2_extra", #"toponym_xb_onlystring_extra", #"toponym_xb_everything2_extra_large", #"toponym_xb_onlystring_extra_large", #"toponym_xb_everything2_uber_large" , #"toponym_xb_onlystring_uber_large" ) ) ) msmdat1 uauc1 <- precrec::evalmod(scores = global_test$toponym_xb_string_and_stem, labels=as.numeric(as.data.frame(global_test)$rex_match)#, #mode="aucroc" ) plot(uauc1) #dev.off() pdf(file=glue::glue(dir_figures, "p_auc_stage2.pdf"), width=5.5) plot(uauc1) #autoplot(uauc1) dev.off() autoplot(uauc1)
Evaluate the model (Appendix Figure 6, AUC and Precision Recall curves for toponym fuzzy matcher stage 2, predicting likelihood of a match.)
#p_load(DiagrammeR) #xgb.plot.tree(model = xb) #xgb.plot.tree(model = toponym_xb_everything2, trees = 0, show_node_id = TRUE, render = TRUE) importance_importance <- xgb.importance(feature_names=vars_x_string_and_stem, model = toponym_xb_string_and_stem) xgb.plot.importance(importance_importance) pdf(file=glue::glue(dir_figures, "p_variable_importance_stage2.pdf"), width=5.5, height=6) xgb.plot.importance(importance_importance) #autoplot(uauc1) dev.off()
global_test$toponym_xb_everything2_correct <- (global_test$toponym_xb_everything2>.5) == global_test$rex_match #global_test$toponym_xb_onlystring_uber_large_correct <- (global_test$toponym_xb_onlystring_uber_large>.5)==global_test$rex_match #table(global_test$toponym_xb_onlystring_uber_large_correct)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.