Takes in event locations described as text and returns all possible matches across different gazetteers.

Dependencies: events_sf.Rdata, flatfiles_sf_roi.Rdata
Products: georef_all_dt.Rds

rm(list=ls()); gc()
# Hiding output and warnings
# !diagnostics off
library(MeasuringLandscape)
library(tidyverse)

dir_figures <- paste0(here::here(), "/paper/figures/")

gc()

knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8,  warning=FALSE, message=FALSE, cache=TRUE)
options(width = 160)

Load Files

We load the events file to be geolocated.

#Load Events
events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")) 

events_sf_text_coord_unique <- plyr::ddply(
  events_sf[, c('location_text', 'name_clean', 'name_cleaner', 'document_district_clean',
                'map_coordinate_clean_latitude', 'map_coordinate_clean_longitude')],
  "location_text", transform,
  map_coordinate_has = sum(!is.na(map_coordinate_clean_latitude))
)

We load the combined gazetteer files.

#Load Gazetteers
flatfiles_sf_roi <- readRDS(system.file("extdata", "flatfiles_sf_roi.Rdata", package = "MeasuringLandscape")) 
dim(flatfiles_sf_roi)
flatfiles_dt <- data.table::as.data.table(flatfiles_sf_roi)
data.table::setkey(flatfiles_dt,place_hash)

We load a pretrained toponym matcher based on XGBoost and many string distance features.

#Load toponym model
#p_load(xgboost)
toponym_xb_model <- xgboost::xgb.load(system.file("extdata", "toponym_xb_onlystring.bin", package = "MeasuringLandscape"))

Summary Statistics

When we start, we have `r length(unique(events_sf$name_cleaner))` unique location strings in the events data.

#events_sf$name_cleaner %>% tabyl(sort=T)

When we start, we have `r length(unique(flatfiles_sf_roi$name_cleaner))` unique location strings in the gazetteer data.

#flatfiles_sf_roi$name_cleaner %>% tabyl(sort=T)

If we considered every single pair of possible matches, it would require about 62 million comparisons (`r length(unique(events_sf$name_cleaner)) * length(unique(flatfiles_sf_roi$name_cleaner))`). And that is just for our relatively small number of event location strings, with the gazetteers restricted to a relatively small region of interest. Any approach we implement will have to scale sub-quadratically in the number of locations, or it will not be applicable to problems of interesting size.

Stem

Our first move is to stem each location string, stripping prefixes and common suffixes so that slight variations of the same place name map to a single identifier (sketched below).
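As a rough illustration of the idea (this is a sketch, not the package's actual implementation, and the suffix list here is invented), a postfix stripper can look like:

strip_postfix_sketch <- function(x, postfixes = c("estate", "farm", "hill", "river")) {
  # Remove a trailing " <postfix>" if present; return both the stem and what was removed
  pattern <- paste0(" (", paste(postfixes, collapse = "|"), ")$")
  stem <- sub(pattern, "", x)
  postfix <- ifelse(stem == x, "", trimws(substring(x, nchar(stem) + 1)))
  list(stem = stem, postfix = postfix)
}
strip_postfix_sketch(c("gura river", "gura estate", "gura")) # all three stems collapse to "gura"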

#Step 1  Get Suggestions for Stems

#Stem the gazetteer names
temp <- MeasuringLandscape:::strip_postfixes(flatfiles_sf_roi$name_cleaner)
flatfiles_sf_roi$name_cleaner_stem <- temp[[1]]
flatfiles_sf_roi$name_cleaner_postfix <- temp[[2]]

#flatfiles_sf_roi$name_cleaner_stem %>% tabyl(sort=T)

There are `r length(unique(flatfiles_sf_roi$name_cleaner_stem))` unique location stems in the flatfiles data.

#Stem the event names
temp <- MeasuringLandscape:::strip_postfixes(events_sf$name_cleaner)
events_sf$name_cleaner_stem <- temp[[1]]
events_sf$name_cleaner_postfix <- temp[[2]]

#events_sf$name_cleaner %>% tabyl(sort=T)

There are `r length(unique(events_sf$name_cleaner_stem))` unique location stems in the events data.

#Unique stems
stemmed_ab <- unique(c(flatfiles_sf_roi$name_cleaner_stem, events_sf$name_cleaner_stem))
length(stemmed_ab) #17,880 unique stems

#Unique name-stem pairs
names_and_stems <- rbind(
  as.data.frame(flatfiles_sf_roi)[, c('name_cleaner', 'name_cleaner_stem')],
  as.data.frame(events_sf)[, c('name_cleaner', 'name_cleaner_stem')]
) %>% unique()
dim(names_and_stems)

LSH

Our next move is to hash each stemmed location such that locations that are somewhat similar to one another are returned as possible matches, while the vast majority of pairs that are too dissimilar to ever plausibly match are excluded. This requires only a single pass through the data, and so scales linearly in the number of locations. We select parameters that perform well on our hand-labeled training dataset of toponym matches and mismatches, where performing well at this stage means a low false negative rate (missing few genuine matches) and a moderate false positive rate (returning many irrelevant matches, but not an overwhelming number). Here we are only trying to reject as many obvious non-matches as possible.
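One implementation detail worth flagging before the code: textreuse's tokenize_ngrams() shingles on words, so the code below first inserts a space between every character, which makes its word 2-grams equivalent to character 2-grams. A toy illustration:

library(textreuse)
spaced <- sapply(strsplit(c("gura", "guraini"), split = ""), paste, collapse = " ")
spaced                            # "g u r a" "g u r a i n i"
tokenize_ngrams(spaced[1], n = 2) # "g u" "u r" "r a" -- the character bigrams of "gura"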

minhash_count <- 100
bands <- 25

fromscratch=F # These parallel functions hang in rmarkdown for unknown reasons; copy and paste them into the console to run them.
if(fromscratch){
  #Optimal settings derived in 04_fuzzy_matcher_stage_1

  library(textreuse)
  stemmed_ab_spaced <- sapply(strsplit(stemmed_ab, split="") , paste, collapse=" ")
  minhash <- minhash_generator(n = minhash_count, seed = 1)
  options("mc.cores" = parallel::detectCores()) #Much faster when paralized
  #options("mc.cores" = 1) #Much faster when paralized
  #This doesn't like to be ran in rmarkdown, it'll spawn processes and hang. Maybe see if it'll work as a function?

  corpus_ab_spaced <- TextReuseCorpus(text = stemmed_ab_spaced,
                                      tokenizer = tokenize_ngrams,
                                      n = 2, # Note: tokenize_ngrams crashes if n is passed as a variable rather than a literal number
                                      minhash_func = minhash,
                                      keep_tokens = TRUE,
                                      progress = T)

  #buckets <- lsh(corpus_ab_spaced, bands = 80, progress = T) # Single-threaded, but should be parallelizable

  n.cores <- parallel::detectCores()
  cuts <- cut(1:length(corpus_ab_spaced), n.cores)
  #buckets_list <- mclapply(levels(cuts),
  #               function(q) lsh(corpus_ab_spaced[cuts==q], bands = bands, progress = T) ,
  #               mc.cores = n.cores)
  #buckets <- do.call(rbind, buckets_list)

  library(doParallel)
  library(foreach)
  cl <- makeCluster(parallel::detectCores()) # One worker per CPU core
  registerDoParallel(cl)
  buckets <- foreach(q=levels(cuts), .combine='rbind', .packages=c('textreuse')) %dopar%  
                lsh(corpus_ab_spaced[cuts==q], bands = bands, progress =F)
  stopCluster(cl)

  candidates <- lsh_candidates(buckets)
  dim(candidates) #842,288 
  candidates$a_numeric <- as.numeric( gsub("doc-","",candidates$a) )
  candidates$b_numeric <- as.numeric( gsub("doc-","",candidates$b) )
  candidates$stemmed_a <- stemmed_ab[candidates$a_numeric]
  candidates$stemmed_b <- stemmed_ab[candidates$b_numeric]
  candidates$stemmed_ab <- paste(candidates$stemmed_a,candidates$stemmed_b, sep="_")
  candidates$stemmed_ba <- paste(candidates$stemmed_b,candidates$stemmed_a, sep="_")

  saveRDS(candidates, paste0(here::here(), "/inst/extdata/candidates.Rdata")) # Absolute path from the project root, because this block should only be run by hand

}
candidates <- readRDS(system.file("extdata",
                                 "candidates.Rdata",
                                 package = "MeasuringLandscape"))



#head(candidates) %>% DT::datatable()

There are `r nrow(candidates)` candidate stem matches returned by locality-sensitive hashing with these parameters. This is a tiny fraction (`r round(nrow(candidates)/length(stemmed_ab)^2, 4)`) of all the possible stem-to-stem matches we could have considered.

Expand Stem matches into full matches

We consider any two strings to be a possible match if we consider their stems to be a possible match. This bit of code expands and merges the stem-level suggestions into event-string to gazetteer-string possible matches. Throughout, we use the example of the stem "gura" to show possible matches and how they are reduced over the various steps.
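As a toy example of the expansion logic (hypothetical names): suppose LSH suggests the stem pair ("gura", "guro"); every full name whose stem is "gura" then gets paired with every full name whose stem is "guro".

suggestions_toy <- data.table::data.table(stemmed_a = "gura", stemmed_b = "guro")
lookup_toy <- data.table::data.table(
  name_cleaner      = c("gura", "gura river", "guro", "guro hill"),
  name_cleaner_stem = c("gura", "gura",       "guro", "guro")
)
step1 <- merge(suggestions_toy, lookup_toy, by.x = "stemmed_a", by.y = "name_cleaner_stem", allow.cartesian = TRUE)
data.table::setnames(step1, "name_cleaner", "name_cleaner_a") # full names for side a
step2 <- merge(step1, lookup_toy, by.x = "stemmed_b", by.y = "name_cleaner_stem", allow.cartesian = TRUE)
data.table::setnames(step2, "name_cleaner", "name_cleaner_b") # full names for side b
step2 # one stem pair expands into four candidate full-name pairs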

stemmed_ab_suggestions <- subset(candidates, stemmed_a != "" & stemmed_b != "") # Drop empty stems; otherwise the first merge below crashes
stemmed_ab_suggestions$N <- 1
glue::glue("{nrow(stemmed_ab_suggestions)} string matches")

#Link each stemmed_a to its stemmed_b suggestions, in both directions, and include each stem as a suggestion for itself
stemmed_ab_suggestions_events <- data.table::rbindlist(list(
  stemmed_ab_suggestions[,c('stemmed_a','stemmed_b','N')],
  data.table::setnames(stemmed_ab_suggestions[,c('stemmed_b','stemmed_a','N')], c('stemmed_a','stemmed_b','N')), # reversed direction
  data.table::data.table(stemmed_a = stemmed_ab_suggestions$stemmed_a,
                         stemmed_b = stemmed_ab_suggestions$stemmed_a, N = 1) # each stem is a suggestion for itself
)) ; dim(stemmed_ab_suggestions_events)

stemmed_ab_suggestions_events <- unique(stemmed_ab_suggestions_events) ; #dim(stemmed_ab_suggestions_events) #859,118


#Only keep a suggestion if stem a is found in the events data
stemmed_ab_suggestions_events <- subset(stemmed_ab_suggestions_events, stemmed_a %in% unique(events_sf$name_cleaner_stem)) ; #dim(stemmed_ab_suggestions_events) #148,416
glue::glue("{nrow(stemmed_ab_suggestions_events)} string matches")

#Step 2: Expand the stem suggestions back into full names
ab_suggestions_events <- merge(stemmed_ab_suggestions_events,
                               data.table::setnames(names_and_stems, c("name_cleaner_a","name_cleaner_stem")) ,
                               by.x="stemmed_a",
                               by.y="name_cleaner_stem",
                               all.x=T,
                               allow.cartesian=TRUE) ; #dim(ab_suggestions_events)
#DT::datatable(ab_suggestions_events[stemmed_a=="gura"])

#Make sure the full name appears in the event data
ab_suggestions_events <- subset(ab_suggestions_events, name_cleaner_a %in% unique(events_sf$name_cleaner)) ; #dim(ab_suggestions_events)
glue::glue("{nrow(ab_suggestions_events)} string matches")
#DT::datatable(ab_suggestions_events[stemmed_a=="gura"])

ab_suggestions_events <- merge(ab_suggestions_events,
                                     data.table::setnames(names_and_stems, c("name_cleaner_b","name_cleaner_stem")) ,
                                     by.x="stemmed_b",
                                     by.y="name_cleaner_stem",
                                     all.x=T,
                                     allow.cartesian=TRUE)  ; #dim(ab_suggestions_events) #4,846,054

ab_suggestions_events <- subset(ab_suggestions_events, name_cleaner_a %in% unique(events_sf$name_cleaner)) ; #dim(ab_suggestions_events) # Double check; should be no change
glue::glue("{nrow(ab_suggestions_events)} string matches")
#DT::datatable(ab_suggestions_events[stemmed_a=="gura"])

Screen pairs that are too dissimilar

ab_suggestions_events[,temp_q_cos:= stringdist::stringsim(name_cleaner_a, name_cleaner_b, "cos", nthread=parallel::detectCores(), q=2),] # Fast and effective at screening out non-matches
ab_suggestions_events <- ab_suggestions_events[temp_q_cos>.3] ; #dim(ab_suggestions_events) # Pairs with 2-gram cosine similarity below .3 are never matches in our labeled data
glue::glue("{nrow(ab_suggestions_events)} string matches after removing those with too low a 2-gram cosine similarity")

#DT::datatable(ab_suggestions_events[stemmed_a=="gura"])

#library(DT)
#DT::datatable(ab_suggestions_events[stemmed_a=="gura"])
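To see what this screen does in practice, a small illustration with toy strings (values approximate):

stringdist::stringsim("gura", "guraini", "cos", q = 2) # ~0.71: plausible pair, kept
stringdist::stringsim("gura", "nairobi", "cos", q = 2) # 0: no shared bigrams, screened out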

XGBoost Toponym Matching

Next, we refine the remaining matches using additional string-distance features and an XGBoost model trained earlier on the labeled match/no-match toponym examples.
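For intuition, here is an illustrative handful of the string similarity features for one hypothetical pair, computed directly with stringdist (the real toponym_add_features() produces the full feature set listed below):

pair_a <- "gura"; pair_b <- "guraini"
c(
  Jaro        = stringdist::stringsim(pair_a, pair_b, method = "jw"),      # Jaro similarity
  Levenshtein = stringdist::stringsim(pair_a, pair_b, method = "lv"),      # normalized edit similarity
  Cosine_2    = stringdist::stringsim(pair_a, pair_b, method = "cos", q = 2),
  Jaccard     = stringdist::stringsim(pair_a, pair_b, method = "jaccard")
)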

# toponym_add_features() computes features over columns named a and b
ab_suggestions_events$a <- ab_suggestions_events$stemmed_a
ab_suggestions_events$b <- ab_suggestions_events$stemmed_b

ab_suggestions_events_features <- MeasuringLandscape:::toponym_add_features(ab_suggestions_events)

vars_x_string <- c(
  "Jaro",
  "Optimal_String_Alignment",
  "Levenshtein",
  "Damerau_Levenshtein",
  "Longest_Common_Substring",
  "q_gram_1",
  "q_gram_2",
  "q_gram_3",
  "q_gram_4",
  "q_gram_5",
  "Cosine_1",
  "Cosine_2",
  "Cosine_3",
  "Cosine_4",
  "Cosine_5",
  "Jaccard",
  "First_Mistmatch", # spelling matches the generated feature name
  "a_nchar",
  "b_nchar",
  "ab_nchar_diff",
  "dJaro",
  "dOptimal_String_Alignment",
  "dLevenshtein",
  "dDamerau_Levenshtein",
  "dLongest_Common_Substring",
  "dq_gram",
  "dCosine",
  "dJaccard"
)

dpredict <- xgboost::xgb.DMatrix(data = as.matrix(ab_suggestions_events_features[, vars_x_string, with = F]), missing = NA)

#Step 4: Predict the likelihood that two strings are the same
ab_suggestions_events_features$toponym_xb_model_prediction <- predict(toponym_xb_model, dpredict)
# Map the raw predictions through the logistic function onto the [0,1] probability scale
ab_suggestions_events_features$toponym_xb_model_prediction <- 1/(1 + exp(-ab_suggestions_events_features$toponym_xb_model_prediction))

# Distribution of predicted match probabilities
hist(ab_suggestions_events_features$toponym_xb_model_prediction)


ab_suggestions_small <- ab_suggestions_events_features[,c('a','b','name_cleaner_a','name_cleaner_b','toponym_xb_model_prediction',"N")]
ab_suggestions_directed <- ab_suggestions_small
ab_suggestions_directed$a_event <- ab_suggestions_directed$name_cleaner_a %in% events_sf$name_cleaner
ab_suggestions_directed <- subset(ab_suggestions_directed, a_event) #just to double check, should be no change
ab_suggestions_directed$b_event <- ab_suggestions_directed$name_cleaner_b %in% events_sf$name_cleaner
ab_suggestions_directed$b_event_coord <- ab_suggestions_directed$name_cleaner_b %in% flatfiles_sf_roi$name_cleaner[flatfiles_sf_roi$source_dataset %in% c('events') & !is.na(flatfiles_sf_roi$latitude)]
ab_suggestions_directed$b_gaz_coord   <- ab_suggestions_directed$name_cleaner_b %in% flatfiles_sf_roi$name_cleaner[!flatfiles_sf_roi$source_dataset %in% c('events')  & !is.na(flatfiles_sf_roi$latitude) ]
hist(ab_suggestions_directed$toponym_xb_model_prediction) # Before subsetting
ab_suggestions_directed <- ab_suggestions_directed[a_event & (b_event_coord|b_gaz_coord)]
ab_suggestions_directed <- ab_suggestions_directed[toponym_xb_model_prediction>.5 | name_cleaner_a==name_cleaner_b] # Keep only pairs with a greater than .5 chance of matching, plus exact name matches
hist(ab_suggestions_directed$toponym_xb_model_prediction) # After subsetting
ab_suggestions_directed[,toponym_xb_model_prediction_max:=max(toponym_xb_model_prediction),
                        by=c('name_cleaner_a','b_gaz_coord')]

ab_suggestions_directed_max_gaz_coord <- subset(ab_suggestions_directed,
                                                b_gaz_coord &
                                                  toponym_xb_model_prediction==toponym_xb_model_prediction_max
                                                )

ab_suggestions_directed_max_event_coord <- subset(ab_suggestions_directed,
                                                  b_event_coord &
                                                    toponym_xb_model_prediction==toponym_xb_model_prediction_max
                                                  )
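The two subsets above keep, for each event name, only the highest-scoring suggestion within each target type (gazetteer coordinate versus event coordinate). A toy illustration of the max-per-group pattern, with hypothetical names and scores:

toy <- data.table::data.table(
  name_cleaner_a = "gura",
  name_cleaner_b = c("gura", "guro", "guraini"),
  toponym_xb_model_prediction = c(0.99, 0.60, 0.80),
  b_gaz_coord = TRUE
)
toy[, toponym_xb_model_prediction_max := max(toponym_xb_model_prediction),
    by = c("name_cleaner_a", "b_gaz_coord")]
toy[toponym_xb_model_prediction == toponym_xb_model_prediction_max] # keeps only the 0.99 row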

#A non missing coordinate
condition <- !is.na(sf::st_coordinates(events_sf)[,1])
table(condition)

#An exact match to the gaz
condition2 <- events_sf$name_cleaner %in% flatfiles_sf_roi$name_cleaner[!flatfiles_sf_roi$source_dataset %in% c('events','events_poly')]
table(condition2)

table(condition | condition2)

#Fuzzy match to a gaz
condition3 <- events_sf$name_cleaner %in% ab_suggestions_directed_max_gaz_coord$name_cleaner_a 
table(condition3)

table(condition | condition2 | condition3)
table(condition | condition2 | condition3) / length(condition)

#Exact match to another event
condition4 <- events_sf$name_cleaner %in% flatfiles_sf_roi$name_cleaner[flatfiles_sf_roi$source_dataset %in% c('events') & !is.na(flatfiles_sf_roi$latitude) ]
table(condition4)
table(condition4) / length(condition)

table(condition | condition2 | condition3 | condition4)
table(condition | condition2 | condition3 | condition4) / length(condition)

#Fuzzy match to event with coordinate
condition5 <- events_sf$name_cleaner %in% ab_suggestions_directed_max_event_coord$name_cleaner_a 
table(condition5)
table(condition5) / length(condition)

table(condition | condition2 | condition3 | condition4 | condition5)
table(condition | condition2 | condition3 | condition4 | condition5) / length(condition)

condition6 <- tolower(events_sf$document_district_clean) %in% flatfiles_sf_roi$name_cleaner[!flatfiles_sf_roi$source_dataset %in% c('events')]
table(condition6)

saveRDS(ab_suggestions_directed,
        file=glue::glue(getwd(), "/../inst/extdata/ab_suggestions_directed.Rds") 
        )

Given all of the above, we want to produce a matching dataset broken out across two dimensions: exact versus fuzzy matches, and matches on the place description versus the district/province.

#Reload the saved suggestions (useful when the long-running steps above crash)
ab_suggestions_directed <-  readRDS(system.file("extdata", "ab_suggestions_directed.Rds", package = "MeasuringLandscape")) 
flatfiles_sf_roi_centroids <- flatfiles_sf_roi %>% filter(!is.na(latitude) | geometry_type != "POINT") %>% sf::st_centroid()

#Do some quick accounting

#Create a new dataset that includes
#1) Dyads chosen by the fuzzy matcher (ab_suggestions_directed)
#2) Dyads of identical names

# georef is now a list of name pairs that are either identical or fuzzy matches for one another
georef <- data.table::setnames(
  ab_suggestions_directed[, c('name_cleaner_a', 'name_cleaner_b', 'toponym_xb_model_prediction')],
  c('georef_a', 'georef_b', 'toponym_xb_model_prediction')
)
georef$georef_a <- trimws(georef$georef_a)
georef$georef_b <- trimws(georef$georef_b)
georef <- unique(georef)
#georef$georef_ab_identical <- georef$georef_a==georef$georef_b 

table(events_sf$name_cleaner %in% georef$georef_a) #7,742 events with at least one gazetteer suggestion

#Mapping of events to identical and fuzzy matches:
#georef2 combines events with all the gazetteer names they might match
georef2 <- as.data.frame(events_sf)[,c('event_hash', 'name_cleaner','document_district_clean','document_unit_type','document_date_best_year')] %>%
  dplyr::left_join(georef, by = c("name_cleaner" = "georef_a")) %>% unique()
dim(georef2)

table(events_sf$name_cleaner %in% georef2$name_cleaner) #All events are in here
table(events_sf$name_cleaner %in% georef2$name_cleaner[!is.na(georef2$georef_b)]) #Events with at least one gazetteer suggestion
table(events_sf$event_hash %in% georef2$event_hash) #All event hashes show up in here

#Mapping of gazetteer places to their identical and fuzzy possible matches (unused):
#georef3 would combine gazetteer entries with all the gazetteer names they might match
#georef3 <- as.data.frame(flatfiles_sf_roi_centroids)[,c('place_hash','source_dataset','name_cleaner','geometry_type')] %>% left_join(georef, by = c("name_cleaner" = "georef_b")) %>% unique()
#dim(georef3)

#Mapping of events to gazetteer entries, where either their exact or fuzzy counterparts matched each other:
#georef_all maps events, to candidate gazetteer names, to gazetteer place records (place_hash)
georef_all <- georef2 %>% 
             dplyr::left_join(as.data.frame(flatfiles_sf_roi_centroids)[,c('place_hash','source_dataset','name_cleaner','geometry_type','feature_code')],
                       by = c("georef_b" = "name_cleaner")) %>% unique()
dim(georef_all)   #303,174


table(events_sf$name_cleaner %in% georef_all$name_cleaner) #All events are in here
table(events_sf$name_cleaner %in% georef_all$name_cleaner[!is.na(georef_all$georef_b)]) #Events with at least one gazetteer suggestion
table(events_sf$event_hash %in% georef_all$event_hash) #All event hashes show up in here

table(georef_all$source_dataset)
length(unique(georef_all$event_hash))
georef_all$georef_a <- NULL
#Next up we add distances. The correct projected CRS for this region is EPSG:21037,
#Arc 1960 / UTM zone 37S (https://epsg.io/21037).
#The previously used crs_m ("+proj=utm +zone=27 +datum=NAD83 +units=m +no_defs") was wrong; do not use it.
rownames(events_sf) <- events_sf$event_hash
#events_sf_utm <- st_transform(events_sf[,c('event_hash',"geometry")], crs=21037) #
#events_sf_utm <- st_sf(as.data.frame(events_sf_utm))
#rownames(events_sf_utm) <- events_sf_utm$event_hash
#It was very slow as a tbl; convert to plain sf


#flatfiles_sf_roi_centroids_utm <- st_transform(flatfiles_sf_roi_centroids[,c('place_hash',"geometry")], crs=21037)
#rownames(flatfiles_sf_roi_centroids_utm) <- flatfiles_sf_roi_centroids_utm$place_hash

#Computing the pairwise distances directly is slow: there are about a million pairs.
#Instead, flag whether each geometry is valid, then look up coordinates by event/place hash, which is much faster.
events_sf$isvalid <- !is.na(sf::st_dimension(events_sf))
flatfiles_sf_roi_centroids$isvalid <- !is.na(sf::st_dimension(flatfiles_sf_roi_centroids))

cords_events <- as.data.frame(sf::st_coordinates(events_sf))
rownames(cords_events) <- rownames(events_sf)
cords_flatfiles <- as.data.frame(sf::st_coordinates(flatfiles_sf_roi_centroids))
rownames(flatfiles_sf_roi_centroids) <- flatfiles_sf_roi_centroids$place_hash
rownames(cords_flatfiles) <- rownames(flatfiles_sf_roi_centroids)

coords_dt <- data.table::as.data.table(cbind(cords_events[georef_all$event_hash, ], cords_flatfiles[georef_all$place_hash, ]))
names(coords_dt) <- c("X1", "Y1", "X2", "Y2")
coords_dt[, distance_km := sqrt((X2-X1)^2 + (Y2-Y1)^2) * 111] # Planar distance in degrees, scaled by ~111 km per degree (a fair approximation near the equator)
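# Sanity check of the ~111 km per degree approximation against a geodesic distance,
# using two hypothetical points near the equator (not from the data):
p1 <- sf::st_sfc(sf::st_point(c(36.8, -0.3)), crs = 4326)
p2 <- sf::st_sfc(sf::st_point(c(37.0, -0.5)), crs = 4326)
sf::st_distance(p1, p2)                          # geodesic: ~31.4 km
sqrt((37.0 - 36.8)^2 + (-0.5 - (-0.3))^2) * 111  # planar approximation: ~31.4 km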

hist(coords_dt$distance_km)
georef_all$distance <- NULL
georef_all_dt <- data.table::as.data.table(cbind(georef_all, coords_dt))

georef_all_dt <- unique(georef_all_dt); dim(georef_all_dt)
#georef_all_dt <- subset(georef_all_dt, !is.na(name_cleaner) & !is.na(georef_b)) # Maybe don't drop unmatched events, in case we want events themselves as a source

table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner) #All events are in here
table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner[!is.na(georef_all_dt$georef_b)]) #7,742 events with at least one gazetteer suggestion
table(events_sf$event_hash %in% georef_all_dt$event_hash) #All event hashes show up in here

saveRDS(georef_all_dt,
        file=paste0(here::here(), "/inst/extdata/georef_all_dt.Rds")
)

