# This function takes in a dataframe with two strings, a and b, and then calculates string distance features on them.
toponym_add_distances_dt <- function(inputdata, fromscratch=F, nthread=parallel::detectCores()) {
library(stringdist)
inputdata <- data.table::as.data.table(inputdata)
print("1 of 24")
inputdata[, First_Mistmatch := firstmismatch(a, b, verbose = F), ]
print("2 of 24")
inputdata[, Jaro := stringdist::stringsim(a, b, "jw", nthread = nthread), ]
print("3 of 24")
inputdata[, Optimal_String_Alignment := stringdist::stringsim(a, b, "osa", nthread = nthread), ]
print("4 of 24")
inputdata[, Levenshtein := stringdist::stringsim(a, b, "lv", nthread = nthread), ]
print("5 of 24")
inputdata[, Damerau_Levenshtein := stringdist::stringsim(a, b, "dl", nthread = nthread), ]
print("6 of 24")
inputdata[, Longest_Common_Substring := stringdist::stringsim(a, b, "lcs", nthread = nthread), ]
print("7 of 24")
inputdata[, q_gram_1 := stringdist::stringsim(a, b, "qgram", nthread = nthread, q = 1), ]
print("8 of 24")
inputdata[, q_gram_2 := stringdist::stringsim(a, b, "qgram", nthread = nthread, q = 2), ]
print("9 of 24")
inputdata[, q_gram_3 := stringdist::stringsim(a, b, "qgram", nthread = nthread, q = 3), ]
print("10 of 24")
inputdata[, q_gram_4 := stringdist::stringsim(a, b, "qgram", nthread = nthread, q = 4), ]
print("11 of 24")
inputdata[, q_gram_5 := stringdist::stringsim(a, b, "qgram", nthread = nthread, q = 5), ]
print("12 of 24")
inputdata[, Cosine_1 := stringdist::stringsim(a, b, "cosine", nthread = nthread, q = 1), ]
inputdata[, Cosine_2 := stringdist::stringsim(a, b, "cosine", nthread = nthread, q = 2), ]
inputdata[, Cosine_3 := stringdist::stringsim(a, b, "cosine", nthread = nthread, q = 3), ]
inputdata[, Cosine_4 := stringdist::stringsim(a, b, "cosine", nthread = nthread, q = 4), ]
inputdata[, Cosine_5 := stringdist::stringsim(a, b, "cosine", nthread = nthread, q = 5), ]
print("13 of 24")
inputdata[, Jaccard := stringdist::stringsim(a, b, "jaccard", nthread = nthread), ]
print("14 of 24")
inputdata$a_nchar <- nchar(inputdata$a)
print("15 of 24")
inputdata$b_nchar <- nchar(inputdata$b)
print("16 of 24")
inputdata$ab_nchar_diff <- abs(inputdata$a_nchar - inputdata$b_nchar)
print("17 of 24")
inputdata[, dJaro := stringdist::stringdist(a, b, "jw", nthread = nthread), ]
print("18 of 24")
inputdata[, dOptimal_String_Alignment := stringdist::stringdist(a, b, "osa", nthread = nthread), ]
print("19 of 24")
inputdata[, dLevenshtein := stringdist::stringdist(a, b, "lv", nthread = nthread), ]
print("20 of 24")
inputdata[, dDamerau_Levenshtein := stringdist::stringdist(a, b, "dl", nthread = nthread), ]
print("21 of 24")
inputdata[, dLongest_Common_Substring := stringdist::stringdist(a, b, "lcs", nthread = nthread), ]
print("22 of 24")
inputdata[, dq_gram := stringdist::stringdist(a, b, "qgram", nthread = nthread), ]
print("23 of 24")
inputdata[, dCosine := stringdist::stringdist(a, b, "cosine", nthread = nthread), ]
print("24 of 24")
inputdata[, dJaccard := stringdist::stringdist(a, b, "jaccard", nthread = nthread), ]
# p_load(TraMineR)
# http://traminer.unige.ch/doc/seqdef.html
# labels_unique <- sort(unique(c(inputdata$a, inputdata$b )))
# sequences_a_df <- generate_sequences( labels_unique,
# maxlength=20,
# mask=NA,
# intcutoff=256) #takes about a minute+
# processed_a <- seqdef( data= sequences_a_df )
# inputdata$a_numeric <- as.numeric(factor(inputdata$a, levels= labels_unique ))
# inputdata$b_numeric <- as.numeric(factor(inputdata$b, levels= labels_unique ))
#
# costs <- seqcost(processed_a, method = "INDELSLOG") #I think we calculate this on the training data, save it, and pass it in at ru ntime
#
# dist_OM <- seqdist(processed_a, method = "OM", indel = costs$indel, sm = costs$sm , norm="auto")
# inputdata[,OM:=dist_OM[ cbind(a_numeric, b_numeric) ],]
#
# dist_OMloc <- seqdist(processed_a, method = "OMloc", indel = costs.tr$indel, sm = costs.tr$sm, with.missing = F) #intentionally no norm
# inputdata[,OMloc:=dist_OMloc[ cbind(a_numeric, b_numeric) ],]
#
# dist_OMslen <- seqdist(processed_a, method = "OMslen", indel = costs.tr$indel, sm = costs.tr$sm, with.missing = F )#intentionally no norm
# inputdata[,OMslen:=dist_OMslen[ cbind(a_numeric, b_numeric) ],]
#
# dist_OMspell <- seqdist(processed_a, method = "OMspell", sm = costs.tr$sm, indel = 1, with.missing = TRUE )#intentionally no norm
# inputdata[,OMspell:=dist_OMspell[ cbind(a_numeric, b_numeric) ],]
#
# #ex1.OMstran <- seqdist(processed_a, method = "OMstran", sm = costs.tr$sm, indel = 1, with.missing = TRUE, otto=.5)#intentionally no norm
# dist_TWED <- seqdist(processed_a, method = "TWED", sm = costs.tr$sm, indel = 1, with.missing = TRUE , nu=.5)#intentionally no norm
# inputdata[,TWED:=dist_TWED[ cbind(a_numeric, b_numeric) ],]
#
# dist_LCS <- seqdist(processed_a, method = "LCS", norm = "auto")
# inputdata[,LCS:=dist_LCS[ cbind(a_numeric, b_numeric) ],]
#
# dist_LCP <- seqdist(processed_a, method = "LCP", norm = "auto")
# inputdata[,LCP:=dist_LCP[ cbind(a_numeric, b_numeric) ],]
#
# dist_RLCP <- seqdist(processed_a, method = "RLCP", norm = "auto")
# inputdata[,RLCP:=dist_RLCP[ cbind(a_numeric, b_numeric) ],]
#
# dist_NMS <- seqdist(processed_a, method = "NMS") #intentionally no norm
# inputdata[,NMS:=dist_NMS[ cbind(a_numeric, b_numeric) ],]
#
# dist_NMSMST <- seqdist(processed_a, method = "NMSMST") #intentionally no norm
# inputdata[,NMSMST:=dist_NMSMST[ cbind(a_numeric, b_numeric) ],]
#
# dist_SVRspell <- seqdist(processed_a, method = "SVRspell") #intentionally no norm #this one takes too long to process
# inputdata[,SVRspell:=dist_SVRspell[ cbind(a_numeric, b_numeric) ],]
#
# dist_CHI2 <- seqdist(processed_a, method = "CHI2", step = max(seqlength(processed_a)))
# inputdata[,CHI2:=dist_CHI2[ cbind(a_numeric, b_numeric) ],]
return(inputdata)
}
# This is for corpus features that are no longer used in the paper
toponym_add_corpus <- function(data, fromscratch=F) {
data <- data.table::as.data.table(data)
# Load the ngram corpus and use it to look up things
# Takes a while to load even with fst
library(fst)
# Add feature counts from the gazzetter
if (fromscratch) {
csv_dt_grams_all <- fst::read.fst("/home/rexdouglass/Dropbox (rex)/Kenya Article Drafts/MeasuringLandscapeCivilWar/inst/extdata/csv_dt_grams_all.fst", as.data.table = T) # this is expensive
head(csv_dt_grams_all)
setkey(csv_dt_grams_all, bigram) # this is going to take a while, and then we're going to pull every single flatfile and event name_cleaner and save that as a smaller file for next time
csv_dt_grams_places <- csv_dt_grams_all[bigram %in% unique(c(events_dt$name_cleaner, flatfiles_sf$name_cleaner)), ]
dim(csv_dt_grams_places) # 20k matches
fwrite(csv_dt_grams_places, "/home/rexdouglass/Dropbox (rex)/Kenya Article Drafts/MeasuringLandscapeCivilWar/inst/extdata/csv_dt_grams_places.csv")
}
csv_dt_grams_places <- data.table::fread("/home/rexdouglass/Dropbox (rex)/Kenya Article Drafts/MeasuringLandscapeCivilWar/inst/extdata/csv_dt_grams_places.csv")
csv_dt_grams_places_a <- csv_dt_grams_places[, c("bigram", "count", "year_min", "year_median", "year_mean", "year_max")]
names(csv_dt_grams_places_a) <- c("bigram_a", "corpus_mention_count_a", "corpus_mention_year_min_a", "corpus_mention_year_median_a", "corpus_mention_year_mean_a", "corpus_mention_year_max_a")
csv_dt_grams_places_b <- csv_dt_grams_places[, c("bigram", "count", "year_min", "year_median", "year_mean", "year_max")]
names(csv_dt_grams_places_b) <- c("bigram_b", "corpus_mention_count_b", "corpus_mention_year_min_b", "corpus_mention_year_median_b", "corpus_mention_year_mean_b", "corpus_mention_year_max_b")
data <- merge(data, csv_dt_grams_places_a, by.x = "a", by.y = "bigram_a", all.x = T)
data <- merge(data, csv_dt_grams_places_b, by.x = "b", by.y = "bigram_b", all.x = T)
# Count number of mentions across gazeteers
if (fromscratch) {
flatfiles_sf <- readRDS(system.file("extdata", "flatfiles_sf.Rdata", package = "MeasuringLandscape"))
temp <- strip_postfixes(flatfiles_sf$name_cleaner)
flatfiles_sf$name_cleaner_stem <- temp[[1]]
data.table::fwrite(data.table::as.data.frame(flatfiles_sf)[
!flatfiles_sf$source_dataset %in% c("events", "events_poly"),
c("name_cleaner", "name_cleaner_stem")
], "/home/rexdouglass/Dropbox (rex)/Kenya Article Drafts/MeasuringLandscapeCivilWar/inst/extdata/gazeteer_grams_noevents.csv")
}
gazeteer_grams_noevents <- fread("/home/rexdouglass/Dropbox (rex)/Kenya Article Drafts/MeasuringLandscapeCivilWar/inst/extdata/gazeteer_grams_noevents.csv")
temp <- gazeteer_grams_noevents[, list(gazeteer_mentions_count = .N), by = name_cleaner]
temp2 <- gazeteer_grams_noevents[, list(gazeteer_stem_mentions_count = .N), by = name_cleaner_stem]
data <- merge(
data,
data.table::setnames(temp, c("name_cleaner", "gazeteer_mentions_count_a")),
by.x = "a",
by.y = "name_cleaner",
all.x = T
)
data <- merge(
data,
data.table::setnames(temp, c("name_cleaner", "gazeteer_mentions_count_b")),
by.x = "b",
by.y = "name_cleaner",
all.x = T
)
data <- merge(
data,
data.table::setnames(temp2, c("name_cleaner_stem", "gazeteer_stem_mentions_count_a")),
by.x = "a",
by.y = "name_cleaner_stem",
all.x = T
)
data <- merge(
data,
data.table::setnames(temp2, c("name_cleaner_stem", "gazeteer_stem_mentions_count_b")),
by.x = "b",
by.y = "name_cleaner_stem",
all.x = T
)
# I should fill in NAs
data$corpus_mention_count_a[is.na(data$corpus_mention_count_a)] <- 0
data$corpus_mention_count_b[is.na(data$corpus_mention_count_b)] <- 0
data$gazeteer_mentions_count_a[is.na(data$gazeteer_mentions_count_a)] <- 0
data$gazeteer_mentions_count_b[is.na(data$gazeteer_mentions_count_b)] <- 0
data$gazeteer_stem_mentions_count_a[is.na(data$gazeteer_stem_mentions_count_a)] <- 0
data$gazeteer_stem_mentions_count_b[is.na(data$gazeteer_stem_mentions_count_b)] <- 0
return(data)
}
toponym_add_features <- function(data, fromscratch=F) {
data <- data.table::as.data.table(data)
data <- toponym_add_distances_dt(data)
# data <- toponym_add_corpus(data) #Currently excluded from paper
return(data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.