Cleans and combines a large number of gazetteers of place names for looking up locations by name and retrieving their coordinates.
rm(list=ls()); gc() # Hiding output and warnings # !diagnostics off library(MeasuringLandscape) library(tidyverse) #load independently just to make sure %>% gets imported #devtools::load_all() dir_figures <- glue::glue(getwd(), "/../paper/figures/") dir_package_files <- glue::glue(getwd(), "/inst/extdata/") gc() knitr::opts_knit$set(progress = TRUE, verbose = TRUE) knitr::opts_chunk$set(fig.width=8, warning=FALSE, message=FALSE, cache=TRUE) options(width = 160)
1) NGA - National Geospatial Agency 2) Geonames 3) Historical gazetteer from the time period 4) GoogleMaps 5) BingMaps 6) KEN_adm 7) Livestock - International Livestock Research Institute map 8) Kenya Districts 1962 - Polygons derived from 1962 district map 9) Kenya Cadastral District - district polygons derived from the cadastral map 10) Kenya Cadastral - A contemporary cadastral map 11) Wikidata 12) TGN - Getty Thesaurus of Geographic Names 13) OpenStreetMap
fromscratch <- F # Bounding box of ROI long_min <- 35.67 long_max <- 38.19 lat_min <- -1.43285 lat_max <- 0.54543 # Bounding Box Spatial Object region_of_interest_sf_utm <- MeasuringLandscape:::create_roi( bottom_left_x = long_min, bottom_left_y = lat_min, top_right_x = long_max, top_right_y = lat_max ) #Event locations events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")) events_sf_roi <- events_sf %>% MeasuringLandscape:::subset_roi(region_of_interest_sf_utm) %>% dplyr::select("name_cleaner","map_coordinate_clean_latitude","map_coordinate_clean_longitude","geometry") %>% stats::setNames(c("name","latitude","longitude","geometry")) %>% dplyr::mutate(feature_code="event", timeperiod="1952-01-01", source_dataset="events") %>% dplyr::filter(!is.na(name) & name !="" & !is.na(latitude) & !is.na(longitude)) %>% dplyr::distinct() # Moderate Size Point Sources nga_sf_roi <- MeasuringLandscape:::load_nga(roi = region_of_interest_sf_utm, fromscratch = fromscratch) geonames_sf_roi <- MeasuringLandscape:::load_geonames(roi = region_of_interest_sf_utm, fromscratch = fromscratch) historical_sf_roi <- MeasuringLandscape:::load_historical(roi = region_of_interest_sf_utm, fromscratch = fromscratch) # Small API Sources googlemaps_sf_roi <- MeasuringLandscape:::load_googlemaps(roi = region_of_interest_sf_utm, fromscratch = fromscratch) bingmaps_sf_roi <- MeasuringLandscape:::load_bingmaps(roi = region_of_interest_sf_utm, fromscratch = fromscratch) # Moderate Size mixed or polygon sources KEN_adm_sf_roi <- MeasuringLandscape:::load_ken_adm(roi = region_of_interest_sf_utm, fromscratch = fromscratch) livestock_sf_roi <- MeasuringLandscape:::load_livestock(roi = region_of_interest_sf_utm, fromscratch = fromscratch) kenya_districts1962_sf_roi <- MeasuringLandscape:::load_kenya_districts1962(roi = region_of_interest_sf_utm, fromscratch = fromscratch) kenya_cadastral_district_sf_roi <- MeasuringLandscape:::load_kenya_cadastral_district(roi = region_of_interest_sf_utm, fromscratch = fromscratch) kenya_cadastral_sf_roi <- MeasuringLandscape:::load_kenya_cadastral(roi = region_of_interest_sf_utm, fromscratch = fromscratch) # Very Large Sources wikidata_sf_roi <- MeasuringLandscape:::load_wikidata(long_min, long_max, lat_min, lat_max, fromscratch = fromscratch) tgn_sf_roi <- MeasuringLandscape:::load_tgn(long_min, long_max, lat_min, lat_max, fromscratch = fromscratch) openstreetmap_sf_roi <- MeasuringLandscape:::load_openstreetmap(roi = region_of_interest_sf_utm, fromscratch = fromscratch) if (fromscratch) { # combinedlist <- lapply(combinedlist, FUN=function(x) x %>% mutate(hash=apply(x,1 ,digest))) #add an id # This takes way too long but is the only easy way I've found to merge sf frames # For speed and debugging purposes splitting this up # For some reason this works when you split it up but not when you put them together # Double check that they're all sf data.frames, if one of them is a data.table it'll get rbindlist involved which will breaking when rbinding point and multipoint geometries # points flatfiles1 <- list( events_sf_roi %>% distinct(), # ellipse_sf_roi %>% distinct(), tgn_sf_roi %>% distinct(), historical_sf_roi %>% distinct(), geonames_sf_roi %>% distinct(), nga_sf_roi %>% distinct() ) %>% reduce(rbind_sf) table(flatfiles1$source_dataset) flatfiles1b <- list( googlemaps_sf_roi %>% distinct(), bingmaps_sf_roi %>% distinct(), wikidata_sf_roi %>% distinct() ) %>% reduce(rbind_sf) table(flatfiles1$source_dataset) # Polygons flatfiles2 <- list( # ellipse_sf %>% distinct(), KEN_adm_sf_roi %>% distinct(), openstreetmap_sf_roi %>% distinct(), livestock_sf_roi %>% distinct(), kenya_cadastral_sf_roi %>% distinct(), kenya_cadastral_district_sf_roi %>% distinct(), kenya_districts1962_sf_roi %>% distinct() ) %>% reduce(rbind_sf) table(flatfiles2$source_dataset) flatfiles <- list( flatfiles1 %>% distinct(), flatfiles1b %>% distinct(), flatfiles2 %>% distinct() ) %>% reduce(rbind_sf) flatfiles <- flatfiles %>% distinct() %>% st_cast() flatfiles$valid <- st_is_valid(flatfiles) flatfiles <- st_make_valid(flatfiles) # ok now we're going to fix the broken ones. # temp <- unique(flatfiles_sf_roi$name_cleaner_nospace) # flatfiles <- as.data.table(flatfiles) ## flatfiles[,livestock_kill:=F,] # flatfiles[source_dataset=="livestock" & name_cleaner %in% temp,livestock_kill:=T,] #Livestock doesn't have spaces unforutnately # flatfiles <- flatfiles[livestock_kill==F,] #library(stringr) flatfiles_sf_roi <- flatfiles %>% # mutate_all(funs(stri_enc_toascii)) %>% #rbindlist tries to convert to factors # mutate(name = strsplit(names, ";| see | SEE | check if same as ")) %>% # tidyr::unnest(name) %>% # This trick doesn't work when there's multiple list columns, which geometry is mutate(name = str_trim((name))) %>% mutate(name = gsub(",$", "", name)) %>% # remove trailing comma mutate(name = gsub("\032", "", name)) %>% # remove end of line character mutate(name = gsub("_", " ", name)) %>% # convert underscores to spaces mutate(name = gsub(" -|- | - ", "-", name)) %>% # convert dashes with weird spacing to just a dash filter(!is.na(name) & name != "NA" & name != "") %>% # Remove anything that might be a missing name # filter(!is.na(asciiname_either_clean) & asciiname_either_clean!="" & nchar(asciiname_either_clean)>2) %>% distinct() flatfiles_sf_roi$names <- NULL table(flatfiles_sf_roi$source_dataset) table(flatfiles_sf_roi$timeperiod) # Fix geospatial stuff # library(feather) # condition <- st_is_valid(flatfiles_sf_roi$geometry) ; table(condition) # takes a long while # flatfiles_sf_roi$geometry[!condition] <- st_make_valid(flatfiles_sf_roi$geometry[!condition]) flatfiles_sf_roi$geometry_dimensions <- sapply(flatfiles_sf_roi$geometry, FUN = function(x) st_dimension(x)) # takes a while table(flatfiles_sf_roi$geometry_dimensions, useNA = "always") condition <- is.na(flatfiles_sf_roi$geometry_dimensions) table(flatfiles_sf_roi$source_dataset, flatfiles_sf_roi$geometry_dimensions, useNA = "always") flatfiles_sf_roi$geometry[condition] <- st_point() # Replace these empty polygons the make valid put in with empty points condition <- is.na(flatfiles_sf_roi$geometry_dimensions) table(flatfiles_sf_roi$source_dataset, flatfiles_sf_roi$geometry_dimensions, useNA = "always") flatfiles_sf_roi$geometry_type <- sapply(flatfiles_sf_roi$geometry, FUN = function(x) class(x)[2]) table(flatfiles_sf_roi$source_dataset, flatfiles_sf_roi$geometry_type, useNA = "always") condition <- flatfiles_sf_roi$geometry_type %in% c("POLYGON", "MULTIPOLYGON") flatfiles_sf_roi$geometry_area <- NA flatfiles_sf_roi$geometry_area[condition] <- st_area(flatfiles_sf_roi$geometry[condition]) # ok is failing for empty polygons condition <- flatfiles_sf_roi$geometry_type %in% c("LINESTRING") flatfiles_sf_roi$geometry_length <- NA flatfiles_sf_roi$geometry_length[condition] <- st_length(flatfiles_sf_roi$geometry[condition]) # By definition, all ROI matches are intersections with the ROI # flatfiles_sf_roi <- flatfiles_sf_roi %>% mutate(region_of_interest_overlap = # as.vector(st_overlaps( geometry, region_of_interest_sf_utm, sparse=F) ) ) #Flag the Region of Interest # flatfiles_sf_roi <- flatfiles_sf_roi %>% mutate(region_of_interest_within = # as.vector(st_within( geometry, region_of_interest_sf_utm, sparse=F) ) ) #Flag the Region of Interest # flatfiles_sf_roi <- flatfiles_sf_roi %>% mutate(region_of_interest_intersects = # as.vector(st_intersects(geometry, region_of_interest_sf_utm, sparse=F) ) ) #Flag the Region of Interest # table(flatfiles_sf_roi$region_of_interest_overlap) #I guess just for polygons # table(flatfiles_sf_roi$region_of_interest_within) #requires polys to be entirely inside # table(flatfiles_sf_roi$region_of_interest_intersects) #just requires polys to overlap a little bit # #Time to fix livestock names # temp <- flatfiles_sf_roi %>% filter(!source_dataset %in% c('livestock_boundaries', 'livestock_points')) %>% as.data.frame() %>% # select(c("name_cleaner","name_cleaner_nospace")) %>% # distinct() %>% # filter(!duplicated(name_cleaner_nospace)) #for some reason multiple names are maping to the same name without spaces # rownames(temp) <- temp$name_cleaner_nospace # # flatfiles_sf_roi$name_cleaner_spaced <- temp[flatfiles_sf_roi$name_cleaner,"name_cleaner"] # # condition <- flatfiles_sf_roi$source_dataset %in% c('livestock_boundaries', 'livestock_points') & !is.na(flatfiles_sf_roi$name_cleaner_spaced) ; table(condition) # flatfiles_sf_roi$name_cleaner[condition] <- flatfiles_sf_roi$name_cleaner_spaced[condition] flatfiles_sf_roi$eventsource <- flatfiles_sf_roi$source_dataset %in% "events" # Get lat longs back cords <- st_coordinates(flatfiles_sf_roi %>% filter(geometry_type %in% "POINT")) condition1 <- flatfiles_sf_roi$geometry_type %in% "POINT" & is.na(flatfiles_sf_roi$longitude) condition2 <- is.na(flatfiles_sf_roi$longitude)[flatfiles_sf_roi$geometry_type %in% "POINT"] flatfiles_sf_roi$longitude[condition1] <- cords[condition2, 1] flatfiles_sf_roi$latitude[condition1] <- cords[condition2, 2] flatfiles_sf_roi <- flatfiles_sf_roi %>% # This distinct is very important, it correctly removes duplicates distinct(feature_code, latitude, longitude, name, source_dataset, geometry_type, region_of_interest_intersects, name_alternates, .keep_all = T) %>% mutate(name_clean = str_trim(tolower(name))) %>% mutate(name_clean_posessive = grepl("'s|`s", name_clean)) %>% mutate(name_cleaner = trimws(name_clean)) %>% mutate(name_cleaner = gsub("'s|`s", "", name_cleaner, fixed = T)) %>% mutate(name_cleaner = str_replace_all(name_cleaner, "[[:punct:]]|", "")) %>% mutate(name_cleaner = trimws(name_cleaner)) %>% mutate(name_cleaner_nospace = str_replace_all(name_cleaner, " ", "")) sort(unique(unlist(strsplit(flatfiles_sf_roi$name_clean, "")))) flatfiles_sf_roi$name_cleaner <- clean_noascii(flatfiles_sf_roi$name_cleaner) sort(unique(unlist(strsplit(flatfiles_sf_roi$name_cleaner, "")))) # only number and lowercase letters from now on # Geonames Code Descriptions # flatfiles_sf_roi$feature_code %>% janitor::tabyl( sort = TRUE) #512 unique codes geonames_code_descriptions <- as.data.frame(read_csv(system.file("extdata", "geonames_code_descriptions.csv", package = "MeasuringLandscape"), col_names = F)) names(geonames_code_descriptions) <- c("code", "code_txt", "description") rownames(geonames_code_descriptions) <- geonames_code_descriptions$code flatfiles_sf_roi$feature_code_txt <- tolower(geonames_code_descriptions[flatfiles_sf_roi$feature_code, "code_txt"]) flatfiles_sf_roi$feature_code_txt[is.na(flatfiles_sf_roi$feature_code_txt)] <- tolower(flatfiles_sf_roi$feature_code[is.na(flatfiles_sf_roi$feature_code_txt)]) table(flatfiles_sf_roi$feature_code[is.na(flatfiles_sf_roi$feature_code_txt)]) flatfiles_sf_roi$feature_code_txt <- gsub("_", " ", flatfiles_sf_roi$feature_code_txt) flatfiles_sf_roi$feature_code_txt <- gsub("(-ies)", " ", flatfiles_sf_roi$feature_code_txt, fixed = T) flatfiles_sf_roi$feature_code_txt <- gsub("(s)", " ", flatfiles_sf_roi$feature_code_txt, fixed = T) flatfiles_sf_roi$feature_code_txt <- gsub("(es)", " ", flatfiles_sf_roi$feature_code_txt, fixed = T) flatfiles_sf_roi$feature_code_txt <- trimws(flatfiles_sf_roi$feature_code_txt) flatfiles_sf_roi$place_hash <- apply(flatfiles_sf_roi, 1, digest, algo = "xxhash64") # create a hash id to reference rownames(flatfiles_sf_roi) <- flatfiles_sf_roi$id_hash # actually getting collisions saveRDS( flatfiles_sf_roi, file = glue::glue(getwd(), "/../inst/extdata/flatfiles_sf_roi.Rdata") ) # st_write(obj=flatfiles_sf_roi, # "/home/rexdouglass/Dropbox (rex)/Kenya Article Drafts/MeasuringLandscapeCivilWar/inst/extdata/flatfiles_sf_roi.gpkg", # delete_layer = TRUE) } flatfiles_sf_roi <- readRDS(system.file("extdata", "flatfiles_sf_roi.Rdata", package = "MeasuringLandscape"))
flatfiles_sf_roi %>% janitor::tabyl(source_dataset) %>% janitor::adorn_crosstab(., denom = "col", show_n = T, digits = 1, show_totals = T)
Different gazetteers record what type of location a name belongs to slightly differently. We've partially merged some categories to provide greater overlap.
flatfiles_sf_roi$feature_code_txt %>% janitor::tabyl(sort = TRUE) %>% janitor::adorn_crosstab(., denom = "all", rounding = "half up", show_n = T, digits = 1, show_totals = T) # 459 unique codes
Visual inspection reveals differences between the spatial coverage of each source. (Appendix Figure 1 and Appendix Figure 3)
```{R, fig.width=12, fig.height=8}
p_points <- flatfiles_sf_roi %>% filter(geometry_type %in% "POINT") %>% ggplot(aes(x = longitude, y = latitude, col = as.factor(source_dataset))) + geom_point(alpha = 0.1) + facet_wrap(~as.factor(source_dataset), drop = T) + guides(color = FALSE) + theme_bw() + xlim(35.67, 38.19) + ylim(-1.43285, 0.54543)
ggsave( filename = glue::glue(dir_figures, "flatfiles_sf_roi_facet_source_dataset_points.png"), plot = p_points, width = 12, height = 10 ) p_points
p_notpoints <- flatfiles_sf_roi %>% filter(!geometry_type %in% "POINT") %>% ggplot(aes(color = as.factor(source_dataset))) + ### geom_sf requires ggplot installed off of the dev server geom_sf(alpha = 0.1, size = 1) + facet_wrap(~as.factor(source_dataset), drop = T) + # ggtitle("Locations in Region of Interest Stratified by Source") + guides(color = FALSE) + xlim(35.67, 38.19) + ylim(-1.43285, 0.54543) + theme_bw()
ggsave( filename = glue::glue(dir_figures, "flatfiles_sf_roi_facet_source_dataset_notpoints.png"), plot = p_notpoints, width = 12, height = 6 ) p_notpoints
# Visually Comparing Spatial Precision of each Sources (Appendix Figure 2 on Longitude) ```{R, fig.width=12, fig.height=8} # Events and historical have the smoothest distributions, but that's misleading because conversion from degrees to decimal # Wikidata has lots of truncated precision entries, at 0 degrees # TGN has so few it's hard to tell # Open streetmap has a disproportionate number of business in the capital which leads to a weird distribution # NGA, geonames and historical look like a comb # livestock looks like comb with teeth missing # Events look fairly continuous with some spikes # It looks like a lot of these that don't match have truncated precision p_combs_lat <- flatfiles_sf_roi %>% filter(geometry_type %in% "POINT") %>% as.data.frame() %>% select(latitude, source_dataset) %>% ggplot(aes(x = latitude - round(latitude), col = as.factor(source_dataset))) + geom_histogram(bins = 1000) + facet_wrap(~source_dataset, scales = "free") ggsave( filename = glue::glue(dir_figures, "p_combs_lat.pdf"), plot = p_combs_lat, width = 12, height = 6 ) p_combs_lat p_combs_long <- flatfiles_sf_roi %>% filter(geometry_type %in% "POINT") %>% as.data.frame() %>% select(longitude, source_dataset) %>% ggplot(aes(x = longitude - round(longitude), col = as.factor(source_dataset))) + geom_histogram(bins = 1000) + facet_wrap(~source_dataset, scales = "free") ggsave( filename = glue::glue(dir_figures, "p_combs_long.pdf"), plot = p_combs_long, width = 12, height = 6 ) p_combs_long temp <- flatfiles_sf_roi %>% filter(geometry_type %in% "POINT") %>% as.data.frame() %>% select(longitude, source_dataset) %>% group_by(source_dataset) %>% mutate(longitude_trunc = longitude - round(longitude)) #temp %>% filter(source_dataset %in% "historical") %>% ungroup() %>% select("longitude_trunc") %>% as.vector() %>% table() #temp %>% filter(source_dataset %in% "geonames") %>% ungroup() %>% select("longitude_trunc") %>% as.vector() %>% round(5) %>% table() %>% sort(decreasing = T) %>% as.data.frame() %>% filter(Freq > 10) temp <- flatfiles_sf_roi %>% filter(geometry_type %in% "POINT") %>% as.data.frame() %>% select(longitude, source_dataset) %>% group_by(source_dataset) %>% arrange(longitude) %>% mutate(longitude_l1 = lag(longitude, 1)) %>% mutate(longitude_fd = round(longitude - longitude_l1, 4)) d <- as.matrix(table(temp$longitude_fd, temp$source_dataset)) # ok so the most common difference between points is 0.016. That's about 1.776 km
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.