This file demonstrates that the kinds of locations that are imputed are different from the true locations, in terms of things like population, distance from roads, ruggedness, etc.
rm(list=ls()); gc() # !diagnostics off library(MeasuringLandscape) library(tidyverse) dir_figures <- glue::glue(getwd(), "/../paper/figures/") gc() knitr::opts_knit$set(progress = TRUE, verbose = TRUE) knitr::opts_chunk$set(fig.width=12, fig.height=8, warning=FALSE, message=FALSE, cache=TRUE) options(width = 160)
#Load Events events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")) events_sf_text_coord_unique <- plyr::ddply(events_sf[,c('location_text','name_clean','name_cleaner','document_district_clean','map_coordinate_clean_latitude','map_coordinate_clean_longitude')], "location_text", transform, map_coordinate_has =sum(!is.na(map_coordinate_clean_latitude)) )
#Reload from scratch each time in case we subset sometehing weirdly georef_all_dt <- readRDS(system.file("extdata", "georef_all_dt_recomendations.Rds", package = "MeasuringLandscape")) table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner) #All events are in here table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner[!is.na(georef_all_dt$georef_b)]) #7,742 events with at least one gazeteer suggestion #Exclude all distance = 0 obs, those are self matches georef_all_dt <- subset(georef_all_dt, !is.na(name_cleaner) & # must have a name (is.na(distance_km) | distance_km!=0) ) #Can be either missing or not zero. Only thing we drop is zero because that's a self match
Create a subset just for those events that have both original coords and a potential match
#Subset to make sure both sides have lat/long georef_all_dt_covariates <- subset(georef_all_dt, !is.na(X1) & !is.na(X2)) ; dim(georef_all_dt_covariates) #Create an events dataset for merging on covariates georef_all_dt_covariates_events <- georef_all_dt_covariates[!duplicated(event_hash)] georef_all_dt_covariates_events_sf <- georef_all_dt_covariates_events %>% data.frame() %>% sf::st_as_sf(coords = c("X1","Y1"), crs = 4326, agr = "constant", remove=F, na.fail =T) georef_all_dt_covariates_events_sf$imputed <- 0 #Create a gaz dataset for mering on covarates georef_all_dt_covariates_gaz_sf <- georef_all_dt_covariates[,distance_km_min:=min(distance_km, na.rm=T), by=list(event_hash, source_dataset)] georef_all_dt_covariates_gaz_sf <- georef_all_dt_covariates_gaz_sf[distance_km==distance_km_min] georef_all_dt_covariates_gaz_sf <- georef_all_dt_covariates_gaz_sf[,head(.SD, 1), by=list(event_hash, source_dataset)] %>% data.frame() %>% sf::st_as_sf(coords = c("X2","Y2"), crs = 4326, agr = "constant", remove=F, na.fail =T) georef_all_dt_covariates_gaz_sf$imputed <- 1
#covariate_list <- readRDS( '/home/rexdouglass/Dropbox (rex)/Kenya Article Drafts/MeasuringLandscapeCivilWar/inst/extdata/covariate_list.Rds' ) covariate_list <- MeasuringLandscape:::prep_covariates() sapply(covariate_list, FUN=function(q) class(q)[1])
#georef_all_dt_covariates_events_sf$district <- new_over(georef_all_dt_covariates_events_sf , covariate_list[[1]] , 'name' ) #georef_all_dt_covariates_events_sf$cadastral <- new_over(georef_all_dt_covariates_events_sf , covariate_list[[2]] , 'name' ) #georef_all_dt_covariates_events_sf$language <- new_over(georef_all_dt_covariates_events_sf , covariate_list[[3]] , 'LANGUAGE' ) #georef_all_dt_covariates_events_sf$tribe <- new_over(georef_all_dt_covariates_events_sf , covariate_list[[4]] , 'Tribe' ) georef_all_dt_covariates_events_sf$rain <- MeasuringLandscape:::new_over(georef_all_dt_covariates_events_sf , covariate_list[['raster_rain']] , 'Tribe' ) georef_all_dt_covariates_events_sf$population <- MeasuringLandscape:::new_over(georef_all_dt_covariates_events_sf , covariate_list[['pop_raster_roi']] , '' ) georef_all_dt_covariates_events_sf$treecover <- MeasuringLandscape:::new_over(georef_all_dt_covariates_events_sf , covariate_list[['forest_raster_roi']] , '' ) georef_all_dt_covariates_events_sf$ruggedness <- MeasuringLandscape:::new_over(georef_all_dt_covariates_events_sf , covariate_list[['ruggedness_raster_roi']] , '' ) georef_all_dt_covariates_events_sf$roads_distance <- MeasuringLandscape:::new_over(georef_all_dt_covariates_events_sf , covariate_list[['roads_distance_to']] , '' ) #georef_all_dt_covariates_events_sf$landuse <- new_over(georef_all_dt_covariates_events_sf , covariate_list[[9]] , 'LANDUSE' ) #georef_all_dt_covariates_gaz_sf$district <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[1]] , 'name' ) #georef_all_dt_covariates_gaz_sf$cadastral <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[2]] , 'name' ) #georef_all_dt_covariates_gaz_sf$language <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[3]] , 'LANGUAGE' ) #georef_all_dt_covariates_gaz_sf$tribe <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[4]] , 'Tribe' ) #georef_all_dt_covariates_gaz_sf$rain <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[5]] , '' ) #georef_all_dt_covariates_gaz_sf$population <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[6]] , '' ) #georef_all_dt_covariates_gaz_sf$treecover <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[7]] , '' ) #georef_all_dt_covariates_gaz_sf$ruggedness <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[8]] , '' ) #georef_all_dt_covariates_gaz_sf$roads_distance <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[9]] , '' ) #georef_all_dt_covariates_gaz_sf$landuse <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[9]] , 'LANDUSE' )
sentence_case <- function(x) stringr::str_to_sentence(tolower(gsub("_"," ",x))) #Make that bias in the coef plot data.table::setkey(georef_all_dt, "rule_ensemble") #sort by ensemble score, pick the best option for each source georef_all_dt_source <- georef_all_dt[,.SD[1], by=list(event_hash, source_dataset) ] temp <- as.data.frame(georef_all_dt_covariates_events_sf)[,c('source_dataset', 'population', 'treecover', 'ruggedness', 'roads_distance', 'rain') ] %>% group_by(source_dataset) %>% summarise_all(funs(mean, .args =list(na.rm=T)) ) #I don't think we want to scale this, I think we want to do percentages #temp[,2:6] <- lapply(temp[,2:6], scale) #temp[,2:6] <- temp[,2:6] - temp[rep(2,nrow(temp)),2:6] #This demeans each test <- as.numeric(unlist(as.vector(data.frame(temp[2,])))) temp <- as.data.frame(temp) temp[,2] <- temp[,2] / rep(test[2],10) #This demeans each temp[,3] <- temp[,3] / rep(test[3],10) #This demeans each temp[,4] <- temp[,4] / rep(test[4],10) #This demeans each temp[,5] <- temp[,5] / rep(test[5],10) #This demeans each temp[,6] <- temp[,6] / rep(test[6],10) #This demeans each temp <- temp %>% tidyr::gather(variable, value, -source_dataset) p9 <- temp %>% mutate(source_dataset=sentence_case(source_dataset), variable=sentence_case(variable) ) %>% ggplot(aes(x = value, y = as.factor(sub("_","",source_dataset)), shape=as.factor(sub("_","",source_dataset)))) + geom_point() + facet_grid(.~variable, scales="free") + geom_vline(xintercept=1) + ylab(sentence_case("Source Dataset")) + xlab(sentence_case("Ratio of Mean Value at Imputed Location Relative to True Location")) + scale_shape_manual(values=1:15) + theme_bw() + theme(legend.position="none") p9 ggsave( filename = glue::glue(dir_figures, "p_bias_in_covariates_by_source.pdf"), plot = p9, width = 16, height = 3 )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.