This file demonstrates that the kinds of locations that are imputed are different from the true locations, in terms of things like population, distance from roads, ruggedness, etc.

rm(list=ls()); gc()
# !diagnostics off
library(MeasuringLandscape)
library(tidyverse)

dir_figures <- glue::glue(getwd(), "/../paper/figures/")

gc()

knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8,  warning=FALSE, message=FALSE, cache=TRUE)
options(width = 160)
#Load Events
events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")) 

events_sf_text_coord_unique <- plyr::ddply(events_sf[,c('location_text','name_clean','name_cleaner','document_district_clean','map_coordinate_clean_latitude','map_coordinate_clean_longitude')],
                                     "location_text", transform,
      map_coordinate_has =sum(!is.na(map_coordinate_clean_latitude))
      )
#Reload from scratch each time in case we subset sometehing weirdly
georef_all_dt <- readRDS(system.file("extdata", "georef_all_dt_recomendations.Rds", package = "MeasuringLandscape")) 

table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner) #All events are in here
table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner[!is.na(georef_all_dt$georef_b)]) #7,742 events with at least one gazeteer suggestion

#Exclude all distance = 0 obs, those are self matches
georef_all_dt <- subset(georef_all_dt, 
                        !is.na(name_cleaner) & # must have a name
                        (is.na(distance_km) | distance_km!=0)  ) #Can be either missing or not zero. Only thing we drop is zero because that's a self match

Create a subset just for those events that have both original coords and a potential match

#Subset to make sure both sides have lat/long
georef_all_dt_covariates <- subset(georef_all_dt, !is.na(X1) & !is.na(X2)) ; dim(georef_all_dt_covariates)

#Create an events dataset for merging on covariates
georef_all_dt_covariates_events <- georef_all_dt_covariates[!duplicated(event_hash)]
georef_all_dt_covariates_events_sf <- georef_all_dt_covariates_events  %>% data.frame() %>% sf::st_as_sf(coords = c("X1","Y1"),  crs = 4326, agr = "constant", remove=F, na.fail =T)
georef_all_dt_covariates_events_sf$imputed <- 0

#Create a gaz dataset for mering on covarates
georef_all_dt_covariates_gaz_sf <- georef_all_dt_covariates[,distance_km_min:=min(distance_km, na.rm=T),
                                                            by=list(event_hash, source_dataset)]
georef_all_dt_covariates_gaz_sf <- georef_all_dt_covariates_gaz_sf[distance_km==distance_km_min]

georef_all_dt_covariates_gaz_sf <- georef_all_dt_covariates_gaz_sf[,head(.SD, 1), by=list(event_hash, source_dataset)] %>% 
                                  data.frame() %>% sf::st_as_sf(coords = c("X2","Y2"),  crs = 4326, agr = "constant", remove=F, na.fail =T)
georef_all_dt_covariates_gaz_sf$imputed <- 1

Load all of the spatial files for the covariates

#covariate_list <- readRDS( '/home/rexdouglass/Dropbox (rex)/Kenya Article Drafts/MeasuringLandscapeCivilWar/inst/extdata/covariate_list.Rds' )
covariate_list <-  MeasuringLandscape:::prep_covariates()

sapply(covariate_list, FUN=function(q) class(q)[1])

Extract the values of covariates found at each location and imputed location

#georef_all_dt_covariates_events_sf$district <- new_over(georef_all_dt_covariates_events_sf , covariate_list[[1]]  , 'name' )
#georef_all_dt_covariates_events_sf$cadastral <- new_over(georef_all_dt_covariates_events_sf , covariate_list[[2]]  , 'name' )
#georef_all_dt_covariates_events_sf$language <- new_over(georef_all_dt_covariates_events_sf , covariate_list[[3]]  , 'LANGUAGE' )
#georef_all_dt_covariates_events_sf$tribe <- new_over(georef_all_dt_covariates_events_sf , covariate_list[[4]]  , 'Tribe' )
georef_all_dt_covariates_events_sf$rain <- MeasuringLandscape:::new_over(georef_all_dt_covariates_events_sf ,
                                                    covariate_list[['raster_rain']]  , 'Tribe' )
georef_all_dt_covariates_events_sf$population <- MeasuringLandscape:::new_over(georef_all_dt_covariates_events_sf ,
                                                          covariate_list[['pop_raster_roi']]  , '' )
georef_all_dt_covariates_events_sf$treecover <- MeasuringLandscape:::new_over(georef_all_dt_covariates_events_sf ,
                                                         covariate_list[['forest_raster_roi']]  , '' )
georef_all_dt_covariates_events_sf$ruggedness <- MeasuringLandscape:::new_over(georef_all_dt_covariates_events_sf ,
                                                          covariate_list[['ruggedness_raster_roi']]  , '' )
georef_all_dt_covariates_events_sf$roads_distance <- MeasuringLandscape:::new_over(georef_all_dt_covariates_events_sf ,
                                                              covariate_list[['roads_distance_to']]  , '' )

#georef_all_dt_covariates_events_sf$landuse <- new_over(georef_all_dt_covariates_events_sf , covariate_list[[9]]  , 'LANDUSE' )
#georef_all_dt_covariates_gaz_sf$district <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[1]]  , 'name' )
#georef_all_dt_covariates_gaz_sf$cadastral <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[2]]  , 'name' )
#georef_all_dt_covariates_gaz_sf$language <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[3]]  , 'LANGUAGE' )
#georef_all_dt_covariates_gaz_sf$tribe <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[4]]  , 'Tribe' )
#georef_all_dt_covariates_gaz_sf$rain <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[5]]  , '' )
#georef_all_dt_covariates_gaz_sf$population <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[6]]  , '' )
#georef_all_dt_covariates_gaz_sf$treecover <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[7]]  , '' )
#georef_all_dt_covariates_gaz_sf$ruggedness <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[8]]  , '' )
#georef_all_dt_covariates_gaz_sf$roads_distance <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[9]]  , '' )
#georef_all_dt_covariates_gaz_sf$landuse <- new_over(georef_all_dt_covariates_gaz_sf , covariate_list[[9]]  , 'LANDUSE' )

Plot the differences of points at real locations versus their imputed counterparts

sentence_case <- function(x) stringr::str_to_sentence(tolower(gsub("_"," ",x)))


#Make that bias in the coef plot
data.table::setkey(georef_all_dt, "rule_ensemble") #sort by ensemble score, pick the best option for each source
georef_all_dt_source <- georef_all_dt[,.SD[1], by=list(event_hash,
                                                       source_dataset) ]

temp <- as.data.frame(georef_all_dt_covariates_events_sf)[,c('source_dataset',
                                                          'population',
                                                          'treecover',
                                                          'ruggedness',
                                                          'roads_distance',
                                                          'rain')
                                                         ] %>% 
                                                           group_by(source_dataset) %>% 
                                                          summarise_all(funs(mean, .args =list(na.rm=T)) )

#I don't think we want to scale this, I think we want to do percentages
#temp[,2:6] <- lapply(temp[,2:6], scale)

#temp[,2:6] <- temp[,2:6] - temp[rep(2,nrow(temp)),2:6] #This demeans each
test <- as.numeric(unlist(as.vector(data.frame(temp[2,]))))
temp <- as.data.frame(temp)
temp[,2] <- temp[,2] / rep(test[2],10) #This demeans each
temp[,3] <- temp[,3] / rep(test[3],10)  #This demeans each
temp[,4] <- temp[,4] / rep(test[4],10)  #This demeans each
temp[,5] <- temp[,5] / rep(test[5],10)  #This demeans each
temp[,6] <- temp[,6] / rep(test[6],10)  #This demeans each

temp <- temp %>% tidyr::gather(variable, value, -source_dataset)

p9 <-  temp %>% 
       mutate(source_dataset=sentence_case(source_dataset),
              variable=sentence_case(variable)
      ) %>%
       ggplot(aes(x = value,
                 y = as.factor(sub("_","",source_dataset)),
                 shape=as.factor(sub("_","",source_dataset)))) + 
                 geom_point() + 
                 facet_grid(.~variable, scales="free") + 
                 geom_vline(xintercept=1) + ylab(sentence_case("Source Dataset")) +
                 xlab(sentence_case("Ratio of Mean Value at Imputed Location Relative to True Location")) +
                 scale_shape_manual(values=1:15) + theme_bw() + theme(legend.position="none") 
p9
ggsave(
  filename = glue::glue(dir_figures, "p_bias_in_covariates_by_source.pdf"),
  plot = p9, width = 16, height = 3
)


rexdouglass/MeasuringLandscape documentation built on May 13, 2019, 6:16 p.m.