Demonstrate what kinds of events tend to systematically get excluded. Here, in terms of whether the event would have received an original military coordinate or not.

rm(list=ls()); gc()
# !diagnostics off
library(MeasuringLandscape)
library(tidyverse)

dir_figures <- glue::glue(getwd(), "/../paper/figures/")

gc()

knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8,  warning=FALSE, message=FALSE, cache=TRUE)
options(width = 160)

sentence_case <- function(x) stringr::str_to_sentence(tolower(gsub("_"," ",x)))
#Load Events
events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")) 

events_sf_text_coord_unique <- plyr::ddply(events_sf[,c('location_text',
                                    'name_clean','name_cleaner','document_district_clean','map_coordinate_clean_latitude','map_coordinate_clean_longitude')],
                                     "location_text", transform,
      map_coordinate_has =sum(!is.na(map_coordinate_clean_latitude))
      )

Plot the predicted effects for a single model, Mil. Coords or no Mil. Coords

pred_cords <- MeasuringLandscape:::predict_missingness_dv(is.na(events_sf$map_coordinate_clean_latitude))
rf <- pred_cords$xb_model
train <- pred_cords$x_all_pre_dummy
label <- pred_cords$label
x_all_pre_dummy <- pred_cords$x_all_pre_dummy
x_all <- dummies::dummy.data.frame(pred_cords$x_all_pre_dummy,
                                   all=T,
                                   dummy.classes=c('character','factor','ordered'))

dtrain <- xgboost::xgb.DMatrix(data=as.matrix( x_all ), 
                      label = label, missing = NA )

#options(na.action='na.pass')
#testdata_dummy <-  model.matrix(~ . - 1, pred_cords$x_all_pre_dummy)
#options(na.action='na.omit')
testdata_dummy <- x_all_pre_dummy %>% as.tibble %>% fastDummies::dummy_columns() %>% dplyr::select_if( (is.numeric) )

#testdata_dummy <- dummies::dummy.data.frame(x_all_pre_dummy, drop=F,
#                                   dummy.classes=c('character','factor','ordered'))
dtest <- xgboost::xgb.DMatrix(data=as.matrix( testdata_dummy ),  missing = NA ) 
dtest <- xgboost::xgb.DMatrix(data=as.matrix( pred_cords$postdummy ),  missing = NA )

Importance scores for each variable, predicting the missingness of exact map coordinates as a function of each event's details.

importance_importance <- xgboost::xgb.importance(feature_names=names(testdata_dummy),
                                        model = rf)
xgboost::xgb.plot.importance(importance_importance)
histogram=T
scale=2
a <- MeasuringLandscape:::plot_partial_effects(rf=pred_cords$xb_model,
                  outcome="mapcoordinate_clean_missing",var="document_date_type",minsize=100,
                  scale=scale,histogram=T)

b <- MeasuringLandscape:::plot_partial_effects(rf=pred_cords$xb_model,
                  outcome="mapcoordinate_clean_missing",var="document_date_best_year",minsize=100,
                  scale=scale,histogram=T)

c <- MeasuringLandscape:::plot_partial_effects(rf=pred_cords$xb_model,
                  outcome="mapcoordinate_clean_missing",var="initiator_clean_1_aggmed",minsize=100,
                  scale=scale,histogram=T)



d <- MeasuringLandscape:::plot_partial_effects(rf=pred_cords$xb_model,
                  outcome="mapcoordinate_clean_missing",var="target_clean_1_aggmed",minsize=100,
                  scale=scale,histogram=T)

e <- MeasuringLandscape:::plot_partial_effects(rf=pred_cords$xb_model,
                  outcome="mapcoordinate_clean_missing",var="type_clean_aggmed",minsize=100,
                  scale=scale,histogram=T)


f <- MeasuringLandscape:::plot_partial_effects(rf=pred_cords$xb_model,
                  outcome="mapcoordinate_clean_missing",var="document_unit_type",minsize=100,
                  scale=scale,histogram=T)


g <- MeasuringLandscape:::plot_partial_effects(rf=pred_cords$xb_model,
                  outcome="mapcoordinate_clean_missing", var="document_district_clean",
                  minsize=100, train=pred_cords$x_all_pre_dummy ,
                  scale=scale,histogram=T)

h <- MeasuringLandscape:::plot_partial_effects(rf=pred_cords$xb_model,
                  outcome="mapcoordinate_clean_missing",var="event_date_clean_year",minsize=100,
                  scale=scale,histogram=T)

#e <- plot_partial_effects(rf=pred_cords$xb_model,
#                  outcome="mapcoordinate_clean_missing",var="locationtext_ruleclean_suffix",minsize=100)
#p_load(cowplot)
final_histogram <- cowplot::plot_grid(

  a+ggtitle(sentence_case('Document Date Type')),
  b+ggtitle(sentence_case('Document Year')),
  c+ggtitle(sentence_case('Initiator')),
  d+ggtitle(sentence_case('Target')),
  e+ggtitle(sentence_case('Act Type')),
  f+ggtitle(sentence_case('Document Unit')),
  g+ggtitle(sentence_case('Document District')),
  h+ggtitle(sentence_case('Event Year')) + ylab(sentence_case("Probability of Missing Military Coordinates")),
  #i+ggtitle('Reporting Office'),
  ncol = 3, align = "hv" ) #,rel_heights=heights)
final_histogram

ggsave(
  filename = glue::glue(dir_figures, "rf_mapcoordinate_clean_missing.pdf"),
  plot = final_histogram, width = 10, height = 8
)


rexdouglass/MeasuringLandscape documentation built on May 13, 2019, 6:16 p.m.