This is the entry point for the paper "Measuring the Landscape of Civil War."

In this file, a raw csv file of the events dataset created for the Mau Mau rebellion is loaded and processed.

Load Library

# Start from an empty workspace and release the freed memory.
rm(list = ls())
gc()

# Project package first, then the tidyverse so that %>% is attached.
library(MeasuringLandscape)
library(tidyverse)

# knitr settings: report progress verbosely; wide, cached, quiet chunks.
knitr::opts_knit$set(verbose = TRUE, progress = TRUE)
knitr::opts_chunk$set(
  cache = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.height = 8,
  fig.width = 12
)
options(width = 160)

Load Events Data

# Build the events table with the package's internal loader and report its size.
# NOTE(review): fromscratch = FALSE presumably reuses a previously prepared copy
# instead of re-reading the raw csv -- confirm against prep_events().
events <- MeasuringLandscape:::prep_events(fromscratch = FALSE)
print(dim(events))

Dates

Basic cleaning. The format is usually DD.MM.YYYY, but sometimes several days are given as DD1/DD2.MM.YYYY (e.g. 01/02.12.1950). The year is sometimes written as YY rather than YYYY.

# Turn a free-text event date into a Date.
# Steps: drop an extra leading day ("01/02.12.1950" -> "02.12.1950"), convert
# periods to slashes, trim, expand the two-digit years 52-56 to 1952-1956,
# repair one known typo, then parse as day-month-year.
clean_event_date <- function(raw_date) {
  raw_date %>%
    stringr::str_replace_all("[[:digit:]]+/", "") %>%
    stringr::str_replace_all("\\.", "/") %>%
    trimws() %>%
    stringr::str_replace_all("/(5[2-6])", "/19\\1") %>%
    stringr::str_replace_all("/19524", "/1954") %>%
    lubridate::dmy()
}

events <- dplyr::mutate(events, event_date_clean = clean_event_date(event_date))

# Show the raw values whose cleaned date could not be parsed.
events %>%
  filter(is.na(event_date_clean)) %>%
  dplyr::select(starts_with("event_date")) %>%
  distinct() %>%
  print(n = 40)

# Year of the cleaned date, with its frequency table.
events$event_date_clean_year <- lubridate::year(events$event_date_clean)
events$event_date_clean_year %>%
  janitor::tabyl() %>%
  round(3)

How often are event dates missing?

# Count NA vs non-NA entries in the raw event_date column (not the cleaned one).
table(is.na(events$event_date))

The documents also have dates, sometimes spanning a period of time. We can use those to narrow down missing event dates.

# Label each document date by the kind of period wording it uses, then tabulate.
# The text is lower-cased first, so every pattern must be lower-case as well
# (the former "For " and "Document missing" alternatives could never match;
# "document missing" is already covered by "missing").
# With .method = "last" the last rule that is TRUE wins, so broader patterns
# come before the more specific ones ("ending" < "week" < "week ending");
# previously "ending" came last and made "week ending" unreachable.
# "unknown" is always TRUE and therefore acts as the fallback label.
(events$document_date_type <- events$document_date %>% 
                             tolower() %>% 
                             mosaic::derivedFactor(
                                          "unknown" = TRUE,
                                          "missing"     = stringr::str_detect(.,"obscured|missing|illegible|xx"),
                                          "on the"      = stringr::str_detect(.,"on the"),
                                          "to"          = stringr::str_detect(.," to"),
                                          "for"         = stringr::str_detect(.,"for "),
                                          "period"      = stringr::str_detect(.,"period"),
                                          "fortnight"   = stringr::str_detect(.,"fortnight"),
                                          "ending"      = stringr::str_detect(.,"ending"),
                                          "week"        = stringr::str_detect(.,"week"),
                                          "week ending" = stringr::str_detect(.,"week ending"),
                                          .method = "last",
                                          .default = "unknown"
                            ) 
 ) %>% janitor::tabyl() 


# Strip period wording and ordinal suffixes from the document date text so that
# only the date (or a "start to end" range) remains.
# - The text is lower-cased first, so the wording patterns are lower-case too
#   (the former "Fortnight Ended " and "From " alternatives never matched).
# - Ordinal suffixes are removed only when they directly follow a digit
#   ("4th" -> "4"). The former "[Digits]*th|..." pattern was a class of the
#   letters D, i, g, t, s and also removed "st"/"nd"/"rd"/"th" inside words
#   (e.g. "august" -> "augu").
events$document_date_clean <- events$document_date %>%
  tolower() %>%
  stringr::str_replace_all(
    "fortnight ended |period|week ending|for |the |fortnight |ending |week |from |on ",
    ""
  ) %>%
  stringr::str_replace_all("(?<=[[:digit:]])(?:th|st|rd|nd)\\b", "")

# Split a document date range into its first and second part.
# Columns left over from an earlier run are dropped first, because separate()
# would otherwise keep adding new ones (one_of() warns when they are absent).
document_date_parts <- c("document_date_1", "document_date_2")
events <- dplyr::select(events, -one_of(document_date_parts))
events <- tidyr::separate(
  events,
  col = document_date_clean,
  into = document_date_parts,
  sep = " to|to |To | - ",
  remove = FALSE,
  extra = "drop",
  fill = "right"
)

# Parse one half of a document date range into a Date: drop an extra leading
# day ("01/02.12.1950" -> "02.12.1950"), turn periods into slashes, trim, and
# read the result as day-month-year.
parse_document_date <- function(date_text) {
  without_extra_day <- stringr::str_replace_all(date_text, "[[:digit:]]+/", "")
  with_slashes <- stringr::str_replace_all(without_extra_day, "\\.", "/")
  lubridate::dmy(trimws(with_slashes))
}

events$document_date_clean_1 <- parse_document_date(events$document_date_1)
events$document_date_clean_2 <- parse_document_date(events$document_date_2)

# Show the document dates whose first part could not be parsed.
events %>%
  filter(is.na(document_date_clean_1)) %>%
  dplyr::select(starts_with("document_date")) %>%
  distinct() %>%
  print(n = 40)


# Best single document date: the second part of the range when it parsed,
# otherwise the first part.
condition <- is.na(events$document_date_clean_2)
events$document_date_best_date <- dplyr::if_else(
  condition,
  events$document_date_clean_1,
  events$document_date_clean_2
)

# Year of the best document date, with its frequency table.
events$document_date_best_year <- lubridate::year(events$document_date_best_date)
events$document_date_best_year %>%
  janitor::tabyl() %>%
  round(3)

Type of Event

cat("\014")

# Title-cased, trimmed copy of the raw event type.
events$type_clean <- stringr::str_trim(stringi::stri_trans_totitle(events$type))

# Low-level event categories: each name is the category, each vector lists the
# lower-cased raw spellings folded into it.
# NOTE(review): the empty string is folded into 'murder' -- confirm this is intended.
type_groups_low <- list(
  'desertion' = 'desertion',
  'escape' = 'escape',
  'abduction' = c('abduction', 'kidnapping', 'kidnap', 'kitnap', 'kindnap'),
  'assault' = c('assault', 'attack', 'assaulted', 'assaults', 'assualt', 'assult'),
  'murder' = c('murder', 'elimination', 'kidnap / murder', ''),
  'arson' = c('arson', 'burn'),
  'cattle slashing' = c('slashed', 'stampede'),
  'vandalism' = 'vandalism',
  'theft' = c('theft', 'thefts', 'thet', 'missing', 'lost', 'entry'),
  'punishment' = c('confiscate', 'sentenced'),
  'rebel capture' = c('capture', 'captured'),
  'oathing' = c('oath', 'oathing', 'recruitment', 'recruited'),
  'contact' = c(
    'contact', 'caontact', 'contacts', 'drove off', 'drive off', 'drove  off',
    'chased off', 'broke up oathing', 'ambush'
  ),
  'patrol' = c('patrol', 'police and kpr patrol', 'sweep'),
  'screening' = c('screening', 'sreening'),
  'unclassified' = c('type')
)

# fct_collapse() is called through do.call() so the groups can be kept in a list.
events$type_clean_agglow <- do.call(
  forcats::fct_collapse,
  c(list(tolower(stringr::str_trim(events$type_clean))), type_groups_low)
)

events$type_clean_agglow %>%
  janitor::tabyl(sort = TRUE) %>%
  janitor::adorn_crosstab(., digits = 1)

Collapse Event Types

# Medium-level event types: merge the low-level categories into broader groups.
events$type_clean_aggmed <- forcats::fct_collapse(
  events$type_clean_agglow,
  'physical violence' = c('abduction', 'assault', 'murder'),
  'property destruction' = c('vandalism', 'arson', 'cattle slashing'),
  'theft' = c('theft'),
  'security operations' = c('contact', 'screening', 'sreening', 'patrol', 'punishment'),
  'unclassified' = c('desertion', 'escape', 'unclassified')
)
events$type_clean_aggmed %>%
  janitor::tabyl(sort = TRUE) %>%
  janitor::adorn_crosstab(digits = 1)

# High-level event types: rebel activity versus government activity.
events$type_clean_agghigh <- forcats::fct_collapse(
  events$type_clean_aggmed,
  'rebel activity' = c('oathing', 'physical violence', 'property destruction', 'theft'),
  'government activity' = c('rebel capture', 'security operations')
)
events$type_clean_agghigh %>%
  janitor::tabyl(sort = TRUE) %>%
  janitor::adorn_crosstab(digits = 1)

Initiator of Event

Collapsed Initiators to just Rebels, Government, and Civilians

cat("\014")

# Master lookup shared by initiators and targets: each name is a low-level
# category and each vector lists the cleaned raw spellings folded into it.
# It is passed to forcats::fct_collapse() via do.call().
# A spelling can only end up in one category, so it should be listed under a
# single group. 'ammunition' was also listed under 'firearms', which left the
# 'ammunition' category unreachable; it now appears only under 'ammunition'.
# NOTE(review): 'army' is still listed under both 'british military' and
# 'military (generic)'; forcats appears to keep the later group -- decide which
# one is intended and remove the other entry.
initiator_target_master_clean <- list(

'ammunition'=c('ammunition'),

'explosives'=c('explosives', 'gelignite'),

'firearms'=c('firearms','arms', 'firearm', 'gun', 'pistol', 'rifle', 'rifile', 'shotgun', 'verey pistol'),

'other weapons'=c('other weapons','axe','scabbard','weapons'),

'colonial authorities'=c('colonial authorities','councillor', 'district commissioner', 'district officer', 'forest ranger', 'game ranger', 'game warden', 'government', 'government employees', 'port authority', 'public works department', 'screening team' , 'do', 'govrnment', 'wakamba screening team', 'do munuga','african do','dcmeru', 'colonial authorities' ,'govtemployee' ),

'tribal authorities'=c('tribal authorities','chief', 'elders', 'headman' , 'chief chostram','chief eliud', "chief's sentry") ,

'private property'=c('private property','buildings', 'cattle dip', 'duka', 'farms', 'garage', 'homes','huts', 'hotel', 'land rover', 'lorry', 'market', 'office', 'oxcart', 'property',  'pump house', 'sawmill', 'shops', 'stores', 'tractor', 'vehicle', 'windmill' , "bullock's farm",'cattle boma','coffe trees','coffee trees', 'cuthouse','dairy farm','dip','house','household', 'houses','hut','instrument','labour camp post','labour huts','lorries','lucerne sheds','maize shamba', 'milk factory','pig sty','private property', 'property of civilians','shop','store','thika fishing camp','vehicles'),

'cash'=c('cash', 'funds', 'money' , "conductor's takings"),

'food'=c('banana', 'barley', 'bran', 'cabbage', 'coffee', 'corn', 'cream', 'crops', 'dairy', 'food', 'fruit', 'grain', 'honey', 'maize',  'meat', 'milk', 'oats', 'posho', 'potatoes', 'sugar', 'vegetable', 'wheat',
'food','food etc','food store','food stores','foodstuffs','fruits','grains', 'grains+cloth +money','green maize cobs','potato','potato store','potatos','skimmed milk','sugar cane','sugar maize','vegetables','vegitable garden', 'vegitables','wheat bags','wheat store','wheet','whisky') ,

'livestock'=c('beast', 'cattle', 'cow', 'herd', 'livestock', 'pig', 'sheep', 'steer', 'stock',
'animal', 'bulls','calf','calves','chicken','cows','donkey','goat','goats',
'head of cattle','head of cow','head of sheep','heifer','heifers',
'lamb','live stock','livestock','livestocks','masai herd','milk cow','ox','ox cart',
'oxen','ram','red poll cattle','shee','sheep or ox','steers','stocks' ),

'medicine'=c('medical supplies', 'medicine', 'm&b tablets', 'medicines'),

'supplies'=c('supplies','bags', 'bedding', 'blankets', 'books', 'charcoal', 'cloth', 'clothing', 'cooking utensils', 'cutlery', 'equipment', 'farm implements', 
'household items','instruments', 'iron', 'pails','petrol', 'provisions', 'oil', 'sacks', 'supplies', 'tarpaulin', 'thatch', 'timber', 
'tobacco', 'tools', 'uniforms', 'wire', 'wireless set', 'whiskey', 'articles','bag','battery','bucket','ciga','cigarettes','clothes',
'clothing etc','cloths','dairy item','dairy record book','goods', 'material','oil+tins','provisionv','railway uniforms','supplies', 'tarpaulian','typewriter','v- drive belts', 'gunny bags' ),

'church'=c('church'),

'infrastructure'=c('airstrip', 'bridges', 'half built village', 'roads', 'trenches', 'water tank','bridge', 'bridge broken', 'bridge damaged', 'infrastructure', 'milt property', 'miltproperty', 'prison camp','stn damaged'),

'school'=c('school', 'school','school building','school house','school property','schools'),

'home guard'=c('bg','kg','eg', 'guard','embu guard', 'farm guard', 'forest guard', 'home guard','ikandine guard', 'kathanjure guard', 'kijabe guard',
'kikuyu guard', 'masai guard', 'meru guard', 'nandi guard', 'nkubu guard', 'stock guard', 'tigoni guard','tp and eg patrol','hg','tp patrol','home guard patrol',
'm', 'm/g','m/g patrol','g', 'kathanjure hg','k g', 'ng', 'eg patrol', 'hg camp','hg leader','hg patrol','hg post','home','home guard','kg post'),

'arab combat units'=c('arab combat' , 'arab combat unit','arab combat units'),

'asian combat units'=c('asian combat', 'asian combat unit', 'asian combat team', 'second asian combat unit','asian combat units' ),

'kings african rifles'=c('kings african rifles','3 kar', '4 kar', '5 kar', '6 kar', '7 kar', '23 kar', '26 kar','k.a.r','k.p.r','k.a.r.', '5th k.a.r','5kar','5 k.a.r','4th kar','kar' ) ,

'british military'=c('british military', 'devonshire regiment','devons', 'field intelligence assistant', 'field intelligence officer', 'fio', 'gloucestershire regiment', 'glosters', 'lancashire fusiliers', 
"king's shropshire light infantry", 'royal east kent regiment', 'buffs', 'royal fusiliers', 'royal highland regiment','black watch', 'watch', 'royal inniskilling fusiliers', 'royal irish fusiliers', 'royal northumberland fusiliers', 'rnf','police and military', 'army' , 'lancashire fusilliers', 'sp company 1 royal innisks', '1 rnf', 'rif', 'ksli', 'inniskillings', 'fia','1 glosters', '1 bw', '1 buffs', 
'\"a\" company 1 royal innisks', '\"a\" company', 'royal fusilers', 'of devons','of 1 glosters', 'lanc fus', 'fusiliers', 'fio kruger','fios','a co devon','4 platoon support company',
'\"c\" company1 royal innisks','6 platoonsp company 1 royal innisks','1 lf', '\"c\" company', '\"d\" company','\"a\"','\"a\" company bw','buffs ambush','d company',
"d' force",'devens', 'c company','\"d\" force', 'army officer', 'british army officer', 'british military', 'buffs patrol', 'european officer', 'european soldiers', 'gloster patrol' ),

'kenya regiment'=c('kenya regiment','captain folliott’s team' , 'kr', 'kenreg', 'kenregg','kenya regiment sergeant', 'kenya regt','keniya regiment','kenya regiment private'),

 'military (generic)'=c('military (generic)', 'captain', 'company', 'military', 'army', 'military property', 'platoon', 'security forces', 'security force', 'coy', 'striking force' ,'sentry', 'non commissioned officers', 'patrol', 'sentrie', 'sgt white' ),

'psuedo gangs'=c('psuedo gangs','pseudo gang', 'pseudo team', 'trojan', 'psuedo gangs', 'trojan team' , 'tracker group', 'pseudo teams'),

'royal air force'=c('royal air force','raf', 'bombers', 'air strike', 'harvards', 'raf lincolns','flying squard'),

'paramilitary'=c('paramilitary','general service unit', 'gsu' ),

'cid'=c('cid'),

'kenya police'=c('kenya police', 'kp' , "kp constables' quarters", 'kpa' ),

'kenya police reserve'=c('kenya police reserve', 'kpr', 'kpr officers', 'reserve police officer', 'rpo' , 'rpos', 'police and k.p.r'),

'police (generic)'=c('police (generic)','constable', 'police', 'polce','policy party'),

'railway police'=c('railway police' ),

'special branch'=c('special branch', 'blue doctor team', 'special branch team', 'sb officers' ),

'tribal police'=c('githumu police', 'masai special constable', 'tribal police', 'tp' , 'tpeg','african constable', 'african costable', 'african special constable', 'tribal police'),

'tribal police reserve'=c('tribal police reserve', 'tpr')  ,

'communities'=c('communities','manyatta', 'fishing camp', 'sublocation', 'village', 'camp' , 'villages') ,

'detainees'=c('detainees', 'prisoner', 'prisoners'),

'suspected insurgents'=c('suspected insurgents','bandits', 'food foragers', 'gangs', 'gang', 'kiama kia muingi' , 'kkm', 'komerera' , 'mau mau', 'oath administrator', 'passive wing',
'rebels', 'suspects', 'terrorists','terrorosts','terrorist', 'gunman', 'terorist', 'gunmen', 'resistance group','resistance groups', 'oath administrater','oath administrators','passive wing members', 'resistance','suspect', 'suspected insurgents','terroist','terroists','terrost') ,

'civilians'=c('civilians','africans', 'children', 'civilian', 'driver', 'employees', 'evangelist', 'family', 'farm boys', 'girls', 'informer',
'kikuyu', 'laborour', 'loyalist', 'masai', 'men', 'mission staff', 'owner', 'passengers', 'people',  'tugen tribesmen' , 'stranger', 'sikh',
'herd boys', 'isiolo game scouts', 'farm labour', 'farmer', 'european', 'employer', 'employee', 'civilan','shopkeeper' , 'students', 'teachers',
'turkana', 'vigilantes', 'women', 'workers','villagers',  'labour', 'local labour', 'kikuyus', 'embu', 'tiriki houseboy', 'samburu', 'manager', 'woman',
'vetofficer', 'mrhiggins', 'masai party','kuria tribesmen','manager of akira estates', 'kuria tribesmen','chstephen','african',
'catholic misson staff', 'african staff', 'asian women', 'bus conductor', 'child', 'civilian(food carriers)', 'civilian(schoolmaster)', 'civilians',
'civilion', 'committee', 'committee member',  'courier','elder','embu tractor driver', 'employees of club','engine boy','girl','golf club staff','his own hut',
'hotel keeper','houseboy','illegal residents','indian','interpreter','kem','kikiyu', 'kikuyu assessor','kikuyu families','kikuyu houseboy','kikuyu labourer','kikyu',
'kirua village','labour line','labour lines','labourer','labourers', 'laboures','labourline','labours','males','man','maragoli','maragoli labourer',
'masai elders','masai tribesman','members of the thika committee', 'mna section leaders','municipal inspectors','non kikuyu employees','person',
'prostitutes','purke masai','pwd employee','railway employees', 'school master','school teacher','sisters committee','somali','staff','strangers',
'taxi drivers','teacher','treasurers', "headman's son","norton traill's labour","gordon's labour", 'food carriers') 


)
# Noise removed from the raw initiator text: periods, the word "patrol" and
# ordinals such as "3rd" / "5th". Stripping these lets more values map
# automatically instead of needing further hand codings.
regex <- "\\.|patrol|[1-9]\\s*rd|[1-9]\\s*th"
events$initiator_clean <- tolower(
  gsub(regex, "", stringr::str_trim(events$initiator), ignore.case = TRUE)
)

# Split combined initiators into up to three parts.
# Columns from an earlier run are dropped first, because separate() would
# otherwise keep adding new ones (one_of() warns when they are absent).
initiator_parts <- c("initiator_clean_1", "initiator_clean_2", "initiator_clean_3")
events <- dplyr::select(events, -one_of(initiator_parts))
events <- tidyr::separate(
  events,
  col = initiator_clean,
  into = initiator_parts,
  sep = "and|\\\\|/|\\&|,",
  remove = FALSE,
  extra = "drop",
  fill = "right"
)

# Keyword rules applied in order: a value that contains the pattern anywhere
# (case-insensitive) is replaced entirely by the short label.
initiator_keyword_rules <- list(
  c(".*police.*", "police"),
  c(".*guard.*", "guard"),
  c(".*terror.*|.*mau mau.*|.*gang.*", "terrorist"),
  c(".*kpr.*|.*k p r.*", "kpr"),
  c(".*kar.*|.*k a r.*", "kar"),
  c(".*coy.*", "coy"),
  c(".*gsu.*", "gsu"),
  c(".*watch.*", "watch")
)

# Run every rule over a character vector, then trim surrounding whitespace.
normalise_initiator <- function(values) {
  for (rule in initiator_keyword_rules) {
    values <- gsub(rule[1], rule[2], values, ignore.case = TRUE)
  }
  trimws(values)
}

events <- mutate_at(events, vars(starts_with("initiator_clean_")), normalise_initiator)

# Map each of the three initiator parts onto the low-level categories of the
# master lookup; the result is stored as character in "<part>_agglow".
for (slot in 1:3) {
  source_col <- paste0("initiator_clean_", slot)
  collapsed <- do.call(
    forcats::fct_collapse,
    c(list(events[[source_col]]), initiator_target_master_clean)
  )
  events[[paste0(source_col, "_agglow")]] <- as.character(collapsed)
}

#https://community.rstudio.com/t/using-list-argument-with-fct-collapse/6552/7

# sort(table(events$initiator_clean_1_agglow))

# Low-level initiator categories regarded as valid (referenced by the
# commented-out "uncategorized" filter that follows).
# The names must match the master lookup exactly, so "kings african rifles" is
# written in lower case like the category key it refers to.
lowlevelagg <- c(
  "arab combat units", "cid", "psuedo gangs", "asian combat units", "special branch",
  "tribal authorities", "tribal police reserve", "royal air force",
  "paramilitary", "kenya regiment", "tribal police", "kenya police reserve", "kenya police",
  "british military", "civilians", "kings african rifles", "military (generic)", "police (generic)",
  "railway police", "home guard", "colonial authorities", "suspected insurgents"
)

#events <- events %>%
#  mutate(initiator_clean_1_agglow=ifelse(initiator_clean_1_agglow  %in% lowlevelagg & !is.na(initiator_clean_1_agglow),initiator_clean_1_agglow, "uncategorized")) %>%
#  mutate(initiator_clean_2_agglow=ifelse(initiator_clean_2_agglow  %in% lowlevelagg & !is.na(initiator_clean_2_agglow),initiator_clean_2_agglow, "uncategorized")) %>%  
#  mutate(initiator_clean_3_agglow=ifelse(initiator_clean_3_agglow  %in% lowlevelagg & !is.na(initiator_clean_3_agglow),initiator_clean_3_agglow, "uncategorized"))

# table(events$initiator_clean_1_agglow, useNA="always")
# Medium- and high-level initiator categories.
# The columns are passed to mutate_at() by exact name. The previous
# starts_with("a|b|c") call treated the whole string as one literal prefix,
# selected no columns, and so the collapse was silently never applied.
initiator_agglow_cols <- paste0("initiator_clean_", 1:3, "_agglow")
initiator_aggmed_cols <- paste0("initiator_clean_", 1:3, "_aggmed")
initiator_agghigh_cols <- paste0("initiator_clean_", 1:3, "_agghigh")

# Medium level: start from the low-level codes, then merge the individual
# police, military and civil-authority units.
events[, initiator_aggmed_cols] <- events[, initiator_agglow_cols]
events <- events %>%
  mutate_at(
    initiator_aggmed_cols,
    function(x) {
      forcats::fct_collapse(
        x,
        'police' = c('cid', 'kenya police reserve', 'kenya police', 'police (generic)', 'railway police', 'special branch', 'tribal police', 'tribal police reserve'),
        'military' = c('arab combat units', 'asian combat units', 'british military', 'kings african rifles', 'kenya regiment', 'military (generic)', 'psuedo gangs', 'royal air force'),
        'civil authorities' = c('colonial authorities', 'tribal authorities')
      )
    }
  )

events$initiator_clean_2_aggmed %>%
  janitor::tabyl(sort = TRUE) %>%
  janitor::adorn_crosstab(digits = 1)

# High level: start from the medium-level codes, then merge into government
# versus rebels (categories not listed, e.g. civilians, are left unchanged).
events[, initiator_agghigh_cols] <- events[, initiator_aggmed_cols]
events <- events %>%
  mutate_at(
    initiator_agghigh_cols,
    function(x) {
      forcats::fct_collapse(
        x,
        'government' = c('civil authorities', 'home guard', 'military', 'police', 'paramilitary'),
        'rebels' = c('suspected insurgents')
      )
    }
  )

events$initiator_clean_3_agghigh %>%
  janitor::tabyl(sort = TRUE) %>%
  janitor::adorn_crosstab(digits = 1)

Target of Event

# Noise removed from the raw target text: periods, the word "patrol" and
# ordinals such as "3rd" / "5th". Stripping these lets more values map
# automatically instead of needing further hand codings.
regex <- "\\.|patrol|[1-9]\\s*rd|[1-9]\\s*th"

# Clean the TARGET text. This previously read events$initiator and then split
# initiator_clean, so every target column was a copy of the initiator.
# NOTE(review): assumes the events table has a `target` column, by analogy
# with `initiator` and `type` -- confirm the column name.
events$target_clean <- events$target %>% stringr::str_trim() %>% tolower() %>% gsub(regex, "", .)

# Split combined targets into up to three parts.
# Columns from an earlier run are dropped first, because separate() would
# otherwise keep adding new ones; the first run therefore throws a warning.
events <- events %>%
  dplyr::select(-one_of("target_clean_1", "target_clean_2", "target_clean_3")) %>%
  tidyr::separate(
    col = target_clean,
    into = c("target_clean_1", "target_clean_2", "target_clean_3"),
    sep = "and|\\\\|/|\\&|,", remove = FALSE, extra = "drop", fill = "right"
  )

# Keyword rules applied in order: a value that contains the pattern anywhere
# (case-insensitive) is replaced entirely by the short label.
# The "erroris" alternative now ends in ".*" like its siblings; without it only
# the text up to the keyword was replaced and the tail of the value was kept.
target_keyword_rules <- list(
  c(".*terror.*|.*erori.*|.*erroris.*|.*mau mau.*|.*gang.*", "terrorist"),
  c(".*police.*", "police"),
  c(".*guard.*", "guard"),
  c(".*kpr.*|.*k p r.*", "kpr"),
  c(".*kar.*|.*k a r.*", "kar"),
  c(".*coy.*", "coy"),
  c(".*gsu.*", "gsu"),
  c(".*watch.*", "watch")
)

# Run every rule over a character vector, then trim surrounding whitespace.
normalise_target <- function(values) {
  for (rule in target_keyword_rules) {
    values <- gsub(rule[1], rule[2], values, ignore.case = TRUE)
  }
  trimws(values)
}

events <- mutate_at(events, vars(starts_with("target_clean_")), normalise_target)

events$target_clean_1 %>%
  janitor::tabyl(sort = TRUE) %>%
  janitor::adorn_crosstab(digits = 1)
# Map each of the three target parts onto the low-level categories of the
# master lookup; the result is stored as character in "<part>_agglow".
for (slot in 1:3) {
  source_col <- paste0("target_clean_", slot)
  collapsed <- do.call(
    forcats::fct_collapse,
    c(list(events[[source_col]]), initiator_target_master_clean)
  )
  events[[paste0(source_col, "_agglow")]] <- as.character(collapsed)
}

# Low-level target categories regarded as valid (referenced by the
# commented-out "uncategorized" filter that follows).
# The names must match the master lookup exactly, so "kings african rifles" is
# written in lower case like the category key it refers to.
lowlevelagg <- c(
  "church", "kenya police", "medicine", "tribal police reserve", "detainees", "kenya regiment", "other weapons",
  "paramilitary", "ammunition", "communities", "british military", "military (generic)", "tribal authorities", "kenya police reserve", "tribal police",
  "kings african rifles", "infrastructure", "school", "cash", "colonial authorities", "police (generic)", "supplies", "firearms", "food", "private property",
  "home guard", "civilians", "livestock", "suspected insurgents"
)

#events <- events %>%
#   mutate(target_clean_1_agglow=ifelse(target_clean_1_agglow  %in% lowlevelagg & !is.na(target_clean_1_agglow),target_clean_1_agglow, "uncategorized")) %>%
#   mutate(target_clean_2_agglow=ifelse(target_clean_2_agglow  %in% lowlevelagg & !is.na(target_clean_2_agglow),target_clean_2_agglow, "uncategorized")) %>% 
#   mutate(target_clean_3_agglow=ifelse(target_clean_3_agglow  %in% lowlevelagg & !is.na(target_clean_3_agglow),target_clean_3_agglow, "uncategorized"))

events$target_clean_1_agglow %>%
  janitor::tabyl(sort = TRUE) %>%
  janitor::adorn_crosstab(digits = 1)

# Medium- and high-level target categories.
# Fixes relative to the previous version:
# - the TARGET columns are collapsed, selected by exact name (the old code
#   named the initiator columns inside a single starts_with() string, which is
#   a literal prefix and selected nothing);
# - the stray undefined `temp` argument to fct_collapse() is removed;
# - 'kings african rifles' is lower case so it matches the low-level category;
# - the medium-level table shows the target column instead of the initiator.
target_agglow_cols <- paste0("target_clean_", 1:3, "_agglow")
target_aggmed_cols <- paste0("target_clean_", 1:3, "_aggmed")
target_agghigh_cols <- paste0("target_clean_", 1:3, "_agghigh")

# Medium level: start from the low-level codes, then merge actors and goods.
events[, target_aggmed_cols] <- events[, target_agglow_cols]
events <- events %>%
  mutate_at(
    target_aggmed_cols,
    function(x) {
      forcats::fct_collapse(
        x,
        'police' = c('cid', 'kenya police reserve', 'kenya police', 'police (generic)', 'railway police', 'special branch', 'tribal police', 'tribal police reserve'),
        'military' = c('arab combat units', 'asian combat units', 'british military', 'kings african rifles', 'kenya regiment', 'military (generic)', 'psuedo gangs', 'royal air force'),
        'civil authorities' = c('colonial authorities', 'tribal authorities'),
        'armaments' = c('ammunition', 'firearms', 'other weapons'),
        'provisions' = c('cash', 'food', 'livestock', 'medicine', 'supplies'),
        'public buildings' = c('church', 'school', 'infrastructure')
      )
    }
  )

events$target_clean_1_aggmed %>%
  janitor::tabyl(sort = TRUE) %>%
  janitor::adorn_crosstab(digits = 1)

# High level: start from the medium-level codes, then merge into government,
# rebels, property and civilians.
events[, target_agghigh_cols] <- events[, target_aggmed_cols]
events <- events %>%
  mutate_at(
    target_agghigh_cols,
    function(x) {
      forcats::fct_collapse(
        x,
        'government' = c('civil authorities', 'home guard', 'military', 'police', 'paramilitary'),
        'rebels' = c('suspected insurgents', 'detainees'),
        'property' = c('armaments', 'private property', 'provisions', 'public buildings'),
        'civilians' = c('communities')
      )
    }
  )

events$target_clean_1_agghigh %>%
  janitor::tabyl(sort = TRUE) %>%
  janitor::adorn_crosstab(digits = 1)

Count of Initiators and Targets

# These numbers are estimates and can be changed
# These columns are dirty and when an incoherent value is given, it is set to missing.

recode_counts <- function(indata){
  acouple <- "2"
  afew <- "3"
  agang <- "6"
  agang_large <- "12"

  indata %>% tolower() %>% dplyr::recode(
    "100+"= "100",
    "??"= "",
    "1 bag"= "1",
    "1 blanket"= "1",
    "1 burnt down"= "1",
    "1 civilian"= "1",
    "1 cow, 6 sheep"= "7",
    "1 cow"= "1",
    "1 goat, clothing"= "1",
    "1 goat"= "1",
    "1 looted"= "1",
    "1 looted"= "1",
    "1 ox"= "1",
    "1 sheep and chickens"= "1",
    "1 sheep, some chickens"= "1",
    "1 sheep"= "1",
    "1 shotgun ,30 rounds"= "31",
    "1 shotgun + 10rds"= "11",
    "1 steer"= "1",
    "1 village, 1 market"= "1",
    "1 wounded"= "1",
    "1 wrecked"= "1",
    "1+"= "1",
    "1+3"= "4",
    "1+some"= "1",
    "10 acres"= "10",
    "10 bags"= "10",
    "10 cattle"= "10",
    "10 sacks"= "10",
    "10 to 12"= "11",
    "10 to 15"= "13",
    "10/14/2013"= "",
    "10/15/2013"= "",
    "10/20/2013"= "",
    "100 lb"= "100",
    "100-130"= "115",
    "100-150"= "125",
    "100+"= "100",
    "10000"= "",
    "109 cattle"= "109",
    "10bags potatoes"= "10",
    "11 cattle"= "11",
    "11 sheep"= "11",
    "112 bore & 20.1.45 &7 rds"= "112",
    "12 bags"= "12",
    "12 cattle"= "12",
    "12 goats"= "12",
    "12 to 15"= "13",
    "12 to 20"= "17",
    "12/14/2013"= "",
    "120 cattle"= "120",
    "120+1"= "121",
    "13 sheep"= "13",
    "13-15"= "14",
    "1300 worth"= "1300",
    "14 cattle"= "14",
    "14 goats"= "14",
    "14 head"= "14",
    "14+"= "14",
    "15 - 20"= "18",
    "15 cattle"= "15",
    "15 to 20"= "17",
    "15 to 20"= "17",
    "15 to 25"= "20",
    "15-20"= "17",
    "15+"= "15",
    "150-200"= "175",
    "150+"= "150",
    "151 cattle"= "151",
    "17 cattle"= "17",
    "172 bags burnt"= "172",
    "18 cattle"= "18",
    "19 bags"= "19",
    "196 rounds"= "196",
    "2 bags maize"= "2",
    "2 bags"= "2",
    "2 bags"= "2",
    "2 buckets"= "2",
    "2 cattle hamstrung"= "2",
    "2 cattle, corn"= "3",
    "2 cattle"= "2",
    "2 cows"= "2",
    "2 debbies"= "2",
    "2 goats"= "2",
    "2 groups"= "2",
    "2 huts burnt"= "2",
    "2 sheep"= "2",
    "2 watches, cash"= "2",
    "2/3/2013"= "",
    "2+"= "2",
    "20 bags maize, 9 goats, 32 chickens and ducks, cash"= "60",
    "20 bags"= "20",
    "20 cattle"= "20",
    "20 goats"= "20",
    "20 sheep"= "20",
    "20 to 25"= "23",
    "20 to 30"= "25",
    "20 to 40"= "30",
    "20-25"= "23",
    "20-30"= "25",
    "20-35"= "30",
    "20-50"= "35",
    "20/30"= "25",
    "20/30"= "25",
    "20+"= "20",
    "200 yds"= "200",
    "200-300"= "250",
    "200+"= "200",
    "2000 acres"= "2000",
    "21 goats"= "21",
    "21 head"= "21",
    "22 cattle"= "22",
    "25 to 30"= "28",
    "25-30"= "27",
    "25-30"= "27",
    "28 killed"= "28",
    "28 sheep"= "28",
    "3 bags"= "3",
    "3 bags"= "3",
    "3 bikes"= "3",
    "3 cattle"= "3",
    "3 cattle"= "3",
    "3 goats"= "3",
    "3 or 4"= "3",
    "3 or 4"= "3",
    "3 pangas"= "3",
    "3 sheep, 2 calves"= "5",
    "3 sheep"= "3",
    "3 to 4"= "3",
    "3 to 4"= "3",
    "3/10/2013"= "",
    "3/4/2013"= "",
    "3/5/2013"= "",
    "3/6/2013"= "",
    "3+"= "3",
    "3+3+1+2"= "9",
    "3+some"= "3",
    "30 acres"= "30",
    "30 cattle"= "30",
    "30 to 40"= "35",
    "30-35"= "33",
    "30-40"= "35",
    "30-50"= "40",
    "30+"= "30",
    "300-400"= "350",
    "300+"= "300",
    "35 bags"= "35",
    "35 to 40"= "37",
    "38 cattle"= "38",
    "3or 4"= "3",
    "4 bags potatoes"= "4",
    "4 bags"= "4",
    "4 goats"= "4",
    "4 groups"= "",
    "4 or 5"= "4",
    "4 oxen"= "4",
    "4 sheep"= "4",
    "4 to 8"= "6",
    "4/6/2013"= "",
    "40 bag"= "40",
    "40 cattle"= "40",
    "40 sacks"= "40",
    "40 sheep"= "40",
    "40 to 50"= "45",
    "40/50"= "45",
    "40-50"= "45",
    "400 cattle"= "400",
    "4000"= "",
    "40161"="",
    "44 cattle"= "44",
    "5 bags"= "5",
    "5 calves"= "5",
    "5 cattle"= "5",
    "5 destroyed"= "5",
    "5 goats"= "5",
    "5 killed"= "5",
    "5 or 6"= "5",
    "5 sheep, 1 ox"= "6",
    "5 sheep"= "5",
    "5 to 6"= "5",
    "5/10/2013"= "",
    "5/6/2013"= "",
    "50 cattle"= "50",
    "50 to 60"= "55",
    "50-100"= "75",
    "50-60"= "55",
    "50-75"= "62",
    "50+"= "50",
    "50+"= "50",
    "5000 acres"= "5000",
    "519 +"= "519",
    "53 detained"= "53",
    "54 sheep and goats"= "54",
    "56 committee members"= "56",
    "6 bag"= "6",
    "6 bags"= "6",
    "6 cattle"= "6",
    "6 cattle"= "6",
    "6 goats"= "6",
    "6 or 7"= "6",
    "6 sheep and goats"= "6",
    "6 sheep"= "6",
    "6 to 7"= "6",
    "6 to 8"= "7",
    "6 to 9"= "8",
    "6-8 man"= "7",
    "6/10/2013"= "",
    "6/8/2013"= "",
    "60-100"= "80",
    "60-70"= "65",
    "64 cattle"= "64",
    "7 bags"= "7",
    "7 cattle"= "7",
    "7 sheep"= "7",
    "7/10/2013"= "",
    "70 bags"= "70",
    "70 cattle, sheep"= "70",
    "70-100"= "85",
    "70000"= "",
    "75 rounds"= "75",
    "8 bags potatoes"= "8",
    "8 cattle"= "8",
    "8 cows slashed"= "8",
    "8 cows"= "8",
    "8 sheep"= "8",
    "8 to 10"= "9",
    "8/10/2013"= "",
    "80 cattle"= "80",
    "80-100"= "90",
    "84 sheep, 1 cow, 5 chickens"= "90",
    "9 cattle"= "9",
    "9 sheep"= "9",
    "9 to 10"= "9",
    "9+9"= "18",
    "900(not clear)"= "900",
    "all locals"= "",
    "all"= "",
    "app 5"= "5",
    "app. 100"= "100",
    "app. 120"= "120",
    "armed gang"= agang,
    "band"= agang,
    "bands"= "",
    "cattle slashing"= "",
    "clothing"= "",
    "considerable quantity"= "",
    "fairly large gang"= agang_large,
    "few bags"= "",
    "few"= "",
    "food"= "",
    "gang"= agang,
    "gangs"= agang_large,
    "guards"= afew,
    "half village"= "",
    "labour"= "",
    "large crowd"= "",
    "large force"= agang_large,
    "large gang"= agang_large,
    "large meeting"= "",
    "large number"= "",
    "large numbers"= "",
    "large quantities"= "",
    "large quantity"= "",
    "large re-oathing ceremony"= "",
    "large scale"= "",
    "large"= agang_large,
    "largish gang"= agang_large,
    "local populace"= "",
    "many thousand"= "2000",
    "mob"= "",
    "not given"= "",
    "number"= "",
    "occupants"= "",
    "over 200"= "200",
    "Party"= agang,
    "party"= agang,
    "patrol"= agang,
    "posho"= "",
    "potatoes"= "",
    "quantity of clothing"= "",
    "section"= "",
    "several gangs"= agang_large,
    "several"= "3",
    "sheep and goats"= "",
    "shs 2,300/-"= "2300",
    "shs 60/-"= "60",
    "shs. 1,000"= "1000",
    "shs. 18"= "18",
    "shs. 30"= "30",
    "small gang"= agang,
    "small gangs"= agang,
    "small group"= agang,
    "small party"= afew,
    "small"= agang,
    "some"= afew,
    "sufficient food"= "",
    "unknown"= "",
    "very large gang"= agang_large,
    "villages in ndia, gichugu, embu divisions"= "",
    "wives"= ""
  ) %>% as.numeric() %>% return()

}

# Build a numeric companion for each free-text count column by passing the raw
# column through recode_counts() (defined elsewhere in this file).
events$initiator_numbers_numeric <- recode_counts(events$initiator_numbers)
events$target_numbers_numeric <- recode_counts(events$target_numbers)
events$affected_count_numeric <- recode_counts(events$affected_count)

Casualties

# Raw casualty columns; each one gets a cleaned "<name>_clean" counterpart.
casualty_raw_cols <- c(
  "government_killed", "government_wounded", "government_captured",
  "rebels_killed", "rebels_wounded", "rebels_captured",
  "civilians_killed", "civilians_wounded", "civilians_captured"
)
casualty_clean_cols <- paste0(casualty_raw_cols, "_clean")

# Start every *_clean column as a copy of its raw column.
for (raw_col in casualty_raw_cols) {
  events[[paste0(raw_col, "_clean")]] <- events[[raw_col]]
}

# Collapse free-text and mistyped labels onto count labels, turn missing values
# into "0", and convert the resulting labels to numbers.
events <- events %>% mutate_at(
  .vars = casualty_clean_cols,
  .funs = function(counts) {
    recoded <- forcats::fct_collapse(
      counts,
      "1" = c(
        "unKnown", "unknown", "UnKnown", "UNKNOWN", "Unkown", "Unknown",
        "Number", "More", "101", "146", "122", "208", "94"
      ),
      "2" = c("Few", "others", "some"),
      "3" = c(
        "Many", "Sevaral", "several", "Several More", "Several others", "Some",
        "Council of elders", "Council of war", "Several", "Majority", "many",
        "Gang", "Small gang", "3+"
      ),
      "100" = "100+",
      "23" = "23 Families",
      "28" = "28 families",
      "35" = "30-40",
      "50" = "50+",
      "45" = "4500",
      "80" = "800",
      "6" = "6+",
      "10" = "10+",
      # The original mapping `'10197' = NA` had no effect because NA is not a
      # level. Dropping the level makes those entries missing, so they are
      # counted as "0" by the next step, which appears to be the intent.
      NULL = "10197",
      "7" = "48"
    )
    recoded <- forcats::fct_explicit_na(recoded, na_level = "0")
    # as.numeric() on a factor returns its internal level codes, not the
    # numbers written in the labels, so go through character first.
    # Labels that are still non-numeric after the collapse become NA.
    as.numeric(as.character(recoded))
  }
)

# Make sure every cleaned casualty column is stored as numeric.
events <- events %>% mutate_at(
  .vars = c(
    "government_killed_clean", "government_wounded_clean", "government_captured_clean",
    "rebels_killed_clean", "rebels_wounded_clean", "rebels_captured_clean",
    "civilians_killed_clean", "civilians_wounded_clean", "civilians_captured_clean"
  ),
  .funs = as.numeric
)

# Combined casualty totals. An NA in any component makes the total NA.
events <- events %>%
  mutate(
    rebels_killedwounded_clean = rebels_killed_clean + rebels_wounded_clean,
    government_killed_wounded_clean = government_killed_clean + government_wounded_clean,
    # Previously this repeated the rebels-only sum; the government killed and
    # wounded counts are now included, as the column name indicates.
    rebels_government_killedwounded_clean = rebels_killed_clean + rebels_wounded_clean +
      government_killed_clean + government_wounded_clean,
    rebels_government_killed_clean = rebels_killed_clean + government_killed_clean,
    rebels_government_civilians_killed_clean = rebels_killed_clean + government_killed_clean +
      civilians_killed_clean
  )

# Cross-tabulations of initiator, target and event type (high aggregation level).
events %>%
  janitor::crosstab(initiator_clean_1_agghigh, type_clean_agghigh) %>%
  janitor::adorn_crosstab(digits = 1)
events %>%
  janitor::crosstab(target_clean_1_agghigh, type_clean_agghigh) %>%
  janitor::adorn_crosstab(digits = 1)
events %>%
  janitor::crosstab(target_clean_1_agghigh, initiator_clean_1_agghigh) %>%
  janitor::adorn_crosstab(digits = 1)

Clean Map Coordinates (East Africa Grid System)

cat("\014") # form-feed character (clears the console in RStudio)

# Examples of raw coordinate strings that this cleaning has to cope with:
# "928141"
# "311449  328445    338443"
# "EASTING 30 and 27"
# "EastLeigh Sect.7"
# "FARM 535/4"
# "HAC  0202"
# "HAC.577236"
# "HZN 974641 & HZN 974651"
# "HZJ. 8595"
# "HZJ. 465765, HZJ. 459771, HZJ. 451756"
# "HZJ 42765, HZJ 42375and HZJ 429761"
# "HZH 960610, HZH 960630, HZH 977538"
# "H.Z.R. 4786"
# "HAD 1708, HAD 1709"
# "HAD 3326/3327"
# "HZJ 42765, HZJ 42375and HZJ 429761"
# "HZJ 9518  9617"
# "HZP 7430, HZP 9029, HZP 6448, HZP 7252, HZP 9448"

# Frequency table of the raw coordinate strings.
events$map_coordinate %>% janitor::tabyl()

# Remove all punctuation and spaces, then record the length of what is left.
events$map_coordinate_clean <- stringr::str_replace_all(events$map_coordinate, "[[:punct:]]| ", "")
events$map_coordinate_clean_length <- nchar(events$map_coordinate_clean)
events$map_coordinate_clean_length %>%
  janitor::tabyl() %>%
  round(3)

# Split into a text component (digits deleted) and a numeric component
# (letters deleted). The replacement "\\1" refers to a capture group that the
# pattern does not define, so in practice each match is simply deleted.
events$map_coordinate_clean_text <- gsub("[0-9]", "\\1", events$map_coordinate_clean)
events$map_coordinate_clean_text %>%
  janitor::tabyl() %>%
  mutate_if(is.numeric, round, 2)

events$map_coordinate_clean_number <- gsub("[A-Za-z]", "\\1", events$map_coordinate_clean)
events$map_coordinate_clean_number %>%
  janitor::tabyl() %>%
  mutate_if(is.numeric, round, 2)

# The first three characters of the text component are stored as band, block
# and subblock respectively.
events$map_coordinate_clean_text_band <- substring(events$map_coordinate_clean_text, 1, 1)
events$map_coordinate_clean_text_band %>%
  janitor::tabyl() %>%
  mutate_if(is.numeric, round, 2)

events$map_coordinate_clean_text_block <- substring(events$map_coordinate_clean_text, 2, 2)
events$map_coordinate_clean_text_block %>%
  janitor::tabyl() %>%
  mutate_if(is.numeric, round, 2)

events$map_coordinate_clean_text_subblock <- substring(events$map_coordinate_clean_text, 3, 3)
events$map_coordinate_clean_text_subblock %>%
  janitor::tabyl() %>%
  mutate_if(is.numeric, round, 2)

# The numeric component is cut in half: leading half = easting, trailing half
# = northing.
events$map_coordinate_clean_number_length <- nchar(events$map_coordinate_clean_number)
events$map_coordinate_clean_number_length %>%
  janitor::tabyl() %>%
  mutate_if(is.numeric, round, 2)

half_length <- events$map_coordinate_clean_number_length / 2

events$map_coordinate_clean_number_easting <- as.numeric(
  substring(events$map_coordinate_clean_number, 1, half_length)
)
events$map_coordinate_clean_number_easting %>%
  janitor::tabyl() %>%
  mutate_if(is.numeric, round, 2)

events$map_coordinate_clean_number_northing <- as.numeric(
  substring(
    events$map_coordinate_clean_number,
    half_length + 1,
    events$map_coordinate_clean_number_length
  )
)
events$map_coordinate_clean_number_northing %>%
  janitor::tabyl() %>%
  mutate_if(is.numeric, round, 2)

Convert Coordinates to lat long

#

cat("\014")
print("Starting Converting Coordinates, may take some time")

# Convert each event's grid reference to latitude/longitude, one row at a time.
# Results are collected in preallocated vectors and written to `events` once,
# and seq_len() keeps the loop from running when `events` has zero rows.
n_events <- nrow(events)
converted_latitude <- rep(NA_real_, n_events)
converted_longitude <- rep(NA_real_, n_events)
for (i in seq_len(n_events)) {
  # with() exposes the columns of the current row as variables for the call.
  suppressMessages({
    temp <- with(
      events[i, ],
      MeasuringLandscape:::EAGS2LatLong(
        band = map_coordinate_clean_text_band,
        block = map_coordinate_clean_text_block,
        subblock = map_coordinate_clean_text_subblock,
        easting = map_coordinate_clean_number_easting,
        northing = map_coordinate_clean_number_northing
      )
    )
  })
  converted_latitude[i] <- temp$latitude
  converted_longitude[i] <- temp$longitude
}
events$map_coordinate_clean_latitude <- converted_latitude
events$map_coordinate_clean_longitude <- converted_longitude
print("Finished Converting Coordinates")
#(temp <- events %>% mutate(map_coordinate_clean_row=1:n()) %>% filter(is.na(map_coordinate_clean_latitude) & !is.na(map_coordinate_clean)) %>% select(starts_with("map_coordinate_clean")) ) %>% distinct() %>% print(n=40) #visualize errors
#dim(temp) #195 coordinates don't convert.

# Manual debugging aid: set `testing` to TRUE to re-run the conversion for a
# single row and copy its inputs into the workspace.
testing <- FALSE
if (testing) {
  i <- 3684
  with(
    events[i, ],
    MeasuringLandscape:::EAGS2LatLong(
      band = map_coordinate_clean_text_band,
      block = map_coordinate_clean_text_block,
      subblock = map_coordinate_clean_text_subblock,
      easting = map_coordinate_clean_number_easting,
      northing = map_coordinate_clean_number_northing
    )
  )

  with(events[i, ], map_coordinate_clean)
  with(events[i, ], map_coordinate)
  band <- with(events[i, ], map_coordinate_clean_text_band)
  block <- with(events[i, ], map_coordinate_clean_text_block)
  subblock <- with(events[i, ], map_coordinate_clean_text_subblock)
  easting <- with(events[i, ], map_coordinate_clean_number_easting)
  northing <- with(events[i, ], map_coordinate_clean_number_northing)
}

# Inspect the tails of the converted coordinates.
stats::quantile(events$map_coordinate_clean_latitude, probs = c(.005, .01, .1, .5, .9, .99, .995), na.rm = TRUE, type = 9)
stats::quantile(events$map_coordinate_clean_longitude, probs = c(.005, .01, .1, .5, .9, .99, .995), na.rm = TRUE, type = 9)
#plot(events$map_coordinate_clean_longitude,events$map_coordinate_clean_latitude) #plot with the outliers

# This is just to remove absolutely clear outliers. Not to set the region of interest.
# Outlier Bounding Box:
# NE 4.62933, 41.899059
# SW -4.71712, 33.90884
# Latitude and longitude are blanked independently of each other here.
events$map_coordinate_clean_latitude[
  events$map_coordinate_clean_latitude < -4.71712 |
    events$map_coordinate_clean_latitude > 4.62933
] <- NA
events$map_coordinate_clean_longitude[
  events$map_coordinate_clean_longitude < 33.90884 |
    events$map_coordinate_clean_longitude > 41.899059
] <- NA
plot(events$map_coordinate_clean_longitude, events$map_coordinate_clean_latitude)

District of Document

cat("\014")

# Clean the document district: title-case, trim whitespace, store as a factor.
events$document_district_clean <- events$document_district %>%
  stringi::stri_trans_totitle() %>%
  stringr::str_trim() %>%
  as.factor()

# Merge spelling variants and sub-areas into their parent unit; levels listed
# under NULL are removed (those rows become NA).
# NOTE(review): because the values were title-cased above, "BARINGO" should no
# longer occur, and the "FORT HALL" entry renames the title-cased "Fort Hall"
# level to upper case. That looks like a reversed mapping, but the label is
# kept as-is in case later steps rely on it -- confirm before normalising.
events$document_district_clean <- events$document_district_clean %>%
  forcats::fct_collapse(
    "Embu" = c("Embu-Fort Hall Border"),
    "Baringo" = c("BARINGO"),
    "FORT HALL" = c("Fort Hall"),
    "Naivasha" = c("Naviasha"),
    "Nyeri" = c("Nyeri Settled Area", "South Nyeri Reserve"),
    "Nairobi" = c("Jock Scott"),
    NULL = c("", " ", "Document District", "Kitui", "Matathia", "H/M", "Reference Serial")
  )

# Classify the kind of administrative unit each document refers to.
# Later assignments overwrite earlier ones for rows that match both.
events$document_unit_type <- NA
condition <- events$document_district_clean %in% c("Rift Valley", "Central Province")
table(condition)
events$document_unit_type[condition] <- "Province"

# Jock Scott Nairobi City
condition <- events$document_district_clean %in% c("Nairobi")
table(condition)
events$document_unit_type[condition] <- "City"

# Matched on the raw (uncleaned) district value.
condition <- events$document_district %in% c("JOCK SCOTT")
table(condition)
events$document_unit_type[condition] <- "Operation Jock Scott"

# Missing? Elgeyo/Marakwet
# Baringo, , Embu, Fort Hall, Kajiado, Kiambu, Kitui, Laikipia, Machakos, Meru, Naivasha, Nakuru, Nanyuki, Narok, Nyeri, Thika
# "FORT HALL" is listed next to "Fort Hall" because the collapse above renames
# that level to upper case; without it Fort Hall rows were never marked as a
# district.
condition <- events$document_district_clean %in% c(
  "Baringo", "Embu", "Fort Hall", "FORT HALL", "Kajiado", "Kiambu",
  "Laikipia", "Machakos", "Meru", "Naivasha", "Nakuru",
  "Nanyuki", "Narok", "Nyeri", "Thika"
)
table(condition)
events$document_unit_type[condition] <- "District"

events$document_unit_type %>%
  janitor::tabyl(sort = TRUE) %>%
  janitor::adorn_crosstab(digits = 1)

events$document_district_clean %>%
  janitor::tabyl(sort = TRUE) %>%
  janitor::adorn_crosstab(digits = 1)

Handle suffixes and directions

# Handle suffixes and combined locations.
# Location strings often carry extra location information after the main
# name, for example:
##   "coles estate farm"
##   "agriculture experimental farm"
##   "demonstration farm"
##   "farm near churo"
##   "reubens farm near churo"
################################################

# Lower-cased, trimmed location text, split at the first " of " or " near "
# into a prefix and a suffix. Further pieces are dropped, and the suffix is NA
# when no connector is present.
events$location_text_ruleclean <- tolower(stringr::str_trim(events$location_text))
events <- events %>%
  # Drop the output columns if they are left over from an earlier run; this
  # intentionally throws a warning when they do not exist yet.
  dplyr::select(-one_of("location_text_ruleclean_connector_prefix", "location_text_ruleclean_connector_suffix")) %>%
  tidyr::separate(
    col = location_text_ruleclean,
    into = c("location_text_ruleclean_connector_prefix", "location_text_ruleclean_connector_suffix"),
    sep = " of | near ",
    remove = FALSE,
    extra = "drop",
    fill = "right"
  )

# Progressively cleaned versions of the place name.
events <- events %>%
  mutate(
    name_clean = stringr::str_trim(tolower(location_text)),
    # Flag names containing a possessive ('s or `s).
    name_clean_posessive = grepl("'s|`s", name_clean),
    name_cleaner = trimws(name_clean),
    # NOTE(review): with fixed = TRUE the pattern is matched as one literal
    # string, so this call does not strip possessives; the apostrophe is only
    # removed by the punctuation step below ("reuben's" ends up as "reubens").
    # Left unchanged because name matching elsewhere may depend on the current
    # output -- confirm before switching to a regex match.
    name_cleaner = gsub("'s|`s", "", name_cleaner, fixed = TRUE),
    name_cleaner = stringr::str_replace_all(name_cleaner, "[[:punct:]]|`", ""),
    # Remove anything that is not a letter, a digit or a space.
    name_cleaner = stringr::str_replace_all(name_cleaner, "[^[:alnum:] ]", ""),
    name_cleaner = trimws(name_cleaner),
    name_cleaner_nospace = stringr::str_replace_all(name_cleaner, " ", "")
  )

Create a Simple Features Version GIS Version

# Build the simple-features version of the events table.
# Rows without coordinates are kept; the region of interest is flagged later
# rather than subset here:
#   filter(!is.na(longitude) & !is.na(latitude))
#   filter(between(longitude, 30.0, 45.0))
#   filter(between(latitude, -5.0, 5.0))
events_sf <- events %>%
  distinct() %>%
  mutate(
    name_clean = stringr::str_trim(tolower(location_text)),
    name_clean_posessive = grepl("'s|`s", name_clean),
    name_cleaner = trimws(name_clean),
    # NOTE(review): fixed = TRUE makes the pattern a literal string, so this
    # call does not strip possessives. This recomputation also lacks the
    # "[^[:alnum:] ]" strip used when `events` is cleaned earlier in the file.
    # Both are preserved as-is -- confirm the intended behaviour.
    name_cleaner = gsub("'s|`s", "", name_cleaner, fixed = TRUE),
    name_cleaner = stringr::str_replace_all(name_cleaner, "[[:punct:]]|`", ""),
    name_cleaner = trimws(name_cleaner),
    name_cleaner_nospace = stringr::str_replace_all(name_cleaner, " ", "")
  )

# Avoid creating geometries where one of the two is NA: if either coordinate
# is missing, blank both.
coordinate_missing <- is.na(events_sf$map_coordinate_clean_latitude) |
  is.na(events_sf$map_coordinate_clean_longitude)
events_sf$map_coordinate_clean_longitude[coordinate_missing] <- NA
events_sf$map_coordinate_clean_latitude[coordinate_missing] <- NA
#events_sf$event_hash <- NULL #Make sure we're not hashing on the previous hash which might be a random walk

# Point geometries from longitude/latitude in EPSG:4326. The coordinate
# columns are kept, and rows with missing coordinates are allowed.
events_sf <- sf::st_as_sf(
  events_sf,
  coords = c("map_coordinate_clean_longitude", "map_coordinate_clean_latitude"),
  crs = 4326,
  agr = "constant",
  remove = FALSE,
  na.fail = FALSE
) # %>%
  #mutate(event_hash = apply(., 1, digest, algo="xxhash64") ) #Do this once and only once

# Count of valid / invalid geometries.
valid <- sf::st_is_valid(events_sf$geometry)
table(valid)

# Two-column table of cleaned names and geometries, tagged with its source.
eventsnames_sf <- events_sf %>%
  select("name_cleaner", "geometry") %>%
  setNames(c("name", "geometry")) %>%
  mutate(source_dataset = "events")

Output Cleaned Files

# Write the cleaned sf object to the package's extdata folder, resolved
# relative to the current working directory. The file is written by saveRDS(),
# i.e. in RDS format, despite the ".Rdata" extension.
events_sf_path <- glue::glue(getwd(), "/../inst/extdata/events_sf.Rdata")
saveRDS(events_sf, file = events_sf_path)


rexdouglass/MeasuringLandscape documentation built on May 13, 2019, 6:16 p.m.