#R/Ward_LGA_Results.R
#Defines the functions make_proportions, add_col_nonexist, rank_money, rank_least, and agg_pivot,
#then aggregates the cleaned H2R settlement-level data to ward, LGA, and global results.

#PARENT FOLDER
parent_folder <- "~/Desktop/Nigeria/Hard_to_reach/New_Organization/H2R_SEPT_SO"
#H2R CSV--NAME OF THE MERGED SETTLEMENT FILE (WITHOUT THE .csv EXTENSION)
h2r_csv <- "consensus_H2R_sept"
#GIS FILE NAME
gis_file_name <- "Wards_X_GRID3_number-settlements.xlsx"
gis_sheet_name <- "Feuil1"
#DEFINE THRESHOLD (MINIMUM SHARE OF GRID3 SETTLEMENTS ASSESSED PER LGA AND PER WARD)
threshold <- 0.05  #e.g., 0.05 = 5% OF SETTLEMENTS
 #MONTH TO PROCESS: "ALL", OR A TWO-CHARACTER MONTH "01" TO "12"
 month_choose <- "09"
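 #ILLUSTRATION (assumed numbers, not part of the pipeline): with threshold <- 0.05, a ward in which
 #3 of 50 GRID3 settlements were assessed (3/50 = 0.06) is kept, while one with 2 of 50 (0.04) is dropped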
 ##############################################################################
 #SET WORKING DIRECTORY
 setwd(parent_folder)
 #ANALYSIS FOLDER
 analysis_folder <- "Analysis"
 #GIS (INPUT) FOLDER
 gis_working_directory <- paste0(parent_folder,"/",analysis_folder,"/","GIS_Settlement_list")
 #WARD & LGA (OUTPUT) FOLDER
 ward_working_directory <- paste0(parent_folder,"/",analysis_folder,"/","Ward_Results")
 lga_working_directory <- paste0(parent_folder,"/",analysis_folder,"/","LGA_Results")
 global_working_directory <- paste0(parent_folder,"/",analysis_folder,"/","Global_Results")
 #SETTLEMENT PATH
 settlement_working_directory <- paste0(parent_folder,"/",analysis_folder,"/","Settlements_Merged")  #DO NOT TOUCH
 #HARD TO REACH DATA
 #readr IS NEEDED HERE BECAUSE read_csv IS CALLED BEFORE THE MAIN PACKAGE BLOCK BELOW
 if (!require(readr)) install.packages('readr')
 library(readr)
 h2r_cleaned_data <- read_csv(paste0(settlement_working_directory,"/",h2r_csv,".csv"))
 h2r_cleaned_data[1] <- NULL  #DROP THE FIRST COLUMN (ASSUMED ROW-INDEX COLUMN FROM THE CSV EXPORT)
 #WARD FILE NAMES
 if(month_choose !="ALL"){
   ###EXTRACT X CHARACTERS FROM THE RIGHT                      
   substrRight <- function(x, n){
     substr(x, nchar(x)-n+1, nchar(x))
   }
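   #e.g., substrRight("consensus_H2R_sept", 5) returns "_sept"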
   ward_named_dataset <- paste0("H2R_Ward_Results_",substrRight(h2r_csv, 5) )
 } else if (month_choose =="ALL"){
   ward_named_dataset <- gsub("Settlements_Merged", "Ward_Results", h2r_csv)
 } else{ ward_named_dataset <- "NO_NAME"
 }
 
 #LGA FILE NAMES
 if(month_choose !="ALL"){
   lga_named_dataset <- gsub("consensus_", "LGA_Results_", h2r_csv)
 } else if (month_choose =="ALL"){
   lga_named_dataset <- gsub("consensus_", "ALL_LGA_Results_", h2r_csv)
 } else{ lga_named_dataset <- "NO_NAME"
 }
 
 #GLOBAL FILE NAMES
 if(month_choose !="ALL"){
   global_named_dataset <- gsub("consensus_", "H2R_GLOBAL_Results_", h2r_csv)
 } else if (month_choose =="ALL"){
  global_named_dataset <- gsub("consensus_", "GLOBAL_Results_", h2r_csv)
 } else{ global_named_dataset <- "NO_NAME"
 }
 
#####LOAD PACKAGES#####
if (!require(readxl)) install.packages('readxl')
library(readxl)

if (!require(weights)) install.packages('weights')
library(weights)

if (!require(agricolae)) install.packages('agricolae')
library(agricolae)

if (!require(pls)) install.packages('pls')
library(pls)

if (!require(gmodels)) install.packages('gmodels')
library(gmodels)

if (!require(splitstackshape)) install.packages('splitstackshape')
library(splitstackshape)

if (!require(ggplot2)) install.packages('ggplot2')
library(ggplot2)

if (!require(reshape2)) install.packages('reshape2')
library(reshape2)

if (!require(data.table)) install.packages('data.table')
library(data.table)

if (!require(magrittr)) install.packages('magrittr')
library(magrittr)

if (!require(scales)) install.packages('scales')
library(scales)

if (!require(tm)) install.packages('tm')
library(tm)

if (!require(SDMTools)) install.packages('SDMTools')
library(SDMTools)

if (!require(dplyr)) install.packages('dplyr')
library(dplyr)
 
 if (!require(writexl)) install.packages('writexl')
 library(writexl)

if (!require(plotly)) install.packages('plotly')
library(plotly)

if (!require(tibble)) install.packages('tibble')
library(tibble)

if (!require(plyr)) install.packages('plyr')
library(plyr)
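#NOTE: plyr IS LOADED AFTER dplyr AND MASKS SEVERAL dplyr FUNCTIONS;
#dplyr CALLS BELOW ARE WRITTEN WITH THE EXPLICIT dplyr:: PREFIX (e.g., dplyr::filter, dplyr::select)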

if (!require(tidyr)) install.packages('tidyr')
library(tidyr)

if (!require(stringr)) install.packages('stringr')
library(stringr)

if (!require(ggrepel)) install.packages('ggrepel')
library(ggrepel)

if (!require(MASS)) install.packages('MASS')
library(MASS)

if (!require(foreign)) install.packages('foreign')
library(foreign)

if (!require(sandwich)) install.packages('sandwich')
library(sandwich)

if (!require(lmtest)) install.packages('lmtest')
library(lmtest)

if (!require(corrplot)) install.packages('corrplot')
library(corrplot)

if (!require(RColorBrewer)) install.packages('RColorBrewer')
library(RColorBrewer)

if (!require(xtable)) install.packages('xtable')
library(xtable)

if (!require(Hmisc)) install.packages('Hmisc')
library(Hmisc)

if (!require(car)) install.packages('car')
library(car)

if (!require(readr)) install.packages('readr')
library(readr)

######################FUNCTIONS######################
 #MOVE COLUMNS--NOT MINE
 moveme <- function (invec, movecommand) {
   movecommand <- lapply(strsplit(strsplit(movecommand, ";")[[1]], 
                                  ",|\\s+"), function(x) x[x != ""])
   movelist <- lapply(movecommand, function(x) {
     Where <- x[which(x %in% c("before", "after", "first", 
                               "last")):length(x)]
     ToMove <- setdiff(x, Where)
     list(ToMove, Where)
   })
   myVec <- invec
   for (i in seq_along(movelist)) {
     temp <- setdiff(myVec, movelist[[i]][[1]])
     A <- movelist[[i]][[2]][1]
     if (A %in% c("before", "after")) {
       ba <- movelist[[i]][[2]][2]
       if (A == "before") {
         after <- match(ba, temp) - 1
       }
       else if (A == "after") {
         after <- match(ba, temp)
       }
     }
     else if (A == "first") {
       after <- 0
     }
     else if (A == "last") {
       after <- length(myVec)
     }
     myVec <- append(temp, values = movelist[[i]][[1]], after = after)
   }
   myVec
 }
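  #e.g., moveme(c("a","b","c"), "c first") returns c("c","a","b"); typical use is df[moveme(names(df), "some_column first")]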
 
############Calculate proportions with or without "dontknow"
#data == dataframe
#agg_var == Bare (unquoted) name of the geographic aggregation column (e.g., lga)
#indicator_index == Column INDEX of the single column to be aggregated
#dont_denom == TRUE to keep "dontknow" responses in the denominator; FALSE to drop them
make_proportions <- function(data,agg_var,indicator_index, dont_denom){
  var_name <- colnames(data)[indicator_index]
  locationz <- as.vector(unique(data[grep(paste0("^",deparse(substitute(agg_var)),"$"),colnames(data))]))
  #locationz <- sort(locationz, decreasing=TRUE)
  data<- add_column(data, onesz = 1)
  if(dont_denom == FALSE){
    data  <- data %>%
      dplyr ::filter(!is.na(data[grep(paste0("^",var_name,"$"), colnames(data))])) 
    data  <- dplyr:: filter_(data, paste(var_name,"!=", "'dontknow'",sep=" "))
  } else {
    data  <- data %>%
      dplyr ::  filter(!is.na(data[grep(paste0("^",var_name,"$"), colnames(data))]))
  }
  agg_var_indicator <- as.formula(paste0(deparse(substitute(agg_var)),"~",var_name))  
  result <- data %>% dcast(agg_var_indicator, fun.aggregate = sum,value.var="onesz", na.rm=TRUE)
  namez <- paste0(names(result)[2:ncol(result)],"_",var_name)
  names(result)[2:ncol(result)] <- namez
  result<- add_column(result, total_respondents = rowSums(result[2:ncol(result)]))
  denom_column <- ncol(result)
  props <- list()
  for(i in 2:(ncol(result)-1)){
    props[[i]]<-  result[i]/result[denom_column]
  }
  props[sapply(props, is.null)] <- NULL
  props <- as.data.frame(props)
  names(props) <- paste0("pr_",names(props))
  result <- data.frame(result, props )
  result<-merge(locationz, result, by=deparse(substitute(agg_var)) , all.x=TRUE)
  return(result)
}
#aa<- make_proportions(nonnumeric,lga, 3,TRUE)
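#ILLUSTRATIVE SKETCH (toy data and column names assumed for illustration; commented out so it never runs):
#toy <- data.frame(lga = c("A","A","B"), water_source = c("yes","no","yes"))
#make_proportions(toy, lga, 2, TRUE)
#returns one row per lga with a count column per response value (no_water_source, yes_water_source),
#a total_respondents column, and matching pr_ proportion columns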

########ADD COLUMN IF DOES NOT EXIST########
#data == dataframe
#cname == single column name or vector of column names
add_col_nonexist <- function(data, cname) {
  add <-cname[!cname%in%names(data)]
  if(length(add)!=0) data[add] <- 0
  data
}
#add_col_nonexist(togo, c("No","No.information","Yes"))
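#e.g., add_col_nonexist(data.frame(Yes = 1:2), c("No","Yes")) adds a zero-filled "No" column (toy data)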


############RANK VALUES##################
#df == Dataframe containing ONLY the aggregation-unit column plus the numeric columns to be ranked
#aggunit == IN QUOTATIONS: Name of the aggregation-unit column
#toprank == Number of top values to keep (e.g., 5 returns the 5 largest values per unit)
rank_money <- function(df, aggunit, toprank) {
  callag <- melt(df, id.vars = c(aggunit))
  id_index <- grep(paste("^",aggunit,"$", sep=""),colnames(callag))
  unique_units <- unique(callag[id_index])
  unique_units<-as.data.frame(unique_units)
  snowflakes <- vector("list")
  for (i in 1:nrow(unique_units)){
    snowflakes[[i]] <- subset(callag, get(aggunit) == unique_units[i,])
  }
  snowflakes<-  lapply(snowflakes, function(x) x[!duplicated(x), ])
  sorted_dataframes_list <- lapply(snowflakes, function(df){
    df[order(df$value,decreasing = TRUE),]
  })
  rankked <- lapply(sorted_dataframes_list,head,n=toprank)
  castedd <- lapply(rankked, function(df){
    units_variable <- as.formula(paste0(as.symbol(aggunit),"~", "factor(",as.symbol("variable"),",levels=unique(",as.symbol("variable"),"))","+",as.symbol("value")))
    dcast(df, units_variable) 
  }) 
  trimcast <- lapply(castedd, function(df){
    sub("_[^_]+$", "", names(df[2:(toprank+1)]))
  })
  for (k in 1: nrow(unique_units)){
    for (j in (toprank+2):(toprank+1+toprank)){
      castedd[[k]][j]<-NA
    }
  }
  for (k in 1: nrow(unique_units)){
    for (j in 1: toprank){
      castedd[[k]][j+toprank+1] <- trimcast[[k]][j] 
    }
  }
  named <-c()  
  for (h in 1:toprank){
    named[h] <- paste0("rank",h,sep="")
  }
  ranknamed <-c() 
  for (l in 1:toprank ){
    ranknamed[l] <- paste0("name",l,sep="")
  }
  titles <- c("geounit", named,ranknamed)
  castedd <- lapply(castedd, setNames, titles)
  locations <- df[grep(paste0("^",aggunit,"$"),colnames(df))]
  locations <- unique(locations)
  ordername <- data.frame(matrix(unlist(castedd), nrow=nrow(unique_units), byrow=T),stringsAsFactors=FALSE)
  colnames(ordername) <- titles
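  #NOTE: "1: toprank+1" evaluates to (1:toprank)+1, i.e., columns 2 to (toprank+1) -- the rank-value columns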
  for (j in 1: toprank+1){
    ordername[j]<-round(as.numeric(unlist(ordername[j])),4)
  }
  ordername$geounit<-locations
  ordername[ordername == 0] <- NA
  names(ordername)[1]<-aggunit
  for(i in 2:(1+toprank)){
    ordername[,i+toprank] <- ifelse(is.na(ordername[,i]),NA,ordername[,i+toprank])
  }
  return(ordername)
}
#aaa <- rank_money(push_reasons, "lga_group", 3)
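#ILLUSTRATIVE SKETCH (toy data, commented out so it never runs):
#toy <- data.frame(lga_group = c("A","B"), food = c(0.7, 0.2), water = c(0.3, 0.8))
#rank_money(toy, "lga_group", 2)
#returns one row per lga_group, with rank1/rank2 holding the largest values and name1/name2 the matching column names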



############RANK LOWEST VALUES##################
#df == Dataframe containing ONLY the aggregation-unit column plus the numeric columns to be ranked
#aggunit == IN QUOTATIONS: Name of the aggregation-unit column
#toprank == Number of lowest values to keep (e.g., 5 returns the 5 smallest values per unit)
rank_least <- function(df, aggunit, toprank) {
  callag <- melt(df, id.vars = c(aggunit))
  id_index <- grep(paste("^",aggunit,"$", sep=""),colnames(callag))
  unique_units <- unique(callag[id_index])
  unique_units<-as.data.frame(unique_units)
  snowflakes <- vector("list")
  for (i in 1:nrow(unique_units)){
    snowflakes[[i]] <- subset(callag, get(aggunit) == unique_units[i,])
  }
  snowflakes<-  lapply(snowflakes, function(x) x[!duplicated(x), ])
  sorted_dataframes_list <- lapply(snowflakes, function(df){
    df[order(df$value,decreasing =  FALSE),]
  })
  rankked <- lapply(sorted_dataframes_list,head,n=toprank)
  castedd <- lapply(rankked, function(df){
    units_variable <- as.formula(paste0(as.symbol(aggunit),"~", "factor(",as.symbol("variable"),",levels=unique(",as.symbol("variable"),"))","+",as.symbol("value")))
    dcast(df, units_variable) 
  }) 
  trimcast <- lapply(castedd, function(df){
    sub("_[^_]+$", "", names(df[2:(toprank+1)]))
  })
  for (k in 1: nrow(unique_units)){
    for (j in (toprank+2):(toprank+1+toprank)){
      castedd[[k]][j]<-NA
    }
  }
  for (k in 1: nrow(unique_units)){
    for (j in 1: toprank){
      castedd[[k]][j+toprank+1] <- trimcast[[k]][j] 
    }
  }
  named <-c()  
  for (h in 1:toprank){
    named[h] <- paste0("rank",h,sep="")
  }
  ranknamed <-c() 
  for (l in 1:toprank ){
    ranknamed[l] <- paste0("name",l,sep="")
  }
  titles <- c("geounit", named,ranknamed)
  castedd <- lapply(castedd, setNames, titles)
  locations <- df[grep(paste0("^",aggunit,"$"),colnames(df))]
  locations <- unique(locations)
  ordername <- data.frame(matrix(unlist(castedd), nrow=nrow(unique_units), byrow=T),stringsAsFactors=FALSE)
  colnames(ordername) <- titles
  for (j in 1: toprank+1){
    ordername[j]<-round(as.numeric(unlist(ordername[j])),4)
  }
  ordername$geounit<-locations
  return(ordername)
}
#aaa <- rank_least(push_reasons, "lga_group", 3)
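#rank_least is identical to rank_money except values are sorted ascending, so rank1 holds the SMALLEST value per unit
#(and it does not replace zeros with NA)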

 #########PIVOT/AGGREGATE SETTLEMENT-LEVEL DATA (SETTLEMENT PROPORTIONS)#########
 #dataset == dataframe with data to be aggregated 
 #first_column_to_agg == IN QUOTATIONS: Name of the first column to be aggregated -- all columns to aggregate must be consecutive
 #last_column_to_agg == IN QUOTATIONS: Name of the last column to be aggregated -- all columns to aggregate must be consecutive
 #geo_agg_level == IN QUOTATIONS: Name of the desired geographic aggregation level
 agg_pivot <- function(dataset,first_column_to_agg,last_column_to_agg,geo_agg_level){
   results <- list()
   dataset<-add_column(dataset, oneszz = 1)
   for(i in grep(paste0("^",first_column_to_agg,"$"),colnames(dataset)):grep(paste0("^",last_column_to_agg,"$"),colnames(dataset))){
     #ONE QUESTION AT A TIME
     units_variable <- as.formula(paste0(geo_agg_level,"~",colnames(dataset[i]))) 
     colnamedd <- colnames(dataset[i])  
     agg_level <- dataset %>% dcast(units_variable, fun.aggregate = sum,value.var="oneszz", na.rm=TRUE)
     agg_level <- add_col_nonexist(agg_level, c("dontknow","SL","NC"))
     agg_level<-as.data.frame(agg_level)
     #REMOVE "dontknow" "SL" "NC" responses from aggregation
     agg_level<-agg_level[ , -which(names(agg_level) %in% c("dontknow","SL","NC"))]
     agg_level<-as.data.frame(agg_level)
     cols_agg_level <- ncol(agg_level) 
     if(ncol(agg_level)>1){
       colnames(agg_level)[2:cols_agg_level] <- paste0(colnamedd,"_",colnames(agg_level)[2:cols_agg_level])
       for(k in 2:cols_agg_level){
         agg_level[k+(cols_agg_level-1)] <- agg_level[k]/rowSums(agg_level[2:cols_agg_level])
         colnames(agg_level)[k+(cols_agg_level-1)] <- paste0("pr_",colnames(agg_level)[k+(cols_agg_level-1)])
         names(agg_level) <- gsub(x = names(agg_level), pattern = "\\.1", replacement = "") 
       }
     }else{ agg_level <- NULL
     }
     results[[i]] <- agg_level
   }
   #REMOVE NULLS
   results[sapply(results, is.null)] <- NULL
   #EXPORT
   agg_pivots <- as.data.frame(results)
   agg_pivots<-agg_pivots[!grepl("lga_location_real.", colnames(agg_pivots))]
   return(agg_pivots)
 }
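 #e.g., (illustrative; mirrors the calls below) agg_pivot(h2r_cleaned_data, "D_D1_hc_now", "L_idp_leadership", "state_lga_ward")
 #returns one wide row per geographic unit with <question>_<response> counts plus pr_ proportion columns;
 #"dontknow", "SL" and "NC" responses are dropped before proportions are computed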
 
 
####################################BEGIN AGGREGATION###################################
##REMOVE SPECIAL CHARACTERS AND THE "consent_ok_hoh_ok_" PREFIX FROM HEADERS
names(h2r_cleaned_data) <- names(h2r_cleaned_data) %>%
  gsub("/", "_", .) %>%
  gsub("-", "_", .) %>%
  gsub("'", "", .) %>%
  gsub("consent_ok_hoh_ok_", "", .)
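#e.g., a raw header such as "consent_ok_hoh_ok/D_D1_hc_now" (hypothetical) becomes "D_D1_hc_now"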
h2r_cleaned_data<-as.data.frame(h2r_cleaned_data)
#CREATE COMBINED LGA AND GROUP VARIABLE TO AGGREGATE BY
h2r_cleaned_data$lga_group <- paste0(h2r_cleaned_data$lga,".",h2r_cleaned_data$group)
h2r_cleaned_data$onesz <-1 

#MAKE LOCATION COLUMN
h2r_cleaned_data$ward_location_real <- paste0(h2r_cleaned_data$C_info_state,";",h2r_cleaned_data$C_info_lga,";",h2r_cleaned_data$C_info_ward)
h2r_cleaned_data$lga_location_real <- paste0(h2r_cleaned_data$C_info_state,";",h2r_cleaned_data$C_info_lga)

#GIS SETTLEMENT COUNTS--WARDS
#READ GIS FILE
gis <- read_excel(paste0(gis_working_directory,"/",gis_file_name), sheet = gis_sheet_name)
gis$onesz <- 1 
gis$state_lga_ward <- paste0(tolower(gis$ADM1_EN),";",gis$ADM2_PCODE,";",gis$ADM3_PCODE)
gis_ward <- gis %>% dcast(state_lga_ward~onesz, fun.aggregate = sum,value.var="Number_of_settlements", na.rm=TRUE)
colnames(gis_ward)[2] <- "gis_settlment_cnt"
gis_ward$state_lga_ward %>%
  gsub("&", "_", .) %>%
  gsub(" ", "_", .) 
#LGA LEVEL--GIS COUNT
gis$state_lga <- paste0(tolower(gis$ADM1_EN),";",gis$ADM2_PCODE)
gis_lga <- gis %>% dcast(state_lga~onesz, fun.aggregate = sum,value.var="Number_of_settlements", na.rm=TRUE)
colnames(gis_lga)[2] <- "gis_settlment_cnt"

h2r_cleaned_data <- as.data.frame(h2r_cleaned_data)

#H2R SETTLEMENT COUNTS--WARDS
h2r_cleaned_data$onesz <- 1
h2r_cleaned_data$state_lga_ward <- paste0(tolower(h2r_cleaned_data$C_info_state),";",h2r_cleaned_data$C_info_lga,";",h2r_cleaned_data$C_info_ward)
wards_settlment_cnt <- dcast(h2r_cleaned_data, state_lga_ward~onesz, fun.aggregate = sum,value.var="onesz", na.rm=TRUE)
colnames(wards_settlment_cnt)[2] <- "h2r_settlement_cnt"

##########################WARDS PIVOTS##########################

#NUM OF KIs--WARDS
#CONVERT ki_coverage TO NUMERIC
h2r_cleaned_data$ki_coverage %<>% as.numeric
wards_ki_cnt <-  dcast(h2r_cleaned_data,state_lga_ward ~onesz, fun.aggregate = sum,value.var="ki_coverage", na.rm=TRUE)
colnames(wards_ki_cnt)[2] <- "ki_cnt"

#RUN PIVOT
ward_level_pivots <- agg_pivot(h2r_cleaned_data,"D_D1_hc_now","L_idp_leadership","state_lga_ward")

#CHECK GIS
ward_gis_joined <- merge(ward_level_pivots,gis_ward, by="state_lga_ward", all.x=TRUE)
ward_gis_select <- subset(ward_gis_joined,select=c(state_lga_ward,gis_settlment_cnt))
ward_gis_select <- merge(ward_gis_select,wards_settlment_cnt, by="state_lga_ward",all.x=TRUE)
ward_gis_select$pr_settlments <-  ward_gis_select$h2r_settlement_cnt/ward_gis_select$gis_settlment_cnt
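#ROUND THE COVERAGE RATIO UP TO THE NEXT MULTIPLE OF 0.005 (HALF A PERCENT) BEFORE COMPARING TO threshold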
ward_gis_select$pr_settlments <- ceiling(ward_gis_select$pr_settlments/0.005)*0.005
ward_gis_select$over_threshold_wards <- ifelse(ward_gis_select$pr_settlments>=threshold,1,0)
ward_gis_select <- merge(ward_gis_select, wards_ki_cnt, by= "state_lga_ward",all.x=TRUE)
#write.csv(ward_level_pivots, "lookat2.csv")
#JOIN GIS SETTLEMENT COUNTS AND KEEP ONLY WARDS AT OR ABOVE THE SETTLEMENT-COVERAGE THRESHOLD
ward_level_pivots <- merge(ward_level_pivots, ward_gis_select, by="state_lga_ward", all.x=TRUE)
ward_level_pivots <- dplyr::filter(ward_level_pivots,over_threshold_wards==1)
ward_gis_select[grep("over_threshold_wards", colnames(ward_gis_select))] <- paste0("over_",as.character(threshold),"_wards")  #OVERWRITES THE FLAG WITH A LABEL OF THE THRESHOLD USED; ward_gis_select IS NOT REFERENCED AGAIN

#SEPARATE GEOGRAPHIES & EXPORT
ward_level_pivots<-  ward_level_pivots %>% separate(state_lga_ward, c("State", "LGA","Ward"),sep =";")
ward_level_pivots <- dplyr:: select(ward_level_pivots, -contains("state_lga_ward"))
setwd(ward_working_directory)
write.csv(ward_level_pivots,paste0("WARDS_",ward_named_dataset,".csv"),na="",row.names = FALSE)

####################################BEGIN LGA AGGREGATION###################################
setwd(settlement_working_directory)
h2r_cleaned_data <- read_csv(paste0(h2r_csv,".csv"))
##REMOVE SPECIAL CHARACTERS AND THE "consent_ok_hoh_ok_" PREFIX FROM HEADERS
names(h2r_cleaned_data) <- names(h2r_cleaned_data) %>%
  gsub("/", "_", .) %>%
  gsub("-", "_", .) %>%
  gsub("'", "", .) %>%
  gsub("consent_ok_hoh_ok_", "", .)
h2r_cleaned_data<-as.data.frame(h2r_cleaned_data)
#CREATE COMBINED LGA AND GROUP VARIABLE TO AGGREGATE BY
h2r_cleaned_data$lga_group <- paste0(h2r_cleaned_data$lga,".",h2r_cleaned_data$group)
h2r_cleaned_data$onesz <-1 

#MAKE LOCATION COLUMN
h2r_cleaned_data$state_lga <- paste0(tolower(h2r_cleaned_data$C_info_state),";",h2r_cleaned_data$C_info_lga)
h2r_cleaned_data <- as.data.frame(h2r_cleaned_data)

#H2R SETTLEMENT COUNTS
h2r_cleaned_data$onesz <-1
lga_settlment_cnt <- h2r_cleaned_data %>% dcast(state_lga~onesz, fun.aggregate = sum,value.var="onesz", na.rm=TRUE)
colnames(lga_settlment_cnt)[2] <- "h2r_settlement_cnt"

#NUM OF KIs--LGAS
#CONVERT ki_coverage TO NUMERIC
h2r_cleaned_data$ki_coverage %<>% as.numeric
lga_ki_cnt <-  dcast(h2r_cleaned_data,state_lga ~onesz, fun.aggregate = sum,value.var="ki_coverage", na.rm=TRUE)
colnames(lga_ki_cnt)[2] <- "ki_cnt"
##########################LGA PIVOTS##########################
#RUN PIVOT
lga_level_pivots <- agg_pivot(h2r_cleaned_data,"D_D1_hc_now","L_idp_leadership","state_lga")
lga_to_bind <- lga_level_pivots
colnames(lga_to_bind)[1] <- "aggregation_level"

#GIS CHECK
lga_gis_joined <- merge(lga_level_pivots,gis_lga, by="state_lga", all.x=TRUE)
lga_look_gis_percent <- subset(lga_gis_joined, select=c(state_lga,gis_settlment_cnt))
lga_look_gis_percent <- merge(lga_look_gis_percent,lga_settlment_cnt, by="state_lga", all.x=TRUE)
lga_look_gis_percent$pr_settlments <- lga_look_gis_percent$h2r_settlement_cnt/lga_look_gis_percent$gis_settlment_cnt
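#ROUND UP TO THE NEXT MULTIPLE OF 0.005 BEFORE COMPARING TO threshold (SAME RULE AS THE WARD STEP)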
lga_look_gis_percent$pr_settlments <- ceiling(lga_look_gis_percent$pr_settlments/0.005)*0.005
lga_look_gis_percent$over_threshold_lgas <- ifelse(lga_look_gis_percent$pr_settlments>=threshold,1,0)
#JOIN GIS SETTLEMENT COUNTS AND SUBSET ONLY THOSE OVER THE THRESHOLD OF % OF SETTLEMENTS
lga_level_pivots <- merge(lga_level_pivots, lga_look_gis_percent, by="state_lga", all.x=TRUE)
lga_level_pivots <- merge(lga_level_pivots, lga_ki_cnt, by="state_lga", all.x=TRUE)
allreported_gis_settlement_cnt <- sum(lga_level_pivots$gis_settlment_cnt)
lga_level_pivots <- dplyr::filter(lga_level_pivots,over_threshold_lgas==1)
#print(getwd())  #DEBUG
#write.csv(lga_level_pivots,"look.csv")  #DEBUG OUTPUT
if(nrow(lga_level_pivots)==0){
  lga_level_pivots <- dplyr::select(lga_level_pivots, -contains("state_lga."))
  print("NO LGAs AT OR ABOVE THRESHOLD")
} else{
  lga_level_pivots <- dplyr::select(lga_level_pivots, -contains("state_lga."))
  lga_level_pivots[grep("over_threshold_lgas", colnames(lga_level_pivots))] <- paste0("over_",as.character(threshold),"_lga")
  #SEPARATE GEOGRAPHIES & EXPORT
  lga_level_pivots <- lga_level_pivots %>% separate(state_lga, c("State", "LGA"),sep =";")
  setwd(lga_working_directory)
  #write.csv(lga_level_pivots,paste0(lga_named_dataset,".csv"),na="")
}


####################################BEGIN GLOBAL AGGREGATION###################################
setwd(settlement_working_directory)
h2r_cleaned_data <- read_csv(paste0(h2r_csv,".csv"))
##REMOVE SPECIAL CHARACTERS AND THE "consent_ok_hoh_ok_" PREFIX FROM HEADERS
names(h2r_cleaned_data) <- names(h2r_cleaned_data) %>%
  gsub("/", "_", .) %>%
  gsub("-", "_", .) %>%
  gsub("'", "", .) %>%
  gsub("consent_ok_hoh_ok_", "", .)
h2r_cleaned_data<-as.data.frame(h2r_cleaned_data)
#CREATE COMBINED LGA AND GROUP VARIABLE TO AGGREGATE BY
h2r_cleaned_data$lga_group <- paste0(h2r_cleaned_data$lga,".",h2r_cleaned_data$group)
h2r_cleaned_data$onesz <-1 

h2r_cleaned_data <- as.data.frame(h2r_cleaned_data)

#H2R SETTLEMENT COUNTS
h2r_cleaned_data$onesz <-1
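#CASTING onesz AGAINST ITSELF SIMPLY COUNTS THE ASSESSED SETTLEMENTS (ROWS); DROPPING COLUMN 1 LEAVES THAT SINGLE COUNT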
global_settlment_cnt <- h2r_cleaned_data %>% dcast(onesz~onesz, fun.aggregate = sum,value.var="onesz", na.rm=TRUE)
global_settlment_cnt[1] <- NULL

#NUM OF KIs--GLOBAL
#CONVERT ki_coverage TO NUMERIC
h2r_cleaned_data$ki_coverage %<>% as.numeric
global_ki_cnt <-  dcast(h2r_cleaned_data,onesz ~onesz, fun.aggregate = sum,value.var="ki_coverage", na.rm=TRUE)
global_ki_cnt[1] <- NULL

##########################GLOBAL PIVOTS##########################
#RUN PIVOT
global_level_pivots <- agg_pivot(h2r_cleaned_data,"D_D1_hc_now","L_idp_leadership","onesz")

global_level_pivots <- global_level_pivots %>% dplyr:: select(-contains("onesz")) #%>% dplyr:: select(contains("pr_"))
global_level_pivots$gis_settlment_cnt <- allreported_gis_settlement_cnt
global_level_pivots$h2r_settlement_cnt <- global_settlment_cnt[1,]
global_level_pivots$ki_cnt <- global_ki_cnt[1,]

setwd(global_working_directory)
write.csv(global_level_pivots,paste0("WIDE_",global_named_dataset,".csv"),na="")

###COMBINE LGA & GLOBAL RESPONSES
#REMOVE EXTRA COLUMNS FROM LGA FILE
lga_level_pivots[grep("over_threshold_lgas" ,colnames(lga_level_pivots))]<-NULL
lga_level_pivots[grep("pr_settlments" ,colnames(lga_level_pivots))]<-NULL

#REMOVE ID COLUMN AND HARMONIZE HEADERS
lga_level_pivots$state_lga <- paste0(lga_level_pivots$State,";",lga_level_pivots$LGA)
lga_level_pivots$State <- NULL 
lga_level_pivots$LGA <- NULL
lga_level_pivots <- lga_level_pivots[moveme(names(lga_level_pivots), "state_lga first")]
colnames(lga_level_pivots)[1] <- "geo_aggregation"
global_level_pivots$geo_aggregation <- "global"
global_level_pivots <- global_level_pivots[moveme(names(global_level_pivots), "geo_aggregation first")]

#CHECK HEADERS
lga_global_match <- names(lga_level_pivots) ==  names(global_level_pivots)
if(any(lga_global_match==FALSE)){
  print("LGA & GLOBAL HEADER MISMATCH")
} else{
  print("LGA & GLOBAL HEADERS MATCH")
}
#STACK FILES
lga_global_stacked <- rbind(lga_level_pivots,global_level_pivots)
lga_global_stacked <- lga_global_stacked %>% separate(geo_aggregation, c("State", "LGA"),sep =";")
#GET LGA NAMES
lga_info <- gis %>% dplyr:: select(ADM2_EN,ADM2_PCODE) %>% dplyr:: distinct(ADM2_EN,ADM2_PCODE)
colnames(lga_info)[2]<-"LGA"
lga_global_stacked <- merge(lga_global_stacked,lga_info, by="LGA", all.x = TRUE)
lga_global_stacked <- lga_global_stacked[moveme(names(lga_global_stacked), "ADM2_EN first")]

#KEEP GEO COLUMNS
geo_cols <- lga_global_stacked %>% dplyr::select(ADM2_EN,LGA,State)
#MAP COLUMNS
maps <- c("@assessed_settlements","@prop_org_pop_remain","@idp_reside_network",
          "@idp_movement_network", "@prop_idp_in_settlement","@prop_returnee",
  "@prop_access_food","@prop_allowed_buy_markets","@prop_idp_access_food",
  "@access_land_cultivation", "@time_access_heath_foot","@feeding_programs",
  "@conflict_civ_killed","@prop_item_ufo","@prop_shelter_destroy",
  "@prop_exist_boreholes", "@prop_funct_boreholes", "@prop_walkable_education")
map_col <- list()
for(f in 1:length(maps)){
  map_col[[f]] <- paste0(maps[f],"_",geo_cols$ADM2_EN)
}
map_col <- data.frame(map_col)
map_col <- as.data.frame(gsub("@", "", as.matrix(map_col))) 
map_col[nrow(geo_cols),] <- ""
names(map_col) <- maps
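#e.g., (illustrative) the "assessed_settlements" entry for an LGA with ADM2_EN "Bama" becomes "assessed_settlements_Bama";
#the final row (assumed to be the global aggregate after the merge ordering) is blanked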

#KEEP SETTLEMENT COLUMNS
settlement_cols <- lga_global_stacked %>% dplyr::select(gis_settlment_cnt,h2r_settlement_cnt,ki_cnt)

#KEEP ONLY PERCENT COLUMNS
lga_global_stacked_props <- lga_global_stacked[ , startsWith(names(lga_global_stacked), "pr_") ]
lga_global_stacked_props <- round(lga_global_stacked_props*100,0)

#COMBINE DATAFRAME PIECES
lga_global_stacked <- cbind(geo_cols,map_col,lga_global_stacked_props,settlement_cols)
#REMOVE FALSE RESPONSE COLUMNS (THE "_no" FILTER BELOW IS LEFT COMMENTED OUT)
lga_global_stacked <- lga_global_stacked %>% dplyr::select(-contains("_FALSE")) #%>%
 #dplyr::select(-ends_with("_no"))
#EXPORT
setwd(lga_working_directory)
write.csv(lga_global_stacked,paste0("LGA_GLOBAL_",lga_named_dataset,".csv"),na="",row.names = FALSE)
#write_xlsx(lga_global_stacked, paste0("LGA_GLOBAL",lga_named_dataset,".xlsx"))