riseR:

####################################################
##            MERGE                               ##
## This file merges the survey files together     ##
## before corrections and analyses are conducted. ##
####################################################


#############################################
# MERGE HOUSE

# remove useless columns
house <- house %>% 
  select (-SubmissionDate, -starttime, -endtime, -deviceid, -subscriberid, -simid, 
          -devicephonenum, -username, -caseid, -length1, -length2, -check_locn, 
          -check_no, -signedyn_pl, 
          -study_pl, -surveys_pl, -instanceID, -instanceName, -formdef_version) %>% 
  arrange(KEY)

house.water <- house.water %>% 
  rename (Number = KEY) %>% 
  select (-`SET-OF-water_repeat`) %>% 
  arrange (PARENT_KEY)

# SORT DATA BY KEYS
# house[order(house$KEY), ]
# house.water[order(house.water$PARENT_KEY), ]

# MERGE BOTH HOUSE SURVEY FILES ON KEY AND PARENT KEY 
house.merge <- merge(x = house, y = house.water, by.x = "KEY", by.y = "PARENT_KEY", all.x = TRUE)
# xx <- full_join (house, house.water, by = c("KEY" = "PARENT_KEY")) #this was a double check - all good

house.merge <- house.merge %>% 
  select (settlement_barcode, extract_house_no, hhd_name, everything()) %>% 
  arrange (settlement_barcode, extract_house_no, hhd_name) 
  
#############################################
# MERGE PERSON-LEVEL DATA FROM HOUSEHOLD SURVEY
# need to merge based on PARENT_KEY and number in household (end of KEY)
#these are all blank

a <- hhd.person %>% 
  filter(is.na(hhd.person$person_gender)) #68 of 427
b <- hhd.person %>% 
  filter(is.na(hhd.person$person_name)) #68
rm(a, b)

# FUNCTION TO GET JUST NUMBER FROM "KEY" - this now works regardless of the number
# ADD NUMBER COLUMN
substrRight <- function(x){
  Number <- x %>% 
    mutate (KEY2 = KEY) %>% 
    select (KEY2, KEY) %>% 
    separate (KEY2, c("x", "Number"), "\\[|\\]") %>% 
    select (-x) %>% 
    mutate (Number = as.numeric(Number))
} 

Number <- substrRight(hhd.person)
hhd.person <- full_join (hhd.person, Number, by = c("KEY" = "KEY"))
rm(Number)

Number <- substrRight(hhd.ethnicity)
hhd.ethnicity <- full_join (hhd.ethnicity, Number, by = c("KEY" = "KEY"))
rm(Number)

Number <- substrRight(hhd.religion)
hhd.religion <- full_join (hhd.religion, Number, by = c("KEY" = "KEY"))
rm(Number)

Number <- substrRight(hhd.marital)
hhd.marital <- full_join (hhd.marital, Number, by = c("KEY" = "KEY"))
rm(Number)

Number <- substrRight(hhd.daycare)
hhd.daycare <- full_join (hhd.daycare, Number, by = c("KEY" = "KEY"))
rm(Number)

Number <- substrRight(hhd.school)
hhd.school <- full_join (hhd.school, Number, by = c("KEY" = "KEY"))
rm(Number)

Number <- substrRight(hhd.read)
hhd.read <- full_join (hhd.read, Number, by = c("KEY" = "KEY"))
rm(Number)

Number <- substrRight(hhd.activity)
hhd.activity <- full_join (hhd.activity, Number, by = c("KEY" = "KEY"))
rm(Number)

#hhd.child is different. name_id == Number in other loops
#so nust use name_id


#remove rows of empty data 
# (these are generated by the survey during the looping - checked and ok; 
# so "Number" won't be chronological and some will be missing)
hhd.activity <- hhd.activity[!is.na(hhd.activity$person_name_activity), ]
hhd.child <- hhd.child[!is.na(hhd.child$name_child), ]
hhd.daycare <- hhd.daycare[!is.na(hhd.daycare$person_name_daycare), ]
hhd.ethnicity <- hhd.ethnicity[!is.na(hhd.ethnicity$person_name_ethnicity), ]
hhd.marital <- hhd.marital[!is.na(hhd.marital$person_name_marital), ]
hhd.person <- hhd.person[!is.na(hhd.person$person_name), ]
hhd.read <- hhd.read[!is.na(hhd.read$person_name_read), ]
hhd.religion <- hhd.religion[!is.na(hhd.religion$person_name_religion), ]
hhd.school <- hhd.school[!is.na(hhd.school$person_name_school), ]

# remove useless columns
hhd.person <- hhd.person %>% 
  select (person_name, person_name_first, person_name_last, child_name_first_final, child_name_last1, child_name_last2,
         person_gender,  child_gender_final, child_gender_pull, child_gender_pull_text, child_gender1, child_gender2, 
         child_age1, child_dob_final, child_age_calc, child_age_final1, child_age_final2,  
         dob, age, age_calc, new_age_final1, person_dob, age_final, age_final2, 
         person_relationship, concat_name_age, PARENT_KEY, KEY, Number)
hhd.activity <- hhd.activity %>% 
  select (-(`SET-OF-activity`), -KEY)
hhd.child <- hhd.child %>% 
  select (-(`SET-OF-child_loop`), -KEY)
hhd.daycare <- hhd.daycare %>% 
  select (-(`SET-OF-daycare`), -KEY)
hhd.ethnicity <- hhd.ethnicity %>% 
  select (-(`SET-OF-ethnicity_repeat`), -KEY)
hhd.marital <- hhd.marital %>% 
  select (-(`SET-OF-marital_status1`), -KEY)
hhd.read <- hhd.read %>% 
  select (-(`SET-OF-read`), -KEY)
hhd.religion <- hhd.religion %>% 
  select (-(`SET-OF-religion_repeat`), -KEY)
hhd.school <- hhd.school %>% 
  select (-(`SET-OF-school`), -KEY)

# pull settlement and house # from hhd survey
x <- hhd %>% 
  select (settlement_barcode, extract_settlement, extract_house_no, hhd_id, KEY, survey_status, today, adult_respondent_name, 
          ethnicity_q, ethnicity_others, ethnicity_hhd_yn, religion_q, religion_others, religion_hhd_yn, list_5plus, no_children)

# SORT DATA BY KEYS
x[order(x$KEY), ]
hhd.person[order(hhd.person$PARENT_KEY, hhd.person$Number), ]
hhd.ethnicity[order(hhd.ethnicity$PARENT_KEY, hhd.ethnicity$Number), ]
hhd.religion[order(hhd.religion$PARENT_KEY, hhd.religion$Number), ]
hhd.marital[order(hhd.marital$PARENT_KEY, hhd.marital$Number), ]
hhd.daycare[order(hhd.daycare$PARENT_KEY, hhd.daycare$Number), ]
hhd.school[order(hhd.school$PARENT_KEY, hhd.school$Number), ]
hhd.read[order(hhd.read$PARENT_KEY, hhd.read$Number), ]
hhd.activity[order(hhd.activity$PARENT_KEY, hhd.activity$Number), ]
hhd.child[order(hhd.child$PARENT_KEY, hhd.child$name_id), ]

#merge together ON KEY AND PARENT KEY 
a <- merge(x = x, y = hhd.person, by.x = "KEY", by.y = "PARENT_KEY", all.x = TRUE)
b <- merge(x = a, y = hhd.ethnicity, by.x = c("KEY", "Number"), by.y = c("PARENT_KEY", "Number"), all = TRUE)
c <- merge(x = b, y = hhd.religion, by.x = c("KEY", "Number"), by.y = c("PARENT_KEY", "Number"), all = TRUE)
d <- merge(x = c, y = hhd.marital, by.x = c("KEY", "Number"), by.y = c("PARENT_KEY", "Number"), all = TRUE)
e <- merge(x = d, y = hhd.daycare, by.x = c("KEY", "Number"), by.y = c("PARENT_KEY", "Number"), all = TRUE)
f <- merge(x = e, y = hhd.school, by.x = c("KEY", "Number"), by.y = c("PARENT_KEY", "Number"), all = TRUE)
g <- merge(x = f, y = hhd.read, by.x = c("KEY", "Number"), by.y = c("PARENT_KEY", "Number"), all = TRUE) %>% 
  rename(KEY.y1 = KEY.y)
h <- merge(x = g, y = hhd.activity, by.x = c("KEY", "Number"), by.y = c("PARENT_KEY", "Number"), all = TRUE)
hhd.merge <- full_join (h, hhd.child, by = c("KEY" = "PARENT_KEY", 
                                             "Number" = "name_id"))
rm(x, a, b, c, d, e, f, g, h)

rm(hhd.activity, hhd.child, hhd.daycare, hhd.ethnicity, hhd.marital, hhd.person, hhd.read,
   hhd.religion, hhd.school, house, house.water)


#############
# FIX DOB - WILL HAVE TO ESTIMATE FOR THOSE THAT ONLY ENTERED AGE
#############
#DOB and age - remember that DOB was optional ***
check <- hhd.merge %>% 
  select (settlement_barcode, extract_house_no, hhd_id, today, person_name,  
          child_age1, child_dob_final, child_age_calc, child_age_final1, child_age_final2,  
          dob, age, age_calc, new_age_final1, person_dob, age_final, age_final2) %>% # age_child, dob_child - these are just pulled through
  filter (age<=5 | child_age1<=5) %>%  
  filter (person_dob == "2000-02-29") #these still need to be manually fixed if possible
rm(check)



#now I need to "estimate" DOB for those where age was entered
hhd.merge <- hhd.merge %>%  
  rename (person_dob1 = person_dob) %>% 
  mutate (person_dob2 = today - 365*age_final) %>% 
  mutate (person_dob = if_else(!is.na(child_age1) | !is.na(age), person_dob2, person_dob1)) %>% 
  select (-person_dob1, -person_dob2)


#############

#FIX ETHNICITY AND RELIGION - infill
# hhd.merge 
# a <- hhd.merge %>% 
#   mutate (ethnic = as.numeric(if_else(ethnicity_others == 1 & is.na(ethnicity), ethnicity_q, ethnicity)))  %>% 
#   mutate (relig = if_else(religion_others == 1 & is.na(religion), religion_q, religion))  %>% 
#   select (-ethnicity, -ethnicity_q, -ethnicity_others, -ethnicity_hhd_yn, 
#           -religion, -religion_q, -religion_others, -religion_hhd_yn) %>% 
#   rename (religion = relig, ethnicity = ethnic)