MargotFun: Just a bunch of useful functions to do my job.

Documented in makeCohort

makeCohort <- function(vars,sex="BOTH",initialPath=file.path("s:/cps")){
  require(dplyr)
  require(BERG)

  message("makeCohort() last updated on July 30th 2019")
  message("Cohort only includes data 1982, and 1992-2015.  If followup goes later, please update the function")

  # Pull the master/NUT_92 IDs
  master  <- rbind(readRDS(file.path(initialPath,"master/cps2smgmst17f.rds")),
                   readRDS(file.path(initialPath,"master/cps2smgmst17m.rds")))
  master <- dplyr::select(master,ID,NUT_92)
  keepID <- master$ID[master$NUT_92==1]

  # # Pull a bit from the person year file
  # pyear <- dplyr::select(
  #                rbind(readRDS(file.path(initialPath,"nutrition/pyear/nut92fempyr1216.rds")),
  #                readRDS(file.path(initialPath,"nutrition/pyear/nut92menpyr1216.rds"))),
  #                ID,BDAYDATE,DATEDD)
  #
  # drop <- c(names(master),names(pyear))
  # drop <- drop[!drop %in% "ID"]

  # Reformat the vars vector - all capitalized and no duplicates
  # Also drop ID and SEX, those will be manually added in the function
  newvars <- toupper(vars)
  newvars <- newvars[!duplicated(newvars)]
  newvars <- newvars[!newvars %in% c("ID","SEX")]

  # Check to see if all the variables are available
  # Print an error message if a variable is not available
  allvars <- unique(c(variables$men$Variables, variables$women$Variables))
  have_vars <- newvars[newvars %in% allvars]
  wrong_vars <- newvars[!newvars %in% allvars]
  v <- paste0("The following variables are not available in CPS2 - ",paste(c(wrong_vars), collapse=", "))
  if (length(wrong_vars>0)){
    stop(v)
  }

  # Initialize some input vectors
  menFiles <<- NULL
  menVars <<- NULL
  womenFiles <<- NULL
  womenVars <<- NULL

  # Men and women get different input vectors
  # This will save data-read time
  if (toupper(sex)=="MEN") {
    foo <- dplyr::filter(variables$men,Variables %in% newvars)
    foo$FullPath <- file.path(initialPath,foo$subPath,foo$File)
    menFiles <<- unique(foo$FullPath)
    menVars <<- c("ID",unique(foo$Variables))
  }
  if (toupper(sex)=="WOMEN") {
    foo <- dplyr::filter(variables$women, Variables %in% newvars)
    foo$FullPath <- file.path(initialPath,foo$subPath,foo$File)
    womenFiles <<- unique(foo$FullPath)
    womenVars <<- c("ID",unique(foo$Variables))
  }
  if (toupper(sex)=="BOTH") {
    foo <- dplyr::filter(rbind(variables$men,variables$women),
                         Variables %in% newvars)
    foo$FullPath <- file.path(initialPath,foo$subPath,foo$File)
    menFiles <<- unique(foo$FullPath[foo$Sex=="MEN"])
    menVars <<- c("ID",unique(foo$Variables[foo$Sex=="MEN"]))
    womenFiles <<- unique(foo$FullPath[foo$Sex=="WOMEN"])
    womenVars <<- c("ID",unique(foo$Variables[foo$Sex=="WOMEN"]))
  }

  # Initialize the cluster for parallel processing
  cl <- parallel::makeCluster(parallel::detectCores(logical=T))
  parallel::clusterExport(cl,c("menFiles","menVars","womenFiles","womenVars"))


# If only pulling men or women, the non-included gender will return a NULL object
# This NULL object can still be included in the final bind_rows without error

  # Merge the men's data
  menCohort <- parallel::parLapply(cl,menFiles, function(x){
    dat <- readRDS(x)
    dat <- dat[,names(dat) %in% menVars]
    return(dat)
  }) %>% Reduce(function(x,y) full_join(x,y,"ID"),.)

  # Merge the women's data
  womenCohort <- parallel::parLapply(cl,womenFiles, function(x){
    dat <- readRDS(x)
    dat <- dat[,names(dat) %in% womenVars]
    return(dat)
  }) %>% Reduce(function(x,y) full_join(x,y,"ID"),.)

  # Close the cluster
  parallel::stopCluster(cl)
  rm(cl)

  # Little bit of clean up
  if (toupper(sex)=="MEN") {
    cohort <- menCohort
    cohort$SEX <- "MEN"
      }
  if (toupper(sex)=="WOMEN") {
    cohort <- womenCohort
    cohort$SEX <- "WOMEN"
  }
  if (toupper(sex)=="BOTH") {
    menCohort$SEX <- "MEN"
    womenCohort$SEX <- "WOMEN"
    cohort <- dplyr::bind_rows(menCohort,womenCohort)
  }

  # Subset to those in master file with NUT_92==1
  cohort <- cohort[cohort$ID %in% keepID,]
  cohort <- cohort[!duplicated(cohort$ID),]
  cohort <- dplyr::filter(cohort, ID %in% keepID)


  # Final output messages
  n <- nrow(cohort)
  n.men <- nrow(cohort[cohort$SEX=="MEN",])
  n.women <- nrow(cohort[cohort$SEX=="WOMEN",])

  message("Cohort has been subset to those in the master file AND NUT_92")
  message(paste0("This includes ",n.men," men, and ", n.women," women."))
  rm(menFiles,menVars,womenFiles,womenVars,envir=globalenv())

  # Frequently the DTINT variables come from multiple files
  # This results in DTINT97.x and DTINT97.y
  # They are identical, so going to drop one of them
  v <- names(cohort)[substr(names(cohort),1,5)=="DTINT"] # all the included DTINT
  vx <- v[substr(v,(nchar(v)-1),nchar(v))==".x"] # the .X versions
  vy <- v[substr(v,(nchar(v)-1),nchar(v))==".y"] # the .Y version
  vz <- sub(".x","",vx) # the final versions
  cohort[,vz] <- cohort[,vx]
  cohort <- cohort[,!names(cohort) %in% c(vx,vy)]
  return(cohort)
}