makeCohort <- function(vars,sex="BOTH",initialPath=file.path("s:/cps")){
require(dplyr)
require(BERG)
message("makeCohort() last updated on July 30th 2019")
message("Cohort only includes data 1982, and 1992-2015. If followup goes later, please update the function")
# Pull the master/NUT_92 IDs
master <- rbind(readRDS(file.path(initialPath,"master/cps2smgmst17f.rds")),
readRDS(file.path(initialPath,"master/cps2smgmst17m.rds")))
master <- dplyr::select(master,ID,NUT_92)
keepID <- master$ID[master$NUT_92==1]
# # Pull a bit from the person year file
# pyear <- dplyr::select(
# rbind(readRDS(file.path(initialPath,"nutrition/pyear/nut92fempyr1216.rds")),
# readRDS(file.path(initialPath,"nutrition/pyear/nut92menpyr1216.rds"))),
# ID,BDAYDATE,DATEDD)
#
# drop <- c(names(master),names(pyear))
# drop <- drop[!drop %in% "ID"]
# Reformat the vars vector - all capitalized and no duplicates
# Also drop ID and SEX, those will be manually added in the function
newvars <- toupper(vars)
newvars <- newvars[!duplicated(newvars)]
newvars <- newvars[!newvars %in% c("ID","SEX")]
# Check to see if all the variables are available
# Print an error message if a variable is not available
allvars <- unique(c(variables$men$Variables, variables$women$Variables))
have_vars <- newvars[newvars %in% allvars]
wrong_vars <- newvars[!newvars %in% allvars]
v <- paste0("The following variables are not available in CPS2 - ",paste(c(wrong_vars), collapse=", "))
if (length(wrong_vars>0)){
stop(v)
}
# Initialize some input vectors
menFiles <<- NULL
menVars <<- NULL
womenFiles <<- NULL
womenVars <<- NULL
# Men and women get different input vectors
# This will save data-read time
if (toupper(sex)=="MEN") {
foo <- dplyr::filter(variables$men,Variables %in% newvars)
foo$FullPath <- file.path(initialPath,foo$subPath,foo$File)
menFiles <<- unique(foo$FullPath)
menVars <<- c("ID",unique(foo$Variables))
}
if (toupper(sex)=="WOMEN") {
foo <- dplyr::filter(variables$women, Variables %in% newvars)
foo$FullPath <- file.path(initialPath,foo$subPath,foo$File)
womenFiles <<- unique(foo$FullPath)
womenVars <<- c("ID",unique(foo$Variables))
}
if (toupper(sex)=="BOTH") {
foo <- dplyr::filter(rbind(variables$men,variables$women),
Variables %in% newvars)
foo$FullPath <- file.path(initialPath,foo$subPath,foo$File)
menFiles <<- unique(foo$FullPath[foo$Sex=="MEN"])
menVars <<- c("ID",unique(foo$Variables[foo$Sex=="MEN"]))
womenFiles <<- unique(foo$FullPath[foo$Sex=="WOMEN"])
womenVars <<- c("ID",unique(foo$Variables[foo$Sex=="WOMEN"]))
}
# Initialize the cluster for parallel processing
cl <- parallel::makeCluster(parallel::detectCores(logical=T))
parallel::clusterExport(cl,c("menFiles","menVars","womenFiles","womenVars"))
# If only pulling men or women, the non-included gender will return a NULL object
# This NULL object can still be included in the final bind_rows without error
# Merge the men's data
menCohort <- parallel::parLapply(cl,menFiles, function(x){
dat <- readRDS(x)
dat <- dat[,names(dat) %in% menVars]
return(dat)
}) %>% Reduce(function(x,y) full_join(x,y,"ID"),.)
# Merge the women's data
womenCohort <- parallel::parLapply(cl,womenFiles, function(x){
dat <- readRDS(x)
dat <- dat[,names(dat) %in% womenVars]
return(dat)
}) %>% Reduce(function(x,y) full_join(x,y,"ID"),.)
# Close the cluster
parallel::stopCluster(cl)
rm(cl)
# Little bit of clean up
if (toupper(sex)=="MEN") {
cohort <- menCohort
cohort$SEX <- "MEN"
}
if (toupper(sex)=="WOMEN") {
cohort <- womenCohort
cohort$SEX <- "WOMEN"
}
if (toupper(sex)=="BOTH") {
menCohort$SEX <- "MEN"
womenCohort$SEX <- "WOMEN"
cohort <- dplyr::bind_rows(menCohort,womenCohort)
}
# Subset to those in master file with NUT_92==1
cohort <- cohort[cohort$ID %in% keepID,]
cohort <- cohort[!duplicated(cohort$ID),]
cohort <- dplyr::filter(cohort, ID %in% keepID)
# Final output messages
n <- nrow(cohort)
n.men <- nrow(cohort[cohort$SEX=="MEN",])
n.women <- nrow(cohort[cohort$SEX=="WOMEN",])
message("Cohort has been subset to those in the master file AND NUT_92")
message(paste0("This includes ",n.men," men, and ", n.women," women."))
rm(menFiles,menVars,womenFiles,womenVars,envir=globalenv())
# Frequently the DTINT variables come from multiple files
# This results in DTINT97.x and DTINT97.y
# They are identical, so going to drop one of them
v <- names(cohort)[substr(names(cohort),1,5)=="DTINT"] # all the included DTINT
vx <- v[substr(v,(nchar(v)-1),nchar(v))==".x"] # the .X versions
vy <- v[substr(v,(nchar(v)-1),nchar(v))==".y"] # the .Y version
vz <- sub(".x","",vx) # the final versions
cohort[,vz] <- cohort[,vx]
cohort <- cohort[,!names(cohort) %in% c(vx,vy)]
return(cohort)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.