# want to write a function that will make a binary variable for a certain character value found
#make a fake data set function that saves a dataset to the global environment
makeData <- function() {
id <- seq(from=101, to=150, by=2)
vals12 <- rep(c("high324", "low268"), 6)
diagCode <- c(vals12, "med44", vals12)
diagCode2 <- rep(NA, 25); diagCode2[c(2,13,24)] <- "high324"
myDF <<- data.frame(id=id, diagCode=diagCode, diagCode2=diagCode2)
}
#now make a function that will create three data sets, stored within a single list
makeDatas <- function() {
id <- seq(from=101, to=150, by=2)
vals12 <- rep(c("high324", "low268"), 6)
diagCode <- c(vals12, "med44", vals12)
diagCode2 <- rep(NA, 25); diagCode2[c(2,13,24)] <- "high324"
myDF1 <- data.frame(id=id, diagCode=diagCode, diagCode2=diagCode2)
id <- seq(from=201, to=260, by=2)
vals14 <- rep(c("high324", "low268"), 7)
diagCode <- c(vals14, "med44", "med44", vals14)
diagCode2 <- rep(NA, 30); diagCode2[c(4,9,21, 24)] <- "high324"
myDF2 <- data.frame(id=id, diagCode=diagCode, diagCode2=diagCode2)
id <- seq(from=301, to=340, by=2)
vals10 <- rep(c( "low268", "high324"), 5)
diagCode <- c(vals10, "med44", vals10[1:8], "med44")
diagCode2 <- rep(NA, 20); diagCode2[c(2,9,20)] <- "high324"
myDF3 <- data.frame(id=id, diagCode=diagCode, diagCode2=diagCode2)
myDFlist <<- list(myDF1, myDF2, myDF3)
}
makeData()
makeDatas()
#NONFUNCTION version of the steps needed to do this job
#
#create variables for the input
thing <- "high324" #this is the code we'll hunt for
varNames <- c("diagCode", "diagCode2") #these are the names of the variables we'll hunt through
#get the column indices of the variables of interest
colNum <- which(names(myDF) %in% varNames)
#do the test to match input columns to the given code of interest, return 1 if any match in a row
newBinVar <- list(ifelse(rowSums(myDF[, colNum]== thing, na.rm=TRUE) >0,1,0))
#name the resulting list so that it'll be recognized after adding to the original data frame
names(newBinVar) <- paste0("b_", thing)
#add the new variable onto the end of the orginal data frame
myDF[,ncol(myDF)+length(thing)] <- newBinVar
#show me the result
myDF
#
#
#seems to be working.
#Now make this into a proper function that makes a single binary variable
#
makeBinVar <- function(x,y,df) {
#x is the code of interest. a single code for now
#y is a character vector reflecting the names of df columns to look through for x
#df is the data frame of interest
#first job, get the column indices for each element of y
colNums <- which(names(df) %in% y)
#second job, create the new variable as a list
newBinVar <- list(ifelse(rowSums(df[, colNums]== x, na.rm=TRUE) >0,1,0))
#third job, name the variable something sensible
names(newBinVar) <- paste0("b_", x)
#fourth job, name the
assign(paste0("b_", x), newBinVar, pos = 1)
}
makeData()
makeBinVar(x="high324", y= c("diagCode", "diagCode2"), df=myDF)
#
#
#now try for one that can take in multiple codes, and create multiple indicators
#
makeBinVars <- function(x,y,df, outName="newBinVars") {
#x is a vector of the codes of interest
#y is either a character vector reflecting the names of df columns to look through for x, or a numeric vector of column indices
#df is the data frame of interest
#first job, get the column indices for each element of y
colNums <- if(is.integer(y)) {y} else{which(names(df) %in% y)}
#next, make the character vector x into a list
xlist <- as.list(x)
# job, create the new variables, and store results as a list
newBinVar <- lapply(xlist, function (b) ifelse(rowSums(df[, colNums]== b, na.rm=TRUE) > 0,1,0))
#third job, name the variable something sensible
names(newBinVar) <- paste0("b_", x)
#fourth job, name the
assign(outName, newBinVar, pos = 1)
}
makeData()
codes <- as.character(unique(myDF$diagCode))
lookVars <- names(myDF)[2:3]
makeBinVars(x=codes, y=lookVars, df=myDF, outName = "lookyHere")
myDF <- cbind(myDF, lookyHere)
myDF
#
#
#
#try using the function within lapply to run over multiple dataframes of differing lengths
newDatas <- lapply(dahDatas, function(q) makeBinVars(x=codes, y=lookVars, df=q))
newDatas
#
#also seems to work
#rather than stuff the workings of the lapply within a new function, it might be good to keep that out and explicit
#the set of steps then might be, load the package that will carry this function (which will load data.table, and possibly
#parallel sugar as well), then create a list of dataframes, a character vector of codes to look for, a character vector of variable
#names (or their respective column indices). then run the thing and get an output list of data.tables where the first column should
#be pat_id, or some kind of key, and all following columns are binary variables reflecting the presence or absence of each
#given code within each row and set of columns in each data.table passed in the function.
#then you manually merge to any existing or original data (after inspecting and ensuring quality)
Rprof(tmp <- tempfile())
replicate( lapply(dahDatas, function(q) makeBinVars(x=codes, y=lookVars, df=q)), n=100)
Rprof()
summaryRprof(tmp)
#things I'd like to add
#pass as argument the column for pat_id or key
#error checking
#a multicore module
#see about making this an explicitly data.table based project for possible speed ups
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.