variableCreationFunction.R

# want to write a function that will make a binary variable for a certain character value found


#make a fake data set function that saves a dataset to the global environment
makeData <- function() {
id <- seq(from=101, to=150, by=2)
vals12 <- rep(c("high324", "low268"), 6)
diagCode <- c(vals12, "med44", vals12)
diagCode2 <- rep(NA, 25); diagCode2[c(2,13,24)] <- "high324"
myDF <<- data.frame(id=id, diagCode=diagCode, diagCode2=diagCode2)
                        }
#now make a function that will create three data sets, stored within a single list

makeDatas <- function() {
  id <- seq(from=101, to=150, by=2)
  vals12 <- rep(c("high324", "low268"), 6)
  diagCode <- c(vals12, "med44", vals12)
  diagCode2 <- rep(NA, 25); diagCode2[c(2,13,24)] <- "high324"
  myDF1 <- data.frame(id=id, diagCode=diagCode, diagCode2=diagCode2)

  id <- seq(from=201, to=260, by=2)
  vals14 <- rep(c("high324", "low268"), 7)
  diagCode <- c(vals14, "med44", "med44", vals14)
  diagCode2 <- rep(NA, 30); diagCode2[c(4,9,21, 24)] <- "high324"
  myDF2 <- data.frame(id=id, diagCode=diagCode, diagCode2=diagCode2)

  id <- seq(from=301, to=340, by=2)
  vals10 <- rep(c( "low268", "high324"), 5)
  diagCode <- c(vals10, "med44", vals10[1:8], "med44")
  diagCode2 <- rep(NA, 20); diagCode2[c(2,9,20)] <- "high324"
  myDF3 <- data.frame(id=id, diagCode=diagCode, diagCode2=diagCode2)

  myDFlist <<- list(myDF1, myDF2, myDF3)

}

makeData()
makeDatas()

#NONFUNCTION version of the steps needed to do this job
#
#create variables for the input
thing <- "high324" #this is the code we'll hunt for
varNames <- c("diagCode", "diagCode2") #these are the names of the variables we'll hunt through

#get the column indices of the variables of interest
colNum <- which(names(myDF) %in%  varNames)

#do the test to match input columns to the given code of interest, return 1 if any match in a row
newBinVar <- list(ifelse(rowSums(myDF[, colNum]== thing, na.rm=TRUE) >0,1,0))

#name the resulting list so that it'll be recognized after adding to the original data frame
names(newBinVar) <- paste0("b_", thing)

#add the new variable onto the end of the orginal data frame
myDF[,ncol(myDF)+length(thing)] <-  newBinVar

#show me the result
myDF
#
#
#seems to be working.




#Now make this into a proper function that makes a single binary variable
#
makeBinVar <- function(x,y,df) {
  #x is the code of interest. a single code for now
  #y is a character vector reflecting the names of df columns to look through for x
  #df is the data frame of interest

  #first job, get the column indices for each element of y
  colNums <- which(names(df) %in%  y)
  #second job, create the new variable as a list
  newBinVar <- list(ifelse(rowSums(df[, colNums]== x, na.rm=TRUE) >0,1,0))
  #third job, name the variable something sensible
  names(newBinVar) <-  paste0("b_", x)
  #fourth job, name the
  assign(paste0("b_", x), newBinVar, pos = 1)
}

makeData()
makeBinVar(x="high324", y= c("diagCode", "diagCode2"), df=myDF)
#
#



#now try for one that can take in multiple codes, and create multiple indicators
#
makeBinVars <- function(x,y,df, outName="newBinVars") {
  #x is a vector of the codes of interest
  #y is either a character vector reflecting the names of df columns to look through for x, or a numeric vector of column indices
  #df is the data frame of interest

  #first job, get the column indices for each element of y
  colNums <- if(is.integer(y)) {y} else{which(names(df) %in%  y)}

  #next, make the character vector x into a list
  xlist <- as.list(x)

  # job, create the new variables, and store results as a list
  newBinVar <- lapply(xlist, function (b) ifelse(rowSums(df[, colNums]== b, na.rm=TRUE) > 0,1,0))

  #third job, name the variable something sensible
  names(newBinVar) <-  paste0("b_", x)
  #fourth job, name the
  assign(outName, newBinVar, pos = 1)
}


makeData()
codes <- as.character(unique(myDF$diagCode))
lookVars <- names(myDF)[2:3]
makeBinVars(x=codes, y=lookVars, df=myDF,  outName = "lookyHere")
myDF <- cbind(myDF, lookyHere)
myDF
#
#


#
#try using the function within lapply to run over multiple dataframes of differing lengths
newDatas <- lapply(dahDatas, function(q) makeBinVars(x=codes, y=lookVars, df=q))
newDatas
#
#also seems to work
#rather than stuff the workings of the lapply within a new function, it might be good to keep that out and explicit
#the set of steps then might be, load the package that will carry this function (which will load data.table, and possibly
#parallel sugar as well), then create a list of dataframes, a character vector of codes to look for, a character vector of variable
#names (or their respective column indices). then run the thing and get an output list of data.tables where the first column should
#be pat_id, or some kind of key, and all following columns are binary variables reflecting the presence or absence of each
#given code within each row and set of columns in each data.table passed in the function.
#then you manually merge to any existing or original data (after inspecting and ensuring quality)

Rprof(tmp <- tempfile())
replicate( lapply(dahDatas, function(q) makeBinVars(x=codes, y=lookVars, df=q)), n=100)
Rprof()
summaryRprof(tmp)









#things I'd like to add
#pass as argument the column for pat_id or key
#error checking
#a multicore module
#see about making this an explicitly data.table based project for possible speed ups
etesdahl/claimWranglR documentation built on Dec. 20, 2017, 7:12 a.m.