R/dfanalysis.R

#functions used for some analysis

#' @title perform 2^k analysis on factors
#'
#' @description will report the effects of various factors of two levels
#'
#' @name twokeffect
#'
#' @param df data frame passed containing value vector and factors
#' @param response positive integer index or character name of a column response vector
#' @param factor.colmn vector of positive integer(s) index(es) or character name of column(s)
#' that describes the index of the factors to be used
#' @param interaction Logical, augments if interaction between factor levels will
#' be reported. Default is set to FALSE
#'
#' @return data frame of effects
#'
#' @export
twokeffect <- function(df,response,factor.colmn, interaction=F) {

  response <- index.o.coln(vec = response, v.size = 1,v.name = "response", name.col = colnames(df))
  factor.colmn <- index.o.coln(vec = factor.colmn, v.size = length(factor.colmn),v.name = "factor.colmn", name.col = colnames(df))
  n <- length(factor.colmn)
  if(any(factor.colmn%%1 != 0)) {
    stop("factor.colmn must be an integer vector")
  }
  if(n <2) {
    stop("factor.colmn must be of at least length 2.")
  }
  if(!is.logical(interaction)) {
    stop("interaction must be a logical vector. Assign 'TRUE' to also get interactions between factors")
  }
  colsize <- n + 1
  sub<- vector("integer", colsize)
  sub[1] <- response
  for(i in 2:colsize) {
    sub[i] <- factor.colmn[i-1]
  }
  df <- as.data.frame(df[,sub])
  cnames <- as.character(colnames(df))
  if(!is.numeric(df[,1])) {
    stop("value must be a numeric vector")
  }
  for(j in 2:colsize){
    if(!is.factor(df[,j])) {
     stop(paste(cnames[j], " must be a factor.", sep = ""))
    }
    if(nlevels(df[,j])!=2) {
      stop(paste(cnames[j], " must be a factor of only 2 levels.", sep = ""))
    }
    if((length(unique(df[,j]))!=2)) {
      stop(paste(cnames[j], " must be a factor of only 2 levels.", sep = ""))
    }

  }
  index1 <- 1
  index2 <- 0

  if(interaction==TRUE) {
    effect <- vector("numeric", (choose(n,2)+(n)))
    result <- vector("character", (choose(n,2)+(n)))
    rnames <- vector("character", (choose(n,2)+(n)))
    for(k in 2:(colsize-1)){
      for(l in (3+index2):(colsize)){
        x.fctrs <- as.character(unique(df[,k]))
        y.fctrs <- as.character(unique(df[,l]))
        cmb.mean1 <- mean(df[((df[,k]==x.fctrs[1])&(df[,l]==y.fctrs[1])|(df[,k]==x.fctrs[2])&(df[,l]==y.fctrs[2])),1])
        cmb.mean2 <- mean(df[((df[,k]==x.fctrs[2])&(df[,l]==y.fctrs[1])|(df[,k]==x.fctrs[1])&(df[,l]==y.fctrs[2])),1])
        if(is.nan(cmb.mean1)|is.nan(cmb.mean2)){
          effect[n+index1] <-NA
        } else {
          if(cmb.mean1>cmb.mean2) {
            effect[n+index1] <- cmb.mean1-cmb.mean2
          } else {
            effect[n+index1] <- cmb.mean2-cmb.mean1
          }
        }
        result[n+index1] <- paste("The interaction difference between these four factors is ", round(effect[n+index1]),".", sep = "")
        rnames[n+index1] <- paste(cnames[k], " vs ", cnames[l], sep = "")
        index1 <- index1 + 1
      }
      index2 <- index2 +1
    }
  } else{
    effect <- vector("numeric", n)
    result <- vector("character", n)
    rnames <- vector("character", n)
  }


  for(m in 2:colsize){
    fctrs <- as.character(unique(df[,m]))
    mean1 <- mean(df[df[,m]==fctrs[1],1])
    mean2 <- mean(df[df[,m]==fctrs[2],1])

    if(mean1>mean2) {
      effect[m-1] <- mean1-mean2
      result[m-1] <- paste("On average, switching from ", fctrs[2], " to ", fctrs[1], " increases ", cnames[1], " by ", round(effect[m-1]), ".", sep = "")
      rnames[m-1] <- cnames[m]
    } else {
      effect[m-1] <- mean2-mean1
      result[m-1] <- paste("On average, switching from ", fctrs[1], " to ", fctrs[2], " increases ", cnames[1], " by ", round(effect[m-1]), ".", sep = "")
      rnames[m-1] <- cnames[m]
    }
  }

  vec <- cbind(effect, result)
  vec <- as.data.frame(vec)
  colnames(vec) <- c("Effects","Results")
  vec$Effects <- as.numeric(as.character(vec$Effects))
  rownames(vec) <- rnames
  return(vec)
}

#' @title Reports a balanced sample of a data frame's factors
#'
#' @description will return a balancd data frame subset of the original
#'
#' @name balance.sample
#'
#' @param df data frame passed containing value vector and factors
#' @param response positive integer index or character name of a column response vector
#' @param factor.colmn vector of positive integer(s) index(es) or character name of column(s)
#' that describes the index of the factors to be used
#' @param replace Not yet working
#' @param min.samp.numb Can take a positive integer, only subsets of factors
#' greater than or equal to this number will be reported
#'
#' @return Balanced data frame that is a subset of the first
#'
#' @export
balance.sample <- function(df,response,factor.colmn, replace=FALSE, min.samp.numb = "min") {

  response <- index.o.coln(vec = response, v.size = 1,v.name = "response", name.col = colnames(df))
  factor.colmn <- index.o.coln(vec = factor.colmn, v.size = length(factor.colmn),v.name = "factor.colmn", name.col = colnames(df))
  n <- length(factor.colmn)
  if(any(factor.colmn%%1 != 0)) {
    stop("factor.colmn must be an integer vector")
  }
  # if(n <2) {
  #   stop("factor.colmn must be of at least length 2.")
  # }
  if(!is.logical(replace)) {
    stop("replace must be a logical vector.")
  }
  colsize <- n + 1
  sub<- vector("integer", colsize)
  sub[1] <- response
  for(i in 2:colsize) {
    sub[i] <- factor.colmn[i-1]
  }
  df <- df[,sub]
  #df$id <- interaction(df[,-1])
  df <- as.data.frame(df)
  cnames <- as.character(colnames(df))
  if(!is.numeric(df[,1])) {
    stop("value must be a numeric vector")
  }
  for(j in 2:colsize){
    if(!is.factor(df[,j])) {
      stop(paste(cnames[j], " must be a factor.", sep = ""))
    }
    if(nlevels(df[,j])<2) {
      stop(paste(cnames[j], " must be a factor of at least 2 levels.", sep = ""))
    }
    if((length(unique(df[,j]))<2)) {
      stop(paste(cnames[j], " must be a factor of at least 2 levels.", sep = ""))
    }

  }

  tab <- table(df[,-1])
  if(!any(min.samp.numb %in% c("max","min"))) {
    if((min.samp.numb%%1)!=0){
      stop("samp.numb must be either max or min. or an integer")
    } else{
      tab.temp <-  as.data.frame(tab)
      a <- ncol(tab.temp)
      tab.temp <- tab.temp[tab.temp[,a]>=min.samp.numb,]
      tab.temp$id <- interaction(tab.temp[,-a])
      temp.df <- df
      temp.df$id <- interaction(temp.df[,-1])

        temp.df <-temp.df[temp.df[,ncol(temp.df)] %in% tab.temp[,ncol(tab.temp)],]

      temp.df <- droplevels(temp.df)
      df <- temp.df[,-ncol(temp.df)]
      sample.n <- min(tab.temp[,a])
    }
  } else{
    if(min.samp.numb=="min") {
      sample.n <- min(tab)
    }
  }

  if(sample.n==0){
    stop("Some combination of factors passed has no samples to pull from. \n Consider passing integer to min.samp.numb.")
  }

  level.comp <-1
  for(a in 2:colsize){
    level.n.temp <- nlevels(df[,a])*level.comp
    level.comp <- level.n.temp
  }
  #alpha <- level.comp*sample.n

  factor.id.df <- as.data.frame(unique(df[,-1]))
  #factor.id.df$id <- interaction(factor.id.df)
  #browser()
  vec <- c()
  for(i in 1:nrow(factor.id.df)){
    temp.df <- df
    for(j in 1:ncol(factor.id.df)){
      temp.df <- temp.df[temp.df[,j+1]==factor.id.df[i,j],]
    }
    if(replace == T){
      v <- sample(x = as.numeric(rownames(temp.df)),size = sample.n, replace = replace)
    } else {
      v <- sample(x = as.numeric(rownames(temp.df)),size = sample.n)
    }

    v <- temp.df[as.numeric(rownames(temp.df))%in% v,] #max and replace not working as intended because this will not repeat repeated values
    vec <- rbind(vec, v)
  }
  vec <- as.data.frame(vec)
  return(vec)
}

#' @title generate combinations
#'
#' @description Generate a dataframe of the 2 group combinations between factors
#'
#' @name comb.comp
#'
#' @param vector factor vector of at least 2
#' @param rtrn.all logical, controlled internally
#' @return dataframe
comb.comp <- function(vector, rtrn.all = FALSE){
  if(!is.factor(vector)){
    stop("ERROR: at least one column is not a vector of factors")
  }
  n <- nlevels(vector)
  if(n<2) {
    stop("Number of factor levels should at least be 2")
  }
  lvl.names <- levels(vector)
  numb.comb <- choose(n,2)
  rep.seq <- seq(from = n-1, to = 0, by = -1)
  factor1 <- rep(lvl.names, rep.seq)
  factor2 <- rep("factor", length(factor1))

  j <-1
  index <- 0
  for(i in 1:length(factor1)){
    if(j==n){
      index <- index +1
      j <-1 +index
    }
    j <- j + 1
    factor2[i]<- lvl.names[j]
  }

  comp.f <- cbind(factor1,factor2)
  comp.f <- as.data.frame(comp.f)
  colnames(comp.f) <- c("f1","f2")

  if(rtrn.all==TRUE){
  comp.f <- comp.f %>%
    mutate(comb = paste(f1,".v.",f2, sep = ""))
  }

  return(comp.f)
}


#' @title Wilcoxon tests through many factors
#'
#' @description Calculate the wilcoxon test between several factors
#'
#' @name combo.wilcox
#'
#' @param df data frame passed containing value vector and factors
#' @param response positive integer index or character name of a column response vector
#' @param factor.colmn vector of positive integer(s) index(es) or character name of column(s)
#' that describes the index of the factors to be used. Number of factors passed changes the
#' comparison. One factor vector passed will return wilcox test values between levels. Multiple
#' factor vectors passed will return wilcon test values between all combination of interactions
#' of factors.
#' @param p.v.correct Logical assignment. Default set to FALSE. Will return logical vector
#' along with test results. TRUE indicates P.value is less than adjusted P.value, FALSE
#' indicates P.value is greated than adjusted P.value. Adjusted P.value is calculated using
#' the bonferroni correction.
#' @param single.factor.comp logical assignment. Default set to FALSE. Set to TRUE when you
#' user wants compare between two levels of the same factor while all other factors are
#' held constant. When set to TRUE, factor vector must be at least length 2.
#'
#' @return dataframe of wilcoxn test P.value results
#'
#' @export
combo.wilcox <- function(df, response, factor, p.v.correct= FALSE, single.factor.comp = FALSE) {

  #browser()
  name.col <- colnames(df)
  response <- index.o.coln(vec = response, v.size = 1, v.name = "response", name.col = name.col)
  factor <- index.o.coln(vec = factor, v.size = length(factor), v.name = "factor", name.col = name.col)
  n <- length(factor)
  if(!is.logical(p.v.correct)) {
    stop("interaction must be a logical vector. Assign 'TRUE' to also get interactions between factors")
  }
  if(n >= 2) { #if n is less than 2, function should assume all factor comparisons
               #that want to be made are in this single vector and skip this edit.
               #if greater than 2, colapse into 1 factor via interaction
               # will also allow for single factor comparisons
    colsize <- n + 1
    sub<- vector("integer", colsize) #hold the index of the data frame
    sub[1] <- response
    for(i in 2:colsize) {
      sub[i] <- factor[i-1]
    }
    df.work <- as.data.frame(df[,sub])
    cnames <- as.character(colnames(df.work))
    for(j in 2:colsize){
      if(!is.factor(df.work[,j])) {
        stop(paste(cnames[j], " must be a factor.", sep = ""))
      }
      if(nlevels(df.work[,j])<2) {
        stop(paste(cnames[j], " must be a factor of at least 2 levels.", sep = ""))
      }
      if((length(unique(df.work[,j]))<2)) {
        stop(paste(cnames[j], " must be a factor of at least 2 levels.", sep = ""))
      }

    }
    #add single.factor.comp variable here...
    #build a for loop to loop over factor vectors passed, and then through factors
    #build either the df that is needed, or just a vector of desired interactions and subset from df later.

    df.work <- df.work %>%
      mutate(int=interaction(df.work[,-1]))
    df <- df.work

    if(single.factor.comp==TRUE){

      combo.vec <- comb.comp(vector = df[,2])     #building combo.vec
      combo.vec <- cbind(rep(2,nrow(combo.vec)), combo.vec)
      colnames(combo.vec) <- c("column.index","f1","f2")
      for(m in 3:colsize){
        combo.vec.work <- comb.comp(vector = df[,m])
        combo.vec.work <- cbind(rep(m,nrow(combo.vec.work)), combo.vec.work)
        colnames(combo.vec.work) <- c("column.index","f1","f2")
        combo.vec <- rbind(combo.vec, combo.vec.work)
      }

      combo.vec <- as.data.frame(combo.vec)

      vec <- c()
      for(o in 1:nrow(combo.vec)){
      df.temp1 <- df.work[df.work[,combo.vec[o,1]] == as.character(combo.vec[o,2]) | df.work[,combo.vec[o,1]] == as.character(combo.vec[o,3]),]
      df.temp1$temp <- interaction(df.temp1[,c(-1,-(combo.vec[o,1]),-(colsize+1))])
      temp.names <- levels(df.temp1$temp)

        df.z <- c()
        for(z in 1:nlevels(df.temp1$temp)){
          df.z.work <- df.temp1[df.temp1$temp == temp.names[z],]
          #df.z <- rbind(df.z,df.z.work)
          df.z.work <- droplevels(df.z.work)
          wil.work <- wilcox.test(df.z.work[,1]~df.z.work[,combo.vec[o,1]])
          new.df.z <- cbind(as.character(paste(cnames[combo.vec[o,1]])),
                            as.character(paste(levels(df.z.work[,combo.vec[o,1]])[1],
                                               ".v.",
                                               levels(df.z.work[,combo.vec[o,1]])[2],sep = "")),
                                        as.character(df.z.work[1,ncol(df.z.work)]),
                                        wil.work[[3]])
          colnames(new.df.z) <- c("Tested Factor","Tested Levels", "Constant Levels", "P.value")
          df.z <- rbind(df.z,new.df.z)
        }
        vec <-  rbind(vec,df.z)
      }
      vec <- as.data.frame(vec)
      vec$P.value <- as.numeric(as.character(vec$P.value))
      if(p.v.correct==TRUE){
        adjusted <- (0.05)/(nrow(vec))
        vec <- vec %>%
          mutate(Sig.=ifelse(P.value<adjusted, TRUE, FALSE))
      }
      return(vec)
    }else{
      df <- as.data.frame(df.work[,c(1,ncol(df.work))])
    }
  } else{
    df <- as.data.frame(df[,c(response,factor)])
  }
  if(!is.numeric(df[,1])) {
    stop("response must be a numeric vector")
  }
  df <- droplevels(df)
  combo.vec <- comb.comp(vector = df[,2], rtrn.all = TRUE)
  ncomp.made <- nrow(combo.vec)
  vec <- vector("numeric", ncomp.made)
  names.c <-  vector("character", ncomp.made)
  for(k in 1:ncomp.made){
    df.temp <- df[df[,2]==as.character(combo.vec[k,1])|df[,2]==as.character(combo.vec[k,2]),]
    df.temp <- droplevels(df.temp)
    wil.work <- wilcox.test(df.temp[,1]~df.temp[,2],data=df.temp)
    vec[k] <- wil.work[[3]]
    names.c[k] <- combo.vec[k,3]
  }

  vec <- cbind(names.c,vec)
  vec <- as.data.frame(vec)
  colnames(vec) <- c("Tested Levels","P.value")
  vec$P.value <- as.numeric(as.character(vec$P.value))
  if(p.v.correct==TRUE){
    adjusted <- (0.05)/(ncomp.made)
    vec <- vec %>%
      mutate(Sig.=ifelse(P.value<adjusted, TRUE, FALSE))
  }
  return(vec)
}

#' @title Three factor contrast anova
#'
#' @description Generate the anova test statistics for three factors
#'
#' @name contrast.aov3
#'
#' @param df data frame passed containing value vector and factors
#' @param response positive integer that describes the index of the response vector
#' @param factor3 positive integer vector of length three that describes the index
#' of three factor vectors within the passed df.
#'
#' @return return dataframe of anova test statistic
#'
#' @export
contrast.aov3 <- function(df, response, factor3){

  response <- index.o.coln(vec = response, v.size = 1, v.name = "response", name.col = name.col)
  factor3 <- index.o.coln(vec = factor3, v.size = 3, v.name = "factor3", name.col = name.col)
  n <- length(factor3)
  if(any(factor3%%1 != 0)) {
    stop("factor3 must be an integer vector")
  }
  if(n!=3) {
    stop("factor3 must be of length 3.")
  }
  colsize <- n + 1
  sub<- vector("integer", colsize)
  sub[1] <- response
  for(i in 2:colsize) {
    sub[i] <- factor3[i-1]
  }
  df <- as.data.frame(df[,sub])
  cnames <- as.character(colnames(df))
  if(!is.numeric(df[,1])) {
    stop("value must be a numeric vector")
  }
  for(j in 2:colsize){
    if(!is.factor(df[,j])) {
      stop(paste(cnames[j], " must be a factor.", sep = ""))
    }
    if(nlevels(df[,j])<2) {
      stop(paste(cnames[j], " must be a factor of at least 2 levels.", sep = ""))
    }
    if((length(unique(df[,j]))<2)) {
      stop(paste(cnames[j], " must be a factor of at least 2 levels.", sep = ""))
    }

  }

  test <- df
  cnames <- colnames(test)[-1]
  .t1 <- summary(aov(test[,1]~test[,2]+test[,3]+test[,4]))[[1]]
  .t1 <- .t1 %>%
    mutate(names=rownames(.t1)) %>%
    mutate(id="2,3,4")
  .t2 <- summary(aov(test[,1]~test[,2]+test[,4]+test[,3]))[[1]]
  .t2 <- .t2 %>%
    mutate(names=rownames(.t2))%>%
    mutate(id="2,4,3")
  .t3 <- summary(aov(test[,1]~test[,3]+test[,2]+test[,4]))[[1]]
  .t3 <- .t3 %>%
    mutate(names=rownames(.t3))%>%
    mutate(id="3,2,4")
  .t4 <- summary(aov(test[,1]~test[,3]+test[,4]+test[,2]))[[1]]
  .t4 <- .t4 %>%
    mutate(names=rownames(.t4))%>%
    mutate(id="3,4,2")
  .t5 <- summary(aov(test[,1]~test[,4]+test[,2]+test[,3]))[[1]]
  .t5 <- .t5 %>%
    mutate(names=rownames(.t5))%>%
    mutate(id="4,2,3")
  .t6 <- summary(aov(test[,1]~test[,4]+test[,3]+test[,2]))[[1]]
  .t6 <- .t6 %>%
    mutate(names=rownames(.t6))%>%
    mutate(id="4,3,2")
  test <- rbind(.t1,.t2,.t3,.t4,.t5,.t6)
  test <- dcast(data=test, names~id,value.var = "Pr(>F)")
  rownames(test) <- c("Residuals", cnames)
  return(test)
}

#' @title standard hclust function
#'
#' @description internal clustering function to automatically
#' reorder factors
#'
#' @name clust
#'
#' @param df data frame passed
#' @param col.index index or character name of columns in df that will be clustered
#' @param value.var value by which clustering occures
#'
#' @return df but with reordered factors
clust <- function(df, formula, value.var, fun.aggregate = NULL, vec.col) {

  #col.index <- index.o.coln(vec=col.index, v.size = 2, v.name = "col.index", name.col = colnames(df))
  df.wide <- dcast(df, formula = formula, value.var = value.var, fun.aggregate = fun.aggregate)
  row.names(df.wide) <- df.wide[,1]
  df.wide <- df.wide[,-1]
  a <- dist((df.wide))
  hc <- hclust(a)

  .test <- as.data.frame((df.wide))
  .test <- .test %>%
    mutate(names = rownames(.test))
  c <- .test[hc$order,]$names

  eval(parse(text = paste("df$",vec.col," <- factor(df$", vec.col,", levels = c)", sep = "")))
  #df[,col.index[1]] <- factor(df[,col.index[1]], levels = c)
  return(df)
}

#' @title euclidian clustering function
#'
#' @description reorder factors passed to y.by.x via their respective
#' clusters
#'
#' @name distance.cluster
#'
#' @param df data frame passed containing both factor columns to reorder
#' and numeric column to cluster by
#' @param y.by.x Index of the columns or character column
#' names to reorder
#' @param value.var Character column name by which clustering will be
#' calculated
#' @param which.clut Defines which column is reordered. Default set to both.
#' Set to "y" to only reorder y.by.x[1], set to "x" to only reorder y.by.x[2]
#'
#' @return returns df but with reordered factors
#'
#' @export
distance.cluster <- function(df, formula, value.var,which.clust = "both", fun.aggregate = mean) {

  df <- droplevels(df)
  #name.col <- colnames(df)
  #index <- index.o.coln(vec = y.by.x, v.size = 2, v.name = "y.by.x", name.col)

  if(!any(which.clust==(c("both","y","x")))) {
    stop("specify which factor you wish to cluster by. Default \"both\", or \"y\" or \"x\". ")
  }

  form <- deparse(substitute(formula))
  if(any(str_detect(form, "\\+"))) {
    stop("ERROR: \"+\" character detected in column names. please pass single vectors on either side of ~")
  }
    form <- gsub(pattern = " ", replacement = "", form)
    form <- gsub(pattern = "formula=", replacement = "", form)
    form <- str_split(form, "~")
    form.y <- form[[1]][1]
    form.x <- form[[1]][2]


  if(which.clust=="both"|which.clust=="y") {
    df <- clust(df = df,
                formula = formula,
                value.var = value.var,
                fun.aggregate = fun.aggregate,
                vec.col = form.y)
  }
  if(which.clust=="both"|which.clust=="x") {
    df <- clust(df = df,
                formula = paste(form.x,"~",form.y,sep = ""),
                value.var = value.var,
                fun.aggregate = fun.aggregate,
                vec.col = form.x)
  }

  return(df)

}



#' @title linear model testing
#'
#' @description report r-squared values of subsets
#'
#' @name lm.test
#'
#' @param df data frame
#' @param factors factor column names or indexs to subset by
#' @param formula formula to use in lm function
#'
#' @return vector of r-squared values
#'
#' @export
lm.test <- function(df, factors, formula) {

  n <- length(factors)
  colnam <- colnames(df)
  index.f <- index.o.coln(vec = factors, v.size = n, v.name = "factors", name.col = colnam)
  #index.v <- index.o.coln(vec = y.by.x, v.size = 2, v.name = "factors", name.col = colnam)


  df.work <- df
  df.work$group.id <- interaction(df[,index.f])
  #   mutate(group.id=interaction(df[,index.f]))
  df.work$group.id <- as.factor(as.character(df.work$group.id))
  df.work <- droplevels(df.work)

  ngroups <- nlevels(df.work$group.id)
  groups <- levels(df.work$group.id)

  vec.r <- vector(mode = "numeric",length = ngroups)
  vec.s <- vector(mode = "numeric",length = ngroups)
  vec.i <- vector(mode = "numeric",length = ngroups)
  for(i in 1:ngroups){
    df.test <- subset(df.work, group.id %in% groups[i])
    a <- summary(lm(formula = formula, data = df.test))
    #print(a["coefficients"])

    vec.r[i] <- a$r.squared
    vec.s[i] <- a$coefficients[2,1]
    vec.i[i] <- a$coefficients[1,1]
  }

  vec <- as.data.frame(cbind(vec.r,vec.s,vec.i))

  rownames(vec) <- groups
  colnames(vec) <- c("r.squared","slope","intercept")



  return(vec)
}
jtlandis/justinmisc documentation built on May 25, 2019, 8:18 a.m.