R/helpfulFunctions.R

Defines functions logRatio makeHeader transformDat getCond getName strReverse cl clr clrInv cNorm anovaPrep makeMat alrInv partShift summSimp contSimp takeMax makePredDat

#Code for small silent functions used in the data manipulation


#Function that converts intensities into log ratios
#also returns the minimum log intensity for each pair
#mat should be a matrix of intensities
#missing values and values < 1 will be set to 1.
logRatio <- function(mat, ref_index){
  #mat[is.na(mat)] <- 1  normalization now done prior to this routine
  #mat[mat < 1] <- 1
  lMat <- log2(mat)
  denom <- rowMeans(lMat[ , ref_index, drop = F])
  numCols <- lMat[ , -ref_index, drop = F]

  p_ <- ncol(numCols)
  denomMat <- matrix(rep(denom, p_), ncol = p_)
  mintensity <- 2^(denomMat) + 2^(numCols) #for making pairwise ssn
  lrMat <- numCols - denom

  list(lrMat, mintensity)
}


#Function that makes a single groupID from the first 4 rows of df
makeHeader <- function(df, index){

  header <- paste("lr", colnames(df)[index], df[1, index],
                  df[2, index], df[3, index], sep = "qqqq")
  header
}

#function that takes a dataframe and returns a dataframe with unique ids
transformDat <- function(df, plexNumber, normalize, simpleMod){
  #convert factors to strings
  facIndex <- which(sapply(df, is.factor))
  df[facIndex] <- lapply(df[facIndex], as.character)

  #Zero out unused columns
  bioCol <- df$bioID[1]
  if(bioCol == 0){
    df$bioID[] <- 0
  }

  covarCol <- df$Covariate[1]
  if(covarCol == 0){
    df$Covariate[] <- 0
  }
  varCol <- df$varCat[1]
  if(varCol == 0){
    df$varCat[] <- 0
  }


  # if(df[2, 1] == 0){
  #   df[2, ][] <- 0
  # }
  #if(df[3, 1] == 0){
  #  df[3, ][] <- 0
  #}

  n_ <- nrow(df)

  jDat <- df[4:(n_), ]
  df[4:n_, ] <- jDat[order(jDat$bioID), ]

  value_index <- grep("tag", colnames(df))

  nMat <- df[4:(n_), value_index]
  #normalize the df
  if(normalize == TRUE){
  normed <- by(as.matrix(df[4:(n_), value_index]), df$bioID[4:n_], cNorm)
  nMat <- as.matrix(do.call(cbind, normed))
  }else{
    nMat[nMat == 0] <- 1
  }

  #new paradigm.  Force the population model
  if(sum(df[2, value_index]) == 0){
    startP <- length(value_index) * (plexNumber - 1) + 1
    endP <- startP + length(value_index) - 1
    df[2, value_index] <- c(startP:endP)
  }

  condBio <- paste(df[1, value_index], df[2, value_index])
  #create set of columns to be averaged into one reference
  #if(simpleMod){
     ref_index <- which(df[1, value_index] == as.numeric(df[1, value_index][1]))
  #}else{
     #ref_index <- which(condBio == condBio[1])
  #}
  normal_index <- setdiff(1:length(value_index), ref_index)

  lRes <- logRatio(nMat, ref_index)
  lrMat <- lRes[[1]]
  minTensities <- lRes[[2]]
  header <- makeHeader(df[ , value_index], normal_index)
  colnames(lrMat) <- header

  newDf1 <- data.frame(Gene = df[4:n_, ]$Gene,
                      Protein = df[4:n_, ]$Protein,
                      Peptide = df[4:n_, ]$Peptide, bioID = df[4:n_, ]$bioID,
                      Covariate = df[4:n_, ]$Covariate,
                      varCat = df[4:n_, ]$varCat,
                      lrMat, stringsAsFactors = F)

  newDf2 <- data.frame(Gene = df[4:n_, ]$Gene,
                       Protein = df[4:n_, ]$Protein,
                       Peptide = df[4:n_, ]$Peptide, bioID = df[4:n_, ]$bioID,
                       Covariate = df[4:n_, ]$Covariate,
                       varCat = df[4:n_, ]$varCat,
                       minTensities, stringsAsFactors = F)

  melted1 <- reshape2::melt(newDf1, id.vars = c("Gene", "Protein", "Peptide", "bioID",
                                              "Covariate", "varCat"),
                 value.name = "lr", variable.name = "header")


  melted2 <- reshape2::melt(newDf2, id.vars = c("Gene", "Protein", "Peptide", "bioID",
                                                "Covariate", "varCat"),
                            value.name = "pairMin", variable.name = "header")


  melted <- data.frame(melted1, pairMin = melted2$pairMin)

  separated <- stringr::str_split_fixed(as.character(melted$header), "qqqq",5)

  #Merge tag with tenplex info

  tag_plex <- paste(separated[ , 2], melted$bioID, plexNumber, sep = "_")

  #figure out if we are using a bioid from the header or from a column

  if(bioCol == 1){
    bioID <- paste(melted$Protein, separated[ , 3], melted$bioID,  sep = "_")
  }else{bioID <- paste(melted$Protein, separated[ , 3], separated[, 4], sep = "_")}

  finalDat <- data.frame(gene = melted$Gene,
                  protein = melted$Protein,
                  condID = paste(melted$Protein, separated[, 3],
                                        sep = "_"), bioID,
                  ptmID = paste(melted$Protein, separated[, 3],
                                       melted$Peptide, separated[ , 5],
                                       sep = "_"),
                  ptm = separated[ , 5], tag_plex,
                  covariate = melted$Covariate,
                  varCat = melted$varCat,
                  pairMin = melted$pairMin,
                  lr = melted$lr, stringsAsFactors = F)
  finalDat <- finalDat[order(finalDat$tag_plex, finalDat$condID,
                             finalDat$bioID, finalDat$ptm, finalDat$ptmID), ]

  #do normalization at a previous step
  #normalize the non-ptm data by tag

  #normed <- unlist(by(melted[finalDat$ptm == 0, ]$lr,
   #              finalDat[finalDat$ptm == 0, ]$tag_plex, function(x)
    #               x - mean(x)))
  #melted$lr[finalDat$ptm == 0] <- normed
  #finalDat$lr <- melted$lr


  finalDat
}#end function transformDat

#function for extracting the condition number from labels
getCond <- function(strVec, bio = FALSE){

  chunks <- strsplit(strVec, "_")

  if(bio == FALSE){
    condNumber <- unlist(lapply(chunks, function(x) x[2]))
  }

  if(bio == TRUE){
    condNumber <- unlist(lapply(chunks, function(x) x[3]))
  }

  as.integer(condNumber)
}

getName <- function(strVec){
  ePosition <- regexpr("_", strVec)
  condName <- substring(strVec, 1, ePosition - 1)

  condName
}

#function to reverse strings
strReverse <- function(x){
  sapply(lapply(strsplit(x, NULL), rev), paste, collapse="")
}

#closure function
cl <- function(vec){
  vec/sum(vec)
}

#centered log ratio
clr <- function(vec){
  gm <- exp(mean(log(vec)))
  log(vec/gm)
}

clrInv <- function(vec){
  cl(exp(vec))
}

#Function to normalize the data

cNorm <- function(mat, subIndex = NULL){
  #takes a compostion matrix and subtracts out the mean
  mat[mat < 1] <- 1
  clrMat <- t(apply(mat, 1, clr))

  if(is.null(subIndex)){
    cMean <- clrInv(apply(clrMat, 2, mean))
  }else{
    cMean <- clrInv(apply(clrMat[subIndex, ], 2, mean))
  }

  normed <- t(apply(mat, 1, function(x) cl(x/cMean)))
  normed
}


#Function to prepare data for an ANOVA.

anovaPrep <- function(df){
  #Zero out unused column
  bioCol <- df$bioID[1]
  if(bioCol == 0){
    df$bioID[] <- 0
  }
  n_ <- nrow(df)

  jDat <- df[4:(n_), ]
  df[4:n_, ] <- jDat[order(jDat$bioID), ]

  value_index <- grep("tag", colnames(df))

header <- makeHeader(df[ , value_index], normal_index)
  colnames(lrMat) <- header

  newDf1 <- data.frame(Protein = df[4:n_, ]$Protein,
                       Peptide = df[4:n_, ]$Peptide, bioID = df[4:n_, ]$bioID,
                       Covariate = df[4:n_, ]$Covariate,
                       varCat = df[4:n_, ]$varCat,
                       lrMat, stringsAsFactors = F)

}

#function to make a matrix of relevant samples
makeMat <- function(vec, model, bio = FALSE, useCov = FALSE, avgCond = FALSE){
  if(avgCond){
    extracted <- lapply(vec, function(x)
      rstan::extract(model,
                   pars=paste("avgCond[", x, "]", sep = ""))$avgCond)
    extractMat <- do.call(cbind, extracted)
    return(extractMat)
  }

  if(bio == FALSE){
    if(useCov == FALSE){
      extracted <- lapply(vec, function(x)
      rstan::extract(model,
                   pars=paste("beta[", x, "]", sep = ""))$beta)
    }else{
      extracted <- lapply(vec, function(x)
        rstan::extract(model,
                       pars=paste("betaP_c[", x, "]", sep = ""))$betaP_c)
    }
  }else{
    if(useCov == FALSE){
      extracted <- lapply(vec, function(x)
        rstan::extract(model,
                     pars=paste("beta_b[", x, "]", sep = ""))$beta_b)
    }else{
      extracted <- lapply(vec, function(x)
        rstan::extract(model,
                       pars=paste("betaP_b[", x, "]", sep = ""))$betaP_b)
    }
  }
  extractMat <- do.call(cbind, extracted)
  extractMat
}

#function to apply alr inverse to a matrix
alrInv <- function(mat, refCol = NULL, justSimp = FALSE){

if(length(refCol) == 0){
  lrMat <- mat
}else{
  lrMat <- mat[ , -refCol] - mat[ , refCol]
}

  lrMeans <- apply(lrMat, 2, mean)
  lrVar <- apply(lrMat, 2, var)
  lrInt <- t(apply(lrMat, 2, quantile, probs = c(.025, .975, .1, .9)))
  colnames(lrInt) <- c("LL95", "UL95", "LL80", "UL80")

  lrDf <- data.frame(Estimate = lrMeans, Variance = lrVar, lrInt)

  zeroMat <- cbind(rep(0, nrow(lrMat)), lrMat)
  expMat <- 2^zeroMat
  simplex <- t(apply(expMat, 1, function(x) x/sum(x)))
  simpMeans <- apply(simplex, 2, mean)
  simpVar <- apply(simplex, 2, var)
  simpInt <- t(apply(simplex, 2, quantile, probs = c(.025, .975, .1, .9)))
  colnames(simpInt) <- c("LL95", "UL95", "LL80", "UL80")

  df <- data.frame(Estimate = simpMeans, Variance = simpVar, simpInt)

  if(justSimp){
    res <- simplex
  }else{res <- list(df, lrDf)}

  res
}

#Function to shift indices above a certain reference point
partShift <- function(refPos, vec){
  belowI <- which(vec < refPos)
  aboveI <- which(vec > refPos)

  shifted <- c(vec[belowI], vec[aboveI] - 1)
  shifted
}


#Summarize the posterior of a simplex
summSimp <- function(simp, conds, obsConds, refCond, pName){

  refPos <- which(conds == refCond) #position of reference in complete list
  suCond <- conds[-refPos]

  lrs <- matrix(NA, nrow = 1, ncol = length(suCond))
  colnames(lrs) <- paste0("Est_Fc", suCond)

  lrVars <- matrix(NA, nrow = 1, ncol = length(suCond))
  colnames(lrVars) <- paste0("Var_", suCond)

  lrLL95 <- matrix(NA, nrow = 1, ncol = length(suCond))
  colnames(lrLL95) <- paste0("LL95_", suCond)

  lrUL95 <- matrix(NA, nrow = 1, ncol = length(suCond))
  colnames(lrUL95) <- paste0("UL95_", suCond)

  #exit if reference was not observed
  if((refCond %in% obsConds == FALSE)){
    return(data.frame(Protein = pName, lrs, lrVars, lrLL95, lrUL95,
                                    stringsAsFactors = F))
  }

  oRefPos <- which(obsConds == refCond)  # position of reference in observed vector
  lrMat <- log2(simp[ , -(oRefPos + 1), drop = FALSE]) - log2(simp[ , (oRefPos + 1)]) #add 1 since simplex has an extra first column

  colConds <- c(1, obsConds[-oRefPos]) #mapping from lrMat to suCond

  obsPos <- match(colConds, suCond)

  lrs[obsPos] <- apply(lrMat, 2, mean)

  lrVars[obsPos] <- apply(lrMat, 2, var)

  lrLL95[obsPos] <- apply(lrMat, 2, function(x) quantile(x, probs = .025))

  lrUL95[obsPos] <- apply(lrMat, 2, function(x) quantile(x, probs = .975))


  avgLrTab <- data.frame(Protein = pName, lrs, lrVars, lrLL95, lrUL95,
                         stringsAsFactors = F)

  avgLrTab

}


#Summarize a contrast
contSimp <- function(simp, conds, obsConds, cont, pName){

  #Figure out if the contrast can be made
  obsBool <- (conds %in% c(1, obsConds))
  validCont <- identical(cont, cont*obsBool)
  if(!validCont){
    return(data.frame(Protein = pName, Cont_Est = NA, Cont_Var = NA,
                      Cont_LL95 = NA, Cont_UL95 = NA,
                      stringsAsFactors = F))
  }

  #map from full conditions to observed
  newCont <- cont[match(c(1, obsConds), conds)]
  contrastChain <- log2(simp) %*% newCont

  Cont_Est = mean(contrastChain)
  Cont_Var = var(contrastChain)
  Cont_LL95 = quantile(contrastChain, probs = .025)
  Cont_UL95 = quantile(contrastChain, probs = .975)

  avgLrTab <- data.frame(Protein = pName, Cont_Est, Cont_Var, Cont_LL95,
                         Cont_UL95, stringsAsFactors = F)

  avgLrTab

}


#aggregate repeat peptides to maximum
#expects standard column format of a single plex with no header
takeMax <- function(df){
  df <- df[order(df$Protein, df$Peptide), ]
  pepId <- paste(df$Protein, df$Peptide)
  rowN <- 1:length(pepId)
  uPepId <- unique(pepId)
  valCols <- grep("tag", colnames(df))

  ssn <- apply(df[ , valCols], 1, sum)
  tempDf <- cbind(df, ssn, pepId, rowN)

  pepCount <- table(pepId)

  manyPeps <- attr(pepCount, "dimnames")[[1]][which(pepCount > 1)]
  manyIndex <- which(pepId %in% manyPeps)
  oneIndex <- setdiff(rowN, manyIndex)

  manyDf <- tempDf[manyIndex, ]
  uMult <- unique(manyDf$pepId)

  pepList <- by(manyDf, factor(manyDf$pepId), function(x) x[ , ])
  maxIndex <- sapply(pepList, function(x) x[which.max(x$ssn), "rowN"])

  newDf <- df[c(oneIndex, maxIndex), ]
  newDf <- newDf[order(newDf$Protein, newDf$Peptide), ]

  newDf
}

#function to generate a dataframe for model predictions
makePredDat <- function(prot, timeVec, category, header, timeDegree, catRefs,
                        sinusoid, sinVec, cosVec){
  #header is a vector of column names found in the data
  #timeDegree = 1, 2, or 3, specifying a linear, quadratic or cubic model
  #catRefs is a vector containing the reference levels of each baseline categorical covariate

  if(sinusoid == FALSE){
    if(timeDegree == 1){
      newDf <- data.frame(Protein = prot, Time = timeVec, Category = category)
    }
    if(timeDegree == 2){
      newDf <- data.frame(Protein = prot, Time = timeVec, Time2 = timeVec^2,
                                                               Category = category)
    }
    if(timeDegree == 3){
      newDf <- data.frame(Protein = prot, Time = timeVec, Time2 = timeVec^2,
                                                               Time3 = timeVec^3, Category = category)
    }
  }else{
    #newDf <- data.frame(Protein = prot, Sin = sinVec, Cos = cosVec, Category = category)
    if(timeDegree == 1){
      newDf <- data.frame(Protein = prot, Time = timeVec, Sin = sinVec, Cos = cosVec, Category = category)    
    }    
    if(timeDegree == 2){      
      newDf <- data.frame(Protein = prot, Time = timeVec, Time2 = timeVec^2, Sin = sinVec, Cos = cosVec, Category = category)    
    }    
    if(timeDegree == 3){      
      newDf <- data.frame(Protein = prot, Time = timeVec, Time2 = timeVec^2,  Time3 = timeVec^3, Sin = sinVec, Cos = cosVec, Category = category)    
    }
  }
  #now add baseline covariates
  contIndex <- grep("Continuous_Covariate", header)
  catCovarIndex <- grep("Categorical_Covariate", header)

  if(length(contIndex) > 0){
    tempVars <- matrix(0, nrow = nrow(newDf), ncol = length(contIndex))
    colnames(tempVars) <- header[contIndex]
    newDf <- cbind(newDf, tempVars)
  }

  if(length(catCovarIndex) > 0){
    catCols <- list()
    for(i in 1:length(catCovarIndex)){
      refVec <- data.frame(TempName = rep(catRefs[i], nrow(newDf)))
      colnames(refVec) <- header[catCovarIndex[i]]
      catCols[[i]] <- refVec
    }

    newDf <- data.frame(newDf, do.call(cbind, catCols))
  }

  newDf
}
ColtoCaro/compMS documentation built on March 13, 2020, 10:11 a.m.