R/retrieveDataFromESets_TR.R
In TPP: Analyze thermal proteome profiling (TPP) experiments

retrieveDataFromESets_TR <- function(data){
  ## Merge row annotations and fold changes from different expressionSets
  ## containing TR data
  
  ## Initialize variables to prevent "no visible binding for global
  ## variable" NOTE by R CMD check:
  Protein_ID <- NULL
  
  ## 1. Preparation
  expNames<- names(data)
  list1=list2=list4=list3=list5=list6 = vector(mode="list", length=length(expNames))
  names(list1)=names(list2)=names(list4)=names(list3)=names(list5)=names(list6) = expNames
  
  ## 2. Iterate over all experiments and retrieve data
  for (en in expNames){
    setTmp <- data[[en]]
    
    ## Split annotation data (stored as featureData in the expressionSets) into
    ## a data frame of melting curve parameters, model information (boolean
    ## variables indicating whether sufficient non-missing values were available
    ## for model fit and whether the model converged successfully).
    fDat        <- pData(featureData(setTmp))
    
    ## Specify column names:
    cols1 <- meltCurveParamNames(returnParNames = TRUE, 
                                 returnPerformanceInfo = FALSE)
    cols2 <- meltCurveParamNames(returnParNames = FALSE,
                                 returnPerformanceInfo = TRUE)
    
    ciOpt <- getOption("TPPTR_CI")
    if(!is.null(ciOpt)){
      if (ciOpt){
        cols2 = c(cols2, "CI_meltPointUpper", "CI_meltPointLower", "CI_meltPoint_delta")
      }
    }    
    cols6 <- "plot"
    cols3 <- setdiff(colnames(fDat), c(cols1,cols2, cols6))
    
    ## Split featureData into separate data frames:
    df1  <- select(fDat, !!!syms(cols1))
    df2  <- select(fDat, !!!syms(cols2))
    df3  <- select(fDat, !!!syms(cols3))
    df6  <- select(fDat, !!!syms(cols6))
    
    ## Retrieve fold change matrix from current expressionSet and convert to 
    ## data frame:
    df4 <- as.data.frame(Biobase::exprs(setTmp))
    
    ## Data frame with indicators which proteins where identified in which experiment:
    df5 <- data.frame("protein_identified_in" = rep(TRUE, nrow(setTmp)))
    
    ## Append experiment id to all data frame columns to make them unique when 
    ## combined to big experiment-spanning results table:
    if (ncol(df1) > 0) colnames(df1) <- paste(colnames(df1), en, sep="_")
    if (ncol(df2) > 0) colnames(df2) <- paste(colnames(df2), en, sep="_")
    if (ncol(df3) > 0) colnames(df3) <- paste(colnames(df3), en, sep="_")
    if (ncol(df4) > 0) colnames(df4) <- paste(colnames(df4), en, sep="_")
    if (ncol(df5) > 0) colnames(df5) <- paste(colnames(df5), en, sep="_")
    if (ncol(df6) > 0) colnames(df6) <- paste(colnames(df6), en, sep="_")
    
    ## If data was normalized, add suffix 'norm_' to the fold change column 
    ## names. Normalized data is recognized by the values of the normalization 
    ## coefficients in the fold change column annotation.
    flagIsNormalized <- any(!is.na(pData(data[[en]])$normCoeff))
    if (flagIsNormalized) {
      colnames(df4) <- paste("norm", colnames(df4), sep="_")
    }
    
    ## Add protein ID column so that the data frames of multiple experiment 
    ## (with different subsets of proteins detected in each experiment) can 
    ## later be merged together in a robust way:
    idsTmp <- featureNames(setTmp)
    df1 <- data.frame(Protein_ID=idsTmp, df1, stringsAsFactors=FALSE)
    df2 <- data.frame(Protein_ID=idsTmp, df2, stringsAsFactors=FALSE)
    df3 <- data.frame(Protein_ID=idsTmp, df3, stringsAsFactors=FALSE)
    df4 <- data.frame(Protein_ID=idsTmp, df4, stringsAsFactors=FALSE)
    df5 <- data.frame(Protein_ID=idsTmp, df5, stringsAsFactors=FALSE)
    df6 <- data.frame(Protein_ID=idsTmp, df6, stringsAsFactors=FALSE)
    
    ## Store data frames of each experiment in a list. This will enable
    ## easy and robust merging using plyr::join_all.
    list1[[en]] <- df1
    list2[[en]] <- df2
    list3[[en]] <- df3
    list4[[en]] <- df4
    list5[[en]] <- df5
    list6[[en]] <- df6
  }
  merged1 <- arrange(join_all(list1, by="Protein_ID", type="full"), Protein_ID)
  merged2 <- arrange(join_all(list2, by="Protein_ID", type="full"), Protein_ID)
  merged3 <- arrange(join_all(list3, by="Protein_ID", type="full"), Protein_ID)
  merged4 <- arrange(join_all(list4, by="Protein_ID", type="full"), Protein_ID)
  merged5 <- arrange(join_all(list5, by="Protein_ID", type="full"), Protein_ID)
  merged6 <- arrange(join_all(list6, by="Protein_ID", type="full"), Protein_ID)
  
  ## Insert FALSE if a protein was not present in an experiment (instead of the
  ## NAs generated by the join_all function):
  for (en in expNames){
    name <- paste("protein_identified_in", en, sep="_")
    x    <- merged5[, name]
    x[is.na(x)] <- FALSE
    merged5[, name] <- x
  }
  
  ## Merge plot columns (columns of individual experiments can contain missing 
  ## values if experiment did not provide enough data for plotting):
  plotCols <- grep("plot", colnames(merged6), value = TRUE)
  if (length(plotCols)>0){
    allPlots <- data.frame(
      Protein_ID = merged6$Protein_ID, 
      plot = merge_cols(data = merged6[,plotCols], 
                        fun = paste,
                        collapse = '|')
    )
    merged6 <- join(merged6, allPlots, by="Protein_ID")
  }
  merged6 <- subset(merged6, select = !colnames(merged6) %in% plotCols)
  
  ## Return results:
  return(list(curveParDF   = merged1,
              modelInfoDF  = merged2,
              otherAnnotDF = merged3,
              fcDF         = merged4,
              presenceDF   = merged5,
              plotCol      = merged6))
}