# R/dataProcessing.R

# Defines functions dataProcessing

# Documented in dataProcessing

#' Data processing
#'
#' Interactive workflow that turns a raw data frame into a numeric matrix and
#' stores the result in the global environment. The matrix is treated as
#' samples in rows and proteins in columns (the sample threshold is scaled by
#' `nrow()` and the remaining proteins are reported with `ncol()`).
#'
#' The steps are, in order:
#' \enumerate{
#'   \item Load the raw data with `getDataFrame()`, pass it through
#'     `transposeData()`, select rows with `getNames()`, coerce to numeric and
#'     replace `NA` by 0.
#'   \item Rename rows with `modifyNames()` and optionally restrict the rows
#'     with `setSubGroup()`.
#'   \item Remove proteins whose column name starts with a user-supplied
#'     prefix. The prefix is interpreted as a regular expression.
#'   \item Remove proteins that have a positive value in fewer samples than a
#'     user-supplied threshold. A threshold `<= 1` is read as a fraction of
#'     samples, anything larger as a number of samples.
#'   \item Optionally replace zeros by random draws from a down-shifted,
#'     narrowed normal distribution fitted per protein on the log2 scale.
#'   \item Append the matrix to the global list `data0` and the protein names
#'     to the global list `protein.groups`.
#' }
#'
#' @return `NULL`, invisibly. The function is called for its side effects:
#'   it prompts on the console, draws plots, and writes `data0` and
#'   `protein.groups` to the global environment.
#' @export
#'
#' @importFrom rlist list.append
#' @importFrom fastcluster hclust
#' @importFrom ggdendro ggdendrogram
#'
#'
dataProcessing <- function() {

  # ---- Local helpers --------------------------------------------------------

  # Ask until the reply is a finite number. An empty reply yields `default`
  # when one is given, so the defaults advertised in the prompts really apply.
  askNumber <- function(prompt, default = NA_real_) {
    repeat {
      reply <- trimws(readline(prompt))
      if (reply == "" && !is.na(default)) {
        return(default)
      }
      value <- suppressWarnings(as.numeric(reply))
      if (is.finite(value)) {
        return(value)
      }
      message("Please enter a number.")
    }
  }

  # Replace zeros column by column with 2^N(mean - shift, (width * sd)^2),
  # where mean and sd are taken from the log2 of the column's positive values.
  # Returns a modified copy; the input matrix is left untouched.
  imputeShiftedNormal <- function(x, shift, width) {
    skipped <- 0L
    for (j in seq_len(ncol(x))) {
      values <- x[, j]
      # Zeros mark missing measurements (NA was replaced by 0 earlier). The
      # mask is taken on the raw scale so that a real intensity of 1
      # (log2 == 0) is not mistaken for a missing value.
      missing <- values == 0
      if (!any(missing)) {
        next
      }
      observed <- log2(values[values > 0])
      if (length(observed) < 2L) {
        # mean/sd are undefined here; leave the zeros instead of writing NaN.
        skipped <- skipped + 1L
        next
      }
      x[missing, j] <- 2^rnorm(sum(missing),
                               mean = mean(observed) - shift,
                               sd = width * sd(observed))
    }
    if (skipped > 0L) {
      warning(skipped,
              " protein(s) with fewer than two measured values were not imputed.",
              call. = FALSE)
    }
    x
  }

  # Append `value` under the name `label` to the list `target` in the global
  # environment, creating the list if it does not exist yet.
  appendToGlobal <- function(target, value, label) {
    store <- get0(target, envir = .GlobalEnv, ifnotfound = list())
    store <- list.append(store, value)
    names(store)[length(store)] <- label
    assign(target, store, pos = .GlobalEnv)
  }


  # ---- Load and prepare the raw data ---------------------------------------

  message("Find raw data frame.")
  data <- as.matrix(getDataFrame())

  data <- transposeData(data)

  message("")
  message("Specify data based on: ")
  # drop = FALSE keeps a matrix even when a single row is selected.
  data <- data[getNames(data, 1), , drop = FALSE]
  class(data) <- "numeric"
  data[is.na(data)] <- 0

  rownames(data) <- modifyNames(rownames(data))

  message("")
  if (ok("Specify subgroup to process?")) {
    data <- data[setSubGroup(rownames(data)), , drop = FALSE]
  }

  name <- readline("Name of dataset: ")

  message("Ready to process.")


  # ---- Remove proteins based on prefix -------------------------------------

  done <- FALSE

  while (!done) {

    prefix <- readline("Based on which prefix should proteins be removed? (CON/REV/...) ")

    if (prefix == "") {
      # Without this there is no way to leave the loop without applying a
      # prefix filter.
      done <- ok("No prefix entered. Done?")
      next
    }

    # Proteins are columns, so the same column-name mask is used both for the
    # announced count and for the actual removal.
    has_prefix <- regexpr(prefix, colnames(data)) == 1

    message(paste(sum(has_prefix), " proteins will be removed based on the prefix ", prefix, ".", sep = ""))

    if (ok("Ok?")) {
      data <- data[, !has_prefix, drop = FALSE]

      done <- ok("Done?")
    }

  }


  # ---- Check samples (disabled) --------------------------------------------

  #message("Let's check samples.")

  #dendro.x <- hclust(d = dist(x = data), method = "complete")
  #print(ggdendrogram(data = as.dendrogram(dendro.x), rotate = FALSE))


  # ---- Remove proteins based on presence in samples ------------------------

  torg <- "?"

  while (torg != "total" && torg != "groupwise") {
    torg <- readline("How should proteins be counted? (total/groupwise) ")
  }

  if (torg == "total") {

    # Number of samples in which each protein has a positive value.
    count <- colSums(data > 0)

    print(table(count))
    print(hist(count, xlab = "present in # samples", ylab = "Number of proteins"))

    # Number of proteins that pass each possible sample threshold. Counting
    # against every threshold 0..nrow(data) keeps the axis aligned even when
    # some count value does not occur in the data.
    n_samples <- nrow(data)
    thresholds <- 0:n_samples
    count2 <- vapply(thresholds, function(k) sum(count >= k), numeric(1))

    plot(thresholds, count2, xlab = "Sample threshold", ylab = "Number of proteins")
    plot(thresholds / n_samples, count2 / count2[1], xlab = "Sample threshold", ylab = "Fraction of proteins")

    print(rbind("# samples" = thresholds,
                "% samples" = round(thresholds / n_samples, 2) * 100,
                "# proteins" = count2,
                "% proteins" = round(count2 / count2[1], 2) * 100))

    threshold <- askNumber("Threshold to use protein in the analysis: (number or fraction of samples) ")

    # Values up to 1 are fractions of samples; note that an input of exactly 1
    # therefore means "all samples", not "one sample".
    if (threshold <= 1) {
      threshold <- ceiling(threshold * n_samples)
    }

    data <- data[, count >= threshold, drop = FALSE]

    message(paste(ncol(data), " proteins left."))

  } else {

    # The prompt offers "groupwise", but no groupwise filter exists here.
    message("Groupwise counting is not implemented; no proteins were removed based on presence in samples.")

  }


  # ---- Data imputation ------------------------------------------------------

  methods <- c("not", "shifted normal distribution", "s")
  impute <- "?"

  while (!impute %in% methods) {

    impute <- readline("Would you like to impute data? (ja/nein) ")

    if (impute == "nein") {
      impute <- "not"
    } else if (impute == "ja") {
      impute <- readline("How would you like to impute data? (shifted normal distribution) ")
    } else if (!impute %in% methods) {
      # A method name typed directly is accepted without complaint.
      message("Please answer the question.")
    }

  }

  # No imputation
  if (impute == "not") {
    message("Nothing was successfully imputed.")
  }

  # Imputation from normal distribution
  if (impute %in% c("shifted normal distribution", "s")) {

    done <- FALSE

    while (!done) {

      shift <- askNumber("Shift? (default = 1.8) ", default = 1.8)
      width <- askNumber("Width? (default = 0.2) ", default = 0.2)

      # Always start from the un-imputed matrix so that repeating the step
      # with other parameters actually changes the result.
      imputed <- imputeShiftedNormal(data, shift, width)

      message("Data imputed successfully from shifted normal distribution.")
      message("")
      done <- ok()

    }

    data <- imputed

  }


  # ---- Save results in the global environment ------------------------------

  appendToGlobal("data0", data, name)

  message("Data saved in data0.")

  appendToGlobal("protein.groups", colnames(data), paste("all_", name, sep = ""))

  message("Vector of all protein names saved in protein.groups.")

  invisible(NULL)

}
# nicohuttmann/htmnanalysis documentation built on Dec. 6, 2020, 3:02 a.m.