#' Data processing
#'
#' Interactive workflow that loads a raw data frame, converts it to a numeric
#' matrix (rows are treated as samples, columns as proteins), optionally
#' restricts it to a subgroup, removes proteins by name prefix, filters
#' proteins by the number of samples they are present in, optionally imputes
#' zero (missing) values from a down-shifted normal distribution, and finally
#' stores the result in the global environment.
#'
#' All choices are made by the user at the console via `readline()` and the
#' package helper `ok()`.
#'
#' @return `NULL`, invisibly. The function is called for its side effects:
#'   the processed matrix is appended to the list `data0` in the global
#'   environment under the name entered by the user, and the vector of its
#'   column names is appended to the list `protein.groups` in the global
#'   environment under the name `all_<name>`. Both lists are created if they
#'   do not exist yet.
#' @export
#'
#' @importFrom rlist list.append
#' @importFrom fastcluster hclust
#' @importFrom ggdendro ggdendrogram
#'
#'
dataProcessing <- function() {
  message("Find raw data frame.")
  data <- as.matrix(getDataFrame())
  data <- transposeData(data)
  message("")
  message("Specify data based on: ")
  # drop = FALSE keeps a matrix even when a single row is selected.
  data <- data[getNames(data, 1), , drop = FALSE]
  # Coerce the matrix to numeric in place (dimensions and dimnames are kept);
  # entries that cannot be parsed become NA and are then encoded as 0.
  class(data) <- "numeric"
  data[is.na(data)] <- 0
  rownames(data) <- modifyNames(rownames(data))
  message("")
  if (ok("Specify subgroup to process?")) {
    data <- data[setSubGroup(rownames(data)), , drop = FALSE]
  }
  name <- readline("Name of dataset: ")
  message("Ready to process.")

  # Remove proteins based on prefix ----
  # The loop only ends after a removal has been confirmed and the user
  # answers "Done?" positively; an empty prefix simply asks again.
  done <- FALSE
  while (!done) {
    prefix <- readline(
      "Based on which prefix should proteins be removed? (CON/REV/...) "
    )
    if (prefix != "") {
      # Proteins are columns, so both the preview count and the actual
      # filter must be computed from the column names.
      has_prefix <- regexpr(prefix, colnames(data)) == 1
      message(paste0(
        sum(has_prefix),
        " proteins will be removed based on the prefix ",
        prefix,
        "."
      ))
      if (ok("Ok?")) {
        data <- data[, !has_prefix, drop = FALSE]
        done <- ok("Done?")
      }
    }
  }

  # Check samples
  # message("Let's check samples.")
  # dendro.x <- hclust(d = dist(x = data), method = "complete")
  # print(ggdendrogram(data = as.dendrogram(dendro.x), rotate = FALSE))

  # Remove proteins based on presence in samples ----
  # NOTE(review): only the "total" mode performs any filtering in this
  # function; "groupwise" is accepted but no groupwise step is present here.
  torg <- "?"
  while (!torg %in% c("total", "groupwise")) {
    torg <- readline("How should proteins be counted? (total/groupwise) ")
  }
  if (torg == "total") {
    n_samples <- nrow(data)
    # Number of samples (rows) in which each protein (column) is > 0.
    count <- unname(colSums(data > 0))
    print(table(count))
    print(hist(count,
               xlab = "present in # samples",
               ylab = "Number of proteins"))

    # Sum up: number of proteins present in at least t samples, for every
    # threshold t = 0..n_samples. Computed directly from `count` so that the
    # axis stays correct even when some count values never occur.
    thresholds <- 0:n_samples
    count2 <- vapply(thresholds, function(t) sum(count >= t), numeric(1))
    plot(thresholds, count2,
         xlab = "Sample threshold", ylab = "Number of proteins")
    plot(thresholds / n_samples, count2 / count2[1],
         xlab = "Sample threshold", ylab = "Fraction of proteins")
    print(rbind(
      "# samples" = thresholds,
      "% samples" = round(thresholds / n_samples, 2) * 100,
      "# proteins" = count2,
      "% proteins" = round(count2 / count2[1], 2) * 100
    ))

    # Ask until a number is entered; non-numeric input would otherwise make
    # the comparison below fail on NA.
    threshold <- NA_real_
    while (is.na(threshold)) {
      threshold <- suppressWarnings(as.numeric(readline(
        "Threshold to use protein in the analysis: (number or fraction of samples) "
      )))
    }
    # Values <= 1 are interpreted as a fraction of samples (so 1 means
    # "all samples", not "one sample").
    if (threshold <= 1) {
      threshold <- ceiling(threshold * n_samples)
    }
    data <- data[, count >= threshold, drop = FALSE]
    message(paste0(ncol(data), " proteins left."))
  }

  # Data imputation ----
  impute <- "?"
  while (!impute %in% c("not", "shifted normal distribution", "s")) {
    impute <- readline("Would you like to impute data? (ja/nein) ")
    if (impute == "nein") {
      impute <- "not"
    } else if (impute == "ja") {
      impute <- readline(
        "How would you like to impute data? (shifted normal distribution) "
      )
    } else {
      message("Please answer the question.")
    }
  }

  # No imputation
  if (impute == "not") {
    message("Nothing was successfully imputed.")
  }

  # Imputation from normal distribution
  if (impute %in% c("shifted normal distribution", "s")) {
    # Keep the un-imputed matrix so that every attempt starts from the same
    # data; otherwise a rejected attempt would leave nothing to impute.
    raw <- data
    done <- FALSE
    while (!done) {
      # Empty or non-numeric input falls back to the advertised default.
      shift <- suppressWarnings(as.numeric(readline("Shift? (default = 1.8) ")))
      if (is.na(shift)) {
        shift <- 1.8
      }
      width <- suppressWarnings(as.numeric(readline("Width? (default = 0.2) ")))
      if (is.na(width)) {
        width <- 0.2
      }

      data <- raw
      skipped <- 0L
      for (j in seq_len(ncol(raw))) {
        # Zeros encode missing values; everything > 0 counts as observed.
        is_missing <- raw[, j] == 0
        if (!any(is_missing)) {
          next
        }
        observed <- log2(raw[raw[, j] > 0, j])
        mu <- mean(observed)
        sigma <- sd(observed)
        # With fewer than two observed values the distribution is undefined;
        # leave the zeros in place instead of writing NaN into the data.
        if (!is.finite(mu) || !is.finite(sigma)) {
          skipped <- skipped + 1L
          next
        }
        # Draw on the log2 scale from a narrowed, down-shifted normal
        # distribution of the protein's observed values.
        data[is_missing, j] <- 2^rnorm(sum(is_missing),
                                       mean = mu - shift,
                                       sd = width * sigma)
      }
      if (skipped > 0L) {
        warning(skipped,
                " protein(s) with fewer than two observed values were not imputed.",
                call. = FALSE)
      }
      message("Data imputed successfully from shifted normal distribution.")
      message("")
      done <- ok()
    }
  }

  # Save data frame in LFQ ----
  # Start from an empty list when the global store does not exist yet.
  stored_data <- get0("data0", envir = .GlobalEnv, ifnotfound = list())
  stored_data <- list.append(stored_data, data)
  names(stored_data)[length(stored_data)] <- name
  assign("data0", stored_data, pos = .GlobalEnv)
  message("Data saved in data0.")

  # Add protein lists ----
  stored_groups <- get0("protein.groups",
                        envir = .GlobalEnv,
                        ifnotfound = list())
  stored_groups <- list.append(stored_groups, colnames(data))
  names(stored_groups)[length(stored_groups)] <- paste0("all_", name)
  assign("protein.groups", stored_groups, pos = .GlobalEnv)
  message("Vector of all protein names saved in protein.groups.")
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.