options(stringsAsFactors=FALSE)
#' Customization of QC output
#'
#' Enable the user to choose which parameter to output from a dataframe containing the columns compound, metabolite, nbna, sd, mu, CV, and remove.
#'
#' @param data the input QC dataframe;
#' @param COMPOUND default=TRUE, output the column named "compound";
#' @param NBNA default=TRUE, output the column named "nbna";
#' @param SD default=TRUE, output the column named "sd";
#' @param MU default=TRUE, output the column named "mu";
#' @param CV default=TRUE, output the column named "CV";
#' @param REMOVE default=TRUE, output the column named "remove".
#'
#' @return subdata the desired dataframe,
#'
#'
#' @examples
#' QCcustomization(QCresult$QC, REMOVE=FALSE)
#'
#' @export
QCcustomization <- function(data, COMPOUND=TRUE, COHORTE=TRUE, METABOLITE=TRUE, NBNA=TRUE, SD=TRUE, MU=TRUE, CV=TRUE, REMOVE=TRUE){
cols <- c(COMPOUND, COHORTE, METABOLITE, NBNA, SD, MU, CV, REMOVE)
subdata <- cbind(data[,cols])
return(subdata)
}
#' Data Quality Control
#'
#' Quality control for metabolomic data. Will remove metabolites with more than a determined thershold of missing values for all patient.
#' Produces the following files:
#' \itemize{
#' \item "QC_ <data> .txt": Summary of the QC;
#' \item "REMOVED_QC_<data>.txt": Summary of the removed metabolite;
#' \item output, cleaned dataset file(optional);
#' \item "REMOVED_<output>.txt", the removed set of metabolite (optional)
#'
#' }
#'
#' Can be used on any kind of file containing rows with x (gene, metabolite, etc.) and column with y (Sample, patients, etc.).
#'
#' @param data name of the initial metabolomic dataset file. Must contain:
#' \itemize{
#' \item a column "Compound";
#' \item a column "Metabolite";
#' \item and all the columns sample from <sampleStart> to the end of the file;
#' }
#' the rest doesn't matter and the names "Compound" and "Metabolite" are optional, as long as the column position is specified in params. ;
#'
#' @param output default NULL, name of the output cleaned dataset to produce if not NULL;
#' @param cohort default "sample", name of the sample's cohort;
#' @param compound default NULL, position of the compound column if named otherwise;
#' @param metabolite default NULL, position of the metabolite column if named otherwise;
#' @param sampleStart default 6, the index of the 1st sample column.
#'
#' @return a list, containing the cleaned dataset and the QC dataframe
#' \itemize{
#' \item dataset cleaned dataset
#' \item QC the QC dataframe containing the columns compound, metabolite, nbna, sd, mu, CV, and remove.
#'
#' }
#'
#' @examples
#' for a dataset with the following header ; Compound, m/z, Metabolite, RT, Sample #1, ...
#' qualityControl("dummySet.txt", sampleStart = 5)
#'
#' for a dataset with the following header ; compound, m/z, metabolite, RT, Sample #1, ...
#' qualityControl("dummySet.txt", sampleStart = 5, compound = 1, metabolite = 3)
#'
#'
#' @export
qualityControl <- function(data, output=NULL, cohort="",
missing=0.2,
compound=NULL,
metabolite=NULL,
sampleStart=6){
###### Read data #################################################
DAT <- read.table(data, header=T, sep="\t", na.strings="", fill=TRUE)
dat <- DAT
if (!is.null(compound)){
colnames(dat)[compound] <- "Compound"
}
if (!is.null(metabolite)){
colnames(dat)[metabolite] <- "Metabolite"
}
if(is.null(dat$Compound)){
stop("no Compound column or column number entered")
}
if(is.null(dat$Metabolite)){
stop("no Metabolite column or column number entered")
}
dat$Cohort= cohort
GENMOD <- dat$Cohort
# les 5 premieres colonnes donnent les infos sur le metabolite
IDS <- data.frame(Compound=dat$Compound, Metabolite=dat$Metabolite)
METABO <- dat[,((sampleStart+1):dim(dat)[2]-1)]
METABO <- suppressWarnings(data.matrix(METABO))
######### QC data et sommaire ##############
# remplacer les 0 par NA
METABO[METABO == 0] <- NA
# compte des NA par groupe
nbna <- apply(is.na(METABO),1, sum)
# standard deviation par groupe
sd <- apply(METABO,1, sd, na.rm=T)
# moyenne par groupe
mu <- apply(METABO,1 , mean, na.rm=T)
# coefficient de variation
CV <- sd/mu
# on identifie les metabolites avec plus de 20% de valeurs manquantes par groupe
rm <- (nbna > (missing * dim(METABO)[2]))
# Build the name of the output QC_ file
name<- paste0("QC_", strsplit(data, ".", fixed=TRUE)[[1]][1])
QC <- data.frame(compound =IDS$Compound , cohort=GENMOD, metabolite=IDS$Metabolite, nbna, sd, mu, CV, remove=rm)
print("Saving QC data")
write.table(QCcustomization(QC[QC$remove == FALSE,], REMOVE=FALSE), paste0(name,".txt"), quote=F, row.names=F,sep="\t")
write.table(QCcustomization(QC[QC$remove==TRUE,], REMOVE=FALSE), paste0("REMOVED_",paste0(name,".txt")), quote=F, row.names=F,sep="\t")
# Retirer les métabolites concernés
clean_dataset <- as.data.frame(METABO)
clean_dataset$remove=QC$remove
remove_set <- clean_dataset[clean_dataset$remove==TRUE,]
clean_dataset <- clean_dataset[clean_dataset$remove==FALSE,]
# Ré-insérer les IDS dans le fichier et générer le nouveau dataset
IDS$remove=QC$remove
clean_dataset <- cbind(IDS[IDS$remove==FALSE, c(1,2)], clean_dataset[,1:(dim(clean_dataset)[2]-1)])
removed_set <- cbind(IDS[IDS$remove==TRUE, c(1,2)], remove_set[,1:(dim(remove_set)[2]-1)])
# Output le cleaned_dataset si demandé
if(!(is.null(output))){
print("Saving cleaned data")
write.table(clean_dataset, output, quote=F, row.names=F,sep="\t")
write.table(removed_set, paste0("REMOVED_",output), quote=F, row.names=F,sep="\t")
}
label<- paste(QC[QC$remove==FALSE,"compound"],
QC[QC$remove==FALSE,"metabolite"], sep = "_")
dup_label<- duplicated(label)
i <- 0
for (dup in dup_label){
i <- i+1
if(dup){
warning(paste(paste(paste("row", i), "is a duplicated compound_metabolite:"), label[i]))
}
}
j <- 0
for(i in unlist(QC[QC$remove==FALSE,]$sd)){
j<- j+1
if(is.na(i)){
warning(paste0("Sd == NA for row: ", label[j]))
}else if(i==0){
warning(paste0("Sd == 0 for row: ", label[j]))
}
}
return(list("dataset"= clean_dataset, "QC"=QC))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.