#' Preprocess a feature intensity table (metabolomics / apLCMS-style).
#'
#' Pipeline: read table -> drop requested sample columns -> recode
#' missing.val as NA -> optional replicate-reproducibility filter ->
#' replicate summarization (or plain NA imputation) -> reorder sample
#' columns by class label -> overall and group-wise signal filtering ->
#' transformation/normalization (log2, quantile, loess, median-center,
#' z-score, MAD scaling, in that order) -> write result files and return.
#'
#' The first two columns of the feature table are assumed to be m/z and
#' retention time; remaining columns are sample intensities.
#'
#' Side effects: creates parentoutput_dir/Stage1/, changes the working
#' directory to it (setwd, never restored), writes several .txt files
#' there, and globally suppresses warnings via options(warn=-1)
#' (also never restored).
#'
#' External dependencies visible from this code: evaluate.Features and
#' getSumreplicates (project helpers, defined elsewhere), and
#' normalizeQuantiles / normalizeCyclicLoess (presumably from limma —
#' TODO confirm the package is attached by the caller).
#'
#' Arguments (as used below):
#'   Xmat                    feature table; if NA it is read from
#'                           feature_table_file (tab-delimited, header row)
#'   Ymat                    class labels (SampleID, Class); if NA they are
#'                           read from class_labels_file (when not NA)
#'   num_replicates          technical replicates per sample; labels are
#'                           subsampled every num_replicates-th row
#'   feat.filt.thresh        if not NA, keep features whose replicate
#'                           reproducibility median <= threshold
#'   summarize.replicates    collapse replicates via getSumreplicates
#'   summary.method          "mean" or "median" (only affects file name here;
#'                           passed through to getSumreplicates)
#'   all.missing.thresh      min fraction of ALL samples with signal
#'   group.missing.thresh    min fraction of samples with signal required in
#'                           at least ONE class group
#'   missing.val             sentinel for "missing" in the input (e.g. 0);
#'                           NA disables the recode and ">missing.val" tests
#'   samplermindex           column indices to drop up front, or NA
#'   rep.max.missing.thresh  passed to getSumreplicates
#'   summary.na.replacement  "zeros" | "halfsamplemin" | "halfdatamin" |
#'                           "halffeaturemin" NA-imputation strategy
#'   featselmethod           accepted but unused in this function
#'
#' Returns: list(data_matrix_afternorm_scaling, data_matrix_prescaling),
#' each cbind(mz, time, intensities).
data_preprocess <-
function(Xmat=NA,Ymat=NA,feature_table_file,parentoutput_dir,class_labels_file,num_replicates=3,feat.filt.thresh=NA,summarize.replicates=TRUE,summary.method="mean",
all.missing.thresh=0.5,group.missing.thresh=0.7,
log2transform=TRUE,medcenter=TRUE,znormtransform=FALSE,quantile_norm=TRUE,lowess_norm=FALSE,madscaling=FALSE,missing.val=0,samplermindex=NA, rep.max.missing.thresh=0.5,summary.na.replacement="zeros",featselmethod=NA){
# NOTE(review): globally disables warnings and never restores the old
# setting on exit — affects the caller's session.
options(warn=-1)
#read file; First row is column headers
# NOTE(review): is.na(Xmat==TRUE) evaluates Xmat==TRUE first; this is TRUE
# for the default scalar NA but is likely intended as is.na(Xmat) — for a
# supplied matrix/data frame the if() uses only the first element. Verify.
if(is.na(Xmat==TRUE)){
data_matrix<-read.table(feature_table_file,sep="\t",header=TRUE)
}else{
data_matrix<-Xmat
#rm(Xmat)
}
#print("signal filter threshold ")
#print(group.missing.thresh)
#print(missing.val)
# Optionally drop user-specified sample columns before any processing.
if(is.na(samplermindex)==FALSE){
data_matrix<-data_matrix[,-c(samplermindex)]
}
#use only unique records
data_matrix<-unique(data_matrix)
# Recode the missing-value sentinel (e.g. 0) as NA across the whole table.
# Note: as.matrix() coerces everything to a common type here.
if(is.na(missing.val)==FALSE){
#print("Replacing missing values with NAs.")
data_matrix<-replace(as.matrix(data_matrix),which(data_matrix==missing.val),NA)
}
#print("dim of original data matrix")
#print(dim(data_matrix))
data_matrix_orig<-data_matrix
snames<-colnames(data_matrix)
# Create parentoutput_dir/Stage1/ and make it the working directory for all
# write.table() calls below. NOTE(review): setwd is a session-wide side
# effect and is not restored via on.exit.
dir.create(parentoutput_dir,showWarnings=FALSE)
parentoutput_dir<-paste(parentoutput_dir,"/Stage1/",sep="")
dir.create(parentoutput_dir,showWarnings=FALSE)
fheader="transformed_log2fc_threshold_"
setwd(parentoutput_dir)
#Step 1) PID or CV calculation
# Replicate-reproducibility filter: score each feature across replicates
# (via external evaluate.Features) and keep features whose median score is
# at or below feat.filt.thresh.
if(is.na(feat.filt.thresh)==FALSE)
{
if(num_replicates>1)
{
# The first two columns (m/z, time) are duplicated in front — presumably
# the layout evaluate.Features expects; confirm against its definition.
feat.eval.result=evaluate.Features(cbind(data_matrix[,c(1:2)],data_matrix), num_replicates,alignment.tool="apLCMS",impute.bool=TRUE)
cnames=colnames(feat.eval.result)
feat.eval.result<-apply(feat.eval.result,2,as.numeric)
feat.eval.result<-as.data.frame(feat.eval.result)
feat.eval.result.mat=cbind(data_matrix[,c(1:2)],feat.eval.result)
feat.eval.outfile=paste("featureassessment_usinggoodsamples.txt",sep="")
#write results
write.table(feat.eval.result.mat, feat.eval.outfile,sep="\t", row.names=FALSE)
#data_m<-data_m[which(as.numeric(feat.eval.result$median)<=feat.filt.thresh),]
data_matrix<-data_matrix[which(as.numeric(feat.eval.result$median)<=feat.filt.thresh),]
}
}
####################################################################################
#Step 2) Average replicates
# Either collapse replicate columns into one column per sample
# (getSumreplicates), or keep columns as-is and just impute NAs according
# to summary.na.replacement.
if(summarize.replicates==TRUE)
{
if(num_replicates>1)
{
data_m<-getSumreplicates(data_matrix,alignment.tool="apLCMS",numreplicates=num_replicates,numcluster=10,rep.max.missing.thresh=rep.max.missing.thresh,summary.method=summary.method,summary.na.replacement, missing.val=missing.val)
if(summary.method=="mean"){
#print("Replicate averaging done")
filename<-paste("Rawdata_averaged.txt",sep="")
}else{
if(summary.method=="median"){
#print("Replicate median summarization done")
filename<-paste("Rawdata_median_summarized.txt",sep="")
}
}
data_m_prenorm<-cbind(data_matrix[,c(1:2)],data_m)
write.table(data_m_prenorm, file=filename,sep="\t",row.names=FALSE)
#num_samps_group[[1]]=(1/num_replicates)*num_samps_group[[1]]
#num_samps_group[[2]]=(1/num_replicates)*num_samps_group[[2]]
}else
{
# Single replicate: drop the mz/time columns and impute NAs.
data_m<-data_matrix[,-c(1:2)]
if(summary.na.replacement=="zeros"){
data_m<-replace(data_m,which(is.na(data_m)==TRUE),0)
}else{
if(summary.na.replacement=="halfsamplemin"){
# Per-sample (column) minimum / 2.
data_m<-apply(data_m,2,function(x){naind<-which(is.na(x)==TRUE); if(length(naind)>0){x[naind]<-min(x,na.rm=TRUE)/2}; return(x)})
}else{
if(summary.na.replacement=="halfdatamin"){
# Global minimum of the whole matrix / 2.
min_val<-min(data_m,na.rm=TRUE)*0.5
data_m<-replace(data_m,which(is.na(data_m)==TRUE),min_val)
#data_m<-apply(data_m,1,function(x){naind<-which(is.na(x)==TRUE); if(length(naind)>0){x[naind]<-min(x,na.rm=TRUE)/2}; return(x)})
}else{
if(summary.na.replacement=="halffeaturemin"){
# Per-feature (row) minimum / 2; apply over rows returns the
# transpose, hence t() to restore features-in-rows.
data_m<-apply(data_m,1,function(x){naind<-which(is.na(x)==TRUE); if(length(naind)>0){x[naind]<-min(x,na.rm=TRUE)/2}; return(x)})
data_m<-t(data_m)
}
}
}
}
}
}else
{
# summarize.replicates==FALSE: same NA-imputation logic as the
# single-replicate branch above (duplicated code).
data_m<-data_matrix[,-c(1:2)]
if(summary.na.replacement=="zeros"){
data_m<-replace(data_m,which(is.na(data_m)==TRUE),0)
}else{
if(summary.na.replacement=="halfsamplemin"){
data_m<-apply(data_m,2,function(x){naind<-which(is.na(x)==TRUE); if(length(naind)>0){x[naind]<-min(x,na.rm=TRUE)/2}; return(x)})
}else{
if(summary.na.replacement=="halfdatamin"){
min_val<-min(data_m,na.rm=TRUE)*0.5
data_m<-replace(data_m,which(is.na(data_m)==TRUE),min_val)
#data_m<-apply(data_m,1,function(x){naind<-which(is.na(x)==TRUE); if(length(naind)>0){x[naind]<-min(x,na.rm=TRUE)/2}; return(x)})
}else{
if(summary.na.replacement=="halffeaturemin"){
data_m<-apply(data_m,1,function(x){naind<-which(is.na(x)==TRUE); if(length(naind)>0){x[naind]<-min(x,na.rm=TRUE)/2}; return(x)})
data_m<-t(data_m)
}
}
}
}
}
# Rebuild the full table (mz, time, imputed intensities) and keep copies.
data_matrix<-cbind(data_matrix[,c(1:2)],data_m)
data_matrix_orig<-data_matrix
data_subjects<-data_m
ordered_labels={}
num_samps_group<-new("list")
# Class-label handling: read/derive labels, then REORDER the sample columns
# so that all samples of class 1 come first, then class 2, etc. The
# group-wise filtering below (Step 3) relies on this contiguous ordering.
if(is.na(class_labels_file)==FALSE)
{
#print("Class labels file:")
#print(class_labels_file)
data_matrix={}
# NOTE(review): same is.na() pattern concern as for Xmat — for a supplied
# data-frame Ymat this tests only the first element. Verify.
if(is.na(Ymat)==TRUE){
classlabels<-read.table(class_labels_file,sep="\t",header=TRUE)
}else{
classlabels<-Ymat
}
classlabels<-as.data.frame(classlabels)
colnames(classlabels)<-c("SampleID","Class")
f1<-table(classlabels$SampleID)
classlabels<-as.data.frame(classlabels)
# One label row per sample: keep every num_replicates-th row, assuming
# replicate rows are consecutive in the labels file.
classlabels<-classlabels[seq(1,dim(classlabels)[1],num_replicates),]
#print(classlabels)
class_labels_levels<-levels(as.factor(classlabels[,2]))
# Drop an empty-string level (e.g. from blank label cells).
bad_rows<-which(class_labels_levels=="")
if(length(bad_rows)>0){
class_labels_levels<-class_labels_levels[-bad_rows]
}
# Rebuild data_matrix with columns grouped by class; record the ordered
# labels and per-group sample counts for the group-wise filter.
for(c in 1:length(class_labels_levels))
{
if(c>1){
data_matrix<-cbind(data_matrix,data_subjects[,which(classlabels[,2]==class_labels_levels[c])])
}else{
data_matrix<-data_subjects[,which(classlabels[,2]==class_labels_levels[c])]
}
classlabels_index<-which(classlabels[,2]==class_labels_levels[c])
ordered_labels<-c(ordered_labels,as.character(classlabels[classlabels_index,2]))
num_samps_group[[c]]<-length(classlabels_index)
}
#colnames(data_matrix)<-as.character(ordered_labels)
data_matrix<-cbind(data_matrix_orig[,c(1:2)],data_matrix)
data_m<-as.matrix(data_matrix[,-c(1:2)])
}else{
if(is.na(Ymat)==TRUE){
# No labels at all: treat everything as one group "A".
classlabels<-rep("A",dim(data_m)[2])
ordered_labels<-classlabels
num_samps_group[[1]]<-dim(data_m)[2]
class_labels_levels<-c("A")
data_m<-as.matrix(data_matrix[,-c(1:2)])
}else{
# Labels supplied via Ymat but no labels file: compute group sizes and
# ordered labels WITHOUT reordering the data columns (the column
# reordering is commented out below) — columns are assumed to already
# be grouped by class; TODO confirm callers guarantee this.
classlabels<-Ymat
classlabels<-as.data.frame(classlabels)
colnames(classlabels)<-c("SampleID","Class")
f1<-table(classlabels$SampleID)
classlabels<-as.data.frame(classlabels)
classlabels<-classlabels[seq(1,dim(classlabels)[1],num_replicates),]
#print(classlabels)
class_labels_levels<-levels(as.factor(classlabels[,2]))
bad_rows<-which(class_labels_levels=="")
if(length(bad_rows)>0){
class_labels_levels<-class_labels_levels[-bad_rows]
}
for(c in 1:length(class_labels_levels))
{
#if(c>1){
#data_matrix<-cbind(data_matrix,data_subjects[,which(classlabels[,2]==class_labels_levels[c])])
#}else{
# data_matrix<-data_subjects[,which(classlabels[,2]==class_labels_levels[c])]
#}
classlabels_index<-which(classlabels[,2]==class_labels_levels[c])
ordered_labels<-c(ordered_labels,as.character(classlabels[classlabels_index,2]))
num_samps_group[[c]]<-length(classlabels_index)
}
#colnames(data_matrix)<-as.character(ordered_labels)
#data_matrix<-cbind(data_matrix_orig[,c(1:2)],data_matrix)
#data_m<-as.matrix(data_matrix[,-c(1:2)])
}
}
##################################################################################
metab_zeros={}
data_clean<-{}
clean_metabs<-{}
#num_samps_group[[3]]<-0
# Overall signal filter: keep features with signal (value > missing.val, or
# non-NA when missing.val is NA) in strictly more than
# all.missing.thresh * n_samples columns; stop() if none qualify.
if(is.na(all.missing.thresh)==FALSE){
total_sigs<-apply(data_m,1,function(x){
if(is.na(missing.val)==FALSE){return(length(which(x>missing.val)))
}else{
return(length(which(is.na(x)==FALSE)))
}})
total_sig_thresh<-dim(data_m)[2]*all.missing.thresh
total_good_metabs<-which(total_sigs>total_sig_thresh)
if(length(total_good_metabs)>0){
data_m<-data_m[total_good_metabs,]
data_matrix<-data_matrix[total_good_metabs,]
#print(paste("Dimension of data matrix after overall ",all.missing.thresh,"% signal threshold filtering",sep=""))
#print(paste("Dimension of data matrix after using overall ",100*all.missing.thresh, "% signal criteria for filtering:"),sep="")
#print(dim(data_matrix))
}else{
stop(paste("None of the metabolites have signal in ",all.missing.thresh*100, "% of samples",sep=""))
}
}
#print(dim(data_m))
#Step 3) Remove features if signal is not detected in at least 70% of samples in either of the groups
# Group-wise signal filter: keep a feature if AT LEAST ONE class group has
# signal in >= group.missing.thresh of its samples. The 2- and 3-group
# branches index contiguous column ranges via num_samps_group (hence the
# column reordering above); the general >2 branch matches by ordered_labels
# instead and handles 4+ groups (the ==3 case is caught first).
if(is.na(group.missing.thresh)==FALSE){
if(length(class_labels_levels)==2){
sig_thresh_groupA<-group.missing.thresh*num_samps_group[[1]]
sig_thresh_groupB<-group.missing.thresh*num_samps_group[[2]]
for(metab_num in 1:dim(data_matrix)[1])
{
#print(missing.val)
if(is.na(missing.val)==FALSE){
num_sigsA<-length(which(data_m[metab_num,1:num_samps_group[[1]]]>missing.val))
num_sigsB<-length(which(data_m[metab_num,(num_samps_group[[1]]+1):(num_samps_group[[1]]+num_samps_group[[2]])]>missing.val))
}else{
num_sigsA<-length(which(is.na(data_m[metab_num,1:num_samps_group[[1]]])==FALSE))
num_sigsB<-length(which(is.na(data_m[metab_num,(num_samps_group[[1]]+1):(num_samps_group[[1]]+num_samps_group[[2]])])==FALSE))
}
if((num_sigsA>=sig_thresh_groupA) || (num_sigsB>=sig_thresh_groupB))
{
clean_metabs<-c(clean_metabs,metab_num)
}
}
#print(length(clean_metabs))
}else{
if(length(class_labels_levels)==3){
sig_thresh_groupA<-group.missing.thresh*num_samps_group[[1]]
sig_thresh_groupB<-group.missing.thresh*num_samps_group[[2]]
sig_thresh_groupC<-group.missing.thresh*num_samps_group[[3]]
for(metab_num in 1:dim(data_matrix)[1])
{
if(is.na(missing.val)==FALSE){
num_sigsA<-length(which(data_m[metab_num,1:num_samps_group[[1]]]>missing.val))
num_sigsB<-length(which(data_m[metab_num,(num_samps_group[[1]]+1):(num_samps_group[[1]]+num_samps_group[[2]])]>missing.val))
num_sigsC<-length(which(data_m[metab_num,(num_samps_group[[1]]+num_samps_group[[2]]+1):(num_samps_group[[1]]+num_samps_group[[2]]+num_samps_group[[3]])]>missing.val))
}else{
num_sigsA<-length(which(is.na(data_m[metab_num,1:num_samps_group[[1]]])==FALSE))
num_sigsB<-length(which(is.na(data_m[metab_num,(num_samps_group[[1]]+1):(num_samps_group[[1]]+num_samps_group[[2]])])==FALSE))
num_sigsC<-length(which(is.na(data_m[metab_num,(num_samps_group[[1]]+num_samps_group[[2]]+1):(num_samps_group[[1]]+num_samps_group[[2]]+num_samps_group[[3]])])==FALSE))
}
if((num_sigsA>=sig_thresh_groupA) || (num_sigsB>=sig_thresh_groupB) || (num_sigsC>=sig_thresh_groupC))
{
clean_metabs<-c(clean_metabs,metab_num)
}
}
}else{
if(length(class_labels_levels)>2){
for(metab_num in 1:dim(data_m)[1])
{
for(c in 1:length(class_labels_levels)){
if(is.na(missing.val)==FALSE){
num_cursig<-length(which(data_m[metab_num,which(ordered_labels==class_labels_levels[c])]>missing.val))
}else{
num_cursig<-length(which(is.na(data_m[metab_num,which(ordered_labels==class_labels_levels[c])])==FALSE))
}
sig_thresh_cur<-length(which(ordered_labels==class_labels_levels[c]))*group.missing.thresh
if(num_cursig>=sig_thresh_cur)
{
# One qualifying group is enough; stop scanning groups.
clean_metabs<-c(clean_metabs,metab_num)
break #for(i in 1:4){if(i==3){break}else{#print(i)}}
}
}
}
}
else{
if(length(class_labels_levels)==1){
num_samps_group[[1]]<-num_samps_group[[1]]
sig_thresh_groupA<-group.missing.thresh*num_samps_group[[1]]
for(metab_num in 1:dim(data_matrix)[1])
{
if(is.na(missing.val)==FALSE){
num_sigsA<-length(which(data_m[metab_num,1:num_samps_group[[1]]]>missing.val))
}else{
num_sigsA<-length(which(is.na(data_m[metab_num,1:num_samps_group[[1]]])==FALSE))
}
if((num_sigsA>=sig_thresh_groupA) )
{
clean_metabs<-c(clean_metabs,metab_num)
}
}
}
}
}
}
if(length(clean_metabs)>0){
#print(length(clean_metabs))
data_m<-data_m[clean_metabs,]
data_matrix<-data_matrix[clean_metabs,]
#print(paste("Dimension of data matrix after using group-wise ",100*group.missing.thresh, "% signal criteria for filtering:"),sep="")
#print(dim(data_matrix))
#print("here 2")
#print(data_m[1:10,1:5])
}
}
#print("before")
#print(data_m[1:10,1:10])
####################################################################
#Step 4) Data transformation and normalization
# log2(x+1) keeps zeros finite.
if(log2transform==TRUE)
{
data_m<-log2(data_m+1)
#print("log scale")
#print(data_m[1:10,1:10])
}
if(quantile_norm==TRUE)
{
data_m<-normalizeQuantiles(data_m)
#print("quant norm")
#print(data_m[1:10,1:10])
}
if(lowess_norm==TRUE)
{
data_m<-normalizeCyclicLoess(data_m)
#print("lowess")
}
# Snapshot before centering/scaling — returned alongside the scaled matrix.
data_m_prescaling<-data_m
if(medcenter==TRUE)
{
# NOTE(review): variable is named colmedians but apply(..., 1, ...) and
# sweep(..., 1, ...) operate over ROWS — this median-centers each
# feature, not each sample. Confirm which was intended.
colmedians=apply(data_m,1,function(x){median(x,na.rm=TRUE)})
data_m=sweep(data_m,1,colmedians)
}
if(znormtransform==TRUE)
{
# scale() standardizes columns; the t()/t() pair makes this a per-feature
# (row-wise) z-score.
data_m<-scale(t(data_m))
data_m<-t(data_m)
}
if(madscaling==TRUE)
{
# Per-sample scaling: each column is multiplied by (geometric mean of all
# column MADs) / (its own MAD about the column median).
colmedians=apply(data_m,2,function(x){median(x,na.rm=TRUE)})
Y=sweep(data_m,2,colmedians)
mad<-apply(abs(Y),2,function(x){median(x,na.rm=TRUE)})
const<-prod(mad)^(1/length(mad))
scale.normalized<-sweep(data_m,2,const/mad,"*")
data_m<-scale.normalized
}
#print("after")
#print(data_m[1:10,1:10])
#Use this if first column is gene/metabolite name
#for apLCMS:
#data_matrix_temp<-data_matrix[,c(1:2)]
data_matrix<-cbind(data_matrix[,c(1:2)],data_m)
#print(dim(data_matrix))
#print(dim(data_m))
data_m<-as.data.frame(data_m)
num_rows<-dim(data_m)[1]
num_columns<-dim(data_m)[2]
#print("num rows is ")
#print(num_rows)
#for apLCMS:
# Assign synthetic feature IDs (mzid_1, mzid_2, ...) and write the
# mzid -> (mz, time) mapping for downstream stages.
rnames<-paste("mzid_",seq(1,num_rows),sep="")
rownames(data_m)=rnames
mzid_mzrt<-data_matrix[,c(1:2)]
colnames(mzid_mzrt)<-c("mz","time")
rownames(mzid_mzrt)=rnames
write.table(mzid_mzrt, file="mzid_mzrt.txt",sep="\t",row.names=FALSE)
#filename<-paste("Classlabels_file.txt",sep="")
#write.table(classlabels, file=filename,sep="\t",row.names=FALSE)
filename<-paste("Normalized_sigthreshfilt_averaged_data.txt",sep="")
data_matrix<-cbind(data_matrix[,c(1:2)],data_m)
write.table(data_matrix, file=filename,sep="\t",row.names=FALSE)
data_matrix_prescaling<-cbind(data_matrix[,c(1:2)],data_m_prescaling)
# Both matrices share the same (mz, time) key columns and row order.
return(list(data_matrix_afternorm_scaling=data_matrix,data_matrix_prescaling=data_matrix_prescaling))
#return(data_matrix)
}
# The two lines below are scraped website boilerplate (not R code); they are
# commented out so the file parses:
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.