R/data_preprocess.R

Defines functions data_preprocess

data_preprocess <-
function(Xmat=NA,Ymat=NA,feature_table_file,parentoutput_dir,class_labels_file,num_replicates=3,feat.filt.thresh=NA,summarize.replicates=TRUE,summary.method="mean",
 all.missing.thresh=0.5,group.missing.thresh=0.7,
log2transform=TRUE,medcenter=TRUE,znormtransform=FALSE,quantile_norm=TRUE,lowess_norm=FALSE,madscaling=FALSE,missing.val=0,samplermindex=NA, rep.max.missing.thresh=0.5,summary.na.replacement="zeros",featselmethod=NA){
	
	options(warn=-1)

	#read file; First row is column headers
	if(is.na(Xmat==TRUE)){
		data_matrix<-read.table(feature_table_file,sep="\t",header=TRUE)
	}else{
		data_matrix<-Xmat
		#rm(Xmat)
		}
	#print("signal filter threshold ")
	#print(group.missing.thresh)
	#print(missing.val)
	
	if(is.na(samplermindex)==FALSE){
		data_matrix<-data_matrix[,-c(samplermindex)]
		}
		
		#use only unique records
		data_matrix<-unique(data_matrix)
		
		if(is.na(missing.val)==FALSE){
		
		#print("Replacing missing values with NAs.")
		data_matrix<-replace(as.matrix(data_matrix),which(data_matrix==missing.val),NA)
		}
		
		
		#print("dim of original data matrix")
		#print(dim(data_matrix))
		data_matrix_orig<-data_matrix

		
		snames<-colnames(data_matrix)
			

			
			

		dir.create(parentoutput_dir,showWarnings=FALSE)
		parentoutput_dir<-paste(parentoutput_dir,"/Stage1/",sep="")

		dir.create(parentoutput_dir,showWarnings=FALSE)
		fheader="transformed_log2fc_threshold_"
		setwd(parentoutput_dir)
		
	#Step 1) PID or CV calculation
	if(is.na(feat.filt.thresh)==FALSE)
	{
		if(num_replicates>1)
		{

				feat.eval.result=evaluate.Features(cbind(data_matrix[,c(1:2)],data_matrix), num_replicates,alignment.tool="apLCMS",impute.bool=TRUE)

								cnames=colnames(feat.eval.result)
								feat.eval.result<-apply(feat.eval.result,2,as.numeric)
								feat.eval.result<-as.data.frame(feat.eval.result)
								feat.eval.result.mat=cbind(data_matrix[,c(1:2)],feat.eval.result)  
								feat.eval.outfile=paste("featureassessment_usinggoodsamples.txt",sep="")					  
								#write results
								write.table(feat.eval.result.mat, feat.eval.outfile,sep="\t", row.names=FALSE)
								
								#data_m<-data_m[which(as.numeric(feat.eval.result$median)<=feat.filt.thresh),]
								data_matrix<-data_matrix[which(as.numeric(feat.eval.result$median)<=feat.filt.thresh),]
		}
	}
	
	####################################################################################
	
	#Step 2) Average replicates
	if(summarize.replicates==TRUE)
	{
		if(num_replicates>1)
		{

			data_m<-getSumreplicates(data_matrix,alignment.tool="apLCMS",numreplicates=num_replicates,numcluster=10,rep.max.missing.thresh=rep.max.missing.thresh,summary.method=summary.method,summary.na.replacement, missing.val=missing.val)
		
			if(summary.method=="mean"){
			#print("Replicate averaging done")
			filename<-paste("Rawdata_averaged.txt",sep="")
			}else{
				if(summary.method=="median"){
			#print("Replicate median summarization done")
			filename<-paste("Rawdata_median_summarized.txt",sep="")
			}
				
			}
						
			data_m_prenorm<-cbind(data_matrix[,c(1:2)],data_m)
			
			write.table(data_m_prenorm, file=filename,sep="\t",row.names=FALSE)
			#num_samps_group[[1]]=(1/num_replicates)*num_samps_group[[1]]
			#num_samps_group[[2]]=(1/num_replicates)*num_samps_group[[2]]
		}else
		{
			data_m<-data_matrix[,-c(1:2)]
			if(summary.na.replacement=="zeros"){
		data_m<-replace(data_m,which(is.na(data_m)==TRUE),0)
		}else{
			if(summary.na.replacement=="halfsamplemin"){
		data_m<-apply(data_m,2,function(x){naind<-which(is.na(x)==TRUE); if(length(naind)>0){x[naind]<-min(x,na.rm=TRUE)/2}; return(x)})
		}else{
			
			if(summary.na.replacement=="halfdatamin"){
				
				
			min_val<-min(data_m,na.rm=TRUE)*0.5
			data_m<-replace(data_m,which(is.na(data_m)==TRUE),min_val)

			#data_m<-apply(data_m,1,function(x){naind<-which(is.na(x)==TRUE); if(length(naind)>0){x[naind]<-min(x,na.rm=TRUE)/2}; return(x)})
			}else{
			if(summary.na.replacement=="halffeaturemin"){
			data_m<-apply(data_m,1,function(x){naind<-which(is.na(x)==TRUE); if(length(naind)>0){x[naind]<-min(x,na.rm=TRUE)/2}; return(x)})
			data_m<-t(data_m)
			}
			}
		}
			
			
		}
		}
	}else
	{
		data_m<-data_matrix[,-c(1:2)]
				if(summary.na.replacement=="zeros"){
		data_m<-replace(data_m,which(is.na(data_m)==TRUE),0)
		}else{
			if(summary.na.replacement=="halfsamplemin"){
		data_m<-apply(data_m,2,function(x){naind<-which(is.na(x)==TRUE); if(length(naind)>0){x[naind]<-min(x,na.rm=TRUE)/2}; return(x)})
		}else{
			
			if(summary.na.replacement=="halfdatamin"){
				

			min_val<-min(data_m,na.rm=TRUE)*0.5
			data_m<-replace(data_m,which(is.na(data_m)==TRUE),min_val)

			#data_m<-apply(data_m,1,function(x){naind<-which(is.na(x)==TRUE); if(length(naind)>0){x[naind]<-min(x,na.rm=TRUE)/2}; return(x)})
			}else{
			if(summary.na.replacement=="halffeaturemin"){
			data_m<-apply(data_m,1,function(x){naind<-which(is.na(x)==TRUE); if(length(naind)>0){x[naind]<-min(x,na.rm=TRUE)/2}; return(x)})
			data_m<-t(data_m)
			}
			}
		}
			
			
		}
		
	}
	

	data_matrix<-cbind(data_matrix[,c(1:2)],data_m)

	data_matrix_orig<-data_matrix
	data_subjects<-data_m

		ordered_labels={}
		
	num_samps_group<-new("list")
	
	if(is.na(class_labels_file)==FALSE)
	{
		
		#print("Class labels file:")
		#print(class_labels_file)
		
			data_matrix={}
			
		if(is.na(Ymat)==TRUE){
		classlabels<-read.table(class_labels_file,sep="\t",header=TRUE)
		}else{
			classlabels<-Ymat
			}
			
			
				classlabels<-as.data.frame(classlabels)
				colnames(classlabels)<-c("SampleID","Class")
				f1<-table(classlabels$SampleID)
			
			
			classlabels<-as.data.frame(classlabels)
			classlabels<-classlabels[seq(1,dim(classlabels)[1],num_replicates),]
			#print(classlabels)
			class_labels_levels<-levels(as.factor(classlabels[,2]))
			bad_rows<-which(class_labels_levels=="")
			if(length(bad_rows)>0){
			class_labels_levels<-class_labels_levels[-bad_rows]
			}
			
	for(c in 1:length(class_labels_levels))
	{
		if(c>1){
		data_matrix<-cbind(data_matrix,data_subjects[,which(classlabels[,2]==class_labels_levels[c])])
		}else{
			data_matrix<-data_subjects[,which(classlabels[,2]==class_labels_levels[c])]
		}
		classlabels_index<-which(classlabels[,2]==class_labels_levels[c])
		ordered_labels<-c(ordered_labels,as.character(classlabels[classlabels_index,2]))
		num_samps_group[[c]]<-length(classlabels_index)
		
	}

	#colnames(data_matrix)<-as.character(ordered_labels)
	data_matrix<-cbind(data_matrix_orig[,c(1:2)],data_matrix)
	data_m<-as.matrix(data_matrix[,-c(1:2)])

}else{
	if(is.na(Ymat)==TRUE){
		classlabels<-rep("A",dim(data_m)[2])
			ordered_labels<-classlabels
			num_samps_group[[1]]<-dim(data_m)[2]
		    class_labels_levels<-c("A")		
		    data_m<-as.matrix(data_matrix[,-c(1:2)])
		
		}else{
			classlabels<-Ymat
			classlabels<-as.data.frame(classlabels)
				colnames(classlabels)<-c("SampleID","Class")
				f1<-table(classlabels$SampleID)
			
			
			classlabels<-as.data.frame(classlabels)
			classlabels<-classlabels[seq(1,dim(classlabels)[1],num_replicates),]
			#print(classlabels)
			class_labels_levels<-levels(as.factor(classlabels[,2]))
			bad_rows<-which(class_labels_levels=="")
			if(length(bad_rows)>0){
			class_labels_levels<-class_labels_levels[-bad_rows]
			}
			
	for(c in 1:length(class_labels_levels))
	{
		#if(c>1){
		#data_matrix<-cbind(data_matrix,data_subjects[,which(classlabels[,2]==class_labels_levels[c])])
		#}else{
		#	data_matrix<-data_subjects[,which(classlabels[,2]==class_labels_levels[c])]
		#}
		classlabels_index<-which(classlabels[,2]==class_labels_levels[c])
		ordered_labels<-c(ordered_labels,as.character(classlabels[classlabels_index,2]))
		num_samps_group[[c]]<-length(classlabels_index)
		
	}

	#colnames(data_matrix)<-as.character(ordered_labels)
	#data_matrix<-cbind(data_matrix_orig[,c(1:2)],data_matrix)
	#data_m<-as.matrix(data_matrix[,-c(1:2)])
			}
			

			
	}	
	
	
	##################################################################################
	metab_zeros={}
	data_clean<-{}
	clean_metabs<-{}
	#num_samps_group[[3]]<-0
	
	if(is.na(all.missing.thresh)==FALSE){
	
		total_sigs<-apply(data_m,1,function(x){
		if(is.na(missing.val)==FALSE){return(length(which(x>missing.val)))
		}else{
		return(length(which(is.na(x)==FALSE)))
		}})
		
		

		total_sig_thresh<-dim(data_m)[2]*all.missing.thresh
		
		total_good_metabs<-which(total_sigs>total_sig_thresh)
		if(length(total_good_metabs)>0){
		data_m<-data_m[total_good_metabs,]
		data_matrix<-data_matrix[total_good_metabs,]
		#print(paste("Dimension of data matrix after overall ",all.missing.thresh,"% signal threshold filtering",sep=""))
		#print(paste("Dimension of data matrix after using overall ",100*all.missing.thresh, "% signal criteria for filtering:"),sep="")
		#print(dim(data_matrix))
		}else{
		stop(paste("None of the metabolites have signal in ",all.missing.thresh*100, "% of samples",sep=""))
		}
	}
	
	#print(dim(data_m))
	#Step 3) Remove features if signal is not detected in at least 70% of samples in either of the groups		
	
	if(is.na(group.missing.thresh)==FALSE){
	
	if(length(class_labels_levels)==2){
		
		sig_thresh_groupA<-group.missing.thresh*num_samps_group[[1]]
		sig_thresh_groupB<-group.missing.thresh*num_samps_group[[2]]
	
		for(metab_num in 1:dim(data_matrix)[1])
		{
			#print(missing.val)
			if(is.na(missing.val)==FALSE){
			
			num_sigsA<-length(which(data_m[metab_num,1:num_samps_group[[1]]]>missing.val))
			
			
			num_sigsB<-length(which(data_m[metab_num,(num_samps_group[[1]]+1):(num_samps_group[[1]]+num_samps_group[[2]])]>missing.val))
			
			}else{
			
			num_sigsA<-length(which(is.na(data_m[metab_num,1:num_samps_group[[1]]])==FALSE))
			num_sigsB<-length(which(is.na(data_m[metab_num,(num_samps_group[[1]]+1):(num_samps_group[[1]]+num_samps_group[[2]])])==FALSE))
			
			}
			
			if((num_sigsA>=sig_thresh_groupA) || (num_sigsB>=sig_thresh_groupB))
			{
				clean_metabs<-c(clean_metabs,metab_num)
			}

		}
		
		#print(length(clean_metabs))
	}else{
		if(length(class_labels_levels)==3){
		
		
			sig_thresh_groupA<-group.missing.thresh*num_samps_group[[1]]
			sig_thresh_groupB<-group.missing.thresh*num_samps_group[[2]]
			sig_thresh_groupC<-group.missing.thresh*num_samps_group[[3]]
			
			for(metab_num in 1:dim(data_matrix)[1])
			{
				if(is.na(missing.val)==FALSE){
				num_sigsA<-length(which(data_m[metab_num,1:num_samps_group[[1]]]>missing.val))
				num_sigsB<-length(which(data_m[metab_num,(num_samps_group[[1]]+1):(num_samps_group[[1]]+num_samps_group[[2]])]>missing.val))
				num_sigsC<-length(which(data_m[metab_num,(num_samps_group[[1]]+num_samps_group[[2]]+1):(num_samps_group[[1]]+num_samps_group[[2]]+num_samps_group[[3]])]>missing.val))
				}else{
				
				num_sigsA<-length(which(is.na(data_m[metab_num,1:num_samps_group[[1]]])==FALSE))
				num_sigsB<-length(which(is.na(data_m[metab_num,(num_samps_group[[1]]+1):(num_samps_group[[1]]+num_samps_group[[2]])])==FALSE))
				num_sigsC<-length(which(is.na(data_m[metab_num,(num_samps_group[[1]]+num_samps_group[[2]]+1):(num_samps_group[[1]]+num_samps_group[[2]]+num_samps_group[[3]])])==FALSE))
				
				}
				
				if((num_sigsA>=sig_thresh_groupA) || (num_sigsB>=sig_thresh_groupB) || (num_sigsC>=sig_thresh_groupC))
				{
					clean_metabs<-c(clean_metabs,metab_num)
				}

			}
		}else{
			if(length(class_labels_levels)>2){
		
				
			
			for(metab_num in 1:dim(data_m)[1])
			{
				for(c in 1:length(class_labels_levels)){
				if(is.na(missing.val)==FALSE){
				
				num_cursig<-length(which(data_m[metab_num,which(ordered_labels==class_labels_levels[c])]>missing.val))
				
				}else{
				
				num_cursig<-length(which(is.na(data_m[metab_num,which(ordered_labels==class_labels_levels[c])])==FALSE))
				
		
				}
				
				sig_thresh_cur<-length(which(ordered_labels==class_labels_levels[c]))*group.missing.thresh
				if(num_cursig>=sig_thresh_cur)
				{
					clean_metabs<-c(clean_metabs,metab_num)
					break   #for(i in 1:4){if(i==3){break}else{#print(i)}}

				}

				}
			}
		}
			else{
			
						
						
							if(length(class_labels_levels)==1){
		num_samps_group[[1]]<-num_samps_group[[1]]
		
		
			sig_thresh_groupA<-group.missing.thresh*num_samps_group[[1]]

			
			for(metab_num in 1:dim(data_matrix)[1])
			{
				if(is.na(missing.val)==FALSE){
				num_sigsA<-length(which(data_m[metab_num,1:num_samps_group[[1]]]>missing.val))
			
				}else{
				
				num_sigsA<-length(which(is.na(data_m[metab_num,1:num_samps_group[[1]]])==FALSE))
								}
				
				if((num_sigsA>=sig_thresh_groupA) )
				{
					clean_metabs<-c(clean_metabs,metab_num)
				}

			}
			}
		}
			}
	}
	
	if(length(clean_metabs)>0){
		#print(length(clean_metabs))
		
		
	
	data_m<-data_m[clean_metabs,]
	data_matrix<-data_matrix[clean_metabs,]
	
	#print(paste("Dimension of data matrix after using group-wise ",100*group.missing.thresh, "% signal criteria for filtering:"),sep="")
	#print(dim(data_matrix))
	
	
	
	#print("here 2")
	#print(data_m[1:10,1:5])
	
	}
	
	}
	#print("before")
	#print(data_m[1:10,1:10])
	####################################################################
	#Step 4) Data transformation and normalization
	
		if(log2transform==TRUE)
	{
		data_m<-log2(data_m+1)
		
		#print("log scale")
		#print(data_m[1:10,1:10])
	}
	
	if(quantile_norm==TRUE)
	{
		data_m<-normalizeQuantiles(data_m)
		#print("quant norm")
		#print(data_m[1:10,1:10])
	}
	
	if(lowess_norm==TRUE)
	{
		data_m<-normalizeCyclicLoess(data_m)
		#print("lowess")
	}
	
	data_m_prescaling<-data_m
	
	if(medcenter==TRUE)
	{
		colmedians=apply(data_m,1,function(x){median(x,na.rm=TRUE)})
		data_m=sweep(data_m,1,colmedians)
		
		
	}
	if(znormtransform==TRUE)
	{
		data_m<-scale(t(data_m))
		data_m<-t(data_m)
	}
	

	if(madscaling==TRUE)
	{
		colmedians=apply(data_m,2,function(x){median(x,na.rm=TRUE)})

		Y=sweep(data_m,2,colmedians)
		mad<-apply(abs(Y),2,function(x){median(x,na.rm=TRUE)})
		const<-prod(mad)^(1/length(mad))
		scale.normalized<-sweep(data_m,2,const/mad,"*")
		data_m<-scale.normalized
	}

		
	#print("after")
	#print(data_m[1:10,1:10])
	#Use this if first column is gene/metabolite name
	#for apLCMS:
	#data_matrix_temp<-data_matrix[,c(1:2)]
	data_matrix<-cbind(data_matrix[,c(1:2)],data_m)	

	#print(dim(data_matrix))
	#print(dim(data_m))
	
	data_m<-as.data.frame(data_m)
	
	num_rows<-dim(data_m)[1]
	num_columns<-dim(data_m)[2]

	#print("num rows is ")
	#print(num_rows)
	#for apLCMS:
	rnames<-paste("mzid_",seq(1,num_rows),sep="")
	rownames(data_m)=rnames
	
	mzid_mzrt<-data_matrix[,c(1:2)]
	colnames(mzid_mzrt)<-c("mz","time")
	rownames(mzid_mzrt)=rnames
	write.table(mzid_mzrt, file="mzid_mzrt.txt",sep="\t",row.names=FALSE)

	#filename<-paste("Classlabels_file.txt",sep="")
	#write.table(classlabels, file=filename,sep="\t",row.names=FALSE)
	
	filename<-paste("Normalized_sigthreshfilt_averaged_data.txt",sep="")
	data_matrix<-cbind(data_matrix[,c(1:2)],data_m)
	write.table(data_matrix, file=filename,sep="\t",row.names=FALSE)
	data_matrix_prescaling<-cbind(data_matrix[,c(1:2)],data_m_prescaling)
	return(list(data_matrix_afternorm_scaling=data_matrix,data_matrix_prescaling=data_matrix_prescaling))
	#return(data_matrix)
}
jaspershen/MSannotator documentation built on May 18, 2019, 5:55 p.m.