R/clone.R

Defines functions categorize_dataframe categorize_matrix generateCloneSizeBySampling

#R code to do clone related operation
#clone.R
#
#==defining 1)the functions to run sampling from clone assignment file
#==    2)run summary of clones.
#Feng @bu updated 2/27/2019

#'@title sampling to generate clone size distribution
#'@description generate the clone size distribution by bootstrap from input obtained 
#'	by the clone partation software.
#'@details be careful not to input the distribution size larger than the input files. 
#'	A warning will be issued when it happens.
#'@param filename string for the input file. Note this input file is the result by 
#'	the clonayst clone partition software. Usually it is named like "cloneAssignment.txt"
#'@param size integer for the size of the total number of members/Igs. This is like a 
#'	normalization which is identical to all sample so that we can compare between samples
#'@param times integer the number sampling to generate the distribution
#'@param sample.name string the sample name for the clone assignment.
#'
#'@return a data frame for the clone size distribution 
#'@examples
#' dir_file<-system.file("extdata", package="frLib")
#' ca<-read.table(file.path(dir_file,"CloneAssignments.txt"), header=T,sep="\t") ; #note this CloneAssignments file has been truncated 
#' print(ca);
#'x<-generateCloneSizeBySampling(filename=file.path(dir_file,"CloneAssignments.txt"), size=20, times=10, sample.name="sample13")
#'@export
#reading a file, named cloneAssignment, which is an output of 
#clonyast. each row of the file is for one sequence. This row shows
#that for this sequence which clones it belong to, etc.
#assuming a header and tab-delimited txt file.
#we basically will draw samples (size) and then 
#we will return a data.frame showing the clone size distribution
#like
#clone.size   #freq

#==========================
#'@import plyr

generateCloneSizeBySampling<-function(filename, size, times, sample.name)
{
	#clone.size<-data
	#first read the files 
	if(missing(filename))
	{
		stop("missing input file name, please specify....");
	}

	if(missing(size))
	{
		stop("missing input size (for sample size), please specify....");
	}
	if(missing(times))
	{
		stop("missing input times (repeat sampling times), please specify....");
	}
	if(missing(sample.name))
	{
		stop("missing sample name, please specify.......");
	}
	#filename="CloneAssignments.txt"
	clone.assignment<-read.table(file=filename, header=T, sep="\t");
	if(dim(clone.assignment)[1]<size)
	{
		stop("the specify the size is bigger than the total number of records, please check and try again")
	}
	
	num.records<-dim(clone.assignment)[1];
	
	clone.size<-NULL;
	for(i in 1:times)
	{
		index<-sample(c(1:num.records), size);
		clone.picked<-clone.assignment[index,];
		#now we get a sample clones, run stats
		
		clone.temp<-aggregate(clone.picked$CDR3Length, by=list(CloneID=clone.picked$CloneID), FUN=length)
		colnames(clone.temp)[2]<-"Members";
		
		clone.temp<-count(clone.temp, vars="Members")
		names(clone.temp)<-c("Members", paste0("freq_",i))
		if(i==1)
		{
			clone.size<-clone.temp;
		}else {
			clone.size<-merge(clone.size, clone.temp, all.x=T, all.y=T)
		}
	}
	#first get NA to be replaced by 0
	for(i in 1:dim(clone.size)[1])
	{
		clone.size[i,is.na(clone.size[i,])]<-0;
	}

	#now ran state for the samples
	clone.size<-cbind(clone.size[,1],apply(clone.size[,c(-1)], 1, mean))
	colnames(clone.size)<-c("Members", sample.name); 
	#done in here.
	clone.size;
}

#convert the numeric data frame or data matrix into groups/categories
#'@title categorize the data on the basis of the input criteria
#'@description We count the frequencies of a numeric data set based on the input category intervals 
#'	. 
#'@param dat the numeric data that will be grouped.
#'@param criteria numeric vector describing the grouping criteria or the intervals. The grouping is done
#'	in such way: 1)group less than the minimum value of criteria; 2) group larger than minimum one and 
#'		less than 2nd smallest value; ..... n-1) group larger than 2nd largest value and less the largest
#'		value; n) group with values larger than the maximum x value.
#'@param by the numeric data that we used to check against the criteria for categorizing
#'@return a data matrix holding the newly categorized data.
#'@examples
#'
#create random data
#'set.seed(0)

#'dat<-data.frame(g1=runif(100, 0,1), g2=runif(100, 0,1))
#'criteria<-c(0.1, 0.5,0.7)
#'
#'cdat<-categorize_matrix(as.matrix(dat), criteria,FUN="sum");
#'print(cdat)
#'cdat.2<-categorize_matrix(as.matrix(dat), by=dat[,1], criteria,FUN="sum");
#'print(cdat.2)
#'
#'@export 
categorize_matrix<-function(dat, criteria, by,FUN)
{
	if(missing(dat))
	{
		stop("please specify the input data \"dat\"!!")
	}
	if(missing(criteria))
	{
		stop("please specify the criteria variable!!")
	}
	criteria<-sort(criteria)

	if(length(criteria)<1)
	{
		stop("the input criteria variable is zero-lengthed!!")
	}
	
	if(class(dat)!="matrix")
	{
		stop("Expecting a data matrix as input")
	}
	#dat<-as.matrix(dat);
	re.dat<-matrix(0, nrow=length(criteria)+1, ncol=dim(dat)[2]);
	compare.against.criteria<-dat
	if(!missing(by))
	{
		compare.against.criteria<-by
	}
	for(j in 1:dim(dat)[2])
	{
		#for the first case
		#compare.values<-dat[,j]
		if(missing(by))
		{
			compare.against.criteria<-dat[,j]
		}
		index<-which(compare.against.criteria<=criteria[1]);
		re.dat[1,j]<-(eval(call(FUN,dat[index,j])));#dat[index,j],2,sum)
		if(length(criteria)>1)
		{
			#now, stop doing the job
			for(i in 2:length(criteria))
			{
				index<-which(compare.against.criteria>criteria[i-1]&compare.against.criteria<=criteria[i]);
				re.dat[i,j]<-eval(call(FUN,dat[index,j]));#(dat[index,j],2,sum)
			}
		}
		index<-which(compare.against.criteria>criteria[length(criteria)]);
		re.dat[length(criteria)+1,j]<-eval(call(FUN,dat[index,j]));#apply(dat[index,j],2,sum);
	}
	#cat("set the names!!")
	colnames(re.dat)<-colnames(dat);
	return (re.dat);
}


#convert the numeric data frame or data matrix into groups/categories
#'@title categorize the data on the basis of the input criteria
#'@description We count the frequencies of a numeric data set based on the input category intervals 
#'	, or run other functions (FUN) on them. 
#'@param dat the data frame that will be grouped. There is only on column of data we will categorize.
#'@param criteria numeric vector describing the grouping criteria or the intervals. The grouping is done
#'	in such way: 1)group less than the minimum value of criteria; 2) group larger than minimum one and 
#'		less than 2nd smallest value; ..... n-1) group larger than 2nd largest value and less the largest
#'		value; n) group with values larger than the maximum x value.
#'@param index.data the index of the data column to be categorized based on
#'@param index.group the index of the grouping variable.
#'@param index.out the index of the data column used to run the function on. see example 
#'@param FUN string to indicate the function for summarize the data, eg, mean, length, sum are supported
#'@return a data matrix holding the newly categorized data and the grouping variable.
#'@examples
#'
#create random data
#'set.seed(0)
#'
#'dat<-data.frame(g1=runif(200, 0,1), g2=rep(c("group1","group2"),100 ), count=sample(c(1:100),200, replace=T))
#'criteria<-c(0.1, 0.5,0.7)
#'#first do the case where the index.out is the same as the index.data, we are generate the criteria and output on
#'#the same  column. 
#'cdat<-categorize_dataframe(dat=dat, criteria=criteria,
#'           	index.data=1,index.group=2, index.out=1, FUN="length"); 
#'	#or we call the function by omitting the index.out , so it take the default.
#'      #cdat<-categorize_dataframe(dat=data, criteria=criteria,
#'        #   	index.data=1,index.group=2, FUN="length"); 
#'
#' #second case, calling to do the output and criteria on different columns
#'cdat2<-categorize_dataframe(dat=dat, criteria=criteria,
#'           	index.data=1,index.group=2, index.out=3, FUN="sum"); 
#'@export 
categorize_dataframe<-function(dat, criteria,index.data, index.group, index.out=index.data, FUN)
{
	if(missing(dat))
	{
		stop("please specify the input data \"dat\"!!")
	}
	if(missing(criteria))
	{
		stop("please specify the criteria variable!!")
	}
	criteria<-sort(criteria)
	
	if(missing(index.data))
	{
		stop("please specify the input index.data")
	}
	if(missing(index.group))
	{
		stop("please specify the input index.group")
	}
	#if no index of the output data was specified, we will simply
	#make the index.data as the output field.
	if(missing(index.out))
	{
		index.out<-index.data;
	}
	if(length(criteria)<1)
	{
		stop("the input criteria variable is zero-lengthed!!")
	}
	if(missing(FUN))
	{
		stop("please specify the FUN")
	}
	groupNames<-unique(dat[,index.group])
	#dat<-as.matrix(dat);
	re.dat<-data.frame(rep(0,(length(criteria)+1)*length(groupNames)),
						rep(groupNames,each=(length(criteria)+1))
						)
	names(re.dat)<-names(dat)[c(index.out,index.group)]
	for(j in 1:length(groupNames))
	{	
			#for the first case
		dat.group=dat[dat[,index.group]==groupNames[j],index.data]
		out.group=dat[dat[,index.group]==groupNames[j],index.out]
		index<-which(dat.group<=criteria[1]);
		re.dat[1+(length(criteria)+1)*(j-1),1]<-eval(call(FUN,out.group[index]));#dat[index,j],2,sum)
		if(length(criteria)>1)
		{
			#now, start doing the job
			for(i in 2:length(criteria))
			{
				index<-which(dat.group>criteria[i-1]&dat.group<=criteria[i]);
				re.dat[i+(length(criteria)+1)*(j-1),1]<-eval(call(FUN,out.group[index]));#(dat[index,j],2,sum)
			}
		}
		index<-which(dat.group>criteria[length(criteria)]);
		re.dat[length(criteria)+1+(length(criteria)+1)*(j-1),1]<-eval(call(FUN,out.group[index]));#apply(dat[index,j],2,sum);
		
	}
	return (re.dat);
}
ffeng23/frLib documentation built on Jan. 14, 2023, 10:14 a.m.