R/methods-CuffFeatureSet.R

Defines functions .samples .replicates .fpkm .repFpkm .featureNames .annotation .fpkmMatrix .repFpkmMatrix .countMatrix .repCountMatrix .diffData .diffTable .count .heatmap .ggheat

########################
#methods-CuffFeatureSet.R
#
#Author: Loyal A. Goff
#
#Date created: 5-17-2011
#
#Description:
#########################

#################
#Initialize		#
#################
setMethod("initialize","CuffFeatureSet",
		function(.Object,
				annotation=data.frame(),
				fpkm=data.frame(),
				diff=data.frame(),
				repFpkm=data.frame(),
				count=data.frame(),
				... ){
			.Object<-callNextMethod(.Object,
					annotation=annotation,
					fpkm=fpkm,
					diff=diff,
					repFpkm=repFpkm,
					count=count,
					...)
		}
)

#################
#Validate		#
#################
#TODO: Add validity constraints
setValidity("CuffFeatureSet",function(object){
			TRUE
			#Add test for genes with no expression
		}
)		

#################
#Class Methods	#
#################
setMethod("show","CuffFeatureSet",function(object){
		cat(class(object),"instance for ",length(object)," features\nSlots:
			\tannotation
			\tfpkm
			\trepFpkm
			\tdiff
			\tcount\n")
		}
)

setMethod("length","CuffFeatureSet",
		function(x){
			dim(x@annotation)[1]
		}
)

#Add dim method so you can find number of samples in CuffFeatureSet, not just number of genes


#################
#Subsetting		#
#################
#TODO: Add subset methods to return a CuffFeature object
#setMethod("[","CuffFeatureSet",function(object,featureID){
#			
#		}
#)

#################
#Accessors
##################
.samples<-function(object){
	res<-fpkm(object)$sample_name
	res
}

setMethod("samples","CuffFeatureSet",.samples)

.replicates<-function(object){
	res<-repFpkm(object)$rep_name
	res
}

setMethod("replicates","CuffFeatureSet",.replicates)

.fpkm<-function(object,features=FALSE){
	myFPKM<-object@fpkm
	myFPKM$stdev<-(myFPKM$conf_hi-myFPKM$fpkm)/2
	if (features){
		return (merge(object@annotation,myFPKM,by=1))
	}else{
		return(myFPKM)
	}
}
setMethod("fpkm",signature(object="CuffFeatureSet"),.fpkm)

.repFpkm<-function(object,features=FALSE){
	myFPKM<-object@repFpkm
	#myFPKM$stdev<-(myFPKM$conf_hi-myFPKM$fpkm)/2
	if (features){
		return (merge(object@annotation,myFPKM,by=1))
	}else{
		return(myFPKM)
	}
}
setMethod("repFpkm",signature(object="CuffFeatureSet"),.repFpkm)

.featureNames<-function(object){
	data.frame(tracking_id=object@annotation[,1],gene_short_name=object@annotation$gene_short_name)
}

setMethod("featureNames",signature(object="CuffFeatureSet"),.featureNames)

.annotation<-function(object){
	object@annotation
}

setMethod(BiocGenerics::annotation,signature(object="CuffFeatureSet"),.annotation)

.fpkmMatrix<-function(object,fullnames=FALSE,sampleIdList){
	#Sample subsetting
	if(!missing(sampleIdList)){
		if (!all(sampleIdList %in% samples(object))){
			stop("Sample does not exist!")
		}else{
			mySamples<-sampleIdList
		}
	}else{
		mySamples<-samples(object)
	}
	if(fullnames){
		res<-fpkm(object,features=TRUE)
		res$tracking_id<-paste(res$gene_short_name,res[,1],sep="|")
	}else{
		res<-fpkm(object)
		colnames(res)[1]<-"tracking_id"	
	}
	selectedRows<-c('tracking_id','sample_name','fpkm')
	res<-res[,selectedRows]
	res<-melt(res)
	res<-dcast(res,tracking_id~sample_name)
	res<-data.frame(res[,-1],row.names=res[,1])
	if(!missing(sampleIdList)){
		res<-res[,mySamples]
	}
	res
}

setMethod("fpkmMatrix",signature(object="CuffFeatureSet"),.fpkmMatrix)

.repFpkmMatrix<-function(object,fullnames=FALSE,repIdList){
	#Sample subsetting
	if(!missing(repIdList)){
		if (!all(repIdList %in% replicates(object))){
			stop("Replicate does not exist!")
		}else{
			myReps<-repIdList
		}
	}else{
		myReps<-replicates(object)
	}
	if(fullnames){
		res<-repFpkm(object,features=TRUE)
		res$tracking_id<-paste(res$gene_short_name,res[,1],sep="|")
	}else{
		res<-repFpkm(object)
		colnames(res)[1]<-"tracking_id"	
	}
	selectedRows<-c('tracking_id','rep_name','fpkm')
	res<-res[,selectedRows]
	res<-melt(res)
	res<-dcast(res,tracking_id~rep_name)
	res<-data.frame(res[,-1],row.names=res[,1])
	if(!missing(repIdList)){
		res<-res[,myReps]
	}
	res
}

setMethod("repFpkmMatrix",signature(object="CuffFeatureSet"),.repFpkmMatrix)

.countMatrix<-function(object,fullnames=FALSE,sampleIdList){
	#Sample subsetting
	if(!missing(sampleIdList)){
		if (!all(sampleIdList %in% samples(object))){
			stop("Sample does not exist!")
		}else{
			mySamples<-sampleIdList
		}
	}else{
		mySamples<-samples(object)
	}
	if(fullnames){
		res<-count(object,features=TRUE)
		res$tracking_id<-paste(res$gene_short_name,res[,1],sep="|")
	}else{
		res<-count(object)
		colnames(res)[1]<-"tracking_id"	
	}
	selectedRows<-c('tracking_id','sample_name','count')
	res<-res[,selectedRows]
	res<-melt(res)
	res<-dcast(res,tracking_id~sample_name)
	res<-data.frame(res[,-1],row.names=res[,1])
	if(!missing(sampleIdList)){
		res<-res[,mySamples]
	}
	res
}

setMethod("countMatrix",signature(object="CuffFeatureSet"),.countMatrix)

.repCountMatrix<-function(object,fullnames=FALSE,repIdList){
	#Sample subsetting
	if(!missing(repIdList)){
		if (!all(repIdList %in% replicates(object))){
			stop("Replicate does not exist!")
		}else{
			myReps<-repIdList
		}
	}else{
		myReps<-replicates(object)
	}
	if(fullnames){
		res<-repFpkm(object,features=TRUE)
		res$tracking_id<-paste(res$gene_short_name,res[,1],sep="|")
	}else{
		res<-repFpkm(object)
		colnames(res)[1]<-"tracking_id"	
	}
	selectedRows<-c('tracking_id','rep_name','external_scaled_frags')
	res<-res[,selectedRows]
	res<-melt(res)
	res<-dcast(res,tracking_id~rep_name)
	res<-data.frame(res[,-1],row.names=res[,1])
	if(!missing(repIdList)){
		res<-res[,myReps]
	}
	res
}

setMethod("repCountMatrix",signature(object="CuffFeatureSet"),.repCountMatrix)

.diffData<-function(object){
	return(object@diff)
}

setMethod("diffData",signature(object="CuffFeatureSet"),.diffData)

.diffTable<-function(object,logCutoffValue=99999){
	measureVars<-c('status','value_1','value_2','log2_fold_change','test_stat','p_value','q_value','significant')
	all.diff<-diffData(object,features=TRUE)
	all.diff$log2_fold_change[all.diff$log2_fold_change>=logCutoffValue]<-Inf
	all.diff$log2_fold_change[all.diff$log2_fold_change<=-logCutoffValue]<--Inf
	all.diff.melt<-melt(all.diff,measure.vars=measureVars)
	#all.diff.melt<-all.diff.melt[!grepl("^value_",all.diff.melt$variable),]
	all.diff.cast<-dcast(all.diff.melt,formula=...~sample_2+sample_1+variable)
	all.diff.cast
}

setMethod("diffTable",signature(object="CuffFeatureSet"),.diffTable)


.count<-function(object,features=FALSE){
	if (features){
		return (merge(object@annotation,object@count,by=1))
	}else{
		return(object@count)
	}
}

setMethod("count",signature(object="CuffFeatureSet"),.count)

setMethod("annotation","CuffFeatureSet",function(object){
			return(object@annotation)
		})

#################
#Plotting		#
#################
#Basic heatmap
.heatmap<-function(object,logMode=TRUE,pseudocount=1.0){
	dat<-fpkm(object)
	if(logMode){
		dat$fpkm<- log10(dat$fpkm+pseudocount)
	}
	colnames(dat)[1] <- "tracking_id"
	p<-ggplot(dat)
	p <- p + geom_tile(aes(x=tracking_id,y=sample_name,fill=fpkm)) + scale_fill_gradient(low="white",high="red") + theme(axis.text.x=element_text(angle=-90, hjust=0))
	p
}

#######################
#The following is borrowed from Malarkey and is not yet ready for prime time...
#I would like to replace the clustering here with JSdistance on rows and/or columns and make
#this package work with cuffSet objects by default. 
#There is no genericMethod yet, goal is to replace .heatmap with .ggheat for genericMethod 'csHeatmap'

.ggheat<-function(object, rescaling='none', clustering='none', labCol=T, labRow=T, logMode=T, pseudocount=1.0, 
		border=FALSE, heatscale=c(low='lightyellow',mid='orange',high='darkred'), heatMidpoint=NULL,fullnames=T,replicates=FALSE,method='none',...) {
	## the function can be be viewed as a two step process
	## 1. using the rehape package and other funcs the data is clustered, scaled, and reshaped
	## using simple options or by a user supplied function
	## 2. with the now resahped data the plot, the chosen labels and plot style are built

	if(replicates){
		m=repFpkmMatrix(object,fullnames=fullnames)
	}else{
		m=fpkmMatrix(object,fullnames=fullnames)
	}
	#remove genes with no expression in any condition
	m=m[!apply(m,1,sum)==0,]
	
	## you can either scale by row or column not both! 
	## if you wish to scale by both or use a different scale method then simply supply a scale
	## function instead NB scale is a base funct
	
    if(logMode) 
    {
      m = log10(m+pseudocount)
    }
	
	
	if(is.function(rescaling))
	{ 
		m=rescaling(m)
	} else {
		if(rescaling=='column'){
			m=scale(m, center=T)
			m[is.nan(m)] = 0
		}
		if(rescaling=='row'){ 
			m=t(scale(t(m),center=T))
			m[is.nan(m)] = 0
		}
	}
	
	
	## I have supplied the default cluster and euclidean distance (JSdist) - and chose to cluster after scaling
	## if you want a different distance/cluster method-- or to cluster and then scale
	## then you can supply a custom function 
	
	if(!is.function(method)){
		method = function(mat){JSdist(makeprobs(t(mat)))}	
	}

	if(clustering=='row')
		m=m[hclust(method(m))$order, ]
	if(clustering=='column')  
		m=m[,hclust(method(t(m)))$order]
	if(clustering=='both')
		m=m[hclust(method(m))$order ,hclust(method(t(m)))$order]

	## this is just reshaping into a ggplot format matrix and making a ggplot layer
	

	rows=dim(m)[1]
	cols=dim(m)[2]
	
	
	
    # if(logMode) {
    #   melt.m=cbind(rowInd=rep(1:rows, times=cols), colInd=rep(1:cols, each=rows), melt( log10(m+pseudocount)))
    # }else{
    #   melt.m=cbind(rowInd=rep(1:rows, times=cols), colInd=rep(1:cols, each=rows), melt(m))
    # }
    


    melt.m=cbind(rowInd=rep(1:rows, times=cols), colInd=rep(1:cols, each=rows), melt(m))

	g=ggplot(data=melt.m)
	
	## add the heat tiles with or without a white border for clarity
	
	if(border==TRUE)
		g2=g+geom_rect(aes(xmin=colInd-1,xmax=colInd,ymin=rowInd-1,ymax=rowInd, fill=value),colour='grey')
	if(border==FALSE)
		g2=g+geom_rect(aes(xmin=colInd-1,xmax=colInd,ymin=rowInd-1,ymax=rowInd, fill=value))
	
	## add axis labels either supplied or from the colnames rownames of the matrix
	
	if(labCol==T) 
	{
		g2=g2+scale_x_continuous(breaks=(1:cols)-0.5, labels=colnames(m))
	}
	if(labCol==F) 
	{
		g2=g2+scale_x_continuous(breaks=(1:cols)-0.5, labels=rep('',cols))
	}
	
	
	if(labRow==T) 
	{
		g2=g2+scale_y_continuous(breaks=(1:rows)-0.5, labels=rownames(m))	
	}
	if(labRow==F)
	{ 
		g2=g2+scale_y_continuous(breaks=(1:rows)-0.5, labels=rep('',rows))	
	}
	
	# Get rid of the ticks, they get way too dense with lots of rows
    g2 <- g2 + theme(axis.ticks = element_blank()) 

	## get rid of grey panel background and gridlines
	
	g2=g2+theme(panel.grid.minor=element_line(colour=NA), panel.grid.major=element_line(colour=NA),
			panel.background=element_rect(fill=NA, colour=NA))
	
	##adjust x-axis labels
	g2=g2+theme(axis.text.x=element_text(angle=-90, hjust=0))

    #write(paste(c("Length of heatscale is :", length(heatscale))), stderr())
	
	if (logMode)
	{
	   legendTitle <- bquote(paste(log[10]," FPKM + ",.(pseudocount),sep=""))
	   #legendTitle <- paste(expression(plain(log)[10])," FPKM + ",pseudocount,sep="")
	} else {
	   legendTitle <- "FPKM"
	}
	
	if (length(heatscale) == 2){
	    g2 <- g2 + scale_fill_gradient(low=heatscale[1], high=heatscale[2], name=legendTitle)
	} else if (length(heatscale) == 3) {
	    if (is.null(heatMidpoint))
	    {
	        heatMidpoint = (max(m) + min(m)) / 2.0
	        #write(heatMidpoint, stderr())
	    }

	    g2 <- g2 + scale_fill_gradient2(low=heatscale[1], mid=heatscale[2], high=heatscale[3], midpoint=heatMidpoint, name=legendTitle)
	}
	
	#g2<-g2+scale_x_discrete("",breaks=tracking_ids,labels=gene_short_names)
	
	
	## finally add the fill colour ramp of your choice (default is blue to red)-- and return
	return (g2)
	
}

setMethod("csHeatmap",signature("CuffFeatureSet"),.ggheat)


.fcheatmap<-function(object, control_condition, replicate_num=NULL, clustering='none', labCol=T, labRow=T, logMode=F, pseudocount=1.0, 
		border=FALSE, heatscale=c(low='steelblue',mid='white',high='tomato'), heatMidpoint=0,fullnames=T,replicates=FALSE,method='none',heatRange=3, ...) {
	## the function can be be viewed as a two step process
	## 1. using the rehape package and other funcs the data is clustered, scaled, and reshaped
	## using simple options or by a user supplied function
	## 2. with the now resahped data the plot, the chosen labels and plot style are built

	if(replicates){
	    if (is.null(replicate_num)){
	        print ("Error: if replicates == TRUE, you must specify both a control condition and a replicate number")
	        return()
	    }
		m=repFpkmMatrix(object,fullnames=fullnames)
		selected_rep <- paste(control_condition,replicate_num,sep="_")
		
		#remove genes with no expression in any condition
    	m=m[!apply(m,1,sum)==0,]
    	m=log2(m/m[,selected_rep])
		m=m[,names(m) != selected_rep]
	}else{
		m=fpkmMatrix(object,fullnames=fullnames)
		#remove genes with no expression in any condition
    	m=m[!apply(m,1,sum)==0,]
		m=log2(m/m[,control_condition])
		m=m[,names(m) != control_condition]
	}
	
	m_vals <- unlist(as.list(m))[is.finite(unlist(as.list(m)))]
	m_max <- max(m_vals)
	m_min <- max(m_vals)
	range_lim <- max(c(abs(m_max), abs(m_min)))
	range_lim <- min(c(heatRange, range_lim))
	m_max <- range_lim
	m_min <- -range_lim
	m[m <  m_min] <- m_min
	m[m >  m_max] <- m_max
	m[is.na(m)] <- 0
	
	## I have supplied the default cluster and euclidean distance (JSdist) - and chose to cluster after scaling
	## if you want a different distance/cluster method-- or to cluster and then scale
	## then you can supply a custom function 
	
	if(!is.function(method)){
		method = dist
	}

	if(clustering=='row')
		m=m[hclust(method(m))$order, ]
	if(clustering=='column')  
		m=m[,hclust(method(t(m)))$order]
	if(clustering=='both')
		m=m[hclust(method(m))$order ,hclust(method(t(m)))$order]
	
	rows=dim(m)[1]
	cols=dim(m)[2]

    melt.m=cbind(rowInd=rep(1:rows, times=cols), colInd=rep(1:cols, each=rows), melt(m))

	g=ggplot(data=melt.m)
	
	## add the heat tiles with or without a white border for clarity
	
	if(border==TRUE)
		g2=g+geom_rect(aes(xmin=colInd-1,xmax=colInd,ymin=rowInd-1,ymax=rowInd, fill=value),colour='grey')
	if(border==FALSE)
		g2=g+geom_rect(aes(xmin=colInd-1,xmax=colInd,ymin=rowInd-1,ymax=rowInd, fill=value))
	
	## add axis labels either supplied or from the colnames rownames of the matrix
	
	if(labCol==T) 
	{
		g2=g2+scale_x_continuous(breaks=(1:cols)-0.5, labels=colnames(m))
	}
	if(labCol==F) 
	{
		g2=g2+scale_x_continuous(breaks=(1:cols)-0.5, labels=rep('',cols))
	}
	
	
	if(labRow==T) 
	{
		g2=g2+scale_y_continuous(breaks=(1:rows)-0.5, labels=rownames(m))	
	}
	if(labRow==F)
	{ 
		g2=g2+scale_y_continuous(breaks=(1:rows)-0.5, labels=rep('',rows))	
	}
	
	# Get rid of the ticks, they get way too dense with lots of rows
    g2 <- g2 + theme(axis.ticks = element_blank()) 

	## get rid of grey panel background and gridlines
	
	g2=g2+theme(panel.grid.minor=element_line(colour=NA), panel.grid.major=element_line(colour=NA),
			panel.background=element_rect(fill=NA, colour=NA))
	
	##adjust x-axis labels
	g2=g2+theme(axis.text.x=element_text(angle=-90, hjust=0))

    #write(paste(c("Length of heatscale is :", length(heatscale))), stderr())
    if(replicates){
        legendTitle <- bquote(paste(log[2], frac("FPKM",.(selected_rep),sep="")))
    }else{
    	legendTitle <- bquote(paste(log[2], frac("FPKM",.(control_condition),sep="")))
	}
	#legendTitle <- paste(expression(plain(log)[10])," FPKM + ",pseudocount,sep="")

	if (length(heatscale) == 2){
	    g2 <- g2 + scale_fill_gradient(low=heatscale[1], high=heatscale[2], name=legendTitle)
	} else if (length(heatscale) == 3) {
	    if (is.null(heatMidpoint))
	    {
	        heatMidpoint = (max(m) + min(m)) / 2.0
	        #write(heatMidpoint, stderr())
	    }

	    g2 <- g2 + scale_fill_gradient2(low=heatscale[1], mid=heatscale[2], high=heatscale[3], midpoint=heatMidpoint, name=legendTitle)
	}
	
	#g2<-g2+scale_x_discrete("",breaks=tracking_ids,labels=gene_short_names)
	
	
	## finally add the fill colour ramp of your choice (default is blue to red)-- and return
	return (g2)
	
}

setMethod("csFoldChangeHeatmap",signature("CuffFeatureSet"),.fcheatmap)

# Distance Heatmaps
.distheat<-function(object, replicates=F, samples.not.genes=T, logMode=T, pseudocount=1.0, heatscale=c(low='lightyellow',mid='orange',high='darkred'), heatMidpoint=NULL, ...) {
  # get expression from a sample or gene perspective
	if(replicates){
		obj.fpkm<-repFpkmMatrix(object,fullnames=T)
	}else{
		obj.fpkm<-fpkmMatrix(object,fullnames=T)
	}

	if(samples.not.genes) {
    obj.fpkm.pos = obj.fpkm[rowSums(obj.fpkm)>0,]
  } else {
    obj.fpkm = t(obj.fpkm)
    obj.fpkm.pos = obj.fpkm[,colSums(obj.fpkm)>0]
  }
  
  if(logMode) {
    obj.fpkm.pos = log10(obj.fpkm.pos+pseudocount)
  }

  # compute distances
  obj.dists = JSdist(makeprobs(obj.fpkm.pos))
  
  # cluster to order
  obj.hc = hclust(obj.dists)

  # make data frame
  dist.df = melt(as.matrix(obj.dists),varnames=c("X1","X2"))

  # initialize
  g = ggplot(dist.df, aes(x=X1, y=X2, fill=value))

  # draw
  labels = labels(obj.dists)
  g = g + geom_tile() + scale_x_discrete("", limits=labels[obj.hc$order]) + scale_y_discrete("", limits=labels[obj.hc$order])

  # roll labels
  g = g + theme(axis.text.x=element_text(angle=-90, hjust=0), axis.text.y=element_text(angle=0, hjust=1))

  # drop grey panel background and gridlines
  g = g + theme(panel.grid.minor=element_line(colour=NA), panel.grid.major=element_line(colour=NA), panel.background=element_rect(fill=NA, colour=NA))

  # adjust heat scale
  if (length(heatscale) == 2) {
    g = g + scale_fill_gradient(low=heatscale[1], high=heatscale[3], name="JS Distance")
  }
  else if (length(heatscale) == 3) {
    if (is.null(heatMidpoint)) {
      heatMidpoint = max(obj.dists) / 2.0
    }
    g = g + scale_fill_gradient2(low=heatscale[1], mid=heatscale[2], high=heatscale[3], midpoint=heatMidpoint, name="JS Distance")
  }
  
  g <- g + geom_text(aes(label=format(value,digits=3)),size=6)
  
  # return
  g
}

setMethod("csDistHeat", signature("CuffFeatureSet"), .distheat)

#Scatterplot
.scatter<-function(object,x,y,logMode=TRUE,pseudocount=0.0,labels, smooth=FALSE,colorByStatus=FALSE,...){
	dat<-fpkmMatrix(object,fullnames=T)
	
	samp<-samples(object)
	
	#check to make sure x and y are in samples
	if (!all(c(x,y) %in% samp)){
		stop("One or more values of 'x' or 'y' are not valid sample names!")
	}
	
	#add pseudocount if necessary
	if(logMode){
		for (i in samp){
			dat[[i]]<-dat[[i]]+pseudocount
		}
	}
	
	#Attach tracking_id and gene_short_name
	if(!missing(labels)){
		require(stringr)
		tracking<-str_split_fixed(rownames(dat),"\\|",2)
		dat$gene_short_name<-tracking[,1]
		dat$tracking_id<-tracking[,2]

		labeled.dat<-dat[dat$gene_short_name %in% labels,]
	}
	
	#make plot object
	p<-ggplot(dat)
	p<- p + aes_string(x=x,y=y)
	p<- p + geom_point(size=1.2,alpha=I(1/3)) + geom_abline(intercept=0,slope=1,linetype=2) + geom_rug(size=0.5,alpha=0.01)
	
	#add smoother
	if(smooth){
		p <- p + stat_smooth(method="lm",fill="blue",alpha=0.2)
	}
	
	#Add highlights from labels
	if(!missing(labels)){
		p <- p + geom_point(data=labeled.dat,aes_string(x=x,y=y),size=1.3,color="red")
		p <- p + geom_text(data=labeled.dat,aes_string(x=x,y=y,label='gene_short_name'),color="red",hjust=0,vjust=0,angle=0,size=4)
	}
	
	#logMode
	if(logMode){
		p <- p + scale_y_log10() + scale_x_log10()
	}
	
	if (logMode)
    {
        p <- p + ylab(paste(y, "FPKM +",pseudocount))
        p <- p + xlab(paste(x, "FPKM +",pseudocount))
    } else {
        p <- p + ylab(paste(y, "FPKM"))
        p <- p + xlab(paste(x, "FPKM"))
    }
	
	#Add title & Return value
	#p<- p + labs(title=object@tables$mainTable)
	p
}

#.scatter(sigGenes,'P7_lincBrn1b_KO_Brain','P7_WT_Brain',labels=c('Arc'))

setMethod("csScatter",signature(object="CuffFeatureSet"), .scatter)

#Volcano plot
.volcano<-function(object,x,y,alpha=0.05,showSignificant=TRUE,xlimits=c(-20,20),...){
	samp<-samples(object)
	
	#check to make sure x and y are in samples
	if (!all(c(x,y) %in% samp)){
		stop("One or more values of 'x' or 'y' are not valid sample names!")
	}
	dat<-diffData(object=object)
	dat$significant<-'no'
	dat$significant[dat$q_value<=alpha]<-'yes'
	
	#subset dat for samples of interest
	mySamples<-c(x,y)
	dat<-dat[(dat$sample_1 %in% mySamples & dat$sample_2 %in% mySamples),]
	
	#Labels
	s1<-unique(dat$sample_1)
	s2<-unique(dat$sample_2)
	
	p<-ggplot(dat)
	if(showSignificant){
		p<- p + geom_point(aes(x=log2_fold_change,y=-log10(p_value),color=significant),alpha=I(1/3),size=1.2)
	}else{
		p<- p + geom_point(aes(x=log2_fold_change,y=-log10(p_value)),alpha=I(1/3),size=1.2)
	}
	
	p<- p + labs(title=paste(s2,"/",s1,sep=""))
	
	#Set axis limits
	p<- p + scale_x_continuous(limits=xlimits)
	
	#Default cummeRbund colorscheme
	p<-p + scale_colour_manual(values = c("black","red"))
	
	p
}

setMethod("csVolcano",signature(object="CuffFeatureSet"), .volcano)

.barplot<-function(object,logMode=TRUE,pseudocount=1.0,showErrorbars=TRUE,showStatus=TRUE,replicates=FALSE,...){
	quant_types<-c("OK","FAIL","LOWDATA","HIDATA","TOOSHORT")
	quant_types<-factor(quant_types,levels=quant_types)
	quant_colors<-c("black","red","blue","orange","green")
	names(quant_colors)<-quant_types
	
	dat<-fpkm(object,features=T)
	dat$warning<-""
	dat$warning[dat$quant_status!="OK"]<-"!"
	#TODO: Test dat to ensure that there are >0 rows to plot.  If not, trap error and move on...
	
	#Handle replicates
	if(replicates){
		repDat<-repFpkm(object)
		colnames(repDat)[1]<-"tracking_id"
	}
	
	colnames(dat)[1]<-"tracking_id"
	#tracking_ids<-dat$tracking_id
	obj_features <- annotation(object)
	tracking_ids <- obj_features[,1]
	
	gene_labels<-obj_features$gene_short_name
	#print(gene_labels)
	gene_labels[is.na(gene_labels)] = tracking_ids[is.na(gene_labels)]
	#print(gene_labels)
	
	dodge <- position_dodge(width=0.9) 
	
	if(logMode)
	{
	    dat$fpkm <- dat$fpkm + pseudocount
	    dat$conf_hi <- dat$conf_hi + pseudocount
	    dat$conf_lo <- dat$conf_lo + pseudocount
    }
	
	p<-ggplot(dat,aes(x=tracking_id,y=fpkm,fill=sample_name))
	p <- p + geom_bar(position=dodge,stat='identity')

	if (showErrorbars)
	{
	    p <- p +
		    geom_errorbar(aes(ymin=conf_lo,ymax=conf_hi),position=dodge,width=0.5)
	}
	
	if (logMode)
	{
	    p <- p + scale_y_log10()
		status_pos = 1
    }
	if(showStatus){
		if(logMode){
			p <- p + geom_text(aes(x=tracking_id,y=1,label=warning),position=dodge,stat='identity',color='red',vjust=1.5,size=3)
		}else{
			p <- p + geom_text(aes(x=tracking_id,y=0,label=warning),position=dodge,color='red',stat='identity',vjust=1.5,size=3)
		}
	}
	#gene_labels = dat$gene_short_name
	p <- p + scale_x_discrete("",breaks=tracking_ids,labels=gene_labels) + 
	    theme(axis.text.x=element_text(hjust=0,angle=-90))
    	
    # p<- p +
    #       geom_bar() +
    #       geom_errorbar(aes(ymin=conf_lo,ymax=conf_hi,group=1),size=0.15) +
    #       facet_wrap('sample_name') +
    #       theme(axis.text.x=element_text(hjust=0,angle=-90))
	
	#This does not make immediate sense with the conf_hi and conf_lo values.  Need to figure out appropriate transformation for these
	#if(logMode)
	#p<-p+scale_y_ log10()
	
    if (logMode)
    {
        p <- p + ylab(paste("FPKM +",pseudocount))
    } else {
        p <- p + ylab("FPKM")
    }
	
	#p <- p + theme(legend.position = "none")
	
	#Default cummeRbund colorscheme
	p<-p + scale_fill_hue(l=50,h.start=200) + scale_color_hue(l=50,h.start=200)
	
	#Recolor quant flags
	p<- p+ scale_colour_manual(name='quant_status',values=quant_colors)
	
	p
	
}

setMethod("expressionBarplot",signature(object="CuffFeatureSet"),.barplot)

.expressionPlot<-function(object,logMode=FALSE,pseudocount=1.0, drawSummary=FALSE, sumFun=mean_cl_boot, showErrorbars=TRUE,showStatus=TRUE,replicates=FALSE,...){
	quant_types<-c("OK","FAIL","LOWDATA","HIDATA","TOOSHORT")
	quant_types<-factor(quant_types,levels=quant_types)
	quant_colors<-c("black","red","blue","orange","green")
	names(quant_colors)<-quant_types
	
	dat<-fpkm(object)
	colnames(dat)[1]<-"tracking_id"
	
	if(replicates){
		repDat<-repFpkm(object)
		repDat$replicate<-as.factor(repDat$replicate)
		colnames(repDat)[1]<-"tracking_id"
	}
	
	if(logMode)
	{
		dat$fpkm <- dat$fpkm + pseudocount
		dat$conf_hi <- dat$conf_hi + pseudocount
		dat$conf_lo <- dat$conf_lo + pseudocount
		
		if(replicates){
			repDat$fpkm<-repDat$fpkm + pseudocount
		}
	}
	p <- ggplot(dat)
	#dat$fpkm<- log10(dat$fpkm+pseudocount)
	p <- p + geom_line(aes(x=sample_name,y=fpkm,group=tracking_id,color=tracking_id))
	
	if(replicates){
		p <- p + geom_point(aes(x=sample_name,y=fpkm,color=tracking_id),size=2.5,shape=18,data=repDat)
	}
	
	if (showErrorbars)
	{
		p <- p +
				geom_errorbar(aes(x=sample_name, ymin=conf_lo,ymax=conf_hi,color=tracking_id,group=tracking_id),width=0.25)
	}
	
	if(showStatus){
		p <- p+ geom_point(aes(x=sample_name,y=fpkm,shape=quant_status))
	}
	
	if (logMode)
	{
		p <- p + scale_y_log10()
	}
	
	#drawMean
	if(drawSummary){
		p <- p + stat_summary(aes(x=sample_name,y=fpkm,group=1),fun.data=sumFun,color="red",fill="red",alpha=0.2,size=1.1,geom="smooth")
	}
	
	if (logMode)
	{
		p <- p + ylab(paste("FPKM +",pseudocount))
	} else {
		p <- p + ylab("FPKM")
	}
	
	#Recolor quant flags
	#p<- p + scale_colour_manual(name='quant_status',values=quant_colors)
	
	p
}

setMethod("expressionPlot",signature(object="CuffFeatureSet"),.expressionPlot)

#TODO: Add csDensity plot for CuffFeatureSet objects

#TODO: Add ecdf plot for CuffFeatureSet and CuffData objects

#################
#Clustering		#
#################
#Kmeans by expression profile using JSdist
#TODO:Make this function return an object of type CuffClusterSet
#.cluster<-function(object, k, pseudocount=1, ...){
#	library(cluster)
#	m<-as.data.frame(fpkmMatrix(object))
#	m<-m[rowSums(m)>0,]
#	n<-JSdist(makeprobs(t(m)))
#	clusters<-pam(n,k)
#	clusters$fpkm<-m
#	m<-m+pseudocount
#	m$ids<-rownames(m)
#	m$cluster<-factor(clusters$clustering)
#	m.melt<-melt(m,id.vars=c("ids","cluster"))
#	c<-ggplot(m.melt)
#	c<-c+geom_line(aes(x=variable,y=value,color=cluster,group=ids)) + facet_wrap('cluster',scales='free')+scale_y_log10()
#	
#	#Default cummeRbund colorscheme
#	c<-c + scale_color_hue(l=50,h.start=200)
#	c
#}

.cluster<-function(object, k, logMode=T, method='none', pseudocount=1,...){
	require(cluster)
	m<-as.data.frame(fpkmMatrix(object))
	m<-m[rowSums(m)>0,]
	if(logMode){
		m<-log10(m+pseudocount)
	}
	
	if(!is.function(method)){
		method = function(mat){JSdist(makeprobs(t(m)))}	
	}		
	n<-method(m)
	clusters<-pam(n,k, ...)
	#clsuters<-pamk(n,krange=2:20)
	class(clusters)<-"list"
	clusters$fpkm<-m
	clusters
}

setMethod("csCluster",signature(object="CuffFeatureSet"),.cluster)

.ratioCluster<-function(object,k,ratioTo=NULL,pseudocount=0.0001,...){
	require(cluster)
	m<-as.data.frame(fpkmMatrix(object)+pseudocount)
	#TODO: ensure that ratioTo is in colnames(fpkmMatrix(object))
	m.ratio<-m/m[[ratioTo]]
	m.log.ratio<-log2(m.ratio)
	n<-dist(m.log.ratio)
	clusters<-pam(n,k,...)
	class(clusters)<-"list"
	clusters$fpkm<-m
	clusters
}

csClusterPlot<-function(clustering,pseudocount=1.0,logMode=FALSE,drawSummary=TRUE, sumFun=mean_cl_boot){
	m<-clustering$fpkm+pseudocount
	m$ids<-rownames(clustering$fpkm)
	m$cluster<-factor(clustering$clustering)
	m.melt<-melt(m,id.vars=c("ids","cluster"))
	c<-ggplot(m.melt)
	c<-c+geom_line(aes(x=variable,y=value,color=cluster,group=ids)) + theme_bw() + facet_wrap('cluster',scales='free_y')
	if(drawSummary){
		c <- c + stat_summary(aes(x=variable,y=value,group=1),fun.data=sumFun,color="black",fill="black",alpha=0.2,size=1.1,geom="smooth")
	}
	if(logMode){
		c<-c + scale_y_log10()
	}
	c<-c + scale_color_hue(l=50,h.start=200) + theme(axis.text.x=element_text(angle=-90,hjust=0))
	c
}

.findK<-function(object, k.range=c(2:20), logMode=T, pseudocount=1,...){
	require(cluster)
	m<-as.data.frame(fpkmMatrix(object))
	m<-m[rowSums(m)>0,]
	if(logMode){
		m<-log10(m+pseudocount)
	}
	n<-JSdist(makeprobs(t(m)))
	myWidths<-c()
	for (k in k.range){
		print(k)
		myWidths<-c(myWidths,pam(n,k,...)$silinfo$avg.width)
	}
	plot(k.range,myWidths)
}
	
	
	

######################
# 
######################

.dendro<-function(object,logMode=T,pseudocount=1,replicates=FALSE){
	if(replicates){
		fpkmMat<-repFpkmMatrix(object)
	}else{
		fpkmMat<-fpkmMatrix(object)
	}
	if(logMode){
		fpkmMat<-log10(fpkmMat+pseudocount)
	}
	res<-JSdist(makeprobs(fpkmMat))
	#colnames(res)<-colnames(fpkmMat)
	
	#res<-as.dist(res)
	res<-as.dendrogram(hclust(res))
	plot(res)
	res
}

setMethod("csDendro",signature(object="CuffFeatureSet"),.dendro)

#.csNMF<-function(object,rank,method,...){
#	#Non-negative Matrix Factorization
#	require(NMF)
#	#Retrieve FPKM values
#	fpkms<-fpkmMatrix(object)
#	
#	#Remove rows with sum()==0
#	fpkms<-fpkms[rowSums(fpkms)>0,]
#	
#}
#
#setMethod("csNMF",signature(object="CuffFeatureSet"),.csNMF)

##Takes as first argument the object returned from csCluster (a modified 'cluster' list)
#.clusterPlot<-function(clusters, pseudocount=1, ...){
#	m<-clusters$fpkm
#	m<-m+pseudocount
#	m$ids<-rownames(m)
#	m$cluster<-factor(clusters$clustering)
#	m.melt<-melt(m,id.vars=c("ids","cluster"))
#	c<-ggplot(m.melt)
#	c<-c+geom_line(aes(x=variable,y=value,color=cluster,group=ids)) + facet_wrap('cluster',scales='free')+scale_y_log10()
#	
#	#Default cummeRbund colorscheme
#	c<-c + scale_color_hue(l=50,h.start=200)
#	c
#}

######################
# Exploratory Analysis
######################
.nmf<-function(object,k,logMode=T,pseudocount=1,maxiter=1000,replicates=FALSE,fullnames=FALSE){
	require(NMFN)
	if(missing(k)) stop("Please provide a rank value for factorization (arg=k)")
	
	if(replicates){
		m=repFpkmMatrix(object,fullnames=fullnames)
	}else{
		m=fpkmMatrix(object,fullnames=fullnames)
	}
	
	if(logMode) 
	{
		m = log10(m+pseudocount)
	}
	
	myNMF<-nnmf(m,k=k,maxiter=maxiter)
	return (myNMF)
}

setMethod("csNMF",signature(object="CuffFeatureSet"),.nmf)

.density<-function(object, logMode = TRUE, pseudocount=0.0, labels, features=FALSE, replicates=FALSE,...){
	if(replicates){
		dat<-repFpkm(object,features=features)
		colnames(dat)[colnames(dat)=="rep_name"]<-"condition"
	}else{
		dat<-fpkm(object,features=features)
		colnames(dat)[colnames(dat)=="sample_name"]<-"condition"
	}
	if(logMode) dat$fpkm<-dat$fpkm+pseudocount
	p<-ggplot(dat)
	if(logMode) {
		p<-p+geom_density(aes(x= log10(fpkm),group=condition,color=condition,fill=condition),alpha=I(1/3))
	}else{
		p<-p+geom_density(aes(x=fpkm,group=condition,color=condition,fill=condition),alpha=I(1/3))
	}
	
	#p<-p + labs(title=object@tables$mainTable)
	
	#Default cummeRbund colorscheme
	p<-p + scale_fill_hue(l=50,h.start=200) + scale_color_hue(l=50,h.start=200)
	
	#TODO: Add label callout
	p
}

setMethod("csDensity",signature(object="CuffFeatureSet"),.density)

#################
# Dimensionality Reduction (use sparingly for Gene Sets)
#################

.MDSplot<-function(object,replicates=FALSE,logMode=TRUE,pseudocount=1.0){
	if(replicates){
		dat<-repFpkmMatrix(object)
		#repData<-sapply(replicates(object),function(x){strsplit(x,"_")[[1]][1]}) This is to color by condition and not replicate...
	}else{
		dat<-fpkmMatrix(object)
	}
	
	if(logMode){
		dat<-log10(dat+pseudocount)
	}
	
	d<-JSdist(makeprobs(dat))
	fit <- cmdscale(d,eig=TRUE, k=2)
	res<-data.frame(names=rownames(fit$points),M1=fit$points[,1],M2=fit$points[,2])
	p <- ggplot(res)
	p <- p + geom_point(aes(x=M1,y=M2,color=names)) + geom_text(aes(x=M1,y=M2,label=names,color=names)) + theme_bw()
	p
}

setMethod("MDSplot",signature(object="CuffFeatureSet"),.MDSplot)

.PCAplot<-function(object,x="PC1", y="PC2",replicates=FALSE,pseudocount=1.0,scale=TRUE,showPoints=TRUE,...){
	if(replicates){
		fpkms<-repFpkmMatrix(object)
	}else{
		fpkms<-fpkmMatrix(object)
	}
	fpkms<-log10(fpkms+pseudocount)
	PC<-prcomp(fpkms,scale=scale,...)
	dat <- data.frame(obsnames=row.names(PC$x), PC$x)
	#dat$shoutout<-""
	#dat$shoutout[matchpt(PC$rotation,PC$x)$index]<-rownames(pca$x[matchpt(pca$rotation,pca$x)$index,])
	plot <- ggplot(dat, aes_string(x=x, y=y)) 
	if(showPoints){
		plot<- plot + geom_point(alpha=.4, size=0.8, aes(label=obsnames))
	}

	plot <- plot + geom_hline(aes(yintercept=0), size=.2) + geom_vline(aes(xintercept=0), size=.2) #+ geom_text(aes(label=shoutout),size=2,color="red")

	datapc <- data.frame(varnames=rownames(PC$rotation), PC$rotation)
	mult <- min(
			(max(dat[,y]) - min(dat[,y])/(max(datapc[,y])-min(datapc[,y]))),
			(max(dat[,x]) - min(dat[,x])/(max(datapc[,x])-min(datapc[,x])))
	)
	datapc <- transform(datapc,
			v1 = .7 * mult * (get(x)),
			v2 = .7 * mult * (get(y))
	)
	plot <- plot + 
			#coord_equal() + 
			geom_text(data=datapc, aes(x=v1, y=v2, label=varnames,color=varnames), vjust=1)
	plot <- plot + geom_segment(data=datapc, aes(x=0, y=0, xend=v1, yend=v2,color=varnames), arrow=arrow(length=unit(0.2,"cm")), alpha=0.75) + theme_bw()
	plot
}

setMethod('PCAplot',signature(object="CuffFeatureSet"),.PCAplot)

#################
#Misc			#
#################
.makeIdentityMatrix<-function(sampleNames){
	d<-diag(length(sampleNames))
}

.specificity<-function(object,logMode=T,pseudocount=1,relative=FALSE,...){
	fpkms<-fpkmMatrix(object,...)
	if(logMode){
		fpkms<-log10(fpkms+pseudocount)
	}
	fpkms<-t(makeprobs(t(fpkms)))
	d<-diag(ncol(fpkms))
	res<-apply(d,MARGIN=1,function(q){
				JSdistFromP(fpkms,q)
			})
	colnames(res)<-paste(colnames(fpkms),"_spec",sep="")
	
	if(relative){
		res<-res/max(res)
	}
	1-res
}

setMethod("csSpecificity",signature(object="CuffFeatureSet"),.specificity)
shiauck/cummeRbund_on_RSQLite2.0 documentation built on May 5, 2019, 12:33 p.m.