R/geneSimilarity.R
In GOSim: Computation of functional similarities between GO terms and gene products; GO enrichment analysis

Documented in filterGO getGeneFeatures getGeneFeaturesPrototypes getGeneSim getGeneSimPrototypes getGOGraphsGenes getGOInfo getGSim precomputeTermSims selectPrototypes

# get GO information for a list of genes
getGOInfo<-function(geneIDs){	
	geneIDs = as.character(geneIDs)
	#require(GO.db)
	#if(!require(annotate))
	#	stop("Package annotate is required for function getGOInfo")
	if(!exists("GOSimEnv")) initialize()		
	ontology<-get("ontology",envir=GOSimEnv)
	gomap<-get("gomap",envir=GOSimEnv)	
	gomap = gomap[geneIDs]		
	goterms = lapply(gomap, function(x) sapply(x, function(y) y$Ontology == ontology))
	goterms = goterms[!is.na(names(goterms))]
	if(length(goterms) == 0)
		stop("No GO information available for these genes!")
	goterms <-sapply(goterms, function(x) unique(names(x[which(x)])))
	goterms <- goterms[sapply(goterms,length) > 0]
	goids<- toTable(GOTERM)				
	IC<-get("IC", envir=GOSimEnv)	
	info = sapply(goterms, function(g) cbind(unique(goids[goids[,"go_id"] %in% g,c("go_id", "Term", "Definition")]), IC=IC[g]))
	info
}

# filter out genes not mapping to the category in question	
filterGO<-function(genelist){	
	cat("filtering out genes not mapping to the currently set GO category ...")
	if(!exists("GOSimEnv")) initialize()	
	IC<-get("IC", envir=GOSimEnv)
	ids<-names(IC[IC != Inf]) # only consider GO terms with some known annotation   
	gomap<-get("gomap",envir=GOSimEnv)
	k<-1
	allgenes<-list()
	for(i in 1:length(genelist)){		
		annoi<-gomap[[match(as.character(genelist[i]), names(gomap))]]
		annoi<-intersect(names(annoi), as.character(ids))				
		if(length(annoi) > 0){
			allgenes[[k]]<-list(annotation=annoi,genename=as.character(genelist[i]))
			k<-k + 1
		}
	}	
	cat(" ===> list of ", length(genelist), "genes reduced to ", length(allgenes), "\n")
	allgenes
}

#calc.sim.significance = function(Sim, B=1000, similarity="funSimMax", similarityTerm="relevance", normalization=TRUE, method="sqrt", avg=(similarity=="OA"), adj.method="bonf"){	
#	n = NCOL(Sim)
#	pvals = matrix(1, ncol=n, nrow=n)			
#	gomap <- get("gomap",envir=GOSimEnv)
#	allgenes = filterGO(names(gomap))
#	allgenes = sapply(allgenes, function(x) x$genename)		
#	for(b in 1:B){
#		sam = base:::sample(allgenes, n)
#		K = getGeneSim(sam, similarity=similarity, similarityTerm=similarityTerm, normalization=normalization, method=method, avg=avg, verbose=FALSE)
#		pvals = (K >= Sim)*1 + pvals					
#	}		
#	pvals = pvals/B
#	pvals = p.adjust(pvals, method=adj.method)
#	pvals
#}

# get GO graphs for GO terms associated to each gene
getGOGraphsGenes <- function(genelist, prune=Inf){
	allgenes = filterGO(genelist)
	if(length(allgenes) > 0){
		G = list()
		for(i in 1:length(allgenes)){
			G[[i]] = getGOGraph(allgenes[[i]]$annotation, prune)
		}
	}
	else
		stop("No gene has GO information!")
	G
}

# precompute term similarities for all pairs of GO terms belonging to the annotated genelists x (and y)
precomputeTermSims<-function(x, y=NULL, similarityTerm="JiangConrath", verbose=FALSE){ 	
	if(verbose)
		message("precomputing term similarities ...")	
	gotermsx<-as.vector(unique(unlist(sapply(x, function(xx) xx$annotation))))
	if(!is.null(y)){  		 
		gotermsy<-as.vector(unique(unlist(sapply(y, function(xx) xx$annotation))))
		if(similarityTerm %in% c("diffKernelgraphLapl", "diffKernelLLE", "diffKernelpower", "diffKernelexpm")){
			STerm = getTermSim(c(gotermsx, gotermsy), method=similarityTerm)
			STerm = STerm[gotermsx, gotermsy]
			return(STerm)
		}
		STerm<-matrix(0, nrow=length(gotermsx), ncol=length(gotermsy))
		rownames(STerm)=gotermsx
		colnames(STerm)=gotermsy
		for(i in 1:length(gotermsx)){						
			for(j in 1:length(gotermsy)){          
				STerm[i,j]<-calcTermSim(gotermsx[i],gotermsy[j], similarityTerm, verbose)
			}			
		}		
	} else{
		if(similarityTerm %in% c("diffKernelgraphLapl", "diffKernelLLE", "diffKernelpower", "diffKernelexpm")){
			STerm = getTermSim(gotermsx, method=similarityTerm)			
			return(STerm)
		}
		STerm<-matrix(0, nrow=length(gotermsx), ncol=length(gotermsx))
		rownames(STerm)<-gotermsx
		colnames(STerm)<-gotermsx
		for(i in 1:length(gotermsx)){
			STerm[i,i]<-calcTermSim(gotermsx[i], gotermsx[i], similarityTerm, verbose)
			if(i > 1){
				for(j in 1:(i-1)){					
					STerm[i,j]<-calcTermSim(gotermsx[i], gotermsx[j], similarityTerm, verbose)
					STerm[j,i]<-STerm[i,j]
				}
			}
		}
	}	
	STerm
}

getWeightedDotSim <- function(anno1, anno2){
	v1 = getGeneFeatures.internal(anno1)
	v2 = getGeneFeatures.internal(anno2)
	dot = crossprod(v1,v2)	
}

# compute gene similarity for a pair of genes having GO terms anno1 and anno2
getGSim<-function(anno1, anno2, similarity="funSimMax", similarityTerm="JiangConrath", STerm=NULL, avg=FALSE, verbose=FALSE){	  
  if(length(anno1) <= length(anno2)){
	a1<-anno1
	a2<-anno2
	swap<-FALSE
  }
  else{
	a1<-anno2
	a2<-anno1
	swap<-TRUE
  }		
  if(!is.null(STerm)){ # use precomputed similarity values
	if(!swap)
		ker<-STerm[a1,a2]
	else
		ker<-STerm[a2,a1]		
	if(length(a1) == 1)
		ker<-t(as.matrix(ker))		
	if(is.null(ker) || is.null(nrow(ker))){
		warning(paste("No GO information for",a1,a2,". Similarity set to NaN."))		
		return(NaN)
	}	
	if(nrow(ker) > ncol(ker))
		ker<-t(ker)
  }
  else{ 
	if(similarity %in% c("dot"))
		return(getWeightedDotSim(a1, a2))			
	else{
		# calculate term similarity
		ker<-matrix(0,nrow=length(a1),ncol=length(a2))	
		for(i in 1:length(a1)){
			for(j in 1:length(a2))
				ker[i,j]<-calcTermSim(a1[i],a2[j], similarityTerm, verbose)		
		}
	}
  }  
  if(length(a1)*length(a2) > 0){
	if(similarity == "OA"){						
		res<-.Call("OAWrapper", Rpre=ker, mod=as.integer(1), PACKAGE="GOSim")
		if(verbose){
			cat("term similarity matrix:\n")
			print(ker)			
			cat("optimal assignment:\n")
			if(NROW(res$assignment) > 1)
				res$assignment = res$assignment[rowSums(res$assignment) > 0, ,drop=FALSE]
			if(NROW(res$assignment) > 1)
				res$assignment = res$assignment[,colSums(res$assignment) > 0, drop=FALSE]
			if(NROW(res$assignment) > 1 && NROW(res$assignment) > 1){
				res$assignment = res$assignment[1:length(anno1), 1:length(anno2), drop=FALSE]
				dimnames(res$assignment) = list(anno1, anno2)
			}
			print(res$assignment)
			cat("======================\n")
		}
		if(avg)
			res$score = res$score/length(a2)	
		return(res$score)
	}
	else if(similarity == "max"){				
		return(max(ker))
	}
	else if(similarity == "mean"){				
		return(mean(ker))
	}  
	else if(similarity == "funSimAvg"){
		rowMax = mean(apply(ker,1,max))
		colMax = mean(apply(ker,2,max))
		return(0.5*(rowMax + colMax))
	}
	else if(similarity == "funSimMax"){
		rowMax = mean(apply(ker,1,max))
		colMax = mean(apply(ker,2,max))
		return(max(rowMax, colMax))
	}
	else if(similarity == "hausdorff"){
		rowMax = max(apply(ker,1,min))
		colMax = max(apply(ker,2,min))
		d = max(rowMax, colMax)
		return(exp(-d))
	}	
	else
		stop(paste("getGSim: Unknown gene similarity",similarity,"!"))
  }
  else{	
	warning(paste("No GO information for",a1,a2,". Similarity set to NaN."))		
	return(NaN)
  }
}

# compute GO gene similarity for a list of genes
# getGeneSim<-function(genelist, similarity="funSimMax", similarityTerm="Lin", normalization=!(similarity %in% c("funSimAvg","funSimMax")), method="sqrt", avg=(similarity=="OA"), verbose=FALSE){
getGeneSim<-function(genelist1, genelist2=NULL, similarity="funSimMax", similarityTerm="relevance", normalization=TRUE, method="sqrt", avg=(similarity=="OA"), verbose=FALSE){	
	genelist1 <- unique(genelist1)
	genelist2 = unique(genelist2)
	if(length(genelist1) < 2 && is.null(genelist2))
		stop("Gene list should contain more than 2 elements!")
	allgenes<-filterGO(genelist1)	
	if(length(allgenes) > 1 && is.null(genelist2)){			
		if(!(similarity %in% c("dot")))
			STerm<-precomputeTermSims(x=allgenes, similarityTerm=similarityTerm, verbose=verbose) # precompute term similarities => speed up!		
		else
			STerm = NULL
		if(verbose)
			message(paste("Calculating similarity matrix with similarity measure",similarity))
		Ker<-matrix(0,nrow=length(allgenes),ncol=length(allgenes))
		colnames(Ker)<-sapply(allgenes,function(x) x$genename)
		rownames(Ker)<-colnames(Ker)
		for(i in 1:length(allgenes)){
			annoi<-(allgenes[[i]]$annotation)		
			Ker[i,i]<-getGSim(annoi,annoi, similarity, similarityTerm, STerm=STerm, avg=avg, verbose)
			if(i > 1){
				for(j in 1:(i-1)){
					annoj<-(allgenes[[j]]$annotation)
	# 			        message(paste(allgenes[[i]]$genename,allgenes[[j]]$genename))
					Ker[i,j]<-getGSim(annoi,annoj, similarity, similarityTerm, STerm=STerm, avg=avg, verbose)
					Ker[j,i]<-Ker[i,j]
				}
			}
		}			
		if(normalization){			
			Ker = normalize.kernel(Ker, method=method)
			if(any(Ker > 1, na.rm=TRUE)) # this has been updated
				warning("Similarity matrix contains values > 1! This may happen with simlarity='funSimMax', if one gene's GO annotation is a complete subset of another gene's GO annotation.")
			Ker[Ker>1] = 1 # can happen with similarity funSimMax in cases where one GO annotation is subset of another one
		}			
	}
	else if(length(allgenes) > 0 && length(genelist2) > 0){
		allgenes2<-setdiff(filterGO(genelist2), allgenes)
		if(!(similarity %in% c("dot")))
			STerm<-precomputeTermSims(x=allgenes, y=allgenes2, similarityTerm=similarityTerm, verbose=verbose) # precompute term similarities => speed up!		
		else
			STerm = NULL
		if(verbose)
			message(paste("Calculating similarity matrix with similarity measure",similarity))
		Ker<-matrix(0,nrow=length(allgenes),ncol=length(allgenes2))
		colnames(Ker)<-sapply(allgenes2,function(x) x$genename)
		rownames(Ker)<-sapply(allgenes,function(x) x$genename)
		for(i in 1:length(allgenes)){
			annoi<-(allgenes[[i]]$annotation)
			if(normalization)
				kerselfi = getGSim(annoi, annoi, similarity, similarityTerm, STerm=NULL, avg=avg, verbose)
			for(j in 1:length(allgenes2)){
				annoj<-(allgenes2[[j]]$annotation)
				# 			        message(paste(allgenes[[i]]$genename,allgenes[[j]]$genename))
				Ker[i,j]<-getGSim(annoi,annoj, similarity, similarityTerm, STerm=STerm, avg=avg, verbose)	
				if(normalization){
					kerselfj = getGSim(annoj, annoj, similarity, similarityTerm, STerm=NULL, avg=avg, verbose)
					Ker[i,j] = normalize.kernel(Ker[i,j], kerselfi, kerselfj, method=method)
				}
			}
		}			
		if(any(Ker > 1, na.rm=TRUE))
			warning("Similarity matrix contains values > 1! This may happen with simlarity='funSimMax', if one gene's GO annotation is a complete subset of another gene's GO annotation.")
		Ker[Ker>1] = 1 # can happen with similarity funSimMax in cases where one GO annotation is subset of another one
	}
	else{
		if(length(allgenes) == 0)
			stop("No gene has GO information!")					
		else if(length(allgenes) == 1)
			stop(paste("Only gene",allgenes," has GO information!"))					
	}
	Ker
}

getGeneFeatures.internal = function(anno){
	ancestor<-get("ancestor",envir=GOSimEnv)	
	an<-unlist(ancestor[names(ancestor) %in% anno])	
	IC<-get("IC", envir=GOSimEnv)
	v = double(length(IC))	
	names(v) = names(IC)	
	v[c(anno, an)] = IC[c(anno, an)]
	v = v[!is.na(v)]
	v
}

# compute feature representation for genes
getGeneFeatures = function(genelist, pca=FALSE, normalization=FALSE, verbose=FALSE){
	if(!exists("GOSimEnv")) initialize()	
	genelist <- unique(genelist)
	if(length(genelist) < 1)
		stop("Gene list should contain at least 1 element!")
	allgenes<-filterGO(genelist)	
	IC<-get("IC", envir=GOSimEnv)
	if(length(allgenes) > 0){			
		PHI<-matrix(0,nrow=length(allgenes),ncol=length(IC))
		rownames(PHI) <- sapply(allgenes,function(x) x$genename)
		colnames(PHI) <- names(IC)
		for(i in 1:length(allgenes)){
			annoi <- (allgenes[[i]]$annotation)		
			PHI[i,] <- getGeneFeatures.internal(annoi)							
		}	
		if(pca){
			pcares<-selectPrototypes(method="pca",data=PHI,verbose=verbose)		
			PHI<-pcares$features
		}				
		if(normalization)
			PHI<-t(apply(PHI,1,function(x) return(x/(norm(x)+1e-10))))					
	}
	else{		
		stop("No gene has GO information!")							
	}
	PHI
}

# compute prototype feature representation for genes
getGeneFeaturesPrototypes<-function(genelist, prototypes=NULL, similarity="max", similarityTerm="JiangConrath", pca=TRUE, normalization=TRUE, verbose=FALSE){
	genelist<-unique(genelist)
	if(is.null(prototypes))
		prototypes<-selectPrototypes(verbose=verbose)
	allgenes<-filterGO(genelist)
	if(length(allgenes) == 0)
		stop("No gene has GO information!")
	proto<-filterGO(prototypes)
	if(length(proto) == 0)
		stop("getGeneFeaturesPrototypes: Number of prototypes equals zero after filtering!")	
	STerm<-precomputeTermSims(x=allgenes,y=proto,similarityTerm=similarityTerm,verbose=verbose)  # precompute term similarities => speed up!
	if(verbose)
		message(paste("calculating feature vectors with",length(proto),"prototypes"))
	PHI<-matrix(0,nrow=length(allgenes),ncol=length(proto))
	for(i in 1:length(allgenes)){		
		for(j in 1:length(proto)){
			annoi<-(allgenes[[i]]$annotation)		
			annoj<-(proto[[j]]$annotation)			
			PHI[i,j]<-getGSim(annoi,annoj, similarity, similarityTerm, STerm=STerm, verbose)
		}
	}			
	rownames(PHI)<-sapply(allgenes,function(x) x$genename)
	colnames(PHI)<-sapply(proto,function(x) x$genename)	
	if(pca & (length(proto) > 2)){    
		pcares<-selectPrototypes(method="pca",data=PHI,verbose=verbose)		
		PHI<-pcares$features
		proto<-pcares$pcs    
	}	
	if(normalization)
		PHI<-t(apply(PHI,1,function(x) return(x/(norm(x)+1e-10))))		  
	list(features=PHI,prototypes=proto)
}

# compute GO gene similarity using the prototype feature representation
getGeneSimPrototypes<-function(genelist, prototypes=NULL, similarity="max", similarityTerm="JiangConrath", pca=TRUE, normalization=TRUE, verbose=FALSE){
	genelist<-unique(genelist)	
	if(length(genelist) < 2)
		stop("Gene list should contain at least 2 elements!")
	res<-getGeneFeaturesPrototypes(genelist, prototypes, similarity, similarityTerm, pca, normalization, verbose)
	PHI<-res$features		
	Ker<-PHI%*%t(PHI)
	if(normalization)
		Ker<-(Ker + 1)/2 # scale to [0,1]
	list(similarity=Ker,prototypes=res$prototypes,features=res$features)
}

# method to a) select prototype genes b) to perform subselection of prototypes via i) PCA ii) clustering
selectPrototypes<-function(n=250, method="frequency", data=NULL, verbose=FALSE){
	if(!exists("GOSimEnv")) initialize()	
	ontology<-get("ontology",envir=GOSimEnv) 	
	if(method == "frequency"){
		if(verbose)
			message("Automatic determination of prototypes using genes with most frequent annotation ...")	
		gomap<-get("gomap",envir=GOSimEnv)
		goterms = lapply(gomap, function(x) sapply(x, function(y) y$Ontology == ontology))
		goterms = goterms[!is.na(names(goterms))]
		if(length(goterms) == 0)
			stop("No GO information available for these genes!")
		goterms <-sapply(goterms, function(x) names(x[which(x)]))
		freq <- sapply(goterms,length)
		res = sort(freq, decreasing=TRUE)		
# 		locus<-as.list(GOENTREZID)
# 		goids<-as.list(GOTERM)
# 		goids<-goids[sapply(goids, function(x) Ontology(x) == ontology)]		
# 		locusnames<-intersect(names(locus),names(goids))
# 		locus<-locus[locusnames]
# 		lt<-table(unlist(locus))
# 		res<-sort(lt,decreasing=TRUE)
		prototypes<-names(res)[1:n]
		return(prototypes)
	}
	else if(method == "random"){
		if(verbose)		
			message("Automatic determination of prototypes using random genes from the current ontology...")
		gomap<-get("gomap",envir=GOSimEnv)
		goterms = lapply(gomap, function(x) sapply(x, function(y) y$Ontology == ontology))
		goterms = goterms[!is.na(names(goterms))]
		if(length(goterms) == 0)
			stop("No GO information available for these genes!")
		goterms <-sapply(goterms, function(x) names(x[which(x)]))
		goterms <- goterms[sapply(goterms,length) > 0]
		prototypes = sample(names(goterms), n)
# 		locus<-as.list(GOENTREZID)
# 		goids<-as.list(GOTERM)
# 		goids<-goids[sapply(goids, function(x) Ontology(x) == ontology)]		
# 		locusnames<-intersect(names(locus),names(goids))
# 		locus<-locus[locusnames]
# 		lt<-names(table(unlist(locus)))
# 		set.seed(0)
# 		ridx<-round(runif(n,min=1,max=length(lt)))
# 		prototypes<-lt[ridx]		
		return(prototypes)
	}
	else if(method == "pca"){
		if(is.null(data))
			stop("You need to specify a data matrix with feature vectors")
		pcares<-pca(data)
		if(verbose)
			message(paste("PCA: dimension reduced to",ncol(pcares$features),"principal components (95% total variance explained)"))
		return(pcares)
	}
	else if(method == "clustering"){    		
		if(is.null(data))
			stop("You need to specify a data matrix with feature vectors")
		if(verbose)
			message("Clustering feature vectors ...")
		#res<-Mclust(t(data),2:n)
		#return(res$mu)
		res = sapply(2:n, function(k) flexmix(t(data)~1, cluster=cutree(hclust(dist(t(data))),k=k), k=k, model= FLXMCmvnorm()))
		bics = sapply(res, BIC)
		res = res[[which.min(bics)]]
		p = parameters(res)
		return(p[setdiff(1:NROW(p), grep("cov", rownames(p))),])
	}
	else
	stop(paste("selectPrototypes: Unknown method",method,"!"))
}

Any scripts or data that you put into this service are public.

GOSim documentation built on Nov. 8, 2020, 11:05 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

GOSim
Computation of functional similarities between GO terms and gene products; GO enrichment analysis

R/geneSimilarity.R
In GOSim: Computation of functional similarities between GO terms and gene products; GO enrichment analysis

Defines functions selectPrototypes getGeneSimPrototypes getGeneFeaturesPrototypes getGeneFeatures getGeneFeatures.internal getGeneSim getGSim getWeightedDotSim precomputeTermSims getGOGraphsGenes filterGO getGOInfo

Documented in filterGO getGeneFeatures getGeneFeaturesPrototypes getGeneSim getGeneSimPrototypes getGOGraphsGenes getGOInfo getGSim precomputeTermSims selectPrototypes

Try the GOSim package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GOSim Computation of functional similarities between GO terms and gene products; GO enrichment analysis

R/geneSimilarity.R In GOSim: Computation of functional similarities between GO terms and gene products; GO enrichment analysis

Defines functions selectPrototypes getGeneSimPrototypes getGeneFeaturesPrototypes getGeneFeatures getGeneFeatures.internal getGeneSim getGSim getWeightedDotSim precomputeTermSims getGOGraphsGenes filterGO getGOInfo

Documented in filterGO getGeneFeatures getGeneFeaturesPrototypes getGeneSim getGeneSimPrototypes getGOGraphsGenes getGOInfo getGSim precomputeTermSims selectPrototypes

Try the GOSim package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GOSim
Computation of functional similarities between GO terms and gene products; GO enrichment analysis

R/geneSimilarity.R
In GOSim: Computation of functional similarities between GO terms and gene products; GO enrichment analysis