R/convertId.R

Defines functions newIdMatrix convertId

Documented in convertId newIdMatrix

##' convertId
##' 
##' A function to convert ID based on the biomaRt package.
##' 
##' A function to convert ID based on the biomaRt package..
##' 
##' @param keepMultipleId Logical. Indicate keep the multiple target IDs related to one source ID or not.
##' @param keepNoId Logical. Indicate keep the source IDs without target IDs or not.
##' @param verbose Logical. Indicate report extra information on progress or not.
##' @inheritParams biomaRt::getBM
##' @inheritParams biomaRt::useMart
##' @inheritParams newIdMatrix
##' @importFrom biomaRt getBM useMart
##' @importFrom stats var
##' @importFrom utils flush.console
##' @export
##' @examples temp<-cbind(rnorm(10),rnorm(10))
##' row.names(temp)<-c("Q04837","P0C0L4","P0C0L5","O75379","Q13068","A2MYD1",
##' "P60709","P30462","P30475","P30479")
##' colnames(temp)<-c("Exp1","Exp2")
##' convertId(temp,filters="uniprotswissprot",keepMultipleId=TRUE)
##' \dontrun{
##' temp<-cbind(rnorm(5000),rnorm(5000),rnorm(5000),rnorm(5000),rnorm(5000),rnorm(5000))
##' row.names(temp)<-1000:5999
##' colnames(temp)<-c("Control1","Control2","Control3","Treatment1","Treatment2","Treatment3")
##' convertId(temp,filters="entrezgene_id",attributes =c("entrezgene_id","uniprotswissprot"),
##' keepNoId=FALSE)
##' }
convertId<-function(x,dataset="hsapiens_gene_ensembl",filters="uniprotswissprot",attributes =c(filters,"entrezgene_id"),genesKept=c('foldchange','first','random','var','abs'),keepNoId=T,keepMultipleId=F,verbose=F) {
#	if (! require("biomaRt")) {
#		cat("biomaRt package is needed but not installed in this computer. Will install it from bioconductor.\n")
#		flush.console()
#		if (!requireNamespace("BiocManager", quietly=TRUE))
    #		install.packages("BiocManager")
#		BiocManager::install("biomaRt")
#		if (!require(biomaRt)) {stop("Package biomaRt can't be installed")}
#	}
	if (missing(genesKept)) {
		genesKept<-"var"
	} else {
		genesKept<-match.arg(genesKept)
	}
	if (verbose) {
		cat("Now conectting with ensembl. Internet acess is needed and it may use 30 seconds.\n")
		flush.console()
	}
	
	#temp
	#ensembl = useMart("ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2015.archive.ensembl.org")
	ensembl = useMart("ensembl",dataset=dataset)
	
	oldId<-row.names(x)
	newIdTable<-getBM(attributes =attributes,filters=filters,values=oldId,mart = ensembl)
	newIdTable<-newIdTable[which(newIdTable[,1]!="" & newIdTable[,2]!=""),]
	
	temp1<-which(oldId %in% newIdTable[,1])
	temp2<-nrow(x)-length(temp1)
	xNoId<-NULL
	if (keepNoId) {
		if (verbose) {
			cat(paste("No ID, Keep: ",temp2," genes can't find their ",attributes[2]," ID. They will be attched at the end of data with their original ID.\n",sep=""))
		}
		xNoId<-x[-temp1,,drop=F]
	} else {
		if (verbose) {
			cat(paste("No ID, Discard: ",temp2," genes can't find their ",attributes[2]," ID. They will be discard.\n",sep=""))
		}
	}
	
	temp2<-split(newIdTable,newIdTable[,1])
	if (keepMultipleId) {
		newIdTable<-sapply(temp2,function(x) return(paste(x[,2],collapse=";")))
		if (verbose) {
			cat(paste("Multiple IDs, Keep: ",length(which(sapply(temp2,function(x) nrow(x))>=2))," genes have more than one ",attributes[2]," IDs. All of these ",attributes[2]," IDs will be stored.\n",sep=""))
		}
	} else {
		newIdTable<-sapply(temp2,function(x) return(x[1,2]))
		if (verbose) {
			cat(paste("Multiple IDs, Discard: ",length(which(sapply(temp2,function(x) nrow(x))>=2))," genes have more than one ",attributes[2]," IDs. Only the first ",attributes[2]," ID for each gene will be used.\n",sep=""))
		}
	}
	names(newIdTable)<-names(temp2)
	
	result<-newIdMatrix(x,genesKept=genesKept,convertIdTable=newIdTable)
	
	if (keepMultipleId) {
		temp<-strsplit(row.names(result),";")
		temp1<-sapply(temp,length)
		temp2<-unlist(temp)
		result<-result[rep(1:length(temp1),temp1),]
		row.names(result)<-temp2
	}
	if (keepNoId) {
		result<-rbind(result,xNoId)
	}
	return(result)
}

##' newIdMatrix
##' 
##' A function to convert ID.
##' 
##' A function to convert ID.
##' 
##' @param x the expression data matrix.
##' @param convertIdTable A vector. The names should be the source IDs, and the values should be the target IDs.
##' @param genesKept The method to select target gene in more than one targets. "var"/"foldchange"/"abs" means selecting the gene with largest variation/fold change/absolute value. "first" means selecting the first target and "random" means randomly selection.
##' @export
##' @examples convertIdTable<-paste("New",c(1,2,2,2,1,3,4,4,5,5))
##' names(convertIdTable)<-paste("Old",1:length(convertIdTable))
##' temp<-matrix(rnorm(20),ncol=2)
##' row.names(temp)<-names(convertIdTable)
##' colnames(temp)<-c("Exp1","Exp2")
##' newIdMatrix(temp,genesKept="foldchange",convertIdTable)
newIdMatrix<-function(x,convertIdTable,genesKept=c("var","foldchange","abs","first","random")) {
	convertIdTable<-convertIdTable[which(convertIdTable!="" & names(convertIdTable)!="")]
	x<-x[names(convertIdTable),,drop=F]
	if (missing(genesKept)) {
		genesKept<-"var"
	} else {
		genesKept<-match.arg(genesKept)
	}
	if (genesKept=="foldchange") {
		temp<-apply(x,1,range,na.rm=T)
		testStat<-temp[2,]-temp[1,]
	} else if (genesKept=="first") {
		testStat<-rep(1,nrow(x))
		names(testStat)<-row.names(x)
	} else if (genesKept=="random") {
		testStat<-sample(1:nrow(x),nrow(x))
		names(testStat)<-row.names(x)
	} else if (genesKept=="var") {
		testStat<-apply(x,1,var,na.rm=T)
	} else if (genesKept=="abs") {
		testStat<-apply(x,1,function(y) max(abs(y),na.rm=TRUE))
	}
	testStat[is.na(testStat)]<--Inf #Some temp has a NA value if genesKept=="var" and only one sample has value
	temp<-split(testStat,convertIdTable)
	result<-x[unlist(sapply(temp, function(y) names(which.max(y)))),,drop=FALSE]
	row.names(result)<-names(temp)
	return(result)
}
slzhao/KEGGprofile documentation built on Oct. 27, 2020, 4:29 a.m.