We use this procedure to map worm genes to human homologs. The same procedure can be used to map worm genes to genes of other species. The key mapping information is the best match of worm-human protein sequences based on Blastp hits. To prepare for the mapping, several annotation files of genome WScel235 were downloaded from the ftp site of http://www.wormbase.org:
knitr::opts_chunk$set(dpi=300, fig.pos="H", fig.width=8, fig.height=6, echo=FALSE, warning=FALSE, message=FALSE);
file_home<-paste(Sys.getenv('RCHIVE_HOME'), 'data/gene/public/wormbase/src', sep='/'); file_out<-paste(Sys.getenv('RCHIVE_HOME'), 'data/gene/public/wormbase/r', sep='/'); if (!file.exists(file_out)) dir.create(file_out, recursive = TRUE); # all WORMBASE gene IDs fn<-paste(file_home, 'c_elegans.WS235.gene_ids.txt', sep='/'); all.ids<-read.table(fn, sep='\t', stringsAsFactors = FALSE)[, 2]; # mapping between differnt types of worm IDs fn<-paste(file_home, 'c_elegans.WS235.xrefs.txt', sep='/'); xref<-read.table(fn, sep='\t', stringsAsFactors = FALSE); xref<-xref[xref[, 5]!='.', ]; xref<-split(xref[, 5], xref[, 2]); fn<-paste(file_home, 'c_elegans.WS235.best_blastp_hits.txt', sep='/'); ln<-scan(fn, what='', sep='\n', flush=TRUE); ln<-ln[grep('ENSEMBL', ln)]; # has human homolog ce<-sapply(strsplit(ln, ','), function(x) x[1]); ln<-strsplit(ln, '[,:]'); en<-sapply(ln, function(ln) {ind<-which(ln=='ENSEMBL'); ln[(ind+1):(ind+2)]}); en<-t(en); p<-as.numeric(en[,2]); names(p)<-en[, 1]; ce2ensp<-split(p, ce); ce2ensp<-lapply(ce2ensp, sort); ce2ensp<-lapply(ce2ensp, names); saveRDS(list(all_worm_ids=all.ids, xref=xref, worm2human=ce2ensp), file=paste(file_out, 'worm2human_parsed.rds', sep='/'));
Prepare a list of 3 elements from files downloaded from wormbase FTP site: 1. all wormbase gene IDs; 2. mapping of wormbase gene IDs to protein IDs; and 3. mapping of wormbase protein IDs to human Emsembl protein IDs (sorted by Blastp p value, minimum first).
ce2hs<-readRDS(paste(file_home, 'worm2human_parsed.rds', sep='/')); pr<-unlist(ce2hs[[2]], use.names=FALSE); pr2pr<-ce2hs[[3]][pr]; gn<-rep(names(ce2hs[[2]]), sapply(ce2hs[[2]], length)); gn2pr<-split(unlist(pr2pr, use.names=FALSE), rep(gn, sapply(pr2pr, length))); gn2pr<-lapply(gn2pr, unique); mp<-as.list(rep('', length(ce2hs[[1]]))); names(mp)<-ce2hs[[1]]; mp[names(gn2pr)]<-gn2pr; n<-sapply(mp, function(x) length(x[x!='']));
Then, perform one to many mapping, from worm gene IDs to worm protein IDs to human protein IDs. r length(n[n>0])
worm genes were mapped to at least one human protein, among which r length(n[n>1])
genes were mapped to more than one human proteins.
versions<-c( "http://bioconductor.org/packages/2.14/data/annotation/src/contrib/org.Ce.eg.db_2.14.0.tar.gz", "http://bioconductor.org/packages/3.0/data/annotation/src/contrib/org.Ce.eg.db_3.0.0.tar.gz", "http://bioconductor.org/packages/3.1/data/annotation/src/contrib/org.Ce.eg.db_3.1.2.tar.gz", "http://bioconductor.org/packages/3.2/data/annotation/src/contrib/org.Ce.eg.db_3.2.3.tar.gz", "http://bioconductor.org/packages/3.3/data/annotation/src/contrib/org.Ce.eg.db_3.2.3.tar.gz" ) fn<-sapply(strsplit(versions, '/'), function(x) x[length(x)]); fn<-paste(file_home, fn, sep='/'); apply(cbind(versions, fn), 1, function(v) if (!file.exists(v[2])) download.file(v[1], v[2]))->x; library(dplyr); all.versions<-lapply(fn, function(f) { print(f); untar(f, exdir=file_home); db<-src_sqlite(paste(file_home, 'org.Ce.eg.db/inst/extdata/org.Ce.eg.sqlite', sep='/')); tbls<-src_tbls(db); if ('wormbase' %in% tbls) { gn<-as.data.frame(collect(tbl(db, 'genes'))); t<-as.data.frame(collect(tbl(db, 'wormbase'))); t[,1]<-gn[, 2][t[, 1]]; t; } else NA }); all.versions<-all.versions[!is.na(all.versions)]; if (length(all.versions)==0) stop("No Wormbase to Entrez gene mapped"); wb2eg<-split(as.character(as.vector(all.versions[[length(all.versions)]][, 1])), as.vector(all.versions[[length(all.versions)]][, 2])); if (length(all.versions)>1) { for (i in (length(all.versions)-1):1) { v<-all.versions[[i]]; v<-v[!(as.vector(v[,2]) %in% names(wb2eg)), ]; wb2eg<-c(wb2eg, split(as.vector(v[, 1]), as.vector(v[, 2]))); print(length(wb2eg)); } } n<-sapply(wb2eg, length); saveRDS(wb2eg, file=paste(file_out, 'wormbase2entrez.rds', sep='/'));
versions<-c( "http://bioconductor.org/packages/2.1/data/annotation/src/contrib/org.Hs.eg.db_2.0.2.tar.gz", "http://bioconductor.org/packages/2.2/data/annotation/src/contrib/org.Hs.eg.db_2.2.0.tar.gz", "http://bioconductor.org/packages/2.3/data/annotation/src/contrib/org.Hs.eg.db_2.2.6.tar.gz", "http://bioconductor.org/packages/2.4/data/annotation/src/contrib/org.Hs.eg.db_2.2.11.tar.gz", "http://bioconductor.org/packages/2.5/data/annotation/src/contrib/org.Hs.eg.db_2.3.6.tar.gz", "http://bioconductor.org/packages/2.6/data/annotation/src/contrib/org.Hs.eg.db_2.4.1.tar.gz", "http://bioconductor.org/packages/2.7/data/annotation/src/contrib/org.Hs.eg.db_2.4.6.tar.gz", "http://bioconductor.org/packages/2.8/data/annotation/src/contrib/org.Hs.eg.db_2.5.0.tar.gz", "http://bioconductor.org/packages/2.9/data/annotation/src/contrib/org.Hs.eg.db_2.6.4.tar.gz", "http://bioconductor.org/packages/2.10/data/annotation/src/contrib/org.Hs.eg.db_2.7.1.tar.gz", "http://bioconductor.org/packages/2.11/data/annotation/src/contrib/org.Hs.eg.db_2.8.0.tar.gz", "http://bioconductor.org/packages/2.12/data/annotation/src/contrib/org.Hs.eg.db_2.9.0.tar.gz", "http://bioconductor.org/packages/2.13/data/annotation/src/contrib/org.Hs.eg.db_2.10.1.tar.gz", "http://bioconductor.org/packages/2.14/data/annotation/src/contrib/org.Hs.eg.db_2.14.0.tar.gz", "http://bioconductor.org/packages/3.0/data/annotation/src/contrib/org.Hs.eg.db_3.0.0.tar.gz", "http://bioconductor.org/packages/3.1/data/annotation/src/contrib/org.Hs.eg.db_3.1.2.tar.gz", "http://bioconductor.org/packages/3.2/data/annotation/src/contrib/org.Hs.eg.db_3.2.3.tar.gz", "http://bioconductor.org/packages/3.3/data/annotation/src/contrib/org.Hs.eg.db_3.2.3.tar.gz" ) fn<-sapply(strsplit(versions, '/'), function(x) x[length(x)]); fn<-paste(file_home, fn, sep='/'); apply(cbind(versions, fn), 1, function(v) if (!file.exists(v[2])) download.file(v[1], v[2])); library(dplyr); all.versions<-lapply(fn, function(f) { print(f); untar(f, exdir=file_home); db<-src_sqlite(paste(file_home, 'org.Hs.eg.db/inst/extdata/org.Hs.eg.sqlite', sep='/')); tbls<-src_tbls(db); if ('ensembl_prot' %in% tbls) { gn<-as.data.frame(collect(tbl(db, 'genes'))); t<-as.data.frame(collect(tbl(db, 'ensembl_prot'))); t[,1]<-gn[, 2][t[, 1]]; t; } else NA }); all.versions<-all.versions[!is.na(all.versions)]; if (length(all.versions)==0) stop("No Ensembl protein to Entrez gene mapped"); en2eg<-split(as.character(as.vector(all.versions[[length(all.versions)]][, 1])), as.vector(all.versions[[length(all.versions)]][, 2])); if (length(all.versions)>1) { for (i in (length(all.versions)-1):1) { v<-all.versions[[i]]; v<-v[!(as.vector(v[,2]) %in% names(en2eg)), ]; en2eg<-c(en2eg, split(as.vector(v[, 1]), as.vector(v[, 2]))); print(length(en2eg)); } } n<-sapply(en2eg, length); saveRDS(en2eg, file=paste(file_out, 'human_ensp2entrez.rds', sep='/'));
We used the R org.Hs.eg.db package to map human Ensemble protein IDs to human Entrez gene IDs. This package has multiple versions. Some protein IDs in the older verions were removed from the newer ones. To be comprehensive, we combined r length(all.versions)
versions of the org.Hs.eg.db package, while using IDs in the newer version when they were available.
A total of r length(en2eg)
protein IDs were mapped to r length(unique(unlist(en2eg, use.names=FALSE)))
unique Entrez gene IDs while r length(n[n>1])
protein IDs were mapped to more than one gene ID.
en<-unlist(mp[n>0], use.names=FALSE); wb<-rep(names(mp[n>0]), sapply(mp[n>0], length)); en2eg<-en2eg[en]; wb<-rep(wb, sapply(en2eg, length)); eg<-unlist(en2eg, use.names=FALSE); wb2eg<-split(eg[!is.na(eg)], wb[!is.na(eg)]); mapped<-as.list(rep('', length(mp))); names(mapped)<-names(mp); mapped[names(wb2eg)]<-wb2eg; mapped<-lapply(mapped, unique); n<-sapply(mapped, function(x) length(x[x!=''])); saveRDS(mapped, paste(file_out, 'wormbase2human_entrez.rds', sep='/')); # Entrez to Entrez mapping between worm and human mp1<-rep(names(mapped), sapply(mapped, length)); mp2<-unlist(mapped, use.names=FALSE); mp3<-readRDS(paste(file_out, 'wormbase2entrez.rds', sep='/'))[mp1]; mp4<-rep(mp2, sapply(mp3, length)); mp5<-unlist(mp3, use.names=FALSE); flg<-mp4=='' | mp5=='' | is.na(mp4) | is.na(mp5); wm2hs<-split(mp4[!flg], mp5[!flg]); hs2wm<-split(mp5[!flg], mp4[!flg]); wm2hs<-lapply(wm2hs, unique); hs2wm<-lapply(hs2wm, unique); wm2hs<-wm2hs[order(as.numeric(names(wm2hs)))]; hs2wm<-hs2wm[order(as.numeric(names(hs2wm)))]; saveRDS(wm2hs, file=paste(file_out, 'worm2human_entrez.rds', sep='/')); saveRDS(hs2wm, file=paste(file_out, 'human2worm_entrez.rds', sep='/'));
We mapped WORMBASE gene IDs to Entrez gene IDs of human homolog through Ensembl protein IDs. r length(n[n>0])
WORMBASE IDs were mapped to at least one human gene, and r length(n[n>1])
of them were mapped to more than one.
library(rchive); fn.wm2hs<-paste(file_out, 'worm2human_entrez.rds', sep='/'); fn.hs2wm<-paste(file_out, 'human2worm_entrez.rds', sep='/'); wb.wm2hs<-readRDS(fn.wm2hs); wb.hs2wm<-readRDS(fn.hs2wm); fn.homo<-paste(Sys.getenv('RCHIVE_HOME'), 'data/gene/public/homologene/r/homologene.rds', sep='/'); fn.uni<-c( hs=paste(Sys.getenv('RCHIVE_HOME'), 'data/gene/public/unigene/r/human_genes.rds', sep='/'), ce=paste(Sys.getenv('RCHIVE_HOME'), 'data/gene/public/unigene/r/worm_genes.rds', sep='/'), hs.homolog=paste(Sys.getenv('RCHIVE_HOME'), 'data/gene/public/unigene/r/human_homolog.rds', sep='/'), ce.homolog=paste(Sys.getenv('RCHIVE_HOME'), 'data/gene/public/unigene/r/worm_homolog.rds', sep='/') ) # Homologene mapping homolo<-readRDS(fn.homo); gn.hs<-unique(homolo[homolo[,2]=='9606', 3]); homolo.hs2ce<-MapHomologene(homolo, gn.hs, from='9606', to='6239'); homolo.hs2ce<-split(as.vector(homolo.hs2ce), names(homolo.hs2ce)); gn.ce<-unique(homolo[homolo[,2]=='6239', 3]); homolo.ce2hs<-MapHomologene(homolo, gn.ce, from='6239', to='9606'); homolo.ce2hs<-split(as.vector(homolo.ce2hs), names(homolo.ce2hs)); # Unigene mapping uni.hs<-readRDS(fn.uni[1]); uni.hs.homolog<-readRDS(fn.uni[3]); uni.hs.homolog[, 1]<-uni.hs[uni.hs.homolog[, 1], 'GENE_ID']; uni.hs.mp<-MapUnigeneSpecies(uni.hs.homolog, 'org.Ce.eg.db'); uni.hs.mp<-lapply(uni.hs.mp, names); uni.hs.mp<-lapply(uni.hs.mp, function(x) x[x!='']); uni.hs.mp<-uni.hs.mp[sapply(uni.hs.mp, length)>0]; uni.ce<-readRDS(fn.uni[2]); uni.ce.homolog<-readRDS(fn.uni[4]); uni.ce.homolog[, 1]<-uni.ce[uni.ce.homolog[, 1], 'GENE_ID']; uni.ce.mp<-MapUnigeneSpecies(uni.ce.homolog, 'org.Hs.eg.db'); uni.ce.mp<-lapply(uni.ce.mp, names); uni.ce.mp<-lapply(uni.ce.mp, function(x) x[x!='']); uni.ce.mp<-uni.ce.mp[sapply(uni.ce.mp, length)>0]; mp<-list(homologene=list(worm2human=homolo.ce2hs, human2worm=homolo.hs2ce), wormbase=list(worm2human=wb.wm2hs, human2worm=wb.hs2wm), unigene=list(worm2human=uni.ce.mp, human2worm=uni.hs.mp)); worm2human<-CombineHomolog(lapply(mp, function(x) x[[1]])); human2worm<-CombineHomolog(lapply(mp, function(x) x[[2]])); saveRDS(worm2human, file=paste(file_out, 'worm2human_full.rds', sep='/')); saveRDS(human2worm, file=paste(file_out, 'human2worm_full.rds', sep='/')); ns<-sapply(mp, function(x) length(x[[1]])); nm<-names(mp); ln<-paste(' - **', nm, '**:', ns, ' mapped genes', sep=''); ln<-paste(ln, collapse='\n');
Finally, we combined worm-human gene mapping from r length(mp)
different resource. Totally, r nrow(worm2human[[1]])
worm genes were mapped to at least 1 human gene.
r ln
############################################################################################################## tm<-strsplit(as.character(Sys.time()), ' ')[[1]][1]; fn0<-paste(Sys.getenv("RCHIVE_HOME"), '/source/script/update/gene/MapWormbase2Human.Rmd', sep=''); fn1<-paste(Sys.getenv("RCHIVE_HOME"), '/source/script/update/gene/log/', tm, '_MapWormbase2Human.Rmd' , sep=''); file.copy(fn0, fn1)
END OF DOCUMENT
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.