We use this procedure to map worm genes to mouse homologs. The same procedure can be used to map worm genes to genes of other species. The key mapping information is the best match of worm-mouse protein sequences based on Blastp hits. To prepare for the mapping, several annotation files of genome WScel235 were downloaded from the ftp site of http://www.wormbase.org:

knitr::opts_chunk$set(dpi=300, fig.pos="H", fig.width=8, fig.height=6, echo=FALSE, warning=FALSE, message=FALSE);

Map wormbase gene IDs to mouse Ensembl protein IDs

file_home<-paste(Sys.getenv('RCHIVE_HOME'), 'data/gene/public/wormbase/src', sep='/');
file_out<-paste(Sys.getenv('RCHIVE_HOME'), 'data/gene/public/wormbase/r', sep='/');

if (!file.exists(file_out)) dir.create(file_out, recursive = TRUE); 

# all WORMBASE gene IDs
fn<-paste(file_home, 'c_elegans.WS235.gene_ids.txt', sep='/');
all.ids<-read.table(fn, sep='\t', stringsAsFactors = FALSE)[, 2];

# mapping between differnt types of worm IDs
fn<-paste(file_home, 'c_elegans.WS235.xrefs.txt', sep='/');
xref<-read.table(fn, sep='\t', stringsAsFactors = FALSE);
xref<-xref[xref[, 5]!='.', ];
xref<-split(xref[, 5], xref[, 2]);

fn<-paste(file_home, 'c_elegans.WS235.best_blastp_hits.txt', sep='/');
ln<-scan(fn, what='', sep='\n', flush=TRUE); 
ln<-ln[grep('ENSEMBL', ln)]; # has mouse homolog
ce<-sapply(strsplit(ln, ','), function(x) x[1]);
ln<-strsplit(ln, '[,:]');
en<-sapply(ln, function(ln) {ind<-which(ln=='ENSEMBL'); ln[(ind+1):(ind+2)]});
en<-t(en);
p<-as.numeric(en[,2]);
names(p)<-en[, 1]; 
ce2ensp<-split(p, ce);
ce2ensp<-lapply(ce2ensp, sort);
ce2ensp<-lapply(ce2ensp, names);
saveRDS(list(all_worm_ids=all.ids, xref=xref, worm2mouse=ce2ensp), file=paste(file_out, 'worm2mouse_parsed.rds', sep='/'));

Prepare a list of 3 elements from files downloaded from wormbase FTP site: 1. all wormbase gene IDs; 2. mapping of wormbase gene IDs to protein IDs; and 3. mapping of wormbase protein IDs to mouse Emsembl protein IDs (sorted by Blastp p value, minimum first).

ce2mm<-readRDS(paste(file_out, 'worm2mouse_parsed.rds', sep='/'));
pr<-unlist(ce2mm[[2]], use.names=FALSE); 
pr2pr<-ce2mm[[3]][pr];
gn<-rep(names(ce2mm[[2]]), sapply(ce2mm[[2]], length));
gn2pr<-split(unlist(pr2pr, use.names=FALSE), rep(gn, sapply(pr2pr, length))); 
gn2pr<-lapply(gn2pr, unique);

mp<-as.list(rep('', length(ce2mm[[1]])));
names(mp)<-ce2mm[[1]];
mp[names(gn2pr)]<-gn2pr; 

n<-sapply(mp, function(x) length(x[x!=''])); 

Then, perform one to many mapping, from worm gene IDs to worm protein IDs to mouse protein IDs. r length(n[n>0]) worm genes were mapped to at least one mouse protein, among which r length(n[n>1]) genes were mapped to more than one mouse proteins.

versions<-c(
  "http://bioconductor.org/packages/2.14/data/annotation/src/contrib/org.Ce.eg.db_2.14.0.tar.gz",
  "http://bioconductor.org/packages/3.0/data/annotation/src/contrib/org.Ce.eg.db_3.0.0.tar.gz",
  "http://bioconductor.org/packages/3.1/data/annotation/src/contrib/org.Ce.eg.db_3.1.2.tar.gz",
  "http://bioconductor.org/packages/3.2/data/annotation/src/contrib/org.Ce.eg.db_3.2.3.tar.gz",
  "http://bioconductor.org/packages/3.3/data/annotation/src/contrib/org.Ce.eg.db_3.2.3.tar.gz"
)

fn<-sapply(strsplit(versions, '/'), function(x) x[length(x)]);
fn<-paste(file_home, fn, sep='/');
apply(cbind(versions, fn), 1, function(v) if (!file.exists(v[2])) download.file(v[1], v[2]))->x;

library(dplyr);

all.versions<-lapply(fn, function(f) {
  print(f);
  untar(f, exdir=file_home);
  db<-src_sqlite(paste(file_home, 'org.Ce.eg.db/inst/extdata/org.Ce.eg.sqlite', sep='/')); 
  tbls<-src_tbls(db);
  if ('wormbase' %in% tbls) {
    gn<-as.data.frame(collect(tbl(db, 'genes'))); 
    t<-as.data.frame(collect(tbl(db, 'wormbase')));
    t[,1]<-gn[, 2][t[, 1]]; 
    t; 
  } else NA
});
all.versions<-all.versions[!is.na(all.versions)];

if (length(all.versions)==0) stop("No Wormbase to Entrez gene mapped");

wb2eg<-split(as.character(as.vector(all.versions[[length(all.versions)]][, 1])), as.vector(all.versions[[length(all.versions)]][, 2])); 
if (length(all.versions)>1) {
  for (i in (length(all.versions)-1):1) {
    v<-all.versions[[i]];
    v<-v[!(as.vector(v[,2]) %in% names(wb2eg)), ];
    wb2eg<-c(wb2eg, split(as.vector(v[, 1]), as.vector(v[, 2])));
    print(length(wb2eg)); 
  }
}
n<-sapply(wb2eg, length); 
saveRDS(wb2eg, file=paste(file_out, 'wormbase2entrez.rds', sep='/'));
versions<-c(
  "http://bioconductor.org/packages/2.1/data/annotation/src/contrib/org.Mm.eg.db_2.0.2.tar.gz",
  "http://bioconductor.org/packages/2.2/data/annotation/src/contrib/org.Mm.eg.db_2.2.0.tar.gz",
  "http://bioconductor.org/packages/2.3/data/annotation/src/contrib/org.Mm.eg.db_2.2.6.tar.gz",
  "http://bioconductor.org/packages/2.4/data/annotation/src/contrib/org.Mm.eg.db_2.2.11.tar.gz",
  "http://bioconductor.org/packages/2.5/data/annotation/src/contrib/org.Mm.eg.db_2.3.6.tar.gz",
  "http://bioconductor.org/packages/2.6/data/annotation/src/contrib/org.Mm.eg.db_2.4.1.tar.gz",
  "http://bioconductor.org/packages/2.7/data/annotation/src/contrib/org.Mm.eg.db_2.4.6.tar.gz", 
  "http://bioconductor.org/packages/2.8/data/annotation/src/contrib/org.Mm.eg.db_2.5.0.tar.gz",
  "http://bioconductor.org/packages/2.9/data/annotation/src/contrib/org.Mm.eg.db_2.6.4.tar.gz",
  "http://bioconductor.org/packages/2.10/data/annotation/src/contrib/org.Mm.eg.db_2.7.1.tar.gz",
  "http://bioconductor.org/packages/2.11/data/annotation/src/contrib/org.Mm.eg.db_2.8.0.tar.gz",
  "http://bioconductor.org/packages/2.12/data/annotation/src/contrib/org.Mm.eg.db_2.9.0.tar.gz",
  "http://bioconductor.org/packages/2.13/data/annotation/src/contrib/org.Mm.eg.db_2.10.1.tar.gz",
  "http://bioconductor.org/packages/2.14/data/annotation/src/contrib/org.Mm.eg.db_2.14.0.tar.gz",
  "http://bioconductor.org/packages/3.0/data/annotation/src/contrib/org.Mm.eg.db_3.0.0.tar.gz",
  "http://bioconductor.org/packages/3.1/data/annotation/src/contrib/org.Mm.eg.db_3.1.2.tar.gz",
  "http://bioconductor.org/packages/3.2/data/annotation/src/contrib/org.Mm.eg.db_3.2.3.tar.gz"
)

fn<-sapply(strsplit(versions, '/'), function(x) x[length(x)]);
fn<-paste(file_home, fn, sep='/');
apply(cbind(versions, fn), 1, function(v) if (!file.exists(v[2])) download.file(v[1], v[2]));

library(dplyr);

all.versions<-lapply(fn, function(f) {
  print(f);
  untar(f, exdir=file_home);
  db<-src_sqlite(paste(file_home, 'org.Mm.eg.db/inst/extdata/org.Mm.eg.sqlite', sep='/')); 
  tbls<-src_tbls(db);
  if ('ensembl_prot' %in% tbls) {
    gn<-as.data.frame(collect(tbl(db, 'genes'))); 
    t<-as.data.frame(collect(tbl(db, 'ensembl_prot')));
    t[,1]<-gn[, 2][t[, 1]]; 
    t; 
  } else NA
});
all.versions<-all.versions[!is.na(all.versions)];

if (length(all.versions)==0) stop("No Ensembl protein to Entrez gene mapped");

en2eg<-split(as.character(as.vector(all.versions[[length(all.versions)]][, 1])), as.vector(all.versions[[length(all.versions)]][, 2])); 
if (length(all.versions)>1) {
  for (i in (length(all.versions)-1):1) {
    v<-all.versions[[i]];
    v<-v[!(as.vector(v[,2]) %in% names(en2eg)), ];
    en2eg<-c(en2eg, split(as.vector(v[, 1]), as.vector(v[, 2])));
    print(length(en2eg)); 
  }
}
n<-sapply(en2eg, length); 
saveRDS(en2eg, file=paste(file_out, 'mouse_ensp2entrez.rds', sep='/'));

Map mouse Ensembl protein IDs to Entrez gene IDs

We used the R org.Mm.eg.db package to map mouse Ensemble protein IDs to mouse Entrez gene IDs. This package has multiple versions. Some protein IDs in the older verions were removed from the newer ones. To be comprehensive, we combined r length(all.versions) versions of the org.Mm.eg.db package, while using IDs in the newer version when they were available. A total of r length(en2eg) protein IDs were mapped to r length(unique(unlist(en2eg, use.names=FALSE))) unique Entrez gene IDs while r length(n[n>1]) protein IDs were mapped to more than one gene ID.

Map WORMBASE gene IDs to mouse Entrez gene IDs

en<-unlist(mp[n>0], use.names=FALSE);
wb<-rep(names(mp[n>0]), sapply(mp[n>0], length));
en2eg<-en2eg[en];
wb<-rep(wb, sapply(en2eg, length));
eg<-unlist(en2eg, use.names=FALSE);
wb2eg<-split(eg[!is.na(eg)], wb[!is.na(eg)]);

mapped<-as.list(rep('', length(mp)));
names(mapped)<-names(mp);
mapped[names(wb2eg)]<-wb2eg;
mapped<-lapply(mapped, unique);
n<-sapply(mapped, function(x) length(x[x!='']));
saveRDS(mapped, paste(file_out, 'wormbase2mouse_entrez.rds', sep='/'));

# Entrez to Entrez mapping between worm and mouse
mp1<-rep(names(mapped), sapply(mapped, length)); 
mp2<-unlist(mapped, use.names=FALSE); 
mp3<-readRDS(paste(file_out, 'wormbase2entrez.rds', sep='/'))[mp1]; 
mp4<-rep(mp2, sapply(mp3, length)); 
mp5<-unlist(mp3, use.names=FALSE); 
flg<-mp4=='' | mp5=='' | is.na(mp4) | is.na(mp5); 
wm2hs<-split(mp4[!flg], mp5[!flg]); 
hs2wm<-split(mp5[!flg], mp4[!flg]);
wm2hs<-lapply(wm2hs, unique); 
hs2wm<-lapply(hs2wm, unique); 
wm2hs<-wm2hs[order(as.numeric(names(wm2hs)))]; 
hs2wm<-hs2wm[order(as.numeric(names(hs2wm)))]; 
saveRDS(wm2hs, file=paste(file_out, 'worm2mouse_entrez.rds', sep='/')); 
saveRDS(hs2wm, file=paste(file_out, 'mouse2worm_entrez.rds', sep='/')); 

We mapped WORMBASE gene IDs to Entrez gene IDs of mouse homolog through Ensembl protein IDs. r length(n[n>0]) WORMBASE IDs were mapped to at least one mouse gene, and r length(n[n>1]) of them were mapped to more than one.

library(rchive); 

fn.wm2hs<-paste(file_out, 'worm2mouse_entrez.rds', sep='/'); 
fn.hs2wm<-paste(file_out, 'mouse2worm_entrez.rds', sep='/'); 
wb.wm2hs<-readRDS(fn.wm2hs);
wb.hs2wm<-readRDS(fn.hs2wm);

fn.homo<-paste(Sys.getenv('RCHIVE_HOME'), 'data/gene/public/homologene/r/homologene.rds', sep='/');

fn.uni<-c(
  hs=paste(Sys.getenv('RCHIVE_HOME'), 'data/gene/public/unigene/r/mouse_genes.rds', sep='/'),
  ce=paste(Sys.getenv('RCHIVE_HOME'), 'data/gene/public/unigene/r/worm_genes.rds', sep='/'),
  hs.homolog=paste(Sys.getenv('RCHIVE_HOME'), 'data/gene/public/unigene/r/mouse_homolog.rds', sep='/'),
  ce.homolog=paste(Sys.getenv('RCHIVE_HOME'), 'data/gene/public/unigene/r/worm_homolog.rds', sep='/')
)

# Homologene mapping
homolo<-readRDS(fn.homo);
gn.hs<-unique(homolo[homolo[,2]=='9606', 3]); 
homolo.hs2ce<-MapHomologene(homolo, gn.hs, from='9606', to='6239'); 
homolo.hs2ce<-split(as.vector(homolo.hs2ce), names(homolo.hs2ce)); 
gn.ce<-unique(homolo[homolo[,2]=='6239', 3]); 
homolo.ce2mm<-MapHomologene(homolo, gn.ce, from='6239', to='9606'); 
homolo.ce2mm<-split(as.vector(homolo.ce2mm), names(homolo.ce2mm)); 

# Unigene mapping
uni.mm<-readRDS(fn.uni[1]);
uni.mm.homolog<-readRDS(fn.uni[3]); 
uni.mm.homolog[, 1]<-uni.mm[uni.mm.homolog[, 1], 'GENE_ID'];
uni.mm.mp<-MapUnigeneSpecies(uni.mm.homolog, 'org.Ce.eg.db'); 
uni.mm.mp<-lapply(uni.mm.mp, names);
uni.mm.mp<-lapply(uni.mm.mp, function(x) x[x!='']);
uni.mm.mp<-uni.mm.mp[sapply(uni.mm.mp, length)>0]; 

uni.ce<-readRDS(fn.uni[2]);
uni.ce.homolog<-readRDS(fn.uni[4]); 
uni.ce.homolog[, 1]<-uni.ce[uni.ce.homolog[, 1], 'GENE_ID'];
uni.ce.mp<-MapUnigeneSpecies(uni.ce.homolog, 'org.Mm.eg.db'); 
uni.ce.mp<-lapply(uni.ce.mp, names);
uni.ce.mp<-lapply(uni.ce.mp, function(x) x[x!='']);
uni.ce.mp<-uni.ce.mp[sapply(uni.ce.mp, length)>0]; 

mp<-list(homologene=list(worm2mouse=homolo.ce2mm, mouse2worm=homolo.hs2ce),
         wormbase=list(worm2mouse=wb.wm2hs, mouse2worm=wb.hs2wm), 
         unigene=list(worm2mouse=uni.ce.mp, mouse2worm=uni.hs.mp)); 
worm2mouse<-CombineHomolog(lapply(mp, function(x) x[[1]])); 
mouse2worm<-CombineHomolog(lapply(mp, function(x) x[[2]])); 
saveRDS(worm2mouse, file=paste(file_out, 'worm2mouse_full.rds', sep='/')); 
saveRDS(mouse2worm, file=paste(file_out, 'mouse2worm_full.rds', sep='/')); 

ns<-sapply(mp, function(x) length(x[[1]]));
nm<-names(mp); 
ln<-paste(' - **', nm, '**:', ns, ' mapped genes', sep=''); 
ln<-paste(ln, collapse='\n'); 

Finally, we combined worm-mouse gene mapping from r length(mp) different resource. Totally, r nrow(worm2mouse[[1]]) worm genes were mapped to at least 1 mouse gene.

r ln

##############################################################################################################
tm<-strsplit(as.character(Sys.time()), ' ')[[1]][1];
fn0<-paste(Sys.getenv("RCHIVE_HOME"), '/source/script/update/gene/MapWormbase2Mouse.Rmd', sep='');
fn1<-paste(Sys.getenv("RCHIVE_HOME"), '/source/script/update/gene/log/', tm, '_MapWormbase2Mouse.Rmd' , sep='');
file.copy(fn0, fn1)

END OF DOCUMENT



zhezhangsh/rchive documentation built on June 17, 2020, 3:55 a.m.