# Combine more than one collections of GWAS results
CombineGwas<-function(path.out=paste(Sys.getenv('RCHIVE_HOME'), 'data/gwas/r', sep='/'),
paths=paste(Sys.getenv('RCHIVE_HOME'), 'data/gwas/public', c('dbgap/r', 'gwascatalog/r'), sep='/')
) {
# path.out Path to output files
# paths Paths to GWAS collections
if (!file.exists(path.out)) dir.create(path.out, recursive=TRUE);
library(stringr);
##################
# depended files #
##################
# required keyword files from PubTator
fn.key<-paste(Sys.getenv("RCHIVE_HOME"), '/data/literature/public/pubtator/r/original_pubmed2',
c('gene', 'disease', 'chemical'), '.rds', sep='');
names(fn.key)<-c('Gene', 'Disease', 'Chemical');
# required GWAS metadata files in each collection
fn.req<-c("analysis.rds", "study.rds", "pubmed.rds");
fn.missed<-lapply(paths, function(path) {
fn<-paste(path, fn.req, sep='/');
fn[!file.exists(fn)];
});
if (length(unlist(fn.missed)) > 0)
stop("Error: missing required file(s):\n", paste(unlist(fn.missing, use.names=FALSE), collapse='\n'));
#######################
# Loading in metadata #
#######################
################################################################################
################################################################################
### load and combine annotation tables
# Column names of annotation tables
cnm<-list(
analysis=c("Source", "Name", "Study_ID", "PubMed_ID", "URL", "Num_Variants", "Min_Phred", "Max_Phred", "Description"),
study=c("Source", "Name", "Num_Analyses", "URL", "Description"),
pubmed=c('Title', 'Journal', 'Date', 'URL', 'Abstract')
);
# Load, combine, and return metadata tables
meta<-sapply(names(cnm), function(nm) {
tbl.set<-lapply(paths, function(path) readRDS(paste(path, '/', nm, '.rds', sep='')));
# combined analysis tables with the same columns
tbls<-lapply(tbl.set, function(tbl) {
c<-setdiff(cnm[[nm]], colnames(tbl));
if (length(c)>0) tbl<-cbind(tbl, sapply(c, function(c) rep('', nrow(tbl))));
tbl[, cnm[[nm]]];
});
# Re-format
tbl<-do.call('rbind', tbls);
rnm<-unlist(lapply(tbl.set, rownames));
tbl<-tbl[!duplicated(rnm), ];
rownames(tbl)<-rnm[!duplicated(rnm)];
for (i in 1:ncol(tbl)) tbl[[i]]<-str_trim(as.vector(tbl[[i]]));
if ('Description' %in% cnm[[nm]]) tbl[tbl$Description=='', 'Description']<-tbl[tbl$Description=='', 'Name'];
saveRDS(tbl, file=paste(path.out, '/', nm, '.rds', sep=''));
tbl;
});
################################################################################
################################################################################
### Load and combine annotation as lists
meta.as.lst<-lapply(names(meta), function(nm) {
# load from individual sources
lst.set<-lapply(paths, function(path) {
fn.lst<-paste(path, '/', nm, '_by_id.rds', sep='');
if (file.exists(fn.lst)) readRDS(fn.lst) else { # if not exists in the source
tbl<-readRDS(paste(path, '/', nm, '.rds', sep=''));
by.id<-lapply(rownames(tbl), function(id) c(ID=id, tbl[id, ]));
names(by.id)<-rownames(tbl);
by.id
}
});
# combine lists
lst<-do.call('c', lst.set);
saveRDS(lst, file=paste(path.out, '/', nm, '_by_id.rds', sep=''));
lst;
});
names(meta.as.lst)<-names(meta);
################################################################################
################################################################################
## Study/analysis to PubMed
map2pubmed<-lapply(c('analysis', 'study'), function(nm) {
mp2pm<-lapply(meta.as.lst[[nm]], function(lst)
if('pubmed' %in% tolower(names(lst))) lst[tolower(names(lst))=='pubmed'][[1]] else '');
# if available directly from source
for (i in 1:length(paths)) {
fn.mp<-paste(paths[[i]], '/', nm, '2pubmed.rds', sep='');
if (file.exists(fn.mp)) {
mp<-readRDS(fn.mp);
mp<-mp[names(mp) %in% names(mp2pm)];
if (length(mp) > 0) mp2pm[names(mp)]<-mp;
};
}
saveRDS(lapply(mp2pm, function(mp) sort(unique(mp))), file=paste(path.out, '/', nm, '2pubmed.rds', sep=''));
# Map pubmed ID to analysis and study
pm<-unlist(mp2pm, use.names=FALSE);
id<-rep(names(mp2pm), sapply(mp2pm, length));
pm2id<-lapply(split(id, pm), function(x) sort(unique(x)));
pm2id<-pm2id[names(pm2id) %in% rownames(readRDS(paste(path.out, 'pubmed.rds', sep='/')))];
saveRDS(pm2id, file=paste(path.out, '/pubmed2', nm, '.rds', sep=''));
mp2pm;
});
pm.all<-sort(unique(unlist(map2pubmed)));
############
# Keywords #
############
################################################################################
################################################################################
# Use PubTator to map Pubmed ID to keywords
all.keys<-lapply(fn.key, function(fn) {
ky<-readRDS(fn);
ky[ky[,1] %in% pm.all, ];
});
names(all.keys)<-names(fn.key);
saveRDS(all.keys, file=paste(path.out, 'keyword_all.rds', sep='/'));
# combine keywords into a single table
tbl<-cbind(do.call('rbind', all.keys), Type=rep(names(fn.key), sapply(all.keys, nrow)));
tbl<-cbind(tbl[, 1:2], Type=rep(names(fn.key), sapply(all.keys, nrow)), tbl[, 3:ncol(tbl)]);
# expand keyword IDs if multiple IDs in the same cell
key.ids<-strsplit(tbl$MeshID, '[/,]'); # All keyword IDs
key.rnm<-rep(rownames(tbl), sapply(key.ids, length)); # table row names
key.ids<-unlist(key.ids, use.names=FALSE);
key.typ<-tbl[key.rnm, 'Type']; # Keyword type
key.pub<-tbl[key.rnm, 'PMID']; # Corresponding Pubmed ID
key.syn<-tbl[key.rnm, 'Mentions'];
# Fix keyword format as source:id
key.ids<-sub('GeneID', 'ENTREZ', key.ids);
key.ids[key.typ=='Gene' & !grepl(':', key.ids)]<-paste('ENTREZ:', key.ids[key.typ=='Gene' & !grepl(':', key.ids)], sep='')
key.ids[key.typ=='Chemical' & !grepl(':', key.ids)]<-paste('MESH:', key.ids[key.typ=='Chemical' & !grepl(':', key.ids)], sep='')
key.src<-sapply(strsplit(key.ids, ':'), function(x) x[1]);
key.val<-sapply(strsplit(key.ids, ':'), function(x) x[2]);
# Create URL
url.pre<-c(
'ENTREZ' = "http://www.ncbi.nlm.nih.gov/gene/?term=",
'MESH' = "http://www.ncbi.nlm.nih.gov/mesh/?term=",
'OMIM' = "http://omim.org/entry/",
'CHEBI' = "http://www.ebi.ac.uk/chebi/advancedSearchFT.do?searchString="
)[key.src];
url<-paste(url.pre, key.val, sep='');
url[is.na(url.pre)]<-'';
keyword<-data.frame(PMID=key.pub, ID=key.ids, Type=key.typ, URL=url, Synonym=key.syn, stringsAsFactors=FALSE);
keys<-unique(keyword$ID);
###################################################################
# Keyword names
ttl<-rep('', length(keys));
names(ttl)<-keys;
id<-sapply(strsplit(keys,':'), function(x) x[2]);
names(id)<-keys;
url<-as.vector(keyword$URL);
names(url)<-as.vector(keyword$ID);
url<-url[keys];
# Names of known genes
f<-dir(paste(Sys.getenv("RCHIVE_HOME"), 'data/gene/public/entrez/r/', sep='/'));
f<-f[grep('_genes_full.rds$', f)];
f<-paste(Sys.getenv("RCHIVE_HOME"), 'data/gene/public/entrez/r', f, sep='/')
anno<-lapply(f, readRDS);
anno<-do.call('rbind', anno);
sym<-as.vector(anno[id[grep('^ENTREZ', keys)], 'Symbol']);
desc<-as.vector(anno[id[grep('^ENTREZ', keys)], 'description']);
nm<-paste(sym, desc, sep='; ');
names(nm)<-keys[grep('^ENTREZ', keys)];
nm<-nm[!is.na(sym) & !is.na(desc)];
ttl[names(nm)]<-as.vector(nm);
# Names of other keywords, parse from html pages
library(stringr); # str_trim function
url0<-url[names(ttl[ttl=='' & url!=''])];
if (length(url0) > 0) {
pgs<-GetSetURL(url0);
names(pgs)<-names(url0);
ttl0<-sapply(names(url0), function(nm) {
print(nm);
u<-url0[nm];
#html<-strsplit(getURL(u), '\n')[[1]];
download.file(u, 'temp.html');
html<-scan('temp.html', what='', sep='\n', flush=TRUE);
if (grepl('^ENTREZ:', nm)) {
t<-strsplit(html[grep('<title>', html)], '[<[>]')[[1]][3];
t<-sub('No items found - Gene - NCBI', '', t);
} else if (grepl('^OMIM:', nm)) {
mk<-paste(' ', sub('^OMIM:', '', nm), ' - ', sep='');
t<-html[grep(mk, html)][1];
t<-sapply(strsplit(t, mk), function(x) x[2]);
} else if (grepl('^MESH:', nm)) {
t<-strsplit(html[grep('<title>', html)], '[<[>]')[[1]][3];
t<-sub(' - MeSH - NCBI', '', t);
} else if (grepl('^CHEBI:', nm)) {
mk<-paste('\\(', nm, '\\)$', sep='');
t<-html[grep(mk, html)][1];
t<-sub(mk, '', t);
}
str_trim(t); # take out spaces at the beginning or end
});
# If name not available, use source names
ttl[names(ttl0)]<-ttl0;
ttl[ttl=='No items found']<-'';
ttl1<-ttl[is.na(ttl) | ttl==''];
ttl.k<-keyword[keyword$ID %in% names(ttl1), ];
mp<-split(as.vector(ttl.k$Synonym), as.vector(ttl.k$ID))[names(ttl1)];
mp<-sapply(mp, function(mp) unique(mp)[1]);
ttl[names(ttl1)]<-mp[names(ttl1)];
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.