# -*- tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
# vi: set ts=2 noet:
library(plyr)
library(dplyr)
library(jsonlite)
library(readr)
library(BioChemPantry)
source("~/work/sea/scripts/id_utils.R")
source("~/work/sets/uniprot_idmapping/scripts/uniprot_id_map_web.R")
# collect genes with gene products from HGNC
# Each gene symbol corresponds to at most one uniprot accn
# but some uniprot entries correspond to multiple gene ids most of these are pretty weird or uncharacterized gene products, so filter all these out
staging_directory <- get_staging_directory("hgnc/release_160324")
system(paste0("
cd ", staging_directory, "
wget ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/json/locus_types/gene_with_protein_product.json"))
genes <- jsonlite::fromJSON(txt=paste0(staging_directory, "/data/gene_with_protein_product.json")$response$docs
genes %>% summarize_map("hgnc_id", "symbol")
# hgnc_id, gene_symbol are 1-1
# work around for bug https://github.com/hadley/tidyr/issues/141
my_unnest <- function(data, col_name){
data %>% adply(1, function(row){
if(is.null(row[,col_name][[1]])){
row[,col_name] <- NA
} else {
unnested <- row[,col_name][[1]]
row <- row[rep(1,length(unnested)),]
row[,col_name] <- unnested
}
row
})
}
genes <- genes %>%
my_unnest("uniprot_ids") %>%
rename(uniprot_accn = uniprot_ids)
genes %>% summarize_map("hgnc_id", "symbol")
# hgnc_id, gene_symbol are 1-1
genes %>% summarize_map("hgnc_id", "uniprot_accn")
#X<-[hgnc_id]:
# |X|: 18989
# count*size: 18989*1
#Y<-[uniprot_accn]:
# |Y|: 18834 (30 NA)
# count*size: 18765*1, 53*2, 6*3, 3*4, 3*5, 1*7, 1*10, 1*12, 1*14
#[X U Y]:
# |X U Y|: 18959 (30 NA)
# count*size: 18959*1
#[X @ Y]:
# |X ~ Y|: 18765
# |X:X < Y|, |Y:X < Y|: 0, 0
# |X:X > Y|, |Y:X > Y|: 194, 69
#just get rid of them,
genes <- genes %>%
dplyr::anti_join(
genes %>%
count(uniprot_accn) %>%
filter(n>1),
by="uniprot_accn")
genes <- genes %>%
mutate(iuphar = iuphar %>% str_replace("objectId:","") %>% as.numeric) %>%
dplyr::select(
symbol,
gene_family,
gene_family_id,
name,
uniprot_accn,
entrez_id,
ucsc_id,
ensembl_gene_id,
refseq_accession,
omim_id,
iuphar)
uniprot_mart <- useMart("unimart", dataset="uniprot", host="www.ebi.ac.uk", path="/uniprot/biomart/martservice")
uniprot_data <- genes %>%
dplyr:::select(uniprot_accn) %>%
biomaRt::getBM(
filters="accession",
attributes=c("accession", "name", "organism", "entry_type", "protein_evidence", "comments", "keyword"),
values=.$uniprot_accn,
mart=uniprot_mart) %>%
dplyr::rename(
uniprot_accn = accession,
uniprot_entry = name) %>%
nest(keyword)
genes2 <- genes %>%
dplyr::left_join(
uniprot_data %>%
dplyr::select(-symbol, -name),
by="uniprot_accn") %>%
dplyr::filter(
!is.na(uniprot_entry))
pantry <- get_pantry()
pantry %>% create_schema("hgnc_151120")
pantry %>% set_schema("hgnc_151120")
pantry %>% dplr::copy_to(
genes2,
"genes",
temporary=F,
indices=list(
"symbol",
"uniprot_accn",
"uniprot_entry"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.