vignette/sets/hgnc/release_160324/scripts/1_load_data.R

# -*- tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
# vi: set ts=2 noet:

library(plyr)
library(dplyr)
library(jsonlite)
library(readr)
library(BioChemPantry)
source("~/work/sea/scripts/id_utils.R")

source("~/work/sets/uniprot_idmapping/scripts/uniprot_id_map_web.R")

# collect genes with gene products from HGNC
# Each gene symbol corresponds to at most one uniprot accn
# but some uniprot entries correspond to multiple gene ids most of these are pretty weird or uncharacterized gene products, so filter all these out

staging_directory <- get_staging_directory("hgnc/release_160324")

system(paste0("
cd ", staging_directory, "
wget ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/json/locus_types/gene_with_protein_product.json"))

genes <- jsonlite::fromJSON(txt=paste0(staging_directory, "/data/gene_with_protein_product.json")$response$docs

genes %>% summarize_map("hgnc_id", "symbol")
# hgnc_id, gene_symbol are 1-1


# work around for bug https://github.com/hadley/tidyr/issues/141
my_unnest <- function(data, col_name){
	data %>% adply(1, function(row){
		if(is.null(row[,col_name][[1]])){
			row[,col_name] <- NA
		} else {
			unnested <- row[,col_name][[1]]
			row <- row[rep(1,length(unnested)),]
			row[,col_name] <- unnested
		}
		row
	})
}

genes <- genes %>%
	my_unnest("uniprot_ids") %>%
	rename(uniprot_accn = uniprot_ids)


genes %>% summarize_map("hgnc_id", "symbol")
# hgnc_id, gene_symbol are 1-1

genes %>% summarize_map("hgnc_id", "uniprot_accn")
#X<-[hgnc_id]:
#  |X|: 18989
#  count*size: 18989*1
#Y<-[uniprot_accn]:
#  |Y|: 18834 (30 NA)
#  count*size: 18765*1, 53*2, 6*3, 3*4, 3*5, 1*7, 1*10, 1*12, 1*14
#[X U Y]:
#  |X U Y|: 18959 (30 NA)
#  count*size: 18959*1
#[X @ Y]:
#  |X ~ Y|: 18765
#  |X:X < Y|, |Y:X < Y|: 0, 0
#  |X:X > Y|, |Y:X > Y|: 194, 69

#just get rid of them,
genes <- genes %>%
	dplyr::anti_join(
		genes %>%
			count(uniprot_accn) %>%
			filter(n>1),
		by="uniprot_accn")


genes <- genes %>%
	mutate(iuphar = iuphar %>% str_replace("objectId:","") %>% as.numeric) %>%
	dplyr::select(
		symbol,
		gene_family,
		gene_family_id,
		name,
		uniprot_accn,
		entrez_id,
		ucsc_id,
		ensembl_gene_id,
		refseq_accession,
		omim_id,
		iuphar)

uniprot_mart <- useMart("unimart", dataset="uniprot", host="www.ebi.ac.uk", path="/uniprot/biomart/martservice")

uniprot_data <- genes %>%
	dplyr:::select(uniprot_accn) %>%
	biomaRt::getBM(
		filters="accession",
		attributes=c("accession", "name", "organism", "entry_type", "protein_evidence", "comments", "keyword"),
		values=.$uniprot_accn,
		mart=uniprot_mart) %>%
	dplyr::rename(
		uniprot_accn = accession,
		uniprot_entry = name) %>%
	nest(keyword)


genes2 <- genes %>%
	dplyr::left_join(
		uniprot_data %>%
			dplyr::select(-symbol, -name),
		by="uniprot_accn") %>%
	dplyr::filter(
		!is.na(uniprot_entry))




pantry <- get_pantry()
pantry %>% create_schema("hgnc_151120")
pantry %>% set_schema("hgnc_151120")

pantry %>% dplr::copy_to(
	genes2,
	"genes",
	temporary=F,
	indices=list(
		"symbol",
		"uniprot_accn",
		"uniprot_entry"))
momeara/BioChemPantry documentation built on Aug. 13, 2021, 9:46 p.m.