#!/usr/bin/env Rscript
## ---- input
root <- "~/projects/geneLists"
script <- "061_UniProt-Subcell-loc"
short_name <- "hsUniprotSubcell"
data_source <- "uniprot.org"
## ---- prepare the renv
devtools::load_all(root, quiet=TRUE)
# imports
suppressPackageStartupMessages({
library(dplyr)
library(data.table)
})
# create a query like:
# https://www.uniprot.org/uniprot/?query=*&fil=reviewed%3Ayes+AND+organism%3A%22Homo+sapiens+%28Human%29+%5B9606%5D%22
# then, download > all > compressed > tab separated
myfile <- file.path(root,"downloads","uniprot-subcell.tab.gz")
# NOTE: downloaded Sept 9th 2021
stopifnot(file.exists(myfile))
# if you have R.utils, you can read .gz directly!
df <- fread(myfile) %>%
# munge
dplyr::rename(Accession = "Entry") %>%
dplyr::rename(CC = "Subcellular location [CC]") %>%
select(Accession,CC) %>%
tidyr::separate_rows(CC,sep="\\;") %>%
filter(CC != "") %>%
mutate(CC = gsub("SUBCELLULAR LOCATION: ","", CC)) %>%
mutate(CC = trimws(CC))
# need to delimit two cases:
# multiple entries (sentences) separated by "."
# rows with notes
# collect rows with 'Note='
subdf <- df %>% dplyr::filter(grepl("Note=",CC))
all(grepl("Note",subdf$CC))
# the notes look really useful...
subdf <- subdf %>% tidyr::separate(CC,into=c("CC","Note"),sep="Note=")
# now we can seperate CC
subdf <- subdf %>%
tidyr::separate_rows(CC,sep="\\. ") %>%
filter(CC!="")
# there are multiple types of entries (all associated with notes)
# * just loc, e.g: 'Cytoplasm'
# * loc and evidence, e.g" 'Cytopasm {type|source}'
# NOTE: source can be pubmed id or another uniprot entry in which case the
# cellular location annotation is based on information from another protein
# see all types of evidence annotations (ECO) here:
# https://www.uniprot.org/help/evidences
subdf2 <- df %>%
dplyr::filter(!grepl("Note=",CC)) %>%
mutate(Note="") %>%
tidyr::separate_rows(CC,sep="\\. ") %>%
filter(CC!="")
head(subdf2)
# combine and more clean-up
res <- rbind(subdf,subdf2) %>%
mutate(CC = gsub("\\.","",CC)) # rm trailing "."
head(res)
# okay, now lets group things together
part1 <- res %>% dplyr::filter(grepl("\\{",CC)) # evidence
part2 <- res %>% dplyr::filter(!grepl("\\{",CC)) # no evidence
# to account for cases like the following row:
# "[Amyloid-beta protein 42]: Cell surface"
# we can get subcell annot from the part after []
subpart1 <- part1 %>%
filter(grepl("\\[",CC)) %>%
tidyr::separate(CC,into=c("Isoform","CC"),sep="\\[.*\\]") %>%
mutate(CC=gsub("\\:","",CC)) %>%
mutate(CC=trimws(CC)) %>%
# we loose isoform specific info... could be appended to note
select(-Isoform) %>%
tidyr::separate(CC,into=c("CC","Evidence"),sep="\\{") %>%
mutate(CC = trimws(CC)) %>%
mutate(Evidence=gsub("\\}","",Note))
subpart2 <- part1 %>%
filter(!grepl("\\[",CC)) %>%
tidyr::separate(CC,into=c("CC","Evidence"),sep="\\{") %>%
mutate(CC = trimws(CC)) %>%
filter(CC != "") %>%
mutate(Evidence=gsub("\\}","",Note))
# check how many locs
#unique(subpart1$CC)
#unique(subpart2$CC)
# now for the other half
# part2
# NOTE: evidence can be buried in note!
# we ignore this additional complexity for now
subpart3 <- part2 %>%
filter(grepl("\\[",CC)) %>%
tidyr::separate(CC,into=c("Isoform","CC"),sep="\\[.*\\]") %>%
mutate(CC=gsub("\\:","",CC)) %>%
mutate(CC=trimws(CC)) %>%
# we loose isoform specific info... could be appended to note
mutate(Evidence=NA) %>%
select(-Isoform) %>%
select(Accession,CC,Evidence,Note)
subpart4 <- part2 %>%
filter(!grepl("\\[",CC)) %>%
mutate(Evidence=NA) %>%
select(Accession,CC,Evidence,Note)
# whew!
uniprot_subcell <- rbind(subpart1,subpart2,subpart3,subpart4)
head(uniprot_subcell)
uniprot <- uniprot_subcell$Accession
# NOTE: uniprot_map is mouse!
entrez <- geneLists::getIDs(uniprot,"uniprot","entrez",species="human")
# collect data
uniprot_subcell <- uniprot_subcell %>%
mutate(Entrez = entrez) %>%
filter(Entrez != "") %>%
filter(!is.na(Entrez))
## ---- create gene list
gene_list <- split(uniprot_subcell$Entrez,uniprot_subcell$CC)
# rm length==1
drop <- which(sapply(gene_list,length) == 1)
gene_list <- gene_list[-drop]
message("Compiled n subcellular locations (pathways): ", length(gene_list))
# its not perfect...
df <- data.table(Pathway=names(gene_list),n=sapply(gene_list,length))
df %>% head() %>% knitr::kable()
# save
myfile <- file.path(root,"gmt",script)
write_gmt(gene_list,data_source,myfile)
documentDataset(myfile,short_name,file.path(root,"R"),file.path(root,"data"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.