inst/scripts/2_build-lists/061_UniProt-Subcell-loc.R

#!/usr/bin/env Rscript

## ---- input 

root <- "~/projects/geneLists"
script <- "061_UniProt-Subcell-loc"
short_name <- "hsUniprotSubcell"
data_source <- "uniprot.org"

## ---- prepare the renv

devtools::load_all(root, quiet=TRUE)

# imports
suppressPackageStartupMessages({
  library(dplyr)
  library(data.table)
})


# create a query like:
# https://www.uniprot.org/uniprot/?query=*&fil=reviewed%3Ayes+AND+organism%3A%22Homo+sapiens+%28Human%29+%5B9606%5D%22
# then, download > all > compressed > tab separated
myfile <- file.path(root,"downloads","uniprot-subcell.tab.gz")
# NOTE: downloaded Sept 9th 2021

stopifnot(file.exists(myfile))

# if you have R.utils, you can read .gz directly!
df <- fread(myfile) %>% 
	# munge
	dplyr::rename(Accession = "Entry") %>%
	dplyr::rename(CC = "Subcellular location [CC]") %>%
	select(Accession,CC) %>%
	tidyr::separate_rows(CC,sep="\\;") %>%
	filter(CC != "") %>%
	mutate(CC = gsub("SUBCELLULAR LOCATION: ","", CC)) %>%
	mutate(CC = trimws(CC))

# need to delimit two cases:
# multiple entries (sentences) separated by "."
#  rows with notes

# collect rows with 'Note='
subdf <- df %>% dplyr::filter(grepl("Note=",CC))

all(grepl("Note",subdf$CC))

# the notes look really useful...
subdf <- subdf %>% tidyr::separate(CC,into=c("CC","Note"),sep="Note=")

# now we can seperate CC
subdf <- subdf %>% 
	tidyr::separate_rows(CC,sep="\\. ") %>%
	filter(CC!="")

# there are multiple types of entries (all associated with notes)
# * just loc, e.g: 'Cytoplasm'
# * loc and evidence, e.g" 'Cytopasm {type|source}'
# NOTE: source can be pubmed id or another uniprot entry in which case the
# cellular location annotation is based on information from another protein
# see all types of evidence annotations (ECO) here:
# https://www.uniprot.org/help/evidences

subdf2 <- df %>% 
	dplyr::filter(!grepl("Note=",CC)) %>% 
	mutate(Note="") %>%
	tidyr::separate_rows(CC,sep="\\. ") %>%
	filter(CC!="")

head(subdf2)

# combine and more clean-up
res <- rbind(subdf,subdf2) %>% 
	mutate(CC = gsub("\\.","",CC)) # rm trailing "."

head(res)

# okay, now lets group things together
part1 <- res %>% dplyr::filter(grepl("\\{",CC)) # evidence
part2 <- res %>% dplyr::filter(!grepl("\\{",CC)) # no evidence


# to account for cases like the following row:
# "[Amyloid-beta protein 42]: Cell surface"
# we can get subcell annot from the part after []
subpart1 <- part1 %>% 
	filter(grepl("\\[",CC)) %>%
	tidyr::separate(CC,into=c("Isoform","CC"),sep="\\[.*\\]") %>%
	mutate(CC=gsub("\\:","",CC)) %>%
	mutate(CC=trimws(CC)) %>%
	# we loose isoform specific info... could be appended to note
	select(-Isoform) %>%
	tidyr::separate(CC,into=c("CC","Evidence"),sep="\\{") %>%
	mutate(CC = trimws(CC)) %>%
	mutate(Evidence=gsub("\\}","",Note))

subpart2 <- part1 %>%
	filter(!grepl("\\[",CC)) %>%
	tidyr::separate(CC,into=c("CC","Evidence"),sep="\\{") %>%
	mutate(CC = trimws(CC)) %>%
	filter(CC != "") %>%
	mutate(Evidence=gsub("\\}","",Note))

# check how many locs
#unique(subpart1$CC)
#unique(subpart2$CC)

# now for the other half
# part2

# NOTE: evidence can be buried in note!
# we ignore this additional complexity for now

subpart3 <- part2 %>% 
	filter(grepl("\\[",CC)) %>%
	tidyr::separate(CC,into=c("Isoform","CC"),sep="\\[.*\\]") %>%
	mutate(CC=gsub("\\:","",CC)) %>%
	mutate(CC=trimws(CC)) %>%
	# we loose isoform specific info... could be appended to note
	mutate(Evidence=NA) %>%
	select(-Isoform) %>%
	select(Accession,CC,Evidence,Note)

subpart4 <- part2 %>% 
	filter(!grepl("\\[",CC)) %>%
	mutate(Evidence=NA) %>%
	select(Accession,CC,Evidence,Note)


# whew!
uniprot_subcell <- rbind(subpart1,subpart2,subpart3,subpart4)

head(uniprot_subcell)

uniprot <- uniprot_subcell$Accession

# NOTE: uniprot_map is mouse!
entrez <- geneLists::getIDs(uniprot,"uniprot","entrez",species="human")

# collect data
uniprot_subcell <- uniprot_subcell %>% 
	mutate(Entrez = entrez) %>% 
	filter(Entrez != "") %>%
	filter(!is.na(Entrez))


## ---- create gene list

gene_list <- split(uniprot_subcell$Entrez,uniprot_subcell$CC)

# rm length==1
drop <- which(sapply(gene_list,length) == 1)

gene_list <- gene_list[-drop]

message("Compiled n subcellular locations (pathways): ", length(gene_list))

# its not perfect...
df <- data.table(Pathway=names(gene_list),n=sapply(gene_list,length))
df %>% head() %>% knitr::kable()

# save
myfile <- file.path(root,"gmt",script)
write_gmt(gene_list,data_source,myfile)
documentDataset(myfile,short_name,file.path(root,"R"),file.path(root,"data"))
twesleyb/geneLists documentation built on Oct. 30, 2021, 7:28 p.m.