## Processing the Shogoin protein cell marker database
##
## Source: http://shogoin.stemcellinformatics.org/cell_marker/literatures
## Copy the table and save it as a tab-delimited text file
##
## Manual fixes:
## - replaced "Immune System/blood Cells" with "Immune System/blood Cell"
## - replaced "Activated Bcell" with "Activated B Cell"
## - replaced "Immune System/blood Cells: Lymphoid Cell" with "Immune System/blood Cell" and "Golgi Apparatus" with "Blood" on the same line
##
library(here)
library(org.Hs.eg.db)
library(tidyverse)
## Pairwise Jaccard index between all named sets in a list.
##
## NOTE: despite the function and column name ("jaccard_dist"), the value
## computed is the Jaccard *similarity* |A n B| / |A u B| (1 = identical sets,
## 0 = disjoint sets), not the distance 1 - similarity. The name is kept so
## existing callers and column references continue to work.
##
## Arguments:
##   l     A list of vectors (sets) with unique, non-empty names. An empty list
##         is accepted and yields an empty result.
##   wide  If TRUE, return a square data frame (rows/columns named after the
##         sets); otherwise return a long data frame with one row per pair.
##
## Returns:
##   wide = FALSE: data frame with columns Var1, Var2, jaccard_dist covering
##                 every ordered pair of set names (including self-pairs).
##   wide = TRUE:  length(l) x length(l) data frame of the same values.
##
## Two empty sets are treated as identical (value 1) instead of yielding
## NaN from 0/0, so the diagonal is always 1.
.calc_jaccard_dist <- function(l, wide = FALSE) {
  set_names <- names(l)
  if (length(l) == 0L) {
    set_names <- character(0)
  } else if (is.null(set_names) || anyNA(set_names) ||
             any(set_names == "") || anyDuplicated(set_names) > 0L) {
    ## Sets are looked up by name, so missing or repeated names would silently
    ## give an empty or ambiguous result.
    stop("`l` must be a list with unique, non-empty names", call. = FALSE)
  }

  ## Var1 varies fastest, so `sims` below is in column-major order for a
  ## matrix whose rows and columns both follow `set_names`.
  grid <- expand.grid(Var1 = set_names, Var2 = set_names,
                      stringsAsFactors = FALSE)

  jaccard <- function(x, y) {
    n_union <- length(union(x, y))
    if (n_union == 0L) {
      return(1)
    }
    length(intersect(x, y)) / n_union
  }

  sims <- vapply(
    seq_len(nrow(grid)),
    function(i) jaccard(l[[grid$Var1[i]]], l[[grid$Var2[i]]]),
    numeric(1)
  )

  df <- data.frame(grid, jaccard_dist = sims, stringsAsFactors = FALSE)
  if (!isTRUE(wide)) {
    return(df)
  }

  ## Build the square table directly; unlike `jd[, -1]` this cannot collapse
  ## to a bare vector when there is only one set.
  n_sets <- length(set_names)
  as.data.frame(
    matrix(sims, nrow = n_sets, ncol = n_sets,
           dimnames = list(set_names, set_names)),
    stringsAsFactors = FALSE
  )
}
## Read in the database
raw <- readr::read_tsv(here('data-raw/2018-12-05_shogoin-cellmarker-database.txt'))

## Convert IDs to symbol for readability.
## The call is namespaced on purpose: tidyverse is attached after org.Hs.eg.db,
## so a bare `select()` resolves to dplyr::select(), which has no method for
## an OrgDb object. Keys are de-duplicated and NA-free for the lookup.
uniprot <- raw$Uniprot_ID
symbol <- AnnotationDbi::select(org.Hs.eg.db,
                                keys = unique(uniprot[!is.na(uniprot)]),
                                keytype = 'UNIPROT',
                                columns = c('UNIPROT', 'SYMBOL'))

## Create a new list for comparison: one sorted, NA-free vector of UniProt IDs
## per (anatomical location, cell type, subclass) combination
tmp <- raw %>%
  dplyr::select(Cell_Type_Name, Cell_Type_Subclass, Anatomical_Location, Uniprot_ID) %>%
  group_by(Anatomical_Location, Cell_Type_Name, Cell_Type_Subclass) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, function(x) {
    y <- sort(x$Uniprot_ID)
    y[!is.na(y)]
  }))
names(tmp$data) <- paste(tmp$Anatomical_Location, tmp$Cell_Type_Name, tmp$Cell_Type_Subclass, sep = '_')

## Compare annotated cells by pairwise Jaccard index.
## pheatmap needs a numeric matrix, so request the square (wide) form rather
## than the long pair table.
jd <- .calc_jaccard_dist(tmp$data, wide = TRUE)
pdf('~/Desktop/jd.pdf', height = 25, width = 25)
pheatmap::pheatmap(as.matrix(jd), clustering_method = 'ward.D2', fontsize_row = 3, fontsize_col = 3)
dev.off()

## Number of database rows per anatomical location
raw$Anatomical_Location %>% table() %>% sort()

## Restrict to blood and tabulate rows per cell type subclass
tmp <- raw %>% filter(Anatomical_Location == 'Blood')
tmp$Cell_Type_Subclass %>% table() %>% sort()

## Number of distinct marker gene symbols per blood cell subclass
tmp %>%
  dplyr::select(Cell_Type_Subclass, Uniprot_ID) %>%
  inner_join(symbol, by = c('Uniprot_ID' = 'UNIPROT')) %>%
  dplyr::select(-Uniprot_ID) %>%
  distinct() %>%
  filter(!is.na(SYMBOL)) %>%
  group_by(Cell_Type_Subclass) %>%
  count() %>%
  arrange(n) %>%
  print(n = Inf)
## (Residue from the hosting web page, not part of the script:)
## Add the following code to your website.
## For more information on customizing the embed code, read Embedding Snippets.