## File: data-raw/process-shogoin.R

## Processing the Shogoin protein cell marker database
##
## Source: http://shogoin.stemcellinformatics.org/cell_marker/literatures
## Copy the table from the page and save it as a tab-delimited text file
##
## Manual fixes:
## - replaced "Immune System/blood Cells" with "Immune System/blood Cell"
## - replaced "Activated Bcell" with "Activated B Cell"
## - replaced "Immune System/blood Cells: Lymphoid Cell" with "Immune System/blood Cell" and "Golgi Apparatus" with "Blood" on the same line
## 

library(here)
library(org.Hs.eg.db)
library(tidyverse)

## Pairwise Jaccard overlap between every pair of named sets in a list.
##
## NOTE: despite the "dist" in the name (kept for existing callers), the value
## computed is the Jaccard index |A n B| / |A u B|: 1 means identical sets and
## 0 means disjoint sets. Use 1 - value if a true distance is needed.
##
## Arguments:
##   l    - named list of vectors; each element is treated as a set.
##   wide - FALSE (default) returns a long data.frame with columns Var1, Var2
##          and jaccard_dist, one row per ordered pair of names. TRUE returns
##          a square data.frame whose row and column names are names(l).
##
## Two empty sets are scored 1 (they are identical) instead of 0/0 = NaN, so
## the result never contains NaN and is safe to hand to clustering code.
.calc_jaccard_dist <- function(l, wide = FALSE) {
    grid <- expand.grid(names(l), names(l), stringsAsFactors = FALSE)
    dists <- vapply(seq_len(nrow(grid)), function(i) {
        a <- l[[grid$Var1[i]]]
        b <- l[[grid$Var2[i]]]
        n_union <- length(union(a, b))
        ## Guard the empty-vs-empty comparison, which would otherwise be 0/0
        if (n_union == 0) {
            return(1)
        }
        length(intersect(a, b)) / n_union
    }, numeric(1))
    df <- data.frame(grid, jaccard_dist = dists)

    if (isTRUE(wide)) {
        ## One row per Var2, one column per Var1; the first column holds the
        ## Var2 labels, which become both the row and the column names
        jd <- reshape(df, direction = 'wide', idvar = 'Var2', timevar = 'Var1')
        jdm <- jd[, -1]
        rownames(jdm) <- jd[, 1]
        colnames(jdm) <- jd[, 1]
        jdm
    } else {
        df
    }
}

## Read in the database (tab-delimited export described in the file header)
raw <- readr::read_tsv(here('data-raw/2018-12-05_shogoin-cellmarker-database.txt'))

## Convert IDs to symbol for readability
## tidyverse is attached after org.Hs.eg.db, so a bare select() resolves to
## dplyr::select(), which has no method for an OrgDb object. Call the
## AnnotationDbi generic explicitly. Keys are de-duplicated and NA-free so the
## lookup table has no redundant rows to fan out in later joins.
uniprot <- raw$Uniprot_ID
symbol <- AnnotationDbi::select(org.Hs.eg.db,
                                keys = unique(uniprot[!is.na(uniprot)]),
                                keytype = 'UNIPROT',
                                columns = c('UNIPROT', 'SYMBOL'))

## Create a new list for comparison
## One row per (location, cell type, subclass) combination; the `data` list
## column is reduced to the sorted, non-missing UniProt IDs of that combination.
tmp <- raw %>%
    dplyr::select(Cell_Type_Name, Cell_Type_Subclass, Anatomical_Location, Uniprot_ID) %>%
    group_by(Anatomical_Location, Cell_Type_Name, Cell_Type_Subclass) %>%
    nest() %>%
    mutate(data = map(data, function(x) {
        y <- sort(x$Uniprot_ID)
        y[!is.na(y)]
    })) %>%
    ungroup()  # grouping is only needed for nest(); drop it before editing columns
## Label each ID set as "<location>_<cell type>_<subclass>"
names(tmp$data) <- paste(tmp$Anatomical_Location, tmp$Cell_Type_Name, tmp$Cell_Type_Subclass, sep = '_')

## Compare annotated cells by jaccard distance
## pheatmap needs a numeric matrix, so request the square (wide) layout; the
## long layout carries two character columns and cannot be plotted.
jd <- .calc_jaccard_dist(tmp$data, wide = TRUE)


## NOTE(review): the output path is user-specific; consider a project-relative
## location if this script is meant to be re-run by others.
pdf('~/Desktop/jd.pdf', height = 25, width = 25)
## Always close the PDF device, even if plotting fails, so later plots do not
## end up in a half-written file.
tryCatch(
    pheatmap::pheatmap(as.matrix(jd), clustering_method = 'ward.D2',
                       fontsize_row = 3, fontsize_col = 3),
    finally = dev.off()
)


## Number of database rows per anatomical location, smallest category first
raw$Anatomical_Location %>%
    table() %>%
    sort()

## Keep only the entries annotated to blood and tabulate their subclasses
tmp <- filter(raw, Anatomical_Location == 'Blood')
tmp$Cell_Type_Subclass %>%
    table() %>%
    sort()


## Number of distinct gene symbols per blood cell subclass, ascending.
## IDs without a symbol are discarded before de-duplicating.
tmp %>%
    dplyr::select(Cell_Type_Subclass, Uniprot_ID) %>%
    inner_join(symbol, by = c('Uniprot_ID' = 'UNIPROT')) %>%
    dplyr::select(-Uniprot_ID) %>%
    filter(!is.na(SYMBOL)) %>%
    distinct() %>%
    group_by(Cell_Type_Subclass) %>%
    count() %>%
    arrange(n) %>%
    print(n = Inf)




    
## Source page footer: robertamezquita/Inkwell documentation built on May 17, 2019, 10:13 a.m.