# inst/data-raw/process/PMID29671030_Holm-2018/process.R

# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
#
# BiocManager::install("org.Hs.eg.db")

# Publication: https://pubmed.ncbi.nlm.nih.gov/29671030-abnormal-islet-sphingolipid-metabolism-in-type-1-diabetes/
# (on Pubmed page) Data availability: The RNA expression data is available online at
# https://www.dropbox.com/s/93mk5tzl5fdyo6b/Abnormal%20islet%20sphingolipid%20metabolism%20in%20type%201%20diabetes%2C%20RNA%20expression.xlsx?dl=0

library(org.Hs.eg.db)

# Handle to the human annotation database queried below
hs <- org.Hs.eg.db

# Read the expression table; its column names are used as gene symbols.
# check.names = FALSE keeps those names exactly as written in the file: the
# default would rewrite any name that is not a syntactically valid R name
# (e.g. one containing "-"), which would make the symbol lookup below miss it.
dat <- read.table(
  "PMID29671030_Holm18.tsv",
  header = TRUE,
  check.names = FALSE
)
symbols <- colnames(dat)

# Look up an Entrez ID for every column name, treating it as a gene symbol.
# The call is namespace-qualified so it cannot be masked by another attached
# package that also exports select() (e.g. dplyr).
anno <- AnnotationDbi::select(
  hs,
  keys = symbols,
  columns = c("ENTREZID", "SYMBOL"),
  keytype = "SYMBOL"
)

# Names/symbols without ID mappings because they're aliases
# (explicit print() so the rows are also shown when the script is source()d)
print(anno[is.na(anno$ENTREZID), ])

# Entrez Gene IDs obtained by manual human gene lookup at
# https://www.ncbi.nlm.nih.gov/gene/
# Each element is named after the column name it belongs to; trailing comments
# note why the automatic symbol lookup did not resolve that name.
manual <- c(
  ssSPTa     = 171546,
  ssSPTb     = 165679,
  CerS2      = 29956,
  CerS3      = 204219,
  CerS4      = 79603,
  CerS5      = 91012,
  CerS6      = 253782,
  CGT        = 7368,   # alias
  CST        = 9514,   # alias
  Prosaposin = 5660,   # should be PSAP
  FAPP2      = 84725,  # alias
  CERT       = 10087   # should be CERT1
)

# However, a few column names look like typos because they cannot be found
# by manual lookup, which means these gene symbols cannot be annotated to
# their Entrez IDs with certainty.
# Unless the author responds, these are filtered out:
# c(KDRS = 2531, # KDSR
#   VPS51 = 738,
#   VPS52 = 6293,
#   VPS53 = 55275,
#   VPS54 = 51542)

# Fill in the manually looked-up Entrez IDs for names that had no automatic
# mapping. Names in `manual` that do not occur in the annotation table are
# skipped with a warning, because an NA index is an error in a subscripted
# assignment.
manual_rows <- match(names(manual), anno$SYMBOL)
manual_found <- !is.na(manual_rows)
if (!all(manual_found)) {
  warning(
    "Manually mapped names not present in the data: ",
    paste(names(manual)[!manual_found], collapse = ", "),
    call. = FALSE
  )
}
# Convert via integer so every ID is stored as a plain digit string; a double
# such as 1e5 would otherwise be coerced to "1e+05".
anno$ENTREZID[manual_rows[manual_found]] <-
  as.character(as.integer(manual[manual_found]))

# Relabel each data column with its Entrez ID (NA where no ID is known)
colnames(dat) <- anno$ENTREZID[match(colnames(dat), anno$SYMBOL)]

# Remove columns with uncertain mapping
# (drop = FALSE keeps a data frame even if only one column remains)
dat <- dat[, !is.na(colnames(dat)), drop = FALSE]
xm2_t <- as.matrix(dat)

# export dataset
# avucoh/nPOD documentation built on April 1, 2020, 5:24 p.m.