# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("org.Hs.eg.db")
library(readxl)
library(tidyr)
library(dplyr)
# First file is processed as a gene expression dataset
gx <- read_xlsx("Human LNSCs gene expression datasheet.xlsx", range = cell_rows(5:73)) %>%
rename(ID = `NPOD No`) %>%
filter(ID != "LIVENY") %>%
select(-c(Aab, ETHNICITY, GENDER, AGE, GROUP)) %>%
group_by(ID)
samples <- gx %>%
summarize(n = n())
gx.mean <- gx %>%
summarize_if(is.numeric, mean, na.rm = T) %>%
rename_all(function(x) gsub("_fn1", "", x)) %>%
mutate(ID = as.integer(ID))
# Normally don't keep sd if exported as medium/high-throughput expression dataset
gx.sd <- gx %>%
summarize_if(is.numeric, list(SD = sd), na.rm = T)
#--------------------------------------------------------------------------------------------------------------#
# Note: loading the annotation package before using dplyr causes some problems with rename
library(org.Hs.eg.db)
hs <- org.Hs.eg.db
symbols <- colnames(gx.mean)[-1]
anno <- select(hs, keys = symbols, columns = c("ENTREZID", "SYMBOL"), keytype = "SYMBOL")
# Names/symbols without ID mapping because they're aliases
anno[is.na(anno$ENTREZID), 1]
# Manual human gene lookup results at https://www.ncbi.nlm.nih.gov/gene/
manual <- c(Insulin = 3630,
GAD65 = 2572,
`IA-2` = 5798,
IGRP = 57818,
ZNT8 = 169026,
Arrestin = 408,
Tyrosinase = 7299,
MART1 = 2315,
`TGF-b1` = 7040,
`IL-10` = 3586,
`IL-27a` = 246778,
`IL-12p35/IL-35a` = 3592,
`IL-1RA` = 3557,
`Galectin-1` = 3956,
`Galectin-9` = 3965,
CD39 = 953,
CD73 = 4907,
`PD-L1/B7-H1` = 29126,
`PD-L2/CD273/B7-DC` = 80380,
`Semaphorin 4A` = 64218,
`B7-H4` = 79679,
`ICOS-L` = 23308,
`HVEM/CD270` = 8764,
ILT3 = 11006,
ILT4 = 10288,
FASL = 356,
iNOS = 4843,
A20 = 7128)
anno$ENTREZID[match(names(manual), anno$SYMBOL)] <- manual
names(gx.mean)[2:length(gx.mean)] <- anno$ENTREZID
gx.mean <- as.matrix(gx.mean)
write.table(gx.mean, "PMID31486854_1_Postigo-Fernandez-2019.tsv", sep = "\t", quote = F, row.names = F)
# Second dataset
# IMPORTANT: for "%-HLA-DR" vaues (sheet 1), there are two rows for #6458
# (the only one with duplicated measurements), which turns out to be a typo.
# Confirmed with author that in the second set (row 35), instead of donor #6458, should be the donor #6459.
hla.dr.prct <- read_xlsx("MHCII & PD-L1.xlsx", sheet = 1, range = cell_rows(3:44)) %>%
rename(ID = `NPOD No`) %>%
filter(ID != "LIVENY") %>%
select(-c(ETHNICITY, GENDER, AGE, GROUP)) %>%
rename_at(2:5, function(x) paste0("HLA.DR.prct.", x))
# Verify the one entry that needs to be fixed
hla.dr.prct %>%
group_by(ID) %>%
filter(n() > 1)
hla.dr.prct$ID[duplicated(hla.dr.prct$ID)] <- "6459"
# "% PDL1" (sheet 2)
pdl1.prct <- read_xlsx("MHCII & PD-L1.xlsx", sheet = 2, range = cell_rows(3:21)) %>%
rename(ID = `NPOD No`) %>%
filter(ID != "LIVENY") %>%
select(-c(ETHNICITY, GENDER, AGE, GROUP)) %>%
rename_at(2:5, function(x) paste0("PDL1.prct.", x))
# File 3
SC.prct <- read_xlsx("Relative frequency of SCs.xlsx", sheet = 1, range = cell_rows(3:44)) %>%
rename(ID = `NPOD No`) %>%
filter(ID != "LIVENY") %>%
select(-c(ETHNICITY, GENDER, AGE, GROUP)) %>%
group_by(ID) %>%
rename_all(function(x) gsub(" .*", ".prct", x))
dataset <- full_join(hla.dr.prct, pdl1.prct, by = "ID") %>%
full_join(SC.prct, by = "ID")
write.table(dataset, "PMID31486854_2_Postigo-Fernandez-2019.tsv", sep = "\t", quote = F, row.names = F)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.