inst/data-raw/process/PMID31486854_Postigo-Fernandez-2019/process.R

# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
#
# BiocManager::install("org.Hs.eg.db")

library(readxl)
library(tidyr)
library(dplyr)

# First file is processed as a gene expression dataset
gx <- read_xlsx("Human LNSCs gene expression datasheet.xlsx", range = cell_rows(5:73)) %>%
  rename(ID = `NPOD No`) %>%
  filter(ID != "LIVENY") %>%
  select(-c(Aab, ETHNICITY, GENDER, AGE, GROUP))  %>%
  group_by(ID)

samples <- gx %>%
  summarize(n = n())

gx.mean <- gx %>%
  summarize_if(is.numeric, mean, na.rm = T) %>%
  rename_all(function(x) gsub("_fn1", "", x)) %>%
  mutate(ID = as.integer(ID))

# Normally don't keep sd if exported as medium/high-throughput expression dataset
gx.sd <- gx %>%
  summarize_if(is.numeric, list(SD = sd), na.rm = T)

#--------------------------------------------------------------------------------------------------------------#
# Note: loading the annotation package before using dplyr causes some problems with rename

library(org.Hs.eg.db)

hs <- org.Hs.eg.db
symbols <- colnames(gx.mean)[-1]
anno <- select(hs, keys = symbols, columns = c("ENTREZID", "SYMBOL"), keytype = "SYMBOL")

# Names/symbols without ID mapping because they're aliases
anno[is.na(anno$ENTREZID), 1]

# Manual human gene lookup results at https://www.ncbi.nlm.nih.gov/gene/
manual <- c(Insulin = 3630,
  GAD65 = 2572,
  `IA-2` = 5798,
  IGRP = 57818,
  ZNT8 = 169026,
  Arrestin = 408,
  Tyrosinase = 7299,
  MART1 = 2315,
  `TGF-b1` = 7040,
  `IL-10` = 3586,
  `IL-27a` = 246778,
  `IL-12p35/IL-35a` = 3592,
  `IL-1RA` = 3557,
  `Galectin-1` = 3956,
  `Galectin-9` = 3965,
  CD39 = 953,
  CD73 = 4907,
  `PD-L1/B7-H1` = 29126,
  `PD-L2/CD273/B7-DC` = 80380,
  `Semaphorin 4A` = 64218,
  `B7-H4` = 79679,
  `ICOS-L` = 23308,
  `HVEM/CD270` = 8764,
  ILT3 = 11006,
  ILT4 = 10288,
  FASL = 356,
  iNOS = 4843,
  A20 = 7128)

anno$ENTREZID[match(names(manual), anno$SYMBOL)] <- manual

names(gx.mean)[2:length(gx.mean)] <- anno$ENTREZID
gx.mean <- as.matrix(gx.mean)
write.table(gx.mean, "PMID31486854_1_Postigo-Fernandez-2019.tsv",  sep = "\t", quote = F, row.names = F)

# Second dataset
# IMPORTANT: for "%-HLA-DR" vaues (sheet 1), there are two rows for #6458
# (the only one with duplicated measurements), which turns out to be a typo.
# Confirmed with author that in the second set (row 35), instead of donor #6458, should be the donor #6459.
hla.dr.prct <- read_xlsx("MHCII & PD-L1.xlsx", sheet = 1, range = cell_rows(3:44)) %>%
  rename(ID = `NPOD No`) %>%
  filter(ID != "LIVENY") %>%
  select(-c(ETHNICITY, GENDER, AGE, GROUP))  %>%
  rename_at(2:5, function(x) paste0("HLA.DR.prct.", x))

# Verify the one entry that needs to be fixed
hla.dr.prct %>%
  group_by(ID) %>%
  filter(n() > 1)

hla.dr.prct$ID[duplicated(hla.dr.prct$ID)] <- "6459"

# "% PDL1" (sheet 2)
pdl1.prct <- read_xlsx("MHCII & PD-L1.xlsx", sheet = 2, range = cell_rows(3:21)) %>%
  rename(ID = `NPOD No`) %>%
  filter(ID != "LIVENY") %>%
  select(-c(ETHNICITY, GENDER, AGE, GROUP)) %>%
  rename_at(2:5, function(x) paste0("PDL1.prct.", x))

# File 3
SC.prct <- read_xlsx("Relative frequency of SCs.xlsx", sheet = 1, range = cell_rows(3:44)) %>%
  rename(ID = `NPOD No`) %>%
  filter(ID != "LIVENY") %>%
  select(-c(ETHNICITY, GENDER, AGE, GROUP))  %>%
  group_by(ID) %>%
  rename_all(function(x) gsub(" .*", ".prct", x))

dataset <- full_join(hla.dr.prct, pdl1.prct, by = "ID") %>%
  full_join(SC.prct, by = "ID")

write.table(dataset, "PMID31486854_2_Postigo-Fernandez-2019.tsv",  sep = "\t", quote = F, row.names = F)
avucoh/nPOD documentation built on April 1, 2020, 5:24 p.m.