inst/scripts/make-data-Weber-BCR-XL-sim-main.R

##########################################################################################
# Script to prepare benchmark dataset 'Weber-BCR-XL-sim-main'
# 
# See Weber et al. (2019), Supplementary Note 1 (paper introducing 'diffcyt' framework)
# for more details
# 
# The original 'BCR-XL' dataset is sourced from Bodenmiller et al. (2012), and was
# previously used for benchmark evaluations by Bruggner et al. (2014) (Citrus paper).
# 
# Raw data downloaded from Cytobank (experiment 15713)
# - see Citrus wiki (section 'PBMC Example 1'):
# https://github.com/nolanlab/citrus/wiki/PBMC-Example-1
# - direct link to Cytobank repository:
# https://community.cytobank.org/cytobank/experiments/15713/download_files
# 
# Cell population labels are reproduced from Nowicka et al. (2017), where they were
# generated using a strategy of expert-guided manual merging of automatically generated
# clusters from the FlowSOM algorithm. Code to reproduce the cell population labels is
# available in the script 'cell_population_labels_BCR_XL.R'.
# 
# The 'Weber-BCR-XL-sim' dataset in this script is generated as follows:
# - select unstimulated reference samples from the main 'BCR-XL' dataset (8 individuals)
# - randomly split each unstimulated sample into two halves
# - in one half, replace B cells with equivalent number of B cells from the corresponding
# paired sample from BCR-XL stimulated condition
# 
# Methods are then evaluated by their ability to detect the known strong differential
# expression signal of the functional marker pS6 in B cells.
# 
# Lukas Weber, Jul 2019
##########################################################################################


# original version of this script available at: https://github.com/lmweber/diffcyt-evaluations

# note: random number generators were changed in R version 3.6.0; we use 'RNGversion()' to
# set random number generators to R version 3.5.3 for reproducibility (see
# https://cran.r-project.org/doc/manuals/r-devel/NEWS.html)

suppressPackageStartupMessages({
  library(flowCore)
  library(SummarizedExperiment)
})

RNGversion("3.5.3")



# -------------
# Download data
# -------------

# create temporary directories
DIR_TMP <- "tmp"
dir.create(file.path(DIR_TMP))
dir.create(file.path(DIR_TMP, "fcs_files"))
dir.create(file.path(DIR_TMP, "population_IDs"))

# download from 'imlspenticton' server
URL <- "http://imlspenticton.uzh.ch/robinson_lab/HDCytoData"
DIR <- "Bodenmiller_BCR_XL"

# download .fcs files
fcs_filename <- "Bodenmiller_BCR_XL_fcs_files.zip"
download.file(file.path(URL, DIR, fcs_filename), destfile = file.path(DIR_TMP, "fcs_files", fcs_filename))
unzip(file.path(DIR_TMP, "fcs_files", fcs_filename), exdir = file.path(DIR_TMP, "fcs_files"))

# download population IDs
pop_filename <- "Bodenmiller_BCR_XL_population_IDs.zip"
download.file(file.path(URL, DIR, pop_filename), destfile = file.path(DIR_TMP, "population_IDs", pop_filename))
unzip(file.path(DIR_TMP, "population_IDs", pop_filename), exdir = file.path(DIR_TMP, "population_IDs"))



# ---------
# Filenames
# ---------

DIR_FCS <- file.path(DIR_TMP, "fcs_files")
DIR_LABELS <- file.path(DIR_TMP, "population_IDs")

# .fcs files
files_fcs <- list.files(DIR_FCS, pattern = "\\.fcs$", full.names = TRUE)
files_BCRXL <- files_fcs[grep("patient[1-8]_BCR-XL\\.fcs$", files_fcs)]
files_ref <- files_fcs[grep("patient[1-8]_Reference\\.fcs$", files_fcs)]
files_fcs_all <- c(files_BCRXL, files_ref)

# cell population labels
files_labels <- list.files(DIR_LABELS, pattern = "\\.csv$", full.names = TRUE)
files_labels_BCRXL <- files_labels[grep("patient[1-8]_BCR-XL\\.csv$", files_labels)]
files_labels_ref <- files_labels[grep("patient[1-8]_Reference\\.csv$", files_labels)]
files_labels_all <- c(files_labels_BCRXL, files_labels_ref)



# ---------
# Load data
# ---------

data <- lapply(files_fcs_all, function(f) exprs(read.FCS(f, transformation = FALSE, truncate_max_range = FALSE)))

# sample IDs
sample_IDs <- gsub("\\.fcs$", "", gsub("^PBMC8_30min_", "", basename(files_fcs_all)))
sample_IDs

names(data) <- sample_IDs

# conditions
conditions <- as.factor(gsub("^patient[0-9+]_", "", sample_IDs))
conditions

# patient IDs
patient_IDs <- as.factor(gsub("_.*$", "", sample_IDs))
patient_IDs

# cell population labels
labels <- lapply(files_labels_all, read.csv)

stopifnot(all(sapply(data, nrow) == sapply(labels, nrow)))

names(labels) <- names(data)



# ----------------------
# Delete temporary files
# ----------------------

unlink(DIR_TMP, recursive = TRUE)



# ---------------------------------------------------------------
# Split each reference sample into two halves: 'base' and 'spike'
# ---------------------------------------------------------------

data_ref <- data[conditions == "Reference"]
labels_ref <- labels[conditions == "Reference"]

names(data_ref) <- gsub("_Reference$", "", names(data_ref))
names(labels_ref) <- gsub("_Reference$", "", names(labels_ref))

stopifnot(all(sapply(data_ref, nrow) == sapply(labels_ref, nrow)))

n_cells_ref <- sapply(data_ref, nrow)

set.seed(100)

# generate random indices
inds <- lapply(n_cells_ref, function(n) {
  i_base <- sort(sample(seq_len(n), floor(n / 2)))
  i_spike <- setdiff(seq_len(n), i_base)
  list(base = i_base, spike = i_spike)
})

inds_base <- lapply(inds, function(l) l[[1]])
inds_spike <- lapply(inds, function(l) l[[2]])

# subset data
data_base <- mapply(function(d, i) d[i, , drop = FALSE], data_ref, inds_base, SIMPLIFY = FALSE)
data_spike <- mapply(function(d, i) d[i, , drop = FALSE], data_ref, inds_spike, SIMPLIFY = FALSE)

# subset labels
labels_base <- mapply(function(d, i) d[i, , drop = FALSE], labels_ref, inds_base, SIMPLIFY = FALSE)
labels_spike <- mapply(function(d, i) d[i, , drop = FALSE], labels_ref, inds_spike, SIMPLIFY = FALSE)

stopifnot(all(sapply(data_base, nrow) == sapply(labels_base, nrow)))
stopifnot(all(sapply(data_spike, nrow) == sapply(labels_spike, nrow)))



# -------------------------------------------------------------------------------------
# Replace B cells in 'spike' samples with an equivalent number of B cells from 'BCR-XL'
# (stimulated) condition
# -------------------------------------------------------------------------------------

# note: for some samples, not enough B cells are available; use all available B cells in
# this case


# B cells from 'BCR-XL' (stimulated) condition
data_BCRXL <- data[conditions == "BCR-XL"]
labels_BCRXL <- labels[conditions == "BCR-XL"]

names(data_BCRXL) <- gsub("_BCR-XL$", "", names(data_BCRXL))
names(labels_BCRXL) <- gsub("_BCR-XL$", "", names(labels_BCRXL))

B_cells_BCRXL <- mapply(function(d, l) {
  d[l$population %in% c("B-cells IgM-", "B-cells IgM+"), , drop = FALSE]
}, data_BCRXL, labels_BCRXL, SIMPLIFY = FALSE)

B_cells_labels_BCRXL <- lapply(labels_BCRXL, function(l) {
  l[l$population %in% c("B-cells IgM-", "B-cells IgM+"), , drop = FALSE]
})

stopifnot(all(sapply(B_cells_BCRXL, nrow) == sapply(B_cells_labels_BCRXL, nrow)))


# number of B cells available in stimulated condition
sapply(B_cells_BCRXL, nrow)

# total number of B cells in reference condition
n_spike_ref <- sapply(labels_ref, function(l) {
  sum(l$population %in% c("B-cells IgM-", "B-cells IgM+"))
})
n_spike_ref

# number of B cells needed
n_spike <- sapply(labels_spike, function(l) {
  sum(l$population %in% c("B-cells IgM-", "B-cells IgM+"))
})
n_spike


# select correct number of B cells from 'BCR-XL' (stimulated) condition for each sample

set.seed(100)

ixs <- mapply(function(b, n) {
  # reduce 'n' if not enough B cells available
  n <- min(n, nrow(b))
  sample(seq_len(nrow(b)), n)
}, B_cells_BCRXL, n_spike, SIMPLIFY = FALSE)

B_cells_spike <- mapply(function(b, ix) {
  b[ix, , drop = FALSE]
}, B_cells_BCRXL, ixs, SIMPLIFY = FALSE)

B_cells_labels_spike <- mapply(function(bl, ix) {
  bl[ix, , drop = FALSE]
}, B_cells_labels_BCRXL, ixs, SIMPLIFY = FALSE)

stopifnot(all(sapply(B_cells_spike, nrow) == sapply(B_cells_labels_spike, nrow)))

sapply(B_cells_spike, nrow)


# replace B cells in 'spike' samples

data_spike <- mapply(function(d, l, b) {
  stopifnot(nrow(d) == nrow(l))
  d <- d[!(l$population %in% c("B-cells IgM-", "B-cells IgM+")), , drop = FALSE]
  d <- rbind(d, b)
  rownames(d) <- NULL
  d
}, data_spike, labels_spike, B_cells_spike, SIMPLIFY = FALSE)

labels_spike <- mapply(function(l, bl) {
  l <- l[!(l$population %in% c("B-cells IgM-", "B-cells IgM+")), , drop = FALSE]
  l <- rbind(l, bl)
  rownames(l) <- NULL
  l
}, labels_spike, B_cells_labels_spike, SIMPLIFY = FALSE)

stopifnot(all(sapply(data_spike, nrow) == sapply(labels_spike, nrow)))



# ---------------
# Create metadata
# ---------------

data_combined <- c(data_base, data_spike)
labels_combined <- c(labels_base, labels_spike)

stopifnot(all(sapply(data_combined, nrow) == sapply(labels_combined, nrow)))

conditions_combined <- c(rep("base", length(data_base)), rep("spike", length(data_spike)))


# sample information

patient_id <- as.factor(names(data_combined))
patient_id

group_id <- factor(conditions_combined, levels = c("base", "spike"))
group_id

sample_id <- paste(patient_id, group_id, sep = "_")
sample_id <- factor(sample_id, levels = sample_id)
sample_id

experiment_info <- data.frame(group_id, patient_id, sample_id, stringsAsFactors = FALSE)
experiment_info


names(data_combined) <- sample_id
names(labels_combined) <- sample_id


# marker information

# indices of all marker columns, lineage markers, and functional markers
# (10 lineage markers / 14 functional markers; see Bruggner et al. 2014, Table 1)
cols_markers <- c(3:4, 7:9, 11:19, 21:22, 24:26, 28:31, 33)
cols_lineage <- c(3:4, 9, 11, 12, 14, 21, 29, 31, 33)
cols_func <- setdiff(cols_markers, cols_lineage)

stopifnot(all(sapply(seq_along(data), function(i) all(colnames(data[[i]]) == colnames(data[[1]])))))

# channel and marker names
channel_name <- as.character(colnames(data[[1]]))
marker_name <- gsub("\\(.*$", "", channel_name)

# marker classes
marker_class <- rep("none", length(marker_name))
marker_class[cols_lineage] <- "type"
marker_class[cols_func] <- "state"
marker_class <- factor(marker_class, levels = c("none", "type", "state"))

marker_info <- data.frame(channel_name, marker_name, marker_class, stringsAsFactors = FALSE)
marker_info



# ----------------------------------
# Create SummarizedExperiment object
# ----------------------------------

# set up row data
n_cells <- sapply(data_combined, nrow)

row_data <- as.data.frame(lapply(experiment_info, function(col) {
  as.factor(rep(col, n_cells))
}))

stopifnot(nrow(row_data) == sum(n_cells))

# add population IDs
population_id <- do.call("rbind", labels_combined)
rownames(population_id) <- NULL
colnames(population_id) <- "population_id"

stopifnot(nrow(population_id) == sum(n_cells))

row_data <- cbind(row_data, population_id)

# add column indicating B cells
row_data$B_cell <- row_data$population_id %in% c("B-cells IgM-", "B-cells IgM+")

# add column indicating spike-in cells (all B cells in 'spike' samples)
row_data$spikein <- (row_data$group_id == "spike") & (row_data$B_cell == TRUE)


# set up column data
col_data <- marker_info


# set up expression data
d_exprs <- do.call("rbind", data_combined)

stopifnot(nrow(d_exprs) == nrow(row_data))
stopifnot(ncol(d_exprs) == nrow(col_data))
stopifnot(all(colnames(d_exprs) == marker_info$channel_name))


# create SummarizedExperiment object
d_SE <- SummarizedExperiment(
  assays = list(exprs = d_exprs), 
  rowData = row_data, 
  colData = col_data, 
  metadata = list(experiment_info = experiment_info, n_cells = n_cells)
)



# ---------------------
# Create flowSet object
# ---------------------

# note: row data is stored as additional columns of data in the expression matrices;
# additional information from row data and column data (e.g. marker classes) is stored in
# 'description' slot

# extract data
row_data <- rowData(d_SE)
col_data <- colData(d_SE)
d_exprs <- assay(d_SE)
meta_data <- metadata(d_SE)

# create tables to identify row data values when converted to numeric
stopifnot(all(colnames(row_data) == c("group_id", "patient_id", "sample_id", "population_id", "B_cell", "spikein")))
group_info <- data.frame(
  group_id = seq_len(nlevels(row_data$group_id)), 
  group_name = levels(row_data$group_id), 
  stringsAsFactors = FALSE
)
patient_info <- data.frame(
  patient_id = seq_len(nlevels(row_data$patient_id)), 
  patient_name = levels(row_data$patient_id), 
  stringsAsFactors = FALSE
)
sample_info <- data.frame(
  sample_id = seq_len(nlevels(row_data$sample_id)), 
  sample_name = levels(row_data$sample_id), 
  stringsAsFactors = FALSE
)
population_info <- data.frame(
  population_id = seq_len(nlevels(row_data$population_id)), 
  population_name = levels(row_data$population_id), 
  stringsAsFactors = FALSE
)
B_cell_info <- data.frame(
  B_cell = c(0, 1), 
  B_cell_status = c("FALSE", "TRUE"), 
  stringsAsFactors = FALSE
)
spikein_info <- data.frame(
  spikein = c(0, 1), 
  spikein_status = c("FALSE", "TRUE"), 
  stringsAsFactors = FALSE
)

# create extra columns of data from row data
row_data_fs <- do.call("cbind", lapply(row_data, as.numeric))
stopifnot(all(colnames(row_data_fs) == colnames(row_data)))
stopifnot(nrow(row_data_fs) == nrow(d_exprs))

# create marker info
marker_info <- as.data.frame(col_data, row.names = seq_len(nrow(col_data)))

# create flowFrame
ff <- flowFrame(cbind(d_exprs, row_data_fs))
stopifnot(all(colnames(ff) == c(colnames(d_exprs), colnames(row_data_fs))))
# include both channel and marker names in 'pData(parameters(.))'
stopifnot(length(c(marker_info$marker_name, colnames(row_data_fs))) == nrow(pData(parameters(ff))))
pData(parameters(ff))$desc <- c(marker_info$marker_name, colnames(row_data_fs))

# include additional information in 'description' slot
description(ff)$GROUP_INFO <- group_info
description(ff)$PATIENT_INFO <- patient_info
description(ff)$SAMPLE_INFO <- sample_info
description(ff)$POPULATION_INFO <- population_info
description(ff)$B_CELL_INFO <- B_cell_info
description(ff)$SPIKEIN_INFO <- spikein_info
# experiment information and marker information
description(ff)$EXPERIMENT_INFO <- meta_data$experiment_info
description(ff)$MARKER_INFO <- marker_info

# create flowSet
d_flowSet <- flowSet(ff)



# ------------
# Save objects
# ------------

filename_SE <- "Weber_BCR_XL_sim_main_SE.rda"
filename_flowSet <- "Weber_BCR_XL_sim_main_flowSet.rda"

save(d_SE, file = filename_SE)
save(d_flowSet, file = filename_flowSet)
lmweber/HDCytoData documentation built on March 19, 2024, 4:41 a.m.