knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  collapse = TRUE,
  comment = "#>",
  fig.align = "center"
)

Building reference matrix from single cell expression matrix

In its simplest form, a reference matrix is built by averaging expression (also includes an option to take the median) of a single cell RNA-seq expression matrix by cluster. Both log transformed or raw count matrices are supported.

library(clustifyr)
library(clustifyrdata)

new_ref_matrix <- average_clusters(
  mat = pbmc_matrix,
  metadata = pbmc_meta$classified, # or use metadata = pbmc_meta, cluster_col = "classified"
  if_log = TRUE
)

head(new_ref_matrix)

Building reference matrix from seurat object

For further convenience, a shortcut function for generating reference matrix from seurat object is used here.

new_ref_matrix_v2 <- seurat_ref(
  seurat_object = s_small,
  cluster_col = "res.1"
)

new_ref_matrix_v3 <- seurat_ref(
  seurat_object = s_small3,
  cluster_col = "RNA_snn_res.1"
)

tail(new_ref_matrix_v3)

If given additional assay_name, output ref will include features from the designated slots (such as for CITE-seq data).

new_ref_matrix_v2 <- seurat_ref(
  seurat_object = s_small,
  cluster_col = "res.1",
  assay_name = c("ADT", "ADT2")
)

new_ref_matrix_v3 <- seurat_ref(
  seurat_object = s_small3,
  cluster_col = "RNA_snn_res.1",
  assay_name = c("ADT", "ADT2")
)

tail(new_ref_matrix_v3)

Building reference matrix from Recount2

Bulk RNA-Seq data can be obtained from any input source; in this example we will obtain a dataset from the recount2 database. This database provides > 2000 human RNA-Seq experiments that have been processed using a consistent pipeline. We have written a wrapper function to download a count matrix from recount2, given an SRA ID.

library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
library(recount2)

dl_recount <- function(sra_id) {
  if (!file.exists(file.path(sra_id, "rse_gene.Rdata"))) {
    download_study(sra_id)
  }

  load(file.path(sra_id, "rse_gene.Rdata"))

  # no longer need to downloaded data
  unlink(sra_id, recursive = TRUE)
  rse <- scale_counts(rse_gene)
  read_counts <- assay(rse, "counts")
  gene_ids <- rownames(read_counts)

  # get gene symbols, which are stored in rowData
  id2symbol <- data_frame(
    ids = rowData(rse_gene)$gene_id,
    symbols = rowData(rse_gene)$symbol@listData
  ) %>%
    mutate(symbols = map_chr(symbols, ~ .x[1]))

  # clean up metadata into a dataframe
  print("cleaning up meta")
  mdata <- colData(rse)
  mdata_cols <- lapply(
    mdata$characteristics,
    function(x) {
      str_match(x, "^([^:]+):")[, 2]
    }
  ) %>%
    unique() %>%
    unlist()

  mdata <- data_frame(
    run = mdata$run,
    all_data = as.list(mdata$characteristics)
  ) %>%
    mutate(out = purrr::map_chr(all_data, ~ str_c(.x, collapse = "::"))) %>%
    tidyr::separate(
      out,
      sep = "::",
      into = mdata_cols
    ) %>%
    select(-all_data) %>%
    mutate_at(
      .vars = vars(-matches("run")),
      .funs = function(x) str_match(x, ": (.+)")[, 2]
    )

  # convert ids to symbols
  row_ids_to_symbols <- left_join(data_frame(ids = gene_ids), id2symbol, by = "ids")

  if (length(gene_ids) != nrow(row_ids_to_symbols)) {
    warning("gene id mapping to symbols produce more or less ids")
  }

  row_ids_to_symbols <- filter(row_ids_to_symbols, !is.na(symbols))

  out_df <- read_counts %>%
    as.data.frame() %>% 
    tibble::rownames_to_column("gene_id") %>%
    left_join(., row_ids_to_symbols, by = c("gene_id" = "ids")) %>%
    dplyr::select(-gene_id) %>%
    dplyr::select(symbols, everything()) %>%
    filter(!is.na(symbols))

  out_matrix <- tidyr::gather(out_df, library, expr, -symbols) %>%
    group_by(symbols, library) %>%
    summarize(expr = sum(expr)) %>%
    tidyr::spread(library, expr) %>%
    as.data.frame()
  if (tibble::has_rownames(out_matrix)) {
    out_matrix <- tibble::remove_rownames(out_matrix)
  }
  out_matrix <- out_matrix %>% tibble::column_to_rownames("symbols") %>%
    as.matrix()

  list(
    read_counts = out_matrix,
    meta_data = mdata
  )
}

gtex_data <- dl_recount("SRP012682")

gtex_data$read_counts[1:5, 1:5]
gtex_data$meta_data[1:5, ]

Building reference matrix from microarray data

clustifyr works with microarray data-derived gene expression matrix as well (see Benchmark page). An example of generating a matrix for sorted immune cell type can be found below. The matrix can be found at clustifyrdata::hema_microarray_matrix

library(tidyverse)
# read series matrix for column names
GSE24759 <- read_tsv("GSE24759/GSE24759_series_matrix.txt", skip = 25, n_max = 1)

# read series matrix for full dataset; apply column names
GSE24759 <- read_tsv("GSE24759//GSE24759_series_matrix.txt", skip = 57, col_names = colnames(GSE24759)) %>% rename(ID = `!Sample_title`)

# read array metadata table, remove control probes and missing gene symbols
GPL4685 <- read_tsv("/Users/rf/microarray/GSE24759/GPL4685-15513.txt", skip = 14) %>%
  filter(!is.na(`Gene Symbol`), `Sequence Type` != "Control sequence") %>%
  separate(`Gene Symbol`, into = "gene_symbol", sep = " ") %>%
  select(ID, gene_symbol)

# join data and metadata, collapse gene symbols
GSE24759 <- inner_join(GSE24759, GPL4685) %>%
  select(-ID) %>%
  group_by(gene_symbol) %>%
  mutate_at(.vars = vars(-gene_symbol), .funs = list(~ log2(mean(2^.))))

# convert to matrix, add rownames
GSE24759_mat <- ungroup(GSE24759) %>%
  select(-gene_symbol) %>%
  as.matrix()
row.names(GSE24759_mat) <- GSE24759$gene_symbol

# merge samples
ref_hema_microarray <- GSE24759_mat %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  mutate(sample = str_remove(sample, ", .*")) %>%
  group_by(sample) %>%
  summarise_all(.funs = list(~ log2(mean(2^.))))

if (tibble::has_rownames(ref_hema_microarray)) {
  ref_hema_microarray <- tibble::remove_rownames(ref_hema_microarray)
}
ref_hema_microarray <- ref_hema_microarray %>%
  column_to_rownames("sample") %>%
  t()

Essentially, any data in matrix form is accepted by clustifyR.

clustifyrdata::ref_hema_microarray[1:5, 1:5]

Building reference gene list

clustify_lists and clustify_nudge uses dataframe or matrix of candidate genes for classification. Example formatting are shown below.

# directly from seurat function
head(pbmc_markers)

# manually enter into dataframe format
betam <- c("INS", "IGF2", "IAPP", "MAFA", "NPTX2")
alpham <- c("GCG", "PDK4", "LOXL2", "IRX2", "GC")
deltam <- c("SST", "RBP4", "HHEX", "PCSK1", "LEPR")
data.frame(alpham, betam, deltam)

# use matrixize_markers function to convert
mm <- matrixize_markers(
  marker_df = pbmc_markers,
  remove_rp = TRUE,
  unique = TRUE,
  n = 10
)
head(mm)

# get markers from ref_matrix, and then matrixize_markers
cbmc_m <- matrixize_markers(
  marker_df = ref_marker_select(cbmc_ref),
  remove_rp = TRUE,
  unique = TRUE,
  n = 3
)

# parse `garnett` marker files, see function `file_marker_parse`

Prebuilt references, in clustifyr

A few reference datasets are built into clustifyr for vignettes and testing: pbmc_bulk_matrix, cbmc_ref, pbmc_markers, cbmc_m

Prebuilt references, in clustifyrdata

More reference data, including tabula muris, and code used to generate them are available at https://github.com/rnabioco/clustifyrdata.

Also see list for individual downloads at https://rnabioco.github.io/clustifyr/articles/download_refs.html

```{"Installation"}

install.packages("devtools")

devtools::install_github("rnabioco/clustifyrdata") ```



rnabioco/clustifyrdata documentation built on Nov. 8, 2022, 4:26 a.m.