In rnabioco/clustifyrdata: External data sets for clustifyr

knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  collapse = TRUE,
  comment = "#>",
  fig.align = "center"
)

Building reference matrix from single cell expression matrix

In its simplest form, a reference matrix is built by averaging expression (also includes an option to take the median) of a single cell RNA-seq expression matrix by cluster. Both log transformed or raw count matrices are supported.

library(clustifyr)
library(clustifyrdata)

new_ref_matrix <- average_clusters(
  mat = pbmc_matrix,
  metadata = pbmc_meta$classified, # or use metadata = pbmc_meta, cluster_col = "classified"
  if_log = TRUE
)

head(new_ref_matrix)

Building reference matrix from `seurat` object

For further convenience, a shortcut function for generating reference matrix from seurat object is used here.

new_ref_matrix_v2 <- seurat_ref(
  seurat_object = s_small,
  cluster_col = "res.1"
)

new_ref_matrix_v3 <- seurat_ref(
  seurat_object = s_small3,
  cluster_col = "RNA_snn_res.1"
)

tail(new_ref_matrix_v3)

If given additional assay_name, output ref will include features from the designated slots (such as for CITE-seq data).

new_ref_matrix_v2 <- seurat_ref(
  seurat_object = s_small,
  cluster_col = "res.1",
  assay_name = c("ADT", "ADT2")
)

new_ref_matrix_v3 <- seurat_ref(
  seurat_object = s_small3,
  cluster_col = "RNA_snn_res.1",
  assay_name = c("ADT", "ADT2")
)

tail(new_ref_matrix_v3)

Building reference matrix from Recount2

Bulk RNA-Seq data can be obtained from any input source; in this example we will obtain a dataset from the recount2 database. This database provides > 2000 human RNA-Seq experiments that have been processed using a consistent pipeline. We have written a wrapper function to download a count matrix from recount2, given an SRA ID.

library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
library(recount2)

dl_recount <- function(sra_id) {
  if (!file.exists(file.path(sra_id, "rse_gene.Rdata"))) {
    download_study(sra_id)
  }

  load(file.path(sra_id, "rse_gene.Rdata"))

  # no longer need to downloaded data
  unlink(sra_id, recursive = TRUE)
  rse <- scale_counts(rse_gene)
  read_counts <- assay(rse, "counts")
  gene_ids <- rownames(read_counts)

  # get gene symbols, which are stored in rowData
  id2symbol <- data_frame(
    ids = rowData(rse_gene)$gene_id,
    symbols = rowData(rse_gene)$symbol@listData
  ) %>%
    mutate(symbols = map_chr(symbols, ~ .x[1]))

  # clean up metadata into a dataframe
  print("cleaning up meta")
  mdata <- colData(rse)
  mdata_cols <- lapply(
    mdata$characteristics,
    function(x) {
      str_match(x, "^([^:]+):")[, 2]
    }
  ) %>%
    unique() %>%
    unlist()

  mdata <- data_frame(
    run = mdata$run,
    all_data = as.list(mdata$characteristics)
  ) %>%
    mutate(out = purrr::map_chr(all_data, ~ str_c(.x, collapse = "::"))) %>%
    tidyr::separate(
      out,
      sep = "::",
      into = mdata_cols
    ) %>%
    select(-all_data) %>%
    mutate_at(
      .vars = vars(-matches("run")),
      .funs = function(x) str_match(x, ": (.+)")[, 2]
    )

  # convert ids to symbols
  row_ids_to_symbols <- left_join(data_frame(ids = gene_ids), id2symbol, by = "ids")

  if (length(gene_ids) != nrow(row_ids_to_symbols)) {
    warning("gene id mapping to symbols produce more or less ids")
  }

  row_ids_to_symbols <- filter(row_ids_to_symbols, !is.na(symbols))

  out_df <- read_counts %>%
    as.data.frame() %>% 
    tibble::rownames_to_column("gene_id") %>%
    left_join(., row_ids_to_symbols, by = c("gene_id" = "ids")) %>%
    dplyr::select(-gene_id) %>%
    dplyr::select(symbols, everything()) %>%
    filter(!is.na(symbols))

  out_matrix <- tidyr::gather(out_df, library, expr, -symbols) %>%
    group_by(symbols, library) %>%
    summarize(expr = sum(expr)) %>%
    tidyr::spread(library, expr) %>%
    as.data.frame()
  if (tibble::has_rownames(out_matrix)) {
    out_matrix <- tibble::remove_rownames(out_matrix)
  }
  out_matrix <- out_matrix %>% tibble::column_to_rownames("symbols") %>%
    as.matrix()

  list(
    read_counts = out_matrix,
    meta_data = mdata
  )
}

gtex_data <- dl_recount("SRP012682")

gtex_data$read_counts[1:5, 1:5]
gtex_data$meta_data[1:5, ]

Building reference matrix from microarray data

clustifyr works with microarray data-derived gene expression matrix as well (see Benchmark page). An example of generating a matrix for sorted immune cell type can be found below. The matrix can be found at clustifyrdata::hema_microarray_matrix

library(tidyverse)
# read series matrix for column names
GSE24759 <- read_tsv("GSE24759/GSE24759_series_matrix.txt", skip = 25, n_max = 1)

# read series matrix for full dataset; apply column names
GSE24759 <- read_tsv("GSE24759//GSE24759_series_matrix.txt", skip = 57, col_names = colnames(GSE24759)) %>% rename(ID = `!Sample_title`)

# read array metadata table, remove control probes and missing gene symbols
GPL4685 <- read_tsv("/Users/rf/microarray/GSE24759/GPL4685-15513.txt", skip = 14) %>%
  filter(!is.na(`Gene Symbol`), `Sequence Type` != "Control sequence") %>%
  separate(`Gene Symbol`, into = "gene_symbol", sep = " ") %>%
  select(ID, gene_symbol)

# join data and metadata, collapse gene symbols
GSE24759 <- inner_join(GSE24759, GPL4685) %>%
  select(-ID) %>%
  group_by(gene_symbol) %>%
  mutate_at(.vars = vars(-gene_symbol), .funs = list(~ log2(mean(2^.))))

# convert to matrix, add rownames
GSE24759_mat <- ungroup(GSE24759) %>%
  select(-gene_symbol) %>%
  as.matrix()
row.names(GSE24759_mat) <- GSE24759$gene_symbol

# merge samples
ref_hema_microarray <- GSE24759_mat %>%
  t() %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  mutate(sample = str_remove(sample, ", .*")) %>%
  group_by(sample) %>%
  summarise_all(.funs = list(~ log2(mean(2^.))))

if (tibble::has_rownames(ref_hema_microarray)) {
  ref_hema_microarray <- tibble::remove_rownames(ref_hema_microarray)
}
ref_hema_microarray <- ref_hema_microarray %>%
  column_to_rownames("sample") %>%
  t()

Essentially, any data in matrix form is accepted by clustifyR.

clustifyrdata::ref_hema_microarray[1:5, 1:5]

Building reference gene list

clustify_lists and clustify_nudge uses dataframe or matrix of candidate genes for classification. Example formatting are shown below.

# directly from seurat function
head(pbmc_markers)

# manually enter into dataframe format
betam <- c("INS", "IGF2", "IAPP", "MAFA", "NPTX2")
alpham <- c("GCG", "PDK4", "LOXL2", "IRX2", "GC")
deltam <- c("SST", "RBP4", "HHEX", "PCSK1", "LEPR")
data.frame(alpham, betam, deltam)

# use matrixize_markers function to convert
mm <- matrixize_markers(
  marker_df = pbmc_markers,
  remove_rp = TRUE,
  unique = TRUE,
  n = 10
)
head(mm)

# get markers from ref_matrix, and then matrixize_markers
cbmc_m <- matrixize_markers(
  marker_df = ref_marker_select(cbmc_ref),
  remove_rp = TRUE,
  unique = TRUE,
  n = 3
)

# parse `garnett` marker files, see function `file_marker_parse`

Prebuilt references, in clustifyr

A few reference datasets are built into clustifyr for vignettes and testing: pbmc_bulk_matrix, cbmc_ref, pbmc_markers, cbmc_m

Prebuilt references, in clustifyrdata

More reference data, including tabula muris, and code used to generate them are available at https://github.com/rnabioco/clustifyrdata.

Also see list for individual downloads at https://rnabioco.github.io/clustifyr/articles/download_refs.html

```{"Installation"}

install.packages("devtools")

devtools::install_github("rnabioco/clustifyrdata") ```

rnabioco/clustifyrdata documentation built on Nov. 8, 2022, 4:26 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

rnabioco/clustifyrdata
External data sets for clustifyr

In rnabioco/clustifyrdata: External data sets for clustifyr

Building reference matrix from single cell expression matrix

Building reference matrix from `seurat` object

Building reference matrix from Recount2

Building reference matrix from microarray data

Building reference gene list

Prebuilt references, in clustifyr

Prebuilt references, in clustifyrdata

install.packages("devtools")

R Package Documentation

Browse R Packages

We want your feedback!

rnabioco/clustifyrdata External data sets for clustifyr

In rnabioco/clustifyrdata: External data sets for clustifyr

Building reference matrix from single cell expression matrix

Building reference matrix from seurat object

Building reference matrix from Recount2

Building reference matrix from microarray data

Building reference gene list

Prebuilt references, in clustifyr

Prebuilt references, in clustifyrdata

install.packages("devtools")

R Package Documentation

Browse R Packages

We want your feedback!

rnabioco/clustifyrdata
External data sets for clustifyr

Building reference matrix from `seurat` object