R/filter_rna_h5.R

Defines functions reduce_h5_list split_h5_list_by_hash add_h5_list_hash_results subset_h5_list_by_observations h5_list_add_dgCMatrix h5_list_convert_from_dgCMatrix h5_list_convert_to_dgCMatrix h5_list_add_mito_umis add_well_metadata add_cell_ids

Documented in add_cell_ids add_h5_list_hash_results add_well_metadata h5_list_add_dgCMatrix h5_list_add_mito_umis h5_list_convert_from_dgCMatrix h5_list_convert_to_dgCMatrix reduce_h5_list split_h5_list_by_hash subset_h5_list_by_observations

#' Add cell IDs to a h5_list object
#'
#' @param h5_list a list object generated by running rhdf5::h5dump() on a 10x HDF5 file.
#' @param target the group name for the matrix within the h5_list to use. Default is "matrix", which is used by 10x.
#' @param add_uuid a logical indicating whether or not to add a uuid generated by ids::uuid(). Default is TRUE.
#' @param replace_barcode a logical indicating whether or not the new uuid should replace the barcodes slot. Default is TRUE.
#' @param retain_original_barcode a logical indicating whether to retain the original barcodes if replace_barcode is TRUE. These will be moved to /matrix/original_barcode if TRUE. Default is TRUE.
#' @param add_name a logical indicating whether to add a new cell name. These are adjective_adjective_animal names generated by ids::adjective_animal(). Default is TRUE.
#'
#' @return a modified h5_list object
#' @export
#'
add_cell_ids <- function(h5_list,
                         target = "matrix",
                         add_uuid = TRUE,
                         replace_barcode = TRUE,
                         retain_original_barcode = TRUE,
                         add_name = TRUE) {
  
  assertthat::assert_that(class(h5_list) == "list")
  
  assertthat::assert_that(class(target) == "character")
  assertthat::assert_that(length(target) == 1)
  if(grepl("^/",target)) {
    target <- sub("^/","",target)
  }
  assertthat::assert_that(target %in% names(h5_list))
  
  assertthat::assert_that("barcodes" %in% names(h5_list[[target]]))
  assertthat::assert_that(is.logical(add_uuid))
  assertthat::assert_that(is.logical(add_name))
  
  if(add_uuid) {
    
    if(replace_barcode) {
      if(retain_original_barcode) {
        h5_list <- BarMixer::set_list_path(h5_list,
                                           paste0("/",target,"/observations/original_barcodes"),
                                           sub("-1","",h5_list[[target]]$barcodes))
      }
      # matrix$barcodes is used for column ids by most functions that read 10x HDF5 files.
      h5_list[[target]]$barcodes <- ids::uuid(n = length(h5_list[[target]]$barcodes),
                                              drop_hyphens = TRUE,
                                              use_time = TRUE)
      h5_list <- BarMixer::set_list_path(h5_list,
                                         paste0("/",target,"/observations/cell_uuid"),
                                         h5_list[[target]]$barcodes)
    } else {
      h5_list <- BarMixer::set_list_path(h5_list,
                                         paste0("/",target,"/observations/cell_uuid"),
                                         ids::uuid(n = length(h5_list[[target]]$barcodes),
                                                   drop_hyphens = TRUE,
                                                   use_time = TRUE))
    }
    
  }
  
  if(add_name) {
    h5_list <- BarMixer::set_list_path(h5_list,
                                       paste0("/",target,"/observations/cell_name"),
                                       ids::adjective_animal(n = length(h5_list[[target]]$barcodes),
                                                             n_adjectives = 2,
                                                             max_len = 10))
  }
  
  h5_list
  
}

#' Add well metadata
#'
#' Parses a standard well_id to generate chip_id, pool_id, and batch_id.
#'
#' @param h5_list a list object generated by running rhdf5::h5dump() on a 10x HDF5 file.
#' @param well_id a character object indicating the well name for the input h5. Should match the pattern B000-P1C1W1
#'
#' @return a modified h5_list object
#' @export
#'
add_well_metadata <- function(h5_list,
                              well_id) {
  
  assertthat::assert_that(class(h5_list) == "list")
  assertthat::assert_that("matrix" %in% names(h5_list))
  
  assertthat::assert_that(class(well_id) == "character")
  assertthat::assert_that(length(well_id) == 1)
  
  well_id <- sub("-[AHR]P","-P",well_id)
  
  chip_id <- sub("W[0-9]+$","",well_id)
  pool_id <- sub("C[0-9]+$","",chip_id)
  batch_id <- sub("-P[0-9]+$","",pool_id)
  
  n_cells <- length(h5_list$matrix$barcodes)
  
  h5_list <- BarMixer::set_list_path(h5_list,
                                     "/matrix/observations/well_id",
                                     rep(well_id, n_cells))
  h5_list <- BarMixer::set_list_path(h5_list,
                                     "/matrix/observations/chip_id",
                                     rep(chip_id, n_cells))
  h5_list <- BarMixer::set_list_path(h5_list,
                                     "/matrix/observations/pool_id",
                                     rep(pool_id, n_cells))
  h5_list <- BarMixer::set_list_path(h5_list,
                                     "/matrix/observations/batch_id",
                                     rep(batch_id, n_cells))
  
  h5_list
}


#' Add mitochondrial gene UMI counts to an h5_list object
#'
#' @param h5_list a list object generated by running rhdf5::h5dump() on a 10x HDF5 file.
#'
#' @return a list object with mitochondrial counts stored in h5_list$matrix$observations$mito_umis
#' @export
h5_list_add_mito_umis <- function(h5_list) {
  
  assertthat::assert_that(class(h5_list) == "list")
  assertthat::assert_that("matrix" %in% names(h5_list))
  
  chrM_genes <- data.table::fread(system.file("reference/GRCh38_10x_chrM_gene_metadata.csv.gz",
                                              package = "BarMixer"))
  
  reconvert_matrix <- FALSE
  if(!"matrix_dgCMatrix" %in% names(h5_list)) {
    h5_list <- BarMixer::h5_list_convert_to_dgCMatrix(h5_list,
                                                      target = "matrix")
    reconvert_matrix <- TRUE
  }
  
  common_mito_genes <- intersect(chrM_genes$name, h5_list$matrix$features$name)
  
  if(length(common_mito_genes) == 0) {
    stop("No Mitochondrial Genes found in h5_list")
  }
  
  common_mito_ids <- rownames(h5_list$matrix_dgCMatrix)[h5_list$matrix$features$name %in% common_mito_genes]
  
  mito_umis <- Matrix::colSums(h5_list$matrix_dgCMatrix[common_mito_ids,])
  names(mito_umis) <- NULL
  
  h5_list <- BarMixer::set_list_path(h5_list,
                                     "/matrix/observations/n_mito_umis",
                                     mito_umis)
  
  if(reconvert_matrix) {
    h5_list <- BarMixer::h5_list_convert_from_dgCMatrix(h5_list,
                                                        target = "matrix")
  }
  
  h5_list
  
}

#' Convert the matrix in an h5_list from 10x Genomics data to a sparse matrix
#'
#' This is very useful for subsetting the matrix based on column names.
#'
#' @param h5_list a list object generated by running rhdf5::h5dump() on a 10x HDF5 file.
#' @param target the group name for the sparse matrix. Default is "matrix", which is used by 10x.
#' @param feature_names the group name in target/features to use for rownames. Must be either "id" or "name". Default is "id".
#'
#' @return a list object
#' @export
#'
h5_list_convert_to_dgCMatrix <- function(h5_list,
                                         target = "matrix",
                                         feature_names = "id") {
  
  assertthat::assert_that(class(h5_list) == "list")
  assertthat::assert_that(target %in% names(h5_list))
  assertthat::assert_that(feature_names %in% c("id","name"))
  
  target_dgCMatrix <- paste0(target,"_dgCMatrix")
  
  h5_list[[target_dgCMatrix]] <- Matrix::sparseMatrix(x = h5_list[[target]]$data,
                                                      i = h5_list[[target]]$indices,
                                                      index1 = FALSE,
                                                      p = h5_list[[target]]$indptr,
                                                      dims = h5_list[[target]]$shape,
                                                      dimnames = list(h5_list[[target]]$features[[feature_names]],
                                                                      h5_list[[target]]$barcodes))
  h5_list[[target]]$data <- NULL
  h5_list[[target]]$indices <- NULL
  h5_list[[target]]$indptr <- NULL
  h5_list[[target]]$shape <- NULL
  h5_list[[target]]$features$id <- NULL
  h5_list[[target]]$barcodes <- NULL
  
  h5_list
}

#' Convert the matrix in an h5_list from a sparse matrix back to its original structure
#'
#' This is very useful for preparing to write back out to an .h5 file.
#'
#' @param h5_list a list object generated by running rhdf5::h5dump() on a 10x HDF5 file, with a matrix converted by h5_list_convert_to_dgCMatrix().
#' @param target the group name of the original sparse matrix. Default is "matrix", which is used by 10x.
#'
#' @return a list object
#' @export
#'
h5_list_convert_from_dgCMatrix <- function(h5_list,
                                           target = "matrix") {
  
  assertthat::assert_that(class(h5_list) == "list")
  
  target_dgCMatrix <- paste0(target,"_dgCMatrix")
  assertthat::assert_that(target_dgCMatrix %in% names(h5_list))
  
  h5_list[[target]]$data <- h5_list[[target_dgCMatrix]]@x
  h5_list[[target]]$indices <- h5_list[[target_dgCMatrix]]@i
  h5_list[[target]]$indptr <- h5_list[[target_dgCMatrix]]@p
  h5_list[[target]]$shape <- dim(h5_list[[target_dgCMatrix]])
  h5_list[[target]]$barcodes <- colnames(h5_list[[target_dgCMatrix]])
  h5_list[[target]]$features$id <- rownames(h5_list[[target_dgCMatrix]])
  
  h5_list[[target_dgCMatrix]] <- NULL
  
  h5_list
}

#' Add a sparse matrix to an h5_list containing 10x genomics data
#'
#' @param h5_list a list object, usually generated by running rhdf5::h5dump() on a 10x HDF5 file.
#' @param mat a dgCMatrix to add to the h5_list
#' @param target a character object specifying the location within the list to write to.
#'
#' @return a list object with the additional matrix added similar to 10x HDF5 matrix format.
#' @export
#'
h5_list_add_dgCMatrix <- function(h5_list,
                                  mat,
                                  target) {
  
  assertthat::assert_that(class(h5_list) == "list")
  assertthat::assert_that(class(mat) == "dgCMatrix")
  
  h5_list[[target]]$data <- mat@x
  h5_list[[target]]$indices <- mat@i
  h5_list[[target]]$indptr <- mat@p
  h5_list[[target]]$shape <- dim(mat)
  h5_list[[target]]$barcodes <- colnames(mat)
  h5_list[[target]]$features <- list(id = rownames(mat))
  
  h5_list
}


#' Subset a h5_list object based on a set of observations.
#'
#' @param h5_list a list object generated by running rhdf5::h5dump() on a 10x HDF5 file.
#' @param match_values a vector of values to use for filtering
#' @param match_target a character object specifying which values to use for filtering in the h5_list. If "barcodes", uses matrix/barcodes; Otherwise, looks in matrix/observations for matching values
#' @param sparse_matrices a character vector specifying which list objects to treat as sparse matrices. If these aren't currently dgCMatrices, they will be converted by h5_list_convert_to_dgCMatrix for filtering.
#'
#' @return a list object
#' @export
#'
subset_h5_list_by_observations <- function(h5_list,
                                           match_values,
                                           match_target = "barcodes",
                                           sparse_matrices = "matrix") {
  
  assertthat::assert_that(class(h5_list) == "list")
  assertthat::assert_that(is.vector(match_values))
  assertthat::assert_that(is.character(match_target))
  assertthat::assert_that(length(match_target) == 1)
  converted_matrices <- sparse_matrices[paste0(sparse_matrices, "_dgCMatrix") %in% names(h5_list)]
  unconverted_matrices <- setdiff(sparse_matrices[sparse_matrices %in% names(h5_list)], converted_matrices)
  assertthat::assert_that((length(unconverted_matrices) + length(converted_matrices)) > 0)
  
  
  if(match_target == "barcodes") {
    if("matrix" %in% unconverted_matrices) {
      assertthat::assert_that("barcodes" %in% names(h5_list$matrix))
      
      keep <- match(match_values, h5_list$matrix$barcodes)
    } else if("matrix" %in% converted_matrices) {
      keep <- match(match_values, colnames(h5_list$matrix_dgCMatrix))
    }
    
  } else {
    assertthat::assert_that("observations" %in% names(h5_list$matrix))
    assertthat::assert_that(match_target %in% names(h5_list$matrix$observations))
    
    keep <- match(match_values, h5_list$matrix$observations[[match_target]])
  }
  
  if(length(unconverted_matrices) > 0) {
    for(unconverted in unconverted_matrices) {
      h5_list <- BarMixer::h5_list_convert_to_dgCMatrix(h5_list,
                                                        unconverted)
    }
  }
  
  for(converted in c(unconverted_matrices, converted_matrices)) {
    mat_name <- paste0(converted, "_dgCMatrix")
    h5_list[[mat_name]] <- h5_list[[mat_name]][, keep]
    
    if("observations" %in% names(h5_list[[converted]])) {
      additional_cell_values <- names(h5_list[[converted]]$observations)
      
      for(additional_value in additional_cell_values) {
        h5_list[[converted]]$observations[[additional_value]] <- h5_list[[converted]]$observations[[additional_value]][keep]
      }
    }
  }
  
  if(length(unconverted_matrices) > 0) {
    for(unconverted in unconverted_matrices) {
      h5_list <- BarMixer::h5_list_convert_from_dgCMatrix(h5_list,
                                                          unconverted)
    }
  }
  
  h5_list
}


#' Add cell hashing results to an h5_list.
#'
#' @param h5_list a list object generated by running rhdf5::h5dump() on a 10x HDF5 file.
#' @param hash_category_table a data.frame or data.table with results from HTOparser::categorize_binary_hash_matrix() consisting of 3 columns: cell_barcode, hto_category, and hto_barcode.
#' @param hash_count_matrix (optional) a matrix or dgCMatrix with results from CITE-seq-Count, HTOparser::add_missing_hto_rows(), or HTOparser::barcode_table_to_matrix().
#' @param match_target a character object specifying which values to use for filtering in the h5_list. If "barcodes", uses matrix/barcodes; Otherwise, looks in matrix/observations for matching values
#'
#' @return a list object with hash data added to /hash/ and /matrix/observations/
#' @export
#'
add_h5_list_hash_results <- function(h5_list,
                                     hash_category_table,
                                     hash_count_matrix,
                                     match_target = "barcodes") {
  
  assertthat::assert_that(class(h5_list) == "list")
  assertthat::assert_that(sum(class(hash_category_table) %in% c("data.frame","data.table")) > 0)
  
  if(!is.null(hash_count_matrix)) {
    assertthat::assert_that(sum(class(hash_count_matrix) %in% c("dgCMatrix","matrix")) > 0)
  }
  
  assertthat::assert_that(class(match_target) == "character")
  assertthat::assert_that(length(match_target) == 1)
  
  if(match_target == "barcodes") {
    h5_list$matrix$barcodes <- sub("-.+","",h5_list$matrix$barcodes)
    common_barcodes <- intersect(h5_list$matrix$barcodes,
                                 hash_category_table$cell_barcode)
  } else {
    common_barcodes <- intersect(h5_list$matrix$observations[[match_target]],
                                 hash_category_table$cell_barcode)
  }
  
  # Filter the hash table based on common barcodes
  common_hash_table <- hash_category_table[match(common_barcodes,
                                                 hash_category_table$cell_barcode),]
  
  # Convert matrix to dgCMatrix for filtering
  h5_list <- BarMixer::h5_list_convert_to_dgCMatrix(h5_list,
                                                    target = "matrix")
  
  # Filter the h5_list based on common barcodes
  h5_list <- BarMixer::subset_h5_list_by_observations(h5_list,
                                                      match_values = common_barcodes,
                                                      match_target = match_target,
                                                      sparse_matrices = "matrix")
  
  if(is.null(hash_count_matrix)) {
    # If there's no hash count matrix, we only need to filter matrix as sparse
    sparse_matrices <- "matrix"
  } else {
    # Otherwise, filter the hash count matrix for common barcodes
    hash_count_matrix <- hash_count_matrix[, common_barcodes]
    
    colnames(hash_count_matrix) <- colnames(h5_list$matrix_dgCMatrix)
    
    # Convert to sparse for storage
    hash_count_matrix <- as(hash_count_matrix, "dgCMatrix")
    
    # Add the hash count matrix to the h5_list
    h5_list$hash_dgCMatrix <- hash_count_matrix
    # Keep track of hash_count_matrix as sparse for filtering
    sparse_matrices <- c("matrix","hash")
  }
  
  h5_list <- BarMixer::set_list_path(h5_list,
                                     "/matrix/observations/hto_category",
                                     common_hash_table$hto_category)
  h5_list <- BarMixer::set_list_path(h5_list,
                                     "/matrix/observations/hto_barcode",
                                     common_hash_table$hto_barcode)
  
  if("sample_id" %in% names(common_hash_table)) {
    h5_list <- BarMixer::set_list_path(h5_list,
                                       "/matrix/observations/sample_id",
                                       common_hash_table$sample_id)
  }
  
  for(sparse_matrix in sparse_matrices) {
    h5_list <- BarMixer::h5_list_convert_from_dgCMatrix(h5_list,
                                                        target = sparse_matrix)
  }
  
  h5_list
  
}

#' Split a 10x HDF5 file based on HTOparser results
#'
#' @param h5_list a list object generated by h5dump(), which has hash labels obtained from add_h5_list_hash_results().
#'
#' @return A list object with contents of the input h5_file separated by each hto_barcode. Also includes multiplets.
#' @export
#'
split_h5_list_by_hash <- function(h5_list) {
  
  assertthat::assert_that(class(h5_list) == "list")
  assertthat::assert_that("hto_category" %in% names(h5_list$matrix$observations))
  
  meta <- BarMixer::h5_list_cell_metadata(h5_list)
  
  # Separate singlets from non-singlets
  singlet_meta <- meta[meta$hto_category == "singlet",]
  multiplet_meta <- meta[meta$hto_category != "singlet",]
  
  # Make a list to hold output
  split_h5_list <- list()
  
  # Get the set of unique barcodes (and multiplet) to use for splitting
  hto_barcodes <- c(unique(as.character(singlet_meta$hto_barcode)), "multiplet")
  
  for(barcode in hto_barcodes) {
    # Use the multiplet table if multiplet. Otherwise, filter singlet table for hto_barcode
    if(barcode == "multiplet") {
      hto_meta <- multiplet_meta
    } else {
      hto_meta <- singlet_meta[singlet_meta$hto_barcode == barcode,]
    }
    
    # Subset the h5_list for this barcode using barcode_target and sparse_matrices, defined above in this function
    split_h5_list[[barcode]] <- BarMixer::subset_h5_list_by_observations(
      h5_list,
      match_values = hto_meta$barcodes,
      match_target = "barcodes",
      sparse_matrices = c("matrix","hash")
    )
    
  }
  
  split_h5_list
}

#' Concatenate two h5_list objects.
#'
#' This is suitable for h5_list objects that contain vectors and/or dgCMatrices. It expects that the two lists have the same structure and names.
#'
#' @param x the first list to combine.
#' @param y the second list to append.
#'
#' @return a list object with the same structure as x.
#' @export
#'
cat_h5_list <- function (x, y) {
  
  assertthat::assert_that(is.list(x))
  assertthat::assert_that(is.list(y))
  assertthat::assert_that(all(names(x) == names(y)))
  
  for (v in seq_along(x)) {
    
    if(class(x[[v]]) == "list") {
      # If the target is a list, recurse to the next level
      x[[v]] <- BarMixer::cat_h5_list(x[[v]], y[[v]])
    } else if(class(x[[v]]) == "dgCMatrix") {
      # If it's a dgCMatrix, perform a cbind
      x[[v]] <- cbind(x[[v]], y[[v]])
    } else {
      # If it's a vector, use c to combine.
      x[[v]] <- c(as.vector(x[[v]]), as.vector(y[[v]]))
    }
  }
  x
}

#' Reduce a list-of-lists of h5_list objects to a single, concatenated object.
#'
#' @param h5_ll A list-of-lists containing h5_list objects generated by rhdf5::h5dump() or similar.
#' @param sparse_matrices A vector of object names within each list that should be treated as dgCMatrices for combining. Default is "matrix", which is used by 10x.
#'
#' @return An h5_list object with the structure of a single h5_list within the h5_ll object.
#' @export
#'
reduce_h5_list <- function(h5_ll,
                           sparse_matrices = c("matrix")) {
  
  assertthat::assert_that(is.list(h5_ll))
  assertthat::assert_that(all(sparse_matrices %in% names(h5_ll[[1]])))
  
  for(i in seq_along(sparse_matrices)) {
    h5_ll <- lapply(h5_ll,
                    BarMixer::h5_list_convert_to_dgCMatrix,
                    sparse_matrices[i])
  }
  
  out_list <- h5_ll[[1]]
  for(i in 2:length(h5_ll)) {
    out_list <- BarMixer::cat_h5_list(out_list, h5_ll[[i]])
  }
  
  for(i in seq_along(sparse_matrices)) {
    # Exception here for "features", which shouldn't be concatenated.
    out_list[[sparse_matrices[i]]]$features <- h5_ll[[1]][[sparse_matrices[i]]]$features
    
    out_list <- BarMixer::h5_list_convert_from_dgCMatrix(out_list,
                                                         target = sparse_matrices[i])
  }
  
  out_list
  
}
AllenInstitute/BarMixer documentation built on Dec. 17, 2021, 8:42 a.m.