#'Filtering the loaded genotyping data.
#'
#'@description
#'We filter a SummarizedExperiment object to exclude variants and cells.
#'
#'@details
#'We do this for one sample at a time.
#'We want to remove:
#' \enumerate{
#' \item all cells not in an allow list,
#' \item all cells in an exclusion list,
#' \item all cells that do not have at least one variant with >1 (Reference),
#' \item all variants that are for alternative transcripts,
#' \item all variants that are always NoCall,
#' \item set variants with a VAF below a threshold to NoCall or Reference.
#' }
#'@importFrom Matrix summary
#'@importFrom SummarizedExperiment assays
#'@importFrom utils read.table
#'@importFrom Matrix rowSums colSums
#'@param SE SummarizedExperiment object.
#'@param cells_include A vector of cell barcodes. Only these cells will be retained.
#'@param cells_exclude A vector of cell barcodes. These cells will be removed from the output.
#'@param fraction_threshold Variants with an VAF below this threshold are set to 0. Numeric. Default = NULL.
#'@param alts_threshold Variants with a number of alt reads less than this threshold are set to 0. Numeric. Default = NULL.
#'@param min_cells_per_variant In how many cells should a variant be present to be included? Numeric. Default = 2.
#'@param min_variants_per_cell How many variants should be covered in a cell have to be included? Default = 1.
#'@param reject_value Should cells that fall below a threshold (fraction_threshold or alts_threshold) be treated as Reference or NoCall? Default = NoCall.
#'@param verbose Should the function be verbose? Default = TRUE
#'@examples
#' \dontrun{
#' # Removing all variants that are not detected in at least 2 cells.
#' # Before we remove the variants, we set fraction value of variants below 0.05 to 0.
#' se_geno <- Filtering(se_geno, min_cells_per_variant = 2, fraction_threshold = 0.05)
#' }
#'@export
Filtering <- function(SE, cells_include = NULL, cells_exclude = NULL, fraction_threshold = NULL, alts_threshold = NULL, min_cells_per_variant = NULL, min_variants_per_cell = NULL, reject_value = "NoCall", verbose = TRUE){
# Checking if the reject_value variable is correct.
if(!reject_value %in% c("Reference", "NoCall")){
stop(paste0("Your reject_value is ", reject_value, ".\nIt should be Reference or NoCall."))
} else{
if(reject_value == "NoCall") reject_value_numeric <- 0
if(reject_value == "Reference") reject_value_numeric <- 1
}
if(!is.null(cells_include)){
if(verbose) print("We remove all cells not in the allow list.")
# Check if any cells would be left.
check_leftover <- any(colnames(SE) %in% cells_include)
if(!check_leftover) stop("No cells are in your supplied list.")
SE <- SE[, colnames(SE) %in% cells_include]
cells_include <- cells_include[cells_include %in% colnames(SE)]
SE <- SE[, cells_include]
}
if(!is.null(cells_exclude)){
if(verbose) print("We remove the unwanted cell barcodes.")
# Check if any cells would be left.
check_leftover <- all(colnames(SE) %in% cells_exclude)
if(check_leftover) stop("No cells are in your supplied list.")
SE <- SE[, !colnames(SE) %in% cells_exclude]
}
# If the fraction_threshold is 0, we skip the thresholding. 0 and NULL would be the same.
# We might want to use a fraction_threshold of 0 to use the same variable to create file paths later.
if(!is.null(fraction_threshold)){
if(fraction_threshold == 0){
fraction_threshold <- NULL
}
}
if(!is.null(fraction_threshold)){
if(any(fraction_threshold >= 1, fraction_threshold <= 0)){
stop("Your fraction threshold is not 0 < x < 1.")
}
if(verbose) print(paste0("We assume that cells with a fraction smaller than our threshold are actually ", reject_value, "."))
if(verbose) print(paste0("We set consensus values to ", reject_value_numeric, " (", reject_value, ") and fraction values to 0."))
if(verbose) print(paste0("We do not set fractions between ", fraction_threshold, " and 1 to 1."))
if(verbose) print("This way, we retain the heterozygous information.")
# Filtering using sparse matrices.
consensus_matrix <- SummarizedExperiment::assays(SE)$consensus
fraction_matrix <- SummarizedExperiment::assays(SE)$fraction
position_matrix <- Matrix::summary(fraction_matrix)
position_matrix <- subset(position_matrix, x > 0 & x < fraction_threshold)
# If no elements fall between 0 and the fraction_threshold, we do not have to change the matrices.
if(nrow(position_matrix) > 0){
ij <- as.matrix(position_matrix[, 1:2])
consensus_matrix[ij] <- reject_value_numeric
fraction_matrix[ij] <- 0
SummarizedExperiment::assays(SE)$consensus <- consensus_matrix
SummarizedExperiment::assays(SE)$fraction <- fraction_matrix
}
}
if(!is.null(alts_threshold)){
if(any(!is.numeric(alts_threshold), is.infinite(alts_threshold))){
stop("Your alts_threshold should be a numeric value.")
}
if(verbose) print(paste0("We assume that cells with a number of alternative reads smaller than our threshold are actually ", reject_value, "."))
if(verbose) print(paste0("We set consensus values to ", reject_value_numeric, " (", reject_value, "), fraction values to 0."))
if(reject_value == "NoCall") if(verbose) print("We set Alts, Refs and Coverage to 0.")
if(reject_value == "Reference") if(verbose) print("We set Alts to 0 and adjust the Coverage.")
# Filtering using sparse matrices.
consensus_matrix <- SummarizedExperiment::assays(SE)$consensus
fraction_matrix <- SummarizedExperiment::assays(SE)$fraction
coverage_matrix <- SummarizedExperiment::assays(SE)$coverage
alts_matrix <- SummarizedExperiment::assays(SE)$alts
refs_matrix <- SummarizedExperiment::assays(SE)$refs
position_matrix <- summary(alts_matrix)
position_matrix <- subset(position_matrix, x < alts_threshold)
# If no elements fall between 0 and the alts_threshold, we do not have to change the matrices.
if(nrow(position_matrix) > 0){
ij <- as.matrix(position_matrix[, 1:2])
consensus_matrix[ij] <- reject_value_numeric
fraction_matrix[ij] <- 0
# We check if the positions to be changed have ref reads. If not, we set them to NoCall.
refs_check <- refs_matrix[ij] == 0
consensus_matrix[ij][refs_check] <- 0
# Since we remove the Alt reads in either case (NoCall or Reference), we remove the Alts from the Coverage.
coverage_matrix[ij] <- coverage_matrix[ij] - alts_matrix[ij]
# We now set the alts matrix to 0.
alts_matrix[ij] <- 0
# If we set the cells to NoCall, we also set the Refs to 0.
# We then also set the coverage matrix to 0.
if(reject_value == "NoCall"){
refs_matrix[ij] <- 0
coverage_matrix[ij] <- 0
}
SummarizedExperiment::assays(SE)$consensus <- consensus_matrix
SummarizedExperiment::assays(SE)$fraction <- fraction_matrix
SummarizedExperiment::assays(SE)$coverage <- coverage_matrix
SummarizedExperiment::assays(SE)$alts <- alts_matrix
SummarizedExperiment::assays(SE)$refs <- refs_matrix
}
}
if(verbose) print("We remove all the variants that are always NoCall.")
consensus_test <- SummarizedExperiment::assays(SE)$consensus > 0
keep_variants <- Matrix::rowSums(consensus_test) > 0
SE <- SE[keep_variants,]
if(!is.null(min_cells_per_variant)){
if(verbose) print(paste0("We remove variants, that are not at least detected in ", min_cells_per_variant, " cells."))
keep_variants <- Matrix::rowSums(SummarizedExperiment::assays(SE)$consensus >= 2)
keep_variants <- keep_variants >= min_cells_per_variant
SE <- SE[keep_variants,]
}
if(!is.null(min_variants_per_cell)){
if(verbose) print(paste0("We remove all cells that are not >= 1 (Ref) for at least ", min_variants_per_cell, " variant."))
consensus_test <- SummarizedExperiment::assays(SE)$consensus >= 1
keep_cells <- Matrix::colSums(consensus_test) >= min_variants_per_cell
SE <- SE[,keep_cells]
}
return(SE)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.