R/scSeurat.R

Defines functions auto_subset process_seurat plot_cc get_cell_cycle_genes kill_cc process_ltbc gen_cellecta_bc_data filter_raw_data check_load_inputs tenx_load_qc

Documented in auto_subset gen_cellecta_bc_data get_cell_cycle_genes kill_cc plot_cc process_ltbc process_seurat tenx_load_qc

#' Load 10X data
#'
#' @param path_10x Path to 10X RNAseq data "filtered_feature_bc_matrix" folder
#' @param h5_file Path to 10X h5 file
#' @param frag_file Path to 10X ATAC fragments file. Only required for ATAC data
#' @param min_cells Passed to CreateSeuratObject: Include features detected in
#'     at least this many cells. Will subset the counts matrix as well. To
#'     reintroduce excluded features, create a new object with a lower cutoff.
#' @param min_features Passed to CreateSeuratObject: Include cells where at
#'     least this many features are detected.
#' @param mt_pattern Pattern used to identify mitochondrial reads (eg, "^MT-"
#'     Must add species_pattern if remove_species_pattern = FALSE
#'     (eg, "^hg19-MT-" or "^hg19-MT-|^mm10-mt-")
#' @param species_pattern Pattern used to select only reads from a single
#'     species (eg, "^mm10-" or "^hg19-")
#' @param exp_type Experiment type (one of "GEX", "ATAC" or "GEX+ATAC")
#' @param violin_plot If TRUE (default), produces a violin plot
#' @param remove_species_pattern Specifies if want to remove species_pattern
#'     prefix from gene names. If TRUE (default), removes species_pattern
#'     prefix.
#' @param sample_name Sample name. Used in violin plots for title
#'
#' @return A \code{\link{Seurat}}
#' @export
#'
#' @examples
#' \dontrun{
#' seurat_obj <- tenXLoadQC("path/to/10X/data/", species_pattern = "^mm9-")
#' }
tenx_load_qc <- function(path_10x = "",
                         h5_file = "",
                         frag_file = stringr::str_replace(h5_file,
                                                          "filtered_feature_bc_matrix.h5",
                                                          "atac_fragments.tsv.gz"),
                         min_cells = 5,
                         min_features = 800,
                         mt_pattern = "^mt-|^MT-",
                         species_pattern = "",
                         exp_type = "GEX",
                         remove_species_pattern = TRUE,
                         violin_plot = TRUE,
                         sample_name = path_10x) {

    check_load_inputs(remove_species_pattern,
                      species_pattern,
                      mt_pattern,
                      path_10x,
                      h5_file,
                      exp_type)

    # Load in the data using either the 5h file or the 10x folder
    if (h5_file != "") {
        raw_data <- suppressWarnings(Seurat::Read10X_h5(h5_file))
        if ("Peaks" %in% names(raw_data)) {
            atac_raw_data <- raw_data$Peaks
        }
        if ("Gene Expression" %in% names(raw_data)) {
            rna_raw_data <- raw_data$`Gene Expression`
        } else {
            rna_raw_data <- raw_data
        }
    } else {
        rna_raw_data <- Seurat::Read10X(path_10x)
        if ("Gene Expression" %in% names(rna_raw_data)) {
            rna_raw_data <- rna_raw_data[["Gene Expression"]]
        }
    }

    if (exp_type == "GEX+ATAC" && min_features != 0) {
        min_features <- 0
        message("Setting min_features to 0 to load both GEX and ATAC data.\n",
                "Carefully filter your data downstream.")
    }

    if (grepl("GEX", exp_type)) {
        gex_orig_cells <- nrow(rna_raw_data)
        gex_first_ten_genes <- rownames(rna_raw_data)[1:10]
        # subset the data to only include the species of interest
        rna_raw_data <-
            filter_raw_data(rna_raw_data,
                            species_pattern,
                            remove_species_pattern)

        if (nrow(rna_raw_data) == 0) {
            stop("No genes left in object after filtering using ",
                 "species_pattern! Check species_pattern argument. ",
                 "Before filtering gex data using species_pattern the first ",
                 "genes were: ",
                 paste(gex_first_ten_genes, collapse = "\n"))
        }

        seurat <-
            Seurat::CreateSeuratObject(rna_raw_data,
                                       min.cells = min_cells,
                                       min.features = min_features)

        if (ncol(seurat) == 0) {
            stop("No data left after applying min.cells and min.features.",
                 " Check your data and arguments.")
        }

        seurat <-
            Seurat::PercentageFeatureSet(seurat,
                                         pattern = gsub("_", "-", mt_pattern),
                                         col.name = "percent.mt")
            # Need to change all underscores to dashes due to CreateSeuratObject
            # doing the same

        if (sum(seurat$percent.mt, na.rm = TRUE) == 0) {
            warning("No mitochondrial reads found!")
            warning("If you have a sample aligned to a mixed reference, make ",
                    "sure that your species_pattern and mt_pattern arguments ",
                    "are appropriate.")
            print(paste("Potential mitochondrial genes:",
                        rownames(seurat)[grep(rownames(seurat),
                                         pattern = "mt",
                                         ignore.case = TRUE)]))
        }
    }

    if (grepl("ATAC", exp_type)) {
        # subset the data to only include the species of interest
        atac_first_ten_peaks <- head(rownames(rna_raw_data), 10)

        atac_raw_data <-
            filter_raw_data(atac_raw_data,
                            species_pattern,
                            remove_species_pattern)

        if (nrow(atac_raw_data) == 0) {
            stop("No peaks left in object after filtering using ",
                 "species_pattern! Check species_pattern argument. ",
                 "Before filtering gex data using species_pattern the first ",
                 "peaks were: ",
                 paste(atac_first_ten_peaks, collapse = "\n"))
        }


        if (exp_type == "ATAC") {
            seurat <-
                Signac::CreateChromatinAssay(counts = atac_raw_data,
                                             sep = c(":", "-"),
                                             fragments = frag_file,
                                             min.cells = min_cells) %>%
                Seurat::CreateSeuratObject(assay = "ATAC")
        } else if (exp_type == "GEX+ATAC") {
            seurat[["ATAC"]] <-
                Signac::CreateChromatinAssay(counts = atac_raw_data,
                                             sep = c(":", "-"),
                                             fragments = frag_file,
                                             min.cells = min_cells)
        } else {
            stop("This shouldn't be possible. I have no idea how you got here.")
        }

        seurat <-
            Seurat::PercentageFeatureSet(seurat,
                                         pattern = gsub("_", "-", mt_pattern),
                                         col.name = "percent_mt_atac",
                                         assay = "ATAC")

    }



    if (violin_plot && grepl("GEX", exp_type)) {
        print(Seurat::VlnPlot(seurat,
                            features = c("nFeature_RNA",
                                         "nCount_RNA",
                                         "percent.mt"),
                            ncol = 3) +
            patchwork::plot_annotation(title = path_10x,
                                       theme = ggplot2::theme(plot.title = ggplot2::element_text(size = 25))))
    }

    return(seurat)
}

check_load_inputs <- function(remove_species_pattern,
                              species_pattern,
                              mt_pattern,
                              path_10x,
                              h5_file,
                              exp_type) {

    # If removing species names and species pattern is in mt_pattern
    if (remove_species_pattern == TRUE &&
        grepl(species_pattern %>% stringr::str_remove_all("\\^"),
              mt_pattern) &&
        species_pattern != "") {
        warning("\nDon't put species prefix in mt_pattern when
                remove_species_pattern == TRUE\n",
                immediate. = TRUE)
        stop()
    }

    # If not removing species names and species pattern not in mt_pattern
    if (remove_species_pattern == FALSE &&
        grepl(species_pattern %>% stringr::str_remove_all("\\^"),
            mt_pattern) == FALSE) {
        warning("\nMake sure your mt_pattern have species prefixes in it when
                remove_species_pattern == FALSE
                mt_pattern should look like: ^hg19-MT-\n",
                immediate. = TRUE)
        stop()
    }

    if (!exp_type %in% c("GEX", "ATAC", "GEX+ATAC")) {
        warning("\nexp_type must be one of 'GEX', 'ATAC', or 'GEX+ATAC'\n",
                immediate. = TRUE)
        stop()
    }

    if (grepl("ATAC", exp_type) && h5_file == "") {
        warning("\nYou need to specify a h5_file for ATAC data\n",
                immediate. = TRUE)
        stop()
    }

    if (path_10x != "" && h5_file != "") {
        warning("\nYou can't specify both a path_10x and h5_file\n",
                immediate. = TRUE)
        stop()
    }

    if (h5_file != "") {
        if (suppressWarnings(try(system("which h5pfc",
                                        intern = TRUE,
                                        ignore.stderr = TRUE))) %>%
                length() == 0) {
            warning("\nYou need to have h5pfc installed to use the h5_file ",
                    "argument. Perhaps ml load HDF5 before you start R?\n",
                    immediate. = TRUE)
            stop()
        }
    }
}

filter_raw_data <- function(raw_data_matrix,
                            species_pattern,
                            remove_species_pattern) {
    if (species_pattern != "") {
        raw_data_matrix <-
            raw_data_matrix[grep(pattern = gsub("-", "_", species_pattern),
                            rownames(raw_data_matrix)), ]
        # Then remove the species_pattern prefix from the peak names
        if (remove_species_pattern) {
            rownames(raw_data_matrix) <-
                sub(gsub("-",
                            "_",
                            species_pattern),
                    "",
                    rownames(raw_data_matrix))
        }
    }
    return(raw_data_matrix)
}

#' Extract cellecta barcode information from a sam or bam file
#'
#' @param file Input sam or bam file
#' @param verbose Optional updates during parsing
#' @param output File to write out
#' @param samtools_module For slurm, which modules to load
#'
#' @return A tibble of the cell ids and lineage tracing barcodes
#' @export
#'
#' @details For some reason, this function does not cancel when ordered. Either
#' kill the R session or kill the process directly on the server
#'
#' @examples
#' \dontrun{
#' cid_lt <- gen_cellecta_bc_data(file = "path/to/file.bam",
#'                                verbose = TRUE,
#'                                samtools_module = "GCC/9.3.0 SAMtools/1.10")
#'
#' cid_lt <- gen_cellecta_bc_data(
#'    file = "/home/gdrobertslab/lab/Counts/S0027/outs/S0016-S0027-bc2.sam",
#'    verbose = TRUE,
#'    samtools_module = "GCC/9.3.0 SAMtools/1.10")
#' }
gen_cellecta_bc_data <- function(file, verbose = FALSE, output = tempfile(),
                                 samtools_module = FALSE) {

  package_dir <- find.package("rrrSingleCellUtils")

  system_cmd <- paste(package_dir,
                      "/exec/getCellectaBarcodes.pl --sam ",
                      file,
                      " > ", output,
                      sep = "")

  if (verbose) {
    system_cmd <- paste(system_cmd, "-v")
  }

  if (samtools_module != FALSE) {
    system_cmd <- paste("ml load ", samtools_module, "; ",
                        system_cmd, "; ",
                        "ml unload ", samtools_module,
                        sep = "")
  } else if (Sys.which("samtools") == "") {
    stop("\"samtools\" command not found in $PATH, do you need to provide module
            info for slurm?")
  }

  system(system_cmd)

  results <-
    readr::read_delim(output,
                      delim = "\t",
                      col_names = TRUE,
                      show_col_types = FALSE)

  return(results)
}

#' Process Lineage Tracing Barcodes
#'
#' @param sobject Name of the Seurat object that contains the cell barcodes
#'     (cids) that will be matched and integrated
#' @param histogram Will trigger the function to generate and output a
#'     histogram plot of the top 40 most frequent lineage tracing barcodes
#' @param ymax Upper limit of the y axis (ie, for creating side-by-side
#'     comparisons)
#' @param relative This will normalize cell counts to total number of cells
#'     containing barcodes
#' @param cid_lt Table of cellecta cell id and lineage tracing barcodes.
#'     Returned by gen_cellecta_bc_data()
#' @param col_fill Fill color
#' @param ret_list If TRUE, return a list of barcode frequencies instead of a
#'     Seurat object
#' @param title Title of the histogram, if generated
#'
#' @return either a \code{\link{Seurat}} object or a data frame of barcode
#'     frequences
#' @export
#'
#' @examples
#' \dontrun{
#' cid_lt <- gen_cellecta_bc_data(file = "path/to/file.bam",
#'                                verbose = TRUE,
#'                                samtools_module = "GCC/9.3.0 SAMtools/1.10")
#' output <- process_ltbc(sobject, cid_lt = cid_lt, histogram = TRUE)
#' }
process_ltbc <- function(sobject, cid_lt, histogram = FALSE,
                         col_fill = "#4DBBD5FF", ymax = NA, relative = FALSE,
                         title = "LT Barcode Frequency", ret_list = FALSE) {

  # Deduplicate redundant reads
  cid_lt <- cid_lt %>%
    dplyr::distinct() %>%

    # Match extracted barcode reads against the Cellecta barcode tables
    dplyr::mutate(label14 = bc14f[lt_14],
                  label30 = bc30f[lt_30]) %>%
    dplyr::mutate(label14 = stringr::str_remove(label14, ".+-"),
                  label30 = stringr::str_remove(label30, ".+-")) %>%

    # Eliminate barcodes that don't match the Cellecta barcode tables
    stats::na.omit() %>%

    # Concatenate the two barcodes into a single compound column
    dplyr::mutate(label = paste(label14, label30, sep = "-")) %>%
    dplyr::pull(label, cid) %>%
    sort()

  # Integrate the lineage tracing barcode into the Seurat object metadata
  sobject$lt <-
    cid_lt[stringr::str_remove(Seurat::Cells(sobject),
                               "-1$")] %>%
    as.vector()


  # Generate the frequency tables
  ylabel <- "Number of Cells"
  bc_freq <- as.data.frame(table(sobject$lt)) %>%
    dplyr::rename(freq = Freq) %>%
    dplyr::arrange(-freq)

  if (isTRUE(relative)) {
    bc_freq$freq <- bc_freq$freq / sum(bc_freq$freq) * 100
    ylabel <- "Percentage of Cells"
  }

  bc_plot_data <- utils::head(bc_freq, n = 40)

  # Create histogram graphs (default using blue color from npg from ggsci)
  if (isTRUE(histogram)) {
    print(ggplot2::ggplot(bc_plot_data, ggplot2::aes(x = stats::reorder(Var1,
                                                                        -freq),
                                                     y = freq)) +
            ggplot2::geom_bar(fill = col_fill, stat = "identity") +
            ggplot2::ggtitle(title) +
            ggplot2::ylab(ylabel) +
            ggplot2::xlab("Lineage Barcode") +
            ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90,
                                                               hjust = 1)) +
            ggplot2::ylim(0, ymax))
  }

  if (isTRUE(ret_list)) {
    return(bc_freq)
  } else {
    return(sobject)
  }
}

#' Regress out cell cycle effects
#'
#' @param sobject Seurat object to be processed
#' @param species What species does your data represent? "human" or "mouse"
#' @param cc_regress If set to Y, the process with run without user input and
#'     will automatically proceed to cell cycle regression. If set to Ask, will
#'     prompt the user. If set to N no regression will be performed.
#' @param show_plots Should the plots be printed?
#' @param find_pcs Number of principal components to generate in the re-do PCA
#'     post-CC regression
#' @param use_pcs Number of principal components to use in the post-regression
#'     dimensional reduction
#' @param use_res Resolution to input to FindClusters
#' @param method Type of dimensional reduction to use, currently supports either
#'     umap or tsne
#'
#' @return A Seurat object
#' @export
#'
#' @details
#' The kill_cc function will identify cell cycle components within a dataset.
#' After an initial scoring using the Seurat CellCycleScoring function, the user
#' will be shown a dimensional reduction plot with cells labeled by cell cycle.
#' If indicated, the user can then trigger a process to regress out the effects
#' of cell cycle within the dataset.  The function will then proceed to re-do
#' the PCA and jackstraw if needed, then show a dimensional reduction plot
#' post-regression and retun the corrected Seurat object.
#' Input must be a Seurat object that already has PCA and dimensional reduction
#' data (umap or tsne) attached.
#'
#'
#' @examples
#' \dontrun{
#' load("~/analyses/roberts/dev/rrrSingleCellUtils/testData/test_cc.RData")
#' test <- kill_cc(os, use_pcs = 5, cc_regress = "Y")
#' }
kill_cc <- function(sobject,
                    species = "human",
                    cc_regress = "N",
                    show_plots = TRUE,
                    find_pcs = 20,
                    use_pcs = 3,
                    use_res = 0.5,
                    method = "umap") {
  cc_genes <- get_cell_cycle_genes(species = species)

  sobject <- Seurat::CellCycleScoring(sobject,
                                      s.features = cc_genes$s_features,
                                      g2m.features = cc_genes$g2m_features,
                                      set.ident = TRUE)

  if (method != "umap" & method != "tsne") {
    print("You need to set a method supported by this function")
  }

  plot_cc <-
    Seurat::DimPlot(sobject,
                    reduction = method,
                    label = TRUE,
                    pt.size = 1) +
    theme_roberts()

  if (show_plots) {
    print(plot_cc)
  }

  if (cc_regress == "Ask") {
    cc_regress <-
      readline(prompt =
                 "Proceed with regression of cell cycle-dependent genes (Y/N)?")
  }

  if (cc_regress == "Y") {
    sobject <- Seurat::ScaleData(sobject,
                                 vars.to.regress = c("S.Score", "G2M.Score"),
                                 features = rownames(x = sobject))
    sobject <- Seurat::RunPCA(sobject, npcs = find_pcs)

    if (method == "tsne") {
      sobject <- Seurat::RunTSNE(sobject, reduction = "pca", dims = 1:use_pcs)
    } else if (method == "umap") {
      sobject <- Seurat::RunUMAP(sobject, reduction = "pca", dims = 1:use_pcs)
    }

    sobject <- Seurat::FindNeighbors(sobject,
                                     reduction = "pca",
                                     dims = 1:use_pcs)

    sobject <- Seurat::FindClusters(sobject, resolution = use_res)

    if (show_plots) {
        print(Seurat::DimPlot(sobject,
                            reduction = method,
                            label = TRUE,
                            pt.size = 1,
                            group.by = "Phase") +
                theme_roberts())

        print(Seurat::DimPlot(sobject,
                            reduction = method,
                            label = TRUE,
                            pt.size = 1) +
                theme_roberts())
    }

  } else {
    print("No CC regression performed.")
  }
  return(sobject)
}

#' Get Cell Cycle Genes
#'
#' This function retrieves cell cycle genes for a specified species.
#'
#' @param species A character string specifying the species. 
#'                Supported values are "human" and "mouse". Default is "human".
#'
#' @return A list containing S phase and G2M phase cell cycle genes.
#'         - s_features: S phase genes
#'         - g2m_features: G2M phase genes
#'
#' @keywords internal
get_cell_cycle_genes <- function(species = "human") {
    output <- list()
    if (species == "human") {
        output$s_features <- Seurat::cc.genes$s.genes
        output$g2m_features <- Seurat::cc.genes$g2m.genes
    } else if(species == "mouse") {
        output$s_features <-
            nichenetr::convert_human_to_mouse_symbols(
                Seurat::cc.genes$s.genes
            ) %>%
            na.omit() %>%
            as.vector()
        output$g2m_features <-
            nichenetr::convert_human_to_mouse_symbols(
                Seurat::cc.genes$g2m.genes
            ) %>%
            na.omit() %>%
            as.vector()
    } else {
        stop(
          "species argument: ",
          species,
          " is not supported, only human/mouse"
        )
    }
    return(output)
}

#' Plot Cell Cycle Distribution
#'
#' @param sobject A Seurat object
#' @param plot_type Type of plot to output, one of "bar" or "pie"
#'
#' @return a ggplot object
#' @export
#'
#' @examples
#' \dontrun{
#' plot_cc(marrow)
#'
#' plot_cc(marrow) + facet_wrap(~ Cluster, nrow = 1)
#' }
plot_cc <- function(sobject, plot_type = "bar") {
  if (!"Phase" %in% names(sobject@meta.data)) {
    stop("Phase info not available in this object.
         Run Seurat::CellCycleScoring() to estimate cell cycle phases")
  }

  cid2 <- tibble::tibble(Cluster = sobject$seurat_clusters,
                         CellId = names(sobject$seurat_clusters)) %>%
    dplyr::full_join(tibble::tibble(Phase = sobject$Phase,
                                    CellId = names(sobject$Phase)),
                     by = "CellId") %>%
    dplyr::group_by(Phase, Cluster) %>%
    dplyr::summarize(Freq = length(Phase), .groups = "drop") %>%
    dplyr::ungroup() %>%
    dplyr::group_by(Cluster) %>%
    dplyr::mutate(Proportion = Freq / sum(Freq)) %>%
    dplyr::ungroup()

  plot_obj <- ggplot2::ggplot(cid2,
                              ggplot2::aes(x = Cluster,
                                           y = Proportion,
                                           fill = Phase)) +
    ggplot2::geom_bar(width = 1, stat = "identity", color = "white") +
    ggplot2::scale_fill_brewer(palette = "Blues") +
    ggplot2::theme_minimal()

  if (plot_type == "pie") {
    plot_obj <- ggplot2::ggplot(cid2,
                                ggplot2::aes(x = "",
                                             y = Proportion,
                                             fill = Phase)) +
      ggplot2::geom_bar(stat = "identity", color = "white") +
      ggplot2::coord_polar("y", start = 0) +
      ggplot2::scale_fill_brewer(palette = "Blues") +
      ggplot2::theme_minimal() +
      Seurat::NoAxes() +
      ggplot2::facet_wrap(~ Cluster)
  }
  return(plot_obj)
}

#' Process a Seurat object
#'
#' @inheritParams Seurat::RunUMAP
#' @inheritParams Seurat::FindClusters
#' @param sobject A Seurat object
#' @param run_umap_dims Number of PCA dimensions to use in RunUMAP()
#'      and FindNeighbors()
#' @param graph_name Name of graph to use for the clustering algorithm in FindClusters()
#' @param neighbor_k_param Number of neighbors (k.param) to use in FindNeighbors()
#' @param umap_n_neighbors Number of neighbors (n.neighbors) to use in RunUMAP()
#' @param umap_metric Metric (metric) to use in RunUMAP()
#'
#' @return A Seurat object
#' @export
#'
#' @examples
#' \dontrun{
#' test <- process_seurat(pbmc_small)
#' }
process_seurat <- function(sobject,
                           verbose = FALSE,
                           run_umap_dims = 1:30,
                           assay = "RNA",
                           resolution = 0.3,
                           reduction = "pca",
                           graph_name = "RNA_snn",
                           neighbor_k_param = 30,
                           umap_n_neighbors = 30L,
                           umap_metric = "euclidean") {

    sobject <-
        sobject %>%
        Seurat::NormalizeData(verbose = verbose,
                              assay = assay) %>%
        Seurat::FindVariableFeatures(verbose = verbose,
                                     assay = assay) %>%
        Seurat::ScaleData(verbose = verbose,
                          assay = assay) %>%
        Seurat::RunPCA(verbose = verbose,
                       assay = assay)

    # Make sure run_umap_dims isn't more than we have in pca
    if (max(run_umap_dims) > ncol(Seurat::Embeddings(sobject,
                                                     reduction = reduction))) {
        run_umap_dims <- seq_len(ncol(Seurat::Embeddings(sobject,
                                                 reduction = reduction)))
        warning(paste("run_umap_dims argument is greater than the number of",
                      "dimensions in the reduction. Using all dimensions."))
    }

    sobject <-
        sobject %>%
        Seurat::RunUMAP(dims = run_umap_dims,
                        reduction = reduction,
                        n.neighbors = umap_n_neighbors,
                        metric = umap_metric,
                        verbose = verbose,
                        assay = assay) %>%
        Seurat::FindNeighbors(dims = run_umap_dims,
                              reduction = reduction,
                              verbose = verbose,
                              assay = assay,
                              k.param = neighbor_k_param) %>%
        Seurat::FindClusters(resolution = resolution,
                             verbose = verbose,
                             graph.name = graph_name)
}

#' Use median and standard deviation to subset a Seurat object based on specific features
#'
#' @param sobject Seurat object
#' @param sd_down Number of standard deviations below the median to subset
#' @param sd_up Number of standard deviations above the median to subset
#' @param make_plots Whether to make plots of the features before and after subsetting
#' @param features Vector of features to use for subsetting
#' @param sample_name Name of sample to use in plot titles
#'
#' @return A Seurat object
#' @export
#'
#' @examples
#' \dontrun{
#' filtered <- auto_subset(SeuratObject::pbmc_small)
#' }
auto_subset <- function(sobject,
                        sd_down = 1,
                        sd_up = 2,
                        make_plots = TRUE,
                        features = c("nCount_RNA",
                                     "nFeature_RNA"),
                        sample_name = NULL) {

    cutoff_table <-
        sobject@meta.data %>%
        dplyr::select(dplyr::all_of(features)) %>%
        tidyr::pivot_longer(cols = dplyr::everything(),
                            names_to = "feature",
                            values_to = "value") %>%
        dplyr::group_by(feature) %>%
        dplyr::summarize(median_val = stats::median(value),
                         sd_val = stats::sd(value),
                         .groups = "drop") %>%
        dplyr::mutate(min_val = median_val - sd_down * sd_val,
                      max_val = median_val + sd_up * sd_val)

    if (make_plots) {
        print(feature_hist(sobject,
                           features = features,
                           cutoff_table = cutoff_table))
    }

    for (feature_name in features) {
        cutoffs <-
            cutoff_table %>%
            dplyr::filter(feature == feature_name)

        values <- Seurat::FetchData(object = sobject, vars = feature_name)
        sobject <-
            sobject[, which(x = values >= cutoffs$min_val[1] &
                                values <= cutoffs$max_val[1])]
    }

    # if (make_plots) {
    #     print(feature_hist(sobject,
    #                        features = features,
    #                        cutoff_table = cutoff_table))
    # }
    return(sobject)
}
kidcancerlab/rrrSingleCellUtils documentation built on April 17, 2025, 5:10 p.m.