R/prep_SO.R

Defines functions .var_feature_filter_removal prep_SO

Documented in prep_SO

#' Prepare a somewhat standardized Seurat Object for further analyses
#'
#'
#'
#' @param SO_unprocessed named list of split Seurat objects
#' @param samples names of SO_unprocessed that are to include; if missing all are used
#' @param cells vector of cell names to include; if missint all cells are used
#' @param min_cells min number of cells per object in SO_unprocessed; objects with lower cell number are excluded
#' @param downsample downsample a fraction of cells from each object in SO_unprocessed (downsample < 0);
#' or an absolute number of cells per object (downsample > 1 & downsample >= min_cells)
#' @param export_prefix prefix to the output rds file
#' @param reductions which reduction to calculate, tsne --> Seurat::RunTSNE, umap --> Seurat::RunUMAP
#' @param nhvf number high variables features, passed to Seurat::SCTransform
#' or Seurat::FindVariableFeatures or Seurat::SelectIntegrationFeatures
#' @param npcs number of principle components in calculate in pca and to
#' consider for downstream functions like tSNE or UMAP
#' @param normalization algorithm for normalization of UMIs
#' @param batch_corr procedure for batch correction between samples in SO_unprocessed;
#' only relevant if more than 1 sample is passed
#' @param vars.to.regress passed to Seurat::SCTransform or Seurat::ScaleData; will be applied
#' independent of what is set for batch_corr; if batch_corr is set to 'none', then only vars.to.regress is
#' used to regress out a variable in meta.data which may be sample-specific; other then that
#' it may not be meaningful to regress a sample-specific variable and perform batch_corr;
#' rather vars.to.regress may be used in combination with batch_corr to regress percent_mito or so
#' @param seeed seed passed to several methods
#' @param save_path folder to save the resulting Seurat object as rds-file to
#' @param celltype_refs list(prim_cell_atlas = celldex::HumanPrimaryCellAtlasData(), MonacoImmune = celldex::MonacoImmuneData())
#' @param celltype_label list of label names to use from the reference data set (one list entry per celltype_refs; each entry may contain a vector of labels)
#' @param celltype_ref_clusters use a clustering as basis for grouped celltype annotation with SingleR. This will
#' fundamentally speed up the calculation!
#' e.g. if SCT assay is used and cluster_resolutions includes 0.8 then pass: SCT_snn_res.0.8.
#' when integration procedure assay is used: integrated_snn_res.0.8. For RNA assay: RNA_snn_res.0.8.
#' @param diet_seurat logical whether to run Seurat::DietSeurat
#' @param verbose print messages and progress bars from functions
#' @param var_feature_filter character vector of features which are to exclude from variable features;
#' this will affect downstream PCA, dimension reduction and clustering;
#' e.g.: var_feature_filter = grep("^TR[ABGD]V", rownames(SOqc_split[[1]]), value = T)
#' @param hvf_determination_before_merge determine variable features before or after merging multiple samples;
#' either by the standard LogNormalize workflow or by SCTransform function; this is most important for
#' deciding whether SCtransform is run on mulitple samples separately before merging (set to TRUE) or
#' after merging (set to FALSE); in my experience and when the integration procedure is set to harmony setting
#' it to FALSE yields better results; see: https://github.com/hbctraining/scRNA-seq_online/blob/master/lessons/06a_integration_harmony.md and subsequent links
#' @param FindVariableFeatures_args
#' @param SCtransform_args
#' @param RunUMAP_args
#' @param RunTSNE_args
#' @param FindNeighbors_args
#' @param FindClusters_args
#' @param RunHarmony_args
#' @param SOM_args
#' @param GQTSOM_args
#' @param EmbedSOM_args
#' @param FindIntegrationAnchors_args
#' @param IntegrateData_args by default features.to.integrate = rownames(Seurat::GetAssayData(SO.list[[1]], assay = switch(normalization, SCT = "SCT", LogNormalize = "RNA")))
#' @param RunPCA_args
#' @param ...
#'
#' @return Seurat Object, as R object and saved to disk as rds file
#' @export
#'
#' @examples
#' \dontrun{
#' }
prep_SO <- function(SO_unprocessed,
                    samples = NULL,
                    cells = NULL,
                    min_cells = 50,
                    downsample = 1,
                    export_prefix = NULL,
                    reductions = c("umap", "som", "gqtsom", "tsne"),
                    nhvf = 800,
                    npcs = 20,
                    normalization = c("SCT", "LogNormalize"),
                    hvf_determination_before_merge = F,
                    batch_corr = c("harmony", "integration", "none"),
                    vars.to.regress = NULL,
                    seeed = 42,
                    save_path = NULL,
                    celltype_refs = NULL, # list of celldex::objects
                    celltype_label = "label.main",
                    celltype_ref_clusters = NULL,
                    diet_seurat = F,
                    var_feature_filter = NULL,
                    verbose = F,
                    FindVariableFeatures_args = list(),
                    SCtransform_args = list(vst.flavor = "v2", method = "glmGamPoi"),
                    RunPCA_args = list(),
                    RunUMAP_args = list(),
                    RunTSNE_args = list(theta = 0),
                    FindNeighbors_args = list(),
                    FindClusters_args = list(resolution = seq(0.1,1,0.1)),
                    RunHarmony_args = list(),
                    SOM_args = list(),
                    GQTSOM_args = list(),
                    EmbedSOM_args = list(),
                    FindIntegrationAnchors_args = list(reduction = "rpca"),
                    IntegrateData_args = list(),
                    ...) {

  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    utils::install.packages("BiocManager")
  }
  if (!requireNamespace("glmGamPoi", quietly = TRUE)) {
    BiocManager::install("glmGamPoi")
  }
  if (!requireNamespace("Gmisc", quietly = TRUE)) {
    utils::install.packages("Gmisc")
  }

  mydots <- list(...)

  options(warn = 1)
  if (is.null(save_path)) {
    message("No save_path to save Seurat objects to provided.")
  } else if (!is.character(save_path) || length(save_path) != 1) {
    stop("save_path has to be a character; a path to a folder where to save Seurat objects to.")
  }

  if (!is.null(celltype_refs)) {
    if (!is.list(celltype_refs)) {
      stop("celltype_refs should be a named list of reference data sets from celldex or similar.")
    }
    if (is.null(names(celltype_refs))) {
      stop("Provide names for celltype_refs.")
    }
    if (any(duplicated(names(celltype_refs)))) {
      stop("names for celltype_refs must be unique.")
    }
    if (length(celltype_label) == 1) {
      celltype_label <- rep(celltype_label, length(celltype_refs))
    }
    if (length(celltype_label) != length(celltype_refs)) {
      stop("celltype_label and celltype_refs must have the same lengths. Please choose one label for each ref.")
    }
    for (i in seq_along(celltype_refs)) {
      if (any(!celltype_label[[i]] %in% names(celltype_refs[[i]]@colData@listData))) {
        stop(paste0(paste(celltype_label[[i]][which(!celltype_label[[i]] %in% names(celltype_refs[[i]]@colData@listData))], collapse = ", "), " not found in ", names(celltype_refs)[i], "."))
      }
    }

    if (!requireNamespace("SingleR", quietly = T)) {
      BiocManager::install("SingleR")
    }
  }

  reductions <- match.arg(tolower(reductions), c("tsne", "umap", "som", "gqtsom"), several.ok = T)
  normalization <- match.arg(normalization, c("SCT", "LogNormalize"))
  batch_corr <- match.arg(batch_corr, c("harmony", "integration", "none"))

  # actually only very few arguments are allowed to be passed by RunPCA_args. Otherwise the function would break.
  if ("npcs" %in% names(RunPCA_args)) {
    stop("Please do not pass npcs in RunPCA_args. Use the npcs argument.")
  }
  if ("seed.use" %in% names(RunPCA_args)) {
    stop("Please do not pass seed.use in RunPCA_args. Use the seeed argument.")
  }
  if ("verbose" %in% names(RunPCA_args)) {
    stop("Please do not pass verbose in RunPCA_args. Use the verbose argument.")
  }
  if ("reduction.name" %in% names(RunPCA_args)) {
    stop("Please do not pass reduction.name in RunPCA_args.")
  }
  if ("reduction.key" %in% names(RunPCA_args)) {
    stop("Please do not pass reduction.key in RunPCA_args.")
  }
  if ("assay" %in% names(RunPCA_args)) {
    stop("Please do not pass assay in RunPCA_args.")
  }

  if (!is.null(celltype_ref_clusters)) {
    #check if celltype_ref_clusters can exist
    pref <- ifelse(batch_corr == "integration", "integrated", ifelse(normalization == "SCT", "SCT", "RNA"))
    mid <- "_snn_res."
    if (length(FindClusters_args[["resolution"]] > 0)) {
      suf <- gsub("\\.0$", "", FindClusters_args[["resolution"]])
    } else {
      suf <- "0.8"
    }
    candidates <- paste0(pref,mid,suf)
    if (!celltype_ref_clusters %in% candidates) {
      stop("celltype_ref_clusters not found in candidates based on batch_corr, normalization and cluster_resolutions: ", paste(candidates, collapse = ", "))
    }
  }

  if (methods::is(SO_unprocessed, "list")) {
    SO.list <- SO_unprocessed
  } else if (methods::is(SO_unprocessed, "character")) {
    if (!file.exists(SO_unprocessed)) {
      stop(paste0(SO_unprocessed, "not found."))
    } else {
      if (!grepl("\\.rds$", SO_unprocessed, ignore.case = T)) {
        stop("SO_unprocessed has to be an .rds file.")
      }
      SO.list <- readRDS(SO_unprocessed)
    }
  } else {
    stop("SO_unprocessed has to be named list of splitted Seurat objects or a path (character) to an .rds file of those. If it is only one Seurat object (one sample)
         make it a list of length 1.")
  }

  if (methods::is(SO.list, "Seurat")) {
    # if only one Seurat object is provided
    SO.list <- list(SO.list)
    names(SO.list) <- "sample"
  }

  if (is.null(names(SO.list))) {
    stop("SO_unprocessed has no names.")
  }

  if (downsample > 1 && downsample < min_cells) {
    message("downsample set to min_cells.")
    downsample <- min_cells
  }

  if (is.null(samples)) {
    samples <- names(SO.list)
  } else {
    if (any(!samples %in% names(SO.list))) {
      message("samples not found in SO_unprocessed: ", samples[which(!samples %in% names(SO.list))])
    }

  }
  samples <- names(SO.list)[which(grepl(paste(samples, collapse = "|"), names(SO.list)))]
  SO.list <- SO.list[which(names(SO.list) %in% samples)]

  if (length(SO.list) > 1 && batch_corr == "harmony" && !"group.by.vars" %in% names(RunHarmony_args)) {
    stop("Please provide one or more group.by.vars from meta.data in RunHarmony_args as a list: ", paste(names(SO.list[[1]]@meta.data), collapse = ", "), ".")
  }

  if (!is.null(cells)) {
    SO.list <- lapply(SO.list, function(x) {
      inds <- which(Seurat::Cells(x) %in% cells)
      if (length(inds) == 0) {
        return(NULL)
      }
      return(subset(x, cells = Seurat::Cells(x)[which(Seurat::Cells(x) %in% cells)]))
    })
  }
  if (any(sapply(SO.list, is.null))) {
    message("No cells found for: ", paste(names(SO.list)[which(sapply(SO.list, is.null))], collapse = ", "))
    SO.list <- SO.list[which(!sapply(SO.list, is.null))]
    if (length(SO.list) == 0) {
      stop("No Seurat objects left after filtering for cells.")
    }
  }

  ## use downsample method from Seurat dev package?!
  if (downsample < 1) {
    SO.list <- lapply(SO.list, function(x) subset(x, cells = sample(Seurat::Cells(x), as.integer(downsample*length(Seurat::Cells(x))), replace = FALSE)))
  } else if (downsample > 1) {
    SO.list <- lapply(SO.list, function(x) subset(x, cells = sample(Seurat::Cells(x), downsample, replace = FALSE)))
  }

  # remove samples with insufficient number of cells
  rm.nm <- names(SO.list[which(sapply(SO.list, function(x){length(Seurat::Cells(x)) < min_cells}))])
  if (length(rm.nm) > 0) {
    SO.list <- SO.list[which(!names(SO.list) %in% rm.nm)]
    message("samples removed due to min.cells: ", paste(rm.nm, collapse = ","))
  }

  message("Samples included (", length(SO.list), "): ", paste(names(SO.list), collapse=", "))

  if (length(SO.list) == 1) {
    batch_corr <- "none"
  }

  # prep SCtransform args once for all
  SCtransform_args <- SCtransform_args[which(!names(SCtransform_args) %in% c("object", "assay", "new.assay.name", "seed.use", "verbose"))]
  if (!"variable.features.n" %in% names(SCtransform_args)) {
    SCtransform_args <- c(SCtransform_args, list(variable.features.n = nhvf))
  }
  if (!"vst.flavor" %in% names(SCtransform_args)) {
    SCtransform_args <- c(SCtransform_args, list(vst.flavor = "v2"))
  }
  if (!"method" %in% names(SCtransform_args)) {
    SCtransform_args <- c(SCtransform_args, list(method = "glmGamPoi"))
  }
  if (!"vars.to.regress" %in% names(SCtransform_args)) {
    SCtransform_args <- c(SCtransform_args, list(vars.to.regress = vars.to.regress))
  }

  # prep FindVariableFeatures args once for all
  FindVariableFeatures_args <- FindVariableFeatures_args[which(!names(FindVariableFeatures_args) %in% c("object", "assay", "verbose"))]
  if (!"nfeatures" %in% names(FindVariableFeatures_args)) {
    FindVariableFeatures_args <- c(FindVariableFeatures_args, list(nfeatures = nhvf))
  }

  # cases:
  if (length(SO.list) == 1) {
    SO <- SO.list[[1]]
    if (normalization == "SCT") {
      SO <- Gmisc::fastDoCall(Seurat::SCTransform, args = c(list(object = SO,
                                                                 seed.use = seeed,
                                                                 verbose = verbose),
                                                            SCtransform_args))
      #SO <- Seurat::SCTransform(SO, variable.features.n = nhvf, vars.to.regress = vars.to.regress, verbose = verbose, assay = "RNA", vst.flavor = "v2", method = "glmGamPoi", seed.use = seeed)

      # remove var features which are to filter
      if (!is.null(var_feature_filter)) {
        SO <- .var_feature_filter_removal(SO = SO,
                                          var_feature_filter = var_feature_filter,
                                          normalization = normalization,
                                          nhvf = nhvf,
                                          vars.to.regress = vars.to.regress,
                                          seeed = seeed,
                                          verbose = verbose,
                                          SCtransform_args = SCtransform_args,
                                          FindVariableFeatures_args = FindVariableFeatures_args)
      }

      SO <- Seurat::NormalizeData(SO, verbose = verbose, assay = "RNA")
      SO <- Seurat::ScaleData(SO, assay = "RNA", verbose = verbose)
    } else if (normalization == "LogNormalize") {
      SO <- Seurat::NormalizeData(SO, verbose = verbose, assay = "RNA")
      SO <- Gmisc::fastDoCall(Seurat::FindVariableFeatures, args = c(list(object = SO,
                                                                          assay = "RNA",
                                                                          verbose = verbose),
                                                                     FindVariableFeatures_args))
      #SO <- Seurat::FindVariableFeatures(SO, selection.method = "vst", nfeatures = nhvf, verbose = verbose, assay = "RNA")
      if (!is.null(var_feature_filter)) {
        SO <- .var_feature_filter_removal(SO = SO,
                                          var_feature_filter = var_feature_filter,
                                          normalization = normalization,
                                          nhvf = nhvf,
                                          vars.to.regress = vars.to.regress,
                                          seeed = seeed,
                                          verbose = verbose,
                                          SCtransform_args = SCtransform_args,
                                          FindVariableFeatures_args = FindVariableFeatures_args)
      }
      SO <- Seurat::ScaleData(SO, assay = "RNA", verbose = verbose)
    }
    SO <- Gmisc::fastDoCall(Seurat::RunPCA, args = c(list(object = SO,
                                                          npcs = npcs,
                                                          seed.use = seeed,
                                                          verbose = verbose),
                                                     RunPCA_args))
    SO <- Seurat::ProjectDim(SO, reduction = "pca", do.center = T, overwrite = F, verbose = verbose)
  }

  if (length(SO.list) > 1) {
    if (batch_corr %in% c("none", "harmony")) {

      ### run SCT separately on unmerged SOs?
      # https://hbctraining.github.io/scRNA-seq_online/lessons/06a_integration_harmony.html
      # https://github.com/immunogenomics/harmony/issues/41
      # https://github.com/satijalab/sctransform/issues/55#issuecomment-633843730
      ## also see: https://satijalab.org/seurat/articles/sctransform_v2_vignette.html

      if (normalization == "SCT") {
        if (hvf_determination_before_merge) {

          SO.list <- lapply(SO.list, function(x) {
            Gmisc::fastDoCall(Seurat::SCTransform, args = c(list(object = x,
                                                                 seed.use = seeed,
                                                                 verbose = verbose),
                                                            SCtransform_args))
          })
          # SO.list <- lapply(SO.list, FUN = Seurat::SCTransform, assay = "RNA", variable.features.n = nhvf, vars.to.regress = vars.to.regress, seed.use = seeed, vst.flavor = "v2", method = "glmGamPoi", verbose = verbose)

          if (!is.null(var_feature_filter)) {
            SO.list <- lapply(SO.list,
                              FUN = .var_feature_filter_removal,
                              var_feature_filter = var_feature_filter,
                              normalization = normalization,
                              nhvf = nhvf,
                              vars.to.regress = vars.to.regress,
                              seeed = seeed,
                              verbose = verbose,
                              SCtransform_args = SCtransform_args,
                              FindVariableFeatures_args = FindVariableFeatures_args)
          }

          anchor_features <- Seurat::SelectIntegrationFeatures(SO.list,
                                                               assay = rep("SCT", length(SO.list)),
                                                               nfeatures = nhvf,
                                                               fvf.nfeatures = nhvf)

          SO <- merge(x = SO.list[[1]], y = SO.list[2:length(SO.list)], merge.data = T)
          Seurat::DefaultAssay(SO) <- "SCT"
          Seurat::VariableFeatures(SO) <- anchor_features
        } else {
          SO <- merge(x = SO.list[[1]], y = SO.list[2:length(SO.list)], merge.data = T)
          SO <- Gmisc::fastDoCall(Seurat::SCTransform, args = c(list(object = SO,
                                                                     seed.use = seeed,
                                                                     verbose = verbose),
                                                                SCtransform_args))
          #SO <- Seurat::SCTransform(SO, assay = "RNA", variable.features.n = nhvf, vars.to.regress = vars.to.regress, seed.use = seeed, vst.flavor = "v2", method = "glmGamPoi", verbose = verbose)

          # remove var features which are to filter
          if (!is.null(var_feature_filter)) {
            SO <- .var_feature_filter_removal(SO = SO,
                                              var_feature_filter = var_feature_filter,
                                              normalization = normalization,
                                              nhvf = nhvf,
                                              vars.to.regress = vars.to.regress,
                                              seeed = seeed,
                                              verbose = verbose,
                                              SCtransform_args = SCtransform_args,
                                              FindVariableFeatures_args = FindVariableFeatures_args)
          }
        }
        SO <- Seurat::NormalizeData(SO, assay = "RNA", verbose = verbose)
        SO <- Seurat::ScaleData(SO, assay = "RNA", verbose = verbose)

      } else if (normalization == "LogNormalize") {

        if (hvf_determination_before_merge) {
          SO.list <- lapply(SO.list, FUN = Seurat::NormalizeData, assay = "RNA", verbose = verbose)

          SO.list <- lapply(SO.list, function(x) {
            Gmisc::fastDoCall(Seurat::FindVariableFeatures, args = c(list(object = x,
                                                                          assay = "RNA",
                                                                          verbose = verbose),
                                                                     FindVariableFeatures_args))
          })
          #SO.list <- lapply(SO.list, FUN = Seurat::FindVariableFeatures, assay = "RNA", selection.method = "vst", nfeatures = nhvf, verbose = verbose)

          if (!is.null(var_feature_filter)) {
            SO.list <- lapply(SO.list,
                              FUN = .var_feature_filter_removal,
                              var_feature_filter = var_feature_filter,
                              normalization = normalization,
                              nhvf = nhvf,
                              vars.to.regress = vars.to.regress,
                              seeed = seeed,
                              verbose = verbose,
                              SCtransform_args = SCtransform_args,
                              FindVariableFeatures_args = FindVariableFeatures_args)
          }
          anchor_features <- Seurat::SelectIntegrationFeatures(SO.list,
                                                               assay = rep("RNA", length(SO.list)),
                                                               nfeatures = nhvf,
                                                               verbose = verbose)
          SO <- merge(x = SO.list[[1]], y = SO.list[2:length(SO.list)], merge.data = T)
          Seurat::VariableFeatures(SO) <- anchor_features
        } else {
          SO <- merge(x = SO.list[[1]], y = SO.list[2:length(SO.list)], merge.data = T)
          SO <- Seurat::NormalizeData(SO, assay = "RNA", verbose = verbose)
          SO <- Gmisc::fastDoCall(Seurat::FindVariableFeatures, args = c(list(object = SO,
                                                                              assay = "RNA",
                                                                              verbose = verbose),
                                                                         FindVariableFeatures_args))
          #SO <- Seurat::FindVariableFeatures(SO, assay = "RNA", selection.method = "vst", nfeatures = nhvf, verbose = verbose)
          if (!is.null(var_feature_filter)) {
            SO <- .var_feature_filter_removal(SO = SO,
                                              var_feature_filter = var_feature_filter,
                                              normalization = normalization,
                                              nhvf = nhvf,
                                              vars.to.regress = vars.to.regress,
                                              seeed = seeed,
                                              verbose = verbose,
                                              SCtransform_args = SCtransform_args,
                                              FindVariableFeatures_args = FindVariableFeatures_args)
          }
        }
        SO <- Seurat::ScaleData(SO, vars.to.regress = vars.to.regress, verbose = verbose)
      }

      SO <- Gmisc::fastDoCall(Seurat::RunPCA, args = c(list(object = SO,
                                                            npcs = npcs,
                                                            seed.use = seeed,
                                                            verbose = verbose),
                                                       RunPCA_args))
      SO <- Seurat::ProjectDim(SO, reduction = "pca", do.center = T, overwrite = F, verbose = verbose)

      if (batch_corr == "harmony") {
        if (!requireNamespace("harmony", quietly = T)) {
          utils::install.packages("harmony")
        }
        RunHarmony_args <- RunHarmony_args[which(!names(RunHarmony_args) %in% c("object", "assay.use", "verbose"))]
        SO <- Gmisc::fastDoCall(harmony::RunHarmony, args = c(list(object = SO,
                                                                   assay.use = switch(normalization, SCT = "SCT", LogNormalize = "RNA")),
                                                              RunHarmony_args))
      }
    }

    if (batch_corr == "integration") {
      k.filter <- as.integer(min(200, min(sapply(SO.list, ncol))/2))
      k.score <- as.integer(min(30, min(sapply(SO.list, ncol))/6))

      if (normalization == "SCT") {
        SO.list <- lapply(SO.list, function(x) {
          Gmisc::fastDoCall(Seurat::SCTransform, args = c(list(object = x,
                                                               seed.use = seeed,
                                                               verbose = verbose),
                                                          SCtransform_args))
        })
        # SO.list <- lapply(SO.list, FUN = Seurat::SCTransform, assay = "RNA", variable.features.n = nhvf, vars.to.regress = vars.to.regress, seed.use = seeed, vst.flavor = "v2", method = "glmGamPoi", verbose = verbose)

        # remove var features which are to filter
        if (!is.null(var_feature_filter)) {
          SO.list <- lapply(SO.list,
                            FUN = .var_feature_filter_removal,
                            var_feature_filter = var_feature_filter,
                            normalization = normalization,
                            nhvf = nhvf,
                            vars.to.regress = vars.to.regress,
                            seeed = seeed,
                            verbose = verbose,
                            SCtransform_args = SCtransform_args,
                            FindVariableFeatures_args = FindVariableFeatures_args)
        }

        anchor_features <- Seurat::SelectIntegrationFeatures(SO.list,
                                                             verbose = verbose,
                                                             nfeatures = nhvf)
        SO.list <- Seurat::PrepSCTIntegration(object.list = SO.list,
                                              anchor.features = anchor_features,
                                              verbose = verbose)
      } else {
        SO.list <- lapply(SO.list,
                          FUN = Seurat::NormalizeData,
                          assay = "RNA",
                          verbose = verbose)
        SO.list <- lapply(SO.list, function(x) {
          Gmisc::fastDoCall(Seurat::FindVariableFeatures, args = c(list(object = x,
                                                                        assay = "RNA",
                                                                        verbose = verbose),
                                                                   FindVariableFeatures_args))
        })
        #SO.list <- lapply(SO.list, FUN = Seurat::FindVariableFeatures, assay = "RNA", selection.method = "vst", nfeatures = nhvf, verbose = verbose)
        anchor_features <- Seurat::SelectIntegrationFeatures(SO.list,
                                                             nfeatures = nhvf,
                                                             verbose = verbose)
      }

      FindIntegrationAnchors_args <- FindIntegrationAnchors[which(!names(FindIntegrationAnchors) %in% c("object.list", "normalization.method", "verbose"))]
      if (!"anchor.features" %in% names(FindIntegrationAnchors_args)) {
        FindIntegrationAnchors_args <- c(list(anchor_features = anchor_features), FindIntegrationAnchors_args)
      }
      if (!"k.filter" %in% names(FindIntegrationAnchors_args)) {
        FindIntegrationAnchors_args <- c(list(k.filter = k.filter), FindIntegrationAnchors_args)
      }
      if (!"k.score" %in% names(FindIntegrationAnchors_args)) {
        FindIntegrationAnchors_args <- c(list(k.score = k.score), FindIntegrationAnchors_args)
      }
      if (!"reduction" %in% names(FindIntegrationAnchors_args)) {
        FindIntegrationAnchors_args <- c(list(reduction = "rpca"), FindIntegrationAnchors_args)
      }

      if (FindIntegrationAnchors_args[["reduction"]] == "rpca") {
        if (normalization == "SCT") {
          ## does passing RunPCA_args like this cause an error?
          SO.list <- lapply(SO.list,
                            FUN = Seurat::RunPCA,
                            assay = "SCT",
                            npcs = npcs,
                            seed.use = seeed,
                            features = anchor_features,
                            verbose = verbose,
                            unlist(RunPCA_args))
        }
      } else {
        SO.list <- lapply(SO.list, function(x) {
          x <- Seurat::ScaleData(x, features = anchor_features, verbose = verbose)
          x <- Gmisc::fastDoCall(Seurat::RunPCA, args = c(list(object = x,
                                                               npcs = npcs,
                                                               seed.use = seeed,
                                                               verbose = verbose),
                                                          RunPCA_args))
          return(x)
        })
      }

      anchorset <- Gmisc::fastDoCall(Seurat::FindIntegrationAnchors, args = c(list(object.list = SO.list,
                                                                                   normalization.method = normalization,
                                                                                   verbose = verbose),
                                                                              FindIntegrationAnchors_args))

      IntegrateData_args <- IntegrateData[which(!names(IntegrateData) %in% c("anchorset", "new.assay.name", "normalization.method", "verbose"))]
      if (!"features.to.integrate" %in% names(IntegrateData_args)) {
        IntegrateData_args <- c(list(features.to.integrate = rownames(Seurat::GetAssayData(SO.list[[1]], assay = switch(normalization, SCT = "SCT", LogNormalize = "RNA")))), IntegrateData_args)
      }
      if (!"k.weight" %in% names(IntegrateData_args)) {
        IntegrateData_args <- c(list(k.weight = k.filter), IntegrateData_args)
      }
      SO <- Gmisc::fastDoCall(Seurat::IntegrateData, args = c(list(anchorset = anchorset,
                                                                   normalization.method = normalization,
                                                                   verbose = verbose),
                                                              IntegrateData_args))

      Seurat::DefaultAssay(SO) <- "integrated"
      Seurat::VariableFeatures(SO) <- anchor_features

      if (normalization == "SCT" && (hvf_determination_before_merge || batch_corr == "integration")) {
        ## see https://satijalab.org/seurat/articles/sctransform_v2_vignette.html
        SO <- Seurat::PrepSCTFindMarkers(SO,
                                         assay = "SCT",
                                         verbose = verbose)
        message("If running on a subset of the original object after running PrepSCTFindMarkers(), FindMarkers() should be invoked with recorrect_umi = FALSE.")
      }

      ## independent of SCT or LogNormalize:
      SO <- Seurat::ScaleData(SO, assay = "RNA", verbose = verbose) # just for completeness
      SO <- Seurat::ScaleData(SO, assay = "integrated", verbose = verbose) # see https://satijalab.org/seurat/articles/integration_introduction.html

      SO <- Gmisc::fastDoCall(Seurat::RunPCA, args = c(list(object = SO,
                                                            npcs = npcs,
                                                            seed.use = seeed,
                                                            verbose = verbose),
                                                       RunPCA_args))
      SO <- Seurat::ProjectDim(SO, reduction = "pca", do.center = T, overwrite = F, verbose = verbose)
    }
  }

  ### do.call on large SeuratObject became super slow, not practicable!
  # https://stackoverflow.com/questions/28198103/alternative-to-do-call-for-large-datasets

  # alternatives:
  # https://rlang.r-lib.org/reference/exec.html
  # gmisc::fastDoCall

  red <- switch(batch_corr, harmony = "harmony", integration = "pca", none = "pca")
  if (any(grepl("umap", reductions, ignore.case = T))) {
    RunUMAP_args <- RunUMAP_args[which(!names(RunUMAP_args) %in% c("object", "seed.use", "reduction", "verbose"))]
    if (!"dims" %in% names(RunUMAP_args)) {
      RunUMAP_args <- c(list(dims = 1:npcs), RunUMAP_args)
    }
    SO <- Gmisc::fastDoCall(Seurat::RunUMAP, args = c(list(object = SO,
                                                           reduction = red,
                                                           seed.use = seeed,
                                                           verbose = verbose),
                                                      RunUMAP_args))
    #SO <- Seurat::RunUMAP(object = SO, umap.method = "uwot", metric = "cosine", dims = 1:npcs, seed.use = seeed, reduction = red, verbose = verbose, ...)
  }

  if (any(grepl("tsne", reductions, ignore.case = T))) {
    RunTSNE_args <- RunTSNE_args[which(!names(RunTSNE_args) %in% c("object", "seed.use", "reduction", "verbose"))]
    if (!"dims" %in% names(RunTSNE_args)) {
      RunTSNE_args <- c(list(dims = 1:npcs), RunTSNE_args)
    }
    if (!"num_threads" %in% names(RunTSNE_args)) {
      RunTSNE_args <- c(list(num_threads = 0), RunTSNE_args)
    }
    SO <- Gmisc::fastDoCall(Seurat::RunTSNE, args = c(list(object = SO,
                                                           reduction = red,
                                                           seed.use = seeed,
                                                           verbose = verbose),
                                                      RunTSNE_args))
    #SO <- Seurat::RunTSNE(object = SO, dims = 1:npcs, seed.use = seeed, reduction = red, verbose = verbose, num_threads = 0, ...)
  }

  if (any(grepl("^som$", reductions, ignore.case = T))) {
    if (!requireNamespace("devtools", quietly = T)) {
      utils::install.packages("devtools")
    }
    if (!requireNamespace("EmbedSOM", quietly = T)) {
      devtools::install_github("exaexa/EmbedSOM")
    }

    SOM_args <- SOM_args[which(!names(SOM_args) %in% c("data"))]
    map <- Gmisc::fastDoCall(EmbedSOM::SOM, args = c(list(data = SO@reductions[[red]]@cell.embeddings)), SOM_args)
    EmbedSOM_args <- EmbedSOM_args[which(!names(EmbedSOM_args) %in% c("data", "map"))]
    ES <- Gmisc::fastDoCall(EmbedSOM::EmbedSOM_args, args = c(list(data = SO@reductions[[red]]@cell.embeddings, map = map)), EmbedSOM_args)
    SO[["SOM"]] <- Seurat::CreateDimReducObject(embeddings = ES, key = "SOM_", assay = switch(normalization, SCT = "SCT", LogNormalize = "RNA"), misc = list(SOM_args, EmbedSOM_args))
  }

  if (any(grepl("^gqtsom$", reductions, ignore.case = T))) {
    if (!requireNamespace("devtools", quietly = T)) {
      utils::install.packages("devtools")
    }
    if (!requireNamespace("EmbedSOM", quietly = T)) {
      devtools::install_github("exaexa/EmbedSOM")
    }

    GQTSOM_args <- GQTSOM_args[which(!names(GQTSOM_args) %in% c("data"))]
    map <- Gmisc::fastDoCall(EmbedSOM::GQTSOM, args = c(list(data = SO@reductions[[red]]@cell.embeddings)), GQTSOM_args)
    EmbedSOM_args <- EmbedSOM_args[which(!names(EmbedSOM_args) %in% c("data", "map"))]
    ES <- Gmisc::fastDoCall(EmbedSOM::EmbedSOM_args, args = c(list(data = SO@reductions[[red]]@cell.embeddings, map = map)), EmbedSOM_args)
    SO[["GQTSOM"]] <- Seurat::CreateDimReducObject(embeddings = ES, key = "GQTSOM_", assay = switch(normalization, SCT = "SCT", LogNormalize = "RNA"), misc = list(GQTSOM_args, EmbedSOM_args))
  }

  FindNeighbors_args <- FindNeighbors_args[which(!names(FindNeighbors_args) %in% c("object", "reduction", "verbose"))]
  if (!"dims" %in% names(FindNeighbors_args)) {
    FindNeighbors_args <- c(list(dims = 1:npcs), FindNeighbors_args)
  }
  SO <- Gmisc::fastDoCall(Seurat::FindNeighbors, args = c(list(object = SO,
                                                               reduction = red,
                                                               verbose = verbose),
                                                          FindNeighbors_args))
  FindClusters_args <- FindClusters_args[which(!names(FindClusters_args) %in% c("object", "verbose"))]
  SO <- Gmisc::fastDoCall(Seurat::FindClusters, args = c(list(object = SO,
                                                              verbose = verbose),
                                                         FindClusters_args))


  #SO <- Seurat::FindNeighbors(object = SO, reduction = red, dims = 1:npcs, verbose = verbose, ...)
  #SO <- Seurat::FindClusters(object = SO, resolution = cluster_resolutions, verbose = verbose, ...)

  if (!is.null(celltype_refs) && !is.null(celltype_ref_clusters) && !celltype_ref_clusters %in% names(SO@meta.data)) {
    message("celltype_ref_clusters not found in SO meta data. Will be set to NULL and SingleR will operate on single cell level.")
    celltype_ref_clusters <- NULL
  }

  if (is.null(celltype_ref_clusters)) {
    refs <- NULL
  } else {
    refs <- SO@meta.data[,celltype_ref_clusters]
  }

  for (i in seq_along(celltype_refs)) {
    for (j in seq_along(celltype_label[[i]])) {
      celltypes <- SingleR::SingleR(test = Seurat::GetAssayData(SO, slot = "data", assay = "RNA"),
                                    ref = celltype_refs[[i]],
                                    labels = celltype_refs[[i]]@colData@listData[[celltype_label[[i]][j]]],
                                    clusters = refs)

      if (is.null(celltype_ref_clusters)) {
        SO@meta.data[,paste0(names(celltype_refs)[i], "__", celltype_label[[i]][j])] <- celltypes$labels
      } else {
        celltypes_df <- utils::stack(stats::setNames(celltypes$labels, levels(SO@meta.data[,celltype_ref_clusters])))
        names(celltypes_df) <- c(paste0(names(celltype_refs)[i], "__", celltype_label[[i]][j]), celltype_ref_clusters)
        celltypes_df <- tibble::column_to_rownames(dplyr::left_join(tibble::rownames_to_column(SO@meta.data[,celltype_ref_clusters,drop=F], "ID"), celltypes_df, by = celltype_ref_clusters), "ID")
        SO <- Seurat::AddMetaData(SO, celltypes_df[-which(names(celltypes_df) == celltype_ref_clusters)])
      }
      Seurat::Misc(SO, paste0(names(celltype_refs)[i], "__", celltype_label[[i]][j])) <- celltypes
    }
  }


  # remove counts as they can be recalculated with rev_lognorm
  if (diet_seurat) {
    Seurat::Misc(SO, slot = "RNA_count_colSums") <- unname(Matrix::colSums(Seurat::GetAssayData(SO, slot = "counts", assay = "RNA")))
    SO <- Seurat::DietSeurat(SO, assays = names(SO@assays), counts = F, dimreducs = names(SO@reductions))
  }

  save.time <- format(as.POSIXct(Sys.time(), format = "%d-%b-%Y-%H:%M:%S"), "%y%m%d-%H%M%S")
  save.name <- paste("SO", export_prefix, normalization, batch_corr, downsample, nhvf, npcs, paste0(save.time, ".rds"), sep = "_")
  Seurat::Misc(SO, "object_name") <- save.name

  if (!is.null(save_path)) {
    dir.create(save_path, showWarnings = F, recursive = T)
    saveRDS(SO, compress = T, file = file.path(save_path, save.name))
    message(paste0("SO saved to: ", save_path, " as ", save.name, "."))
  }

  return(SO)
}

.var_feature_filter_removal <- function(SO,
                                        var_feature_filter,
                                        max_rounds = 5,
                                        normalization,
                                        nhvf,
                                        vars.to.regress,
                                        seeed,
                                        verbose,
                                        SCtransform_args,
                                        FindVariableFeatures_args) {

  ## rerun SCTransform (or FindVariableFeatures) until nhvf is met while filtering for var_feature_filter
  ## max iterations is 5; maybe print some messages about progress
  ## filtering for var_feature_filter will affect PCA and all other downstream calculation which are based on PCA

  ## intention: filter TCR/BCR chain genes
  ## alternatively one could also regress them out?!

  if (any(!var_feature_filter %in% rownames(SO))) {
    ## print message?!
    var_feature_filter <- var_feature_filter[which(var_feature_filter %in% rownames(SO))]
    if (length(var_feature_filter) == 0) {
      return(SO)
    }
  }

  n <- 1
  while (any(Seurat::VariableFeatures(SO) %in% var_feature_filter) &&
         length(Seurat::VariableFeatures(SO)[which(!Seurat::VariableFeatures(SO) %in% var_feature_filter)]) < nhvf &&
         n <= max_rounds) {
    message(sum(Seurat::VariableFeatures(SO) %in% var_feature_filter), " of var_feature_filter found in Variable Features. Round ", n, " of increasing nhvf to ", nhvf + length(intersect(Seurat::VariableFeatures(SO), var_feature_filter)), " so that var_feature_filter can be removed while nhvf = " , nhvf, " is met.")

    if (normalization == "SCT") {
      SCtransform_args[which(names(SCtransform_args) == "variable.features.n")] <- SCtransform_args[["variable.features.n"]] + length(intersect(Seurat::VariableFeatures(SO), var_feature_filter))
      SO <- Gmisc::fastDoCall(Seurat::SCTransform, args = c(list(object = SO,
                                                                 seed.use = seeed,
                                                                 verbose = verbose),
                                                            SCtransform_args))

      #SO <- Seurat::SCTransform(SO, assay = "RNA", vst.flavor = "v2", method = "glmGamPoi", variable.features.n = nhvf + length(intersect(Seurat::VariableFeatures(SO), var_feature_filter)), vars.to.regress = vars.to.regress, seed.use = seeed, verbose = verbose)
    }
    if (normalization == "LogNormalize") {
      FindVariableFeatures_args[which(names(FindVariableFeatures_args) == "nfeatures")] <- FindVariableFeatures_args[["nfeatures"]] + length(intersect(Seurat::VariableFeatures(SO), var_feature_filter))
      SO <- Gmisc::fastDoCall(Seurat::FindVariableFeatures, args = c(list(object = SO,
                                                                          assay = "RNA",
                                                                          verbose = verbose),
                                                                     FindVariableFeatures_args))
      #SO <- Seurat::FindVariableFeatures(SO, selection.method = "vst", nfeatures = nhvf + length(intersect(Seurat::VariableFeatures(SO), var_feature_filter)), verbose = verbose, assay = "RNA")
    }
    n <- n + 1
  }
  Seurat::VariableFeatures(SO) <- Seurat::VariableFeatures(SO)[which(!Seurat::VariableFeatures(SO) %in% var_feature_filter)]

  return(SO)
}
Close-your-eyes/scexpr documentation built on April 21, 2023, 10:27 a.m.