R/seuratPreprocessing.r

Defines functions pipeline.cellcycleProcessing pipeline.seuratPreprocessing

pipeline.seuratPreprocessing <- function(env)
{
  env$seuratObject[["percent.mt"]] <- PercentageFeatureSet(object = env$seuratObject, assay = "RNA", pattern = "^MT-")

  # Filtering
  # 1) Low-quality cells where more than 15% of the read counts come from mitochondrial genome
  # 2) Cells with more than 7000 detected genes

  dropped.cols = which(env$seuratObject$percent.mt >= 15 | env$seuratObject$nFeature_RNA >= 7000)

  env$seuratObject <- subset(env$seuratObject, subset = nFeature_RNA < 7000 & percent.mt < 15)

  if (length(dropped.cols) > 0)
  {
    env$group.labels <- env$group.labels[-dropped.cols]
    env$group.colors <- env$group.colors[-dropped.cols]
    util.warn("Filtered",length(dropped.cols),"cells from data set in preprocessing.")
  }

  # Normalization & reduction
  env$seuratObject <- NormalizeData(env$seuratObject, assay = 'RNA', normalization.method = "LogNormalize")
  env$seuratObject <- FindVariableFeatures(env$seuratObject, selection.method = "vst", assay = 'RNA')
  if( length(unique(env$seuratObject$orig.ident)) > 2 )
  {
    env$seuratObject <- ScaleData(env$seuratObject, model.use = 'linear', vars.to.regress = 'orig.ident')
  } else
  {
    env$seuratObject <- ScaleData(env$seuratObject, model.use = 'linear' )
  }

  env$seuratObject <- RunPCA(env$seuratObject, npcs = min(ncol(env$seuratObject)-1, 100))
  env$seuratObject <- RunTSNE(env$seuratObject, reduction = "pca", assay = 'RNA', dims = length( env$seuratObject$pca ), perplexity = ifelse( ncol(env$seuratObject)>1000, 5, 3 ) )
  env$seuratObject <- RunUMAP(env$seuratObject, reduction = "pca", assay = 'RNA', dims = rep(length( env$seuratObject$pca),2), n.neighbors = min(ncol(env$seuratObject)-1, 30 )  )

  # primary cell clustering
  env$seuratObject <- FindNeighbors(env$seuratObject, assay = 'RNA',  dims = rep( length( env$seuratObject$pca ), 2 ) )
  env$seuratObject <- FindClusters(env$seuratObject, resolution = 1)

  return(env)
}


pipeline.cellcycleProcessing <- function(env)
{
  if( any(row.names(env$seuratObject) %in% Seurat::cc.genes.updated.2019$g2m.genes) && any(row.names(env$seuratObject) %in% Seurat::cc.genes.updated.2019$s.genes) ){
    marker <- list()
    marker$S <- intersect( Seurat::cc.genes.updated.2019$s.genes, rownames(env$seuratObject) )
    marker$G2M <- intersect( Seurat::cc.genes.updated.2019$g2m.genes, rownames(env$seuratObject) )
  }
  else
  {
    if (!biomart.available(env))
    {
      util.warn("Requested biomaRt host seems to be down.")
      util.warn("Disabling classification of cell cycle phase.")
      env$preferences$preprocessing$cellcycle.correction <- FALSE
      return(env)
    }

    if( length(grep("ENSMUSG",row.names(env$seuratObject))>0) )
    {
      # convert human marker names to mouse gene ids
      convertHumanGeneList <- function(x){
        human = useMart(biomart=env$preferences$database.biomart, dataset = "hsapiens_gene_ensembl", host=env$preferences$database.host)
        mouse = useMart(biomart=env$preferences$database.biomart, dataset = "mmusculus_gene_ensembl", host=env$preferences$database.host)
        genesV2 = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = x , mart = human, attributesL = c("ensembl_gene_id"), martL = mouse, uniqueRows=T)
        humanx <- unique(genesV2[, 2])
        return(humanx)
      }
      marker <- list()
      marker$S <- convertHumanGeneList(Seurat::cc.genes.updated.2019$s.genes)
      marker$G2M <- convertHumanGeneList(Seurat::cc.genes.updated.2019$g2m.genes)
    } else
      if( length(grep("ENSG",row.names(env$seuratObject))>0) )
      {
        # convert marker names to gene ids
        convertGeneList <- function(x){
          human = useMart(biomart=env$preferences$database.biomart, dataset = "hsapiens_gene_ensembl", host=env$preferences$database.host)
          genesV2 = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = x , mart = human, attributesL = c("ensembl_gene_id"), martL = human, uniqueRows=T)
          gene_ids <- unique(genesV2[, 2])
          return(gene_ids)
        }
        marker <- list()
        marker$S <- convertGeneList(Seurat::cc.genes.updated.2019$s.genes)
        marker$G2M <- convertGeneList(Seurat::cc.genes.updated.2019$g2m.genes)
      } else
      {
        util.warn("Cell cycle markers only available for human and mouse organisms")
        env$preferences$preprocessing$cellcycle.correction <- FALSE
        return(env)
      }
  }

  env$seuratObject <- CellCycleScoring(object = env$seuratObject, g2m.features = marker$G2M, s.features = marker$S)

  if( any(is.na(env$seuratObject$S.Score)) || any(is.na(env$seuratObject$G2M.Score)))
  {
    util.warn("Cell cycle classificataion failed. Possibly too few features in data set.")
    env$preferences$preprocessing$cellcycle.correction <- FALSE
    return(env)
  }

  if (env$preferences$preprocessing$cellcycle.correction)
  {
    util.info("Correction for cell cycle phase")
    env$seuratObject <- ScaleData(env$seuratObject, vars.to.regress = c("S.Score", "G2M.Score"), features = rownames(env$seuratObject))
  }
  return(env)
}
hloefflerwirth/scrat documentation built on Sept. 2, 2022, 4:09 a.m.