R/r_functions.R

Defines functions get.10x.data corner make.seurat.object generate.frac.intron.meta frac.intron.plot filter.nuc.droplets make.knee.plot filter.expressed.genes export.csvs MergeSeuratMulti

Documented in corner export.csvs filter.expressed.genes filter.nuc.droplets make.knee.plot make.seurat.object

# Reading 10x cellranger data using the output directory
library(tidyverse)

get.10x.data <- function(path, mtx_name = "matrix.mtx", genes_name = "genes.tsv", barcodes_name = "barcodes.tsv"){

  barcodes <- read.table(file.path(path, barcodes_name), comment.char = "%")
  genes <- read.table(file.path(path, genes_name), comment.char = "%", sep = "\t")
  mtx <- read.table(file.path(path, mtx_name), comment.char = "%", sep = " ")[-1,]

  A <- Matrix::sparseMatrix(i = mtx[,1], j = mtx[,2], x = mtx[,3])

  row.names(A) <- genes[1:range(mtx[,1])[2],1]
  colnames(A) <- barcodes[1:range(mtx[,2])[2],1]

  return(A)
}

# Print the left top corner of a given dataframe or matrix
#' Corner
#'
#' Getting the top left corner of a dataframe, tibble or matrix.
#'
#' @param df Dataframe, matrix or tibble to be shown
corner <- function(df) df[1:min(c(5,nrow(df))),
                          1:min(c(5,ncol(df)))]


#' Generates a seurat object of 10x raw data
#'
#' Using the 10x output path with the genes, barcodes and matrix files.
#'
#' @param path Path to the 10x output
#' @param cell.name.addition Prefix for each cell barcode
#' @param min.genes Minimal number of genes which need to be detected per barcode to keep the barcode
#' @return Seurat object
make.seurat.object <- function(path,
                               cell.name.addition = NULL, min.genes = 100, project = "SeuratProject"){
  A <- get.10x.data(path)

  gene.annot <- read.table(file.path(path, "genes.tsv"), comment.char = "%", sep = "\t")
  colnames(gene.annot) <- c("GeneID", "GeneName")

  A <- Seurat::CreateSeuratObject(raw.data = A, min.genes = min.genes, project = project)

  A@misc <- list(geneAnnotation = gene.annot)

  return(A)
}


# Produces a meta data table for each cell
# Can be used for the novel nuclei droplet identification
generate.frac.intron.meta <- function(exon, fullTranscript){
  if(!is.matrix(fullTranscript) & !is.matrix(exon)){
    ft <- fullTranscript@raw.data
    ex <- exon@raw.data
  }

  keep.cells <- intersect(colnames(ft), colnames(ex))

  cat("Number of cells in both sets:",length(keep.cells), "\n")
  ex <- ex[,keep.cells]
  ft <- ft[,keep.cells]

  ex.sum <- as.data.frame(Matrix::colSums(ex))
  colnames(ex.sum) = "exonic"

  ft.sum <- as.data.frame(Matrix::colSums(ft))
  colnames(ft.sum) <- "fullTranscript"

  cells.meta <- merge(ft.sum, ex.sum, by = 0)
  colnames(cells.meta)[1] <- "CellID"

  cells.meta <- dplyr::filter(cells.meta, fullTranscript > 0)
  cells.meta <- cells.meta %>% arrange(desc(fullTranscript)) %>%
    mutate(CellRank = 1:nrow(cells.meta)) %>%
    mutate(fracIntronic = 1 - (exonic / fullTranscript))

  return(cells.meta)
}

# Plot the novel nuclei droplet identification plot
frac.intron.plot <- function(exon=NULL, fulltranscript=NULL, meta.data = NULL,
                             frac.cut = NULL, rank.cut = NULL,
                             frac.cut.max = NULL, rank.cut.min = NULL){
  if(is.null(meta.data) & !is.null(exon) & !is.null(fulltranscript)){
    meta.data <- generate.frac.intron.meta(exon, fulltranscript)
  }else if(!is.null(meta.data) & !is.null(exon) & !is.null(fulltranscript)){
    stop("Please provide either a meta.data dataframe, generated by generate.frac.intron.meta,\n or provide exon and intron matrices / seurat objects.")
  }

  p <- ggplot(meta.data, aes(x = CellRank, y = fracIntronic)) +
    geom_point(size = 0.1) +
    theme_gray() +
    ylab("Fraction intronic UMI") +
    xlab("Full transcript UMI cell rank")

  if(!is.null(frac.cut)){
    p <- p + geom_hline(yintercept = frac.cut)
  }

  if(!is.null(frac.cut.max)){
    p <- p + geom_hline(yintercept = frac.cut.max, color = "red")
  }

  if(!is.null(rank.cut)){
    p <- p + geom_vline(xintercept = rank.cut)
  }

  if(!is.null(rank.cut.min)){
    p <- p + geom_vline(xintercept = rank.cut.min, color = "red")
  }

  return(p)

}

#' Filtering droplets
#'
#' Filters droplets based on their properties, calculated using generate.frac.intron.meta
#'
#' @param sobj.exon Seurat object, pretreated with generate.frac.intron.meta
filter.nuc.droplets <- function(sobj.exon, min.frac.intronic = 0.5, max.rank = NULL,
                                max.frac.intronic = NULL, min.rank = NULL){
  meta.cell <- sobj.exon@misc$cells.meta

  meta.cell <- meta.cell %>% filter(fracIntronic >= min.frac.intronic)

  if(!is.null(max.rank)){
    meta.cell <- meta.cell %>% filter(CellRank <= max.rank)
  }

  if(!is.null(max.frac.intronic)){
    meta.cell <- meta.cell %>% filter(fracIntronic <= max.frac.intronic)
  }

  if(!is.null(min.rank)){
    meta.cell <- meta.cell %>% filter(CellRank >= min.rank)
  }

  meta.cell <- filter(meta.cell, CellID %in% rownames(sobj.exon@meta.data))

  return(SubsetData(sobj.exon, cells.use = meta.cell$CellID))
}

#' Classical kneepoint plot
#'
#' Generates a kneepoint plot of the seurat object.
make.knee.plot <- function(sobj, max_rank = 2e4){
  meta <- sobj@meta.data

  meta <- meta %>% arrange(desc(nUMI)) %>%
    mutate(cumSum = cumsum(nUMI)) %>%
    mutate(UMIrank = 1:nrow(meta)) %>%
    filter(UMIrank < max_rank)

  ggplot(meta, aes(x = UMIrank, y = cumSum)) +
    geom_point() +
    theme_gray()
}

#' Filters out lowly expressed genes
#'
#' Remove genes from the dataset which are lowly expressed.
#'
#' @param sobj Seurat object
#' @param min_cells_expressed In how many cells needs a gene being expressed to be accepted
filter.expressed.genes <- function(sobj, min_cells_expressed = 25){
  exp <- sobj@data

  exp <- exp[Matrix::rowSums(exp > 0) >= min_cells_expressed, ]

  sobj@data <- exp

  return(sobj)
}

#' Exports a seurat object as CSV files
#'
#' This function creates a CSV file for:
#'
#'   - the data field
#'   - the raw.data field
#'   - the UMAP embedding
#'
#' @param obj Seurat object
#' @param folderpath Folder to export to. If not existent, it will be created.
#' @param sample.name Prefix for each exported CSV
#' @param alter.gene.names Dataframe to convert the names of the Genes.
#' @param alter.gene.names.from Column name of the alter.gene.names dataframe which corresponds to the row.names in the data and raw.data slot of the seurat object
#' @param alter.gene.names.to Column name which contains the new gene names in the alter.gene.names dataframe
#'
export.csvs <- function(obj, folderpath, sample.name = "seurat",
                        alter.gene.names.df = NULL,
                        alter.gene.names.from = "Gene.stable.ID",
                        alter.gene.names.to = "Gene.name",
                        min.cells = 10){
  exprs <- obj@data
  umi <- obj@raw.data
  emb <- obj@dr$umap@cell.embeddings

  dir.create(folderpath, recursive = T)

  if(!is.null(alter.gene.names.df)){
    cat("Renaming the genes\n")
    exprs <- exprs[row.names(exprs) %in% alter.gene.names.df[[alter.gene.names.from]], ]
    row.names(exprs) <- alter.gene.names.df[[alter.gene.names.to]][match(row.names(exprs), alter.gene.names.df[[alter.gene.names.from]])]

    umi <- umi[row.names(umi) %in% alter.gene.names.df[[alter.gene.names.from]], ]
    row.names(umi) <- alter.gene.names.df[[alter.gene.names.to]][match(row.names(umi), alter.gene.names.df[[alter.gene.names.from]])]
  }

  exprs <- exprs[,match(colnames(exprs), row.names(emb))[!is.na(match(colnames(exprs), row.names(emb)))]]
  umi <- umi[,match(colnames(umi), row.names(emb))[!is.na(match(colnames(umi), row.names(emb)))]]

  exprs.gz <- gzfile(file.path(folderpath, paste0(sample.name,"_exprs.csv.gz")), "w" )
  umi.gz <- gzfile(file.path(folderpath, paste0(sample.name,"_umi.csv.gz")), "w")

  cat("Writing the embedding\n")
  write.table(as.matrix(emb), file = file.path(folderpath, paste0(sample.name, "_embedding.csv")),
              quote = F, sep = ",", col.names = F)

  cat("Writing the expression matrix\n")
  write.table(as.matrix(exprs), file = exprs.gz, quote = F, sep = ",")
  close(exprs.gz)

  cat("Writing the UMI counts\n")
  write.table(as.matrix(umi[Matrix::rowSums(umi > 0) > min.cells, ]), file = umi.gz, quote = F, sep = ",")
  close(umi.gz)


}

MergeSeuratMulti <- function(sobj.list){
  out.merged <- NULL
  for(n in names(sobj.list)){
    tmp = sobj.list[[n]]

    cat("Processing", n, "\n")
    if(is.null(out.merged)){
      out.merged <- tmp
    }else{
      out.merged <- MergeSeurat(out.merged, tmp, do.normalize = F, do.scale = F, do.center = F)
    }
  }

  return(out.merged)
}
klprint/customSC documentation built on May 28, 2019, 5:54 p.m.