R/species_mixing.R

Defines functions read_ms_dge_summary read_ms_dge_files txs_ms_dge_summary txs_ms_dge

Documented in read_ms_dge_files read_ms_dge_summary txs_ms_dge txs_ms_dge_summary

#' Read mixed-species DGE summary files
#'
#' Read summary files created DigitalExpression from Drop-seq tools for a
#' species mixing experiment. Creates data.frame containing merged summary for
#' all cell barcodes present in both summary files. For instructions on how to
#' generate DGE files for species-mixing experiments, see the Drop-seq
#' alignment cookbook provided by the
#' [McCarroll lab](http://mccarrolllab.com/dropseq/).
#'
#' @param summary_files Character vector specifying the paths to
#' DigitalExpression summary files for each species. Must be of length two as
#' only species mixing experiments with two species are supported.
#'
#' @param species Character vector of length two providing the species names
#' that should be used for creating output.
#'
#' @param stringsAsFactors Logical: should character vectors be converted to
#' factors? Typically only applies to cell barcodes.
#'
#' @return A data.frame containing containing summary data on all cell barcodes
#' found in summary files of both species.
#'
#' @examples \dontrun{
#' library(dropseqr)
#'
#' # read two summary files and merge into one data.frame
#' infiles <- c("./human_summary.txt", "/mouse_summary.txt")
#' x <- read_ms_dge_summary(infiles, species = c("human", "mouse"))
#' }
#'
#' @export
read_ms_dge_summary <- function(summary_files,
                                species = c("SPECIES1", "SPECIES2"),
                                stringsAsFactors = default.stringsAsFactors()){

  # make sure that input has correct format
  if (length(summary_files) != 2){

    stop("Only species mixing experiments with 2 species are supported.",
        " summary_files needs to be of length 2")

  }

  if (length(species) != 2){

    stop("Only species mixing experiments with 2 species are supported.",
         " speciess needs to be of length 2")

  }

  # read summary files
  smry <- lapply(summary_files, FUN = utils::read.table, header = TRUE,
                 stringsAsFactors = stringsAsFactors)

  # merge summary files from both species
  ncells <- length(intersect(smry[[1]]$CELL_BARCODE, smry[[2]]$CELL_BARCODE))
  message("Merging DigitalExpression summary data on ",
          ncells, " shared cell barcodes")

  suffixes <- paste0("_", toupper(species))
  merge(x = smry[[1]], y = smry[[2]], by = "CELL_BARCODE", suffixes = suffixes)

}

#' Read mixed-species DGE files
#'
#' Read DGE files created DigitalExpression from Drop-seq tools for a species
#' mixing experiment. Creates data.frame containing merged DGE data for all
#' cell barcodes present in both DGE files. For instructions on how to generate
#' DGE files for species-mixing experiments, see the Drop-seq alignment
#' cookbook provided by the
#' [McCarroll lab](http://mccarrolllab.com/dropseq/).
#'
#' @param dge_files Character vector specifying the paths to
#' DigitalExpression output files for each species. Must be of length two as
#' only species mixing experiments with two species are supported.
#'
#' @param species Character vector of length two providing the species names
#' that should be used for creating output.
#'
#' @param stringsAsFactors Logical: should character vectors be converted to
#' factors? Typically only applies to gene names.
#'
#' @return A data.frame containing DGE data on all cell barcodes found in DGE
#' files of both species.
#'
#' @examples \dontrun{
#' library(dropseqr)
#'
#' # read two dge files and merge into one data.frame
#' infiles <- c("./human_dge.txt", "/mouse_dge.txt")
#' x <- read_ms_dge_files(infiles, species = c("human", "mouse"))
#' }
#'
#' @export
read_ms_dge_files <- function(dge_files,
                              species = c("SPECIES1", "SPECIES2"),
                              stringsAsFactors = default.stringsAsFactors()){

  # make sure that input has correct format
  if (length(dge_files) != 2){

    stop("Only species mixing experiments with 2 species are supported.",
         " dge_files needs to be of length 2")

  }

  if (length(species) != 2){

    stop("Only species mixing experiments with 2 species are supported.",
         " speciess needs to be of length 2")

  }

  # read dge files
  dge <- lapply(dge_files, FUN = utils::read.table, header = TRUE,
                stringsAsFactors = stringsAsFactors)

  # find "GENE" column
  gene_col <- sapply(dge, FUN = function(x){

    which(toupper(colnames(x)) == "GENE")

  })

  # find shared cell barcodes
  cell_barcodes <- intersect(colnames(dge[[1]][, -gene_col[1]]),
                             colnames(dge[[2]][, -gene_col[2]]))

  # retain shared cell barcodes and add SPECIES column
  dge[[1]] <- cbind(SPECIES = species[1],
                    GENE = dge[[1]][, gene_col[1]],
                    dge[[1]][, cell_barcodes])

  dge[[2]] <- cbind(SPECIES = species[2],
                    GENE = dge[[2]][, gene_col[2]],
                    dge[[2]][, cell_barcodes])

  # merge into one data.frame
  message("Merging DGE data on ", length(cell_barcodes),
          " shared cell barcodes")

  rbind(dge[[1]], dge[[2]])

}

#' Species-specific transcripts
#'
#' Calculate number of species-specific transcripts per cell from DGE summary
#' files of a species mixing experiment.
#'
#' @param dge_summary data.frame containing the merged summary information from
#' summary files created by DigitalExpression from Drop-seq tools. Needs to
#' have at least columns named CELL_BARCODE, NUM_TRANSCRIPTS_SPECIES1,
#' NUM_TRANSCRIPTS_SPECIES2 (case insensitive, SPECIES* sould be actual species
#' name). Merging of summary files can be done by using
#' \link[dropseqr]{read_ms_dge_summary}. For instructions how to extract dge
#' data from species-mixing experiments, see the Drop-seq alignment cookbook
#' provided by the [McCarroll lab](http://mccarrolllab.com/dropseq/).
#'
#' @param min_txs Cells with less than min_txs detected transcripts are not
#' assigned to a species (NA).
#'
#' @param min_frac Minimal fraction of transcripts per cell that need to come
#' from one species so that the cell to be assigned to that species.
#' E.g. min_frac = 0.9 means that if at least 90% of a cells transcripts were
#' mapped to the human genome the cell will be labelled as human. If a cell has
#' less than min_frac transcripts mapping to one species, the cell will be
#' labeled as mixed.
#'
#' @return A data.frame containing number and fraction of transcripts per
#' species, and inferred species for each cell.
#'
#' @examples
#' library(dropseqr)
#' library(ggplot2)
#'
#' # load example DigitalExpression summary information
#' data(ms_dge_summary)
#'
#' # summarize number of species-specific transcripts cell and assign cells to
#' # species. at least a total of 6000 transcripts are required per cell to
#' # assign the cell to a species
#' txs <- txs_ms_dge_summary(ms_dge_summary, min_txs = 6000)
#'
#' # visualize the results using ggplot2
#' ggplot(txs, aes(num_transcripts_human, num_transcripts_mouse)) +
#'   geom_point(aes(colour = species), alpha = 0.5) +
#'   xlab("Human transcripts") +
#'   ylab("Mouse transcripts") +
#'   scale_colour_manual(values = c("tomato", "blueviolet", "dodgerblue"),
#'                       na.value = "gray50",
#'                       breaks = c("human", "mouse", "mixed"))
#'
#' @export
txs_ms_dge_summary <- function(dge_summary, min_txs = 1000, min_frac = 0.9){

  # transform colnames to lower case
  colnames(dge_summary) <- tolower(colnames(dge_summary))

  # get columns with number of transcripts
  tx_cols <- grep(colnames(dge_summary), pattern = "num_transcripts",
                  value = TRUE)

  # infer species names from txs_cols
  species <- sub(tx_cols, pattern = "num_transcripts",
                 replacement = "")
  species <- gsub(species, pattern = "^[_*|\\W*]|[_*|\\W*]$",
                  replacement = "", perl = TRUE)

  # extract cell barcode and number of transcript columns
  txs <- dge_summary[, c("cell_barcode", tx_cols)]

  # calculate fraction of transcripts per species and add as new columns to txs
  col_names <- sub(colnames(txs)[2:3], pattern = "num", replacement = "frac")
  txs[, col_names[1]] <- txs[, 2] / rowSums(txs[, 2:3])
  txs[, col_names[2]] <- 1 - txs[, 4]

  # define species of each cell based on min_frac
  txs$species <- ((txs[, 4] >= min_frac) * 1 + (txs[, 5] >= min_frac) * 2) + 1

  # replace by species name (or "mixed", if species == 1)
  txs$species <- c("mixed", species)[txs$species]

  # set species to NA for cells with less than min_txs transcripts
  txs$species[rowSums(txs[, 2:3]) < min_txs] <- NA

  # sort according to total number of transcript per cell
  txs[order(rowSums(txs[, 2:3]), decreasing = TRUE), ]

}

#' Species-specific transcripts
#'
#' Calculate number of species-specific transcripts per cell from DGE data from
#' a species mixing experiment.
#'
#' @param dge data.frame containing digital gene expression data (DGE) for all
#' detected genes (rows) across cells (columns) from both species. Must contain
#' a column named "SPECIES" (case insensitive) providing a species id for each
#' gene. Any included gene id columns (not needed) must be named "GENE"
#' (case insensitive). Such a data.frame can be created with
#' \link[dropseqr]{read_ms_dge_files}. For instructions how to extract dge data
#' from species-mixing experiments, see the Drop-seq alignment cookbook
#' provided by the [McCarroll lab](http://mccarrolllab.com/dropseq/).
#'
#' @param min_txs Cells with less than min_txs detected transcripts are not
#' assigned to a species (NA).
#'
#' @param min_frac Minimal fraction of transcripts per cell that need to come
#' from one species so that the cell to be assigned to that species.
#' E.g. min_frac = 0.9 means that if at least 90% of a cells transcripts were
#' mapped to the human genome the cell will be labelled as human. If a cell has
#' less than min_frac transcripts mapping to one species, the cell will be
#' labeled as mixed.
#'
#' @return A data.frame containing number and fraction of transcripts per
#' species, and inferred species for each cell.
#'
#' @examples
#' library(dropseqr)
#' library(ggplot2)
#'
#' # load example DGE data
#' data(ms_dge_data)
#'
#' # summarize number of species-specific transcripts cell and assign cells to
#' # species. at least a total of 6000 transcripts are required per cell to
#' # assign the cell to a species
#' txs <- txs_ms_dge(ms_dge_data, min_txs = 6000)
#'
#' # visualize the results using ggplot2
#' ggplot(txs, aes(num_transcripts_human, num_transcripts_mouse)) +
#'   geom_point(aes(colour = species), alpha = 0.5) +
#'   xlab("Human transcripts") +
#'   ylab("Mouse transcripts") +
#'   scale_colour_manual(values = c("tomato", "blueviolet", "dodgerblue"),
#'                       na.value = "gray50",
#'                       breaks = c("human", "mouse", "mixed"))
#'
#' @export
txs_ms_dge <- function(dge, min_txs = 1000, min_frac = 0.9){

  # get column containg species and gene information
  species_col <- grep(colnames(dge), pattern = "species", ignore.case = TRUE)
  gene_col <- grep(colnames(dge), pattern = "gene", ignore.case = TRUE)

  # get species names
  species <- levels(as.factor(dge[, species_col]))

  # abort if not 2 species are found
  if (length(species) != 2){

    stop("Only species mixing with 2 species supported!", call. = FALSE)

  }

  # split dge by species
  dge_split <- split(dge[, -c(gene_col, species_col)], f = dge[, species_col])

  # get transcripts per species for each cell
  txs <- as.data.frame(sapply(X = dge_split, FUN = colSums))
  colnames(txs) <- paste0("num_transcripts_", colnames(txs))

  # calculate fraction of transcripts per species and add as new columns to txs
  col_names <- sub(colnames(txs)[1:2], pattern = "num", replacement = "frac")
  txs[, col_names[1]] <- txs[, 1] / rowSums(txs[, 1:2])
  txs[, col_names[2]] <- 1 - txs[, 3]

  # define species of each cell based on min_frac
  txs$species <- ((txs[, 3] >= min_frac) * 1 + (txs[, 4] >= min_frac) * 2) + 1

  # replace by species name (or "mixed", if species == 1)
  txs$species <- c("mixed", species)[txs$species]

  # set species to NA for cells with less than min_txs transcripts
  txs$species[rowSums(txs[, 1:2]) < min_txs] <- NA

  # convert rownames into first column
  txs <- cbind(cell_barcode = rownames(txs), txs, stringsAsFactors = FALSE,
        row.names = NULL)

  # sort according to total number of transcript per cell
  txs[order(rowSums(txs[, 2:3]), decreasing = TRUE), ]

}
argschwind/dropseqr documentation built on May 23, 2019, 4:24 p.m.