#' Read mixed-species DGE summary files
#'
#' Read summary files created DigitalExpression from Drop-seq tools for a
#' species mixing experiment. Creates data.frame containing merged summary for
#' all cell barcodes present in both summary files. For instructions on how to
#' generate DGE files for species-mixing experiments, see the Drop-seq
#' alignment cookbook provided by the
#' [McCarroll lab](http://mccarrolllab.com/dropseq/).
#'
#' @param summary_files Character vector specifying the paths to
#' DigitalExpression summary files for each species. Must be of length two as
#' only species mixing experiments with two species are supported.
#'
#' @param species Character vector of length two providing the species names
#' that should be used for creating output.
#'
#' @param stringsAsFactors Logical: should character vectors be converted to
#' factors? Typically only applies to cell barcodes.
#'
#' @return A data.frame containing containing summary data on all cell barcodes
#' found in summary files of both species.
#'
#' @examples \dontrun{
#' library(dropseqr)
#'
#' # read two summary files and merge into one data.frame
#' infiles <- c("./human_summary.txt", "/mouse_summary.txt")
#' x <- read_ms_dge_summary(infiles, species = c("human", "mouse"))
#' }
#'
#' @export
read_ms_dge_summary <- function(summary_files,
species = c("SPECIES1", "SPECIES2"),
stringsAsFactors = default.stringsAsFactors()){
# make sure that input has correct format
if (length(summary_files) != 2){
stop("Only species mixing experiments with 2 species are supported.",
" summary_files needs to be of length 2")
}
if (length(species) != 2){
stop("Only species mixing experiments with 2 species are supported.",
" speciess needs to be of length 2")
}
# read summary files
smry <- lapply(summary_files, FUN = utils::read.table, header = TRUE,
stringsAsFactors = stringsAsFactors)
# merge summary files from both species
ncells <- length(intersect(smry[[1]]$CELL_BARCODE, smry[[2]]$CELL_BARCODE))
message("Merging DigitalExpression summary data on ",
ncells, " shared cell barcodes")
suffixes <- paste0("_", toupper(species))
merge(x = smry[[1]], y = smry[[2]], by = "CELL_BARCODE", suffixes = suffixes)
}
#' Read mixed-species DGE files
#'
#' Read DGE files created DigitalExpression from Drop-seq tools for a species
#' mixing experiment. Creates data.frame containing merged DGE data for all
#' cell barcodes present in both DGE files. For instructions on how to generate
#' DGE files for species-mixing experiments, see the Drop-seq alignment
#' cookbook provided by the
#' [McCarroll lab](http://mccarrolllab.com/dropseq/).
#'
#' @param dge_files Character vector specifying the paths to
#' DigitalExpression output files for each species. Must be of length two as
#' only species mixing experiments with two species are supported.
#'
#' @param species Character vector of length two providing the species names
#' that should be used for creating output.
#'
#' @param stringsAsFactors Logical: should character vectors be converted to
#' factors? Typically only applies to gene names.
#'
#' @return A data.frame containing DGE data on all cell barcodes found in DGE
#' files of both species.
#'
#' @examples \dontrun{
#' library(dropseqr)
#'
#' # read two dge files and merge into one data.frame
#' infiles <- c("./human_dge.txt", "/mouse_dge.txt")
#' x <- read_ms_dge_files(infiles, species = c("human", "mouse"))
#' }
#'
#' @export
read_ms_dge_files <- function(dge_files,
species = c("SPECIES1", "SPECIES2"),
stringsAsFactors = default.stringsAsFactors()){
# make sure that input has correct format
if (length(dge_files) != 2){
stop("Only species mixing experiments with 2 species are supported.",
" dge_files needs to be of length 2")
}
if (length(species) != 2){
stop("Only species mixing experiments with 2 species are supported.",
" speciess needs to be of length 2")
}
# read dge files
dge <- lapply(dge_files, FUN = utils::read.table, header = TRUE,
stringsAsFactors = stringsAsFactors)
# find "GENE" column
gene_col <- sapply(dge, FUN = function(x){
which(toupper(colnames(x)) == "GENE")
})
# find shared cell barcodes
cell_barcodes <- intersect(colnames(dge[[1]][, -gene_col[1]]),
colnames(dge[[2]][, -gene_col[2]]))
# retain shared cell barcodes and add SPECIES column
dge[[1]] <- cbind(SPECIES = species[1],
GENE = dge[[1]][, gene_col[1]],
dge[[1]][, cell_barcodes])
dge[[2]] <- cbind(SPECIES = species[2],
GENE = dge[[2]][, gene_col[2]],
dge[[2]][, cell_barcodes])
# merge into one data.frame
message("Merging DGE data on ", length(cell_barcodes),
" shared cell barcodes")
rbind(dge[[1]], dge[[2]])
}
#' Species-specific transcripts
#'
#' Calculate number of species-specific transcripts per cell from DGE summary
#' files of a species mixing experiment.
#'
#' @param dge_summary data.frame containing the merged summary information from
#' summary files created by DigitalExpression from Drop-seq tools. Needs to
#' have at least columns named CELL_BARCODE, NUM_TRANSCRIPTS_SPECIES1,
#' NUM_TRANSCRIPTS_SPECIES2 (case insensitive, SPECIES* sould be actual species
#' name). Merging of summary files can be done by using
#' \link[dropseqr]{read_ms_dge_summary}. For instructions how to extract dge
#' data from species-mixing experiments, see the Drop-seq alignment cookbook
#' provided by the [McCarroll lab](http://mccarrolllab.com/dropseq/).
#'
#' @param min_txs Cells with less than min_txs detected transcripts are not
#' assigned to a species (NA).
#'
#' @param min_frac Minimal fraction of transcripts per cell that need to come
#' from one species so that the cell to be assigned to that species.
#' E.g. min_frac = 0.9 means that if at least 90% of a cells transcripts were
#' mapped to the human genome the cell will be labelled as human. If a cell has
#' less than min_frac transcripts mapping to one species, the cell will be
#' labeled as mixed.
#'
#' @return A data.frame containing number and fraction of transcripts per
#' species, and inferred species for each cell.
#'
#' @examples
#' library(dropseqr)
#' library(ggplot2)
#'
#' # load example DigitalExpression summary information
#' data(ms_dge_summary)
#'
#' # summarize number of species-specific transcripts cell and assign cells to
#' # species. at least a total of 6000 transcripts are required per cell to
#' # assign the cell to a species
#' txs <- txs_ms_dge_summary(ms_dge_summary, min_txs = 6000)
#'
#' # visualize the results using ggplot2
#' ggplot(txs, aes(num_transcripts_human, num_transcripts_mouse)) +
#' geom_point(aes(colour = species), alpha = 0.5) +
#' xlab("Human transcripts") +
#' ylab("Mouse transcripts") +
#' scale_colour_manual(values = c("tomato", "blueviolet", "dodgerblue"),
#' na.value = "gray50",
#' breaks = c("human", "mouse", "mixed"))
#'
#' @export
txs_ms_dge_summary <- function(dge_summary, min_txs = 1000, min_frac = 0.9){
# transform colnames to lower case
colnames(dge_summary) <- tolower(colnames(dge_summary))
# get columns with number of transcripts
tx_cols <- grep(colnames(dge_summary), pattern = "num_transcripts",
value = TRUE)
# infer species names from txs_cols
species <- sub(tx_cols, pattern = "num_transcripts",
replacement = "")
species <- gsub(species, pattern = "^[_*|\\W*]|[_*|\\W*]$",
replacement = "", perl = TRUE)
# extract cell barcode and number of transcript columns
txs <- dge_summary[, c("cell_barcode", tx_cols)]
# calculate fraction of transcripts per species and add as new columns to txs
col_names <- sub(colnames(txs)[2:3], pattern = "num", replacement = "frac")
txs[, col_names[1]] <- txs[, 2] / rowSums(txs[, 2:3])
txs[, col_names[2]] <- 1 - txs[, 4]
# define species of each cell based on min_frac
txs$species <- ((txs[, 4] >= min_frac) * 1 + (txs[, 5] >= min_frac) * 2) + 1
# replace by species name (or "mixed", if species == 1)
txs$species <- c("mixed", species)[txs$species]
# set species to NA for cells with less than min_txs transcripts
txs$species[rowSums(txs[, 2:3]) < min_txs] <- NA
# sort according to total number of transcript per cell
txs[order(rowSums(txs[, 2:3]), decreasing = TRUE), ]
}
#' Species-specific transcripts
#'
#' Calculate number of species-specific transcripts per cell from DGE data from
#' a species mixing experiment.
#'
#' @param dge data.frame containing digital gene expression data (DGE) for all
#' detected genes (rows) across cells (columns) from both species. Must contain
#' a column named "SPECIES" (case insensitive) providing a species id for each
#' gene. Any included gene id columns (not needed) must be named "GENE"
#' (case insensitive). Such a data.frame can be created with
#' \link[dropseqr]{read_ms_dge_files}. For instructions how to extract dge data
#' from species-mixing experiments, see the Drop-seq alignment cookbook
#' provided by the [McCarroll lab](http://mccarrolllab.com/dropseq/).
#'
#' @param min_txs Cells with less than min_txs detected transcripts are not
#' assigned to a species (NA).
#'
#' @param min_frac Minimal fraction of transcripts per cell that need to come
#' from one species so that the cell to be assigned to that species.
#' E.g. min_frac = 0.9 means that if at least 90% of a cells transcripts were
#' mapped to the human genome the cell will be labelled as human. If a cell has
#' less than min_frac transcripts mapping to one species, the cell will be
#' labeled as mixed.
#'
#' @return A data.frame containing number and fraction of transcripts per
#' species, and inferred species for each cell.
#'
#' @examples
#' library(dropseqr)
#' library(ggplot2)
#'
#' # load example DGE data
#' data(ms_dge_data)
#'
#' # summarize number of species-specific transcripts cell and assign cells to
#' # species. at least a total of 6000 transcripts are required per cell to
#' # assign the cell to a species
#' txs <- txs_ms_dge(ms_dge_data, min_txs = 6000)
#'
#' # visualize the results using ggplot2
#' ggplot(txs, aes(num_transcripts_human, num_transcripts_mouse)) +
#' geom_point(aes(colour = species), alpha = 0.5) +
#' xlab("Human transcripts") +
#' ylab("Mouse transcripts") +
#' scale_colour_manual(values = c("tomato", "blueviolet", "dodgerblue"),
#' na.value = "gray50",
#' breaks = c("human", "mouse", "mixed"))
#'
#' @export
txs_ms_dge <- function(dge, min_txs = 1000, min_frac = 0.9){
# get column containg species and gene information
species_col <- grep(colnames(dge), pattern = "species", ignore.case = TRUE)
gene_col <- grep(colnames(dge), pattern = "gene", ignore.case = TRUE)
# get species names
species <- levels(as.factor(dge[, species_col]))
# abort if not 2 species are found
if (length(species) != 2){
stop("Only species mixing with 2 species supported!", call. = FALSE)
}
# split dge by species
dge_split <- split(dge[, -c(gene_col, species_col)], f = dge[, species_col])
# get transcripts per species for each cell
txs <- as.data.frame(sapply(X = dge_split, FUN = colSums))
colnames(txs) <- paste0("num_transcripts_", colnames(txs))
# calculate fraction of transcripts per species and add as new columns to txs
col_names <- sub(colnames(txs)[1:2], pattern = "num", replacement = "frac")
txs[, col_names[1]] <- txs[, 1] / rowSums(txs[, 1:2])
txs[, col_names[2]] <- 1 - txs[, 3]
# define species of each cell based on min_frac
txs$species <- ((txs[, 3] >= min_frac) * 1 + (txs[, 4] >= min_frac) * 2) + 1
# replace by species name (or "mixed", if species == 1)
txs$species <- c("mixed", species)[txs$species]
# set species to NA for cells with less than min_txs transcripts
txs$species[rowSums(txs[, 1:2]) < min_txs] <- NA
# convert rownames into first column
txs <- cbind(cell_barcode = rownames(txs), txs, stringsAsFactors = FALSE,
row.names = NULL)
# sort according to total number of transcript per cell
txs[order(rowSums(txs[, 2:3]), decreasing = TRUE), ]
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.