R/markers.R

Defines functions read_markers write_cell_type_header export_markers_for_cell_type export_markers_by_cell_type export_markers_by_group write_header export_markers remove_duplicated_genes compute_markers_blockwise tidy_stat my_fdr my_col_sums average_expression compute_markers_ compute_markers

Documented in compute_markers export_markers export_markers_by_cell_type export_markers_by_group export_markers_for_cell_type read_markers remove_duplicated_genes

#' Compute differential expression statistics for given dataset and cell types,
#' stratified by groups.
#'
#' @param expression Expression matrix (may be sparse).
#' @param cell_type_labels Character vector providing
#' cell type names for each sample in the expression matrix.
#' @param group_labels Character vector providing hierarchical grouping for
#' cell types (one group name for each sample in the expression matrix).
#' @param two_tailed Boolean. If FALSE, only upregulated genes are considered
#' significant (ROC test).
#' @param tie_correction Boolean. For the ROC test, should tie correction be
#' applied? Note that skipping tie correction is slightly conservative.
#' @param genes_are_rows Boolean. In the expression matrix, were genes provided
#' as rows (as in SingleCellExperiment or Seurat objects)?
#'
#' @return A tibble containing basic differential expression statistics for
#' all cell types and genes. All statistics are 1-vs-rest within groups. NOTE:
#' genes with duplicate names will be removed.
#'
#' @export
#' @importFrom rlang .data
#' @importFrom dplyr %>%
compute_markers = function(expression, cell_type_labels,
                           group_labels = rep("all", length(cell_type_labels)),
                           two_tailed = FALSE, tie_correction = FALSE,
                           genes_are_rows=TRUE) {
    cell_type_labels = as.character(cell_type_labels)
    group_labels = as.character(group_labels)
    if (any(cell_type_labels == "")) {
        stop("All cells must have a label, please remove cells without labels ",
             "or consider renaming (e.g., unknown).")
    }
    
    result = lapply(purrr::set_names(unique(group_labels)), function(group) {
        keep_cell = group_labels == group
        if (genes_are_rows) {
            expr = expression[, keep_cell]
        } else {
            expr = expression[keep_cell,]
        }
        label_matrix = design_matrix(cell_type_labels[keep_cell])
        if (ncol(label_matrix) > 1) {
            compute_markers_(expr, label_matrix, two_tailed,
                             tie_correction, genes_are_rows)
        } else {
            warning(paste0("Skipping group ", group,
                    ": only 1 cluster (", colnames(label_matrix) ,")."))
            return(NULL)
        }
    })
    result = result[!sapply(result, is.null)]
    result = dplyr::bind_rows(result, .id = "group")
    return(result)
}

# Actual implementation of marker computation (for a given group)
compute_markers_ = function(expression, cell_type_labels, two_tailed = FALSE,
                            tie_correction = FALSE, genes_are_rows=TRUE) {
    if (is.vector(cell_type_labels)) {
        cell_type_labels = design_matrix(as.character(cell_type_labels))
    }
    if (genes_are_rows) {
        expression = Matrix::t(expression)
    }
    
    aurocs = compute_aurocs(expression, cell_type_labels,
                            compute_tie_correction = tie_correction)
    p_values = auroc_p_value(
        aurocs$aurocs, cell_type_labels, two_tailed,
        tie_correction = aurocs$tie_correction, log.p = TRUE
    )
    log_fdr = matrix(my_fdr(p_values, log.p = TRUE), nrow = nrow(p_values),
                     dimnames = dimnames(p_values))
    
    population_size = colSums(cell_type_labels)
    population_fraction = population_size / sum(population_size)
    
    average = average_expression(expression, cell_type_labels)
    fold_change = (average$positives+1) / (average$negatives+1)
    uncentered_var = average_expression(expression**2, cell_type_labels)
    standard_error = sqrt((uncentered_var$positives - average$positives**2) / population_size)
          
    binary_expression = expression > 0
    m_binary = average_expression(binary_expression, cell_type_labels)
    fc_binary = (m_binary$positives + 0.001) / (m_binary$negatives + 0.001)
    n_expressing_cells = as.matrix(Matrix::crossprod(cell_type_labels, binary_expression))
    binary_precision = t(t(n_expressing_cells) / c(my_col_sums(binary_expression)))
    binary_recall = n_expressing_cells / population_size

    result = tidy_stat(fold_change, "fold_change")
    # WARNING: the indexing technique is efficient, but only works if
    # gene names are unique!!!
    full_indices = as.matrix(result[, c("cell_type", "gene")])
    result = result %>% dplyr::mutate(
        auroc = aurocs$aurocs[full_indices],
        log_fdr = log_fdr[full_indices],
        population_size = population_size[result$cell_type],
        population_fraction = population_fraction[result$cell_type],
        average_expression = average$positives[full_indices],
        se_expression = standard_error[full_indices],
        detection_rate = m_binary$positives[full_indices],
        fold_change_detection = fc_binary[full_indices],
        precision = binary_precision[full_indices],
        recall = binary_recall[full_indices]
    ) %>%
        dplyr::arrange(.data$cell_type, dplyr::desc(.data$auroc))
    
    # remove duplicate genes (see WARNING above)
    # NOTE: more efficient to do it now than in the original matrix
    genes = colnames(expression)
    duplicated_genes = unique(genes[duplicated(genes)])
    result = dplyr::filter(result, !(.data$gene %in% duplicated_genes))

    return(result)
}

# Compute average expression for each cell type + average background expression
average_expression = function(expression, design_matrix) {
    scaled_positives = scale(design_matrix, center=FALSE, scale=colSums(design_matrix))
    scaled_negatives = scale(1-design_matrix, center=FALSE, scale=colSums(1-design_matrix))
    return(list(
        positives = as.matrix(Matrix::crossprod(scaled_positives, expression)),
        negatives = as.matrix(Matrix::crossprod(scaled_negatives, expression))
    ))
}

# More efficient version of colSums (at least for sparse matrices)
my_col_sums = function(M) {
    as.matrix(rep(1, nrow(M)) %*% M)
}

# Adpated from p.adjust (with the possibility to provide log p-values)
my_fdr = function(p_values, log.p = FALSE) {
    i = length(p_values):1L
    o = order(p_values, decreasing = TRUE)
    n = length(p_values)
    result = rep(0, n)
    if (log.p) {
        result[o] = pmin(0, cummin(log(n) - log(i) + p_values[o]))
    } else {
        result[o] = pmin(1, cummin(n/i * p_values[o]))
    }
    return(result)
}

# Convert stat matrix to tidy format
tidy_stat = function(stat, stat_name, row_name="cell_type", col_name="gene") {
    stat %>%
        tibble::as_tibble(rownames=row_name, .name_repair = "minimal") %>%
        tidyr::pivot_longer(cols=-{{row_name}}, names_to=col_name, values_to=stat_name)
}

# Compute markers blockwise (dense blocks of the original sparse matrix)
compute_markers_blockwise = function(expression, cell_type_labels,
                                     group_labels = rep("all", length(cell_type_labels)),
                                     two_tailed = FALSE, tie_correction = FALSE,
                                     n_genes = NULL, genes_are_rows = TRUE) {
    if (genes_are_rows) {
        expression = Matrix::t(expression)
    }
    if (is.null(n_genes)) {
        n_genes = floor(2e9 / nrow(expression))
    }
    
    blocks = unique(c(seq(1, ncol(expression), n_genes), ncol(expression)))
    result = lapply(seq_len(length(blocks) - 1), function(i) {
        compute_markers(
            as.matrix(expression[, blocks[i]:blocks[i+1]]),
            cell_type_labels, group_labels, two_tailed,
            tie_correction, genes_are_rows=FALSE
        )
    })
    result = dplyr::bind_rows(result) %>%
        dplyr::arrange(.data$cell_type, .data$gene)

    # remove duplicate genes (see WARNING in compute_markers_)
    # NOTE: more efficient to do it now than in the original matrix
    genes = colnames(expression)
    duplicated_genes = unique(genes[duplicated(genes)])
    result = dplyr::filter(result, !(.data$gene %in% duplicated_genes))
    return(result)
}

#' Remove duplicate gene from a marker table (individual dataset)
#'
#' @param marker_stats Tibble obtained by calling `compute_marker_stats`.
#' 
#' @return Tibble with marker stats, all duplicated genes being removed.
#'
#' @export
remove_duplicated_genes = function(marker_stats) {
    duplicated_genes = marker_stats %>%
        dplyr::group_by(.data$group, .data$cell_type, .data$gene) %>%
        dplyr::tally() %>%
        dplyr::filter(.data$n > 1) %>%
        dplyr::pull(.data$gene) %>%
        unique()
    return(dplyr::filter(marker_stats, !(.data$gene %in% duplicated_genes)))
}


#' Export marker statistics to file.
#'
#' This function will export marker statistics in a simple format, including a short
#' summary header with cell types analyzed and date of export.
#'
#' @param marker_stats Tibble obtained by calling `compute_marker_stats`.
#' @param filename File name.
#' @param gzip Boolean. Should output file be zipped to spare memory?
#'
#' @export
export_markers = function(marker_stats, filename, gzip=TRUE) {
    write_header(unique(marker_stats$cell_type), filename)
    data.table::fwrite(marker_stats, filename, append=TRUE,
                       row.names=FALSE, col.names=TRUE)
    if (gzip) {
        R.utils::gzip(filename, overwrite=TRUE)
    }
}

# Generic header (time stamp, cell types)
write_header = function(cell_types, filename) {
    lines = paste0(
        "# Differential expression statistics generated on ", Sys.Date(), ". ",
        "Cell types: ", paste(cell_types, collapse = ", "), "."
    )
    writeLines(lines, filename)
}

#' Export markers statistics to file, one file per group.
#'
#' This function will export marker statistics in a simple format, including a short
#' summary header with cell types analyzed and date of export.
#'
#' @param de_stats Tibble obtained by calling `compute_markers`.
#' @param output_prefix File prefix, typically a path or a path + prefix.
#' Each group will be exported to a standardized file name, consisting of
#' output_prefix, a standardized version of the group name 
#' using `make.names`) and a .csv extension.
#' @param gzip Boolean. Should output file be zipped to spare memory?
#'
#' @export
export_markers_by_group = function(de_stats, output_prefix, gzip=TRUE) {
    for (group in unique(de_stats$group)) {
        filename = paste0(output_prefix, make.names(group), ".csv")
        export_markers(dplyr::filter(de_stats, group == group),
                       filename, gzip)
    }
}

#' Export marker statistics to file, one file per cell type.
#'
#' This function will export marker statistics in a simple format, including a short
#' summary header with cell types analyzed and date of export.
#'
#' @param de_stats Tibble obtained by calling `compute_markers`.
#' @param output_prefix File prefix, typically a path or a path + prefix.
#' Each cell type will be exported to a standardized file name, consisting of
#' output_prefix, a standardized version of the cell type name 
#' using `make.names`) and a .csv extension.
#' @param gzip Boolean. Should output file be zipped to spare memory?
#'
#' @export
export_markers_by_cell_type = function(de_stats, output_prefix, gzip=TRUE) {
    for (ct in unique(de_stats$cell_type)) {
        filename = paste0(output_prefix, make.names(ct), ".csv")
        export_markers_for_cell_type(de_stats, ct, filename, gzip)
    }
}

#' Export marker statistics of given cell type to file.
#'
#' This function will export marker statistics in a simple format, including a short
#' summary header with cell types analyzed and date of export.
#'
#' @param de_stats Tibble obtained by calling `compute_markers`.
#' @param cell_type Name of cell type of interest (contained in de_stats).
#' @param filename File name.
#' @param gzip Boolean. Should output file be zipped to spare memory?
#'
#' @export
export_markers_for_cell_type = function(de_stats, cell_type, filename, gzip=TRUE) {
    all_cell_types = unique(de_stats$cell_type)
    outgroups = all_cell_types[all_cell_types != cell_type]
    write_cell_type_header(cell_type, outgroups, filename)
    data.table::fwrite(dplyr::filter(de_stats, cell_type == cell_type),
                       filename, append=TRUE, row.names=FALSE, col.names=TRUE)
    if (gzip) {
        R.utils::gzip(filename, overwrite=TRUE)
    }
}
 
# Cell type header (time stamp, reference cell type, outgroups)
write_cell_type_header = function(reference, outgroups, filename) {
    lines = paste0(
        "# Differential expression statistics generated on ", Sys.Date(), ". ",
        "Reference: ", reference, ", outgroups: ", paste(outgroups, collapse = ", "), "."
    )
    writeLines(lines, filename)
}

#' Import marker statistics.
#'
#' @param filename File name.
#' @param header_size Size of header (# rows) containing meta-information such
#' as time stamp and list of cell types (for compatibility purposes only).
#'
#' @export
read_markers = function(filename, header_size=1) {
    result = data.table::fread(filename, header=TRUE, skip=header_size)
    return(tibble::as_tibble(result))
}
gillislab/MetaMarkers documentation built on April 24, 2021, 9:25 p.m.