liana: LIANA: a LIgand-receptor ANalysis frAmework

Documented in liana_pipe

#' Liana Pipe which runs DE analysis and merges needed information for LR inference
#'
#' @param sce SingleCellExperiment Object
#' @param op_resource resource tibble obtained via \link{liana::select_resource}
#' @inheritParams liana_scores
#' @inheritParams scran::findMarkers
#' @param assay assay to be used ("RNA" by default)
#' @param assay.type - the type of data to be used to calculate the means
#'  (logcounts by default), available options are: "counts" and "logcounts"
#' @param verbose logical for verbosity
#' @param cell.adj cell adjacency tibble/dataframe /w weights by which we will
#' `multiply` the relevant columns. Any cell pairs with a weights of 0 will be
#' filtered out.
#' Note that if working with LIANA's default methods, we suggest weights >= 0 & =< 1.
#' This ensure that all methods' score will be meaningfully weighed without
#' changing the interpretation of their scores, thus allow one to filter SCA,
#' rank NATMI, etc.
#'
#' @inheritParams .antilog1m
#'
#' @import SingleCellExperiment SeuratObject
#' @importFrom scran findMarkers
#' @importFrom scuttle summarizeAssayByGroup
#'
#' @export
#'
#' @return Returns a tibble with information required for LR calculations downstream
liana_pipe <- function(sce,
                       op_resource,
                       test.type = "wilcox",
                       pval.type = "all",
                       assay = "RNA",
                       assay.type = "logcounts",
                       verbose = TRUE,
                       base,
                       cell.adj = NULL){

    # calculate global_mean required for SCA
    global_mean <- fast_mean(exec(assay.type, sce))

    ### this whole chunk needs to move to liana_wrap
    # Resource Format
    transmitters <- op_resource$source_genesymbol %>%
        as_tibble() %>%
        select(gene = value) %>%
        separate_rows(gene, sep="[_]") %>%
        distinct()
    receivers <- op_resource$target_genesymbol %>%
        as_tibble() %>%
        select(gene = value) %>%
        separate_rows(gene, sep="[_]") %>%
        distinct()
    entity_genes <- union(transmitters$gene,
                          receivers$gene)

    # Filter `sce` to only include ligand receptor genes
    # and exclude any cells with 0 counts of LR genes
    sce <- .prep_universe(sce,
                          entity_genes = entity_genes,
                          verbose)

    # Get Log2FC (done after non-expr. cell and gene filter from `liana_prep`)
    # also any cells with 0 counts of LR genes are removed (in`.prep_universe`)
    logfc_df <- get_log2FC(
        sce,
        assay.type = assay.type,
        base
    )


    # Scale genes across cells
    sce@assays@data[["scaledata"]] <- row_scale(exec(assay.type, sce))

    # Get Avg and  Prop. Expr Per Cluster
    mean_prop <-
        scuttle::summarizeAssayByGroup(sce,
                                       ids = colLabels(sce),
                                       assay.type = assay.type,
                                       statistics = c("mean", "prop.detected"))
    means <- mean_prop@assays@data$mean
    props <- mean_prop@assays@data$prop.detected

    # scaled (z-transformed) means
    scaled <- scuttle::summarizeAssayByGroup(sce,
                                             ids = colLabels(sce),
                                             assay.type = "scaledata",
                                             statistics = c("mean"))
    scaled <- scaled@assays@data$mean

    # calculate PEM scores
    pem_scores <- compute_pem_scores(sce = sce,
                                     assay.type = assay.type)

    # Find Markers and Format
    cluster_markers <- scran::findMarkers(sce,
                                          groups = colLabels(sce),
                                          direction = "any",
                                          full.stats = TRUE,
                                          test.type = test.type,
                                          pval.type = pval.type,
                                          assay.type = assay.type) %>%
        pluck("listData") %>%
        map2(., names(.), function(cluster, cluster_name){
            cluster %>%
                as.data.frame() %>%
                rownames_to_column("gene") %>%
                as_tibble() %>%
                select(gene, p.value, FDR, stat = summary.stats)
        })

    # Get all Possible Cluster pair combinations
    pairs <- expand_grid(source = unique(colLabels(sce)),
                         target = unique(colLabels(sce)))
    # Get DEGs to LR format
    lr_res <- pairs %>%
        pmap(function(source, target){
            source_stats <- ligrec_degformat(cluster_markers[[source]],
                                             entity = transmitters,
                                             ligand_receptor = "ligand")
            target_stats <- ligrec_degformat(cluster_markers[[target]],
                                             entity = receivers,
                                             ligand_receptor = "receptor")

            op_resource %>%
                select(ligand = source_genesymbol,
                       receptor = target_genesymbol) %>%
                left_join(source_stats, by = "ligand") %>%
                left_join(target_stats, by = "receptor") %>%
                rename_with(~gsub(x = .x,
                                  pattern = "p.value",
                                  replacement = "pval"),
                            ends_with("p.value")) %>%
                na.omit() %>%
                distinct() %>%
                mutate(source = source,
                       target = target)
        }) %>%
        bind_rows()

    # Join Expression Means
    lr_res %<>%
        join_means(means = means,
                   source_target = "source",
                   entity = "ligand",
                   type = "expr") %>%
        join_means(means = means,
                   source_target = "target",
                   entity = "receptor",
                   type = "expr") %>%
        join_means(means = scaled,
                   source_target = "source",
                   entity = "ligand",
                   type = "scaled") %>%
        join_means(means = props,
                   source_target = "target",
                   entity = "receptor",
                   type = "prop") %>%
        join_means(means = props,
                   source_target = "source",
                   entity = "ligand",
                   type = "prop") %>%
        join_means(means = scaled,
                   source_target = "target",
                   entity = "receptor",
                   type = "scaled") %>%
        join_sum_means(means = means,
                       entity = "ligand") %>%
        join_sum_means(means = means,
                       entity = "receptor") %>%
        # Join PEM scores
        join_means(means = pem_scores,
                   source_target = "target",
                   entity = "receptor",
                   type = "pem") %>%
        join_means(means = pem_scores,
                   source_target = "source",
                   entity = "ligand",
                   type = "pem") %>%
        # logFC
        join_log2FC(logfc_df,
                    source_target = "source",
                    entity="ligand") %>%
        join_log2FC(logfc_df,
                    source_target = "target",
                    entity="receptor") %>%
        # Global Mean
        mutate(global_mean = global_mean)

    liana_message("LIANA: LR summary stats calculated!",
                  verbose = verbose
    )

    # Weigh by (spatial) constrains
    if(!is.null(cell.adj)){
        lr_res %<>%
            .sp_costrain(cell.adj)
    }

    # Join complexes (recomplexify) to lr_res
    cmplx <- op_resource %>%
        select(
            ligand = source_genesymbol,
            ligand.complex = source_genesymbol_complex,
            receptor = target_genesymbol,
            receptor.complex = target_genesymbol_complex
        )

    lr_res %<>%
        left_join(., cmplx,
                  by=c("ligand", "receptor")) %>%
        distinct()

    # Inherit levels of idents
    levels(lr_res$source) <- levels(colLabels(sce))
    levels(lr_res$target) <- levels(colLabels(sce))

    return(lr_res)
}



#' Helper Function to join DEG stats to LR
#'
#' @param cluster_markers dataframe with DE stats for a cluster
#' @param entity Transmitter or Receiver vector passed as tibble
#' @param ligand_receptor whether this is the source or target cluster
#'
#' @return A tibble with stats for receivers or transmitters per cluster
#'
#' @noRd
ligrec_degformat <- function(cluster_markers,
                             entity,
                             ligand_receptor,
                             columns=c("gene", "p.value", "FDR", "stat")){
    cluster_markers %>%
        left_join(entity, ., by = "gene") %>%
        na.omit() %>%
        select(columns) %>%
        rename_with(~paste(ligand_receptor, .x, sep = "."), -gene) %>%
        dplyr::rename({{ligand_receptor}} := gene) %>%
        distinct() %>%
        na.omit()
}




#' Join Expression per Cluster
#'
#' @param lr_res LR formatted DE results from \link{ligrec_degformat}
#' @param means Gene avg expression per cluster
#' @param source_target target or source cell
#' @param entity ligand or receptor
#' @param type type of mean to join (count or scaled)
#' @param pb progress bar
#'
#' @importFrom magrittr %>% %<>%
#'
#' @return Returns the Average Expression Per Cluster
#'
#' @export
#'
#' @keywords internal
join_means <- function(lr_res,
                       means,
                       source_target,
                       entity,
                       type,
                       pb = NULL){

    if(!is.null(pb)){
        pb$tick()$print()
    }

    entity.avg <- sym(str_glue("{entity}.{type}"))

    means %<>%
        as.data.frame() %>%
        rownames_to_column("gene") %>%
        pivot_longer(-gene, names_to = "cell", values_to = "avg") %>%
        dplyr::rename({{ source_target }} := cell,
                      {{ entity }} := gene,
                      {{ entity.avg }} := avg)

    lr_res %>%
        left_join(means, by=c(source_target, entity))
}


#' Join Expression per Cluster
#'
#' @param lr_res LR formatted DE results from \link{ligrec_degformat}
#' @param means Gene avg expression per cluster
#' @param entity ligand or receptor
#'
#' @return Returns the Summed Average Expression Per Cluster
#'
#' @noRd
join_sum_means <- function(lr_res, means, entity){

    entity.expr <- sym(str_glue("{entity}.sum"))

    sums <- means %>%
        as.data.frame() %>%
        rownames_to_column("gene") %>%
        rowwise("gene") %>%
        mutate(sum.means = sum(c_across(where(is.numeric)))) %>%
        select(gene, sum.means) %>%
        dplyr::rename({{ entity }} := gene,
                      {{ entity.expr }} := sum.means)  %>%
        distinct() %>%
        ungroup()

    lr_res %>%
        left_join(sums, by=c(entity))
}


#' Helper Function to join log2FC dataframe to LR_res
#'
#' @param lr_res LR formatted DE results from \link{ligrec_degformat}
#' @param logfc_df obtained via \link{get_log2FC}
#' @param source_target target or source cell
#' @param entity ligand or receptor
#'
#' @noRd
join_log2FC <- function(lr_res,
                        logfc_df,
                        source_target,
                        entity){

    entity.fc <- sym(str_glue("{entity}.log2FC"))

    logfc <- logfc_df %>%
        dplyr::rename(
            {{ source_target }} := cell,
            {{ entity }} := gene,
            {{ entity.fc }} := avg_log2FC)  %>%
        distinct()

    lr_res %>%
        left_join(logfc, by=c(entity, source_target))

}




#' Get Log2FC of Subject vs LOSO (i.e. 1 cell type vs all other cells FC)
#'
#' @param sce SingleCellExperiment object
#' @param subject leave-one-out subject, i.e. the cluster whose log2FC we wish
#'    to calculate when compared to all other cells
#' @inheritParams .antilog1m
#'
#' @inheritParams liana_pipe
#'
#' @return A log2FC dataframe for a given cell identity
#'
#' @details log2FC is calculated using the raw count average + a pseudocount of 1.
#' `assay.type` should be the raw counts
#'
#' @noRd
get_log2FC <- function(sce, assay.type, base){


    if(!is.nan(base)){
        # Get anti-logged normalized counts (preserves batch effets)
        sce@assays@data[["normcounts"]] <- .get_invcounts(sce, assay.type, base)
    } else{
        # Get raw counts as they are
        sce@assays@data[["normcounts"]] <- counts(sce)
    }


    # iterate over each possible cluster leaving one out
    levels(colLabels(sce)) %>%
        map(function(subject){
            # Subject (i.e. target) Cluster avg
            subject_avg <-
                scater::calculateAverage(subset(sce,
                                                select = colLabels(sce)==subject),
                                         assay.type = "normcounts"
                ) %>%
                as_tibble(rownames = "gene") %>%
                dplyr::rename(subject_avg = value)

            # All other cells average
            loso_avg <-
                scater::calculateAverage(subset(sce,
                                                select = !(colLabels(sce) %in% subject)),
                                         assay.type = "normcounts"
                ) %>%
                as_tibble(rownames = "gene") %>%
                dplyr::rename(loso_avg = value)

            # Join avg and calculate FC
            left_join(subject_avg, loso_avg, by="gene") %>%
                mutate(avg_log2FC =
                           log2((subject_avg + 1)) - log2((loso_avg + 1))) %>%
                select(gene, avg_log2FC)

        }) %>% setNames(levels(colLabels(sce))) %>%
        enframe(name = "cell") %>%
        unnest(value)
}


#' Helper Function to 'decomplexify' ligands and receptors into individual subunits
#'
#' @param resource a ligrec resource
#'
#' @param columns columns to separate and pivot long (e.g. genesymbol or uniprot),
#' `source_genesymbol` and `target_genesymbol` by default
#'
#' @return returns a longer tibble with complex subunits on seperate rows
#'
#' @details takes any number of columns, and assumes `_` as sep.
#'
#' @export
decomplexify <- function(resource,
                         columns = c("source_genesymbol",
                                     "target_genesymbol")){
    columns %>%
        map(function(col){
            sep_cols <- c(str_glue("col{rep(1:5)}"))
            col.complex <- str_glue("{col}_complex")

            resource <<- resource %>%
                mutate({{ col.complex }} :=
                           resource[[str_glue("{col}")]]) %>%
                separate(col,
                         into = sep_cols,
                         sep = "_",
                         extra = "drop",
                         fill = "right") %>%
                pivot_longer(cols = all_of(sep_cols),
                             values_to = col,
                             names_to = NULL) %>%
                tidyr::drop_na(all_of(col)) %>%
                distinct() %>%
                mutate(across(all_of(c(col, col.complex)),
                              ~str_replace(., "COMPLEX:", "")))
        })
    return(resource)
}


#' Helper Function to Get a rowwise scaled matrix
#'
#' @param mat a matrix, typically the logcounts matrix from an SCE object
#'
#'
#' @noRd
row_scale <- function(mat){
    col_means = rowMeans(mat,
                         na.rm = TRUE) # Get the column means
    col_sd = MatrixGenerics::rowSds(mat,
                                    center = col_means,
                                    na.rm = TRUE) # Get the column sd

    # return scaled mat
    return(as.matrix((mat - col_means) / col_sd))
}

#' Helper function to inverse logged counts
#'
#' @param x mat or array
#' @param base base for conversion from log-tranformed ~CPM back to ~CPM.
#'
#' @keywords internal
.antilog1m <- function(x, base=2){base ^ (x) - 1}


#' Helper function to generate inversed counts (i.e. normalized but not logged)
#'
#' @param sce SingleCellExperiment object
#' @param assay.type counts slot
#' @param base a positive or complex number: the base with respect to which
#' log-transformation was computed.
#'
#' @noRd
.get_invcounts <- function(sce, assay.type, base){
    antilogged <- .antilog1m(slot(exec(assay.type, sce), "x"), base = base)

    methods::new(
        "dgCMatrix",
        i = slot(exec(assay.type, sce), "i"),
        p = slot(exec(assay.type, sce), "p"),
        Dim = slot(exec(assay.type, sce), "Dim"),
        Dimnames = slot(exec(assay.type, sce), "Dimnames"),
        x = antilogged,
        factors = slot(exec(assay.type, sce), "factors")
    )
}


#' Helper function to fix r mean
#'
#' @param mat a matrix
#'
#' @details r mean is slow and it overflows on memory.
#'
#' @noRd
fast_mean <- function(mat){
    if(class(mat)=="dgCMatrix"){
        sum(mat@x)/(as.numeric(nrow(mat)) * as.numeric(ncol(mat)))
    } else{
        Matrix::mean(mat)
    }
}

#' Helper function to format SCE to the LR universe
#' @param sce SingleCellExperiment object
#' @param entity_genes union of ligand-receptor genes
#' @param verbose verbose - True/Flase
#'
#' @noRd
.prep_universe <- function(sce, entity_genes, verbose){

    # Keep only LR universe
    sce <- sce[rownames(sce) %in% entity_genes, ]

    # Check organism/gene intersect
    if(nrow(sce) < 3){
        liana_message(
            "Low gene intersect (<3) detected!",
            "Please check if the rownames of the data match the gene identities in the resource (i.e. human genesymbols).",
            output = "stop",
            verbose = verbose
        )
    }

    # Check for non-zero cells
    nonzero_cells <- colSums(counts(sce)) > 0

    if(!all(nonzero_cells)){
        nzero_cells <- sum(map_dbl(nonzero_cells, function(x) rlang::is_false(x = x)))

        liana_message(
            stringr::str_glue("{nzero_cells} cells were excluded as they",
                              " did not express any ligand-receptor genes!"),
            output="warning",
            verbose=verbose
        )
    }

    return(sce[,nonzero_cells])
}


#' Weigh by spatial constrans
#'
#' @param lr_res liana_pipe output prior to joining complexes
#' @param cell.adj cell adjacency weights (should be positive)
#' @param adjacency name of the column with cell pair adjacency scores
#'
#' @return weighed lr_res
#'
#' @details Note that for the case that there are weights from 0-1, the negative
#' values (in e.g. logFC, z-scores) might be counter-logically affected - i.e.
#' they would be brought closer to 0.
#' Thus, by default liana expects weights from 0-1. These are then multiplied
#' for positive values, while negative values are divided.
#'
#' Alternatively, one could e.g. multiply the weights by a factor (e.g. 10,000),
#' if logFC and Connectome are used. However, this would change some of
#' the assumptions/interpretations of the scores, while
#' consensus ranking will be unaffected.
#'
#' Also, note that any interactions between cell pairs with an adjacency of 0
#' will be excluded (this would affect the scores from CytoTalk).
#'
#' NB! `%/*/%` is only applicable and relevant to logFC and z-scores from a
#' single-context, and should not be used when scaling between conditions!!!
#'
#' @keywords internal
.sp_costrain <- function(lr_res,
                         cell.adj,
                         adjacency = "adjacency"){
    lr_res %>%
        left_join(cell.adj, by=c("source", "target")) %>%
        # remove any interactions between non-interacting cells
        filter(.data[[adjacency]]!=0) %>%
        # weigh relevant columns by adjacency
        mutate(across(ends_with(c("expr", "pem") #**
        ), ~`*`(.x, adjacency))) %>%
        rowwise() %>% # required as its done by element (not vector operation)
        mutate(across(ends_with(c("log2FC", "scaled")),
                      # makes sense for single context
                      # but makes no sense for multiple (better to change to prod)
                      ~`%/*/%`(.x, adjacency))) %>%
        ungroup()
}



#' Helper function to multiply or divide depending on sign
#'
#' @param x target for weighing
#' @param y weight
#'
#' @noRd
`%/*/%` <- function(x, y){
    if(y==0) return(0)

    if(x >= 0){
        x * y
    } else{
        x / y
    }
}