naivestates: Cell state calling using Naive Bayes

Documented in callStates findDominant

## Using probabilities of marker expression, assign scores for each class
##   in one or more dichotomies of interest via Naive Bayes
##
## by Artem Sokolov

#' Identifies a marker with the highest probability of expression
#' @param P Posterior probabilities, as generated by GMMreshape()
#' @param cid Column containing cell IDs
#' @param vm (optional) vector of markers to limit the argmax to
#' @param pthr probability threshold above which a cell is considered to express the marker
#' @return The original dataframe P augmented with the dominant marker column
#' @importFrom magrittr %>%
#' @export
findDominant <- function( P, cid, vm, pthr = 0.65 )
{
    P1 <- P %>% dplyr::mutate( tmpID = 1:(dplyr::n()) )
    if( missing(vm) )
        vm <- dplyr::select(P, -{{cid}}) %>% colnames

    vm <- c(vm, "None")
    
    P1 %>%
        dplyr::mutate( None = pthr ) %>%
        dplyr::select( tmpID, all_of(vm) ) %>%
        tidyr::gather( Dominant, Value, vm ) %>%
        dplyr::group_by( tmpID ) %>% dplyr::slice(which.max(Value)) %>%
        dplyr::ungroup() %>% dplyr::arrange( tmpID ) %>%
        dplyr::inner_join( P1, by="tmpID" ) %>%
        dplyr::select( -tmpID, -Value ) %>%
        dplyr::select( {{cid}}, Dominant, everything() )
}

#' Combines probabilities of expression in individual markers to call cell states
#'   according to a predefined marker -> cell state mapping
#' 
#' @param P     Posterior probabilities, as generated by GMMreshape()
#' @param cid   Column containing cell IDs
#' @param M     A data frame with columns Marker and State, defining the mapping between the two
#' @param fcomb A function for combining individual probabilities. Use "gmean" for geometric mean
#'                and "hmean" for harmonic mean. (Default: "hmean")
#' @param pthr  Probability threshold that must be exceeded for a cell state to be selected
#' @return      A data frame of probabilities for each cell/state pair, along with final calls
#' @importFrom magrittr %>%
#' @export
callStates <- function(P, cid, M, fcomb="hmean", pthr=0.5 )
{
    ## Argument verification
    if( !all(c("Marker", "State") %in% colnames(M)) )
        stop( "Marker -> cell state map must contain columns \"Marker\" and \"State\"" )
    
    ## Determine the scoring function for combining probabilities
    if( is.function(fcomb) )
        fscore <- fcomb
    else if( is.character(fcomb) && fcomb == "gmean" )
        fscore <- function(x) {exp(mean(log(x)))}
    else if( is.character(fcomb) && fcomb == "hmean" )
        fscore <- function(x) {length(x) / sum( 1/x )}
    else
        stop( "fcomb must be a function, \"gmean\", or \"hmean\"" )

    ## Isolate relevant markers
    S <- P %>% dplyr::select( CellID = {{cid}}, dplyr::all_of(M$Marker) )
    
    ## Fill out implicit negatives
    M <- M %>% dplyr::mutate( Expressed = "Yes" ) %>%
        tidyr::pivot_wider( names_from = Marker, values_from = Expressed, values_fill = "No" ) %>%
        tidyr::pivot_longer( -State, names_to = "Marker", values_to = "Expressed" ) %>%
        dplyr::group_by( State, Expressed ) %>%
        dplyr::summarize( dplyr::across(Marker, list), .groups="drop" ) %>%
        tidyr::pivot_wider( names_from = Expressed, values_from = Marker )

    ## Compute scores for each cellID/state pair
    S <- S %>% tidyr::pivot_longer( -CellID, names_to="Marker", values_to="Value" ) %>%
        dplyr::group_by( CellID ) %>%
        dplyr::summarize( Expression = list(rlang::set_names(Value, Marker)), .groups="drop" ) %>%
        tidyr::expand_grid( M ) %>%
        dplyr::rowwise() %>%
        dplyr::mutate(Contrib = list(c(Expression[Yes], 1-Expression[No])),
                      Prob = fscore(Contrib)) %>%
        dplyr::ungroup() %>% dplyr::select( CellID, State, Prob )

    ## Make the final call via argmax
    G <- S %>% dplyr::group_by( CellID ) %>%
        dplyr::slice( which.max(Prob) ) %>%
        dplyr::ungroup() %>%
        dplyr::mutate( State = ifelse(Prob < pthr, "Other", State) ) %>%
        dplyr::select( CellID, State )
    
    ## Reshape to cell-by-state matrix of probabilities and combine with final calls
    F <- S %>% tidyr::pivot_wider( names_from = State, values_from = Prob ) %>%
        dplyr::inner_join( G, ., by="CellID" ) %>%
        dplyr::rename( !!rlang::sym(cid) := CellID )

    F
}