## Gaussian mixture modeling
##
## by Artem Sokolov
## Modifies a vector to have a single "activation" phase
##
## Not exported
singleAct <- function( v )
{
## Identify the "activation" phase by its positive slope
n <- length(v)
ph <- which( (v[2:n] - v[1:(n-1)]) > 0 )
mni <- min(ph)
mxi <- max(ph) + 1
stopifnot( mni < mxi )
v[1:mni] <- min(v)
v[mxi:n] <- max(v)
v
}
## Linear rescale of a vector to the [lo, hi] range
##
## Not exported
rescaleLin <- function( v, lo, hi )
{
vadj <- (v-lo) / (hi-lo)
vfin <- vadj[ is.finite(vadj) ]
dplyr::case_when(
vadj == -Inf ~ min(vfin),
vadj == Inf ~ max(vfin),
TRUE ~ vadj )
}
## Automatically finds distribution boundaries that contain most of the data
##
## Not exported
findBounds <- function( .v, baseline=1e-3 )
{
density( .v ) %>% with( tibble::tibble(x, y) ) %>%
dplyr::mutate_at( "y", ~.x / max(.x) ) %>%
dplyr::arrange(x) %>% dplyr::filter( y > baseline ) %>%
dplyr::filter( dplyr::row_number() %in% c(1,n()) ) %>%
dplyr::pull(x) %>% as.list() %>% rlang::set_names(c("lo","hi"))
}
## Fits a two-component mixture model to the provided 1-D vector
## v - vector of values
## chName - name of the channel (for reporting only)
## mu_init - vector of two values, each between 0 and 1, specifying the initial placement of Gaussian means
## rs - random seed to allow reproducibility
##
## Not exported
mixEM <- function( v, chName, mu_init, rs=100 )
{
set.seed(rs)
cat( "Fitting a mixture model to", chName, "... " )
mix <- mixtools::normalmixEM( v, mu=mu_init, maxit=2000 )
## Reorder the means to be monotonically increasing
mdl <- mix[c("lambda","mu","sigma")]
j <- order(mdl$mu)
purrr::map( mdl, ~.x[j] )
}
## Given a model returned by mixEM(), computes posterior probabilities
## for column vals in data frame .df
##
## Not exported
mutate_probs <- function( .df, vals, mix )
{
## Ensure the means are ordered
stopifnot( mix$mu[1] < mix$mu[2] )
s <- rlang::ensym(vals)
## Adjust the values based on quantile limits
## Compute posterior probabilities of each value landing in negative and positive clusters
## NAs arising due to CP+CN == 0 are filled, NAs in column !!s are then restored
.df %>% dplyr::arrange(!!s) %>%
dplyr::mutate( AdjVal = rescaleLin(!!s, mix$lo, mix$hi),
CN = mix$lambda[1]*dnorm(AdjVal, mix$mu[1], mix$sigma[1]),
CP = mix$lambda[2]*dnorm(AdjVal, mix$mu[2], mix$sigma[2]),
Prob = tidyr::replace_na(CP/(CP+CN), NA) ) %>% # NaN -> NA for fill()
tidyr::fill( Prob, .direction="downup" ) %>%
dplyr::mutate( Prob = singleAct(Prob) ) %>%
dplyr::mutate( Prob = ifelse(is.na(!!s), NA, Prob) ) # Restore original NAs
}
#' Fit a Gaussian model to a set of channels
#'
#' @param X - cell-by-channel data frame of marker expression
#' @param cid - column name or index that contains cell IDs
#' @param ... - columns to fit a GMM to
#' @param bounds - [optional] Two values specifying the range of values to be used
#' for mixture modeling. This allows for exclusion of outliers from
#' model fitting. Calculated automatically, if not provided.
#' @param baseline - [optional] baseline for automatic boundary calculation
#' @param mu_init - [optional] vector of two values, each between 0 and 1,
#' specifying the initial placement of Gaussian means.
#' @param seed - [optional] random seed to allow for reproducibility
#' @return A composite data frame containing trained GMMs and their fits to data
#' @importFrom magrittr %>%
#' @export
GMMfit <- function(X, cid, ..., bounds, baseline=0.01, mu_init=c(0.2,0.8), seed=100)
{
## Argument verification
mb <- missing(bounds)
if( !mb && length(bounds) != 2 )
stop( "bounds must be a vector of two values" )
if( any(mu_init < 0) | any(mu_init > 1) )
stop( "mu_init argument values must be in [0, 1] range" )
## Select all relevant columns from the original data frame
X1 <- X %>% dplyr::select( {{cid}}, ... ) %>%
tidyr::gather( Marker, Values, -1 )
## Verify that the data has been log-normalized
if( range(X1$Values, na.rm=TRUE)[2] > 1000 )
warning( "Large values detected. Please ensure the data has been log-normalized." )
## Isolate the finite marker values to use for modeling
MV <- X1 %>% dplyr::filter( is.finite(Values) ) %>%
dplyr::filter( Values != 0 ) %>%
dplyr::group_by( Marker ) %>%
dplyr::summarize_at("Values",list)
## Computes distribution bounds, or uses provided values
## Adjust and filter the values accordingly
fb <- ~set_names(`if`( mb, findBounds(.x,baseline), as.list(sort(bounds)) ),
c("lo","hi"))
MVQ <- MV %>% dplyr::mutate( Bounds = purrr::map(Values, fb) ) %>%
dplyr::mutate_at( "Values", map2, .$Bounds, ~((.x-.y$lo) / (.y$hi-.y$lo)) ) %>%
dplyr::mutate_at( "Values", map, keep, ~(.x >= 0 & .x <=1) )
## Fit a mixture of two Gaussians to each marker
G <- MVQ %>%
dplyr::mutate( GMM = purrr::map2(Values, Marker, mixEM, mu_init, seed) ) %>%
dplyr::mutate_at( "GMM", purrr::map2, .$Bounds, c ) %>%
dplyr::select( Marker, GMM )
## Compute posterior probabilities on the original data
fmp <- ~mutate_probs(.x, "Value", .y)
R <- X1 %>% dplyr::rename( Value = Values ) %>%
tidyr::nest( Values=c({{cid}}, Value) ) %>%
dplyr::inner_join(G, by="Marker") %>%
dplyr::mutate( Values = purrr::map2(Values, GMM, fmp) )
R
}
#' Reshapes a GMMfit() dataframe to the original cell-by-marker format
#'
#' @param .df - data frame produced by GMMfit()
#' @return A data frame in the original cell-by-marker format
#' @importFrom magrittr %>%
#' @export
GMMreshape <- function(.df)
{
.df %>% dplyr::select( -GMM ) %>% tidyr::unnest(Values) %>%
dplyr::select( -Value, -AdjVal, -CN, -CP ) %>%
tidyr::spread( Marker, Prob )
}
#' Returns GMM models to tidy format
#'
#' @param .df - data frame produced by GMMfit()
#' @return A data frame that maps each marker to the following values in
#' the log10-transformed space:
#' \describe{
#' \item{lo/hi}{The low and high cut-off values for outlier exclusion.}
#' \item{lambda1/2}{Mixture coefficients for the GMM. Sum up to 1.}
#' \item{mu1/2}{Centroids of the two Gaussians in the mixture model.}
#' \item{sigma1/2}{The corresponding standard deviation of each Gaussian.}
#' }
#' @importFrom magrittr %>%
#' @export
GMMmodels <- function(.df)
{
.df %>%
dplyr::select( Marker, GMM ) %>%
dplyr::mutate_at( "GMM", purrr::map,
~list(lo = .x$lo, hi = .x$hi,
lambda1 = .x$lambda[1],
lambda2 = .x$lambda[2],
mu1 = .x$mu[1] * (.x$hi - .x$lo) + .x$lo,
mu2 = .x$mu[2] * (.x$hi - .x$lo) + .x$lo,
sigma1 = .x$sigma[1] * (.x$hi - .x$lo),
sigma2 = .x$sigma[2] * (.x$hi - .x$lo))
) %>%
dplyr::mutate_at( "GMM", purrr::map,
~tidyr::unnest(tibble::enframe(.x),value) ) %>%
tidyr::unnest(GMM) %>% tidyr::pivot_wider()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.