
#=============================================================================
#
#                  flip_sign_if_all_exprs_are_negative
#                  evenify_upwards
#
#=============================================================================

#' Flip sign if all expr values are negative
#' @param x matrix
#' @param verbose TRUE (default) or FALSE
#' @return updated matrix
#' @noRd
flip_sign_if_all_exprs_are_negative <- function(x, verbose=TRUE){
    idx <- !is.na(x)
    if (all(sign(x[idx])==-1)){
        if (verbose)  message('\t\tAll values negative: ', 
                                'flip signs to prevent singularities.')
        x %<>% multiply_by(-1)
    }
    x
}



#' Is even/odd?
#' @param x integer
#' @return TRUE or FALSE
#' @examples
#' is_even(13)
#' is_even(12)
#' is_odd(13)
#' is_odd(12)
#' @noRd
is_even <- function(x)   (x %% 2) == 0
is_odd  <- function(x)    !is_even(x)


#' Has even/odd length?
#' @param x vector
#' @return logical
#' @examples
#' has_even_length(1:2)
#' has_odd_length(1:2)
#' has_even_length(1:3)
#' has_odd_length(1:3)
#' @noRd
has_even_length <- function(x)   is_even(length(x))
has_odd_length  <- function(x)   is_odd(length(x))


#' Evenify upwards
#' @param x integer
#' @return integer
#' @examples
#' evenify_upwards(3)
#' @noRd
evenify_upwards <- function(x)   if (is_odd(x)) x+1 else x



#============================================================================
#
#               pca sma lda pls spls opls
#                   .filter_minvar
#
#============================================================================

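# Drop components whose explained variance (looked up in metadata(object)[[method]])
# falls below minvar, together with the matching sdata/fdata columns and metadata entries.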
.filter_minvar <- function(object, method, minvar) {
    variances <- metadata(object)[[method]]
    discard_components <- variances[variances < minvar] %>% names()
    sdata(object)[discard_components] <- NULL
    fdata(object)[discard_components] <- NULL
    metadata(object)[[method]] <- variances[!names(variances) %in% discard_components]
    object
}

   scorenames <- function( method = 'pca', by, dims = 1:2, sep = FITSEP )  paste0('effect', sep, by, sep, method, dims)
 loadingnames <- function( method = 'pca', by, dims = 1:2, sep = FITSEP )  paste0('effect', sep, by, sep, method, dims)
   methodname <- function( method = 'pca', by,             sep = FITSEP )  paste0(               by, sep, method      )
variancenames <- function(                     dims = 1:2               )  paste0('effect',                       dims)
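# Naming scheme sketch (assuming FITSEP is '~', as the literal '~' used further down
# in this file suggests):
#   scorenames(  'pca', by = 'sample_id', dims = 1:2)  #> "effect~sample_id~pca1" "effect~sample_id~pca2"
#   loadingnames('pca', by = 'sample_id', dims = 1:2)  #> identical: loadings reuse the score names
#   methodname(  'pca', by = 'sample_id')              #> "sample_id~pca"
#   variancenames(1:2)                                 #> "effect1" "effect2"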

variances <- function(
    object, method = 'pca', by = biplot_by(object, method), dims = 1:2, sep = FITSEP
){
    y <- metadata(object)
    y %<>% extract2(methodname(method, by = by, sep = sep))
    y %<>% extract(variancenames(dims))
    y
}
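# e.g. variances(object, 'pca', by = 'sample_id', dims = 1:2) reads
# metadata(object)[['sample_id~pca']][c('effect1', 'effect2')]  (same FITSEP assumption)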

#' Extract scores/loadings
#' @param object SummarizedExperiment
#' @param method 'pca', 'pls', etc.
#' @param by      svar (string)
#' @param dim     numeric vector
#' @examples
#' file <- system.file('extdata/atkin.metabolon.xlsx', package = 'autonomics')
#' object <- read_metabolon(file)
#' object %<>% pca()
#'     scores(object)[1:2]
#'   loadings(object)[1:2]
#'   scoremat(object)[1:2, ]
#' loadingmat(object)[1:2, ]
#' @export
scoremat <- function(
    object, method = 'pca', by = biplot_by(object, method), dim = 1:2
){
    sep <- guess_fitsep(fdt(object))
    cols <- 'sample_id'
    cols %<>% c(scorenames(method = method, by = by, dims = dim, sep = sep))
    mat <- sdt(object)[, cols, with = FALSE]
    mat %<>% dt2mat()
    mat
}

#' @rdname scoremat
#' @export
scores <- function(
    object, method = 'pca', by = biplot_by(object, method), dim = 1
){
    sep <- guess_fitsep(fdt(object))
    cols <- scorenames(method = method, by = by, dims = dim, sep = sep)
    sdt(object)[[cols]]
}

#' @rdname scoremat
#' @export
loadingmat <- function(
    object, method = 'pca', by = biplot_by(object, method), dim = 1:2
){
    sep <- guess_fitsep(fdt(object))
    cols <- 'feature_id'
    cols %<>% c(loadingnames(method = method, by = by, dims = dim, sep = sep))
    mat <- fdt(object)[, cols, with = FALSE]
    mat %<>% dt2mat()
    mat
}

#' @rdname scoremat
#' @export
loadings <- function(
    object, method = 'pca', by = biplot_by(object, method), dim = 1
){
    sep <- guess_fitsep(fdt(object))
    cols <- loadingnames(method = method, by = by, dims = dim, sep = sep)
    fdt(object)[[cols]]
}


#' PCA, SMA, LDA, PLS, SPLS, OPLS
#'
#' Perform a dimension reduction.
#' Store sample scores, feature loadings, and dimension variances.
#'
#' @param object          SummarizedExperiment
#' @param by              svar or NULL
#' @param sep             string
#' @param assay           string
#' @param ndim            number
#' @param minvar          number
#' @param center_samples  TRUE/FALSE: center samples prior to PCA?
#' @param verbose         TRUE/FALSE: message progress?
#' @param plot            TRUE/FALSE: plot the result?
#' @param ...             passed to biplot
#' @return                SummarizedExperiment
#' @examples
#' file <- system.file('extdata/atkin.metabolon.xlsx', package = 'autonomics')
#'  object <- read_metabolon(file)
#'  pca(object, plot = TRUE)    # Principal Component Analysis
#'  pls(object, plot = TRUE)    # Partial Least Squares
#'  lda(object, plot = TRUE)    # Linear Discriminant Analysis
#'  sma(object, plot = TRUE)    # Spectral Map Analysis
#' spls(object, plot = TRUE)    # Sparse PLS
#' # opls(object, plot = TRUE)  # OPLS: commented out because it writes a file named 'FALSE'
#' @author Aditya Bhagwat, Laure Cougnaud (LDA)
#' @export
pca <- function(
            object, 
                by = 'sample_id', 
             assay = assayNames(object)[1], 
              ndim = 2,
               sep = FITSEP,
            minvar = 0, 
    center_samples = TRUE,
           verbose = TRUE,
              plot = FALSE, 
               ...
){
# Assert
    if (!requireNamespace('pcaMethods', quietly = TRUE)){
        message("\t\t\tBiocManager::install('pcaMethods'). Then re-run.") 
        return(object) }   # tabs: align with other msgs in read_xxx
    assert_is_valid_sumexp(object)
    assert_is_scalar(assay); assert_is_subset(assay, assayNames(object))
    if (is.infinite(ndim)) ndim <- ncol(object)
    assert_is_a_number(ndim)
    assert_all_are_less_than_or_equal_to(ndim, ncol(object))
    assert_is_a_number(minvar)
    assert_all_are_in_range(minvar, 0, 100)
    assert_is_a_bool(center_samples)
    assert_is_a_bool(verbose)
    assert_is_a_bool(plot)
    if (verbose)  cmessage('%sAdd PCA', spaces(14))
    . <- NULL
# Prepare
    tmpobj <- object
    assays(tmpobj)[[assay]] %<>% inf_to_na(verbose=verbose)
    assays(tmpobj)[[assay]] %<>% nan_to_na(verbose=verbose)
    tmpobj %<>% rm_missing_in_all_samples(verbose = verbose)
# (Double) center and (global) normalize
    row_means <- rowMeans(        assays(tmpobj)[[assay]], na.rm = TRUE)
    col_means <- colWeightedMeans(assays(tmpobj)[[assay]], abs(row_means), na.rm = TRUE)
    global_mean <- mean(col_means)
    if (center_samples)  assays(tmpobj)[[assay]] %<>% apply(1, '-', col_means)  %>% t()  # Center columns (samples)
                         assays(tmpobj)[[assay]] %<>% apply(2, '-', row_means)           # Center rows (features)
    if (center_samples)  assays(tmpobj)[[assay]] %<>% add(global_mean)                   # Add doubly subtracted
                         assays(tmpobj)[[assay]] %<>% divide_by(sd(., na.rm=TRUE))       # Normalize
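    # i.e. with center_samples = TRUE each value ends up as
    #     (x[i, j] - col_means[j] - row_means[i] + global_mean) / sd(centered matrix)
    # which removes per-sample and per-feature offsets before the NIPALS PCA below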
# Perform PCA
    pca_res  <- pcaMethods::pca(t(assays(tmpobj)[[assay]]), nPcs = ndim, scale = 'none', center = FALSE, method = 'nipals')
      samples <- pca_res@scores
     features <- pca_res@loadings
    variances <- round(100*pca_res@R2)
     colnames(samples) <-    scorenames(method = 'pca', by = by, dims = seq_len(ndim), sep = sep)
    colnames(features) <-  loadingnames(method = 'pca', by = by, dims = seq_len(ndim), sep = sep)
      names(variances) <- variancenames(seq_len(ndim))
# Add
    object %<>% merge_sdt(mat2dt(samples,   'sample_id'))
    object %<>% merge_fdt(mat2dt(features, 'feature_id'))
    metavar <- methodname(method = 'pca', by = by, sep = sep)
    metadata(object)[[metavar]] <- variances
# Filter for minvar
    object %<>% .filter_minvar('pca', minvar)
# Return
    if (plot)  print(biplot(object, method = 'pca', dims = seq(1,ndim)[1:2], ...))
    object
}

#' @rdname pca
#' @export
pls <- function(
     object,
         by = 'subgroup',
      assay = assayNames(object)[1],
       ndim = 2, 
        sep = FITSEP,
     minvar = 0,
    verbose = FALSE,
       plot = FALSE, 
        ...
){
# Assert
    if (!requireNamespace('mixOmics', quietly = TRUE)){
        message("\t\t\tBiocManager::install('mixOmics'). Then re-run.")
        return(object) }   # tabs : align with other msgs in read_xxx
    assert_is_valid_sumexp(object)
    assert_is_scalar(assay);  assert_is_subset(assay, assayNames(object))
    assert_is_subset(by, svars(object))
    if (is.infinite(ndim)) ndim <- ncol(object)
    assert_is_a_number(ndim)
    assert_all_are_in_range(ndim, 1, ncol(object))
    assert_is_a_number(minvar)
    assert_all_are_in_range(minvar, 0, 100)
    . <- NULL
# Transform
    obj <- object[, !is.na(object[[by]]) ]
    x <- t(assays(obj)[[assay]])
    y <- svalues(obj, by)
    pls_out <- mixOmics::plsda( x, y, ncomp = ndim)
    samples   <- pls_out$variates$X
    features  <- pls_out$loadings$X
    variances <- round(100*pls_out$prop_expl_var$X)
    colnames(samples)  <-    scorenames(method = 'pls', by = by, dims = seq_len(ndim), sep = sep)
    colnames(features) <-  loadingnames(method = 'pls', by = by, dims = seq_len(ndim), sep = sep)
    names(variances)   <- variancenames(seq_len(ndim))
# Add
    object %<>% merge_sdt(mat2dt(samples,   'sample_id'))
    object %<>% merge_fdt(mat2dt(features, 'feature_id'))
    metavar <- methodname(method = 'pls', by = by, sep = sep)
    metadata(object)[[metavar]] <- variances
# Filter for minvar
    object %<>% .filter_minvar('pls', minvar)
# Return
    if (plot)  print(biplot(object, method = 'pls', dims = seq(1,ndim)[1:2], ...))
    object
}


#' @rdname pca
#' @export
sma <- function(
     object, 
         by = 'sample_id', 
      assay = assayNames(object)[1], 
       ndim = 2, 
        sep = FITSEP,
     minvar = 0,
    verbose = TRUE, 
       plot = FALSE, 
        ...
){
# Assert
    if (!requireNamespace('mpm', quietly = TRUE)){
        message("\t\t\tFirst Biocinstaller::install('mpm'). Then re-run.")
        return(object) }   # tabs: align with other msgs in read_xxx
    assert_is_valid_sumexp(object)
    assert_is_scalar(assay);  assert_is_subset(assay, assayNames(object))
    if (is.infinite(ndim)) ndim <- ncol(object)
    assert_is_a_number(ndim)
    assert_all_are_in_range(ndim, 1, ncol(object))
    assert_is_a_number(minvar)
    assert_all_are_in_range(minvar, 0, 100)
    . <- NULL
# Preprocess
    tmpobj <- object
    assays(tmpobj)[[assay]] %<>% minusinf_to_na(verbose = verbose)   # else SVD singular
    assays(tmpobj)[[assay]] %<>% flip_sign_if_all_exprs_are_negative(verbose = verbose)
    tmpobj        %<>% rm_missing_in_some_samples(verbose = verbose)
# Transform
    df <- data.frame(feature = rownames(tmpobj), assays(tmpobj)[[assay]])
    mpm_tmp <- mpm::mpm(
                df, logtrans = FALSE, closure = 'none', center = 'double',
                normal = 'global', row.weight = 'mean', col.weight = 'constant')
    ncomponents <- length(mpm_tmp$contrib)
    mpm_out <- mpm::plot.mpm(mpm_tmp, do.plot = FALSE, dim = seq_len(ncomponents))
# Extract
    samples   <- mpm_out$Columns
    features  <- mpm_out$Rows
    variances <- round(100*mpm_tmp$contrib[seq_len(ncomponents)])
    names(samples)   <-    scorenames(method = 'sma', by = by, dims = seq_len(ncomponents), sep = sep)
    names(features)  <-  loadingnames(method = 'sma', by = by, dims = seq_len(ncomponents), sep = sep)
    names(variances) <- variancenames(dims = seq_len(ncomponents))
# Restrict
    if (is.infinite(ndim)) ndim <- ncol(samples)
    samples   %<>% extract(, seq_len(ndim), drop = FALSE)
    features  %<>% extract(, seq_len(ndim), drop = FALSE)
    variances %<>% extract(  seq_len(ndim))
# Add
    samples  %<>% cbind( sample_id = rownames(.), .)
    features %<>% cbind(feature_id = rownames(.), .)
    object %<>% merge_sdt(data.table(samples),  'sample_id')
    object %<>% merge_fdt(data.table(features), 'feature_id')
    metavar <- methodname(method = 'sma', by = by, sep = sep)
    metadata(object)[[metavar]] <- variances
# Filter for minvar
    object %<>% .filter_minvar('sma', minvar)
# Return
    if (plot)  print(biplot(object, method = 'sma', dims = seq(1,ndim)[1:2], ...))
    object
}


#' @rdname pca
#' @export
lda <- function(
     object, 
      assay = assayNames(object)[1], 
         by = 'subgroup', 
       ndim = 2, 
        sep = FITSEP,
     minvar = 0, 
    verbose = TRUE, 
       plot = FALSE, 
        ...
){
# Assert
    if (!requireNamespace('MASS', quietly = TRUE)){
        message("\t\t\tBiocManager::install('MASS'). Then re-run.")
        return(object) }  # tabs: align with other msgs in read_xxx
    assert_is_valid_sumexp(object)
    assert_is_scalar(assay);  assert_is_subset(assay, assayNames(object))
    assert_is_subset(by, svars(object))
    nsubgroup <- length(slevels(object, by))
    if (is.infinite(ndim))  ndim <- nsubgroup - 1
    assert_is_a_number(ndim)
    assert_all_are_in_range(ndim, 1, nsubgroup-1)
    if (ndim > (nsubgroup-1)) stop(
        sprintf('LDA requires ndim (%d) <= nsubgroup-1 (%d)',ndim, nsubgroup-1))
    assert_is_a_number(minvar)
    assert_all_are_in_range(minvar, 0, 100)
    . <- NULL
# Preprocess
    tmpobj <- object
    assays(tmpobj)[[assay]] %<>% minusinf_to_na(verbose = verbose)         # SVD singular
    assays(tmpobj)[[assay]] %<>% flip_sign_if_all_exprs_are_negative(verbose = verbose)
    tmpobj %<>% rm_missing_in_some_samples(verbose = verbose)
# Transform
    exprs_t  <- t(assays(tmpobj)[[assay]])
    lda_out  <- suppressWarnings(MASS::lda( exprs_t,grouping = object[[by]]))
    features <- lda_out$scaling
    if (ncol(features)==1) features %<>% cbind(LD2 = 0)
    exprs_t %<>% scale(center = colMeans(lda_out$means), scale = FALSE)
    samples  <- exprs_t %*% features
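    # samples now holds the discriminant scores: the class-mean-centered data
    # projected onto the discriminant axes in lda_out$scaling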
    variances <- round((lda_out$svd^2)/sum(lda_out$svd^2)*100)
    features  %<>% extract(, seq_len(ndim))
    samples   %<>% extract(, seq_len(ndim))
    variances %<>% extract(  seq_len(ndim))
    if (length(variances)==1) variances <- c(LD1 = variances, LD2 = 0)
# Rename
    colnames(samples)  <-    scorenames(method = 'lda', by = by, dims = seq_len(ndim), sep = sep)
    colnames(features) <-  loadingnames(method = 'lda', by = by, dims = seq_len(ndim), sep = sep)
    names(variances)   <- variancenames(seq_len(ndim))
# Restrict
    samples   %<>% extract(, seq_len(ndim), drop = FALSE)
    features  %<>% extract(, seq_len(ndim), drop = FALSE)
    variances %<>% extract(  seq_len(ndim))
# Merge - Filter - Return
    object %<>% merge_sdt(mat2dt(samples,   'sample_id'))
    object %<>% merge_fdt(mat2dt(features, 'feature_id'))
    metavar <- methodname(method = 'lda', by = by, sep = sep)
    metadata(object)[[metavar]] <- variances
    object %<>% .filter_minvar('lda', minvar)
    if (plot)  print(biplot(object, method = 'lda', dims = seq(1,ndim)[1:2], ...))
    object
}


#' @rdname pca
#' @export
spls <- function(
    object, 
     assay = assayNames(object)[1], 
        by = 'subgroup', 
      ndim = 2, 
       sep = FITSEP,
    minvar = 0, 
      plot = FALSE, 
       ...
){
# Assert
    if (!requireNamespace('mixOmics', quietly = TRUE)){
        message("\t\t\tBiocManager::install('mixOmics'). Then re-run.")
        return(object)   # tabs: align with other msgs in read_xxx
    }
    assert_is_valid_sumexp(object);  assert_is_scalar(assay)
    assert_is_subset(assay, assayNames(object))
    assert_is_subset(by, svars(object))
    if (is.infinite(ndim)) ndim <- ncol(object)
    assert_is_a_number(ndim)
    assert_all_are_in_range(ndim, 1, ncol(object))
    assert_is_a_number(minvar)
    assert_all_are_in_range(minvar, 0, 100)
    . <- NULL
# Transform
    x <- t(assays(object)[[assay]])
    y <- object[[by]]
    pls_out <- mixOmics::splsda( x, y, ncomp = ndim)
    samples   <- pls_out$variates$X
    features  <- pls_out$loadings$X
    variances <- round(100*pls_out$prop_expl_var$X)
    colnames(samples)  <-    scorenames(method = 'spls', by = by, dims = seq_len(ndim), sep = sep)
    colnames(features) <-  loadingnames(method = 'spls', by = by, dims = seq_len(ndim), sep = sep)
    names(variances)   <- variancenames(seq_len(ndim))
# Add
    object %<>% merge_sdt(mat2dt(samples,  'sample_id'))
    object %<>% merge_fdt(mat2dt(features,'feature_id'))
    metavar <- methodname(method = 'spls', by = by, sep = sep)
    metadata(object)[[metavar]] <- variances
# Filter for minvar
    object %<>% .filter_minvar('spls', minvar)
# Return
    if (plot)  print(biplot(object, method = 'spls', dims = seq(1,ndim)[1:2], ...))
    object
}


#' @rdname pca
#' @export
opls <- function(
     object, 
         by = 'subgroup', 
      assay = assayNames(object)[1],
       ndim = 2, 
        sep = FITSEP,
     minvar = 0, 
    verbose = FALSE,
       plot = FALSE, 
        ...
){
# Assert
    if (!requireNamespace('ropls', quietly = TRUE)){
        message("\t\t\tBiocManager::install('ropls'). Then re-run.")
        return(object)   # tabs: align with other msgs in read_xxx
    }
    assert_is_valid_sumexp(object)
    assert_is_scalar(assay);  assert_is_subset(assay, assayNames(object))
    if (is.infinite(ndim)) ndim <- ncol(object)
    assert_is_a_number(ndim)
    assert_all_are_in_range(ndim, 1, ncol(object))
    assert_is_a_number(minvar)
    assert_all_are_in_range(minvar, 0, 100)
    . <- NULL
# Transform
    x <- t(assays(object)[[assay]])
    y <- svalues(object, by)
    pls_out <- ropls::opls(x, y, predI = ndim, permI = 0, fig.pdfC = FALSE)
    samples   <- pls_out@scoreMN
    features  <- pls_out@loadingMN
    variances <- round(pls_out@modelDF$R2X*100)
    colnames(samples)  <-    scorenames(method = 'opls', by = by, dims = seq_len(ndim), sep = sep)
    colnames(features) <-  loadingnames(method = 'opls', by = by, dims = seq_len(ndim), sep = sep)
    names(variances)   <- variancenames(dims = seq_len(ndim))
# Add
    object %<>% merge_sdt(mat2dt(samples,  'sample_id'))
    object %<>% merge_fdt(mat2dt(features, 'feature_id'))
    metavar <- methodname(method = 'opls', by = by, sep = sep)
    metadata(object)[[metavar]] <- variances
# Filter for minvar
    object %<>% .filter_minvar('opls', minvar)
# Return
    if (plot)  print(biplot(object, method = 'opls', dims = seq(1,ndim)[1:2], ...))
    object
}


#=============================================================================
#
#           biplot
#               add_scores
#               add_loadings
#                   headtail
#
#==============================================================================

num2char <- function(x){
    if (is.null(x))     return(x)
    if (is.numeric(x))  return(as.character(x))
    return(x)
}

add_scores <- function(
           p, 
      object, 
           x = 'pca1',
           y = 'pca2',
       color = 'subgroup', 
       shape = if ('replicate' %in% svars(object)) 'replicate' else NULL,
        size = NULL, 
       alpha = NULL, 
       group = NULL, 
    linetype = NULL,
       fixed = list(shape = 15, size = 3, na.rm = TRUE)
){
# manual colors require non-numerics
    if (!is.null(color)){
        if (is.numeric(color)){
            levs <- as.character(unique(sort(object[[color]])))
            object[[color]] %<>% num2char() %>% factor(levs)
        }
    }
    xsym <- sym(x)
    ysym <- sym(y)
    colorsym <- if (is.null(color)) quo(NULL) else sym(color)
    shapesym <- if (is.null(shape)) quo(NULL) else sym(shape)
    sizesym  <- if (is.null(size))  quo(NULL) else sym(size )
    alphasym <- if (is.null(alpha)) quo(NULL) else sym(alpha)
    groupsym <- if (is.null(group)) quo(NULL) else sym(group)
    linetypesym <- if (is.null(linetype)) quo(NULL) else sym(linetype)
# Points    
    p <- p + layer( geom     = 'point',
                    mapping  = aes(x = !!xsym, 
                                   y = !!ysym, 
                               color = !!colorsym, 
                               shape = !!shapesym, 
                               size  = !!sizesym, 
                               alpha = !!alphasym),
                    stat     = "identity", 
                    data     = sdt(object), 
                    params   = fixed,
                    position = 'identity' )
# Paths
    if (!is.null(group))  p <- p + layer(   geom     = 'path',
                                            mapping  = aes(x = !!xsym, 
                                                           y = !!ysym, 
                                                       color = !!colorsym, 
                                                       group = !!groupsym, 
                                                       linetype = !!linetypesym),
                                            stat     = "identity",
                                            data     = sdt(object),
                                            params   = list(size = 0.1, na.rm = TRUE),
                                            position = 'identity' )
    p
}

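# First n and last n elements of x (currently only referenced by the
# commented-out loading-selection code in add_loadings below)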
headtail <- function(x, n){
    c(x[seq(1, n)], x[seq(length(x)+1-n, length(x))])
}

pca1 <- pca2 <- NULL
add_loadings <- function(
         p, 
    object, 
         x = 'pca1', 
         y = 'pca2', 
     label = 'feature_name', 
        nx = 1, 
        ny = 1
){
# Process args
    if (nx==0 & ny==0) return(p)
    assert_is_subset(c(x, y), fvars(object))
    assert_is_subset(c(x, y), svars(object))
    axis <- angle <- NULL
# Loadings
    xloadings <- fdt(object)[[x]]
    yloadings <- fdt(object)[[y]]
    xscores   <- sdt(object)[[x]]
    yscores   <- sdt(object)[[y]]
    maxscore <- min(abs(min(c(xscores, yscores), na.rm = TRUE)),
                    abs(max(c(xscores, yscores), na.rm = TRUE)))
    scorefactor <- maxscore/max(abs(c(xloadings, yloadings)),  na.rm = TRUE)
    idx1 <- order(abs(xloadings), decreasing = TRUE)[seq_len(nx)] 
    idx2 <- order(abs(yloadings), decreasing = TRUE)[seq_len(ny)]
    #idx1 <- headtail(order(xloadings, na.last = NA), nx)
    #idx2 <- headtail(order(yloadings, na.last = NA), ny)
    #idx <- unique(c(idx1, idx2))
    idx <- c(idx1, idx2)
    loadingdt1 <- fdt(object)[idx1, c(label, x, y), with = FALSE]
    loadingdt2 <- fdt(object)[idx2, c(label, x, y), with = FALSE]
    loadingdt1[, axis := split_extract_fixed(x, '~', 3)]
    loadingdt2[, axis := split_extract_fixed(y, '~', 3)]
    loadingdt <- rbind(loadingdt1, loadingdt2)
    loadingdt[[x]] %<>% multiply_by(scorefactor) # bring them on same scale
    loadingdt[[y]] %<>% multiply_by(scorefactor)
    loadingdt[[x]] %<>% multiply_by(1.5)         # bring them somewhat outside
    loadingdt[[y]] %<>% multiply_by(1.5)
    loadingdt$angle <- loadingdt[[y]] / loadingdt[[x]]
    loadingdt$angle <- atan(loadingdt$angle)
    loadingdt$angle %<>% multiply_by(180/pi)

# Plot
    p <- p + geom_segment( data = loadingdt, 
                        mapping = aes( x = 0, 
                                    xend = !!sym(x),
                                       y = 0, 
                                    yend = !!sym(y),
                                linetype = axis), 
                                   color = 'gray85' )
    p <- p + geom_text(   data = loadingdt, 
                       mapping = aes( x = !!sym(x), 
                                      y = !!sym(y),
                                  label = !!sym(label), 
                                  angle = angle), 
                                  hjust = 'inward', 
                                  color = 'gray30' )
    p
}

pca1 <- pca2 <- feature_name <- NULL

#' Make alpha palette
#' @param object SummarizedExperiment
#' @param alpha string
#' @return character vector
#' @examples 
#' file <- system.file('extdata/atkin.metabolon.xlsx', package = 'autonomics')
#' object <- read_metabolon(file)
#' make_alpha_palette(object, 'Time')
#' @export
make_alpha_palette <- function(object, alpha){
# Assert
    assert_is_valid_sumexp(object)
    if (is.null(alpha))  return(NULL)
    assert_scalar_subset(alpha, svars(object))
# Create
    levels <- slevels(object, alpha)
    palette <- seq(1, 0.4, length.out = length(levels))
    names(palette) <- levels
# Return
    palette
}
    
biplot_methods <- function(object){
    sep <- guess_fitsep(fdt(object))
    y <- grep('(pca|pls)', svars(object), value = TRUE)
    y %<>% split_extract_fixed(sep, 3)
    y <- gsub('[0-9]+', '', y)
    y %<>% unique()
    y
}

biplot_by <- function(object, method = 'pca'){
    sep <- guess_fitsep(fdt(object))
    y <- grep(method, svars(object), value = TRUE, fixed = TRUE)
    y %<>% split_extract_fixed(sep, 2)
    y %<>% unique()
    y
}

biplot_dims <- function(
    object, method = 'pca', by = biplot_by(object, method)
){
    sep <- guess_fitsep(fdt(object))
    x <- paste0('effect', sep, by, sep, method)
    y <- grep(x, svars(object), value = TRUE, fixed = TRUE)
    y <- gsub(x, '', y)
    y %<>% as.numeric()
    y
}
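# Parsing sketch: with score columns 'effect~sample_id~pca1' / 'effect~sample_id~pca2'
# (FITSEP assumed to be '~'):
#   biplot_methods(object)            #> "pca"
#   biplot_by(object, 'pca')          #> "sample_id"
#   biplot_dims(object, 'pca')        #> 1 2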

#' Biplot
#' @param object         SummarizedExperiment
#' @param method         'pca', 'pls', 'lda', 'spls', 'opls', 'sma'
#' @param by             svar
#' @param dims           numeric vector: e.g. 1:2
#' @param alpha          svar
#' @param color          svar
#' @param shape          svar
#' @param size           svar
#' @param label          svar
#' @param group          svar
#' @param linetype       svar
#' @param feature_label  fvar
#' @param fixed          fixed plot aesthetics
#' @param nx     number of x features to plot
#' @param ny     number of y features to plot
#' @param colorpalette   character vector
#' @param alphapalette   character vector
#' @param title          string
#' @param theme          ggplot2::theme output
#' @return ggplot object
#' @examples
#' file <- system.file('extdata/atkin.metabolon.xlsx', package = 'autonomics')
#' object <- read_metabolon(file)
#' object %<>% pca(ndim = 4)
#' object %<>% pls(ndim = 4)
#' biplot(object)
#' biplot(object, nx = 1)
#' biplot(object, dims = 3:4, nx = 1)
#' biplot(object, method = 'pls')
#' biplot(object, method = 'pls', dims = 3:4)
#' biplot(object, method = 'pls', dims = 3:4, group = 'Subject')
#' @export
biplot <- function(
           object, 
           method = biplot_methods(object)[1],
               by = biplot_by(object, method)[1], 
             dims = biplot_dims(object, method, by)[1:2],
            color = 'subgroup', 
            shape = NULL, 
             size = NULL, 
            alpha = NULL,
            group = NULL,         # Use 'feature_id' (not 'gene')
         linetype = NULL,         # To align with `plot_exprs` and `plot_volcano`
            label = NULL,         # Which use 'feature_id' to guarantee uniqueness
    feature_label = 'feature_id', # if ('gene' %in% fvars(object)) 'gene' else 'feature_id', 
            fixed = list(shape = 15, size = 3), 
               nx = 0,
               ny = 0,
     colorpalette =  make_svar_palette(object, color),
     alphapalette = make_alpha_palette(object, alpha), 
            title = paste0(method, guess_fitsep(fdt(object)), by), 
            theme = ggplot2::theme(plot.title = element_text(hjust = 0.5), 
                                   panel.grid = element_blank())
){
# Assert / Process
    assert_is_all_of(object, 'SummarizedExperiment')
    if (!is.null(color)){ assert_is_a_string(color)
                          assert_is_subset(color, svars(object)) }
    if (!is.null(group)){ assert_is_a_string(group)
                          assert_is_subset(group, svars(object)) }
    if (!is.null(shape)){ assert_is_a_string(shape)
                          assert_is_subset(shape, svars(object)) 
                          fixed %<>% extract(names(.) %>% setdiff('shape'))}
    if (!is.null(size)){  assert_is_a_string(size)
                          assert_is_subset(size,  svars(object)) 
                          fixed %<>% extract(names(.) %>% setdiff('size'))}
    
    ndim <- max(dims)
    sep <- guess_fitsep(fdt(object))
    x <- scorenames(method, by = by, dims = dims[[1]], sep = sep)
    y <- scorenames(method, by = by, dims = dims[[2]], sep = sep)
    vars <- round(variances(object, method = method, by = by, dims = dims, sep = sep))
    xlab <- sprintf('X%d : %d%%', dims[[1]], vars[[1]])
    ylab <- sprintf('X%d : %d%%', dims[[2]], vars[[2]])

# Plot
    p <- ggplot() + theme_bw() + theme
    p <- p + ggplot2::xlab(xlab) + ggplot2::ylab(ylab) 
    p <- p + ggtitle(title)
    p %<>% add_loadings(object, x = x, y = y, label = feature_label, nx = nx, ny = ny)
    p %<>% add_scores(object, x = x, y = y, color = color, shape = shape, 
                      size = size, alpha = alpha, group = group, linetype = linetype, fixed = fixed)
    if (!is.null(colorpalette))  p <- p + scale_color_manual(values = colorpalette, na.value = 'gray80')
    if (!is.null(alphapalette))  p <- p + scale_alpha_manual(values = alphapalette)
    if (!is.null(label  ))  p <- p + geom_text_repel(
                    aes(x = !!sym(x), y = !!sym(y), label = !!sym(label)), 
                    data = sdt(object), na.rm = TRUE)
    if (!is.null(shape)){
        n <- length(slevels(object, shape))
        if (n > 6)  p <- p + scale_shape_manual(values = seq(15, 15+n-1))
            # Warning messages: The shape palette can deal with a maximum 
            # of 6 discrete values
            # https://stackoverflow.com/questions/16813278
    }
# Return
    p
}


#=============================================================================
#
#                       biplot_corrections()
#                       biplot_covariates()
#
#==============================================================================


#' Biplot batch corrections
#'
#' @param object      SummarizedExperiment
#' @param method      'pca', 'pls', 'lda', or 'sma'
#' @param by          svar
#' @param color       svar mapped to color (string)
#' @param covariates  covariates to be batch-corrected
#' @param varcols     number of covariate columns
#' @param plot        TRUE/FALSE: plot?
#' @return  grid object
#' @examples
#' file <- system.file('extdata/atkin.metabolon.xlsx', package = 'autonomics')
#' object <- read_metabolon(file, pca = TRUE, plot = FALSE)
#' biplot_corrections(object, color = 'subgroup', covariates = c('Sex', 'Diabetes', 'Subject', 'Time'))
#' @seealso biplot_covariates
#' @export
biplot_corrections <- function(
        object, 
        method = 'pca', 
            by = 'sample_id', 
         color = 'subgroup', 
    covariates = character(0),
       varcols = ceiling(sqrt(1+length(covariates))), 
          plot = TRUE
){
    x <- scorenames(method, by = by, dims = 1)
    y <- scorenames(method, by = by, dims = 2)
    p <- biplot(object, method = method, by = by, dims = 1:2, color = color)
    p <- p + ggtitle('INPUT')
    legend  <- gglegend(p + theme(legend.position = 'bottom', legend.title = element_blank()))
    p <- p + guides(color = 'none', fill = 'none')
    plotlist <- list(p)
    for (ibatch in covariates){
        tmp_object <- object
        if (any(is.na(sdata(tmp_object)[[ibatch]]))) {
            tmp_object %<>% filter_samples(!is.na(!!sym(ibatch)))
        }
        tmp_b <- sdata(tmp_object)[[ibatch]]   # re-extract after filtering out NA batch levels
        values(tmp_object) %<>% removeBatchEffect(batch = tmp_b)
        tmp_object <- get(method)(tmp_object, ndim=2, verbose=FALSE)
        p <- biplot(tmp_object, method = method, by = by, dims = 1:2, color = color)
        p <- p + ggtitle(paste0(' - ', ibatch))
        p <- p + guides(color = 'none', fill = 'none')
        plotlist %<>% c(list(p))
    }
    pp <- arrangeGrob(grobs = plotlist, ncol = varcols, bottom = legend)
    if (plot) grid::grid.draw(pp)
    invisible(pp)
}


#' Biplot covariates
#'
#' @param object     SummarizedExperiment
#' @param method     'pca', 'pls', 'lda', or 'sma'
#' @param by          svar
#' @param block       svar
#' @param covariates  covariates: mapped to color or batch-corrected
#' @param ndim        number of dimensions to plot
#' @param dimcols     number of dimension columns
#' @param varcols     number of covariate columns
#' @param plot        TRUE or FALSE: whether to plot
#' @return  ggplot object
#' @examples
#' file <- system.file('extdata/atkin.metabolon.xlsx', package = 'autonomics')
#' object <- read_metabolon(file, pca = TRUE)
#' biplot_covariates(object, covariates = 'subgroup', ndim = 12, dimcols = 3)
#' biplot_covariates(object, covariates = c('Sex', 'Diabetes', 'Subject', 'Time'))
#' biplot_covariates(object, covariates = c('Sex', 'Diabetes', 'Subject', 'Time'), ndim = 2)
#' biplot_covariates(object, covariates = c('subgroup'), dimcols = 3)
#' @seealso biplot_corrections
#' @export
biplot_covariates <- function(
        object, 
        method = 'pca', 
            by = 'sample_id', 
         block = NULL, 
    covariates = 'subgroup', 
          ndim = 6,
       dimcols = 1, 
       varcols = length(covariates),
          plot = TRUE
){
# Assert
    assert_is_valid_sumexp(object)
    assert_scalar_subset(method, c('pca', 'sma', 'pls', 'spls', 'opls', 'lda'))
    assert_is_subset(covariates, svars(object))
    blocksym <- if (is.null(block))  quo(NULL) else sym(block)
    dims <- NULL
# Plot
    x <- y <- NULL
    plotdt <- prep_covariates(object, method = method, by = by, ndim = ndim)
    plotlist <- list()
    for (covar in covariates){
        p <- ggplot(plotdt, aes(x = x, y = y, color = !!sym(covar), group = !!blocksym))
        p <- p + theme_bw()
        p <- p + facet_wrap(vars(dims), ncol = dimcols, scales = 'free')
        p <- p + geom_point(shape = 15, size = 3)
        if (!is.null(block))  p <- p + geom_line()
        p <- p + xlab(NULL) + ylab(NULL) + ggtitle(covar)
        p <- p + theme(legend.position = 'bottom', legend.title = element_blank())
        plotlist %<>% c(list(p))
    }
    pp <- gridExtra::arrangeGrob(grobs = plotlist, ncol = varcols)
    if (plot) grid::grid.draw(pp)
    invisible(pp)
}

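# Reshape scores into long format for biplot_covariates(): one (x, y) column pair
# per dimension pair (1:2, 3:4, ...), stacked with a 'dims' facet label.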
prep_covariates <- function(
    object, 
    method = 'pca',
        by = 'sample_id',
      ndim = 6
){
    # Dimred
        for (var in svars(object))  if (grepl(paste0('~', method), var))  sdt(object)[[var]] <- NULL
        for (var in fvars(object))  if (grepl(paste0('~', method), var))  fdt(object)[[var]] <- NULL
        object %<>% get(method)(by = by, ndim = ndim)
    # Initialize plotdt
        plotdt <- sdt(object)
        plotdt %<>% extract(, !stri_detect_fixed(names(.), '~'), with = FALSE)
        plotdt %<>% extract(FALSE, )
        plotdt %<>% cbind(x = numeric(0), y = numeric(0), dims = character(0))
    # Add pairs
        sampledt <- sdt(object)
        idx <- stri_detect_fixed(names(sampledt), '~')
        scoredt <- sampledt[, idx, with = FALSE]
        sampledt %<>% extract(, !idx, with = FALSE)
        npairs <- ndim %/% 2
        for (idim in seq_len(npairs)){
            xdim <- idim*2-1
            ydim <- idim*2
            xvar <- scorenames(method, by = by, dims = xdim)
            yvar <- scorenames(method, by = by, dims = ydim)
            tmpdt <- cbind(sampledt, x = scoredt[[xvar]], y = scoredt[[yvar]], dims = sprintf('%d:%d', xdim, ydim))
            plotdt %<>% rbind(tmpdt)
        }
        plotdt$dims %<>% factor(unique(.))
        plotdt
}