R/hcpp.R

Defines functions hcpp

Documented in hcpp

##' Rcpp implementation of the HCP algorithm
##'
##' Objective:
##' This function solves the following problem:
##' argmin_{Z,B,U}   ||Y-Z*B||_2 + lambda1*||Z-F*U||_2 + lambda2*||B||_2 +
##' lambda_3||U||_2
##'
##' To use the residual data: Residual = Y - Z*B
##'
##' Note: k>0, lambda1>0, lambda2>0, lambda3>0 must be set by the user based on
##' the data at hand. one can set these values using cross-validation, by
##' evaluating the "performance" of the  resulting residual data on a desired
##' task. typically, if lambda>5, then hidden factors match the known covariates closely.
##' @title R/Rcpp implementation of the HCP algorithm
##' @param Z a matrix nxd of known covariates, where n is the number of
##' subjects and d is the number of known covariates. *must be standardize
##' (columns have 0 mean and constant SS).
##' @param Y a matrix of nxg of expression data (must be standardized (columns
##' scaled to have constant SS and mean 0). ** use standardize function to standardize F and Y.
##' @param X vector of responses.
##' @param k  number of inferred hidden components (k is an integer)
##' @param lambda1 model parameter 1
##' @param lambda2 model parameter 2
##' @param lambda3 model parameter 3
##' @param iter (optional) iter: number of iterations (default = 100);
##' @param stand default standardize data TRUE
##' @param log default log-transformation TRUE
##' @param fast default use fast RcppArmadillo implementation
##' @param verbose default TRUE
##' @param tol 1e-6 stopping criterium relative difference objective function
##' @return list
##' Z: matrix of hidden components, dimensionality: nxk,
##' B: matrix of effects of hidden components, dimensionality: kxg,
##' o: value of objective function on consecutive iterations.
##' @author mvaniterson
##' @references \url{http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0068141}
##' @importFrom stats runif pnorm
##' @export
##' @examples
##' data(rhcppdata)
##' F <- rhcppdata$F
##' Y <- rhcppdata$Y
##' k <- 10
##' lambda1 <- 20
##' lambda2 <- 1
##' lambda3 <- 1
##' iter <- 100
##' ##and run
##' ##Rres <- hcpp(Y, F, k, lambda1, lambda2, lambda3, iter)
hcpp <- function(Z, Y, X, k, lambda1=NULL, lambda2=NULL, lambda3=NULL, iter=100, stand=TRUE, log=TRUE, fast=TRUE, verbose=TRUE, tol=1e-6) {
        
    t0 <- proc.time()
    if(is.null(Z)) {
        message("Assume only hidden components")
        Z <- matrix(runif(2*nrow(Y), 1, 10), nrow=nrow(Y), ncol=2)
        lambda3 <- 0 ##do not penalized the coefficients with an effect on the known covariates
    }
    ## (0) check dimensions
    if(nrow(Y) != nrow(Z) | nrow(Y) != nrow(X))
        stop("Rows represent the samples for both Y, Z and X!")

    if(sum(is.na(Z)) > 0 | sum(is.na(Y)) > 0 | sum(is.na(X)) > 0)
        stop("NA's in input data are not allowed!")
 
    ## (1) take log of read counts
    if(log) {
        message("Log-transformation data...")
        Y <- log2(2+Y)
        if(sum(is.na(Y)) > 0)
            stop("NA's introduced by log-transformation these are not allowed!")
    }

    standardize <- function(x) {
        x <- x - outer(rep(1, nrow(x)), colMeans(x)) ##center
        x <- x*outer(rep(1, nrow(x)), 1/sqrt(colSums(x^2))) ##scale SS
        ##x <- apply(x, 2, function(x) x - mean(x))
        ##x <- apply(x, 2, function(x) x/sqrt(sum(x^2)))
        x
    }

    ## (2) standardize the data
    if(stand) {
        message("Standardize data...")
        Y <- standardize(Y)
        Z <- standardize(Z)
        X <- standardize(X)
    }

    if(sum(is.na(Z)) > 0 | sum(is.na(Y)) > 0 | sum(is.na(X)) > 0) {
        message(paste("Row(s) (Z):", paste(which(apply(Z, 1, function(x) any(is.na(x)))), collapse=", ")))
        message(paste("Row(s) (Y):", paste(which(apply(Y, 1, function(x) any(is.na(x)))), collapse=", ")))
        message(paste("Row(s) (X):", paste(which(apply(X, 1, function(x) any(is.na(x)))), collapse=", ")))
        stop("NA's introduced by the standardization these are not allowed!")
    }

    ## (3) HCP
    if(fast) {
        message("Run RcppArmadillo implemented HCP algorithm...")
        res <- rcpparma_hcpp(Z, Y, X, k, lambda1, lambda2, lambda3, iter)
        ##no hcpp1 Rcpp implementation yet!
    }
    else {
        message("Run plain R implemented HCP algorithm...")
        if(!is.null(lambda1) & !is.null(lambda2) & !is.null(lambda3))
            res <- r_hcpp3(Z, Y, X, k, lambda1, lambda2, lambda3, iter, verbose, tol)
        else if(!is.null(lambda1) & is.null(lambda2) & is.null(lambda3))
            res <- r_hcpp1(Z, Y, X, k, lambda1, iter, verbose, tol)
    }

    if(verbose)
        message(paste("Finished after", res$niter, "iterations of", iter, "iterations."))

    W <- res$W
    B <- res$B
    o <- res$o
    G <- res$G
    g <- as.vector(G[1,])
    niter <- res$niter

    ##summary statistics
    err <- as.vector(sqrt(colSums((Y - X%*%G - W%*%B)^2)/(nrow(Y) - ncol(W) - ncol(X) - 1))) ##xtx = 1 because of the standarization

    ##optionally use robust standard errors!
    ##U <- colSums((Y - X%*%G - W%*%B)^2) ##residuals    
    ##rerr <- as.vector(sqrt(crossprod(U%*%X)/crossprod(X))) ##robust standard errors for gamma
        
    tstat <- g/err
    pval <- 2*pnorm(-abs(g/err))    
    names(tstat) <- names(pval) <- colnames(Y)

    if(verbose)
        message(paste("The batch correction took:", round((proc.time() - t0)[3], 2), "seconds."))

    return(list(Res = Y - X%*%G - W%*%B, Cov=W, B=B, o=o, gamma=g, err=err, tstat=tstat, pval=pval, Y=Y, Z=Z, X=X, niter=niter))
}
mvaniterson/Rhcpp documentation built on Feb. 24, 2020, 4:06 p.m.