R/mdmr.R

Defines functions mdmr.fstats_to_pvals mdmr.worker save_mdmr.pvals_and_zstats save_mdmr.permutations save_mdmr.hat_matrices save_mdmr.model save_mdmr.factors save_mdmr.formula save_mdmr.modelinfo save_mdmr mdmr

#' Multi-Dimensional Multivariate Regression
#' 
#' Evaluates the degree to which a factor or predictor variable will explain 
#' the distances between particpants.
#' 
#' @author Zarrar Shehzad
#' @param G Matrix where each column represents one gower centered distance 
#'        matrix stored as a vector.
#' @param formula A formula specifying the model.
#' @param model A data frame in which the variables specified in the formula 
#'              will be found.
#' @param nperms Number of permutations (will add the original index)
#' @param factors2perm A vector indicating which factors to permute
#' @param superblocksize Number of voxels to process at a time
#' @param voxs A vector of voxel indices to examine
#' @param blocksize Number of permutations to process at a time
#' @param contr.unordered, contr.ordered Contrasts used for the design matrix
#' @param strata Vector of length nobs that contains indices to be exchanged
#' @param max.iter Maximum number of iterations for redoing a significantly 
#'                 correlated permutation
#' @param verbose 0, 1, 2 indicating nothing, moderate, a lot
#' @param parallel boolean
#' @param G.path Path to Gower's centered matrices if file-backed
#' @param fperms.path Path to Fperm's matrices
#' @peram save.fperms Boolean to save or not save the Fperms
#' @param type The data-type of the big matrices that will be created
#' @param shared Share intermediate and final big matrices
#' @param sge.info List containing njobs, nforks, nthreads, & ignore.proc.error
#' @param permute can be rhs, hat, or hat_with_covariate
#' @param list.perms
#' @return list with modelinfo, pvals, fstats, and permutation indices
mdmr <- function(G, formula, model, 
                 nperms=4999, factors2perm=NULL, superblocksize=length(voxs), 
                 voxs=1:ncol(G), blocksize=nperms, 
                 contr.unordered="contr.sum", contr.ordered="contr.poly", 
                 strata=NULL, max.iter=100, 
                 verbose=1, parallel=FALSE, 
                 G.path=NULL, fperms.path=NULL, save.fperms=FALSE, 
                 type="double", shared=parallel, 
                 sge.info=NULL, permute="rhs", 
                 seeds=sample(1:nperms), 
                 list.perms=NULL)
{
    inform <- verbose==2
    verbose <- as.logical(verbose)
    progress <- ifelse(verbose, "text", "none")
    
    nvoxs <- length(voxs)
    nsubs <- sqrt(nrow(G))
    nperms <- nperms + 1    # account for original indices
    if ((superblocksize == nvoxs) && !is.null(sge.info))
        superblocksize <- round(superblocksize/2)
    superblocks <- niftir.split.indices(1, nvoxs, by=superblocksize)
    blocks <- niftir.split.indices(1, nperms, by=blocksize)
    blocks$nperms <- sapply(1:blocks$n, function(bi) {
                         blocks$ends[bi] - blocks$starts[bi] + 1
                     })
    
    # Checks
    if (!is.data.frame(model))
        stop("input model must be a data frame")
    if (!is.big.matrix(G) || !is.shared(G))
        stop("input Gower's matrix must be type big matrix and shared")
    if (!is.null(fperms.path) && !file.exists(fperms.path))
        stop("output path for Pseudo-F permutations does not exist")
    if (save.fperms && is.null(fperms.path))
        stop("must specify fperms.path if save.fperms")
    if (is.filebacked(G) && is.null(G.path))
        stop("G.path must be given when G is filebacked big matrix")
    if (!is.filebacked(G) && !is.null(G.path))
        warning("G.path will be ignored", immediate.=TRUE)
    if (!is.null(G.path) && !file.exists(G.path))
        stop("output path for distance matrices does not exist")
    if (nvoxs == 0)
        stop("number of voxels cannot be 0")
    if (nperms == 0)
        stop("number of permutations cannot be 0")    
    if (!is.null(sge.info)) {
        library(Rsge)
        ref.names <- c("njobs", "nforks", "nthreads", "ignore.proc.error")
        if (!all.equal(names(sge.info), ref.names))
            stop("sge.info must include njobs, nforks, nthreads, and ignore.proc.error")
        if (is.null(G.path))
            stop("must provide G.path for SGE MDMR job")
        if (save.fperms == T && is.null(fperms.path))
            stop("must provide fperms.path if saving for SGE MDMR job")
        sge.info$run <- TRUE
        sge.options(sge.user.options = sprintf("-S /bin/bash -pe mpi_smp %i", 
                                            sge.info$nthreads*sge.info$nforks))
    } else {
        sge.info <- list(run=FALSE)
    }
    
    # G filename
    G.desc <- describe(G)
    
    # Prepare model info
    # the list includes: rhs, qrhs, H2s, IHs, df.res, df.exp
    modelinfo <- mdmr_model(formula, model, contr.unordered, contr.ordered, 
                            factors2perm, verbose)
    factor.names <- names(attr(modelinfo$qrhs, "factors2perm"))
    nfactors <- length(factor.names)
    
    
    vcat(verbose, "Preparing data")
    
    # Permutations for each factor (fill in values later)
    vcat(verbose, "...preparing permutation matrices")
    if (is.null(list.perms)) {
        list.perms <- mdmr_perms.gather_perms(modelinfo$rhs, modelinfo$qrhs, 
                                              nperms, strata, max.iter, 
                                              include.orig=TRUE, 
                                              verbose=verbose)
    }
    
    # F-statistics (including permutations) for each factor
    # (will fill in values later)
    if (save.fperms) {
        vcat(verbose, "...preparing file-backed pesudo-F matrices")
        list.Fperms <- lapply(1:nfactors, function(fi) {
            if (is.null(fperms.path)) {
                bm <- big.matrix(nperms, nvoxs, type=type, shared=TRUE)
            } else {
                name <- factor.names[fi]
                bm <- big.matrix(nperms, nvoxs, type=type, shared=TRUE, 
                            backingpath=fperms.path, 
                            backingfile=sprintf("fperms_%s.bin", name), 
                            descriptorfile=sprintf("fperms_%s.desc", name))
            }
            return(bm)
        })
        list.fperms_desc <- lapply(list.Fperms, describe)
        rm(list.Fperms); invisible(gc(F,T))
        list.Fperms <- NULL
    } else {
        list.Fperms <- NULL
        list.fperms_desc <- NULL
    }
    
    vcat(verbose, "Computing MDMR across %i large blocks", 
         superblocks$n)
    vcat(verbose, "...with %i smaller blocks within each larger one",
         blocks$n)
        
    caller_for_superblocks <- function(si) {
        vcat(verbose, "large block %i", si)
        start.time <- Sys.time()
        
        if (sge.info$run == TRUE) {
            set_parallel_procs(sge.info$nforks, sge.info$nthreads, verbose, 
                               sge.info$ignore.proc.error)
        }
        
        firstVox <- superblocks$starts[si]
        lastVox <- superblocks$ends[si]
        sub_nvoxs <- lastVox - firstVox + 1
        
        # specify subset of G
        vcat(verbose, "...grabbing subset of Gower matrices")
        if (!is.null(G.path))
            G <- attach.big.matrix(G.desc, backingpath=G.path)
        tmp_Gs <- deepcopy(x=G, cols=firstVox:lastVox, type=type, shared=shared)
        if (!is.null(G.path))
            rm(G); invisible(gc(F,T))
        
        # prepare partial Fperms
        vcat(verbose, "...preparing partial pesudo-F matrices")
        list.partial_Fperms <- lapply(1:nfactors, function(fi) {
            big.matrix(nperms, sub_nvoxs, type=type, shared=TRUE)
        })
                
        # function to run mdmr for specific factor
        # with subset of permutations / voxels
        caller_for_mdmr_worker <- function(bi, include.orig=FALSE) {
            firstPerm <- blocks$starts[bi]
            lastPerm <- blocks$ends[bi]
            sub_nperms <- blocks$nperms[bi]
            include.orig <- ifelse(bi==1, TRUE, FALSE)
            
            # RAM storage for fperms computed in current iteration
            list.subset_Fperms <- lapply(1:nfactors, function(fi) {
                big.matrix(sub_nperms, sub_nvoxs, type=type, shared=FALSE)
            })
            
            # subset of permutation indices
            vcat(verbose, "...preparing subset of permutation indices")
            list.subset_perms <- lapply(1:nfactors, function(fi) {
                list.perms[[fi]][,firstPerm:lastPerm,drop=FALSE]
            })
            
            # calculate pseudo-F statistic 
            # & return permutation indices
            mdmr.worker(modelinfo, tmp_Gs, list.subset_perms, 
                        list.subset_Fperms, permute=permute, 
                        type=type, shared=shared, 
                        verbose=verbose)
            
            # remove perm indices
            vcat(verbose, "...removing permutation indices")
            rm(list.subset_perms); invisible(gc(F,T))
            
            # copy over current Fperms to shared memory
            for (fi in 1:nfactors)
                bedeepcopy(x=list.subset_Fperms[[fi]], y=list.partial_Fperms[[fi]], 
                           y.rows=firstPerm:lastPerm)
            rm(list.subset_Fperms); invisible(gc(F,T))
            
            return(NULL)
        }
        
        # loop through blocks (permutations)
        vcat(verbose, "...looping through %i permutation block(s)", blocks$n)
        nulls <- llply(1:blocks$n, caller_for_mdmr_worker, 
                        .progress=progress, .inform=inform, 
                        .parallel=parallel)
        
        # calculate and save p-values
        vcat(verbose, "...calculating p-values")
        partial_Pmat <- mdmr.fstats_to_pvals(list.partial_Fperms)
        
        # save F-stats
        if (save.fperms) {
            # Notes:
            # ------
            # This is only for saving otherwise list isn't retained
            # 
            # I have a list of blocks with a list of factors 
            # containing a big matrix of Fperms sub_nperms x sub_nvoxs
            
            vcat(verbose, "...saving F-statistics to disk")
            l_ply(1:nfactors, function(fi) {
                Fperms <- attach.big.matrix(list.fperms_desc[[fi]], 
                                            backingpath=fperms.path)
                
                iFperms <- list.partial_Fperms[[fi]]
                oFperms <- sub.big.matrix(Fperms, 
                            firstCol=firstVox, lastCol=lastVox, 
                            backingpath=fperms.path)
                
                deepcopy(x=iFperms, y=oFperms)
                
                rm(Fperms); invisible(gc(F,T))
            }, .progress=progress)
        }
        
        rm(list.partial_Fperms); invisible(gc(F,T))
        
        # time/duration
        end.time <- Sys.time()
        time.total <- as.numeric(end.time-start.time, units="mins")
        time.left <- time.total*(superblocks$n-si)
        vcat(verbose, "...took %.1f minutes (%.1f minutes left)\n", 
             time.total, time.left)
             
        return(partial_Pmat)
    }
    
    # Loop through super blocks (voxels)
    if (sge.info$run) {
        list.pvals <- sge.parLapply(1:superblocks$n, caller_for_superblocks, 
                                    debug=inform, trace=inform, 
                                    packages=c("connectir"), 
                                    function.savelist=ls(), 
                                    njobs=sge.info$njobs)
    } else {
        list.pvals <- lapply(1:superblocks$n, caller_for_superblocks)
    }
    
    # P-values for each factor
    vcat(verbose, "...compiling and saving p-values")
    if (is.null(fperms.path)) {
        Pmat <- big.matrix(nvoxs, nfactors, type=type, shared=FALSE)
    } else {
        Pmat <- big.matrix(nvoxs, nfactors, type=type, shared=TRUE, 
                           backingpath=fperms.path, 
                           backingfile="pvals.bin", 
                           descriptorfile="pvals.desc")
    }
    for (si in 1:superblocks$n) {
        firstVox <- superblocks$starts[si]
        lastVox <- superblocks$ends[si]
        oPmat <- list.pvals[[si]]
        Pmat[firstVox:lastVox,] <- oPmat
        #bedeepcopy(x=oPmat, y=Pmat, y.rows=firstVox:lastVox)
    }
    rm(list.pvals); invisible(gc(F,T))
    
    # Reload F-stats
    if (!is.null(fperms.path)) {
        vcat(verbose, "...reloading F-perms")
        list.Fperms <- lapply(list.fperms_desc, function(desc) {
            attach.big.matrix(desc, backingpath=fperms.path)
        })
    }
    
    structure(list(
        modelinfo = modelinfo,   
        pvals = Pmat, 
        ns = list(nfactors=nfactors, nobs=nsubs, nperms=nperms, nvoxs=nvoxs), 
        fstats = list.Fperms, 
        fpath = fperms.path, 
        perms = list.perms
    ), class="mdmr")
}

save_mdmr <- function(obj, voxs, sdir, mdir, verbose=TRUE) {
    vcat(verbose, "Saving MDMR Results")
    
    mdmr.output <- mdir
    if (!file.exists(mdmr.output)) {
        vcat(verbose, "...creating MDMR output directory")
        dir.create(mdmr.output)
    }
    
    vcat(verbose, "...saving model info")
    save_mdmr.modelinfo(mdir, obj$modelinfo)
    
    vcat(verbose, "...saving formula")
    save_mdmr.formula(mdir, obj$modelinfo)
        
    vcat(verbose, "...saving factor names")
    save_mdmr.factors(mdir, obj$modelinfo)
    
    vcat(verbose, "...saving model")
    save_mdmr.model(mdir, obj$modelinfo)
    
    vcat(verbose, "...saving hat matrices")
    save_mdmr.hat_matrices(mdir, obj$modelinfo)
    
    vcat(verbose, "...saving permutations")
    factor.names <- names(attr(obj$modelinfo$qrhs, "factors2perm"))
    save_mdmr.permutations(mdir, obj$perms, factor.names)
    
    vcat(verbose, "...saving p-value and z-stat map(s)")
    save_mdmr.pvals_and_zstats(sdir, mdir, obj$pvals, voxs, factor.names, obj$ns$nperms)
    
    return(NULL)
}

save_mdmr.modelinfo <- function(mdir, modelinfo)
{
    ofile <- file.path(mdir, "modelinfo.rda")
    save(modelinfo, file=ofile)
}

save_mdmr.formula <- function(mdir, modelinfo)
{
    ofile <- file.path(mdir, "formula.txt")
    cat(as.character(modelinfo$formula)[-c(1)], file=ofile)
}

save_mdmr.factors <- function(mdir, modelinfo)
{
    ofile <- file.path(mdir, "factor_names.txt")
    
    all.factornames <- names(attr(modelinfo$qrhs, "factor.names"))
    perm.factornames <- names(attr(modelinfo$qrhs, "factors2perm"))
    is <- !(all.factornames %in% perm.factornames)
    cov.factornames <- all.factornames[is]
    
    totext <- sprintf("All Factors: %s\nPermuted Factors: %s\nCovariates: %s", 
                paste(all.factornames, collapse=", "), 
                paste(perm.factornames, collapse=", "), 
                paste(cov.factornames, collapse=", "))
    
    cat(totext, file=ofile)
}

save_mdmr.model <- function(mdir, modelinfo)
{
    ofile1 <- file.path(mdir, "model_raw.txt")
    ofile2 <- file.path(mdir, "model_evs.txt")
    
    write.csv(modelinfo$model, file=ofile1)
    write.csv(modelinfo$rhs, file=ofile2)
}

save_mdmr.hat_matrices <- function(mdir, modelinfo)
{
    factornames <- names(attr(modelinfo$qrhs, "factors2perm"))
    nfactors <- length(factornames)
    
    for (fi in 1:nfactors) {
        fname <- factornames[fi]
        ofile1 <- file.path(mdir, sprintf("H2s_%s.2D", fname))
        ofile2 <- file.path(mdir, sprintf("IHs_%s.2D", fname))
        write.table(modelinfo$H2s[[fi]], quote=F, row.names=F, col.names=F,  
                    file=ofile1)
        write.table(modelinfo$IHs[[fi]], quote=F, row.names=F, col.names=F,  
                    file=ofile2)
    }
}

save_mdmr.permutations <- function(mdir, list.perms, factor.names)
{
    nfactors <- length(list.perms)
    if (nfactors != length(factor.names)) {
        vstop("List of permutations must be length of %i and not %i", 
                nfactors, length(factor.names))
    }
    
    for (fi in 1:nfactors) {
        as.big.matrix(list.perms[[fi]], 
                      backingpath=mdir, 
                      descriptorfile=sprintf("perms_%s.desc", factor.names[fi]), 
                      backingfile=sprintf("perms_%s.bin", factor.names[fi]))
    }
}

save_mdmr.pvals_and_zstats <- function(sdir, mdir, Pvals, voxs, factor.names, nperms)
{
    library(tools) # for file_ext
    
    pval_to_zval <- function(pvals) {
        zvals <- qt(pvals, Inf, lower.tail=FALSE)
        
        # Since a p-value of 1, gives an infinite z-stat
        # then we can adjust the input p-value to be nperms/(nperms+1)
        inf_zvals <- is.infinite(zvals)
        if (any(inf_zvals))
            cat(sum(inf_zvals), "infinite z-value(s) found\n, setting to nperms/(nperms+1)\n")
        zvals[inf_zvals] <- nperms/(nperms+1)
        return(zvals)
    }
    
       
    nfactors    <- ncol(Pvals)
    nvoxs       <- nrow(Pvals)
    
    if (nfactors != length(factor.names))
        stop("Number of columns in Pvals must match length of factor.names")
    if (nvoxs != length(voxs))
        stop("Number of rows in Pvals must match length of voxs")
    
    
    maskpattern <- file.path(sdir, "mask.*")
    maskfile    <- Sys.glob(maskpattern)
    if (length(maskfile) == 0) 
        stop("Could not find mask file in ", maskpattern)
    ftype       <- detect_ftypes(maskfile)
    
    if (ftype == "nifti") {
        mask    <- read.mask(maskfile)
        hdr     <- read.nifti.header(maskfile)
        
        for (i in 1:nfactors) {
            ofile1 <- file.path(mdir, sprintf("one_minus_pvals_%s.nii.gz", 
                                             factor.names[[i]]))
            ps <- vector("numeric", sum(mask))
            ps[voxs] <- 1 - Pvals[,i]
            write.nifti(ps, hdr, mask, outfile=ofile1, odt="float")
        
            ofile2 <- file.path(mdir, sprintf("zstats_%s.nii.gz", 
                                             factor.names[[i]]))
            zs <- vector("numeric", sum(mask))
            zs[voxs] <- pval_to_zval(Pvals[,i])
            write.nifti(zs, hdr, mask, outfile=ofile2, odt="float")
        }
    } else if (ftype == "space"){
        # TODO: in future support writing of voxelwise data
        mask    <- as.logical(read.table(maskfile)[,])
        
        unmask_data <- function(x_masked) {
            x   <- vector("numeric", length=length(mask))
            x[mask][voxs] <- x_masked
            return(x)
        }
        
        for (i in 1:nfactors) {
            ofile1 <- file.path(mdir, sprintf("one_minus_pvals_%s.txt", 
                                             factor.names[[i]]))
            ps <- unmask_data(1 - Pvals[,i])
            write.table(ps, file=ofile1, row.names=F, col.names=F, quote=F)
        
            ofile2 <- file.path(mdir, sprintf("zstats_%s.txt", 
                                             factor.names[[i]]))
            zs <- pval_to_zval(Pvals[,i])
            zs <- unmask_data(zs)
            write.table(zs, file=ofile2, row.names=F, col.names=F, quote=F)
        }
    }
}


#' INTERNAL: Runs MDMR
#' saving the p-value's and F-statistics
#' and returning a matrix of permuted observation indices
#' 
#' @author Zarrar Shehzad
#' @param modelinfo a list that includes rhs, QR of rhs, H2s, IHs, and dfs
#' @param mat.Gs matrix where each column is a Gower centered matrix
#' @param list.perms list of matrices of nobs x nperms
#' @param list.Fperms list of big matrices that will hold the F-statistics
#' @param verbose boolean
#' @param ... values given to \code{\link{mdmr_perms}}
#' @return list.Fperms
mdmr.worker <- function(modelinfo, mat.Gs, list.perms, list.Fperms, 
                        verbose=TRUE, ...)
{
    list.H2s <- mdmr_perms.gather_H2s(modelinfo$rhs, modelinfo$qrhs, 
                                      list.perms, verbose=verbose, ...)
    list.IHs <- mdmr_perms.gather_IHs(modelinfo$rhs, modelinfo$qrhs, 
                                      list.perms, verbose=verbose, ...)
    
    # fill in values for list.Fperms
    vcat(verbose, "...calculating Pseudo-F stats")
    .Call("mdmr_worker", mat.Gs, list.Fperms, list.H2s, list.IHs, 
            as.double(modelinfo$df.res), as.double(modelinfo$df.exp))
    
    return(list.Fperms)
}

#' INTERNAL: Calculate p-values from real and permuted F-stats
#' 
#' @author Zarrar Shehzad
#' @param list.Fperms list of big matrices that will hold the F-statistics
#' @param verbose boolean
#' @param parallel boolean
#' @return matrix of nvoxs x nfactors containing p-values
mdmr.fstats_to_pvals <- function(list.Fperms, verbose=TRUE, parallel=FALSE)
{
    nfactors <- length(list.Fperms)
    nvoxs <- ncol(list.Fperms[[1]])
    nperms <- nrow(list.Fperms[[1]])
    progress <- ifelse(verbose, "text", "none")
    
    for (fi in 1:nfactors) {
        if (ncol(list.Fperms[[fi]]) != nvoxs)
            vstop("# of voxels not all the same in list.Fperms")
        if (nrow(list.Fperms[[fi]]) != nperms)
            vstop("# of permutations not all the same in list.Fperms")
    }
    
    Pmat <- laply(1:nfactors, function(fi)
                    .Call("mdmr_fstats_to_pvals", list.Fperms[[fi]]), 
                    .progress=progress, .parallel=parallel, .drop=FALSE)
    return(t(Pmat))
}
czarrar/connectir documentation built on Nov. 22, 2020, 12:13 p.m.