# R/multiscaleSVDxpts.R

#' Create sparse distance, covariance or correlation matrix
#'
#' Exploit k-nearest neighbor algorithms to estimate a sparse similarity matrix.
#' Critical to the validity of this function are the basic mathematical
#' relationships between euclidean distance and correlation and between
#' correlation and covariance.  For applications of such matrices, see
#' relevant publications by Mauro Maggioni and other authors.
#'
#' @param x input matrix, should be n (samples) by p (measurements)
#' @param k number of neighbors
#' @param r radius of epsilon-ball
#' @param sigma parameter for kernel PCA.
#' @param kmetric similarity or distance metric determining k nearest neighbors
#' @param eps epsilon error for rapid knn
#' @param ncores number of cores to use
#' @param sinkhorn boolean
#' @param kPackage name of package to use for knn.  FNN is reproducible but
#' RcppHNSW is much faster (with nthreads controlled by the environment variable
#' ITK_GLOBAL_DEFAULT_NUMBER_OF_THREADS) for larger problems.  For large
#' problems, compute the regularization once, save it to disk and reload it
#' for repeatability.
#' @param verbose verbose output
#' @return sparse p by p matrix with roughly p * k nonzero entries
#' @author Avants BB
#' @references
#' \url{http://www.math.jhu.edu/~mauro/multiscaledatageometry.html}
#' @examples
#' \dontrun{
#' set.seed(120)
#' mat <- matrix(rnorm(60), ncol = 10)
#' smat <- sparseDistanceMatrix(mat, 2)
#' r16 <- antsImageRead(getANTsRData("r16"))
#' mask <- getMask(r16)
#' mat <- getNeighborhoodInMask(
#'   image = r16, mask = mask, radius = c(0, 0),
#'   physical.coordinates = TRUE, spatial.info = TRUE
#' )
#' smat <- sparseDistanceMatrix(t(mat$indices), 10) # close points
#' testthat::expect_is(smat, "Matrix")
#' testthat::expect_is(smat, "dgCMatrix")
#' testthat::expect_equal(sum(smat), 18017)
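#' # a minimal sketch of the distance-to-correlation identity exploited by the
#' # correlation metric: for columns standardized by scale() (sd uses n - 1),
#' # 1 - d^2 / (2 * (n - 1)) recovers the correlation matrix exactly
#' zmat <- matrix(rnorm(60), ncol = 10)
#' zs <- scale(zmat)
#' d2 <- as.matrix(dist(t(zs)))^2
#' max(abs(1 - d2 / (2 * (nrow(zs) - 1)) - cor(zmat))) # approximately zero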
#' }
#' @export sparseDistanceMatrix
sparseDistanceMatrix <- function(
    x, k = 3, r = Inf, sigma = NA,
    kmetric = c("euclidean", "correlation", "covariance", "gaussian"),
    eps = 1.e-6, ncores = NA, sinkhorn = FALSE, kPackage = "RcppHNSW",
    verbose = FALSE) {
  myn <- nrow(x)
  if (k >= ncol(x)) k <- ncol(x) - 1
  if (any(is.na(x))) stop("input matrix has NA values")
  if (kPackage == "RcppHNSW") mypkg <- "RcppHNSW" else mypkg <- "FNN"
  # note that we can convert from distance to covariance
  #   d_ij^2 = sigma_i^2 +  \sigma_j^2  - 2 * cov_ij
  # and from correlation to covariance   diag(sd) %*% corrMat %*% diag(sd)
  # and from euclidean distance of standardized data to correlation
  # 1.0 - dist^2 / ( 2 * nrow( x ) )
  # TODO / FIXME - implement covariance
  if (!usePkg("Matrix")) {
    stop("Please install the Matrix package")
  }
  if (!usePkg(mypkg)) {
    stop(paste("Please install the", mypkg, "package"))
  }
  kmetric <- match.arg(kmetric)
  if (kmetric == "gaussian" & is.na(sigma)) {
    stop("Please set the sigma parameter")
  }
  cometric <- (kmetric == "correlation" | kmetric == "covariance")
  if (cometric & r == Inf) r <- -Inf
  #  if ( ! usePkg("irlba") )
  #    stop("Please install the irlba package")
  # see http://www.analytictech.com/mb876/handouts/distance_and_correlation.htm
  # euclidean distance to correlation - xin contains correlations
  ecor <- function(xin, nn) {
    1.0 - xin / (2 * nn)
  }
  if (kmetric == "covariance") mycov <- apply(x, FUN = sd, MARGIN = 2)
  if (cometric) {
    x <- scale(x, center = TRUE, scale = (kmetric == "correlation"))
  }
  if (mypkg[1] == "RcppHNSW") {
    nThreads <- as.numeric(Sys.getenv("ITK_GLOBAL_DEFAULT_NUMBER_OF_THREADS"))
    if (!is.na(ncores)) nThreads <- ncores
    # as.numeric("") is NA when the env variable is unset; fall back to 1 so
    # n_threads and grain_size below stay valid
    if (is.na(nThreads) || nThreads < 1) nThreads <- 1
    efval <- min(c(4, ncol(x)))
    bknn <- RcppHNSW::hnsw_knn(t(x),
      k = k, M = 16, ef = efval,
      distance = "euclidean",
      n_threads = nThreads,
      grain_size = floor(ncol(x) / nThreads)
    )
    names(bknn) <- c("nn.idx", "nn.dists")
  }
  if (mypkg[1] == "FNN") {
    bknn <- FNN::get.knn(t(x), k = k, algorithm = "kd_tree")
    names(bknn) <- c("nn.idx", "nn.dists")
  }
  #  if ( mypkg[1] == "nabor" ) bknn = nabor::knn( t( x ) , k=k, eps=eps )
  #  if ( mypkg[1] == "RANN" )  bknn = RANN::nn2( t( x ) , k=k, eps=eps  )

  # if ( mypkg[1] == "rflann" )  {
  #   myncores = as.numeric( system('getconf _NPROCESSORS_ONLN', intern = TRUE) )
  #   if ( !is.na( ncores  ) ) myncores = ncores
  #   bknn = rflann::Neighbour( t(x), t(x), k=k, "kdtree", cores=myncores, 1 )
  #   names( bknn ) = c( "nn.idx", "nn.dists" )
  # }
  #  if ( mypkg[1] == "naborpar" ) bknn = .naborpar( t( x ), t( x ) , k=k, eps=eps  )
  if (cometric) {
    bknn$nn.dists <- ecor(bknn$nn.dists, myn)
  }
  tct <- 0
  for (i in 1:ncol(x))
  {
    inds <- bknn$nn.idx[i, ] # index
    locd <- bknn$nn.dists[i, ] # dist
    inds[inds <= i] <- NA # we want a symmetric matrix
    tct <- tct + sum(!is.na(inds))
  }
  # build triplet representation for sparse matrix
  myijmat <- matrix(NA, nrow = (tct), ncol = 3)
  tct2 <- 1
  for (i in 1:ncol(x))
  {
    inds <- bknn$nn.idx[i, ]
    locd <- bknn$nn.dists[i, ]
    if (kmetric == "gaussian" & !is.na(sigma)) {
      locd <- exp(-1.0 * locd / (2.0 * sigma^2))
    }
    inds[inds <= i] <- NA # we want a symmetric matrix
    tctinc <- sum(!is.na(inds))
    if (kmetric == "covariance") {
      loccov <- mycov[i] * mycov[inds]
      locd <- locd * loccov
    }
    if (tctinc > 0) {
      upinds <- tct2:(tct2 + tctinc - 1)
      myijmat[upinds, 1] <- i
      myijmat[upinds, 2] <- inds[!is.na(inds)]
      myijmat[upinds, 3] <- locd[!is.na(inds)]
      tct2 <- tct2 + tctinc
    }
  }
  kmatSparse <- Matrix::sparseMatrix(
    i = myijmat[, 1],
    j = myijmat[, 2],
    x = myijmat[, 3], symmetric = TRUE
  )
  if (kmetric == "gaussian") diag(kmatSparse) <- 1
  if (cometric) {
    if (kmetric == "covariance") diag(kmatSparse) <- mycov^2
    if (kmetric == "correlation") diag(kmatSparse) <- 1
    kmatSparse[kmatSparse < r] <- 0
  } else {
    kmatSparse[kmatSparse > r] <- 0
  }
  if (sinkhorn) {
    for (i in 1:4) {
      #      kmatSparse = kmatSparse / Matrix::rowSums( kmatSparse )
      #      kmatSparse = Matrix::t( Matrix::t(kmatSparse) / Matrix::rowSums( Matrix::t(kmatSparse) ) )
      kmatSparse <- kmatSparse / Matrix::colSums(kmatSparse)
      kmatSparse <- kmatSparse / Matrix::rowSums(kmatSparse)
    }
  }
  return(kmatSparse)
  #
  #  mysvd = irlba::partial_eigen( kmatSparse, nvec )
  #  sparsenessParam = ( range( abs( mysvd ) ) * 0.00001 )[ 2 ]
  #  sparsenessParam = 1e-6
  #
}




#' Create sparse distance, covariance or correlation matrix from x, y
#'
#' Exploit k-nearest neighbor algorithms to estimate a sparse matrix measuring
#' the distance, correlation or covariance between two matched datasets.
#' Critical to the validity of this function are the basic mathematical
#' relationships between euclidean distance and correlation and between
#' correlation and covariance.  For applications of such matrices, see
#' relevant publications by Mauro Maggioni and other authors.
#'
#' @param x input matrix, should be n (samples) by p (measurements)
#' @param y input matrix second view, should be n (samples) by q (measurements)
#' @param k number of neighbors
#' @param r radius of epsilon-ball
#' @param sigma parameter for kernel PCA.
#' @param kmetric similarity or distance metric determining k nearest neighbors
#' @param eps epsilon error for rapid knn
#' @param kPackage name of package to use for knn.  FNN is reproducible but
#' RcppHNSW is much faster (with nthreads controlled by the environment variable
#' ITK_GLOBAL_DEFAULT_NUMBER_OF_THREADS) for larger problems.  For large
#' problems, compute the regularization once, save it to disk and reload it
#' for repeatability.
#' @param ncores number of cores to use
#' @param verbose verbose output
#' @return sparse q by p matrix (ncol(y) rows by ncol(x) columns) with roughly
#' q * k nonzero entries
#' @author Avants BB
#' @references
#' \url{http://www.math.jhu.edu/~mauro/multiscaledatageometry.html}
#' @examples
#' \dontrun{
#' set.seed(120)
#' mat <- matrix(rnorm(60), nrow = 6)
#' mat2 <- matrix(rnorm(120), nrow = 6)
#' smat <- sparseDistanceMatrixXY(mat, mat2, 3)
#' smat2 <- sparseDistanceMatrixXY(mat2, mat, 3)
#' testthat::expect_is(smat, "Matrix")
#' testthat::expect_is(smat, "dgCMatrix")
#' testthat::expect_is(smat2, "Matrix")
#' testthat::expect_is(smat2, "dgCMatrix")
#' testthat::expect_equal(sum(smat), 154.628961265087)
#' testthat::expect_equal(sum(smat2), 63.7344262003899)
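#' # dimensions follow a y-then-x convention: rows index columns of the second
#' # view passed as y, columns index columns of x
#' testthat::expect_equal(dim(smat), c(ncol(mat2), ncol(mat)))
#' testthat::expect_equal(dim(smat2), c(ncol(mat), ncol(mat2)))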
#' }
#' @export sparseDistanceMatrixXY
sparseDistanceMatrixXY <- function(x, y, k = 3, r = Inf, sigma = NA,
                                   kmetric = c("euclidean", "correlation", "covariance", "gaussian"),
                                   eps = 0.000001,
                                   kPackage = "RcppHNSW",
                                   ncores = NA,
                                   verbose = FALSE) {
  if (any(is.na(x))) stop("input matrix x has NA values")
  if (any(is.na(y))) stop("input matrix y has NA values")
  if (kPackage == "RcppHNSW") mypkg <- "RcppHNSW" else mypkg <- "FNN"
  if (!usePkg("Matrix")) {
    stop("Please install the Matrix package")
  }
  if (!usePkg(mypkg)) {
    stop(paste("Please install the", mypkg, "package"))
  }
  kmetric <- match.arg(kmetric)
  if (kmetric == "gaussian" & is.na(sigma)) {
    stop("Please set the sigma parameter")
  }
  cometric <- (kmetric == "correlation" | kmetric == "covariance")
  ecor <- function(xin) {
    1.0 - xin / (2 * nrow(x))
  }
  if (cometric) {
    x <- scale(x, center = TRUE, scale = (kmetric == "correlation"))
    y <- scale(y, center = TRUE, scale = (kmetric == "correlation"))
  }
  if (mypkg[1] == "RcppHNSW") {
    efval <- min(c(4, ncol(x)))
    nThreads <- as.numeric(Sys.getenv("ITK_GLOBAL_DEFAULT_NUMBER_OF_THREADS"))
    if (!is.na(ncores)) nThreads <- ncores
    # as.numeric("") is NA when the env variable is unset; fall back to 1
    if (is.na(nThreads) || nThreads < 1) nThreads <- 1
    ann <- RcppHNSW::hnsw_build(t(x),
      distance = "euclidean",
      M = 12,
      ef = efval,
      n_threads = nThreads,
      grain_size = floor(ncol(x) / nThreads)
    )

    efval <- min(c(4, ncol(y)))
    bknn <- RcppHNSW::hnsw_search(t(y),
      ann,
      k = k,
      ef = efval,
      n_threads = nThreads,
      grain_size = floor(ncol(y) / nThreads)
    )
    names(bknn) <- c("nn.idx", "nn.dists")
  }
  if (mypkg[1] == "FNN") {
    knnalgs <- c("kd_tree", "brute")
    bknn <- FNN::get.knnx(t(x), t(y), k = k, algorithm = knnalgs[1])
    names(bknn) <- c("nn.idx", "nn.dists")
  }
  #  if ( mypkg[1] == "nabor" ) bknn = nabor::knn( t( y ), t( x ) , k=k, eps=eps )
  #  if ( mypkg[1] == "RANN" )  bknn = RANN::nn2( t( y ), t( x ) , k=k, eps=eps )
  # if ( mypkg[1] == "rflann" )  {
  #   myncores = as.numeric( system('getconf _NPROCESSORS_ONLN', intern = TRUE) )
  #   if ( !is.na( ncores  ) ) myncores = ncores
  #   bknn = rflann::Neighbour( t(y), t(x), k=k, "kdtree", cores=myncores, 1 )
  #   names( bknn ) = c( "nn.idx", "nn.dists" )
  # }


  #  if ( mypkg[1] == "naborpar" ) bknn = .naborpar( t( y ), t( x ) , k=k, eps=eps  )
  if (cometric) bknn$nn.dists <- ecor(bknn$nn.dists)
  tct <- 0
  nna <- rep(FALSE, nrow(bknn$nn.idx))
  #
  for (i in 1:nrow(bknn$nn.idx))
  {
    inds <- bknn$nn.idx[i, ] # index
    locd <- bknn$nn.dists[i, ] # dist
    nna[i] <- any(is.na(inds))
    tct <- tct + sum(!is.na(inds))
  }
  # build triplet representation for sparse matrix
  myijmat <- matrix(nrow = (tct), ncol = 3)
  tct2 <- 1
  for (i in 1:ncol(y))
  {
    inds <- bknn$nn.idx[i, ]
    locd <- bknn$nn.dists[i, ]
    if (kmetric == "gaussian" & !is.na(sigma)) {
      locd <- exp(-1.0 * locd / (2.0 * sigma^2))
    }
    tctinc <- sum(!is.na(inds))
    if (kmetric == "covariance") {
      locd <- cov(y[, i], x[, inds])
    } else if (kmetric == "correlation") {
      locd <- cor(y[, i], x[, inds])
    }
    if (tctinc > 0) {
      upinds <- tct2:(tct2 + tctinc - 1)
      myijmat[upinds, 1] <- i
      myijmat[upinds, 2] <- inds[!is.na(inds)]
      myijmat[upinds, 3] <- locd[!is.na(inds)]
      tct2 <- tct2 + tctinc
    }
  }
  kmatSparse <- Matrix::sparseMatrix(
    i = myijmat[, 1],
    j = myijmat[, 2],
    x = myijmat[, 3],
    dims = c(ncol(y), ncol(x)), symmetric = FALSE
  )
  if (cometric) {
    #    if ( kmetric == "covariance" )  diag( kmatSparse ) = mycov^2
    #    if ( kmetric == "correlation" ) diag( kmatSparse ) = 1
    kmatSparse[kmatSparse < r] <- 0
  } else {
    kmatSparse[kmatSparse > r] <- 0
  }
  return(kmatSparse)
}



#
# .naborpar <- function( xin, yin, k, eps ) {
#  if ( ! usePkg( "doParallel" ) ) stop( "need doParallel for naborpar" )
#  library( doParallel )
#  library( parallel )
#  invisible(capture.output(library( foreach, quietly=TRUE )))
#  ncor = parallel::detectCores( )
#  cl <- round( parallel::makeCluster( ncor )/2 )
#  doParallel::registerDoParallel( cl )
#  mycomb <- function( x, y ) {
#    x$nn.idx = rbind( x$nn.idx, y$nn.idx )
#    x$nn.dists = rbind( x$nn.dists, y$nn.dists )
#    x
#    }
#  mylen = ceiling( nrow( xin ) / ncor  )
#  selinds = rep( c(1:ncor), each=mylen )[1:nrow(xin)]
#  myout <- foreach( i=1:ncor, .combine=mycomb, .packages="ANTsR" ) %dopar% {
#    selector = selinds == i
#    nabor::knn( xin , yin[selector,], k=k, eps=eps )
#  }
#  myout
# }
#

#' Multi-scale SVD
#'
#' Maggioni's multi-scale SVD algorithm explores the dimensionality of a dataset
#' by investigating the change in eigenvalues with respect to a scale parameter.
#' The scale parameter is defined by the radius of a ball that sits at each
#' point in the data.  The ball, at each scale, is moved across the dataset
#' and SVD is computed within the intersection of the ball and the data at each
#' point.  The shape in this collection of eigenvalues, with respect to scale,
#' enables us to estimate both signal and noise dimensionality and scale.  The
#' estimate can be computed efficiently on large datasets if the sampling is
#' chosen appropriately.  The challenge, in this algorithm, is classifying the
#' dimensions of noise, curvature and data.  This classification currently uses
#' variations on heuristics suggested in work by Maggioni et al.
#'
#' @param x input matrix, should be n (samples) by p (measurements)
#' @param r radii to explore
#' @param locn number of local samples to take at each scale
#' @param nev maximum number of eigenvalues to compute
#' @param knn randomly sample neighbors to assist with large datasets. set k with this value.
#' @param verbose boolean to control verbosity of output
#' @param plot integer controlling whether we plot results; its value selects
#' the eigenvalue column used to set the scale of the y-axis.
#' @return list with a vector of tangent, curvature and noise dimensionality and
#' a dataframe containing eigenvalues across scale, in correspondence with r:
#' \itemize{
#'   \item{dim: }{The tangent, curvature and noise dimensionality vector.  The
#' data dimensionality is the first entry; the curvature dimensionality extends
#' from the second entry to the first entry of the noise vector.}
#'   \item{noiseCutoffs: }{Dimensionalities where the noise may begin.  These
#' are candidate cutoffs but may contain some curvature information.}
#'   \item{evalsVsScale: }{eigenvalues across scale}
#'   \item{evalClustering: }{data-driven clustering of the eigenvalues}
#' }
#' @author Avants BB
#' @references
#' \url{http://www.math.jhu.edu/~mauro/multiscaledatageometry.html}
#' @examples
#'
#' sphereDim <- 9
#' embeddDim <- 100
#' n <- 1000
#' if (usePkg("pracma")) {
#'   set.seed(20190919)
#'   sphereData <- pracma::rands(n, sphereDim, 1.)
#'   mysig <- 0.1
#'   spherEmbed <- matrix(rnorm(n * embeddDim, 0, mysig), nrow = n, ncol = embeddDim)
#'   spherEmbed[, 1:ncol(sphereData)] <- spherEmbed[, 1:ncol(sphereData)] + sphereData
#'   myr <- seq(1.0, 2.2, 0.05) # scales at which to sample
#'   mymssvd <- multiscaleSVD(spherEmbed, myr, locn = 5, nev = 20, plot = 1)
#'   if (getRversion() < "3.6.0") {
#'     testthat::expect_equal(mymssvd$noiseCutoffs, c(10, 11))
#'     cm <- unname(colMeans(mymssvd$evalsVsScale[11:25, ]))
#'     testthat::expect_equal(
#'       cm,
#'       c(
#'         0.133651668406975, 0.0985695151401464, 0.0914110478052329,
#'         0.086272017653314, 0.081188302173622, 0.0766100356616153, 0.0719736252996842,
#'         0.067588745051721, 0.0622331185687704, 0.0415236318358749, 0.0192976885668337,
#'         0.0183063537558787, 0.0174990088862745, 0.0170012938275551, 0.0163859378707545,
#'         0.0158265354487181, 0.0153357773252783, 0.0147933538908736, 0.0143510807701235,
#'         0.0140473978346935
#'       )
#'     )
#'   } else {
#'     testthat::expect_equal(mymssvd$noiseCutoffs, c(11, 15))
#'     cm <- unname(colMeans(mymssvd$evalsVsScale[13:25, ]))
#'     testthat::expect_equal(
#'       cm,
#'       c(
#'         0.138511257441516, 0.106071822485487, 0.0989441114152412, 0.092910922851038,
#'         0.0877970523897918, 0.0832570763653118, 0.0782599820599334, 0.0734433988152632,
#'         0.0678992413676906, 0.0432283615430504, 0.0202481578919003, 0.0191747572787057,
#'         0.0185718929604774, 0.0178301092823977, 0.0172423799670431, 0.0166981650233669,
#'         0.0162072551503541, 0.015784555784915, 0.0153600119986575, 0.0149084240854556
#'       )
#'     )
#'   }
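#'   # a quick visual sketch: signal eigenvalues grow with ball radius while
#'   # noise eigenvalues stay comparatively flat
#'   matplot(myr, mymssvd$evalsVsScale,
#'     type = "l", lty = 1, xlab = "ball-radius", ylab = "eigenvalue"
#'   )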
#' }
#' @export multiscaleSVD
multiscaleSVD <- function(x, r, locn, nev, knn = 0, verbose = FALSE, plot = 0) {
  mresponse <- matrix(ncol = nev, nrow = length(r))
  n <- nrow(x)
  calcRowMatDist <- function(xmat, xrow) {
    locmag <- function(x, xrow) sqrt(sum((x - xrow)^2))
    apply(xmat, FUN = locmag, MARGIN = 1, xrow = xrow)
  }
  for (myscl in 1:length(r))
  {
    myr <- r[myscl]
    locsam <- sample(1:n, locn)
    myevs <- matrix(nrow = locn, ncol = nev)
    for (i in 1:locn)
    {
      rowdist <- calcRowMatDist(x, x[locsam[i], ])
      sel <- rowdist < myr
      if (sum(sel, na.rm = TRUE) > 2) {
        if (knn > 0 & sum(sel) > knn) # take a subset of sel
          {
            selinds <- sample(1:length(sel), knn)
            sel[-selinds] <- FALSE
          }
        lmat <- x[sel, ]
        if (nrow(lmat) < ncol(lmat)) lcov <- cov(t(lmat)) else lcov <- cov(lmat)
        temp <- svd(lcov, nv = (nrow(lcov) - 1))$d # * embeddDim / sum(sel)
        # lcov = sparseDistanceMatrix( x, k = knn, kmetric = "cov" )
        #  temp = irlba::irlba( lcov, nv=(nrow(lcov)-1) )$d
        temp <- temp[1:min(c(nev, length(temp)))]
        if (length(temp) < nev) temp <- c(temp, rep(NA, nev - length(temp)))
      } else {
        temp <- rep(NA, nev)
      }
      myevs[i, 1:nev] <- temp
      if (i == locn) {
        mresponse[myscl, ] <- colMeans(myevs, na.rm = TRUE)
        if (verbose) {
          print(paste(i, "r", myr, "localN", sum(sel)))
          print(mresponse[myscl, ])
        }
      }
    }
  }
  colnames(mresponse) <- paste("EV", 1:nev, sep = "")
  rownames(mresponse) <- paste("Scale", 1:length(r), sep = "")
  # just use pam to cluster the eigenvalues
  goodscales <- !is.na(rowMeans(mresponse))
  temp <- t(mresponse)[1:nev, goodscales] # remove top evec
  pamk <- NA
  krng <- 5:min(dim(temp) - 1) # force at least 5 clusters
  if (usePkg("fpc")) {
    pamk <- fpc::pamk(temp, krange = krng)$pamobject$clustering
  }
  ############################################################
  # noise dimensionality
  scaleEvalCorrs <- rep(NA, ncol(mresponse))
  for (i in 1:nev)
  {
    mylm <- lm(mresponse[, i] ~ stats::poly(r^2, 2))
    coffs <- coefficients(summary(mylm))
    scaleEvalCorrs[i] <- summary(mylm)$r.squared # coffs[2,4]
  }
  shiftinds <- c(2:ncol(mresponse), ncol(mresponse))
  delt <- scaleEvalCorrs - scaleEvalCorrs[shiftinds]
  qdelt <- quantile(delt[2:length(delt)], 0.9, na.rm = TRUE)
  noiseDim <- which(delt > qdelt) + 1
  # curvature dimensionality
  # find the dimensionality that maximizes the t-test difference across
  # the multi-scale eigenvalues
  myt <- rep(NA, nev)
  for (i in 3:(nev - 2))
  {
    lowinds <- 2:i
    hiinds <- (i + 1):nev
    myt[i] <- t.test(mresponse[, lowinds], mresponse[, hiinds], paired = FALSE)$statistic
    #  myt[ i ] = t.test( scaleEvalCorrs[lowinds], scaleEvalCorrs[hiinds], paired=FALSE )$sta
  }
  dataDimCurv <- which.max(myt)
  # find singular values that do not grow with scale ( radius )
  if (length(r) > 4) {
    scaleEvalCorrs <- rep(NA, ncol(mresponse))
    ply <- 4 # power for polynomial model
    for (i in 1:dataDimCurv)
    {
      mylm <- lm(mresponse[, i] ~ stats::poly(r, ply))
      coffs <- coefficients(summary(mylm))
      #  print( summary( mylm ) )
      #  print( paste("EV",i) )
      #  Sys.sleep( 3 )
      # linear term coffs[2,4]
      # quadratic term coffs[3,4]
      # noise term coffs[5,4]
      scaleEvalCorrs[i] <- coffs[2, 4]
    }
    dataDim <- max(which(scaleEvalCorrs < 0.05))
  } else {
    dataDim <- 4
  }
  ############################################################
  if (plot > 0) {
    mycols <- rainbow(nev)
    growthRate1 <- mresponse[, 1]
    plot(r, growthRate1,
      type = "l", col = mycols[1], main = "Evals by scale",
      ylim = c(0.00, max(mresponse[, plot], na.rm = TRUE)),
      xlab = "ball-radius", ylab = "Expected Eval"
    )
    for (i in 2:ncol(mresponse))
    {
      growthRatek <- mresponse[, i] # magic :: shift(mresponse[,i],1)*0
      points(r, growthRatek, type = "l", col = mycols[i])
    }
  }
  return(
    list(
      dim            = c(dataDim, dataDimCurv),
      noiseCutoffs   = noiseDim,
      evalClustering = pamk,
      evalsVsScale   = mresponse
    )
  )
}




#' k-nearest neighbors constrained smoothing
#'
#' Compute a smoothing matrix based on an input matrix of point coordinates
#'
#' @param x input matrix of point coordinates, of dimension n (spatial
#' dimensions) by p (points)
#' @param k number of neighbors, higher causes more smoothing
#' @param sigma sigma for the gaussian function
#' @param segmentation optional boolean vector to restrict specific rows to have minimal response
#' @param ... arguments passed to \code{sparseDistanceMatrix}
#' @return sparse matrix is output
#' @author Avants BB
#' @examples
#' \dontrun{
#' mask <- getMask(antsImageRead(getANTsRData("r16")))
#' spatmat <- t(imageDomainToSpatialMatrix(mask, mask))
#' smoothingMatrix <- knnSmoothingMatrix(spatmat, k = 25, sigma = 3.0)
#' rvec <- rnorm(nrow(smoothingMatrix))
#' srvec <- smoothingMatrix %*% rvec
#' rvi <- makeImage(mask, rvec)
#' srv <- makeImage(mask, as.numeric(srvec))
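#' # sanity-check sketch: rows with any neighbors are normalized to sum to 1,
#' # so the operator acts as a weighted local average
#' range(Matrix::rowSums(smoothingMatrix))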
#' }
#' @export knnSmoothingMatrix
knnSmoothingMatrix <- function(x, k, sigma, segmentation, ...) {
  usePkg("Matrix")
  jmat <- sparseDistanceMatrix(x,
    k = k, kmetric = "gaussian", sigma = sigma,
    sinkhorn = FALSE, ...
  )
  if (!missing(segmentation) & FALSE) {
    diagSparse <- Matrix::sparseMatrix(
      i = 1:length(segmentation),
      j = 1:length(segmentation),
      x = as.numeric(segmentation), symmetric = TRUE
    )
    jmat <- diagSparse %*% jmat
  }
  if (FALSE) {
    for (i in 1:4) { # sinkhorn
      normalizer <- Matrix::rowSums(jmat)
      normalizer[normalizer == 0] <- Inf
      if (!missing(segmentation)) {
        normalizer[!segmentation] <- 1e9
      }
      jmat <- jmat / normalizer
      normalizer2 <- Matrix::rowSums(Matrix::t(jmat))
      normalizer2[normalizer2 == 0] <- Inf
      if (!missing(segmentation)) {
        normalizer2[!segmentation] <- 1e9
      }
      jmat <- Matrix::t(Matrix::t(jmat) / normalizer2)
    }
  }
  normalizer <- Matrix::rowSums(jmat)
  normalizer[normalizer == 0] <- Inf
  if (!missing(segmentation)) {
    normalizer[!segmentation] <- 1e9
  }
  jmat <- jmat / normalizer
  return(jmat)
}




#' smooth matrix prediction
#'
#' Reconstruct a n by p matrix given n by k basis functions or predictors.
#' The reconstruction can be regularized.  The objective is
#' \eqn{\| x - u v^t \|}; differentiating with respect to \eqn{v} yields the
#' gradient \eqn{u^t u v^t - u^t x} used in the descent steps below.
#'
#' @param modelFormula a formula object which has, on the left, the variable x
#' and the prediction formula on the right.
#' @param x input matrix to be predicted.
#' @param basisDf data frame for basis predictors
#' @param iterations number of gradient descent iterations
#' @param gamma step size for gradient descent
#' @param sparsenessQuantile quantile to control sparseness - higher is sparser.
#' @param positivity restrict to positive or negative solution (beta) weights.
#' choices are positive, negative or either as expressed as a string.
#' @param smoothingMatrix allows parameter smoothing, should be square and same
#' size as input matrix
#' @param repeatedMeasures list of repeated measurement identifiers. this will
#' allow estimates of per identifier intercept.
#' @param rowWeights vector of weights with size n (assumes diagonal covariance)
#' @param LRR integer value sets rank for fast version exploiting matrix approximation
#' @param doOrth boolean enforce gram-schmidt orthonormality
#' @param verbose boolean option
#' @return matrix of size p by k is output
#' @author Avants BB
#' @examples
#' \dontrun{
#' mask <- getMask(antsImageRead(getANTsRData("r16")))
#' spatmat <- t(imageDomainToSpatialMatrix(mask, mask))
#' smoomat <- knnSmoothingMatrix(spatmat, k = 5, sigma = 1.0)
#' mat <- matrix(rnorm(sum(mask) * 50), ncol = sum(mask), nrow = 50)
#' mat[1:25, 100:10000] <- mat[1:25, 100:10000] + 1
#' age <- rnorm(nrow(mat))
#' for (i in c(5000:6000, 10000:11000, 16000:17000)) {
#'   mat[, i] <- age * 0.1 + mat[, i]
#' }
#' gen <- c(rep("M", 25), rep("F", 12), rep("T", 13))
#' repmeas <- rep(c("A", "B", "C", "D", "E", "F", "G"), nrow(mat))[1:nrow(mat)]
#' mydf <- data.frame(age = scale(age), gen = gen)
#' fit <- smoothMatrixPrediction(
#'   x = mat, basisDf = mydf, iterations = 10,
#'   gamma = 1.e-6, sparsenessQuantile = 0.5,
#'   smoothingMatrix = smoomat, repeatedMeasures = repmeas,
#'   verbose = T
#' )
#' tt <- mat %*% fit$v
#' print(cor.test(mydf$age, tt[, 1]))
#' print(cor.test(fit$u[, "genM"], tt[, 2]))
#' vimg <- makeImage(mask, (fit$v[, 1]))
#' print(range(vimg) * 10)
#' plot(mask, vimg, window.overlay = range(abs(vimg)))
#' vimg <- makeImage(mask, (fit$v[, 2]))
#' print(range(vimg) * 10)
#' plot(mask, vimg, window.overlay = range(abs(vimg)))
#' }
#' @export smoothMatrixPrediction
smoothMatrixPrediction <- function(
    x,
    basisDf,
    modelFormula = as.formula(" x ~ ."),
    iterations = 10,
    gamma = 1.e-6,
    sparsenessQuantile = 0.5,
    positivity = c("positive", "negative", "either"),
    smoothingMatrix = NA,
    repeatedMeasures = NA,
    rowWeights = NA,
    LRR = NA,
    doOrth = FALSE,
    verbose = FALSE) {
  poschoices <- c("positive", "negative", "either", TRUE, FALSE)
  if (sum(positivity == poschoices) != 1 | length(positivity) != 1) {
    stop("choice of positivity parameter is not good - see documentation")
  }
  if (positivity == TRUE) positivity <- "positive"
  if (positivity == FALSE) positivity <- "either"
  smoothingWeight <- 1.0
  if (missing("x") | missing("basisDf")) {
    message("this function needs input")
    return(NA)
  }
  if (nrow(x) != nrow(basisDf)) {
    message("inconsistent row numbers between x and basisDf")
    return(NA)
  }
  if (!any(is.na(repeatedMeasures))) {
    usubs <- unique(repeatedMeasures)
    wtdf <- data.frame(table(repeatedMeasures))
    # rowWeights should scale with counts
    repWeights <- rep(0, length(repeatedMeasures))
    for (u in usubs) {
      repWeights[repeatedMeasures == u] <- 1.0 / wtdf$Freq[wtdf$repeatedMeasures == u]
    }
    rm(wtdf)
    if (all(is.na(rowWeights))) {
      rowWeights <- repWeights
    } else {
      rowWeights <- rowWeights * repWeights
    }
  }
  hasweights <- !all(is.na(rowWeights))
  if (hasweights) {
    locdf <- basisDf
    locdf$wts <- rowWeights
    wts <- "this is just a placeholder"
    mdl <- lm(as.formula(modelFormula),
      data = locdf, weights = wts, na.action = "na.exclude"
    )
    rm(locdf)
  } else {
    mdl <- lm(as.formula(modelFormula), data = basisDf, na.action = "na.exclude")
  }
  # bmdl = bigLMStats( mdl )
  u <- scale(model.matrix(mdl))
  intercept <- u[, 1]
  if (any(is.na(intercept))) intercept <- rep(0, length(intercept))
  u <- u[, -1]
  v <- antsrimpute(t(mdl$coefficients[-1, ]))
  v <- v + matrix(rnorm(length(v), 0, 0.01), nrow = nrow(v), ncol = ncol(v))
  if (!is.na(LRR)) {
    u <- lowrankRowMatrix(u, LRR)
    v <- t(lowrankRowMatrix(t(v), LRR))
    x <- icawhiten(x, LRR)
    #  x = lowrankRowMatrix( x, LRR )
  }
  if (hasweights & is.na(LRR)) {
    u <- diag(sqrt(rowWeights)) %*% u
    x <- diag(sqrt(rowWeights)) %*% x
  }
  intercept <- rowMeans(x - (u %*% t(v)))
  err <- mean(abs(x - (u %*% t(v) + intercept)))
  if (verbose) print(paste("iteration", 0, "err", err))
  tu <- t(u)
  tuu <- t(u) %*% u
  errs <- rep(NA, iterations)
  i <- 1
  while (i <= iterations) {
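    # one descent iteration: smooth v spatially, step along the gradient of
    # norm( x - u v^t ) with respect to v (dedv below), then unit-normalize,
    # gram-schmidt and quantile-sparsify each column of v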
    v <- as.matrix(smoothingMatrix %*% v)
    dedv <- t(tuu %*% t(v) - tu %*% x)
    v <- v - dedv * gamma
    #  v = rsvd::rsvd( v )$v
    for (vv in 1:ncol(v)) {
      v[, vv] <- v[, vv] / sqrt(sum(v[, vv] * v[, vv]))
      if (vv > 1) {
        for (vk in 1:(vv - 1)) {
          temp <- v[, vk]
          denom <- sum(temp * temp, na.rm = TRUE)
          if (denom > 0) ip <- sum(temp * v[, vv]) / denom else ip <- 1
          v[, vv] <- v[, vv] - temp * ip
        }
      }
      localv <- v[, vv]
      doflip <- FALSE
      if (sum(localv > 0) < sum(localv < 0)) {
        localv <- localv * (-1)
        doflip <- TRUE
      }
      myquant <- quantile(localv, sparsenessQuantile, na.rm = TRUE)
      if (positivity == "positive") {
        if (myquant > 0) localv[localv <= myquant] <- 0 else localv[localv >= myquant] <- 0
      } else if (positivity == "negative") {
        localv[localv > myquant] <- 0
      } else if (positivity == "either") {
        localv[abs(localv) < quantile(abs(localv), sparsenessQuantile, na.rm = TRUE)] <- 0
      }
      if (doflip) v[, vv] <- localv * (-1) else v[, vv] <- localv
    }
    intercept <- rowMeans(x - (u %*% t(v)))
    if (!any(is.na(repeatedMeasures)) & is.na(LRR)) { # estimate random intercepts
      for (s in usubs) {
        usel <- repeatedMeasures == s
        intercept[usel] <- mean(intercept[usel], na.rm = TRUE)
      }
    }
    err <- mean(abs(x - (u %*% t(v) + intercept)))
    errs[i] <- err
    if (i > 1) {
      if ((errs[i] > errs[i - 1]) & (i == 3)) {
        #      message(paste("flipping sign of gradient step:", gamma))
        #      gamma = gamma * ( -1.0 )
      } else if ((errs[i] > errs[i - 1])) {
        gamma <- gamma * (0.5)
        if (verbose) message(paste("reducing gradient step:", gamma))
      }
      if (abs(gamma) < 1.e-9) i <- iterations
    }
    i <- i + 1
    if (verbose) print(paste(i, err))
  }
  if (verbose) print(paste("end", err))
  colnames(v) <- colnames(u)
  return(list(u = u, v = v, intercept = intercept))
}




#' smooth matrix-based regression
#'
#' Reconstruct a n by 1 vector given n by p matrix of predictors.
#'
#' @param x input matrix on which prediction is based
#' @param y target vector
#' @param iterations number of gradient descent iterations
#' @param sparsenessQuantile quantile to control sparseness - higher is sparser.
#' @param positivity restrict to positive or negative solution (beta) weights.
#' choices are positive, negative or either as expressed as a string.
#' @param smoothingMatrix allows parameter smoothing, should be square and same
#' size as input matrix
#' @param nv number of predictor spatial vectors
#' @param extraPredictors additional column predictors
#' @param verbose boolean option
#' @return vector of size p is output
#' @author Avants BB
#' @examples
#' \dontrun{
#' mask <- getMask(antsImageRead(getANTsRData("r16")))
#' spatmat <- t(imageDomainToSpatialMatrix(mask, mask))
#' smoomat <- knnSmoothingMatrix(spatmat, k = 200, sigma = 1.0)
#' mat <- matrix(rnorm(sum(mask) * 50), ncol = sum(mask), nrow = 50)
#' mat[1:25, 100:10000] <- mat[1:25, 100:10000] + 1
#' age <- rnorm(nrow(mat))
#' for (i in c(5000:6000, 10000:11000, 16000:17000)) {
#'   mat[, i] <- age * 0.1 + mat[, i]
#' }
#' sel <- 1:25
#' fit <- smoothRegression(
#'   x = mat[sel, ], y = age[sel], iterations = 10,
#'   sparsenessQuantile = 0.5,
#'   smoothingMatrix = smoomat, verbose = T
#' )
#' tt <- mat %*% fit$v
#' print(cor.test(age[-sel], tt[-sel, 1]))
#' vimg <- makeImage(mask, (fit$v[, 1]))
#' print(range(vimg) * 10)
#' plot(mask, vimg, window.overlay = range(abs(vimg)))
#' }
#' @export smoothRegression
## #' @param gamma step size for gradient descent
smoothRegression <- function(
    x,
    y,
    iterations = 10,
    sparsenessQuantile = 0.5,
    positivity = FALSE,
    smoothingMatrix = NA,
    nv = 2,
    extraPredictors,
    verbose = FALSE) {
  x <- scale(x, scale = FALSE)
  poschoices <- c("positive", "negative", "either", TRUE, FALSE)
  if (sum(positivity == poschoices) != 1 | length(positivity) != 1) {
    stop("choice of positivity parameter is not good - see documentation")
  }
  if (positivity == TRUE) positivity <- "positive"
  if (positivity == FALSE) positivity <- "either"
  gamma <- 1.e-8
  if (missing("x") | missing("y")) {
    message("this function needs input")
    return(NA)
  }
  #
  if (all(is.na(smoothingMatrix))) smoothingMatrix <- diag(ncol(x))
  originalN <- ncol(x)
  if (!missing("extraPredictors")) {
    temp <- lm(y ~ ., data = data.frame(extraPredictors))
    mdlmatrix <- scale(model.matrix(temp)[, -1], scale = TRUE)
    extraN <- originalN + ncol(mdlmatrix)
    x <- cbind(x, mdlmatrix)
  }
  scaledY <- as.numeric(scale(y))
  xgy <- scaledY %*% x
  v <- matrix(0, nrow = nv, ncol = ncol(x))
  for (k in 1:nv) {
    v[k, ] <- xgy + matrix(rnorm(ncol(x), 0, 1.e-3), nrow = 1, ncol = ncol(x))
  }
  errs <- rep(NA, iterations)
  i <- 1
  while (i <= iterations) {
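    # gradient step per component: dedv[k, ] = t(y) %*% x - v[k, ] %*% t(x) %*% x,
    # followed by spatial smoothing, deflation against earlier components and
    # quantile-based sparsification via orthogonalizeAndQSparsify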
    temp <- t(x %*% t(as.matrix(v))) %*% x
    dedv <- temp * 0
    for (k in 1:nv) {
      dedv[k, ] <- xgy - temp[k, ]
    }
    v <- (v + dedv * gamma)
    v[, 1:originalN] <- as.matrix(v[, 1:originalN] %*% smoothingMatrix)
    for (k in 1:nv) {
      if (k > 1) {
        for (vk in 1:(k - 1)) {
          temp <- v[vk, ]
          denom <- as.numeric(temp %*% temp)
          if (denom > 0) ip <- as.numeric(temp %*% v[k, ]) / denom else ip <- 1
          v[k, ] <- v[k, ] - temp * ip
        }
      }
    }
    v[, 1:originalN] <- as.matrix(v[, 1:originalN] %*% smoothingMatrix)
    v <- t(orthogonalizeAndQSparsify(t(v), sparsenessQuantile, positivity))
    if (i < 3) gamma <- quantile(v[abs(v) > 0], 0.5, na.rm = TRUE) * 1.e-2
    proj <- x %*% t(v)
    intercept <- colMeans(scaledY - (proj))
    for (k in 1:nv) proj[, k] <- proj[, k] + intercept[k]
    ymdl <- lm(scaledY ~ proj)
    err <- mean(abs(scaledY - (proj)))
    errs[i] <- err # summary(ymdl)$r.squared * ( -1 )
    if (verbose) print(paste("it/err/rsq", i, errs[i], summary(ymdl)$r.squared, "gamma", gamma))
    #  coefwts = coefficients( ymdl )
    #  for ( k in 1:nv ) v[k,] = v[k,] * coefwts[k+1]
    #  proj = x %*% t( v ) # + coefwts[1]
    if (i > 1) {
      if ((mean(errs[4:5]) > mean(errs[1:3])) & (i == 5)) {
        #      message(paste("flipping sign of gradient step:", gamma))
        gamma <- gamma * (-1.0)
      } else if ((errs[i] > errs[i - 1])) {
        gamma <- gamma * (0.5)
        if (verbose) message(paste("reducing gradient step:", gamma))
      } else {
        gamma <- gamma * 1.05
      }
      if (abs(gamma) < 1.e-12) i <- iterations
    }
    i <- i + 1
  }
  if (verbose) print(paste("end", errs[i - 1]))
  imagev <- v[, 1:originalN]
  return(
    list(
      v = as.matrix(t(imagev)),
      fullv = as.matrix(t(v))
    )
  )
}


#' smooth multivariate matrix-based regression
#'
#' Reconstruct a n by k target matrix given a n by p matrix of predictors.
#'
#' @param x input matrix on which prediction is based
#' @param y target matrix
#' @param iterations number of gradient descent iterations
#' @param sparsenessQuantile quantile to control sparseness - higher is sparser.
#' @param positivity restrict to positive or negative solution (beta) weights.
#' choices are positive, negative or either as expressed as a string.
#' @param smoothingMatrixX allows parameter smoothing, should be square and same
#' size as input matrix
#' @param smoothingMatrixY allows parameter smoothing, should be square and same
#' size as input matrix
#' @param nv number of predictor spatial vectors
#' @param extraPredictors additional column predictors
#' @param verbose boolean option
#' @return list with sparse solution matrices vx and vy
#' @author Avants BB
#' @examples
#' \dontrun{
#' mask <- getMask(antsImageRead(getANTsRData("r16")))
#' spatmat <- t(imageDomainToSpatialMatrix(mask, mask))
#' smoomat <- knnSmoothingMatrix(spatmat, k = 200, sigma = 1.0)
#' mat <- matrix(rnorm(sum(mask) * 50), ncol = sum(mask), nrow = 50)
#' mat[1:25, 100:10000] <- mat[1:25, 100:10000] + 1
#' age <- matrix(rnorm(nrow(mat) * 2), ncol = 2)
#' for (i in c(5000:6000, 10000:11000, 16000:17000)) {
#'   mat[, i] <- age[, 1] * 0.1 + mat[, i]
#' }
#' sel <- 1:25
#' fit <- smoothMultiRegression(
#'   x = mat[sel, ], y = age[sel, ], iterations = 10,
#'   sparsenessQuantile = 0.5,
#'   smoothingMatrixX = smoomat, smoothingMatrixY = NA, verbose = T
#' )
#' tt <- mat %*% fit$v
#' print(cor.test(age[-sel, 1], tt[-sel, 1]))
#' vimg <- makeImage(mask, (fit$v[, 1]))
#' print(range(vimg) * 10)
#' plot(mask, vimg, window.overlay = range(abs(vimg)))
#' }
#' @export smoothMultiRegression
## #' @param gamma step size for gradient descent
smoothMultiRegression <- function(
    x,
    y,
    iterations = 10,
    sparsenessQuantile = 0.5,
    positivity = FALSE,
    smoothingMatrixX = NA, smoothingMatrixY = NA,
    nv = 2,
    extraPredictors,
    verbose = FALSE) {
  x <- scale(x, scale = FALSE)
  y <- scale(y, scale = FALSE)
  poschoices <- c("positive", "negative", "either", TRUE, FALSE)
  if (sum(positivity == poschoices) != 1 | length(positivity) != 1) {
    stop("choice of positivity parameter is not good - see documentation")
  }
  if (missing("x") | missing("y")) {
    message("this function needs input")
    return(NA)
  }
  svdy <- scale(rsvd::rsvd(y, nu = nv)$u)
  loy <- as.numeric(svdy[, 1])
  ox <- smoothRegression(x, loy,
    iterations = 50,
    sparsenessQuantile = sparsenessQuantile, positivity = positivity,
    smoothingMatrix = smoothingMatrixX, nv = nv, verbose = FALSE
  )$v
  oy <- matrix(nrow = ncol(y), ncol = nv)
  for (k in 1:iterations) {
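    # alternating update: for component j, residualize x and y against earlier
    # components (via lm residuals), fit sparse y-weights to the current x
    # score, then sparse x-weights to the resulting y score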
    lox <- scale(x %*% (ox))
    for (j in 1:nv) {
      if (j == 1) rx <- x else rx <- residuals(lm(x ~ x %*% ox[, 1:(j - 1)]))
      if (j == 1) ry <- y else ry <- residuals(lm(y ~ y %*% oy[, 1:(j - 1)]))
      lox <- (rx %*% (ox))
      oy[, j] <- smoothRegression(ry, lox[, j],
        iterations = 50,
        sparsenessQuantile = sparsenessQuantile, positivity = positivity,
        smoothingMatrix = NA, nv = 2, verbose = FALSE
      )$v[, 1]
      loy <- (ry %*% (oy))
      ox[, j] <- smoothRegression(rx, loy[, j],
        iterations = 50,
        sparsenessQuantile = sparsenessQuantile, positivity = positivity,
        smoothingMatrix = NA, nv = 2, verbose = FALSE
      )$v[, 1]
    }
    tt <- x %*% ox
    ss <- y %*% oy
    print(k)
    print(cor(tt, ss))
  }
  return(
    list(
      vx = ox,
      vy = oy
    )
  )
}






#' k-nearest neighbors constrained image smoothing
#'
#' Compute a smoothing matrix based on an input matrix of point coordinates as
#' well as neighborhood intensity patterns.  This performs a form of
#' edge-preserving smoothing.
#'
#' @param img input image to smooth
#' @param mask input image mask
#' @param radius number of neighbors, higher causes more smoothing
#' @param intensityWeight weight for intensity component, value 1 will weight
#' local voxel intensity roughly equally to spatial component
#' @param spatialSigma for gaussian defining spatial distances
#' @param iterations number of iterations over which to apply smoothing kernel
#' @param returnMatrix boolean, will return smoothing matrix instead of image.
#' @return antsImage is output
#' @author Avants BB
#' @examples
#' \dontrun{
#' img <- antsImageRead(getANTsRData("r16"))
#' mask <- getMask(img)
#' simg <- knnSmoothImage(
#'   img = img, mask = mask, radius = 2, intensityWeight = 1,
#'   spatialSigma = 1.5, iterations = 1
#' )
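#' # returnMatrix = TRUE yields the sparse smoothing operator itself, which
#' # can be stored and applied repeatedly
#' smat <- knnSmoothImage(img, mask, radius = 2, returnMatrix = TRUE)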
#' }
#' @export knnSmoothImage
knnSmoothImage <- function(
    img,
    mask,
    radius,
    intensityWeight = 0.1,
    spatialSigma = 20.0,
    iterations = 1,
    returnMatrix = FALSE) {
  if (radius <= 0) {
    return(img)
  }
  ivec <- img[mask == 1]
  spatmat <- t(imageDomainToSpatialMatrix(mask, mask))
  spatrange <- range(spatmat, na.rm = TRUE)
  intrange <- range(ivec, na.rm = TRUE)
  idelt <- (intrange[2] - intrange[1])
  if (idelt <= 0) {
    scl <- 1
  } else {
    scl <- (spatrange[2] - spatrange[1]) / idelt * intensityWeight
  }
  ivec2 <- ivec * scl
  spatmat <- rbind(spatmat, ivec2)
  r <- radius
  #  imat = antsrimpute(
  #    getNeighborhoodInMask( img, mask, rep( r, img@dimension), boundary.condition='image' ) )
  #  imat = knnSmoothingMatrix( imat, k = 2*(r*2+1)^2, sigma = intensitySigma )
  jmat <- knnSmoothingMatrix(spatmat, k = (r * 2 + 1)^2, sigma = spatialSigma)
  #  return( jmat )
  #  image( jmat[4000:4500,4000:4500] )
  #  print( jmat[4000:4010,4000:4010] )
  #  imat = imat / Matrix::rowSums( imat )
  #  jmat = imat * smoothingMatrix
  for (i in 1:4) { # sinkhorn
    jmat <- jmat / Matrix::rowSums(jmat)
    jmat <- Matrix::t(Matrix::t(jmat) / Matrix::rowSums(Matrix::t(jmat)))
  }
  if (returnMatrix) {
    return(jmat)
  }
  for (i in 1:iterations) {
    ivec <- jmat %*% ivec
  }
  return(makeImage(mask, as.numeric(ivec)))
}






.xuvtHelper <- function(x, u, v, errs, iterations,
                        smoothingMatrix, repeatedMeasures, intercept,
                        positivity, gamma, sparsenessQuantile, usubs,
                        doOrth, verbose) {
  i <- 1
  tu <- t(u)
  tuu <- t(u) %*% u
  if (is.na(gamma)) gamma <- 1.e-6
  while (i <= iterations) {
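    # same scheme as smoothMatrixPrediction: smooth v, take a gradient step on
    # norm( x - u v^t ), then sparsify and renormalize each column; gamma is
    # flipped or halved below when the error increases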
    v <- as.matrix(smoothingMatrix %*% v)
    dedv <- t(tuu %*% t(v) - tu %*% x)
    v <- v + dedv * gamma
    #    if ( abs( doOrth ) >  Inf ) {
    #      vOrth = qr.Q( qr( v ) )
    #      v = v * ( 1 - doOrth ) - vOrth * doOrth
    #    }
    for (vv in 1:ncol(v)) {
      v[, vv] <- v[, vv] / as.numeric(sqrt(v[, vv] %*% v[, vv]))
      if (vv > 1 & doOrth) {
        for (vk in 1:(vv - 1)) {
          temp <- v[, vk]
          denom <- as.numeric(temp %*% temp)
          if (denom > 0) ip <- as.numeric(temp %*% v[, vv]) / denom else ip <- 1
          v[, vv] <- v[, vv] - temp * ip
        }
      }
      localv <- v[, vv]
      doflip <- FALSE
      if (sum(localv > 0) < sum(localv < 0)) {
        localv <- localv * (-1)
        doflip <- TRUE
      }
      myquant <- quantile(localv, sparsenessQuantile, na.rm = TRUE)
      if (positivity == "positive") {
        if (myquant > 0) localv[localv <= myquant] <- 0 else localv[localv >= myquant] <- 0
      } else if (positivity == "negative") {
        localv[localv > myquant] <- 0
      } else if (positivity == "either") {
        localv[abs(localv) < quantile(abs(localv), sparsenessQuantile, na.rm = TRUE)] <- 0
      }
      if (doflip) v[, vv] <- localv * (-1) else v[, vv] <- localv
      v[, vv] <- v[, vv] / sqrt(sum(v[, vv] * v[, vv]))
    }
    intercept <- rowMeans(x - (u %*% t(v)))
    if (!any(is.na(repeatedMeasures))) { # estimate random intercepts
      for (s in usubs) {
        usel <- repeatedMeasures == s
        intercept[usel] <- mean(intercept[usel], na.rm = TRUE)
      }
    }
    err <- mean(abs(x - (u %*% t(v) + intercept)))
    errs[i] <- err
    if (i > 1) {
      if ((errs[i] > errs[i - 1]) & (i == 3)) {
        #        message(paste("flipping sign of gradient step:", gamma))
        gamma <- gamma * (-1.0)
      } else if ((errs[i] > errs[i - 1])) {
        gamma <- gamma * (0.5)
        if (verbose) message(paste("reducing gradient step:", gamma))
      } else if (abs(gamma) < 1.e-9) i <- iterations
    }
    i <- i + 1
    if (verbose) print(paste(i, err))
  }
  return(list(v = v, intercept = intercept, gamma = gamma * 1.1))
}


#' joint smooth matrix prediction
#'
#' Jointly reconstruct a set of n by p, n by q, etc. matrices from basis
#' functions or predictors.  The reconstruction can be regularized.  The
#' objective takes the form
#' \eqn{\| x - u_x v_x^t \| + \| y - u_y v_y^t \| + \ldots} over all views.
#'
#' @param x input list of matrices to be jointly predicted.
#' @param parameters should be a ncomparisons by 3 matrix where the first two
#' columns define the pair to be matched and the last column defines the weight
#' in the objective function.
#' @param nvecs number of basis vectors to compute
#' @param iterations number of gradient descent iterations
#' @param subIterations number of gradient descent iterations in sub-algorithm
#' @param gamma step size for gradient descent
#' @param sparsenessQuantile quantile to control sparseness - higher is sparser
#' @param positivity restrict to positive or negative solution (beta) weights.
#' choices are positive, negative or either as expressed as a string.
#' @param smoothingMatrix a list containing smoothing matrices of the same
#' length as x.
#' @param rowWeights vector of weights with size n (assumes diagonal covariance)
#' @param repeatedMeasures list of repeated measurement identifiers. this will
#' allow estimates of per identifier intercept.
#' @param doOrth boolean enforce gram-schmidt orthonormality
#' @param verbose boolean option
#' @return matrix list each of size p by k is output
#' @author Avants BB
#' @examples
#' \dontrun{
#' mat <- replicate(100, rnorm(20))
#' mat2 <- replicate(100, rnorm(20))
#' mat <- scale(mat)
#' mat2 <- scale(mat2)
#' wt <- 0.666
#' mat3 <- mat * wt + mat2 * (1 - wt)
#' params <- matrix(nrow = 2, ncol = 3)
#' params[1, ] <- c(1, 2, 1)
#' params[2, ] <- c(2, 1, 1)
#' x <- list((mat), (mat3))
#' jj <- jointSmoothMatrixReconstruction(x, 2, params,
#'   gamma = 1e-4, sparsenessQuantile = 0.5, iterations = 10,
#'   smoothingMatrix = list(NA, NA), verbose = TRUE
#' )
#'
#' # latent feature
#' mask <- getMask(antsImageRead(getANTsRData("r16")))
#' spatmat <- t(imageDomainToSpatialMatrix(mask, mask))
#' smoomat <- knnSmoothingMatrix(spatmat, k = 27, sigma = 120.0)
#' lfeats <- t(replicate(100, rnorm(3)))
#' # map these - via matrix - to observed features
#' n <- sum(mask)
#' ofeats1 <- (lfeats + rnorm(length(lfeats), 0.0, 1.0)) %*% rbind(rnorm(n), rnorm(n), rnorm(n))
#' ofeats2 <- (lfeats + rnorm(length(lfeats), 0.0, 1.0)) %*% rbind(rnorm(n), rnorm(n), rnorm(n))
#' # only half of the matrix contains relevant data
#' ofeats1[, 1:round(n / 2)] <- matrix(rnorm(round(n / 2) * 100), nrow = 100)
#' ofeats2[, 1:round(n / 2)] <- matrix(rnorm(round(n / 2) * 100), nrow = 100)
#' x <- list((ofeats1), (ofeats2))
#' jj <- jointSmoothMatrixReconstruction(x, 2, params,
#'   gamma = 0.0001, sparsenessQuantile = 0.75, iterations = 19,
#'   subIterations = 11,
#'   smoothingMatrix = list(smoomat, smoomat), verbose = TRUE
#' )
#' p1 <- ofeats2 %*% jj$v[[2]]
#' p2 <- ofeats1 %*% jj$v[[1]]
#' cor(p1, lfeats)
#' cor(p2, lfeats)
#' print(cor(rowMeans(ofeats1), lfeats))
#' print(cor(rowMeans(ofeats2), lfeats))
#'
#' # a 2nd example with 3 modalities
#' imageIDs <- c("r16", "r27", "r30", "r62", "r64", "r85")
#' images <- list()
#' feature1Images <- list()
#' feature2Images <- list()
#' feature3Images <- list()
#' ref <- antsImageRead(getANTsRData("r16"))
#' for (i in 1:length(imageIDs))
#' {
#'   cat("Processing image", imageIDs[i], "\n")
#'   tar <- antsRegistration(ref, antsImageRead(getANTsRData(imageIDs[i])),
#'     typeofTransform = "Affine"
#'   )$warpedmov
#'   images[[i]] <- tar
#'   feature1Images[[i]] <- iMath(images[[i]], "Grad", 1.0)
#'   feature2Images[[i]] <- iMath(images[[i]], "Laplacian", 1.0)
#'   feature3Images[[i]] <- reflectImage(tar, axis = 0, tx = "Affine")$warpedmovout
#' }
#' i <- 1
#' mask <- getMask(antsImageRead(getANTsRData(imageIDs[i])))
#' mask2 <- iMath(mask, "ME", 2)
#' spatmat <- t(imageDomainToSpatialMatrix(mask, mask))
#' smoomat <- knnSmoothingMatrix(spatmat, k = 125, sigma = 100.0)
#' spatmat2 <- t(imageDomainToSpatialMatrix(mask2, mask2))
#' smoomat2 <- knnSmoothingMatrix(spatmat2, k = 125, sigma = 100.0)
#' params <- matrix(nrow = 6, ncol = 3)
#' params[1, ] <- c(1, 2, 1)
#' params[2, ] <- c(2, 1, 1)
#' params[3, ] <- c(1, 3, 1)
#' params[4, ] <- c(3, 1, 1)
#' params[5, ] <- c(2, 3, 1)
#' params[6, ] <- c(3, 2, 1)
#' mat <- imageListToMatrix(feature1Images, mask)
#' mat2 <- imageListToMatrix(feature2Images, mask2)
#' mat3 <- imageListToMatrix(feature3Images, mask)
#' scl <- F
#' x <- list(scale(mat, scale = scl), scale(mat2, scale = scl), scale(mat3, scale = scl))
#' slist <- list(smoomat2, smoomat, smoomat, smoomat, smoomat, smoomat2)
#'
#' jj <- jointSmoothMatrixReconstruction(x, 4, params,
#'   positivity = T,
#'   gamma = 1e-6, sparsenessQuantile = 0.9, iterations = 10,
#'   smoothingMatrix = slist, verbose = TRUE
#' )
#' mm <- makeImage(mask, abs(jj$v[[2]][, 1])) %>% iMath("Normalize")
#' plot(ref, mm, doCropping = FALSE, window.overlay = c(0.1, 1))
#'
#' p1 <- mat2 %*% jj$v[[1]]
#' p2 <- mat %*% jj$v[[2]]
#' diag(cor(p1, p2))
#'
#' p1 <- mat3 %*% jj$v[[5]]
#' p2 <- mat2 %*% jj$v[[6]]
#' diag(cor(p1, p2))
#' }
#' @export jointSmoothMatrixReconstruction
jointSmoothMatrixReconstruction <- function(
    x,
    nvecs,
    parameters,
    iterations = 10,
    subIterations = 5,
    gamma = 1.e-6,
    sparsenessQuantile = 0.5,
    positivity = FALSE,
    smoothingMatrix = NA,
    rowWeights = NA,
    repeatedMeasures = NA,
    doOrth = FALSE,
    verbose = FALSE) {
  poschoices <- c("positive", "negative", "either", TRUE, FALSE)
  if (sum(positivity == poschoices) != 1 | length(positivity) != 1) {
    stop("choice of positivity parameter is not good - see documentation")
  }
  if (positivity == TRUE) positivity <- "positive"
  if (positivity == FALSE) positivity <- "either"

  for (k in 1:length(x)) x[[k]] <- x[[k]] / max(abs(x[[k]]))
  gammas <- rep(gamma, nrow(parameters))
  ulist <- list()
  vlist <- list()
  ilist <- list()
  if (!any(is.na(repeatedMeasures))) {
    usubs <- unique(repeatedMeasures)
    wtdf <- data.frame(table(repeatedMeasures))
    # rowWeights should scale with counts
    repWeights <- rep(0, length(repeatedMeasures))
    for (u in usubs) {
      repWeights[repeatedMeasures == u] <- 1.0 / wtdf$Freq[wtdf$repeatedMeasures == u]
    }
    rm(wtdf)
    if (all(is.na(rowWeights))) {
      rowWeights <- repWeights
    } else {
      rowWeights <- rowWeights * repWeights
    }
  }

  for (i in 1:nrow(parameters)) {
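    # initialization for each directed pair (m1 -> m2): build u from a
    # rank-nvecs svd basis of x[[ m1 ]] and v from the coefficients of
    # regressing x[[ m2 ]] on that basis, jittered and column-normalized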
    m1 <- parameters[i, 1]
    m2 <- parameters[i, 2]
    modelFormula <- as.formula(" x[[ m2 ]]  ~ .")
    basisDf <- data.frame(u = irlba::irlba(x[[m1]], nu = nvecs, nv = 0)$u)
    #    basisDf = data.frame( u=rsvd::rsvd( x[[ m1 ]], nu = nvecs, nv = 0 )$u )
    #    basisDf = data.frame( u=RSpectra::svds( x[[ m1 ]], nvecs )$u )
    mdl <- lm(modelFormula, data = basisDf)
    u <- model.matrix(mdl)
    ilist[[i]] <- u[, 1] # intercept
    u <- u[, -1]
    v <- mdl$coefficients[-1, ]
    v <- v + matrix(rnorm(length(v), 0, 0.01), nrow = nrow(v), ncol = ncol(v))
    v <- t(v)
    for (vv in 1:ncol(v)) {
      v[, vv] <- v[, vv] / sqrt(sum(v[, vv] * v[, vv]))
    }
    ulist[[i]] <- u
    vlist[[i]] <- v
  }
  errs <- rep(NA, iterations)
  if (verbose) print("part II")
  perr <- matrix(nrow = iterations, ncol = nrow(parameters))
  k <- 1
  while (k <= iterations) {
    for (i in 1:nrow(parameters)) {
      m1 <- parameters[i, 1]
      m2 <- parameters[i, 2]
      whichv <- NA
      for (pp in 1:nrow(parameters)) {
        if (parameters[pp, 2] == m1 & parameters[pp, 1] == m2) {
          whichv <- pp
        }
      }
      if (!is.na(whichv)) {
        temp <- vlist[[whichv]]
        ulist[[i]] <- (x[[m1]] %*% (temp))
      } else {
        ulist[[i]] <- ulist[[m2]]
        vlist[[i]] <- t(t(ulist[[m2]]) %*% x[[m2]])
      }
      if (is.logical(smoothingMatrix[[i]])) {
        loSmoo <- diag(ncol(x[[m2]]))
      } else {
        loSmoo <- smoothingMatrix[[i]]
      }
      temp <- .xuvtHelper(x[[m2]],
        ulist[[i]], vlist[[i]],
        errs,
        iterations = subIterations,
        smoothingMatrix = loSmoo,
        repeatedMeasures = repeatedMeasures, ilist[[i]],
        positivity = positivity, gammas[i] * parameters[i, 3],
        sparsenessQuantile = sparsenessQuantile, usubs = usubs,
        doOrth = doOrth, verbose = F
      )
      #    gammas[i] = temp$gamma * 1.0 # 5
      vlist[[i]] <- temp$v
      ilist[[i]] <- temp$intercept
      perr[k, i] <- mean(abs(x[[m2]] - (ulist[[i]] %*% t(vlist[[i]]) + ilist[[i]])))
    }
    if (verbose) {
      print(perr[k, ])
      print(paste("overall", mean(perr[k, ])))
    }
    if (k > 1) {
      e1 <- perr[k, ] * parameters[, 3]
      e2 <- perr[k - 1, ] * parameters[, 3]
      if (mean(e1) > mean(e2)) gammas <- gammas * 0.9 # k = iterations
    }

    for (i in 1:nrow(parameters)) {
      m1 <- parameters[i, 1]
      m2 <- parameters[i, 2]
      whichv <- NA
      for (pp in 1:nrow(parameters))
      {
        if (parameters[pp, 2] == m1 & parameters[pp, 1] == m2) {
          whichv <- pp
        }
      }
      if (!is.na(whichv)) {
        temp <- (vlist[[whichv]])
        ulist[[i]] <- (x[[m1]] %*% (temp))
      } else {
        ulist[[i]] <- ulist[[m2]]
        vlist[[i]] <- t(t(ulist[[m2]]) %*% x[[m2]])
      }
    }

    k <- k + 1
  }


  return(list(u = ulist, v = vlist, intercepts = ilist))
}


#' rank-based segmentation of a matrix
#'
#' @param v input matrix
#' @param sparsenessQuantile a value in zero to one
#' @param positivity one of positive, negative, either
#' @param basic boolean; if TRUE, simply keep the top k entries in each row
#' @param transpose work on the transpose
#' @return matrix
#' @author Avants BB
#' @examples
#'
#' mat <- matrix(1:200, nrow = 10)
#' matr <- rankBasedMatrixSegmentation(mat, 0.9, basic = FALSE, positivity = "positive")
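#' # most entries are zeroed; each row keeps only its top-ranked values, and
#' # previously selected columns are suppressed in later rows
#' mean(matr == 0)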
#'
#' @export rankBasedMatrixSegmentation
rankBasedMatrixSegmentation <- function(v, sparsenessQuantile, basic = FALSE, positivity = "positive", transpose = FALSE) {
  if (transpose) v <- t(v)
  mycols <- 1:ncol(v)
  ntokeep <- round(quantile(mycols, 1.0 - sparsenessQuantile))
  outmat <- matrix(0, nrow = nrow(v), ncol = ncol(v))
  if (basic) {
    for (k in 1:nrow(v)) {
      if (positivity == "either") locord <- order(abs(v[k, ]), decreasing = T)[1:ntokeep]
      if (positivity == "negative") locord <- order(v[k, ], decreasing = F)[1:ntokeep]
      if (positivity == "positive") locord <- order(v[k, ], decreasing = T)[1:ntokeep]
      outmat[k, locord] <- v[k, locord]
    }
    if (transpose) {
      return(t(outmat))
    }
    return(outmat)
  }
  tozero <- c()
  for (k in 1:nrow(v)) {
    vec <- v[k, ]
    if (length(tozero) > 0) vec[tozero] <- 0
    # adjust for weighted signs
    if ((sum(vec[vec < 0]) - sum(vec[vec > 0])) < 0) vec <- vec * (-1.0)
    if (positivity == "either") {
      vec <- abs(vec)
      locord <- order(vec, decreasing = T)[1:ntokeep]
    } else if (positivity == "negative") {
      locord <- order(vec, decreasing = F)[1:ntokeep]
    } else {
      locord <- order(vec, decreasing = T)[1:ntokeep]
    }
    outmat[k, locord] <- v[k, locord]
    tozero <- c(tozero, locord)
    if (all(mycols %in% tozero)) tozero <- c()
  }
  if (transpose) {
    return(t(outmat))
  }
  return(outmat)
}


#' sparsify a matrix
#'
#' This implements a quantile-based sparsification operation.
#'
#' @param v input matrix
#' @param sparsenessQuantile quantile to control sparseness - higher is sparser
#' @param positivity restrict to positive or negative solution (beta) weights.
#' choices are positive, negative or either as expressed as a string.
#' @param orthogonalize run gram-schmidt if TRUE.
#' @param softThresholding use soft thresholding
#' @param unitNorm set each vector to unit norm
#' @param sparsenessAlg NA is default; otherwise a string such as "orthorank"
#' (or any other value for the basic variant) selecting rank-based matrix
#' segmentation
#' @return matrix
#' @author Avants BB
#' @examples
#'
#' mat <- replicate(100, rnorm(20))
#' mat <- orthogonalizeAndQSparsify(mat)
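#' # with the default sparsenessQuantile of 0.5, roughly half of the entries
#' # in each column are zeroed after gram-schmidt orthogonalization
#' mean(mat == 0)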
#'
#' @export orthogonalizeAndQSparsify
orthogonalizeAndQSparsify <- function(
    v,
    sparsenessQuantile = 0.5, positivity = "either",
    orthogonalize = TRUE, softThresholding = FALSE, unitNorm = FALSE, sparsenessAlg = NA) {
  if (!is.na(sparsenessAlg)) {
    if (sparsenessAlg == "orthorank") {
      return(rankBasedMatrixSegmentation(v, sparsenessQuantile, basic = FALSE, positivity = positivity, transpose = TRUE))
    } else {
      return(rankBasedMatrixSegmentation(v, sparsenessQuantile, basic = TRUE, positivity = positivity, transpose = TRUE))
    }
  }
  if (sparsenessQuantile == 0) {
    return(v)
  }
  epsval <- 0.0 # .Machine$double.eps
  #  if ( orthogonalize ) v = qr.Q( qr( v ) )
  binaryOrth <- function(x) { # BROKEN => DONT USE
    minormax <- function(x) {
      if (max(x) == 0) {
        return(which.min(x))
      }
      return(which.max(x))
    }
    b <- x * 0
    wm <- apply(abs(x), FUN = minormax, MARGIN = 2)
    for (i in 1:ncol(x)) b[wm[i], i] <- x[wm[i], i]
    for (i in 1:nrow(b)) b[i, ] <- b[i, ] / sqrt(sum(b[i, ]^2))
    b
  }
  for (vv in 1:ncol(v)) {
    if (var(v[, vv]) > epsval) {
      #      v[ , vv ] = v[ , vv ] / sqrt( sum( v[ , vv ] * v[ , vv ] ) )
      if (vv > 1 & orthogonalize) {
        for (vk in 1:(vv - 1)) {
          temp <- v[, vk]
          denom <- sum(temp * temp, na.rm = TRUE)
          if (denom > epsval) ip <- sum(temp * v[, vv]) / denom else ip <- 1
          v[, vv] <- v[, vv] - temp * ip
        }
      }
      localv <- v[, vv]
      doflip <- FALSE
      if (sum(localv > 0, na.rm = TRUE) < sum(localv < 0, na.rm = TRUE)) {
        localv <- localv * (-1)
        doflip <- TRUE
      }
      myquant <- quantile(localv, sparsenessQuantile, na.rm = TRUE)
      if (!softThresholding) {
        if (positivity == "positive") {
          if (myquant > 0) localv[localv <= myquant] <- 0 else localv[localv >= myquant] <- 0
        } else if (positivity == "negative") {
          localv[localv > myquant] <- 0
        } else if (positivity == "either") {
          localv[abs(localv) < quantile(abs(localv), sparsenessQuantile, na.rm = TRUE)] <- 0
        }
      } else {
        if (positivity == "positive") localv[localv < 0] <- 0
        mysign <- sign(localv)
        myquant <- quantile(abs(localv), sparsenessQuantile, na.rm = TRUE)
        temp <- abs(localv) - myquant
        temp[temp < 0] <- 0
        localv <- mysign * temp
      }
      if (doflip) v[, vv] <- localv * (-1) else v[, vv] <- localv
    }
    cthresh <- 100
    if (cthresh > 0) {
      #      temp = makeImage( , )
    }
  }
  if (unitNorm) {
    for (i in 1:ncol(v)) {
      locnorm <- sqrt(sum(v[, i]^2))
      if (locnorm > 0) v[, i] <- v[, i] / locnorm
    }
  }
  return(v)
}


#' CCA via sparse smooth matrix prediction
#'
#' This implements a sparse and graph-regularized version of CCA based on the
#' AppGrad style of implementation by Ma, Lu and Foster, 2015.
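#'
#' Each iteration takes a gradient step on the coupled least-squares energy,
#' roughly \deqn{\| X \phi_x - Y \phi_y \|^2} followed by sparsification and
#' smoothing of the projections \eqn{\phi_x} and \eqn{\phi_y}.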
#'
#' @param x input view 1 matrix
#' @param y input view 2 matrix
#' @param smoox smoothingMatrix for x
#' @param smooy smoothingMatrix for y
#' @param sparsenessQuantile quantile to control sparseness - higher is sparser
#' @param positivity restrict to positive or negative solution (beta) weights.
#' choices are positive, negative or either as expressed as a string.
#' @param k number of basis vectors to compute
#' @param iterations number of gradient descent iterations
#' @param stochastic size of subset to use for stochastic gradient descent
#' @param initialization type of initialization, one of the character strings
#' \code{randxy} (the default) or \code{svd}
#' @param verbose boolean option
#' @return list with matrices each of size p or q by k
#' @author Avants BB
#' @examples
#'
#' mat <- replicate(100, rnorm(20))
#' mat2 <- replicate(100, rnorm(20))
#' mat <- scale(mat)
#' mat2 <- scale(mat2)
#' wt <- 0.666
#' mat3 <- mat * wt + mat2 * (1 - wt)
#' jj <- smoothAppGradCCA(mat, mat3)
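#' # a hedged sketch with three components and a sparser solution
#' jj2 <- smoothAppGradCCA(mat, mat3, k = 3, sparsenessQuantile = 0.6)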
#'
#' @export smoothAppGradCCA
smoothAppGradCCA <- function(x, y,
                             smoox = NA, smooy = NA,
                             sparsenessQuantile = 0.5,
                             positivity = "either",
                             k = 2, iterations = 10,
                             stochastic = NA,
                             initialization = "randxy",
                             verbose = FALSE) {
  if (nrow(x) != nrow(y)) stop("nrow x should equal nrow y")
  #  x = scale( icawhiten(x,nrow(x)), scale=T )
  #  y = scale( icawhiten(y,nrow(y)), scale=T )
  x <- scale(x, scale = T)
  y <- scale(y, scale = T)
  errs <- rep(NA, iterations)
  poschoices <- c("positive", "negative", "either", TRUE, FALSE)
  if (sum(positivity == poschoices) != 1 | length(positivity) != 1) {
    stop("choice of positivity parameter is not good - see documentation")
  }
  if (positivity == TRUE) positivity <- "positive"
  if (positivity == FALSE) positivity <- "either"
  ratio <- norm(x) / norm(y)
  x <- x / norm(x)
  y <- y / norm(y)
  if (any(is.na(smoox))) smoox <- diag(ncol(x))
  if (any(is.na(smooy))) smooy <- diag(ncol(y))
  #  phix = RSpectra::svds( x, k=k )$v
  #  phiy = RSpectra::svds( y, k=k )$v
  #  phix = t( y ) %*% irlba::irlba( x, nu=k, nv=0, maxit=1000, tol=1.e-6 )$u
  #  phiy = t( x ) %*% irlba::irlba( y, nu=k, nv=0, maxit=1000, tol=1.e-6 )$u
  #  phix = t( y ) %*% svd( x, nu=k, nv=0  )$u
  #  phiy = t( x ) %*% svd( y, nu=k, nv=0  )$u
  if (initialization == "randxy") {
    phiy <- t(y) %*% (x %*% matrix(rnorm(k * ncol(x), 0, 1), ncol = k))
    phix <- t(x) %*% (y %*% matrix(rnorm(k * ncol(y), 0, 1), ncol = k))
  } else if (initialization == "svd") {
    phix <- svd(x, nv = k, nu = 0)$v
    phiy <- svd(y, nv = k, nu = 0)$v
  }
  #  phix = matrix( rnorm( k * ncol(x), 0, 1e-4 ), ncol=k )
  #  phiy = matrix( rnorm( k * ncol(y), 0, 1e-4 ), ncol=k )

  i <- 1
  if (is.na(stochastic)) stoke <- FALSE else stoke <- TRUE
  while (i < iterations) {
    if (stoke) ind <- sample(1:nrow(x))[1:stochastic] else ind <- 1:nrow(x)
    if (i < 3) gx <- -1e-4 # quantile( phix[ abs(phix) > 0 ] , 0.5 , na.rm=TRUE ) * ( -1.e4 )
    if (i < 3) gy <- -1e-4 # quantile( phiy[ abs(phiy) > 0 ] , 0.5 , na.rm=TRUE ) * ( -1.e4 )
    # gradient calculation
    delta <- t(x[ind, ]) %*% (x[ind, ] %*% phix - y[ind, ] %*% phiy)
    phix1 <- phix - delta * gx
    delta <- t(y[ind, ]) %*% (y[ind, ] %*% phiy - x[ind, ] %*% phix)
    phiy1 <- phiy - delta * gy
    #    phix1 = phix1 %*% dx
    #    phiy1 = phiy1 %*% dy
    phix1 <- as.matrix(smoox %*% orthogonalizeAndQSparsify(phix1, sparsenessQuantile = sparsenessQuantile, positivity = positivity))
    phiy1 <- as.matrix(smooy %*% orthogonalizeAndQSparsify(phiy1, sparsenessQuantile = sparsenessQuantile, positivity = positivity))
    # now update
    phix <- phix1 / norm(x %*% phix1)
    phiy <- phiy1 / norm(y %*% phiy1)
    errs[i] <- norm(y %*% phiy - x %*% phix)
    #  print(  sum( abs(  diag(  cor( y %*% phiy,  x %*% phix  ) ) ) ) )
    if (verbose) print(paste(i, errs[i]))
    #    if ( i > 15 ) if ( errs[ i ]  < errs[i-1] ) i = iterations+1
    i <- i + 1
  }

  #  print( cor( x %*% phix ) )
  #  print( cor( y %*% phiy ) )
  #  print( cor( y %*% phiy,  x %*% phix  ) )
  return(list(phix = phix, phiy = phiy))
}


#' Efficiently compute a multivariate, penalized image-based linear regression model (milr)
#'
#' This function simplifies calculating image-wide multivariate beta maps from
#' linear models.  Image-based variables are stored in the input
#' matrix list. They should be named consistently in the input formula and
#' in the image list.  If they are not, an error will be thrown.  All input
#' matrices should have the same number of rows and columns.  The model will
#' minimize a matrix energy similar to norm( X - UVt - UranVrant ) where the U
#' are standard design and random effect (intercept) design matrices.  The
#' random intercept matrix is only included if repeated measures are indicated.
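#'
#' In symbols, with \eqn{U} denoting the design matrices and \eqn{V} the
#' coefficient matrices, the model approximately minimizes
#' \deqn{\| X - U V^T - U_{ran} V_{ran}^T \|^2}
#' subject to the optional smoothing, sparseness and positivity constraints
#' controlled by the parameters below.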
#'
#' @param dataFrame This data frame contains all relevant predictors except for
#' the matrices associated with the image variables.
#' @param voxmats The named list of matrices that contains the changing predictors.
#' @param myFormula This is a character string that defines a valid regression formula.
#' @param smoothingMatrix allows parameter smoothing, should be square and same
#' size as input matrix
#' @param iterations number of gradient descent iterations
#' @param gamma step size for gradient descent
#' @param sparsenessQuantile quantile to control sparseness - higher is sparser
#' @param positivity restrict to positive or negative solution (beta) weights.
#' choices are positive, negative or either as expressed as a string.
#' @param repeatedMeasures list of repeated measurement identifiers. this will
#' allow estimates of per identifier intercept.
#' @param orthogonalize boolean to control whether we orthogonalize the v
#' @param verbose boolean to control verbosity of output
#' @return A list of different matrices that contain names derived from the
#' formula and the coefficients of the regression model.
#' @author BB Avants.
#' @examples
#'
#' set.seed(1500)
#' nsub <- 12
#' npix <- 100
#' outcome <- rnorm(nsub)
#' covar <- rnorm(nsub)
#' mat <- replicate(npix, rnorm(nsub))
#' mat2 <- replicate(npix, rnorm(nsub))
#' myform <- " vox2 ~ covar + vox "
#' df <- data.frame(outcome = outcome, covar = covar)
#' result <- milr(df, list(vox = mat, vox2 = mat2), myform)
#'
#' \dontrun{
#' # a 2nd example with 3 modalities
#' imageIDs <- c("r16", "r27", "r30", "r62", "r64", "r85")
#' images <- list()
#' feature1Images <- list()
#' feature2Images <- list()
#' feature3Images <- list()
#' ref <- antsImageRead(getANTsRData("r16"))
#' mask <- ref * 0
#' for (i in 1:length(imageIDs)) {
#'   cat("Processing image", imageIDs[i], "\n")
#'   tar <- antsImageRead(getANTsRData(imageIDs[i]))
#'   images[[i]] <- tar
#'   mask <- mask + tar
#'   feature1Images[[i]] <- iMath(images[[i]], "Grad", 1.0)
#'   feature2Images[[i]] <- iMath(images[[i]], "Laplacian", 1.0)
#'   feature3Images[[i]] <- reflectImage(tar, axis = 0, tx = "Affine")$warpedmovout
#' }
#' i <- 1
#' mask <- getMask(mask)
#' spatmat <- t(imageDomainToSpatialMatrix(mask, mask))
#' smoomat <- knnSmoothingMatrix(spatmat, k = 23, sigma = 100.0)
#' mat <- imageListToMatrix(images, mask)
#' mat2 <- imageListToMatrix(feature1Images, mask)
#' mat3 <- imageListToMatrix(feature3Images, mask)
#' myscale <- function(x) {
#'   return(scale(x, center = TRUE, scale = T))
#' }
#' x <- list(x1 = myscale(mat[-5, ]), x2 = myscale(mat2[-5, ]), x3 = myscale(mat3[-5, ]))
#' df <- data.frame(m1 = rowMeans(x$x1), m2 = rowMeans(x$x2), m3 = rowMeans(x$x3))
#' myform <- " x1 ~ x2 + x3 + m2 + m3 "
#' result <- milr(df, x, myform,
#'   iterations = 32, smoothingMatrix = smoomat,
#'   gamma = 1e-1, verbose = T
#' )
#' k <- 1
#' mm <- makeImage(mask, (result$prediction[k, ])) %>% iMath("Normalize")
#' tar <- makeImage(mask, x$x1[k, ])
#' plot(mm, doCropping = FALSE)
#' plot(tar, doCropping = FALSE)
#' cor.test(result$prediction[k, ], x$x1[k, ])
#' myol <- makeImage(mask, abs(result$v[, "x2"])) %>% iMath("Normalize")
#' plot(tar, myol, doCropping = FALSE)
#' myol <- makeImage(mask, abs(result$v[, "m3"])) %>% iMath("Normalize")
#' plot(tar, myol, doCropping = FALSE)
#' result <- milr(df, x, myform,
#'   iterations = 11, smoothingMatrix = smoomat,
#'   sparsenessQuantile = 0.5, positivity = "positive",
#'   gamma = 1e-2, verbose = T
#' )
#' myol <- makeImage(mask, abs(result$v[, "x2"])) %>% iMath("Normalize")
#' plot(tar, myol, doCropping = FALSE, window.overlay = c(0.1, 1))
#' mm <- makeImage(mask, (result$prediction[k, ])) %>% iMath("Normalize")
#' tar <- makeImage(mask, x$x1[k, ])
#' plot(mm, doCropping = FALSE)
#' plot(tar, doCropping = FALSE)
#' cor.test(result$prediction[k, ], x$x1[k, ])
#' # univariate outcome
#' myform <- " m1 ~ x2 + x3 + m2 + m3 "
#' result <- milr(df, x, myform,
#'   iterations = 11, smoothingMatrix = smoomat,
#'   gamma = 1e-2, verbose = T
#' )
#' }
#'
#' @export milr
milr <- function(dataFrame, voxmats, myFormula, smoothingMatrix,
                 iterations = 10, gamma = 1.e-6,
                 sparsenessQuantile,
                 positivity = c("positive", "negative", "either"),
                 repeatedMeasures = NA,
                 orthogonalize = FALSE,
                 verbose = FALSE) {
  milrorth <- orthogonalize
  vdf <- data.frame(dataFrame)
  matnames <- names(voxmats)
  if (length(matnames) == 0) stop("please name the input list entries")
  n <- nrow(voxmats[[1]])
  p <- ncol(voxmats[[1]])
  if (missing(smoothingMatrix)) smoothingMatrix <- diag(p)
  poschoices <- c("positive", "negative", "either", TRUE, FALSE)
  if (!missing(positivity)) {
    if (sum(positivity == poschoices) != 1 | length(positivity) != 1) {
      stop("choice of positivity parameter is not good - see documentation")
    }
    if (positivity == TRUE) positivity <- "positive"
    if (positivity == FALSE) positivity <- "either"
  }
  for (k in 1:length(voxmats)) {
    vdf <- cbind(vdf, voxmats[[k]][, 1])
    names(vdf)[ncol(vdf)] <- matnames[k]
    if (ncol(voxmats[[k]]) != p) {
      stop(paste("matrix ", matnames[k], " does not have ", p, "entries"))
    }
  }
  # get names from the standard lm
  temp <- summary(lm(myFormula, data = vdf))
  myrownames <- rownames(temp$coefficients)
  mypvs <- matrix(rep(NA, p * length(myrownames)),
    nrow = length(myrownames)
  )
  myestvs <- mypvs
  myervs <- mypvs
  mytvs <- mypvs
  outcomevarname <- trimws(unlist(strsplit(myFormula, "~"))[1])
  outcomevarnum <- which(outcomevarname == matnames)
  outcomeisconstant <- FALSE
  if (length(outcomevarnum) == 0) {
    outcomeisconstant <- TRUE
    outcomevarnum <- which(colnames(vdf) == outcomevarname)
  }
  hasRanEff <- FALSE
  vRan <- NA
  if (!any(is.na(repeatedMeasures))) {
    hasRanEff <- TRUE
    usubs <- unique(repeatedMeasures)
    if (length(repeatedMeasures) != nrow(dataFrame)) {
      stop("The length of the repeatedMeasures vector should equal the number of rows in the data frame.")
    }
    ranEff <- factor(repeatedMeasures)
    temp <- lm(rnorm(nrow(dataFrame)) ~ ranEff)
    temp <- model.matrix(temp)
    ranEffNames <- colnames(temp)
    ranEffNames[1] <- paste0("ranEff", as.character(levels(ranEff)[1]))
    temp[ranEff == levels(ranEff)[1], 1] <- 1
    temp[ranEff != levels(ranEff)[1], 1] <- 0
    zRan <- (temp[, ])
    colnames(zRan) <- ranEffNames
    tz <- t(zRan)
    tzz <- tz %*% zRan
    rm(ranEff)
  }
  mylm <- lm(myFormula, data = vdf)
  u <- (model.matrix(mylm)[, ])
  unms <- colnames(u)
  colnames(u) <- unms
  lvx <- length(voxmats)
  predictormatrixnames <- colnames(u)[colnames(u) %in% matnames]
  myks <- which(matnames %in% predictormatrixnames)
  v <- matrix(rnorm(ncol(u) * p, 1, 1), nrow = p, ncol = ncol(u)) * 0.0
  if (hasRanEff) {
    vRan <- matrix(rnorm(ncol(zRan) * p, 1, 1), nrow = p, ncol = ncol(zRan)) * 0.0
    dedrv <- vRan * 0
    colnames(vRan) <- ranEffNames
  }
  colnames(v) <- unms
  hasIntercept <- "(Intercept)" %in% colnames(v)
  if (hasIntercept) dospar <- 2:ncol(v) else dospar <- 1:ncol(v)
  for (i in 1:p) {
    if (length(myks) > 0) {
      for (k in 1:length(predictormatrixnames)) {
        u[, predictormatrixnames[k]] <- voxmats[[myks[k]]][, i]
      }
    }
    tu <- t(u)
    tuu <- t(u) %*% u
    if (outcomeisconstant) {
      myoc <- vdf[, outcomevarnum]
    } else {
      myoc <- voxmats[[outcomevarnum]][, i]
    }
    term2 <- tu %*% myoc
    v[i, ] <- (tuu %*% v[i, ] - term2)
  }
  v <- as.matrix(smoothingMatrix %*% v)
  if (!missing(sparsenessQuantile)) {
    v <- orthogonalizeAndQSparsify(v, sparsenessQuantile, positivity,
      orthogonalize = milrorth
    )
  }
  dedv <- v * 0
  predicted <- voxmats[[1]] * 0
  # now fix dedv with the correct voxels
  for (iter in 1:iterations) {
    err <- 0
    for (i in 1:p) {
      if (length(myks) > 0) {
        for (k in 1:length(predictormatrixnames)) {
          u[, predictormatrixnames[k]] <- voxmats[[myks[k]]][, i]
        }
      }
      tu <- t(u)
      tuu <- t(u) %*% u
      if (outcomeisconstant) {
        myoc <- vdf[, outcomevarnum]
      } else {
        myoc <- voxmats[[outcomevarnum]][, i]
      }
      term2 <- tu %*% myoc
      dedv[i, ] <- tuu %*% v[i, ] - term2
      if (hasRanEff) dedv[i, ] <- dedv[i, ] + (tu %*% zRan) %*% vRan[i, ]
      predicted[, i] <- u %*% (v[i, ])
      if (hasRanEff) predicted[, i] <- predicted[, i] + zRan %*% vRan[i, ]
      # based on this explanation
      # https://m-clark.github.io/docs/mixedModels/mixedModels.html
      # the energy term for the regression model is:
      #   norm( y - uvt ) =>  grad update is wrt v is  tuu * v - tu * y
      # the energy term for the mixed regression model is:
      #   norm( y - uvt - z_ran v_rant ) =>
      # this derivative z_ran * ( y - uvt - z_ran v_rant  )
      #   grad update wrt v is     tuu * v    + tu * zRan * vRan - tu * y
      #   grad update wrt vran is  tzz * vran + tz * uvt - tz * y
      err <- err + mean(abs(myoc - predicted[, i]))
    }
    v <- v - dedv * gamma
    v <- as.matrix(smoothingMatrix %*% v)
    if (!missing(sparsenessQuantile)) {
      v <- orthogonalizeAndQSparsify(v, sparsenessQuantile, positivity, orthogonalize = milrorth)
    }
    gammamx <- gamma * 0.1 # go a bit slower
    # simplified model here
    #      dedrv = t( ( t(zRan) %*% zRan ) %*% t( vRanX ) ) # t1
    #      dedrv = dedrvX + t( ( t(zRan) %*%  umat ) %*% t( vmat1 )  ) # t2
    #      dedrv = dedrv -  t( t(zRan) %*% voxmats[[1]] ) # t3
    #      vRan = smoothingMatrix %*% ( vRan + dedrvX * gammamx )


    if (hasRanEff) {
      vRan <- as.matrix(smoothingMatrix %*% vRan)
      # update random effects
      for (i in 1:p) {
        if (length(myks) > 0) {
          for (k in 1:length(predictormatrixnames)) {
            u[, predictormatrixnames[k]] <- voxmats[[myks[k]]][, i]
          }
        }
        tu <- t(u)
        tuu <- t(u) %*% u
        if (outcomeisconstant) {
          myoc <- vdf[, outcomevarnum]
        } else {
          myoc <- voxmats[[outcomevarnum]][, i]
        }
        predicted[, i] <- u %*% (v[i, ]) + zRan %*% vRan[i, ]
        #   grad update wrt vran is  tzz * vran + tz * uvt - tz * y
        #        rterm2 = tz %*% ( myoc - predicted[ , i ] ) # was this a bug or typo?  need to check math
        rterm2 <- tz %*% (myoc)
        dedrv[i, ] <- (tz %*% u) %*% v[i, ] + tzz %*% vRan[i, ] - rterm2
      }
      vRan <- vRan - dedrv * gammamx
    }
    if (verbose) print(err / p)
  }
  colnames(v) <- unms
  return(list(u = u, v = v, prediction = predicted, vRan = vRan))
}





#' Predict from a milr output
#'
#' This function computes a prediction or low-dimensional embedding, given
#' \code{milr} output.  It will return a predictive model if the outcome variable
#' is a scalar.  Otherwise, it will return a low-dimensional embedding without
#' a specific predictive model.
#'
#' @param milrResult The output from \code{milr}.
#' @param dataFrameTrain This data frame contains all relevant predictors
#' in the training data except for the matrices associated with the image variables.
#' @param voxmatsTrain The named list of matrices that contains the changing predictors.
#' @param dataFrameTest This data frame contains all relevant predictors
#' in the test data except for the matrices associated with the image variables.
#' @param voxmatsTest The named list of matrices that contains the changing predictors in test data.
#' @param myFormula This is a character string that defines a valid regression formula.
#' @return the predicted matrix.
#' @author BB Avants.
#' @examples
#'
#' nsub <- 24
#' npix <- 100
#' outcome <- rnorm(nsub)
#' covar <- rnorm(nsub)
#' mat <- replicate(npix, rnorm(nsub))
#' mat2 <- replicate(npix, rnorm(nsub))
#' mat3 <- replicate(npix, rnorm(nsub))
#' myform <- " vox2 ~ covar + vox + vox3 "
#' istr <- c(rep(TRUE, round(nsub * 2 / 3)), rep(FALSE, nsub - round(nsub * 2 / 3)))
#' df <- data.frame(outcome = outcome, covar = covar)
#' ltr <- list(vox = mat[istr, ], vox2 = mat2[istr, ], vox3 = mat3[istr, ])
#' lte <- list(vox = mat[!istr, ], vox2 = mat2[!istr, ], vox3 = mat3[!istr, ])
#' result <- milr(df[istr, ], ltr, myform)
#' pred <- milr.predict(result, df[istr, ], ltr, df[!istr, ], lte, myform)
#'
#' @seealso \code{\link{milr}}
#' @export milr.predict
milr.predict <- function(
    milrResult,
    dataFrameTrain,
    voxmatsTrain,
    dataFrameTest,
    voxmatsTest,
    myFormula) {
  matnames <- names(voxmatsTrain)
  vdf <- data.frame(dataFrameTrain)
  vdfTe <- data.frame(dataFrameTest)
  if (length(matnames) == 0) stop("please name the input list entries")
  outcomevarname <- trimws(unlist(strsplit(myFormula, "~"))[1])
  outcomevarnum <- which(outcomevarname == matnames)
  outcomeisconstant <- FALSE
  if (length(outcomevarnum) == 0) {
    outcomeisconstant <- TRUE
    outcomevarnum <- which(colnames(vdf) == outcomevarname)
  }
  # first build a unified training and testing dataFrame
  n <- nrow(voxmatsTrain[[1]])
  p <- ncol(voxmatsTrain[[1]])
  for (k in 1:length(voxmatsTrain)) {
    vdf <- cbind(vdf, voxmatsTrain[[k]][, 1])
    names(vdf)[ncol(vdf)] <- matnames[k]
    if (ncol(voxmatsTrain[[k]]) != p) {
      stop(paste("train matrix ", matnames[k], " does not have ", p, "entries"))
    }
    vdfTe <- cbind(vdfTe, voxmatsTest[[k]][, 1])
    names(vdfTe)[ncol(vdfTe)] <- matnames[k]
    if (ncol(voxmatsTest[[k]]) != p) {
      stop(paste("test matrix ", matnames[k], " does not have ", p, "entries"))
    }
  }

  # get names from the standard lm
  temp <- summary(lm(myFormula, data = vdf))
  myrownames <- rownames(temp$coefficients)
  mylm <- lm(myFormula, data = vdf)
  u <- model.matrix(mylm)
  unms <- colnames(u[, ])
  u[, -1] <- scale(u[, -1])
  colnames(u) <- unms
  lvx <- length(voxmatsTrain)
  predictormatrixnames <- colnames(u)[colnames(u) %in% matnames]
  myks <- which(matnames %in% predictormatrixnames)
  # compute low-dimensional representations from the milr result for train-test
  if (length(myks) > 0) {
    for (k in 1:length(predictormatrixnames)) {
      vdf[, predictormatrixnames[k]] <-
        voxmatsTrain[[myks[k]]] %*% milrResult$v[, predictormatrixnames[k]]
      vdfTe[, predictormatrixnames[k]] <-
        voxmatsTest[[myks[k]]] %*% milrResult$v[, predictormatrixnames[k]]
    }
  }
  if (outcomevarname %in% matnames) {
    vdf <- voxmatsTrain[[outcomevarname]] %*% milrResult$v
    vdfTe <- voxmatsTest[[outcomevarname]] %*% milrResult$v
    return(
      list(
        predictionTrain = NA,
        predictionTest = NA,
        lowDimensionalProjectionTrain = vdf,
        lowDimensionalProjectionTest = vdfTe
      )
    )
  } else {
    trmdl <- lm(myFormula, data = vdf)
    return(
      list(
        predictionTrain = predict(trmdl),
        predictionTest = predict(trmdl, newdata = vdfTe),
        lowDimensionalProjectionTrain = vdf[, predictormatrixnames],
        lowDimensionalProjectionTest = vdfTe[, predictormatrixnames]
      )
    )
  }
}




#' Efficiently compute a multivariate, image-based (mixed) linear decomposition (mild)
#'
#' This function simplifies calculating image-wide multivariate PCA maps.
#' The model will minimize a matrix energy similar to
#' norm( X - UVt - UranVrant ) where the U
#' are standard design and random effect (intercept) design matrices.  The
#' random intercept matrix is only included if repeated measures are indicated.
#'
#' @param dataFrame This data frame contains all relevant predictors except for
#' the matrices associated with the image variables.
#' @param voxmats The named list of matrices that contains the changing predictors.
#' @param basisK an integer determining the size of the basis.
#' @param myFormulaK This is a character string that defines a valid regression
#' formula, which in this case should include predictors named as \code{paste0("mildBasis",1:basisK)}
#' @param smoothingMatrix allows parameter smoothing, should be square and same
#' size as input matrix
#' @param iterations number of gradient descent iterations
#' @param gamma step size for gradient descent
#' @param sparsenessQuantile quantile to control sparseness - higher is sparser
#' @param positivity restrict to positive or negative solution (beta) weights.
#' choices are positive, negative or either as expressed as a string.
#' @param initializationStrategy optional initialization matrix or seed.  The
#' seed should be a single number; the matrix should be an n by k matrix.
#' @param repeatedMeasures list of repeated measurement identifiers. this will
#' allow estimates of per identifier intercept.
#' @param orthogonalize boolean to control whether we orthogonalize the v
#' @param verbose boolean to control verbosity of output
#' @return A list of different matrices that contain names derived from the
#' formula and the coefficients of the regression model.
#' @author BB Avants.
#' @examples
#'
#' set.seed(1500)
#' nsub <- 12
#' npix <- 100
#' outcome <- rnorm(nsub)
#' covar <- rnorm(nsub)
#' mat <- replicate(npix, rnorm(nsub))
#' mat2 <- replicate(npix, rnorm(nsub))
#' nk <- 3
#' myform <- paste(
#'   " vox2 ~ covar + vox + ",
#'   paste0("mildBasis", 1:nk, collapse = "+")
#' ) # optional covariates
#' df <- data.frame(outcome = outcome, covar = covar)
#' result <- mild(df, list(vox = mat, vox2 = mat2),
#'   basisK = 3, myform,
#'   initializationStrategy = 10
#' )
#' result <- mild(df, list(vox = mat, vox2 = mat2),
#'   basisK = 3, myform,
#'   initializationStrategy = 4
#' )
#' myumat <- svd(mat2, nv = 0, nu = 3)$u
#' result <- mild(df, list(vox = mat, vox2 = mat2),
#'   basisK = 3, myform,
#'   initializationStrategy = 0
#' )
#'
#' @seealso \code{\link{milr}}
#' @export mild
mild <- function(dataFrame, voxmats, basisK,
                 myFormulaK, smoothingMatrix,
                 iterations = 10, gamma = 1.e-6,
                 sparsenessQuantile = 0.5,
                 positivity = c("positive", "negative", "either"),
                 initializationStrategy = 0,
                 repeatedMeasures = NA,
                 orthogonalize = FALSE,
                 verbose = FALSE) {
  positivity <- positivity[1]
  mildorth <- orthogonalize
  vdf <- data.frame(dataFrame)
  matnames <- names(voxmats)
  matnorm <- norm(voxmats[[1]])
  if (length(matnames) == 0) stop("please name the input list entries")
  n <- nrow(voxmats[[1]])
  p <- ncol(voxmats[[1]])
  if (missing(smoothingMatrix)) smoothingMatrix <- diag(p)
  poschoices <- c("positive", "negative", "either", TRUE, FALSE)
  if (!missing(positivity)) {
    if (sum(positivity == poschoices) != 1 | length(positivity) != 1) {
      stop("choice of positivity parameter is not good - see documentation")
    }
    if (positivity == TRUE) positivity <- "positive"
    if (positivity == FALSE) positivity <- "either"
  }
  for (k in 1:length(voxmats)) {
    vdf <- cbind(vdf, voxmats[[k]][, 1])
    names(vdf)[ncol(vdf)] <- matnames[k]
    if (ncol(voxmats[[k]]) != p) {
      stop(paste("matrix ", matnames[k], " does not have ", p, "entries"))
    }
  }
  outcomevarname <- trimws(unlist(strsplit(myFormulaK, "~"))[1])
  outcomevarnum <- which(outcomevarname == matnames)
  if (is.numeric(initializationStrategy)) {
    set.seed(initializationStrategy)
    initializationStrategy <- scale(qr.Q(qr(
      replicate(basisK, rnorm(nrow(voxmats[[1]])))
    )))
  }
  if (!is.matrix(initializationStrategy)) {
    stop("Please set valid initializationStrategy.")
  }
  for (k in 1:basisK) {
    if (k == 1) { # check matrix size
      stopifnot(nrow(initializationStrategy) == nrow(vdf))
      stopifnot(ncol(initializationStrategy) == basisK)
    }
    initvec <- initializationStrategy[, k]
    vdf <- cbind(vdf, initvec)
    names(vdf)[ncol(vdf)] <- paste0("mildBasis", k)
  }
  # augment the formula with the k-basis
  # get names from the standard lm
  temp <- summary(lm(myFormulaK, data = vdf))
  myrownames <- rownames(temp$coefficients)
  knames <- myrownames[grep("mildBasis", myrownames)]
  mypvs <- matrix(rep(NA, p * length(myrownames)),
    nrow = length(myrownames)
  )
  myestvs <- mypvs
  myervs <- mypvs
  mytvs <- mypvs
  outcomeisconstant <- FALSE
  if (length(outcomevarnum) == 0) {
    outcomeisconstant <- TRUE
    outcomevarnum <- which(colnames(vdf) == outcomevarname)
  }
  hasRanEff <- FALSE
  vRan <- NA
  if (!any(is.na(repeatedMeasures))) {
    hasRanEff <- TRUE
    usubs <- unique(repeatedMeasures)
    if (length(repeatedMeasures) != nrow(dataFrame)) {
      stop("The length of the repeatedMeasures vector should equal the number of rows in the data frame.")
    }
    ranEff <- factor(repeatedMeasures)
    temp <- lm(rnorm(nrow(dataFrame)) ~ ranEff)
    temp <- model.matrix(temp)
    ranEffNames <- colnames(temp)
    ranEffNames[1] <- paste0("ranEff", as.character(levels(ranEff)[1]))
    temp[ranEff == levels(ranEff)[1], 1] <- 1
    temp[ranEff != levels(ranEff)[1], 1] <- 0
    zRan <- (temp[, ])
    colnames(zRan) <- ranEffNames
    tz <- t(zRan)
    tzz <- tz %*% zRan
    rm(ranEff)
  }
  mylm <- lm(myFormulaK, data = vdf)
  u <- (model.matrix(mylm)[, ])
  unms <- colnames(u)
  colnames(u) <- unms
  lvx <- length(voxmats)
  predictormatrixnames <- colnames(u)[colnames(u) %in% matnames]
  myks <- which(matnames %in% predictormatrixnames)
  v <- t(voxmats[[outcomevarnum]]) %*% u
  if (hasRanEff) {
    vRan <- matrix(rnorm(ncol(zRan) * p, 1, 1), nrow = p, ncol = ncol(zRan)) * 0.0
    dedrv <- vRan * 0
    colnames(vRan) <- ranEffNames
  }
  colnames(v) <- unms
  hasIntercept <- "(Intercept)" %in% colnames(v)
  if (hasIntercept) dospar <- 2:ncol(v) else dospar <- 1:ncol(v)

  for (i in 1:p) {
    if (length(myks) > 0) {
      for (k in 1:length(predictormatrixnames)) {
        u[, predictormatrixnames[k]] <- voxmats[[myks[k]]][, i]
      }
    }
    tu <- t(u)
    tuu <- t(u) %*% u
    if (outcomeisconstant) {
      myoc <- vdf[, outcomevarnum]
    } else {
      myoc <- voxmats[[outcomevarnum]][, i] / matnorm
    }
    term2 <- tu %*% myoc
    v[i, ] <- (tuu %*% v[i, ] - term2) * 0.01
  }
  v <- as.matrix(smoothingMatrix %*% v)
  v <- orthogonalizeAndQSparsify(v, sparsenessQuantile, positivity,
    orthogonalize = mildorth
  )
  dedv <- v * 0
  predicted <- voxmats[[1]] * 0
  # now fix dedv with the correct voxels
  for (iter in 1:iterations) {
    err <- 0
    v <- as.matrix(smoothingMatrix %*% v)
    for (i in 1:p) {
      if (length(myks) > 0) {
        for (k in 1:length(predictormatrixnames)) {
          u[, predictormatrixnames[k]] <- voxmats[[myks[k]]][, i]
        }
      }
      tu <- t(u)
      tuu <- t(u) %*% u
      if (outcomeisconstant) {
        myoc <- vdf[, outcomevarnum]
      } else {
        myoc <- voxmats[[outcomevarnum]][, i] / matnorm
      }
      term2 <- tu %*% myoc
      dedv[i, ] <- tuu %*% v[i, ] - term2
      if (hasRanEff) dedv[i, ] <- dedv[i, ] + (tu %*% zRan) %*% vRan[i, ]
      predicted[, i] <- u %*% (v[i, ])
      if (hasRanEff) predicted[, i] <- predicted[, i] + zRan %*% vRan[i, ]
      err <- err + mean(abs(myoc - predicted[, i]))
    }
    v <- v - dedv * gamma
    v <- as.matrix(smoothingMatrix %*% v)
    if (!missing(sparsenessQuantile)) {
      v <- orthogonalizeAndQSparsify(v, sparsenessQuantile, positivity,
        orthogonalize = mildorth
      )
    }
    gammamx <- gamma * 0.1 # go a bit slower
    if (hasRanEff) {
      vRan <- as.matrix(smoothingMatrix %*% vRan)
      # update random effects
      for (i in 1:p) {
        if (length(myks) > 0) {
          for (k in 1:length(predictormatrixnames)) {
            u[, predictormatrixnames[k]] <- voxmats[[myks[k]]][, i]
          }
        }
        tu <- t(u)
        tuu <- t(u) %*% u
        if (outcomeisconstant) {
          myoc <- vdf[, outcomevarnum]
        } else {
          myoc <- voxmats[[outcomevarnum]][, i] / matnorm
        }
        predicted[, i] <- u %*% (v[i, ]) + zRan %*% vRan[i, ]
        rterm2 <- tz %*% (myoc)
        dedrv[i, ] <- (tz %*% u) %*% v[i, ] + tzz %*% vRan[i, ] - rterm2
      }
      vRan <- vRan - dedrv * gammamx
    }
    # reset the u variables
    if (iter < iterations) {
      for (k in 1:length(knames)) {
        u[, knames[k]] <- (voxmats[[outcomevarnum]] %*% v[, knames[k]]) / matnorm
      }
      u[, knames] <- qr.Q(qr(u[, knames]))
    }
    if (verbose) print(err / p)
  }
  colnames(v) <- unms
  return(list(u = u, v = v, prediction = predicted, vRan = vRan))
}


.simlr3 <- function(dataFrame,
                    voxmats,
                    basisK,
                    myFormulaK,
                    smoothingMatrixX,
                    smoothingMatrixY,
                    iterations = 10, gamma = 1.e-6,
                    sparsenessQuantileX = 0.5,
                    sparsenessQuantileY = 0.5,
                    positivityX = c("positive", "negative", "either"),
                    positivityY = c("positive", "negative", "either"),
                    initializationStrategyX = list(seed = 0, matrix = NA, voxels = NA),
                    initializationStrategyY = list(seed = 0, matrix = NA, voxels = NA),
                    repeatedMeasures = NA,
                    verbose = FALSE) {
  ################################
  for (k in 1:length(voxmats)) {
    voxmats[[k]] <- scale(voxmats[[k]])
    voxmats[[k]] <- voxmats[[k]] / norm(voxmats[[k]])
  }
  myorth <- function(u) {
    vecorth <- function(v1, v2) {
      ni <- sum(v1 * v1)
      nip1 <- sum(v2 * v1)
      if (ni > 0) ratio <- nip1 / ni else ratio <- 1
      #    if ( cor( v1, v2 ) == 1 ) return( rnorm( length( v1 )))
      v2 <- v2 - v1 * ratio
      return(v2)
    }
    nc <- ncol(u)
    for (k in 1:(nc - 1)) {
      vi <- u[, k]
      for (i in (k + 1):nc) {
        vip1 <- u[, i]
        vip1 <- vecorth(vi, vip1)
        mynorm <- sum(vip1 * vip1)
        u[, i] <- vip1 / mynorm
      }
    }
    #  print( cor( u ) )
    return(u)
  }

  myorth2 <- function(x) {
    qr.Q(qr(x))
  }
  myorth3 <- function(u = NULL, basis = TRUE, norm = TRUE) {
    if (is.null(u)) {
      return(NULL)
    }
    if (!(is.matrix(u))) {
      u <- as.matrix(u)
    }
    p <- nrow(u)
    n <- ncol(u)
    if (prod(abs(La.svd(u)$d) > 1e-08) == 0) {
      stop("colinears vectors")
    }
    if (p < n) {
      warning("too much vectors to orthogonalize.")
      u <- as.matrix(u[, 1:p])
      n <- p
    }
    if (basis & (p > n)) {
      base <- diag(p)
      coef.proj <- crossprod(u, base) / diag(crossprod(u))
      base2 <- base - u %*% matrix(coef.proj, nrow = n, ncol = p)
      norm.base2 <- diag(crossprod(base2))
      base <- as.matrix(base[, order(norm.base2) > n])
      u <- cbind(u, base)
      n <- p
    }
    v <- u
    if (n > 1) {
      for (i in 2:n) {
        coef.proj <- c(crossprod(u[, i], v[, 1:(i - 1)])) / diag(crossprod(v[
          ,
          1:(i - 1)
        ]))
        v[, i] <- u[, i] - matrix(v[, 1:(i - 1)], nrow = p) %*%
          matrix(coef.proj, nrow = i - 1)
      }
    }
    if (norm) {
      coef.proj <- 1 / sqrt(diag(crossprod(v)))
      v <- t(t(v) * coef.proj)
    }
    return(v)
  }
  ################################
  orthogonalizeBasis <- TRUE
  n <- nrow(voxmats[[1]])
  p <- ncol(voxmats[[1]])
  q <- ncol(voxmats[[2]])
  if (missing(smoothingMatrixX)) smoothingMatrixX <- diag(p)
  if (missing(smoothingMatrixY)) smoothingMatrixY <- diag(q)
  xmatname <- names(voxmats)[1]
  ymatname <- names(voxmats)[2]
  formx <- paste(xmatname, myFormulaK)
  formy <- paste(ymatname, myFormulaK)
  locits <- 1
  ########################
  mildx <- mild(dataFrame,
    voxmats[1], basisK, formx, smoothingMatrixX,
    iterations = 1, gamma = gamma,
    sparsenessQuantile = sparsenessQuantileX,
    positivity = positivityX[[1]],
    initializationStrategy = initializationStrategyX,
    repeatedMeasures = repeatedMeasures,
    verbose = FALSE
  )
  colinds <- (ncol(mildx$u) - basisK + 1):ncol(mildx$u)
  ########################
  mildy <- mild(dataFrame,
    voxmats[2], basisK, formy, smoothingMatrixY,
    iterations = 1, gamma = gamma,
    sparsenessQuantile = sparsenessQuantileY,
    positivity = positivityY[[1]],
    initializationStrategy = initializationStrategyY,
    repeatedMeasures = repeatedMeasures,
    verbose = FALSE
  )
  xOrth <- mildy$u[, -1]
  yOrth <- mildx$u[, -1]
  #    xOrth = svd( antsrimpute( voxmats[[2]] %*% mildy$v[,-1] ) )$u
  #    yOrth = svd( antsrimpute( voxmats[[1]] %*% mildx$v[,-1] ) )$u
  # mildy$v = matrix(  rnorm( length( mildy$v ) ), nrow = nrow( mildy$v ) )
  # mildx$v = matrix(  rnorm( length( mildx$v ) ), nrow = nrow( mildx$v ) )
  for (i in 1:iterations) {
    if (orthogonalizeBasis == TRUE) {
      xOrthN <- myorth(voxmats[[2]] %*% mildy$v[, -1])
      yOrthN <- myorth(voxmats[[1]] %*% mildx$v[, -1])
      if (i == 1) {
        xOrth <- xOrthN
        yOrth <- yOrthN
      } else {
        wt1 <- 0.5
        wt2 <- 1.0 - wt1
        xOrth <- xOrth * wt1 + xOrthN * wt2
        yOrth <- yOrth * wt1 + yOrthN * wt2
      }
    }
    xOrth <- yOrth
    xorthinds <- c((ncol(xOrth) - basisK + 1):ncol(xOrth))
    colnames(xOrth[, xorthinds]) <- colnames(mildx$u[, colinds])
    colnames(yOrth[, xorthinds]) <- colnames(mildx$u[, colinds])
    dataFramex <- cbind(dataFrame, xOrth[, xorthinds])
    dataFramey <- cbind(dataFrame, yOrth[, xorthinds])
    dfinds <- c((ncol(dataFramex) - basisK + 1):ncol(dataFramex))
    colnames(dataFramex)[dfinds] <- colnames(mildx$u[, colinds])
    colnames(dataFramey)[dfinds] <- colnames(mildy$u[, colinds])
    mildx <- milr(dataFramex,
      voxmats[1], formx, smoothingMatrixX,
      iterations = locits, gamma = gamma * (1),
      sparsenessQuantile = sparsenessQuantileX,
      positivity = positivityX[[1]],
      repeatedMeasures = repeatedMeasures,
      verbose = FALSE
    )

    mildy <- milr(dataFramey,
      voxmats[2], formy, smoothingMatrixY,
      iterations = locits, gamma = gamma * (1),
      sparsenessQuantile = sparsenessQuantileY,
      positivity = positivityY[[1]],
      repeatedMeasures = repeatedMeasures,
      verbose = FALSE
    )
    ###############################################
    p1 <- antsrimpute(voxmats[[1]] %*% mildx$v[, -1])
    p2 <- antsrimpute(voxmats[[2]] %*% mildy$v[, -1])
    locor <- cor(p1, p2)
    overall <- mean(abs(diag(locor)))
    if (verbose & i > 0) {
      print(paste("it:", i - 1, "=>", overall))
      #    print( ( locor ) )
      #    print( diag( locor ) )
    }
  }
  return(list(simlrX = mildx, simlrY = mildy))


  if (FALSE) {
    xv <- mildx$v
    yv <- mildy$v
    xvup <- t(xOrth) %*% voxmats[[1]]
    yvup <- t(yOrth) %*% voxmats[[2]]
    xvup[, -colinds] <- 0
    yvup[, -colinds] <- 0
    xvup <- xvup / norm(xvup)
    yvup <- yvup / norm(yvup)
    # now make the above sparse
    xvup <- as.matrix(xvup[, ] %*% smoothingMatrixX)
    xv <- xv + t(xvup) * gamma
    xv <- as.matrix(smoothingMatrixX %*% xv[, ])
    xv <- orthogonalizeAndQSparsify(xv, sparsenessQuantileX, positivityX[[1]])
    xv <- as.matrix(smoothingMatrixX %*% xv[, ])

    # now make the above sparse
    yvup <- as.matrix(yvup[, ] %*% smoothingMatrixY)
    yv <- yv + t(yvup) * gamma
    yv <- as.matrix(smoothingMatrixY %*% yv[, ])
    yv <- orthogonalizeAndQSparsify(yv, sparsenessQuantileY, positivityY[[1]])
    yv <- as.matrix(smoothingMatrixY %*% yv[, ])

    mildy$u[, colinds] <- yOrth[, colinds] <- scale(voxmats[[1]] %*% (xv))[, colinds]
    mildx$u[, colinds] <- xOrth[, colinds] <- scale(voxmats[[2]] %*% (yv))[, colinds]
  } # end if
}





#' Initialize simlr
#'
#' Several initialization approaches for simlr.  Returns either a single matrix
#' derived from dimensionality reduction on all matrices bound together
#' (\code{jointReduction=TRUE}) or a list of reduced-dimensionality matrices,
#' one for each input.  Primarily a helper function for SiMLR.
#'
#' @param voxmats list that contains the named matrices.
#' @param k rank of U matrix
#' @param jointReduction boolean determining whether one or length of list bases are returned.
#' @param zeroUpper boolean determining whether upper triangular part of
#' initialization is zeroed out
#' @param uAlgorithm one of \code{"svd"} (the default), \code{"random"},
#' \code{"randomProjection"}, \code{"eigenvalue"}, \code{"pca"}, \code{"ica"} or \code{"cca"}
#' @param addNoise scalar; if greater than zero, zero-mean, unit-variance noise
#' multiplied by this value is added to the initialization
#'
#' @return A single matrix or list of matrices
#' @author BB Avants.
#' @examples
#'
#' set.seed(1500)
#' nsub <- 3
#' npix <- c(10, 6, 13)
#' nk <- 2
#' outcome <- initializeSimlr(
#'   list(
#'     matrix(rnorm(nsub * npix[1]), ncol = npix[1]),
#'     matrix(rnorm(nsub * npix[2]), ncol = npix[2]),
#'     matrix(rnorm(nsub * npix[3]), ncol = npix[3])
#'   ),
#'   k = 2, uAlgorithm = "pca"
#' )
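#' # a hedged sketch: a single shared basis across modalities via joint reduction
#' outcomeJoint <- initializeSimlr(
#'   list(
#'     matrix(rnorm(nsub * npix[1]), ncol = npix[1]),
#'     matrix(rnorm(nsub * npix[2]), ncol = npix[2])
#'   ),
#'   k = 2, jointReduction = TRUE, uAlgorithm = "pca"
#' )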
#'
#' @export
initializeSimlr <- function(
    voxmats, k, jointReduction = FALSE,
    zeroUpper = FALSE, uAlgorithm = "svd", addNoise = 0) {
  nModalities <- length(voxmats)
  localAlgorithm <- uAlgorithm
  if (uAlgorithm == "avg") {
    uAlgorithm <- "svd"
    localAlgorithm <- "svd"
  }
  if (uAlgorithm == "randomProjection" & jointReduction) {
    jointReduction <- FALSE
  }
  if (jointReduction) {
    X <- Reduce(cbind, voxmats) # bind all matrices
    if (uAlgorithm == "pca" | uAlgorithm == "svd") {
      X.pcr <- stats::prcomp(t(X), rank. = k, scale. = uAlgorithm == "svd") # PCA
      u <- (X.pcr$rotation)
    } else if (uAlgorithm == "ica") {
      u <- t(fastICA::fastICA(t(X), method = "C", n.comp = k)$A)
    } else if (uAlgorithm == "cca") {
      X <- Reduce(cbind, voxmats[-1]) # bind all matrices
      #      u = randcca( voxmats[[1]], X, k )$svd$u
      u <- sparseDecom2(list(voxmats[[1]], X),
        sparseness = c(0.5, 0.5), nvecs = k, its = 3, ell1 = 0.1
      )$projections2
    } else {
      u <- replicate(k, rnorm(nrow(voxmats[[1]])))
    }
    if (addNoise > 0) u <- u + replicate(k, rnorm(nrow(voxmats[[1]]))) * addNoise
    if (zeroUpper) u[upper.tri(u)] <- 0
    return(u)
  }
  uOut <- list()
  uRand <- replicate(k, rnorm(nrow(voxmats[[1]])))
  for (s in 1:nModalities) {
    X <- Reduce(cbind, voxmats[-s])
    if (localAlgorithm == "pca " | localAlgorithm == "svd") {
      uOut[[s]] <- (stats::prcomp(t(X), rank. = k, scale. = uAlgorithm == "svd")$rotation)
    } else if (localAlgorithm == "ica") {
      uOut[[s]] <- t(fastICA::fastICA(t(X), method = "C", n.comp = k)$A)
    } else if (localAlgorithm == "cca") {
      uOut[[s]] <- sparseDecom2(list(X, voxmats[[s]]),
        sparseness = c(0.5, 0.5), nvecs = k, its = 3, ell1 = 0.1
      )$projections2
    } else if (localAlgorithm == "randomProjection") {
      uOut[[s]] <- t((t(uRand) %*% voxmats[[s]]) %*% t(voxmats[[s]]))
      uOut[[s]] <- uOut[[s]] / norm(uOut[[s]], "F")
    } else {
      uOut[[s]] <- (stats::prcomp(t(X), rank. = k, scale. = uAlgorithm == "svd")$rotation)
    }
    if (addNoise > 0) uOut[[s]] <- uOut[[s]] + replicate(k, rnorm(nrow(voxmats[[1]]))) * addNoise
    if (zeroUpper) uOut[[s]][upper.tri(uOut[[s]])] <- 0
  }
  return(uOut)
}

#' Automatically produce regularization matrices for simlr
#'
#' @param x A list that contains the named matrices.
#' Note: the optimization will likely perform much more smoothly if the input
#' matrices are each scaled to zero mean unit variance e.g. by the \code{scale} function.
#' Note: x may also contain a mixture of raw matrix data and masks which are
#' binary antsImages. If a mask is passed, this function will assume the user
#' wants spatial regularization for that entry.
#' @param knn A vector of knn values (integers, same length as matrices)
#' @param fraction optional single scalar value to determine knn
#' @param sigma optional sigma vector for regularization (same length as matrices)
#' @param kPackage name of package to use for knn.  FNN is reproducible but
#' RcppHNSW is much faster (with nthreads controlled by the environment variable
#' ITK_GLOBAL_DEFAULT_NUMBER_OF_THREADS) for larger problems.  For large problems,
#' compute the regularization once and save to disk; load for repeatability.
#' @return A list of regularization matrices.
#' @author BB Avants.
#' @examples
#' # see simlr examples
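#' # a hedged sketch on small random matrices; knn is derived from fraction
#' mats <- list(
#'   a = scale(matrix(rnorm(50 * 20), 50, 20)),
#'   b = scale(matrix(rnorm(50 * 30), 50, 30))
#' )
#' regs <- regularizeSimlr(mats, fraction = 0.2, sigma = c(10, 10))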
#' @export
regularizeSimlr <- function(x, knn, fraction = 0.1, sigma, kPackage = "FNN") {
  getSpatialRegularization <- function(inmask, myk, mysig) {
    spatmat <- t(imageDomainToSpatialMatrix(inmask, inmask))
    regs <- knnSmoothingMatrix(spatmat,
      k = myk,
      sigma = mysig, kPackage = "FNN"
    )
    return(regs)
  }

  if (missing(knn)) {
    knn <- rep(NA, length(x))
    for (i in 1:length(x)) {
      temp <- round(fraction * ncol(x[[i]]))
      if (temp < 3) temp <- 3
      knn[i] <- temp
    }
  }
  if (missing(sigma)) sigma <- rep(10, length(x))
  slist <- list()
  for (i in 1:length(x)) {
    if (inherits(x[[i]], "antsImage")) {
      slist[[i]] <- getSpatialRegularization(x[[i]], knn[i], sigma[i])
    } else {
      slist[[i]] <- knnSmoothingMatrix(scale(data.matrix(x[[i]]), T, T),
        k = knn[i],
        sigma = sigma[i], kPackage = kPackage
      )
    }
  }
  return(slist)
}


#' Predict output matrices from a simlr solution
#'
#' @param x A list that contains the named matrices.
#' @param simsol the simlr solution
#' @param targetMatrix an optional index that sets a fixed target (outcome) matrix.
#' If not set, each basis set will be used to predict its corresponding matrix.
#' @param sourceMatrices an optional index vector that sets predictor matrices.
#' If not set, each basis set will be used to predict its corresponding matrix.
#' @param projectv boolean determining whether raw \code{u} or \code{x * v} is used;
#' defaults to TRUE, which uses the \code{x * v} approach
#' @return A list of variance explained, predicted matrices and error metrics:
#' \itemize{
#'   \item{varx: }{Mean variance explained for \code{u_i} predicting \code{x_i}.}
#'   \item{predictions: }{Predicted \code{x_i} matrices.}
#'   \item{initialErrors: }{Initial matrix norm.}
#'   \item{finalErrors: }{Matrix norm of the error term.}
#'   \item{aggregateTstats: }{Summary t-stats for each matrix and each \code{u_i}.}
#'   \item{uOrder: }{Estimated order of the \code{u_i} predictors aggregated over all terms.}
#' }
#' @author BB Avants.
#' @examples
#' # see simlr examples
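#' # a hedged sketch, assuming matlist and result from the simlr example:
#' # pred <- predictSimlr(matlist, result)
#' # pred$varx # mean variance explained per modality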
#' @export
predictSimlr <- function(x, simsol, targetMatrix, sourceMatrices, projectv = TRUE) {
  if (missing(sourceMatrices)) {
    sourceMatrices <- 1:length(x)
  } else {
    if (!any(sourceMatrices %in% 1:length(x))) {
      stop("sourceMatrices are not within the range of the number of input matrices")
    }
  }
  varxfull <- list()
  initialErrors <- rep(0, length(sourceMatrices))
  finalErrors <- rep(0, length(sourceMatrices))
  predictions <- list()
  sortOrder <- list()
  if (projectv) {
    for (jj in 1:length(simsol$u)) {
      simsol$u[[jj]] <- x[[jj]] %*% simsol$v[[jj]]
    }
  }
  myBetas <- matrix(rep(0, ncol(simsol$u[[1]]) * length(sourceMatrices)),
    ncol = ncol(simsol$u[[1]])
  )
  myBetasQ5 <- myBetas
  ct <- 1
  for (i in 1:length(sourceMatrices)) {
    if (missing(targetMatrix)) mytm <- i else mytm <- targetMatrix
    mdl <- lm(x[[mytm]] ~ simsol$u[[sourceMatrices[i]]])
    predictions[[i]] <- predict(mdl)
    smdl <- summary(mdl)
    blm <- bigLMStats(mdl, 0.0001)
    myBetas[i, ] <- rowMeans(abs(blm$beta.t))
    q5betas <- abs(blm$beta.t)
    q5betas[q5betas < quantile(q5betas, 0.5, na.rm = TRUE)] <- NA
    myBetasQ5[i, ] <- rowMeans(q5betas, na.rm = T)
    varxj <- rep(NA, length(smdl))
    for (j in 1:length(smdl)) {
      varxj[j] <- smdl[[j]]$r.squared
    }
    varxfull[[i]] <- varxj
    finalErrors[i] <- norm(predictions[[i]] - x[[mytm]], "F")
    initialErrors[i] <- norm(x[[mytm]], "F")
  }
  varx <- unlist(lapply(varxfull, FUN = "mean"))
  uOrder <- order(colSums(myBetas), decreasing = TRUE)
  list(
    varx = varx,
    varxfull = varxfull,
    predictions = predictions,
    initialErrors = initialErrors,
    finalErrors = finalErrors,
    aggregateTstats = myBetas,
    tstatsQ5 = myBetasQ5,
    rawTstats = blm$beta.t,
    uOrder = uOrder
  )
}



#' Similarity-driven multiview linear reconstruction model (simlr) for N modalities
#'
#' simlr minimizes reconstruction error across related modalities.  That is,
#' simlr will reconstruct each modality matrix from a basis set derived from
#' the other modalities.  The basis set can be derived from SVD, ICA or a
#' simple sum of basis representations.
#' This function produces dataset-wide multivariate beta maps for each of the
#' related matrices.  The multivariate beta maps are regularized by user
#' input matrices that encode relationships between variables.  The idea is
#' overall similar to canonical correlation analysis but generalizes both the
#' basis construction and the extension to arbitrary numbers of modalities.
#'
#' @param voxmats A list that contains the named matrices.  Note: the optimization will likely perform much more smoothly if the input matrices are each scaled to zero mean unit variance e.g. by the \code{scale} function.
#' @param smoothingMatrices list of (sparse) matrices that allow parameter smoothing/regularization.  These should be square and same order and size of input matrices.
#' @param iterations number of gradient descent iterations
#' @param sparsenessQuantiles vector of quantiles to control sparseness - higher is sparser
#' @param positivities vector that sets for each matrix if we restrict to positive or negative solution (beta) weights.
#' choices are positive, negative or either as expressed as a string.
#' @param initialUMatrix list of initialization matrices, each of size \code{n}
#' by \code{k}, one per modality.  Otherwise, pass a single scalar to control the
#' number of basis functions, in which case random initialization occurs. One
#' may also pass a single initialization matrix to be used for all matrices.
#' If this is set to a scalar, or is missing, a random matrix will be used.
#' @param mixAlg 'svd', 'ica', 'rrpca-l', 'rrpca-s', 'stochastic', 'pca' or 'avg' denotes the algorithm employed when estimating the mixed modality bases
#' @param orthogonalize boolean to control whether we orthogonalize the solutions explicitly
#' @param repeatedMeasures list of repeated measurement identifiers. this will
#' allow estimates of per identifier intercept.
#' @param lineSearchRange lower and upper limit used in \code{optimize}
#' @param lineSearchTolerance tolerance used in \code{optimize}, will be multiplied by each matrix norm such that it scales appropriately with input data
#' @param randomSeed controls repeatability of ica-based decomposition
#' @param constraint one of none, Grassmann or Stiefel
#' @param energyType one of regression, normalized, lowRank, cca, ucca or lowRankRegression
#' @param vmats optional initial \code{v} matrix list
#' @param connectors a list (of length equal to the number of projections or
#' modalities) that indicates which modalities should be paired with the current modality
#' @param optimizationStyle one of \code{c("lineSearch","mixed","greedy")}
#' @param scale options to standardize each matrix, e.g. divide by the square root
#' of its number of variables (Westerhuis, Kourti, and MacGregor 1998), divide
#' by the number of variables, center, or center and scale (see the code for the
#' full set of options).  Can be a vector, in which case each strategy is applied in order.
#' @param expBeta if greater than zero, use exponential moving average on gradient.
#' @param jointInitialization boolean for initialization options, default TRUE
#' @param sparsenessAlg string that sets the NMF algorithm used to estimate V,
#' or \code{basicrank} or \code{orthorank}
#' @param verbose boolean to control verbosity of output - set to level \code{2}
#' in order to see more output, specifically the gradient descent parameters.
#' @return A list of \code{u}, \code{v} and related matrices.
#' @author BB Avants.
#' @examples
#'
#' set.seed(1500)
#' nsub <- 25
#' npix <- c(100, 200, 133)
#' nk <- 5
#' outcome <- matrix(rnorm(nsub * nk), ncol = nk)
#' outcome1 <- matrix(rnorm(nsub * nk), ncol = nk)
#' outcome2 <- matrix(rnorm(nsub * nk), ncol = nk)
#' outcome3 <- matrix(rnorm(nsub * nk), ncol = nk)
#' view1tx <- matrix(rnorm(npix[1] * nk), nrow = nk)
#' view2tx <- matrix(rnorm(npix[2] * nk), nrow = nk)
#' view3tx <- matrix(rnorm(npix[3] * nk), nrow = nk)
#' mat1 <- (outcome %*% t(outcome1) %*% (outcome1)) %*% view1tx
#' mat2 <- (outcome %*% t(outcome2) %*% (outcome2)) %*% view2tx
#' mat3 <- (outcome %*% t(outcome3) %*% (outcome3)) %*% view3tx
#' matlist <- list(vox = mat1, vox2 = mat2, vox3 = mat3)
#' result <- simlr(matlist)
#' p1 <- mat1 %*% (result$v[[1]])
#' p2 <- mat2 %*% (result$v[[2]])
#' p3 <- mat3 %*% (result$v[[3]])
#' regs <- regularizeSimlr(matlist)
#' result2 <- simlr(matlist)
#' pred1 <- predictSimlr(matlist, result)
#' pred2 <- predictSimlr(matlist, result2)
#'
#' # compare to permuted data
#' s1 <- sample(1:nsub)
#' s2 <- sample(1:nsub)
#' resultp <- simlr(list(vox = mat1, vox2 = mat2[s1, ], vox3 = mat3[s2, ]))
#' p1p <- mat1 %*% (resultp$v[[1]])
#' p2p <- mat2[s1, ] %*% (resultp$v[[2]])
#' p3p <- mat3[s2, ] %*% (resultp$v[[3]])
#'
#' # compare to SVD
#' svd1 <- svd(mat1, nu = nk, nv = 0)$u
#' svd2 <- svd(mat2, nu = nk, nv = 0)$u
#' svd3 <- svd(mat3, nu = nk, nv = 0)$u
#'
#' # real
#' range(cor(p1, p2))
#' range(cor(p1, p3))
#' range(cor(p3, p2))
#'
#' # permuted
#' range(cor(p1p, p2p))
#' range(cor(p1p, p3p))
#' range(cor(p3p, p2p))
#'
#' # svd
#' print(range(cor(svd1, svd2)))
#'
#' resultp <- simlr(list(vox = mat1, vox2 = mat2[s1, ], vox3 = mat3[s2, ]),
#'   initialUMatrix = nk, verbose = TRUE, iterations = 5,
#'   energyType = "normalized"
#' )
#' @seealso \code{\link{milr}} \code{\link{mild}} \code{\link{simlrU}}
#' @export simlr
simlr <- function(
    voxmats,
    smoothingMatrices,
    iterations = 10,
    sparsenessQuantiles,
    positivities,
    initialUMatrix,
    mixAlg = c("svd", "ica", "avg", "rrpca-l", "rrpca-s", "pca", "stochastic"),
    orthogonalize = FALSE,
    repeatedMeasures = NA,
    lineSearchRange = c(-1e10, 1e10),
    lineSearchTolerance = 1e-8,
    randomSeed,
    constraint = c("none", "Grassmann", "Stiefel"),
    energyType = c("cca", "regression", "normalized", "ucca", "lowRank", "lowRankRegression"),
    vmats,
    connectors = NULL,
    optimizationStyle = c("lineSearch", "mixed", "greedy"),
    scale = c("centerAndScale", "sqrtnp", "np", "center", "norm", "none", "impute", "eigenvalue", "robust"),
    expBeta = 0.0,
    jointInitialization = TRUE,
    sparsenessAlg = NA,
    verbose = FALSE) {
  if (missing(scale)) scale <- c("centerAndScale")
  if (missing(energyType)) energyType <- "cca"
  if (missing(mixAlg)) mixAlg <- "svd"
  if (missing(optimizationStyle)) optimizationStyle <- "lineSearch"
  if (!missing("randomSeed")) set.seed(randomSeed) #  else set.seed( 0 )
  energyType <- match.arg(energyType)
  constraint <- match.arg(constraint)
  optimizationStyle <- match.arg(optimizationStyle)
  scaleList <- c()
  if (length(scale) == 1) {
    scaleList[1] <- match.arg(scale[1], choices = c(
      "sqrtnp", "np", "centerAndScale",
      "norm", "none", "impute", "eigenvalue", "center", "robust"
    ))
  }
  if (length(scale) > 1) {
    for (kk in 1:length(scale)) {
      scaleList[kk] <- match.arg(scale[kk],
        choices = c(
          "sqrtnp", "np", "centerAndScale", "norm", "none",
          "impute", "eigenvalue", "center", "robust"
        )
      )
    }
  }
  # \sum_i \| X_i - \sum_{j \ne i} u_j v_i^T \|^2 + \| G_i \star v_i \|_1
  # \sum_i \| X_i - \sum_{j \ne i} u_j v_i^T - z_r v_r^T \|^2 + constraints
  #
  constrainG <- function(vgrad, i, constraint) {
    if (constraint == "Grassmann") {
      # grassmann manifold - see https://stats.stackexchange.com/questions/252633/optimization-with-orthogonal-constraints
      # Edelman, A., Arias, T. A., & Smith, S. T. (1998). The geometry of algorithms with orthogonality constraints. SIAM journal on Matrix Analysis and Applications, 20(2), 303-353.
      projjer <- diag(ncol(vgrad)) - t(vmats[[i]]) %*% vmats[[i]]
      return(vgrad %*% projjer)
    }
    if (constraint == "Stiefel") { # stiefel manifold
      vgrad <- vgrad - vmats[[i]] %*% (t(vgrad) %*% (vmats[[i]]))
    }
    return(vgrad)
  }

  normalized <- FALSE
  ccaEnergy <- FALSE
  nModalities <- length(voxmats)
  if (missing(connectors)) {
    temp <- 1:nModalities
    connectors <- list()
    for (i in temp) {
      connectors[[i]] <- temp[-i]
    }
  }
  if (energyType == "normalized") {
    normalized <- TRUE
    ccaEnergy <- FALSE
  }
  if (energyType == "cca" | energyType == "ucca") {
    normalized <- FALSE
    ccaEnergy <- TRUE
  }
  mixAlg <- match.arg(mixAlg)
  # 0.0 adjust length of input data
  gamma <- rep(1, nModalities)
  if (missing(positivities)) {
    positivities <- rep("positive", nModalities)
  }
  if (any((positivities %in% c("positive", "negative", "either")) == FALSE)) {
    stop("positivities must be one of positive, negative, either")
  }
  if (length(positivities) == 1) {
    positivities <- rep(positivities[1], nModalities)
  }
  matnames <- p <- rep(NA, nModalities)
  for (i in 1:nModalities) {
    p[i] <- ncol(voxmats[[i]])
  }
  n <- nrow(voxmats[[1]])
  if (missing(sparsenessQuantiles)) {
    sparsenessQuantiles <- rep(0.5, nModalities)
  }

  # 1.0 adjust matrix norms
  if (!(any(scaleList == "none"))) {
    for (i in 1:nModalities) {
      if (is.null(voxmats[[i]])) {
        stop(paste("input matrix", i, "is null."))
      }
      matnames[i] <- names(voxmats)[i]
      if (any(is.na(voxmats[[i]]))) {
        if ("impute" %in% scaleList) { # mean-impute only when requested
          voxmats[[i]][is.na(voxmats[[i]])] <- mean(voxmats[[i]], na.rm = TRUE)
        } else {
          stop(paste("input matrix", i, "has NA values; pass scale='impute' to mean-impute."))
        }
      }
      for (j in 1:length(scaleList)) {
        if (scaleList[j] == "norm") {
          voxmats[[i]] <- voxmats[[i]] / norm(voxmats[[i]], type = "F")
        }
        if (scaleList[j] == "np") {
          voxmats[[i]] <- voxmats[[i]] / prod(dim(voxmats[[i]]))
        }
        if (scaleList[j] == "sqrtnp") {
          voxmats[[i]] <- voxmats[[i]] / sqrt(prod(dim(voxmats[[i]])))
        }
        if (scaleList[j] == "center") {
          voxmats[[i]] <- base::scale(voxmats[[i]], center = TRUE, scale = FALSE)
        }
        if (scaleList[j] == "centerAndScale") {
          voxmats[[i]] <- base::scale(voxmats[[i]], center = TRUE, scale = TRUE)
        }
        if (scaleList[j] == "eigenvalue") {
          voxmats[[i]] <- voxmats[[i]] / sum(svd(voxmats[[i]])$d)
        }
        if (scaleList[j] == "robust") {
          voxmats[[i]] <- robustMatrixTransform(voxmats[[i]])
        }
      }
    }
  }

  # 2.0 setup regularization
  if (missing(smoothingMatrices)) {
    smoothingMatrices <- list()
    for (i in 1:nModalities) {
      smoothingMatrices[[i]] <- diag(ncol(voxmats[[i]]))
    }
  }
  for (i in 1:length(smoothingMatrices)) {
    if (any(is.null(smoothingMatrices[[i]])) | any(is.na(smoothingMatrices[[i]]))) {
      stop(paste("smoothing-matrix", i, "is null or NA."))
    }
    smoothingMatrices[[i]] <- smoothingMatrices[[i]] /
      Matrix::rowSums(smoothingMatrices[[i]])
  }
  # some gram schmidt code
  localGS <- function(x, orthogonalize = TRUE) {
    if (!orthogonalize) {
      return(x)
    }
    n <- dim(x)[1]
    m <- dim(x)[2]
    q <- matrix(0, n, m)
    r <- matrix(0, m, m)
    qi <- x[, 1]
    si <- sqrt(sum(qi^2))
    if (si < .Machine$double.eps) si <- 1
    q[, 1] <- qi / si
    r[1, 1] <- si
    for (i in 2:m) {
      xi <- x[, i]
      qj <- q[, 1:(i - 1)]
      rj <- t(qj) %*% xi
      qi <- xi - qj %*% rj
      r[1:(i - 1), i] <- rj
      si <- sqrt(sum(qi^2))
      if (si < .Machine$double.eps) si <- 1
      q[, i] <- qi / si
      r[i, i] <- si
    }
    return(q)
  }
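  # sanity sketch (hypothetical): localGS returns an orthonormal basis, e.g.
  #   x <- matrix(rnorm(20), 5, 4); max(abs(crossprod(localGS(x)) - diag(4))) # ~ 0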
  randmat <- 0

  # 3.0 setup initialization: a bare integer (default: nModalities) requests a
  # data-driven basis with that many components via initializeSimlr below
  if (missing(initialUMatrix)) {
    initialUMatrix <- nModalities
  }

  if (is.matrix(initialUMatrix)) {
    randmat <- initialUMatrix
    initialUMatrix <- list()
    for (i in 1:nModalities) {
      initialUMatrix[[i]] <- randmat
    }
  } else if (is.numeric(initialUMatrix)) {
    if (jointInitialization) {
      temp <- initializeSimlr(voxmats, initialUMatrix, uAlgorithm = mixAlg, jointReduction = jointInitialization)
      initialUMatrix <- list()
      for (i in 1:nModalities) initialUMatrix[[i]] <- temp
    } else {
      initialUMatrix <- initializeSimlr(voxmats, initialUMatrix, uAlgorithm = mixAlg, jointReduction = jointInitialization)
    }
  }

  if (length(initialUMatrix) != nModalities &
    !is.matrix(initialUMatrix)) {
    message(paste("initializing with random matrix: ", initialUMatrix, "columns"))
    randmat <- scale(
      ((matrix(rnorm(n * initialUMatrix), nrow = n))), TRUE, TRUE
    )
    initialUMatrix <- list()
    for (i in 1:nModalities) {
      initialUMatrix[[i]] <- randmat
    }
  }

  basisK <- ncol(initialUMatrix[[1]])
  buildV <- missing(vmats) || is.null(vmats)
  if (buildV) {
    if (verbose) print("     <0> BUILD-V <0> BUILD-V <0> BUILD-V <0> BUILD-V <0>    ")
    vmats <- list()
    for (i in 1:nModalities) {
      vmats[[i]] <- t(voxmats[[i]]) %*% initialUMatrix[[i]]
      # 0 # svd( temp, nu=basisK, nv=0 )$u
      # for ( kk in 1:nModalities ) vmats[[ i ]] = vmats[[ i ]] + t(voxmats[[i]]) %*% initialUMatrix[[kk]]
      #      vmats[[ i ]] = # svd( vmats[[ i ]], nu=basisK, nv=0 )$u
      #        ( stats::prcomp( vmats[[ i ]], retx=TRUE, rank.=basisK, scale.=TRUE )$x )
      vmats[[i]] <- vmats[[i]] / norm(vmats[[i]], "F")
    }
  }

  nc <- ncol(initialUMatrix[[1]])
  myw <- matrix(rnorm(nc^2), nc, nc) # initialization for fastICA
  getSyME2 <- function(lineSearch, gradient, myw, mixAlg,
                       avgU, whichModality, verbose = FALSE) {
    prediction <- 0
    myenergysearchv <- (vmats[[whichModality]] + gradient * lineSearch) # update the i^th v matrix
    if (verbose) {
      print(paste("getSyME2", whichModality))
      print(dim(vmats[[whichModality]]))
    }
    if (sparsenessQuantiles[whichModality] != 0) {
      myenergysearchv <- orthogonalizeAndQSparsify( # make sparse
        as.matrix(smoothingMatrices[[whichModality]] %*% myenergysearchv),
        sparsenessQuantiles[whichModality],
        orthogonalize = FALSE,
        positivity = positivities[whichModality],
        softThresholding = TRUE,
        sparsenessAlg = sparsenessAlg
      )
    }

    if (ccaEnergy) {
      # cca energy: -tr( abs( u' * X * v ) ) / ( norm2( X * v ) * norm2( u ) ), negated for minimization
      t0 <- norm(voxmats[[whichModality]] %*% myenergysearchv, "F")
      t1 <- norm(avgU, "F")
      mynorm <- -1.0 / (t0 * t1)
      if (verbose) {
        print("CCA")
        print(dim(avgU))
        print(dim(voxmats[[whichModality]]))
      }
      return(mynorm *
        sum(abs(diag(t(avgU) %*% (voxmats[[whichModality]] %*% myenergysearchv)))))
      # t(avgU/t1) %*% ( (voxmats[[whichModality]] %*% myenergysearchv) /t0 ) )) )
    }

    # ACC tr( abs( U' * X * V ) ) / ( norm2(U)^0.5 * norm2( X * V )^0.5 )

    # low-dimensional error approximation
    if (energyType == "lowRank") {
      vpro <- voxmats[[whichModality]] %*% (myenergysearchv)
      # energy = norm( scale(avgU,F,F)  - scale(vpro,F,F), "F" )
      energy <- norm(avgU / norm(avgU, "F") - vpro / norm(vpro, "F"), "F")
      return(energy)
    }
    #
    # low-dimensional error approximation
    if (energyType == "lowRankRegression") {
      vpro <- voxmats[[whichModality]] %*% (myenergysearchv)
      energy <- norm(avgU - vpro, "F")
      return(energy)
    }

    prediction <- avgU %*% t(myenergysearchv)
    # match the prediction's column means to the data (sweep over columns)
    prediction <- sweep(prediction, 2, colMeans(prediction) - colMeans(voxmats[[whichModality]]), "-")
    if (!normalized) energy <- norm(prediction - voxmats[[whichModality]], "F")
    if (normalized) energy <- norm(prediction / norm(prediction, "F") - voxmats[[whichModality]] / norm(voxmats[[whichModality]], "F"), "F")
    return(energy)
  }
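  # note: getSyME2( 0, 0, ... ) evaluates the energy at the current v (zero
  # step); optimize() below treats it as a 1-d function of the step length
  # along a fixed gradient direction.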

  energyPath <- matrix(Inf, nrow = iterations, ncol = nModalities)
  initialEnergy <- 0
  for (i in 1:nModalities) {
    loki <- getSyME2(0, 0,
      myw = myw, mixAlg = mixAlg,
      avgU = initialUMatrix[[i]],
      whichModality = i
    )
    #    energyPath[1,i] = loki
    initialEnergy <- initialEnergy + loki / nModalities
  }
  bestU <- initialUMatrix
  bestV <- vmats
  totalEnergy <- c(initialEnergy)
  if (verbose) {
    print(paste(
      "initialDataTerm:", initialEnergy,
      " <o> mixer:", mixAlg, " <o> E: ", energyType
    ))
  }

  getSyMGnorm <- function(v, i, myw, mixAlg) {
    # norm2( X/norm2(X) - U * V'/norm2(U * V') )^2
    u <- initialUMatrix[[i]]
    x <- voxmats[[i]]
    t0 <- v %*% t(u)
    t1 <- u %*% t(v)
    t2 <- norm(t1, "F")
    t3 <- t(x) / norm(x, "F") - t0 / norm(t0, "F")
    tr <- function(x) sum(diag(x))
    gradV <- -2.0 / t2 * t3 %*% u +
      2.0 / t2^3 * tr(t1 %*% t3) * (t0 %*% u)
    return(gradV)
  }
  wm <- "matrix"
  if (ccaEnergy) wm <- "ccag"
  getSyMGccamse <- function(v, i, myw, mixAlg, whichModel = wm) {
    u <- initialUMatrix[[i]]
    x <- voxmats[[i]]
    if (whichModel == "matrix") {
      if (energyType == "lowRankRegression") {
        xv <- x %*% (v)
        return(1.0 / norm(xv - u, "F") * t(x) %*% (xv - u))
      }
      if (energyType == "lowRank") {
        #          term1 = 2.0 * t( x ) %*% ( u - x %*% v ) #  norm2( U - X * V )^2
        if (TRUE) {
          t0 <- x %*% v
          t1 <- norm(t0, "F")
          t2 <- t((u) / norm(u, "F") - (t0) / t1)
          tr <- function(x) sum(diag(x))
          vgrad1 <- -2.0 / t1 * (t2 %*% x)
          lowx <- (t(x) %*% (x %*% v))
          vgrad2 <- (2.0 / t1^3 * tr((t2) %*% (t0)) * t(lowx))
          return(t(vgrad2 + vgrad1))
        }
        return(term1)
      } else {
        term1 <- 2.0 * (t(x) %*% u - (v %*% t(u)) %*% u)
      } #  norm2( X - U * V' )^2
      term2 <- 0
      if (i %in% connectors[[i]]) { # d/dV ( norm2( X - X * V * V' )^2 ) => reconstruction energy
        temp <- (x - (x %*% v) %*% t(v)) # n by p
        term2 <- (t(temp) %*% (x %*% v)) -
          t(x) %*% (temp %*% v)
        term1 <- term1 * 0.5
      }
      return(term1 + term2) # + sign( v ) * 0.001 # pure data-term
    }
    if (whichModel == "regression") {
      mdl <- lm(x ~ u)
      intercept <- coefficients(mdl)[1, ]
      return(t(coefficients(mdl)[-1, ]) * 0.05) # move toward these coefficients
    }
    if (whichModel == "ccag") {
      # gradient of the CCA-style objective used in getSyME2:
      # tr( abs( v'*X'*u ) ) / ( norm2( X*v ) * norm2( u ) )
      # (a plain-trace variant without abs() was superseded by this form)
      subg <- function(x, u, v) {
        t0 <- norm(x %*% v, "F")
        t1 <- norm(u, "F")
        t2 <- t(u) %*% (x %*% v)
        mytr <- sum(abs(diag(t2)))
        signer <- t2 * 0
        diag(signer) <- sign(diag(t2))
        return(1.0 / (t0 * t1) * (t(x) %*% u) %*% signer -
          1.0 / (t0^3 * t1) * mytr * (t(x) %*% (x %*% v)) +
          sign(v) * 0.0)
      }

      #        detg <- function( x, u, v ) {
      #          t0 = x %*% v
      #          t1 = norm( t0, "F" )
      #          t2 = norm( u, "F" )
      #          term1 = 1.0/(t1*t2) * ( t(x) %*% u ) %*% matlib::adjoint( t(t0) %*% u )
      #          term2 = det( t( u ) %*% ( x %*% v ) ) / ( t1^3 * t2 ) * ( t(x) %*% ( x %*% v ) )
      #          return( term1 - term2 )
      #          }

      if (energyType == "cca") {
        return(subg(x, u, v))
      }
      gradder <- v * 0
      for (j in 1:nModalities) {
        if (j != i) {
          gradder <- gradder +
            subg(x, scale(voxmats[[j]] %*% vmats[[j]], TRUE, TRUE), v)
        }
      }
      return(gradder / (nModalities - 1))
    }
    if (whichModel == "ccag2") { # THIS IS WIP / not recommended
      # tr( (x'*X'/norm2(X*x ))*(Y/norm2( Y )))
      t0 <- x %*% v
      poster <- t(t(x) %*% t0)
      preposter <- (t(u) %*% (t0))
      t1 <- norm(t0, "F")
      t2 <- norm(u, "F")
      mytr <- sum(diag(t(u) %*% (x %*% v)))
      (1.0 / (t1 * t2) * (t(x) %*% u) -
        1.0 / (t1^3 * t2) * t(preposter %*% poster)) * 0.5
    }
  }
  if (normalized) getSyMG <- getSyMGnorm else getSyMG <- getSyMGccamse

  lineSearchLogic <- function(x) { # energy path is input
    nna <- sum(!is.na(x))
    if (nna <= 1) {
      return(TRUE)
    }
    xx <- na.omit(x)
    if ((xx[length(xx) - 1] > xx[length(xx)])) {
      return(FALSE)
    }
    return(TRUE)
    # mdl = loess( xx ~ as.numeric(1:length(xx)) ) # for slope estimate
  }

  optimizationLogic <- function(energy, iteration, i) {
    if (optimizationStyle == "greedy" & iteration < 3) {
      return(TRUE)
    }
    if (optimizationStyle == "greedy" & iteration >= 3) {
      return(FALSE)
    }
    if (optimizationStyle == "mixed") {
      return(lineSearchLogic(energy[, i]) | iteration < 3)
    }
    if (optimizationStyle == "lineSearch") {
      return(TRUE)
    }
  }
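  # behavioral sketch: "lineSearch" always optimizes the step size; "greedy"
  # line-searches only on the first two iterations, then halves the previous
  # step; "mixed" line-searches early and whenever the per-modality energy
  # has stopped improving.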
  ################################################################################
  # below is the primary optimization loop - grad for v then for vran
  ################################################################################
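  # each iteration: (1) compute the (optionally manifold-constrained) gradient
  # for each v_i, (2) choose a step via line search or a decayed greedy step,
  # (3) smooth and sparsify the updated v_i, (4) rebuild each u_i from the
  # other modalities via simlrU, (5) record energies and keep the best (u, v).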
  datanorm <- rep(0.0, nModalities)
  bestTot <- Inf
  lastV <- list()
  lastG <- list()
  for (myit in 1:iterations) {
    errterm <- rep(1.0, nModalities)
    matrange <- 1:nModalities
    for (i in matrange) { # get update for each V_i
      if (ccaEnergy) {
        vmats[[i]] <- vmats[[i]] / norm(vmats[[i]], "F")
        initialUMatrix[[i]] <- initialUMatrix[[i]] / norm(initialUMatrix[[i]], "F")
      }
      # initialize gradient line search
      temperv <- getSyMG(vmats[[i]], i, myw = myw, mixAlg = mixAlg)
      temperv <- constrainG(temperv, i, constraint = constraint)

      useAdam <- FALSE
      if (useAdam) { # completely experimental hack that may be improved/used in future for batch opt
        if (myit == 1 & i == 1) {
          m <- list()
          v <- list()
        }
        beta_1 <- 0.9
        beta_2 <- 0.99
        if (myit <= 1) {
          m[[i]] <- temperv * 0
          v[[i]] <- temperv * 0
        }
        m[[i]] <- beta_1 * m[[i]] + (1 - beta_1) * temperv
        v[[i]] <- beta_2 * v[[i]] + (1 - beta_2) * temperv^2.0
        m_hat <- m[[i]] / (1 - beta_1^myit)
        v_hat <- v[[i]] / (1 - beta_2^myit)
      }

      if (expBeta > 0) {
        if (myit == 1) lastG[[i]] <- 0
        temperv <- temperv * (1.0 - expBeta) + lastG[[i]] * (expBeta)
        lastG[[i]] <- temperv
      }
      if (optimizationLogic(energyPath, myit, i)) {
        temp <- optimize(getSyME2, # computes the energy
          interval = lineSearchRange,
          tol = lineSearchTolerance,
          gradient = temperv,
          myw = myw, mixAlg = mixAlg,
          avgU = initialUMatrix[[i]], whichModality = i
        )
        errterm[i] <- temp$objective
        gamma[i] <- temp$minimum
      } else {
        gamma[i] <- gamma[i] * 0.5
        errterm[i] <- getSyME2(
          gamma[i], temperv,
          myw = myw, mixAlg = mixAlg,
          avgU = initialUMatrix[[i]],
          whichModality = i
        )
      }
      if (errterm[i] <= min(energyPath[, i], na.rm = TRUE) ||
        optimizationStyle == "greedy") # ok to update
        {
          if (!useAdam) vmats[[i]] <- (vmats[[i]] + (temperv) * gamma[i])
          if (useAdam) vmats[[i]] <- vmats[[i]] - gamma[i] * m_hat / (sqrt(v_hat) + 1) # adam
          if (sparsenessQuantiles[i] != 0) {
            vmats[[i]] <- orthogonalizeAndQSparsify(
              as.matrix(smoothingMatrices[[i]] %*% vmats[[i]]),
              sparsenessQuantiles[i],
              orthogonalize = FALSE, positivity = positivities[i],
              unitNorm = FALSE,
              softThresholding = TRUE,
              sparsenessAlg = sparsenessAlg
            )
          }
          if (normalized) vmats[[i]] <- vmats[[i]] / norm(vmats[[i]], "F")
        }
      if (ccaEnergy) {
        vmats[[i]] <- vmats[[i]] / norm(vmats[[i]], "F")
        initialUMatrix[[i]] <- initialUMatrix[[i]] / norm(initialUMatrix[[i]], "F")
      }
    } # matrange
    if (verbose == 2) print(gamma)

    # run the basis calculation for each U_i - convert self to other;
    # center and scale the projections unless optimizing the cca energy
    nn <- !ccaEnergy
    for (jj in 1:nModalities) {
      initialUMatrix[[jj]] <- scale(voxmats[[jj]] %*% vmats[[jj]], nn, nn)
    }
    temp <- simlrU(initialUMatrix, mixAlg, myw,
      orthogonalize = orthogonalize,
      connectors = connectors
    )
    for (jj in 1:nModalities) {
      initialUMatrix[[jj]] <-
        localGS(temp[[jj]], orthogonalize = orthogonalize)
      # exponential averaging for u updates --- not tested
      #   initialUMatrix[[jj]] = initialUMatrix[[jj]] * 0.9 +
      #     localGS( temp[[jj]], orthogonalize = orthogonalize ) * 0.1
    }


    # evaluate new energies
    for (jj in 1:nModalities) {
      loki <- getSyME2(0, 0,
        myw = myw, mixAlg = mixAlg,
        avgU = initialUMatrix[[jj]],
        whichModality = jj, verbose = FALSE
      )
      energyPath[myit, jj] <- loki
    } # matrix loop
    if (myit == 1) {
      bestEv <- mean(energyPath[1, ], na.rm = TRUE)
      bestRow <- 1
    } else {
      bestEv <- min(rowMeans(energyPath[1:myit, ], na.rm = TRUE))
      bestRow <- which.min(rowMeans(energyPath[1:myit, ], na.rm = TRUE))
    }
    if (optimizationStyle == "greedy") {
      #      bestEv = ( mean( energyPath[(myit),], na.rm = TRUE  ) )
      bestRow <- myit
    }
    totalEnergy[myit] <- bestEv
    if (mean(energyPath[myit, ], na.rm = TRUE) <= bestEv |
      optimizationStyle == "greedy") {
      bestU <- initialUMatrix
      bestV <- vmats
    } else { # FIXME - open question - should we reset or not?
      #      initialUMatrix = bestU ; vmats = bestV
    }
    if (verbose > 0) {
      outputString <- paste("Iteration:", myit, "bestEv:", bestEv, "bestIt:", bestRow)
      #  if ( optimizationStyle == 'greedy' )
      outputString <- paste(outputString, "CE:", mean(energyPath[myit, ]))
      print(outputString)
    }
    if ((myit - bestRow - 1) >= 5) break # converged: no improvement for several iterations
  } # iterations

  energyPath <- na.omit(energyPath)
  return(
    list(
      u = bestU,
      v = bestV,
      initialRandomMatrix = randmat,
      energyPath = energyPath,
      finalError = bestEv,
      totalEnergy = totalEnergy,
      connectors = connectors,
      energyType = energyType,
      optimizationStyle = optimizationStyle
    )
  )
}
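
# A minimal usage sketch for simlr (hypothetical sizes; kept as a comment so
# sourcing this file has no side effects):
#   set.seed(2)
#   mats <- list(
#     matrix(rnorm(50 * 100), nrow = 50),
#     matrix(rnorm(50 * 80), nrow = 50)
#   )
#   res <- simlr(mats, iterations = 5, initialUMatrix = 4, verbose = TRUE)
#   embeddings <- lapply(1:2, function(i) mats[[i]] %*% res$v[[i]]) # n x k each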



#' Compute the low-dimensional u matrix for simlr
#'
#' simlr minimizes reconstruction error across related modalities.  One crucial
#' component of the reconstruction is the low-dimensional cross-modality basis.
#' This function computes that basis, given a mixing algorithm.
#'
#' @param projections A list that contains the low-dimensional projections.
#' @param mixingAlgorithm the selected mixing algorithm.  see \code{simlr}.  can
#' be 'svd', 'ica', 'rrpca-l', 'rrpca-s', 'pca', 'stochastic' or 'avg'.
#' @param initialW initialization matrix, size \code{k} by \code{k}, for fastICA.
#' @param orthogonalize boolean
#' @param connectors a list (of length equal to the number of projections,
#' i.e. modalities) indicating which modalities should be paired with each
#' modality
#' @return u matrix for modality i
#' @author BB Avants.
#' @examples
#'
#' set.seed(1500)
#' nsub <- 25
#' npix <- c(100, 200, 133)
#' nk <- 5
#' outcome <- matrix(rnorm(nsub * nk), ncol = nk)
#' outcome1 <- matrix(rnorm(nsub * nk), ncol = nk)
#' outcome2 <- matrix(rnorm(nsub * nk), ncol = nk)
#' u <- simlrU(list(outcome, outcome1, outcome2), "avg")
#'
#' @seealso \code{\link{simlr}}
#' @export

simlrU <- function(
    projections, mixingAlgorithm, initialW,
    orthogonalize = FALSE, connectors = NULL) {
  # normalize each projection; a local gram-schmidt helper follows below
  projectionsU <- projections
  for (kk in 1:length(projections)) {
    projectionsU[[kk]] <- projectionsU[[kk]] / norm(projectionsU[[kk]], "F")
  }
  if (!is.null(connectors)) {
    if (length(connectors) != length(projections)) {
      stop("length( connectors ) should equal length( projections )")
    }
  }
  localGS <- function(x, orthogonalize = TRUE) {
    if (!orthogonalize) {
      return(x)
    }
    n <- dim(x)[1]
    m <- dim(x)[2]
    q <- matrix(0, n, m)
    r <- matrix(0, m, m)
    qi <- x[, 1]
    si <- sqrt(sum(qi^2))
    if (si < .Machine$double.eps) si <- 1 # guard before dividing
    q[, 1] <- qi / si
    r[1, 1] <- si
    for (i in 2:m) {
      xi <- x[, i]
      qj <- q[, 1:(i - 1)]
      rj <- t(qj) %*% xi
      qi <- xi - qj %*% rj
      r[1:(i - 1), i] <- rj
      si <- sqrt(sum(qi^2))
      if (si < .Machine$double.eps) si <- 1
      q[, i] <- qi / si
      r[i, i] <- si
    }
    return(q)
  }
  subU <- function(
      projectionsU, mixingAlgorithm, initialW,
      orthogonalize, i, wtobind) {
    avgU <- NULL
    mixAlg <- mixingAlgorithm
    nComponents <- ncol(projectionsU[[1]])
    nmodalities <- length(projectionsU)
    if (missing(wtobind)) wtobind <- (1:nmodalities)[-i]
    if (mixAlg == "avg") {
      avgU <- projectionsU[[1]] * 0.0
      for (j in wtobind) {
        avgU <- avgU + projectionsU[[j]] / (nmodalities - 1)
      }
      return(avgU)
    }
    if (mixAlg == "stochastic") { # FIXME
      avgU <- Reduce(cbind, projectionsU[wtobind])
      G <- scale(replicate(nComponents, rnorm(nrow(avgU))), FALSE, FALSE)
      Y <- localGS(t(avgU) %*% G) # project onto random basis - no localGS because of numerical issues when self-mapping
      avgU <- scale(avgU %*% Y, FALSE, FALSE)
      return(avgU)
    }
    nc <- ncol(projectionsU[[1]])
    avgU <- Reduce(cbind, projectionsU[wtobind])
    if (mixAlg == "pca") {
      basis <- stats::prcomp(avgU, retx = FALSE, rank. = nc, scale. = TRUE)$rotation
    }
    if (mixAlg == "rrpca-l") {
      basis <- (rsvd::rrpca(avgU, rand = FALSE)$L[, 1:nc])
    } else if (mixAlg == "rrpca-s") {
      basis <- (rsvd::rrpca(avgU, rand = FALSE)$S[, 1:nc])
    } else if (mixAlg == "ica" & !missing(initialW)) {
      basis <- (fastICA::fastICA(avgU,
        method = "C", w.init = initialW,
        n.comp = nc
      )$S)
    } else if (mixAlg == "ica" & missing(initialW)) {
      basis <- (fastICA::fastICA(avgU, method = "C", n.comp = nc)$S)
    } else {
      basis <- (svd(avgU, nu = nc, nv = 0)$u)
    }
    if (!orthogonalize) {
      return(basis)
    }
    return(localGS(basis, orthogonalize = orthogonalize))
  }

  outU <- list()
  for (i in 1:length(projectionsU)) {
    if (is.null(connectors)) {
      outU[[i]] <- subU(projectionsU, mixingAlgorithm, initialW, orthogonalize, i)
    }
    if (!is.null(connectors)) {
      outU[[i]] <- subU(projectionsU, mixingAlgorithm, initialW, orthogonalize,
        i,
        wtobind = connectors[[i]]
      )
    }
  }
  return(outU)
}
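
# A hypothetical connectors sketch: the list below reproduces the default
# "all other modalities" pairing for three inputs; shrink an entry to
# restrict which modalities inform a given u:
#   u <- simlrU(list(outcome, outcome1, outcome2), "avg",
#     connectors = list(c(2, 3), c(1, 3), c(1, 2)))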