R/tess3.R

Defines functions plot.tess3Main rmse.tess3Main is.tess3Main summary.tess3Main tess3Main

Documented in is.tess3Main plot.tess3Main rmse.tess3Main summary.tess3Main tess3Main

#' Estimate ancestry coefficients and run genome scans for selection
#'
#' \code{tess3Main} estimates spatial population structure using a graph based non
#' negative matrix factorization. After estimating the population structure is used to
#' compute a Fst statistic for each locus. See references for more details.
#'
#' @param K an integer corresponding to
#' the number of ancestral populations.
#' @param ploidy an integer which corresponds to the ploidy of the number of copy of chromosomes.
#' @param lambda a nonnegative numeric which corresponds to the
#' spatial regularization parameter.
#' @param W a numeric matrix which corresponds to the graph weight matrix.
#' If \code{NULL}, W is computed as
#' \code{W[i,j] = exp(-(coord[i] - coord[j])^2 / sigma^2)},
#' where \code{coord[i]} is the set of geographic coordinates for individual i and
#' \code{sigma} equals 5 percent of the average geographic distance between individuals.
#' @param method \code{"projected.ls"} or \code{"qp"}. If \code{"projected.ls"},
#' an alternating projected least squares algorithm is used. If \code{"qp"},
#' an alternating quadratic programing algorithm is used. See references for details.
#' @param max.iteration the maximum number of
#' iterations in the optimization algorithm.
#' @param tolerance a numeric value which corresponds to the tolerance paramter in the
#' stopping criterion of the optimization algorithm.
#' @param X a numeric matrix which corresponds to the genotype matrix. This matrix
#' must be of size \eqn{n \times L} where \eqn{n} is the number of individuals and
#' \eqn{L} is the number of loci. Values of this matrix are integers corresponding
#' to the number of variant alleles observed at a locus. If \code{NULL}, \code{XProba}
#' is used.
#' @param openMP.core.num number of core used by the algorithm. It requires that openMP is
#' installed.
#' @param Q.init a numeric matrix which corresponds to the initial value of
#' \code{Q} for the algorithm.
#' @param coord a numeric matrix of size \eqn{n \times 2} where \eqn{n} is the
#' number of individuals. It contains the geographic coordinates (Longitude, Latitude) of
#' all sampled individuals.
#' @param mask if not \code{NULL}, this numeric value is the proportion of genotypic matrix entries
#' which are masked when computing the cross validation criterion.
#' @param algo.copy if TRUE, data will be copied in order to speed the algorithm.
#' @param copy if TRUE data will be copied once.
#' @param verbose If \code{TRUE} run information is printed.
#' @param XProba a numeric matrix which corresponds to genotype likelihoods (probabilities).
#' This matrix must be of size \eqn{n \times (ploidy + 1)L} where
#' \eqn{n} is the number of individuals and \eqn{L} is the number of loci. Entries of
#' this matrix are numeric values between 0 and 1 corresponding
#' to genotype probability. If \code{NULL}, this matrix is computed from the genotype matrix \code{X}.
#' See reference for details.
#'
#' @return An object of class tess3Main which is a list with the following attributes:
#' \describe{
#'    \item{L}{the number of loci}
#'    \item{n}{the number of individuals}
#'    \item{ploidy}{the number of copies of chromosomes.}
#'    \item{K}{the number of ancestral populations.}
#'    \item{G}{the ancestral genotype frequency matrix.}
#'    \item{Q}{the ancestry coefficient matrix.}
#'    \item{Fst}{Fst statistic computed at each locus.}
#'    \item{Fscore}{Fscores computed from the Fst statistics.}
#'    \item{pvalue}{pvalues computed from the Fscores.}
#'    \item{log.pvalue}{The \eqn{log(pvalue)}.}
#'    \item{rmse}{root square mean error between \code{XProba} and
#'    \code{tcrossprod(Q, G)}.}
#'    \item{crossentropy}{cross-entropy error between \code{XProba} and
#'    \code{tcrossprod(Q, G)}.}
#'   \item{crossvalid.rmse}{if masked is not \code{NULL}, root square mean error
#'   between \code{XProba[masked]} and \code{tcrossprod(Q, G)[masked]}.}
#'    \item{crossvalid.crossentropy}{if masked not \code{NULL},
#'    the cross-entropy error between \code{XProba[masked]} and
#'    \code{tcrossprod(Q, G)[masked]}.}
#' }
#'
#'
#' @export
#' @examples
#' library(tess3r)
#'
#' # Arabidopsis thaliana data set
#' data(data.at)
#' genotype <- data.at$X
#' coordinates <- data.at$coord
#'
#' # Run of tess3 main algorithm
#' tess3.obj <- tess3Main(X = genotype,
#'                       coord = coordinates,
#'                       K = 3,
#'                       method = "projected.ls",
#'                       ploidy = 1)
#'
#' # Run of tess3 main algorithm with cross validation errors computation.
#' tess3.obj <- tess3Main(X = genotype,
#'                       coord = coordinates,
#'                       K = 3,
#'                       method = "projected.ls",
#'                       ploidy = 1,
#'                       mask = 0.05)
#'
#'
#' @references \url{https://hal.archives-ouvertes.fr/hal-01222555/} \url{http://biorxiv.org/content/early/2016/10/12/080291}
#' @seealso \code{\link{tess3}}
tess3Main <- function(X,
                      XProba = NULL,
                      coord,
                      K,
                      ploidy,
                      lambda = 1.0,
                      W = NULL,
                      method = "projected.ls",
                      max.iteration = 200,
                      tolerance = 1e-5,
                      openMP.core.num = 1,
                      Q.init = NULL,
                      mask = 0.0,
                      copy = TRUE,
                      algo.copy = TRUE,
                      verbose = FALSE)
{
  # mem <- c()
  # mem <- c(mem, pryr::mem_used())
  res = list()

  ################################################
  # Init openMP
  InitOpenMP(openMP.core.num)

  ################################################

  # mem <- c(mem, pryr::mem_used())

  ################################################
  # copy X
  if (copy & !is.null(X)) {
    X <- as.matrix(X) # to handle type conversion
    X <- matrix(as.double(X), nrow(X), ncol(X)) # to ensure we have a matrix of double
    CheckX(X, ploidy)
  } else if (!copy & is.null(XProba)) {
    stop("To force the function not doing copy of the data, you must set XProba")
  }
  ################################################

  # mem <- c(mem,pryr::mem_used())

  ################################################
  # check type of input

  ## geographic.coordinate
  if (is.null(coord) && is.null(W)) {
    stop("If no graph matrix is specified, geographic coordinates must be provided as coord parameter")
  }

   ## K
  if (!is.numeric(K)) {
    stop("K must be numeric")
  }
  ## ploidy
  if (!is.numeric(ploidy)) {
    stop("ploidy must be numeric")
  }
  ## Q.init
  if (!is.null(Q.init)) {
    if (!is.matrix(Q.init) ) {
      stop("Q.init must be a matrix")
    }
  }
  ## method
  if (!is.character(method)) {
    stop("method must be a character string for the method to use")
  }
  if (method != "projected.ls" & method != "qp") {
    stop("method must be projected.ls or qp")
  }
  ## max.iteration
  if (!is.numeric(max.iteration)) {
    stop("max.iteration must be numeric")
  }
  ## tolerance
  if (!is.numeric(tolerance)) {
    stop("tolerance must be numeric")
  }

  if (mask > 1.0 | mask < 0.0) {
    stop("mask is the proportion of the genotype masked for cross validation and must between 0 and 1")
  }

  ################################################

  # mem <- c(mem,pryr::mem_used())

  ################################################
  # Check consistence of input

  # check coord
  CheckCoord(coord)

  # Compute W
  if (is.null(W)) {
    W <- ComputeHeatKernelWeight(coord, ComputeMeanDist(coord) * 0.05)
  }


  # Q.init
  if (!is.null(Q.init)) {
    if (nrow(Q.init) != nrow(X) | ncol(Q.init) != K) {
      stop("Q.init must be of dimensions (number of individuals) x K.")
    }
  }

  ################################################

  # mem <- c(mem,pryr::mem_used())

  ################################################
  # Compute parameters
  ## Laplacian
  CheckW(W)
  Lapl <- ComputeGraphLaplacian(W)

  ## Compute number of loci and indiv
  if (!is.null(X)) {
    res$L <- ncol(X)
    res$n <- nrow(X)
  } else if (!is.null(XProba)) {
    if (ncol(XProba) %% (ploidy + 1) != 0) {
      stop("Number of columns of XProba must be a multiple of ploidy + 1.")
    }
    res$L <- ncol(XProba) %/% (ploidy + 1)
    res$n <- nrow(coord)
  } else {
    stop("X or XProba must be non-null")
  }

  res$ploidy <- ploidy
  res$K <- K

  if (is.null(XProba)) {
    XProba <- matrix(0.0, res$n, res$L * (res$ploidy + 1))
    X2XBin(X, ploidy, XProba)
    rm(X)
  }
  CheckXBin(XProba, ploidy)
  CheckCoord(coord)
  CheckXBinCoord(XProba, coord)
  CheckW(W)
  CheckWCoord(W, coord)
  CheckXBinW(XProba, W)
  ################################################

  # mem <- c(mem,pryr::mem_used())

  ################################################
  # mask if asked
  if (mask != 0.0) {
    DebugMessage(paste("Mask ", mask, "% of genotypes for cross validation"))
    missing.index.X <- sample(1:(length(XProba)), length(XProba) * mask)
    masked.X.value <- XProba[missing.index.X]
    XProba[missing.index.X] <- NA
  }
  ################################################

  # mem <- c(mem,pryr::mem_used())

  ################################################
  # check if there is missing data and compute the binary representation
  if (any(is.na(XProba))) {
    DebugMessage("Missing value detected in genotype")
    # Naive imputation by mean
    geno.freq <- apply(XProba, 2, function(x) mean(x,na.rm = TRUE))
    geno.freq <- matrix(geno.freq,1)[rep(1, res$n),]
    XProba[is.na(XProba)] <- geno.freq[is.na(XProba)]
    rm(geno.freq)
  }
  ################################################

  # mem <- c(mem,pryr::mem_used())

  ################################################
  # compute Q and G matrix
  if (method == "qp") {
    res <- c(res, SolveTess3QP(XProba,
                               K,
                               ploidy,
                               Lapl,
                               lambda,
                               max.iteration = max.iteration,
                               tolerance = tolerance,
                               verbose = verbose))
  } else if (method == "projected.ls") {
    Lapl <- as.matrix(Lapl)
    # Q and G
    res$G <- matrix(0.0, nrow = (res$ploidy + 1) * res$L, ncol = res$K)
    if (!is.null(Q.init)) {
      res$Q <- Q.init
    } else {
      res$Q <- matrix(runif(res$n * K), res$n, res$K)
      res$Q <- ProjectQ(res$Q)
    }

    # mem <- c(mem,pryr::mem_used())
    if (!algo.copy) {
      ComputeMCPASolutionNoCopyX(X = XProba,
                                 K = K,
                                 Lapl = Lapl,
                                 lambdaPrim = lambda,
                                 D = ploidy + 1,
                                 maxIteration = max.iteration,
                                 tolerance = tolerance,
                                 Q = res$Q,
                                 G = res$G,
                                 verbose = verbose)
    } else {
      ComputeMCPASolution(X = XProba,
                          K = K,
                          Lapl = Lapl,
                          lambdaPrim = lambda,
                          D = ploidy + 1,
                          maxIteration = max.iteration,
                          tolerance = tolerance,
                          Q = res$Q,
                          G = res$G,
                          verbose = verbose)
    }

  } else {
    stop("Unknow method name")
  }
  class(res$Q) <- c("tess3Q", class(res$Q))
  class(res$G) <- c("tess3G", class(res$G))

  ################################################

  # mem <- c(mem,pryr::mem_used())

  ################################################
  # Selection scan
  # Compute stat for test
  if (K > 1) {
    res$Fst <- ComputeFst(res$Q, res$G, ploidy + 1)

    # To avoid numerical issues
    res$Fst[res$Fst < 0.0] = 0.0

    # Convert Fst into t score
    res <- c(res, ComputeTscoreAndPvalue(res$Fst, K, res$n))
    class(res$pvalue) <- c("tess3pvalue", class(res$pvalue))
  }

  ################################################

  # mem <- c(mem,pryr::mem_used())

  ################################################
  # Compute rmse
  QtG <- tcrossprod(res$Q, res$G)
  res$rmse <- ComputeRmse(XProba, QtG)
  res$crossentropy <- ploidy * ComputeAveragedCrossEntropy(XProba, QtG) #because this function compute mean also by allele
  if (mask > 0.0) {
    res$crossvalid.rmse <- ComputeRmse(masked.X.value, QtG[missing.index.X])
    res$crossvalid.crossentropy <- ploidy * ComputeAveragedCrossEntropy(masked.X.value, QtG[missing.index.X])
  }
  ################################################
  class(res) <- c(class(res), "tess3Main")

  # mem <- c(mem,pryr::mem_used())

  return(res)
}


#' Summary of tess3 object.
#'
#' @param object TODOC
#' @param ... TODOC
#'
#' @return TODOCC
#' @export
summary.tess3Main <- function(object, ...) {
  cat(paste("=== Object of class tess3Main ===\n"))
  cat(paste("Number of individuals (n):", object$n,"\n"))
  cat(paste("Number of loci (L):", object$L,"\n"))
  cat(paste("Ploidy:", object$ploidy,"\n"))
  cat(paste("Number of ancestral populations (K):", object$K,"\n"))
  cat(paste("Residual error:", object$rmse,"\n"))
}


#' Title
#'
#' @param x TODOC
  #'
#' @return TODOC
#' @export
is.tess3Main <- function(x) {
  inherits(x, "tess3Main")
}

#' Title
#'
#' @param tess3.obj TODOCC
#' @param X TODOC
#' @param ploidy TODOC
#' @param mask TODOC
#'
#' @export
rmse.tess3Main <- function(tess3.obj, X, ploidy, mask = NULL) {
  if (!is.tess3Main(tess3.obj)) {
    stop("tess3.obj must of class tess3Main.")
  }
  CheckX(X, ploidy)
  if (!is.null(mask)) {
    X[-mask] <- NA
  }
  XBin <- matrix(0.0, nrow(X), ncol(X) * (ploidy + 1))
  X2XBin(X, ploidy, XBin)
  return(ComputeRmse(XBin, tcrossprod(tess3.obj$Q, tess3.obj$G)))
}

#' Title
#'
#' @param x tess3Main object.
#' @param ... TODOC
#'
#' @export
plot.tess3Main <- function(x, ...) {
  message("Nothing to plot")
}

#' tess3r : estimation of spatial population structure
#'
#' This R package implements the TESS3 method and tools useful to plot program outputs.
#'
#' @docType package
#'
#' @name tess3r
#' @importFrom Rcpp evalCpp
#' @importFrom graphics barplot hist image par plot points segments
#' @importFrom grDevices colorRampPalette rainbow topo.colors
#' @importFrom stats as.formula dist median pchisq pf predict qchisq qf rnorm runif sd
#' @importFrom utils capture.output
#' @import RcppEigen
#' @useDynLib tess3r
NULL

#' TODOC
#'
#' @name data.at
#' @docType data
#' @keywords data
NULL

#' TODOC
#'
#' @name data.for.test
#' @docType data
#' @keywords data
NULL

#' TODOC
#'
#' @name durand09
#' @docType data
#' @keywords data
NULL
bcm-uga/TESS3_encho_sen documentation built on June 30, 2023, 3:08 a.m.