R/unfazeD.R

Defines functions getComb generateRevalList generateReval gdist gdistPair2 gdistPair1 gdistPair

Documented in gdist gdistPair gdistPair1 gdistPair2 generateReval generateRevalList

#' Genetic Distance for a Pair of Samples
#'
#' Calculates likelihood over a range of values for relatedness parameter
#' \eqn{{r}} and provides an estimate for a pair of samples.
#'
#' @param pair   a list of length two containing two samples.
#' @param afreq  a list of allele frequencies. Each element of the list
#'   corresponds to a locus.
#' @param coi    a vector indicating complexity of infection for each sample.
#' @param nr     an integer value for the resolution of the grid (\eqn{nr - 1}
#'   values between 0 and 1), over which the likelihood will be calculated.
#'   Ignored if non-null \code{reval} is provided.
#' @param nm     the number of related pairs of strains for evaluation.
#' @param rval  \eqn{{r}} values for the grid or for evaluation when
#'   \code{equalr} is \code{TRUE}. If \code{NULL}, will be evenly spaced between
#'   0 and 1 and interval \eqn{1/nr}.
#' @param reval  the grid of \eqn{{r}} combinations, over which the likelihood
#'   will be calculated. A matrix where each column represents a single
#'   combination.
#' @param equalr a logical value. If \code{TRUE}, only equal values of _r_ for
#'   different pairs of strains are evaluated.
#' @param out    a character string for the type of results to be returned. If
#'   \code{"mle"}, an estimate is returned, if \code{"llik"} - a vector of
#'   log-likelihood for each combination.
#'
#' @return If \code{out} is \code{"mle"}, a vector of length 1 if \code{equalr}
#'   is \code{TRUE} or length \code{nm} otherwise, containing estimated
#'   \eqn{{r}} (or vector/matrix if not just first). If \code{out} is
#'   \code{"llike"}, a vector of log-likelihood values for each evaluated
#'   combination.
#'
#' @seealso \code{\link{gdist}} for processing multi-sample data in various
#'   formats and estimating pairwise relatedness.
#' @export
#' @useDynLib unfazeD

#*** add NA handling
gdistPair <- function(pair, afreq, coi, nr = 1e2, nm = min(coi), rval = NULL,
                      reval = NULL, equalr = FALSE, out = "mle") {  # "mle", "llik", "all"
  nloc <- length(afreq)

  if (is.null(rval)) {
    rval <- round(seq(0, 1, 1/nr), ceiling(log(nr, 10)))
  }
  if (equalr) {
    neval <- length(rval)
  } else {
    if (is.null(reval)) {
      reval <- generateReval(nm, rval)
    }
    neval <- ncol(reval)
  }

  llik <- rep(0, neval)
  for (t in 1:nloc) {
    Ux <- which(as.logical(pair[[1]][[t]]))  # in case of integer vector
    Uy <- which(as.logical(pair[[2]][[t]]))
    if (length(Ux) == 0 ||                   # NA [or all 0's - not in true]
        length(Uy) == 0) {#||                # NA [or all 0's - not in true]
#        any(afreq[[t]][unique(c(Ux, Uy))] <= 0)) {  # not in true genotypes
      next                                   # no data (NA)
    }
    if (equalr) {
      llikt <- probUxUyEqr(Ux, Uy, coi[1], coi[2], afreq[[t]], rval, nm)
    } else {
      llikt <- probUxUy(   Ux, Uy, coi[1], coi[2], afreq[[t]], reval)
    }
    if (llikt[1] == -Inf) {
      if (all(llikt == -Inf)) {
        next
      } else {
        iinf <- which(llikt == -Inf)
        writeLines(paste("\n0 prob for r combinations",  # not added if 1st!!
                         paste(iinf, collapse = " ")))
      }
    } else {
      llik <- llik + llikt
    }
  }

  if (out == "llik") {
    return(llik)
  } else {
    imax <- which(llik == max(llik))
    if (equalr) {
      return(mean(rval[imax]))  # [1] first only; reval[imax] - all; mean()
    } else {
      return(rowMeans(reval[, imax, drop = FALSE]))
      # reval[, imax[1]]
    }
  }
}

#' Genetic distance for data with genotyping errors
#'
#' Distance when estimated COI is less than |Ux|: COI increased to |Ux| for a
#' given locus only. ***Handling of the situation where alleles with estimated
#' allele frequency of 0 are present: probUxUy crashes only if present in both
#' Ux and Uy; however, if present in either one, the likelihood is 0, so no need
#' to calculate.
#'
#' @param alpha significance level.
#' @return
#'   * If \code{out = "mle"}: a vector of length 1 if \code{equalr = TRUE}
#'   or of length \code{nm} otherwise, containing estimated \eqn{{r}} (or vector
#'   /matrix if not just first).
#'   * If \code{out = "llike"}, a vector of log-likelihood values for each
#'   evaluated combination.
#'   * If \code{out = "all"}, a list containing an estimate, log-likelihood,
#'   maximum log-likelihood, \eqn{{r}} values corresponding to the acceptance
#'   region determined by the significance level, and the size of that region
#'   (as a proportion of all evaluated).
#' @inherit gdistPair params
#' @export

# out options: "mle", "llik", "all" (*** gdistPair1 only - add to others?)
gdistPair1 <- function(pair, afreq, coi, nr = 1e2, nm = min(coi), rval = NULL,
                       reval = NULL, equalr = FALSE, out = "mle",
                       alpha = 0.05) {
  nloc <- length(afreq)

  if (( equalr && is.null(rval)) ||
      (!equalr && is.null(reval) && is.null(rval))) {
    rval <- round(seq(0, 1, 1/nr), ceiling(log(nr, 10)))
  }
  if (!equalr && is.null(reval)) {
    reval <- generateReval(nm, rval)
  }
  neval <- ifelse(equalr, length(rval), ncol(reval))

  llik <- rep(0, neval)
  for (t in 1:nloc) {
    Ux <- which(as.logical(pair[[1]][[t]]))  # in case of integer vector
    Uy <- which(as.logical(pair[[2]][[t]]))
    #*** shouldn't happen with our error simulation
    if (length(Ux) == 0 ||                   # NA or all 0's
        length(Uy) == 0 ||                   # NA or all 0's
        any(afreq[[t]][unique(c(Ux, Uy))] == 0)) {  # likelihood = 0
      next                                   # likelihood = 0 or no data
    }
    coix <- max(coi[1], length(Ux))
    coiy <- max(coi[2], length(Uy))
    if (equalr) {
      llikt <- probUxUyEqr(Ux, Uy, coix, coiy, afreq[[t]], rval, nm)
    } else {
      llikt <- probUxUy(   Ux, Uy, coix, coiy, afreq[[t]], reval)
    }
    #*** check and maybe remove
    if (llikt[1] == -Inf) {
      if (all(llikt == -Inf)) {
        next
      } else {
        iinf <- which(llikt == -Inf)
        writeLines(paste("\n0 prob for r combinations",  # not added if 1st!!
                         paste(iinf, collapse = " ")))
      }
    } else {
      llik <- llik + llikt
    }
  }

  if (tolower(out) == "llik") {
    return(llik)
  }
  imax <- which(llik == max(llik))
  if (equalr) {
    est <- mean(rval[imax])  # [1] first only; reval[imax] - all; mean()
  } else {
    est <- rowMeans(reval[, imax, drop = FALSE])
    # reval[, imax[1]]
  }
  if (tolower(out == "mle")) {
    return(est)
  }

  qchi <- stats::qchisq(1 - alpha, df = ifelse(equalr, 1, nrow(reval)))
  cutoff  <- max(llik) - qchi/2
  itop    <- llik >= cutoff
  proptop <- sum(itop)/neval
  if (equalr) {
    rtop <- rval[itop]
  } else {
    rtop <- reval[, itop, drop = FALSE]
  }
  return(list(mle = est, llik = llik, maxllik = llik[imax[1]], rtop = rtop,
              proptop = proptop))
}

#' Genetic distance for data with genotyping errors
#'
#' Distance when estimated COI is less than |Ux|: likelihoods for all subsets of
#' size = COI of Ux are calculated; geometric mean taken.
#'
#' @inherit gdistPair return params
#' @export
#'

gdistPair2 <- function(pair, afreq, coi, nr = 1e2, nm = min(coi), rval = NULL,
                       reval = NULL, equalr = FALSE, out = "mle") {  # "mle", "llik"
  nloc <- length(afreq)

  if (is.null(rval)) {
    rval <- round(seq(0, 1, 1/nr), ceiling(log(nr, 10)))
  }
  if (equalr) {
    neval <- length(rval)
  } else {
    if (is.null(reval)) {
      reval <- generateReval(nm, rval)
    }
    neval <- ncol(reval)
  }

  llik <- rep(0, neval)
  for (t in 1:nloc) {
    Ux <- which(as.logical(pair[[1]][[t]]))  # in case of integer vector
    Uy <- which(as.logical(pair[[2]][[t]]))  # sorted
    if (length(Ux) == 0 ||                   # NA or all 0's
        length(Uy) == 0 ||                   # NA or all 0's
        any(afreq[[t]][unique(c(Ux, Uy))] <= 0)) {  #*** < check 0 separately?
      next                                   # likelihood = 0 or no data
    }
    Uxcomb <- getComb(Ux, coi[1])
    Uycomb <- getComb(Uy, coi[2])
    ncx <- ncol(Uxcomb)
    ncy <- ncol(Uycomb)
    llikt <- matrix(0, ncx*ncy, neval)
    icomb <- 1
    for (icx in 1:ncx) {
      for (icy in 1:ncy) {
        if (equalr) {
          llikt[icomb, ] <- probUxUyEqr(Uxcomb[, icx], Uycomb[, icy],
                                         coi[1], coi[2], afreq[[t]], rval, nm)
        } else {
          llikt[icomb, ] <- probUxUy(   Uxcomb[, icx], Uycomb[, icy],
                                        coi[1], coi[2], afreq[[t]], reval)
        }
        icomb <- icomb + 1
      }
    }
    llikt <- colMeans(llikt, na.rm = TRUE)  # or FALSE?
    if (llikt[1] == -Inf) {
      if (all(llikt == -Inf)) {
        next
      } else {
        iinf <- which(llikt == -Inf)
        writeLines(paste("\n0 prob for r combinations",  # not added if 1st!!
                         paste(iinf, collapse = " ")))
      }
    } else {
      llik <- llik + llikt
    }
  }

  if (out == "llik") {
    return(llik)
  } else {
    imax <- which(llik == max(llik))
    if (equalr) {
      return(mean(rval[imax]))  # [1] first only; reval[imax] - all; mean()
    } else {
      return(rowMeans(reval[, imax, drop = FALSE]))
      # reval[, imax[1]]
    }
  }
}

#' Genetic Distance
#'
#' @description Pairwise estimation of relatedness parameters for polyclonal
#'   multiallelic samples.
#'
#' @details To be added: data formats that can be processed, formats to be
#'   returned.
#'
#' @param dat data containing sample genotypes. Various formats allowed.
#' @param nmmax the maximum number of related strains in a pair of samples.
#' @param split the allele separator character string if genotypes are provided
#'   as character strings.
#' @param FUNnm potentially a function to select \code{nm} for each pair of
#'   samples.
#' @param ...   additional arguments for FUNnm. #*** take out if FUNnm is out
#' @inheritParams gdistPair
#'
#' @return A lower triangular distance/relatedness matrix in a type of shaped
#'   list. Each pairwise distance contains a scalar, a vector, or a matrix of
#'   \eqn{{r}} estimates. Simple matrix (of type \code{double}) can be returned
#'   if all the elements are of length 1.
#'
#' @seealso \code{\link{gdistPair}} for genetic relatedness between two samples
#'   with an option of returning log-likelihood.
#' @export

#*** maybe add an option where we want a function for nm, not just a nmmax
#    e.g. max(coi) - 1; provide a function?
gdist <- function(dat, afreq, coi, nmmax, nr = 1e2, rval = NULL, reval = NULL,
                  FUNnm = NULL,
                  equalr = FALSE, split = "[[:space:][:punct:]]+", ...) {
  if (inherits(dat, "matrix")) {
    nsmp <- nrow(dat)
    nloc <- ncol(dat)
    if (typeof(dat) == "list") {
      format <- "matlist"
    } else {
      format <- "matstr"
    }
  } else {
    nsmp <- length(dat)
    nloc <- length(dat[[1]])
    format <- "listlist"
  }

  if (is.null(rval)) {
    rval <- round(seq(0, 1, 1/nr), ceiling(log(nr, 10)))
  }
  if (equalr) {
    neval <- length(rval)
  } else {
    if (is.null(reval)) {
      reval <- generateRevalList(1:nmmax, rval)
    }
  }
  res  <- matrix(list(NA), nsmp, nsmp)

  for (ix in 1:nsmp) {
    for (iy in 1:(ix)) {
      nm <- min(coi[ix], coi[iy], nmmax)       # or FUNnm() - or AFTER ix == iy
      if (ix == iy) {                          # diagonal
        res[[ix, ix]] <- rep(1, ifelse(equalr, 1, min(coi[ix], nmmax)))
        next
      }
      llik <- rep(0, ifelse(equalr, neval, ncol(reval[[nm]])))
      for (t in 1:nloc) {
        if (format == "matstr") {
          alleles <- names(afreq[[t]])
          if (is.null(alleles))
            stop("Please provide names for allele frequencies")
          Ux  <- unlist(strsplit(dat[ix, t], split = split))
          Ux  <- sort(match(Ux, alleles))
          Uy  <- unlist(strsplit(dat[iy, t], split = split))
          Uy  <- sort(match(Uy, alleles))
        } else if (format == "matlist") {
          Ux <- which(as.logical(dat[[ix, t]]))
          Uy <- which(as.logical(dat[[iy, t]]))
        } else if (format == "listlist") {
          Ux <- which(as.logical(dat[[ix]][[t]]))
          Uy <- which(as.logical(dat[[iy]][[t]]))
        }
        if (length(Ux) == 0 || is.na(Ux[1]) ||   # NA or all 0's
            length(Uy) == 0 || is.na(Uy[1]) ||   # NA or all 0's
            any(afreq[[t]][unique(c(Ux, Uy))] <= 0)) {  #*** < check 0 separately?
          next                                   # likelihood = 0 or no data
        }
        if (equalr) {
          llikt <- probUxUyEqr(Ux, Uy, coi[ix], coi[iy], afreq[[t]], rval, nm)
        } else {
          llikt <- probUxUy(Ux, Uy, coi[ix], coi[iy], afreq[[t]], reval[[nm]])
        }
        if (any(llikt == -Inf)) {
          writeLines(paste("samples", ix, "and", iy, "- 0 prob for some r
                           combinations at locus", t))
          if (all(llikt == -Inf)) {       # -Inf is added otherwise
            next
          }
          llik <- llik + llikt            # lik = 0 if -Inf at any locus
        }
      }
      imax <- which(llik == max(llik))
      if (equalr) {
        res[[ix, iy]] <- mean(rval[imax])  # [1] first; rval[imax] all; mean()
      } else {
        res[[ix, iy]] <- rowMeans(reval[[nm]][, imax, drop = FALSE])
        # reval[[nm]][, imax[1]] first
      }
    }
  }
  if (all(sapply(res, length) == 1)) {
    res <- matrix(unlist(res), nrow(res), ncol(res))  # return simple matrix
  }
  return(res)
}

#' Generate a grid of parameter values to evaluate over
#'
#' @param M an integer.
#' @param rval \eqn{{r}} values for the grid. Takes precedence over \code{nr}.
#' @param nr an integer. If \code{rval} is not provided, it will be generated
#'   using \code{0}, \code{1}, and \code{nr - 1} values between them.
#' @return A a matrix with \code{M} rows and \code{nr + 1} or
#'   \code{length(rval)} columns.
#'
#' @export
generateReval <- function(M, rval = NA, nr = NA) {
  if (is.na(rval[1])) {
    if (is.na(nr)) {
      return(NA)
    }
    rval <- round(seq(0, 1, 1/nr), ceiling(log(nr, 10)))
  }
  if (M == 1) {
    return(matrix(rval, 1))
  }
  reval <- as.matrix(expand.grid(rep(list(rval), M)))
  for (k in 1:nrow(reval)) {         # faster than apply()
    reval[k, ] <- sort(reval[k, ])
  }
  return(t(unique(reval)))
}

#' Generate a grid of parameter values for multiple values of \code{M}
#'
#' @param Ms an integer vector.
#' @param rvals a list of the length \code{max(Mv)}. Can also be a vector like
#'   \code{rval}; in that case the same vector will be used for all the values
#'   of \code{Mv}.
#' @param nrs an integer vector of the length \code{max(Mv)}.
#' @return A list of the length \code{max(Mv)} with elements corresponding to
#'   the values of \code{Mv}. Each element is a matrix with \code{M} rows and
#'   \code{nr + 1} or \code{length(rval)} columns.
#'
#' @export
#' @rdname generateReval
generateRevalList <- function(Ms, rvals = NA, nrs = NA) {
  Mmax <- max(Ms)
  revals <- as.list(rep(NA, Mmax))
  if (is.na(rvals[1]))          rvals <- rep(list(NA   ), Mmax)
  if (!inherits(rvals, "list")) rvals <- rep(list(rvals), Mmax)
  if (is.na(nrs[1]))            nrs   <- rep(NA, Mmax)
  for (M in Ms) {
    revals[[M]] <- generateReval(M, rvals[[M]], nrs[M])
  }
  return(revals)
}

# get combinations of alleles when length(Ux) > coix
getComb <- function(Ux, coix) {
  if (length(Ux) <= coix) {
    return(matrix(Ux, ncol = 1))
  }
  return(utils::combn(Ux, coix))
}
innager/unfazeD documentation built on June 26, 2021, 6:01 p.m.