R/analysis_distance.R

Defines functions plot_distance get_IBS_distance get_IB_mixture inbreeding_mle get_genomic_distance lonlat_to_bearing get_spatial_distance

Documented in get_genomic_distance get_IB_mixture get_IBS_distance get_spatial_distance inbreeding_mle lonlat_to_bearing plot_distance

#------------------------------------------------
#' @title Get great circle distance between spatial points
#'
#' @description Get great circle distance between spatial points.
#' 
#' @param lat vector of latitudes.
#' @param long vector of longitudes.
#'
#' @export

get_spatial_distance <- function(lat, long) {
  
  # check inputs
  assert_vector(lat)
  assert_numeric(lat)
  assert_vector(long)
  assert_numeric(long)
  
  # calculate distance matrix
  ret <- apply(cbind(lat, long), 1, function(y) {lonlat_to_bearing(lat, long, y[1], y[2])$gc_dist})
  diag(ret) <- 0
  
  return(ret)
}

#------------------------------------------------
#' @title Calculate great circle distance and bearing between coordinates
#'
#' @description Calculate great circle distance and bearing between spatial
#'   coordinates.
#'
#' @param origin_lon The origin longitude
#' @param origin_lat The origin latitude
#' @param dest_lon The destination longitude
#' @param dest_lat The destination latitude
#'
#' @export
#' @examples
#' # one degree longitude should equal approximately 111km at the equator
#' lonlat_to_bearing(0, 0, 1, 0)

lonlat_to_bearing <- function(origin_lon, origin_lat, dest_lon, dest_lat) {
  
  # check inputs
  assert_vector(origin_lon)
  assert_numeric(origin_lon)
  assert_vector(origin_lat)
  assert_numeric(origin_lat)
  assert_vector(dest_lon)
  assert_numeric(dest_lon)
  assert_vector(dest_lat)
  assert_numeric(dest_lat)
  
  # convert input arguments to radians
  origin_lon <- origin_lon*2*pi/360
  origin_lat <- origin_lat*2*pi/360
  dest_lon <- dest_lon*2*pi/360
  dest_lat <- dest_lat*2*pi/360
  
  # get change in lon
  delta_lon <- dest_lon - origin_lon
  
  # calculate bearing
  bearing <- atan2(sin(delta_lon)*cos(dest_lat), cos(origin_lat)*sin(dest_lat)-sin(origin_lat)*cos(dest_lat)*cos(delta_lon))
  
  # calculate great circle angle. Use temporary variable to avoid acos(>1) or 
  # acos(<0), which can happen due to underflow issues
  tmp <- sin(origin_lat)*sin(dest_lat) + cos(origin_lat)*cos(dest_lat)*cos(delta_lon)
  tmp <- ifelse(tmp > 1, 1, tmp)
  tmp <- ifelse(tmp < 0, 0, tmp)
  gc_angle <- acos(tmp)
  
  # convert bearing from radians to degrees measured clockwise from due north,
  # and convert gc_angle to great circle distance via radius of earth (km)
  bearing <- bearing*360/(2*pi)
  bearing <- (bearing+360)%%360
  earth_rad <- 6371
  gc_dist <- earth_rad*gc_angle
  
  # return list
  ret <-list(bearing = bearing,
             gc_dist = gc_dist)
  return(ret)
}

#------------------------------------------------
#' @title Get genomic distance between samples
#'
#' @description Get genomic distance between samples using a distance metric
#'   that allows for mixed infections and takes account of linkage (see
#'   references for details).
#' 
#' @references MalariaGEN Plasmodium falciparum Community Project. "Genomic
#'   epidemiology of artemisinin resistant malaria". eLIFE (2016).
#' 
#' @param x object of class \code{mipanalyzer_biallelic}.
#' @param cutoff when calculating weights, correlations below this value are
#'   ignored (see references).
#' @param report_progress if \code{TRUE} then a progress bar is printed to the
#'   console.
#'
#' @importFrom stats prcomp
#' @export

get_genomic_distance <- function(x, cutoff = 0.1, report_progress = TRUE) {
  
  # check inputs
  assert_custom_class(x, "mipanalyzer_biallelic")
  assert_in(c("CHROM", "POS"), names(x$loci), message = "the `loci` component of the data must contain columns `CHROM` and `POS`")
  
  # get basic quantities
  nsamp <- nrow(x$samples)
  chrom_levels <- unique(x$loci$CHROM)
  nchrom <- length(chrom_levels)
  
  # get within-sample allele frequencies
  wsaf <- get_wsaf(x, impute = FALSE)
  
  # calculate weights
  weight <- NULL
  for (i in 1:nchrom) {
    
    # get correlation matrix between SNPs
    w <- which(x$loci$CHROM == chrom_levels[i])
    cormat <- suppressWarnings(cor(wsaf[,w,drop = FALSE], use = "pairwise.complete.obs"))
    cormat_na <- apply(cormat, 1, function(x) all(is.na(x)))
    cormat[cormat < cutoff] <- 0
    
    # calculate weights
    tmp <- rowSums(cormat^2, na.rm = TRUE)
    tmp[cormat_na] <- NA
    weight <- c(weight, 1/tmp)
  }
  
  # initialise progress bar
  if (report_progress) {
    pbar <- txtProgressBar(0, nsamp, style = 3)
  }
  
  # calculate distance matrix
  dist_mat <- matrix(NA, nsamp, nsamp)
  for (i in 1:(nsamp-1)) {
    
    # calculate distance between sample i and all others
    dab <- sweep(1-wsaf[-(1:i),,drop = FALSE], 2, wsaf[i,], "*") + sweep(wsaf[-(1:i),,drop = FALSE], 2, 1-wsaf[i,], "*")
    dab_weighted <- sweep(dab, 2, weight, "*")
    dist_mat[i,-(1:i)] <- rowSums(dab_weighted, na.rm = TRUE) / apply(dab_weighted, 1, function(x) sum(!is.na(x)))
    
    # update progress bar
    if (report_progress) {
      setTxtProgressBar(pbar, i)
    }
  }
  
  # return distance matrix
  invisible(dist_mat)
}

#------------------------------------------------
#' @title Estimate pairwise inbreeding coefficient F by maximum likelihood
#'
#' @description Estimates the inbreeding coefficient between all pairs of
#'   samples by maximum likelihood.
#'
#' @details The probability of seeing the same or different alleles at a locus
#'   can be written in terms of the global allele frequency p and the inbreeding
#'   coefficient f, for example the probability of seeing the same REF allele is
#'   \eqn{(1-f)*p^2 + f*p}. This formula can be multiplied over all loci to
#'   arrive at the overall likelihood of each value of f, which can then be
#'   chosen by maximum likelihood. This function carries out this comparison
#'   between all pairwise samples, passed in as a matrix. The formula above only
#'   applies when comparing homozygous calls - for homo/het or het/het
#'   comparisons we can either ignore these loci (the default) or convert hets
#'   to homo by calling the major allele at every locus.
#'
#' @param x object of class \code{mipanalyzer_biallelic}.
#' @param f values of f that are explored.
#' @param ignore_het whether to ignore heterzygous comparisons, or alternatively
#'   call the major allele at every locus (see details).
#' @param report_progress if \code{TRUE} then a progress bar is printed to the
#'   console.
#'
#' @export

inbreeding_mle <- function(x, f = seq(0,1,l=11), ignore_het = FALSE, report_progress = TRUE) {
  
  # check inputs
  assert_custom_class(x, "mipanalyzer_biallelic")
  assert_vector(f)
  assert_bounded(f)
  assert_single_logical(ignore_het)
  assert_single_logical(report_progress)
  
  # get within-sample allele frequencies
  wsaf <- get_wsaf(x, impute = FALSE)
  
  # get global allele frequencies
  p <- colMeans(wsaf, na.rm = TRUE)
  
  # process hets
  if (ignore_het) {
    wsaf[wsaf != 0 & wsaf != 1] <- NA
  } else {
    wsaf <- round(wsaf)
  }
  
  # convert NA to -1 before passing to C++
  wsaf[is.na(wsaf)] <- -1
  
  # create progress bars
  pb <- txtProgressBar(min = 0, max = nrow(wsaf)-1, initial = NA, style = 3)
  args_progress <- list(pb = pb)
  
  # run efficient C++ function
  args <- list(x = mat_to_rcpp(wsaf), f = f, p = p, report_progress = report_progress)
  args_functions <- list(update_progress = update_progress)
  output_raw <- inbreeding_mle_cpp(args, args_functions, args_progress)
  
  # process output
  ret_ml <- rcpp_to_mat(output_raw$ret_ml)
  ret_ml[row(ret_ml) >= col(ret_ml)] <- NA
  ret_all <- rcpp_to_array(output_raw$ret_all)
  
  # return list
  ret <- list(mle = ret_ml,
              loglike = ret_all)
  return(ret)
}


#------------------------------------------------
#' @title Get identity by mixture
#'
#' @description Get identity by "mixture distance". The mixture distance between
#'   two samples is the proportion of loci that have identical within-sample
#'   allele frequencies (WSAFs), or alternatively have WSAFs within a given
#'   tolerance. This extends the idea of identity by state (IBS) to continuous
#'   WSAFs rather than categorical genotypes.
#'
#' @param x object of class \code{mipanalyzer_biallelic}.
#' @param tol tolerance on mixture comparisons. Default = 0.
#' @param diagonal Should the diagonal of the distance matrix be changed to a
#'   given value. Default = NULL, which cause no changes.
#' @param report_progress if \code{TRUE} then a progress bar is printed to the
#'   console.
#'
#' @export

get_IB_mixture <- function(x, tol = 0, diagonal = NULL, report_progress = TRUE) {
  
  # check inputs
  assert_custom_class(x, "mipanalyzer_biallelic")
  assert_single_numeric(tol)
  if (!is.null(diagonal)) {
    assert_single_numeric(diagonal)
  }
  assert_single_logical(report_progress)
  
  # get basic quantities
  nsamp <- nrow(x$samples)
  
  # get within-sample allele frequencies
  wsaf <- get_wsaf(x, impute = FALSE)
  
  # initialise progress bar
  if (report_progress) {
    pbar <- txtProgressBar(0, nsamp, style = 3)
  }
  
  # compute all pairwise IBS
  ret <- matrix(NA, nsamp, nsamp)
  for (i in 1:nsamp) {
    if (tol == 0) {
      ret[i,] <- rowMeans(outer(rep(1,nsamp), wsaf[i,]) == wsaf, na.rm = TRUE)
    } else {
      ret[i,] <- rowMeans(abs(outer(rep(1,nsamp), wsaf[i,]) - wsaf) <= tol, na.rm = TRUE)  
    }
    # update progress bar
    if (report_progress) {
      setTxtProgressBar(pbar, i)
    }
  }
  
  if (!is.null(diagonal)) {
    ret[diag(ret)] <- diagonal
  }
  ret[lower.tri(ret)] <- NA
  
  # return distance matrix
  return(ret)
}


#------------------------------------------------
#' @title Get identity by state (IBS) distance
#'
#' @description Get identity by state (IBS) distance, computed as the proportion
#'   of sites that are identical between samples. If \code{ignore_het = TRUE}
#'   then heterozygous sites are ignored, otherwise the major strain is called
#'   at every locus.
#'
#' @param x object of class \code{mipanalyzer_biallelic}.
#' @param ignore_het whether to ignore heterozygous comparisons, or alternatively
#'   call the major allele at every locus (see details).
#' @param diagonal Should the diagonal of the distance matrix be changed to a
#'   given value. Default = \code{NULL}, which cause no changes.
#' @param report_progress if \code{TRUE} then a progress bar is printed to the
#'   console.
#'
#' @export

get_IBS_distance <- function(x, ignore_het = TRUE, diagonal = NULL, report_progress = TRUE) {
  
  # check inputs
  assert_custom_class(x, "mipanalyzer_biallelic")
  assert_single_logical(ignore_het)
  assert_single_logical(report_progress)
  
  # get basic quantities
  nsamp <- nrow(x$samples)
  
  # get within-sample allele frequencies
  wsaf <- get_wsaf(x, impute = FALSE)
  
  # deal with heterozygous sites
  if (ignore_het) {
    wsaf[wsaf != 0 & wsaf != 1] <- NA
  } else {
    wsaf <- round(wsaf)
  }
  
  # initialise progress bar
  if (report_progress) {
    pbar <- txtProgressBar(0, nsamp, style = 3)
  }
  
  # compute all pairwise IBS
  ret <- matrix(NA, nsamp, nsamp)
  for (i in 1:nsamp) {
    ret[i,] <- rowMeans(outer(rep(1,nsamp), wsaf[i,]) == wsaf, na.rm = TRUE)
    
    # update progress bar
    if (report_progress) {
      setTxtProgressBar(pbar, i)
    }
  }
  
  if (!is.null(diagonal)) {
    ret[diag(ret)] <- diagonal
  }
  ret[lower.tri(ret)] <- NA
  
  
  # return distance matrix
  return(ret)
}

#------------------------------------------------
#' @title Plot a distance matrix
#'
#' @description Simple image plot of a matrix of pairwise distances.
#'
#' @param m square matrix of pairwise distances.
#' @param col_pal which viridis colour pallet to use. Options are "viridis",
#'   "plasma", "magma" or "inferno".
#'
#' @export

plot_distance <- function(m, col_pal = "plasma") {
  
  # avoid "no visible binding" notes
  x <- y <- value <- NULL
  
  # check inputs
  assert_square_matrix(m)
  assert_single_string(col_pal)
  assert_in(col_pal, c("viridis", "plasma", "magma", "inferno"))
  
  # get matrix into long-form data.frame
  n <- nrow(m)
  df_plot <- data.frame(x = 1:n,
                        y = rep(1:n, each = n),
                        value = as.vector(m))
  
  # produce plot
  df_plot |>
    ggplot() + theme_void() +
    geom_tile(aes(x = x, y = y, fill = value)) +
    scale_fill_viridis_c(option = col_pal, na.value = "white")
  
}
mrc-ide/MIPanalyzer documentation built on Jan. 17, 2024, 7:16 p.m.