R/detect_outlier_in_GeneticSpace.R
In GGoutlieR: Identify Individuals with Unusual Geo-Genetic Patterns

Documented in ggoutlier_geneticKNN

#' GGoutlieR with the genetic KNN approach
#' @description identify samples geographically remote from K genetically nearest neighbors (genetic KNN). For the details of the outlier detection approach, please see the supplementary material of Chang and Schmid 2023 (doi:https://doi.org/10.1101/2023.04.06.535838)
#' @param geo_coord matrix or data.frame with two columns. The first column is longitude and the second one is latitude.
#' @param gen_coord matrix. A matrix of "coordinates in a genetic space". Users can provide ancestry coefficients or eigenvectors for calculation. If, for example, ancestry coefficients are given, each column corresponds to an ancestral population. Samples are ordered in rows as in `geo_coord`.
#' @param pgdM matrix. A pairwise genetic distance matrix. Users can provide a customized genetic distance matrix with this argument. Samples are ordered in rows and columns as in the rows of `geo_coord`. The default of `pgdM` is `NULL`. If `pgdM` is not provided, a genetic distance matrix will be calculated from `gen_coord`.
#' @param k integer. Number of the nearest neighbors.
#' @param klim vector. A range of K to search for the optimal number of nearest neighbors. The default is `klim = c(3, 50)`
#' @param make_fig logic. If `make_fig = TRUE`, plots for diagnosing GGoutlieR analysis will be generated and saved to `plot_dir`. The default is `FALSE`
#' @param plot_dir string. The path to save plots
#' @param w_power numanceric. A value controlling the power of distance weight in genetic KNN prediction.
#' @param p_thres numeric. A significe level
#' @param s integer. A scalar of geographical distance. The default `s=100` scales the distance to a unit of 0.1 kilometer.
#' @param cpu integer. Number of CPUs to use for searching the optimal K.
#' @param verbose logic. If `verbose = FALSE`, `ggoutlier` will suppress printout messages.
#' @param multi_stages logic. A multi-stage test will be performed if is `TRUE` (the default is `TRUE`).
#' @param maxIter numeric. Maximal iteration number of multi-stage KNN test.
#' @param keep_all_stg_res logic. Results from all iterations of the multi-stage test will be retained if it is`TRUE`. (the default is `FALSE`)
#' @param warning_minR2 numeric. The prediction accuracy of KNN is evaluated as R^2 to assess the violation of isolation-by-distance expectation. If any R^2 is larger than `warning_minR2`, a warning message will be reported at the end of your analysis.
#' @param n numeric. A number of random samples to draw from the null distribution for making a graph.
#' @returns an object of `list` including six items. `statistics` is a `data.frame` consisting of the `D geography` "Dgeo" values, p values and a column of logic values showing if a sample is an outlier or not. `threshold` is a `data.frame` recording the significance threshold. `gamma_parameter` is a vector recording the parameter of the heuristic Gamma distribution. `knn_index` and `knn_name` are a `data.frame` recording the K nearest neighbors of each sample. `scalar` is the value of geographical distance scalar used in the computation.
#' @export
ggoutlier_geneticKNN <- function(geo_coord,
                                 gen_coord = NULL,
                                 pgdM = NULL,
                                 k = NULL,
                                 klim = c(3,50),
                                 make_fig = FALSE,
                                 plot_dir = ".",
                                 w_power = 2,
                                 p_thres = 0.05,
                                 n = 10^6,
                                 s = 100,
                                 multi_stages = TRUE,
                                 maxIter=NULL,
                                 keep_all_stg_res = FALSE,
                                 warning_minR2 = 0.9,
                                 cpu = 1,
                                 verbose = TRUE
                                 ){
  required_pkgs <- c("stats4", # package to perform maximum likelihood estimation
                     "FastKNN", # KNN algorithm using a given distance matrix (other packages do not take arbitrary distance matrices)
                     "foreach", "doParallel",
                     "iterators","parallel")
  invisible(lapply(required_pkgs, FUN=function(x){suppressPackageStartupMessages(library(x, verbose = FALSE, character.only = TRUE))}))

  # use on.exit to prevent changes in users' pars
  if(make_fig){
    oldpar <- par(no.readonly = TRUE) # save original par of users
    on.exit(oldpar)
  }

  if(cpu > 1){
    max_cores=detectCores()
    if(cpu >= max_cores){
        warning(paste0("\n The maximum number of CPUs is ", max_cores, ". Set `cpu` to ", max_cores-1," \n"))
        cpu <- max_cores -1 # reserve max-1 cpus if users request too many cpus
    }
    if(cpu > 1){
      if(verbose) cat(paste0("\n Parallelize computation using ", cpu, " cores \n"))
      cl <- makeCluster(cpu)
      do_par <- TRUE
    }
  } else {
    if(cpu < 1){stop("`cpu` has to be at least 1")}
    cl = NULL
    do_par <- FALSE
    if(verbose) cat("\n Computation using 1 core. you can parallelize computation by setting a higher value to `cpu` argument \n")
  }

  # check inputs
  if(all(is.null(gen_coord) & is.null(pgdM))){
    stop("Either `gen_coord` or `pgdM` has to be provided!")
  }
  if(all(!is.null(gen_coord) & !is.null(pgdM))){
    warning("Both `gen_coord` and `pgdM` are provided. `pgdM` will be used instead of calculating genetic distances from `gen_coord`.\n")
  }
  if(ncol(geo_coord) != 2){stop("Please ensure the `geo_coord` having two columns!")}
  if(!is.null(gen_coord)){
    if(nrow(geo_coord) != nrow(gen_coord)){stop("`geo_coord` and `gen_coord` has different sample size!")}
  }else{
    if(nrow(geo_coord) != nrow(pgdM) | nrow(geo_coord) != ncol(pgdM)){stop("`geo_coord` and `pgdM` has different sample size!")}
  }

  if(is.null(rownames(geo_coord))){
    rownames(geo_coord) <- paste("sample", 1:nrow(geo_coord), sep = "")
  }else{
    if(any(is.na(rownames(geo_coord)))){
      if(verbose) cat("Some samples in `geo_coord` have IDs but some do not. Arbitratry IDs are assigned to all samples...")
      rownames(geo_coord) <- paste("sample", 1:nrow(geo_coord), sep = "")
    }
  }
  # check missing values
  if(any(apply(geo_coord, 1, function(x){any(is.na(x))}))){
    n_geomis <- sum(apply(geo_coord, 1, function(x){any(is.na(x))}))
    warning(paste0(n_geomis," samples are missing in your `geo_coord`, they are removed from the analysis"))
  }
  if(any(apply(gen_coord, 1, function(x){any(is.na(x))}))){
    n_genmis <- sum(apply(geo_coord, 1, function(x){any(is.na(x))}))
    warning(paste0(n_genmis," samples are missing in your `gen_coord`, they are removed from the analysis"))
  }
  to.rm <- apply(geo_coord, 1, function(x){any(is.na(x))}) | apply(gen_coord, 1, function(x){any(is.na(x))})
  if(any(to.rm)){
    geo_coord <- geo_coord[!to.rm,]
    gen_coord <- gen_coord[!to.rm,]
  }

  # calculate genetic distance using ancestry coefficients
  if(is.null(pgdM)){
    pgdM <- as.matrix(dist(gen_coord, method = "euclidean"))
  }
  # handle samples with identical genotypes
  if(any(pgdM[lower.tri(pgdM)] == 0)){
    if(verbose) cat("Find samples with zero genetic distances.\n")
    if(verbose) cat("Add 10^-6 unit of distance to individual pairs\n")
    pgdM <- pgdM + 10^-6
    diag(pgdM) <- 0
  }

  ##---------------------------------------------------
  ## search for the optimal K

  if(is.null(k)){
    cat(paste("\n `k` is NULL; searching for optimal k between", klim[1], "and", klim[2],"\nthis process can take time...\n"))

    all.D = find_optimalK_geneticKNN(geo_coord = geo_coord,
                                             pgdM = pgdM,
                                             w_power = w_power,
                                             klim = klim,
                                             do_par = do_par,
                                             s = s,
                                             cl = cl)

    # make a figure of optimal k selection
    opt.k = c(klim[1]:klim[2])[which.min(all.D)]
    if(make_fig){
      k.sel.plot <- paste(plot_dir, "/KNN_Dgeo_optimal_k_selection.pdf", sep = "")
      pdf(k.sel.plot, width = 5, height = 4)
      par(mar=c(4,6,1,1))
      plot(x = klim[1]:klim[2], y = all.D, xlab="K", ylab=expression(sum(D["geo,i"], i==1, n)))
      abline(v = opt.k)
      legend("top",legend = paste("optimal k =", opt.k), pch="", bty = "n",cex = 1.2)
      dev.off()
    }

    k = opt.k
    if(make_fig){
      if(verbose) cat(paste0("\n The optimal k is ",opt.k,". Its figure is saved at ", k.sel.plot," \n"))
    }else{
      if(verbose) cat(paste0("\n The optimal k is ",opt.k," \n"))
    }


  }
  if(is.null(k)){stop("k is NULL!")}
  cat("\nSearching K nearest neighbors...\n")
  #---------------------------------
  # KNN prediction with the optimal K (or K given by users)
  knn.indx <- find_gen_knn(pgdM, k=k)
  pred.geo_coord <- pred_geo_coord_knn(geo_coord = geo_coord,
                                       pgdM = pgdM,
                                       knn.indx = knn.indx,
                                       w_power = w_power)
  # calculate Dgeo statistic
  if(verbose) cat(paste("\n\n D geo is scaled to a unit of",s,"meters \n"))
  Dgeo <- cal_Dgeo(pred.geo_coord = pred.geo_coord, geo_coord = geo_coord, scalar = s)

  # rescale Dgeo
  # -> maximum likelihood estimation would run into an error if the values of Dgeo is too large
  # -> rescaling does not influence our outlier test
  maxD <- max(Dgeo)
  if(maxD > 20){
    tmps <- 1
    while (maxD > 20) {
      tmps <- tmps * 10
      maxD <- maxD / 10
    }
    orig_s <- s
    s <- s * tmps # new scalar
    if(verbose) cat(paste0("\n\n GGoutlieR adjusts the given scalar `s` value from `s=", orig_s, "` to `s=", s, "` to prevent an error in the maximum likelihood estimation process\n"))
    if(verbose) cat(paste0("\n\n D geo is re-scaled to a unit of ",s," meters \n\n"))
  }
  Dgeo <- cal_Dgeo(pred.geo_coord = pred.geo_coord, geo_coord = geo_coord, scalar = s)
  if(any(Dgeo == 0)){
    tmp.indx <- Dgeo == 0
    if(verbose) cat(paste("\n\n",sum(tmp.indx),"samples have D geo = 0. Zeros are replaced with 10^-8 to prevent the error in maximum likelihood estimation."))
    Dgeo[tmp.indx] <- 10^-8
  }

  predR2 <- diag(cor(pred.geo_coord, geo_coord))^2
  if(min(predR2) >= warning_minR2){
    warning(paste0("\n\n\n The lowest prediction accuracy (R^2) of KNN among two geographical dimensions (according to the given `geo_coord`) is ", round(min(predR2), digits = 4),
                   ". Maybe only few extreme outliers in your samples. \nYou could manually check the results with `plot_GGoutlieR` and adjust its `p_thres` argument to see which threshold is more appropriate. \n\n\n\n"))
  }
  #---------------------------------
  # Get null distribution
  ## Maximum likelihood assuming Gamma distribution
  ### Define negative log likelihood function
  negLL <- function(a, b){
    -sum(dgamma(Dgeo, shape = a, rate = b, log = T))
  }

  ### Loop until paramters converging
  initial.a <- (mean(Dgeo)^2)/var(Dgeo)
  initial.b <- mean(Dgeo)/var(Dgeo)
  #if(verbose) cat("\n\nUsing maximum likelihood estimation to infer the null Gamma distribution...\n")
  mle.res <- mle(minuslogl = negLL, start = list(a = initial.a, b = initial.b),
                 lower = 10^-8, upper = 10^8, method = "L-BFGS-B")
  current.a <- unname(mle.res@coef["a"])
  current.b <- unname(mle.res@coef["b"])

  #--------------------------------------------------
  # make a figure for the null Gamma distribution
  null.fun <- function(x){dgamma(x , shape = current.a, rate = current.b)}
  gamma.thres <- qgamma(1-p_thres, shape = current.a, rate = current.b)
  null.distr <- rgamma(n , shape = current.a, rate = current.b)

  if(make_fig){
    null.plot <- paste(plot_dir, "/KNN_Dgeo_null_distribution.pdf", sep = "")
    if(verbose) cat(paste("\nThe plot of null distribution is saved at ", null.plot," \n", sep = ""))
    pdf(null.plot, width = 5, height = 4)
    curve(null.fun, from = 0, to = max(null.distr), add = F, col = "blue",
          ylab = "Density",
          xlab = bquote(Gamma ~ '(' ~ alpha ~'='~.(round(current.a, digits = 3))~','~beta~'='~.(round(current.b, digits = 3))~')' )
          ,bty = "n", main = expression("Null distribution of D"[geo]))
    abline(v = gamma.thres, col = "red", lty = 2)
    text(x = gamma.thres, y = par("usr")[4], labels = paste("p =", p_thres), xpd=NA)
    par(mgp = c(3,0,0))
    axis(side = 1 ,at = round(gamma.thres, digits = 3), line = 0.3, tck = 0.02, font = 2)
    dev.off()
  }


  #----------------------------
  # return results
  sig.indx <- Dgeo > gamma.thres
  p.value <- 1 - pgamma(Dgeo, shape = current.a, rate = current.b)
  out <- data.frame(Dgeo, p.value, significant = sig.indx)
  rownames(out) <- rownames(geo_coord)
  thres <- data.frame(pvalue = c(0.05,0.01,0.005,0.001),statistic = qgamma(1 - c(0.05,0.01,0.005,0.001), shape = current.a, rate = current.b))
  gamma.par <- c(current.a, current.b)
  names(gamma.par) <- c("alpha", "beta")
  rownames(knn.indx) <- rownames(geo_coord)
  knn.name <- apply(knn.indx,2, function(x){rownames(geo_coord)[x]})
  res.out <- list(out, thres, gamma.par, knn.indx, knn.name, s)
  names(res.out) <- c("statistics","threshold","gamma_parameter", "knn_index", "knn_name", "scalar")

  arguments <- c(s, w_power, k, NA, multi_stages)
  names(arguments) <- c("scalar", "genetic_weight_power", "K", "min_neighbor_dist", "multi_stage_test")

  if(!multi_stages){
    attr(res.out, "model") <- "ggoutlier_geneticKNN"
    ## save arguments used in GGoutlieR
    attr(res.out, "arguments") <- arguments
    return(res.out)
  }else{
    #-------------------------------------------------------------
    # multi-stage test
    cat("\n\nDoing multi-stage test...\n\n")
    if(verbose) cat(paste0("\n\nInitiate the multi-stage KNN test process with k=",k,
                   " using a null Gamma distribution with shape=", round(current.a, digits = 3),
                   " and rate=",round(current.b, digits = 3),
                   " (the parameters of Gamma distribution were determined by MLE)\n\n"))

    i=1
    to_keep <- res.out$statistics$p.value > min(res.out$statistics$p.value)
    tmp.pgdM <- pgdM[to_keep, to_keep]
    tmp.geo_coord <- geo_coord[to_keep,]

    res.Iters <- list(res.out)
    # if `maxIter` is NULL -> let it equal to 50% of sample size
    if(is.null(maxIter)){maxIter <- round(nrow(gen_coord) * 0.5)}
    while (i <= maxIter) {
      cat("iteration",i, "\r")
      flush.console()
      if(i > 1){
        tmp.pgdM <- tmp.pgdM[to_keep, to_keep]
        tmp.geo_coord <- tmp.geo_coord[to_keep,]
      }

      if(verbose) cat(paste0("Iteration ", i,"\r"), append = F)
      # find KNN
      tmp.knn.indx <- find_gen_knn(tmp.pgdM, k=k)
      # KNN prediction
      tmp.pred.geo_coord <- pred_geo_coord_knn(geo_coord = tmp.geo_coord,
                            pgdM = tmp.pgdM,
                            knn.indx = tmp.knn.indx,
                            w_power = w_power)
      # calculate Dgeo statistic
      tmp.Dgeo <- cal_Dgeo(pred.geo_coord = tmp.pred.geo_coord,
                           geo_coord = tmp.geo_coord,
                           scalar = s)
      tmp.p.value <- 1 - pgamma(tmp.Dgeo, shape = current.a, rate = current.b)
      to_keep <- tmp.p.value > min(tmp.p.value)

      if(all(tmp.p.value > p_thres)){
        if(verbose) cat("\nNo new significant sample is identified. Multi-stage testing process ends...\n")
        break
      }else{
        out <- data.frame(tmp.Dgeo, tmp.p.value, significant = tmp.p.value < p_thres)
        colnames(out) <- c("Dgeo", "p.value", "significant")
        rownames(out) <- rownames(tmp.geo_coord)
        thres <- data.frame(pvalue = c(0.05,0.01,0.005,0.001),statistic = qgamma(1 - c(0.05,0.01,0.005,0.001), shape = current.a, rate = current.b))
        gamma.par <- c(current.a, current.b)
        names(gamma.par) <- c("alpha", "beta")
        rownames(tmp.knn.indx) <- rownames(tmp.geo_coord)
        tmp.knn.name <- apply(tmp.knn.indx,2, function(x){rownames(tmp.geo_coord)[x]})
        tmp.res <- list(out, thres, gamma.par,  tmp.knn.indx, tmp.knn.name, s)
        names(tmp.res) <- c("statistics","threshold","gamma_parameter", "knn_index", "knn_name", "scalar")
        res.Iters <- c(res.Iters, list(tmp.res))
      }
      i = i+1

    } # while loop end

    ## collect results of all iterations
    ## NOTE: In each iteration of 'multi-stage test', the sample with the most significant p values will be exclude from the KNN searching procedure in the next iteration
    ##       The loop here sequentially collect the outputs from each iteration and update the data.frame `collapse_res`
    collapse_res <- res.Iters[[1]]
    if(length(res.Iters) > 1){
      for(i in 2:length(res.Iters)){
        tmp <- res.Iters[[i]]
        tmp$statistics <- tmp$statistics[match(rownames(collapse_res$statistics),
                                               rownames(tmp$statistics)),]
        tmp$knn_name <- tmp$knn_name[match(rownames(collapse_res$statistics),
                                           rownames(tmp$knn_index)),]

        tmp.indx <- which(!is.na(tmp$statistics$Dgeo))
        if(length(tmp.indx)>0){
          collapse_res$statistics[tmp.indx,] <- tmp$statistics[tmp.indx,]
          collapse_res$knn_name[tmp.indx,] <- tmp$knn_name[tmp.indx,]
          for(j in tmp.indx){
            collapse_res$knn_index[j,] <- match(collapse_res$knn_name[j,], rownames(collapse_res$statistics))
          }
        }
      }
    }
    # make a figure comparing the results of single stage and multi-stage tests
    if(make_fig){
      logp.plot <- paste0(plot_dir, "/geneticKNN_test_multi_stage_Log10P_comparison.pdf")
      if(verbose) cat(paste("\n\n\nThe plot for comparing -logP between single-stage and multi-stage KNN tests is saved at ", logp.plot," \n", sep = ""))
      pdf(logp.plot, width = 4, height = 4.2)
      plot(-log10(res.out$statistics$p.value),
           -log10(collapse_res$statistics$p.value),
           xlab = expression("-log"[10]~"(p) of single-stage KNN test"),
           ylab = expression("-log"[10]~"(p) of multi-stage KNN test"),
           main = "KNN in Genetic space")
      dev.off()
    }

    if(keep_all_stg_res){
      names(res.Iters) <- paste0("Iter_", 1:length(res.Iters))
      out <- c(res.Iters, combined_result = list(collapse_res))
      attr(out, "model") <- "ggoutlier_geneticKNN"
      ## save arguments used in GGoutlieR
      attr(out, "arguments") <- arguments
      return(out)
    }else{
      attr(collapse_res, "model") <- "ggoutlier_geneticKNN"
      ## save arguments used in GGoutlieR
      attr(collapse_res, "arguments") <- arguments
      return(collapse_res)
    }
  } # multi-stage test end

} # ggoutlier_geneticKNN end