Nothing
#' GGoutlieR with the geographical KNN approach
#' @description identify samples genetically different from K nearest geographical neighbors (geographical KNN). For the details of the outlier detection approach, please see the supplementary material of Chang and Schmid 2023 (doi:https://doi.org/10.1101/2023.04.06.535838)
#' @param geo_coord matrix or data.frame with two columns. The first column is longitude and the second one is latitude.
#' @param gen_coord matrix. A matrix of "coordinates in a genetic space". Users can provide ancestry coefficients or eigenvectors for calculation. If, for example, ancestry coefficients are given, each column corresponds to an ancestral population. Samples are ordered in rows as in `geo_coord`.
#' @param min_nn_dist numeric. A minimal geographical distance for searching KNNs. Neighbors of a focal sample within this distance will be excluded from the KNN searching procedure.
#' @param k integer. Number of the nearest neighbors.
#' @param klim vector. A range of K to search for the optimal number of nearest neighbors. The default is `klim = c(3, 50)`
#' @param make_fig logic. If `make_fig = TRUE`, plots for diagnosing GGoutlieR analysis will be generated and saved to `plot_dir`. The default is `FALSE`
#' @param plot_dir string. The path to save plots
#' @param w_power numeric. A value controlling the power of distance weight in geographical KNN prediction.
#' @param p_thres numeric. A significance level
#' @param s integer. A scalar of geographical distance. The default `s=100` scales the distance to a unit of 0.1 kilometer.
#' @param cpu integer. Number of CPUs to use for searching the optimal K.
#' @param verbose logic. If `verbose = FALSE`, `ggoutlier` will suppress printout messages.
#' @param multi_stages logic. A multi-stage test will be performed if is `TRUE` (the default is `TRUE`).
#' @param maxIter numeric. Maximal iteration number of multi-stage KNN test.
#' @param keep_all_stg_res logic. Results from all iterations of the multi-stage test will be retained if it is`TRUE`. (the default is `FALSE`)
#' @param warning_minR2 numeric. The prediction accuracy of KNN is evaluated as R^2 to assess the violation of isolation-by-distance expectation. If any R^2 is larger than `warning_minR2`, a warning message will be reported at the end of your analysis.
#' @param n numeric. A number of random samples to draw from the null distribution for making a graph.
#' @returns an object of `list` including five items. `statistics` is a `data.frame` consisting of the `D genetics` ("Dg") values, p values and a column of logic values showing if a sample is an outlier or not. `threshold` is a `data.frame` recording the significance threshold. `gamma_parameter` is a vector recording the parameter of the heuristic Gamma distribution. `knn_index` and `knn_name` are a `data.frame` recording the K nearest neighbors of each sample.
#' @export
ggoutlier_geoKNN <- function(geo_coord,
gen_coord,
min_nn_dist = 100,
k = NULL,
klim = c(3,50),
s = 100,
make_fig = FALSE,
plot_dir = ".",
w_power = 1,
p_thres = 0.05,
n = 10^6,
multi_stages = TRUE,
maxIter=NULL,
keep_all_stg_res = FALSE,
warning_minR2 = 0.9,
cpu = 1,
verbose = TRUE
){
required_pkgs <- c("sf", # for calculating geographical distances
"stats4", # package to perform maximum likelihood estimation
"FastKNN", # KNN algorithm using a given distance matrix (other packages do not take arbitrary distance matrices)
"foreach", "doParallel",
"iterators","parallel")
invisible(lapply(required_pkgs, FUN=function(x){suppressPackageStartupMessages(library(x, verbose = FALSE, character.only = TRUE))}))
# use on.exit to prevent changes in users' pars
if(make_fig){
oldpar <- par(no.readonly = TRUE) # save original par of users
on.exit(oldpar)
}
if(cpu > 1){
max_cores=detectCores()
if(cpu >= max_cores){
warning(paste0("\n The maximum number of CPUs is ", max_cores, ". Set `cpu` to ", max_cores-1," \n"))
cpu <- max_cores -1 # reserve max-1 cpus if users request too many cpus
}
if(cpu > 1){
if(verbose) cat(paste0("\n Parallelize computation using ", cpu, " cores \n"))
cl <- makeCluster(cpu)
do_par <- TRUE
}
} else {
if(cpu < 1){stop("`cpu` has to be at least 1")}
do_par <- FALSE
cl <- NULL
if(verbose) cat("\n Computation using 1 core. you can parallelize computation by setting a higher value to `cpu` argument \n")
}
# check inputs
if(ncol(geo_coord) != 2){stop("Please ensure the `geo_coord` having two columns!")}
if(nrow(geo_coord) != nrow(gen_coord)){stop("`geo_coord` and `gen_coord` has different sample size!")}
if(is.null(rownames(geo_coord))){
rownames(geo_coord) <- paste("sample", 1:nrow(geo_coord), sep = "")
}else{
if(any(is.na(rownames(geo_coord)))){
if(verbose) cat("Some samples in `geo_coord` have IDs but some do not. Arbitratry IDs are assigned to all samples...")
rownames(geo_coord) <- paste("sample", 1:nrow(geo_coord), sep = "")
}
}
# check missing values
if(any(apply(geo_coord, 1, function(x){any(is.na(x))}))){
n_geomis <- sum(apply(geo_coord, 1, function(x){any(is.na(x))}))
warning(paste0(n_geomis," samples are missing in your `geo_coord`, they are removed from the analysis"))
}
if(any(apply(gen_coord, 1, function(x){any(is.na(x))}))){
n_genmis <- sum(apply(geo_coord, 1, function(x){any(is.na(x))}))
warning(paste0(n_genmis," samples are missing in your `gen_coord`, they are removed from the analysis"))
}
to.rm <- apply(geo_coord, 1, function(x){any(is.na(x))}) | apply(gen_coord, 1, function(x){any(is.na(x))})
if(any(to.rm)){
geo_coord <- geo_coord[!to.rm,]
gen_coord <- gen_coord[!to.rm,]
}
#--------------------------------
# Data pre-treatment
## geographical data: data.frame to sf format
geo_data_df <- data.frame(ID = rownames(geo_coord),
x = geo_coord$x,
y = geo_coord$y)
geo_data_sf <- sf::st_as_sf(geo_data_df,
coords = c("x", "y"), crs = 4326)
## calculate geographical distance
geo.dM <- sf::st_distance(geo_data_sf)/s
geo.dM <- matrix(as.vector(geo.dM), ncol = ncol(geo.dM), nrow = nrow(geo.dM)) # convert geo.dM from `units` to matrix
## handle samples with identical geographical coordinates
if(any(as.vector(geo.dM[lower.tri(geo.dM)]) == 0)){
if(verbose) cat("Find samples with identical geographical coordinates.\n")
if(any(min_nn_dist == 0 , is.null(min_nn_dist))){
if(verbose) cat("Add one unit of distance to individual pairs\n")
geo.dM <- geo.dM + 1
diag(geo.dM) <- 0
}else{
if(verbose) cat("Ignore neighbors whose pairwise distance is less than `min_nn_dist` when searching KNNs.\n")
orig_min_nn_dist <- min_nn_dist
min_nn_dist <- orig_min_nn_dist/s
if(verbose) cat(paste0("\n\nIgnore neighbors within ",min_nn_dist, " unit(s) of distance (the default unit of distance is m; `min_nn_dist` is adjusted automatically according to `s`)\n"))
}
}
#---------------------------------------
# Search the optimal K for KNN
if(is.null(k)){
# automatically select k if k=NULL
cat(paste("\n\n `k` is NULL; searching for optimal k between", klim[1], "and", klim[2],"\nthis process can take time...\n"))
all.D <- find_optimalK_geoKNN(geo_coord = geo_coord,
gen_coord = gen_coord,
geo.dM = geo.dM,
w_power = w_power,
klim = klim,
do_par = do_par,
min_nn_dist = min_nn_dist,
cl = cl)
# make a figure for K searching procedure
opt.k = c(klim[1]:klim[2])[which.min(all.D)]
k = opt.k # replace k with the optimal k
if(make_fig){
k.sel.plot <- paste(plot_dir, "/KNN_Dg_optimal_k_selection.pdf", sep = "")
pdf(k.sel.plot, width = 5, height = 4)
par(mar=c(4,6,1,1))
plot(x = klim[1]:klim[2], y = all.D, xlab="K", ylab=expression(sum(D["genetic,i"], i==1, n)))
abline(v = opt.k)
legend("top",legend = paste("optimal k =", opt.k), pch="", bty = "n",cex = 1.2)
dev.off()
}
if(make_fig){
if(verbose) cat(paste0("\n The optimal k is ",opt.k,". Its figure is saved at ", k.sel.plot," \n"))
}else{
if(verbose) cat(paste0("\n The optimal k is ",opt.k," \n"))
}
}
if(is.null(k)){stop("k is NULL!")}
cat("\nSearching K nearest neighbors...\n")
#-----------------------------------
# KNN prediction with the optimal K (or K given by users)
knn.indx <- find_geo_knn(geo.dM = geo.dM, k=k, min_nn_dist=min_nn_dist)
pred.q <- pred_q_knn(geo_coord = geo_coord, gen_coord = gen_coord, geo.dM = geo.dM, knn.indx, w_power = w_power)
# calculate Dg statistic
Dg <- cal_Dg(pred.q, gen_coord)
if(any(Dg == 0)){
tmp.indx <- Dg == 0
if(verbose) cat(paste("\n\n", sum(tmp.indx),"samples have Dg = 0. Zeros are replaced with 10^-8 to avoid the error in maximum likelihood estimation."))
Dg[tmp.indx] <- 10^-8
}
# evaluate prediction accuracy as R^2
predR2 <- diag(cor(pred.q, gen_coord))^2
if(min(predR2) >= warning_minR2){
warning(paste0("\n\n\nThe lowest prediction accuracy (R^2) of KNN among ", ncol(gen_coord),
" genetic dimensions (according to the given `gen_coord`) is ", round(min(predR2), digits = 4),
". Maybe only few extreme outliers in your samples. \nYou could manually check the results with `plot_GGNet_map` and adjust its `p_thres` argument to see which threshold is more appropriate.\n\n\n\n"))
}
#----------------------------------------------------------
# Get null distribution
## Maximum likelihood assuming Gamma distribution
### Define negative log likelihood function
negLL <- function(a, b){
-sum(dgamma(Dg, shape = a, rate = b, log = T))
}
### Loop until parameters converging
initial.a <- (mean(Dg)^2)/var(Dg)
initial.b <- mean(Dg)/var(Dg)
#if(verbose) cat("\n\n Using maximum likelihood estimation to infer the null Gamma distribution...\n")
mle.res <- mle(minuslogl = negLL, start = list(a = initial.a, b = initial.b),
lower = 10^-8, upper = 10^8, method = "L-BFGS-B")
current.a <- unname(mle.res@coef["a"])
current.b <- unname(mle.res@coef["b"])
# get significance threshold
null.fun <- function(x){dgamma(x , shape = current.a, rate = current.b)}
gamma.thres <- qgamma(1-p_thres, shape = current.a, rate = current.b)
null.distr <- rgamma(n , shape = current.a, rate = current.b)
## make a plot for the null distribution
if(make_fig){
null.plot <- paste(plot_dir, "/KNN_Dg_null_distribution.pdf", sep = "")
if(verbose) cat(paste("\nThe plot of null distribution is saved at ", null.plot," \n", sep = ""))
pdf(null.plot, width = 5, height = 4)
curve(null.fun, from = 0, to = max(null.distr), add = F, col = "blue",
ylab = "Density",
xlab = bquote(Gamma ~ '(' ~ alpha ~'='~.(round(current.a, digits = 3))~','~beta~'='~.(round(current.b, digits = 3))~')' )
,bty = "n", main = expression("Null distribution of D"[g]))
abline(v = gamma.thres, col = "red", lty = 2)
text(x = gamma.thres, y = par("usr")[4], labels = paste("p =", p_thres), xpd=NA)
par(mgp = c(3,0,0))
axis(side = 1 ,at = round(gamma.thres, digits = 3), line = 0.3, tck = 0.02, font = 2)
dev.off()
}
#--------------------------------------------
# Return results
sig.indx <- Dg > gamma.thres
p.value <- 1 - pgamma(Dg, shape = current.a, rate = current.b)
out <- data.frame(Dg, p.value, significant = sig.indx)
rownames(out) <- rownames(geo_coord)
thres <- data.frame(pvalue = c(0.05,0.01,0.005,0.001),statistic = qgamma(1 - c(0.05,0.01,0.005,0.001), shape = current.a, rate = current.b))
gamma.par <- c(current.a, current.b)
names(gamma.par) <- c("alpha", "beta")
rownames(knn.indx) <- rownames(geo_coord)
knn.name <- apply(knn.indx,2, function(x){rownames(geo_coord)[x]})
res.out <- list(out, thres, gamma.par, knn.indx, knn.name)
names(res.out) <- c("statistics","threshold","gamma_parameter", "knn_index", "knn_name")
## arguments used in GGoutlieR
arguments <- c(s, w_power, k, min_nn_dist, multi_stages)
names(arguments) <- c("scalar", "geo_weight_power", "K", "min_neighbor_dist", "multi_stage_test")
if(!multi_stages){
attr(res.out, "model") <- "ggoutlier_geoKNN"
## save arguments used in GGoutlieR
attr(res.out, "arguments") <- arguments
return(res.out)
}else{
#---------------------
# multi-stage test
cat("\n\nDoing multi-stage test...\n\n")
if(verbose) cat(paste0("\n\nStart multi-stage KNN test process with k=",k,
" and using the null Gamma distribution with shape=", round(current.a, digits = 3),
" and rate=",round(current.b, digits = 3),
" (the parameters of Gamma distribution were determined by MLE)\n\n"))
i = 1
to_keep <- res.out$statistics$p.value > min(res.out$statistics$p.value)
tmp.gen_coord <- gen_coord[to_keep,]
tmp.geo.dM <- geo.dM[to_keep, to_keep]
tmp.geo_coord <- geo_coord[to_keep,]
res.Iters <- list(res.out)
# if `maxIter` is NULL -> let it equal to 50% of sample size
if(is.null(maxIter)){maxIter <- round(nrow(gen_coord) * 0.5)}
while (i <= maxIter) {
cat("iteration",i, "\r")
flush.console()
if(i > 1){
tmp.gen_coord <- tmp.gen_coord[to_keep,]
tmp.geo.dM <- tmp.geo.dM[to_keep, to_keep]
tmp.geo_coord <- tmp.geo_coord[to_keep,]
}
if(verbose) cat(paste0("Iteration ", i,"\r"), append = F)
# find KNN
tmp.knn.indx <- find_geo_knn(geo.dM = tmp.geo.dM, k=k, min_nn_dist=min_nn_dist)
# KNN prediction
tmp.pred.q <- pred_q_knn(geo_coord = tmp.geo_coord,
gen_coord = tmp.gen_coord,
geo.dM = tmp.geo.dM,
knn.indx = tmp.knn.indx,
w_power = w_power)
# calculate Dg statistic
tmp.Dg <- cal_Dg(tmp.pred.q, tmp.gen_coord)
tmp.p.value <- 1 - pgamma(tmp.Dg, shape = current.a, rate = current.b)
to_keep <- tmp.p.value > min(tmp.p.value)
if(all(tmp.p.value > p_thres)){
if(verbose) cat("\nNo new significant sample is identified. Multi-stage testing process ends...\n")
break
}else{
out <- data.frame(tmp.Dg, tmp.p.value, significant = tmp.p.value < p_thres)
colnames(out) <- c("Dg", "p.value", "significant")
rownames(out) <- rownames(tmp.geo_coord)
thres <- data.frame(pvalue = c(0.05,0.01,0.005,0.001),statistic = qgamma(1 - c(0.05,0.01,0.005,0.001), shape = current.a, rate = current.b))
gamma.par <- c(current.a, current.b)
names(gamma.par) <- c("alpha", "beta")
rownames(tmp.knn.indx) <- rownames(tmp.geo_coord)
tmp.knn.name <- apply(tmp.knn.indx,2, function(x){rownames(tmp.geo_coord)[x]})
tmp.res <- list(out, thres, gamma.par, tmp.knn.indx, tmp.knn.name)
names(tmp.res) <- c("statistics","threshold","gamma_parameter", "knn_index", "knn_name")
res.Iters <- c(res.Iters, list(tmp.res))
}
i = i+1
} # while loop end
## collect results of all iterations
collapse_res <- res.Iters[[1]]
if(length(res.Iters) > 1){
for(i in 2:length(res.Iters)){
tmp <- res.Iters[[i]]
tmp$statistics <- tmp$statistics[match(rownames(collapse_res$statistics),
rownames(tmp$statistics)),]
tmp$knn_name <- tmp$knn_name[match(rownames(collapse_res$statistics),
rownames(tmp$knn_index)),]
tmp.indx <- which(!is.na(tmp$statistics$Dg))
if(length(tmp.indx)>0){
collapse_res$statistics[tmp.indx,] <- tmp$statistics[tmp.indx,]
collapse_res$knn_name[tmp.indx,] <- tmp$knn_name[tmp.indx,]
for(j in tmp.indx){
collapse_res$knn_index[j,] <- match(collapse_res$knn_name[j,], rownames(collapse_res$statistics))
}
}
}
}
# make a figure comparing the results of single stage and multi-stage tests
if(make_fig){
logp.plot <- paste0(plot_dir, "/geoKNN_test_multi_stage_Log10P_comparison.pdf")
if(verbose) cat(paste("\n\n\nThe plot of comparing -logP between single-stage and multi-stage KNN tests is saved at ", logp.plot," \n", sep = ""))
pdf(logp.plot, width = 4, height = 4.2)
plot(-log10(res.out$statistics$p.value),
-log10(collapse_res$statistics$p.value),
xlab = expression("-log"[10]~"(p) of single-stage KNN test"),
ylab = expression("-log"[10]~"(p) of multi-stage KNN test"),
main = "KNN in Geographical space")
dev.off()
}
if(keep_all_stg_res){
names(res.Iters) <- paste0("Iter_", 1:length(res.Iters))
out <- c(res.Iters, combined_result = list(collapse_res))
attr(out, "model") <- "ggoutlier_geoKNN"
## save arguments used in GGoutlieR
attr(out, "arguments") <- arguments
return(out)
}else{
attr(collapse_res, "model") <- "ggoutlier_geoKNN"
## save arguments used in GGoutlieR
attr(collapse_res, "arguments") <- arguments
return(collapse_res)
}
} # multi-stage test end
} # ggoutlier_geoKNN end
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.