#' Samples causal SNPs with different effect types
#'
#' The sampled SNPs are combined in a list of character vectors with the
#' following fields: target, marginal, inter1 and inter2. Through the
#' parameters \code{overlap_marg} and \code{overlap_inter}, the synergistic
#' SNPs with the target can have additional marginal and epistatic effects. The
#' SNPs are consecutively sampled in the following order: target, marginal,
#' inter1 and inter2. For each SNP, we iteratively sample until the picked
#' SNP candidate meets the constraints defined by \code{thresh_MAF} and
#' \code{window_size} (see arguments for more details) or until the maximum
#' number of resamplings is reached. To avoid duplication of effects, we
#' sample at most one SNP per cluster.
#'
#' @section Warning:
#' Make sure to supply the SNP IDs in \code{names(clusters)}. The SNPs in
#' the output list are referenced by their names.
#'
#' @param nX number of SNPs interacting with the target variant
#' @param nY number of SNPs with marginal effects
#' @param nZ12 number of SNP pairs with epistatic effects
#' @param clusters vector of cluster memberships. Typically, the output
#' of \code{\link[stats]{cutree}}. For ease of identification,
#' the SNP IDs in \code{names(clusters)} are mandatory.
#' @param MAF vector of minor allele frequencies. The order of the SNPs in
#' \code{MAF} is identical to that in \code{clusters}.
#' @param thresh_MAF lower-bound on the minor allele frequencies of
#' causal SNPs. Rare variants are inherently difficult to recover. Assessing
#' the retrieval performance on common variants better reflects the true
#' performance of the epistasis detection algorithm.
#' @param window_size in number of clusters. Beside the target variant, the
#' causal SNPs are sampled outside of a window centered around the
#' target. On each side of the target variant, the width of the window
#' is \code{window_size}.
#' @param overlap_marg number of SNPs with both synergistic effects with the
#' target and marginal effects
#' @param overlap_inter number of SNPs with both synergistic effects with the
#' target and additional epistatic effects
#' @param max_iter maximum number of sampling rejections for each SNP.
#' If exceeded, the function generates an error
#'
#' @return list of character vectors containing to the causal SNP IDs. The
#' output list entries are: target, marginal, inter1 and inter2. An
#' epistatic pair is obtained from the combination of two SNPs with
#' identical positions in \code{inter1} and \code{inter2}.
#'
#' @examples
#' clusters <- rep(seq_len(10), each = 3)
#' names(clusters) <- paste0("SNP_", seq_along(clusters))
#' MAF <- runif(length(clusters), min = 0.1, max = 0.5)
#'
#' sample_SNP(nX = 2, nY = 2, nZ12 = 1, clusters, MAF, thresh_MAF = 0.2,
#' window_size = 2, overlap_marg = 1, overlap_inter = 0)
#'
#' @export
sample_SNP <- function(nX, nY, nZ12, clusters, MAF, thresh_MAF = 0.2, window_size = 3,
overlap_marg = 0, overlap_inter = 0, max_iter = 10000) {
stopifnot((thresh_MAF > 0) & (thresh_MAF <= 0.5))
stopifnot(!is.null(names(clusters)))
stopifnot(length(clusters) == length(MAF))
stopifnot(overlap_marg + overlap_inter < nX)
stopifnot(overlap_marg < nY)
stopifnot(overlap_inter < nZ12)
if (is.null(names(MAF))) {
names(MAF) <- names(clusters)
} else {
stopifnot(all(names(MAF) == names(clusters)))
}
active <- rep(0, 1 + nX + nY + 2 * nZ12 - (overlap_marg + overlap_inter))
for (i in seq_along(active)) {
iter <- 0
cdt <- FALSE
while (cdt == FALSE) {
iter <- iter + 1
candidate_SNP <- sample.int(length(clusters), size = 1)
candidate_name <- names(clusters)[candidate_SNP]
cdt <- (MAF[candidate_SNP] > thresh_MAF)
if (i == 1) {
cluster_target <- clusters[candidate_name]
} else {
cdt <- cdt & (abs(clusters[candidate_name] - cluster_target) >
window_size)
}
if (iter > max_iter) {
stop("The MAF constraint can not be satisfied")
}
}
active[i] <- candidate_name
MAF <- subset(MAF, (clusters != clusters[candidate_SNP]))
clusters <- subset(clusters, (clusters != clusters[candidate_SNP]))
}
marg_idx <- sample.int(nX, overlap_marg)
if (overlap_marg == 0) {
inter_idx <- sample(seq_len(nX), overlap_inter)
} else {
inter_idx <- sample(seq_len(nX)[-marg_idx], overlap_inter)
}
SNP_list <- list(
target = active[1], syner = active[seq_len(nX) + 1],
marginal = active[c(seq(nX + 2, nX + nY + 1 - overlap_marg), 1 +
marg_idx)], inter1 = active[c(seq(nX + nY + 2, nX + nY + nZ12 +
1 - overlap_inter) - overlap_marg, 1 + inter_idx)], inter2 = active[seq(nX +
nY + nZ12 + 2 - (overlap_marg + overlap_inter), length(active))]
)
return(SNP_list)
}
#' Samples effect sizes for the disease model
#'
#' The generated disease model is the list of effect size coefficients.
#' The list comprises the following fields: 'syner', 'marg' and 'inter'.
#' 'syner' is itself a list of numeric vectors with two entries named
#' 'A0' and 'A1'. 'A0' refers to the vector of effect sizes when the target
#' variant \eqn{A = 0}{A = 0}. Similarly, 'A1' refers to the vector of effect
#' sizes in the case \eqn{A = 1}{A = 1}. The two other entries 'marg' and
#' 'inter' are, respectively, the marginal and epistatic effect sizes. The
#' effect sizes are independent and normally-distributed. The \code{mean}
#' parameter is either a list of vectors or a vector of length 4. If
#' \code{mean} is a vector, then the effect sizes for each type of effects have
#' the same mean. Otherwise, the corresponding vector in the list
#' specifies their individual means. The same logic applies to \code{sd}, the
#' standard deviation parameter. For coherence, the parameters \code{mean} and
#' \code{sd} are encoded in the same order as the output.
#'
#' @param nX number of SNPs interacting with the target variant
#' @param nY number of SNPs with marginal effects
#' @param nZ12 number of SNP pairs with epistatic effects
#' @param mean vector or list of means
#' @param sd vector or list of standard deviations
#'
#' @return a list of vectors corresponding to the effect size coefficients.
#'
#' @examples
#' effect_sizes <- gen_model(nX = 2, nY = 2, nZ12 = 1,
#' mean = rep(1, 4), sd = rep(1, 4))
#'
#' @export
gen_model <- function(nX, nY, nZ12, mean = rep(0, 4), sd = rep(1, 4)) {
stopifnot(length(mean) == 4)
stopifnot(length(sd) == 4)
if (is.list(mean)) {
stopifnot((length(mean[[1]]) == nX) | (length(mean[[2]]) == nX) |
(length(mean[[3]]) == nY) | (length(mean[[4]]) == nZ12))
}
if (is.list(sd)) {
stopifnot((length(sd[[1]]) == nX) | (length(sd[[2]]) == nX) | (length(sd[[3]]) ==
nY) | (length(sd[[4]]) == nZ12))
}
model <- list(syner = list(
A0 = stats::rnorm(nX, mean = mean[[1]], sd = sd[[1]]),
A1 = stats::rnorm(nX, mean = mean[[2]], sd = sd[[2]])
), marg = stats::rnorm(nY,
mean = mean[[3]], sd = sd[[3]]
), inter = stats::rnorm(nZ12,
mean = mean[[4]],
sd = sd[[4]]
))
return(model)
}
#' Simulates a binary phenotype
#'
#' The phenotypes are simulated according to a logistic regression model.
#' Depending on the chosen configuration in \code{\link{sample_SNP}}, the model
#' includes different effect types: synergistic effects with the target,
#' marginal effects and additional epistatic effects. We offer the option to
#' generate a balanced phenotype vector between cases and controls, through the
#' \code{intercept} parameter.
#'
#' @param X genotype matrix
#' @param causal causal SNPs.
#' @param model disease model
#' @param intercept binary flag. If \code{intercept=TRUE}, a non-null intercept
#' is added so that the output is (approximately) balanced between cases and
#' controls.
#'
#' @return A vector of simulated phenotypes which are encoded as a two-level
#' factor (TRUE/FALSE).
#'
#' @seealso \code{\link{sample_SNP}} and \code{\link{gen_model}}
#'
#' @examples
#' nX <- 5
#' nY <- 3
#' nZ12 <- 2
#' clusters <- rep(seq_len(25), each = 3)
#' names(clusters) <- paste0("SNP_", seq_along(clusters))
#' MAF <- runif(length(clusters), min = 0.2, max = 0.5)
#'
#' n_samples <- 3
#' X <- matrix((runif(n_samples * length(clusters)) < 0.4) +
#' (runif(n_samples * length(clusters)) < 0.4),
#' ncol = length(clusters), nrow = n_samples)
#'
#' colnames(X) <- names(clusters)
#'
#' causal <- sample_SNP(
#' nX, nY, nZ12, clusters, MAF, thresh_MAF = 0.2, window_size = 2,
#' overlap_inter = 0)
#' model <- gen_model(nX, nY, nZ12, mean = rnorm(4), sd = rep(1, 4))
#' Y <- sim_phenotype(X, causal, model, intercept = TRUE)
#'
#' @export
sim_phenotype <- function(X, causal, model, intercept = TRUE) {
stopifnot(!is.null(colnames(X)))
stopifnot(!(is.null(causal[["target"]]) | is.null(causal[["syner"]]) |
is.null(causal[["marginal"]]) | is.null(causal[["inter1"]]) | is.null(causal[["inter2"]])))
stopifnot(!(is.null(model[["syner"]][["A0"]]) | is.null(model[["syner"]][["A1"]]) |
is.null(model[["marg"]]) | is.null(model[["inter"]])))
stopifnot((length(causal[["syner"]]) == length(model[["syner"]][["A0"]])) &
(length(causal[["syner"]]) == length(model[["syner"]][["A1"]])) &
(length(causal[["marginal"]]) == length(model[["marg"]])) & (length(causal[["inter1"]]) ==
length(model[["inter"]])) & (length(causal[["inter2"]]) == length(model[["inter"]])))
risk <- (X[, causal$target] == 0) * (X[, causal$syner] %*% model$syner$A0) +
(X[, causal$target] > 0) * (X[, causal$syner] %*% model$syner$A1) +
(X[, causal$marginal] %*% model$marg) + (X[, causal$inter1] * X[
,
causal$inter2
]) %*% model$inter
if (intercept) {
risk <- risk - mean(risk)
}
phenotypes <- stats::runif(dim(X)[1]) < (1 / (1 + exp(-risk)))
return(phenotypes)
}
#' Merges a number of clusters around the target
#'
#' The purpose of the function \code{\link{merge_cluster}} is to
#' define an enlarged window of SNPs which
#' are in linkage disequilibrium with the target. It replaces the indices of neighbor
#' clusters with \code{center}, the target cluster index. The neighborhood is defined
#' according to the parameter \code{k} (see Arguments for more details). Subsequently,
#' we filter them out for the estimate of the propensity scores.
#'
#' @param clusters vector of cluster memberships. Typically, the output
#' of \code{\link[stats]{cutree}}
#' @param center the target variant cluster
#' @param k vector or integer. if \code{k} is given as a vector, it corresponds to
#' the cluster indices to be updated. Otherwise, if \code{k} is an integer,
#' the cluster indices to be updated lie between \code{center-k} and
#' \code{center+k}.
#'
#' @return The updated cluster membership vector. The cluster indexing is also
#' updated so that the maximum cluster index is equal to the total number of
#' clusters after merging.
#'
#' @examples
#' hc <- hclust(dist(USArrests))
#' clusters <- cutree(hc, k = 10)
#' merge_cluster(clusters, center=5, k=2)
#'
#' @export
merge_cluster <- function(clusters, center, k = 3) {
stopifnot(center %in% clusters)
if (is.atomic(k) & length(k) == 1L) {
stopifnot((2 * k + 1) <= nlevels(as.factor(clusters)))
idx_window <- center - (-k:k)
} else {
stopifnot(all(k %in% clusters))
idx_window <- k
}
clusters[which(clusters %in% idx_window)] <- center
clusters <- vapply(
clusters, function(s) match(s, sort(unique(clusters))),
numeric(1)
)
return(clusters)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.