Nothing
#' Simulate phenotye data
#'
#' This function simulates trait data from a genotype matrix.
#'
#' This function takes a genotype matrix and simulates trait data under the following model:
#' beta_i ~ MN(0, V_i, I), i in \{ additive, epistatic, residual\}
#'
#' The effect sizes follow a matrix normal distribution with no correlation between the samples but covariance between the effects for different traits
#'
#' @param genotype_matrix Genotype matrix with samples as rows, and SNPs as columns.
#' @param n_causal Number of SNPs that are causal.
#' @param n_trait_specific Number of causal SNPs with single trait epistatic effects.
#' @param n_pleiotropic Number of SNPs with epistatic effects on all traits.
#' @param group_ratio_trait Ratio of sizes of trait specific groups that interact, e.g. a ratio 1:3 would be value 3.
#' @param group_ratio_pleiotropic Ratio of sizes of pleiotropic groups that interact, e.g. a ratio 1:3 would be value 3.
#' @param H2 Broad-sense heritability. Can be vector.
#' @param d Number of traits.
#' @param rho Proportion of heritability explained by additivity.
#' @param marginal_correlation Correlation between the additive effects of the trait.
#' @param epistatic_correlation Correlation between the epistatic effects of the trait.
#' @param seed Random seed for simulation.
#' @param logLevel is a string parameter defining the log level for the logging package.
#' @param logFile is a string parameter defining the name of the log file for the logging output.
#' @param maf_threshold is a float parameter defining the threshold for the minor allele frequency not included in causal SNPs.
#' @return A list object containing the trait data, the genotype data, as well as the causal SNPs and summary statistics.
#' @examples
#' p <- 200
#' f <- 10
#' g <- 4
#' n <- 100
#' d <- 3
#' X <- matrix(
#' runif(p * n),
#' ncol = p
#' )
#' data <- simulate_traits(
#' X, n_causal = f, n_trait_specific = g, n_pleiotropic = g, d = d, maf_threshold = 0,
#' logLevel = "ERROR"
#' )
#' @useDynLib mvMAPIT
#' @export
#' @name simulate_traits
#' @import checkmate
#' @import dplyr
#' @import foreach
#' @import mvtnorm
#' @import parallel
#' @importFrom utils head
#' @importFrom stats var cor sd complete.cases
simulate_traits <- function(
genotype_matrix, n_causal = 1000, n_trait_specific = 10, n_pleiotropic = 10,
H2 = 0.6, d = 2, rho = 0.8, marginal_correlation = 0.3, epistatic_correlation = 0.3,
group_ratio_trait = 1, group_ratio_pleiotropic = 1, maf_threshold = 0.01, seed = 67132,
logLevel = "INFO", logFile = NULL
) {
heritability <- rep(H2, d)[1:d]
set.seed(seed)
coll <- makeAssertCollection()
assertInt(n_causal, lower = 0, add = coll)
assertInt(n_trait_specific, lower = 0, add = coll)
assertInt(n_pleiotropic, lower = 0, add = coll)
assertDouble(group_ratio_trait, lower = 1, add = coll)
assertDouble(group_ratio_pleiotropic, lower = 1, add = coll)
assertDouble(heritability, lower = 0, upper = 1, add = coll)
assertDouble(rho, lower = 0, upper = 1, add = coll)
assertDouble(marginal_correlation, lower = -1, upper = 1, add = coll)
assertDouble(epistatic_correlation, lower = -1, upper = 1, add = coll)
assertDouble(maf_threshold, lower = 0, upper = 1, add = coll)
assertInt(d, lower = 1, add = coll)
assertInt(seed, lower = 1, add = coll)
assertMatrix(genotype_matrix, all.missing = FALSE, add = coll)
reportAssertions(coll)
logging::logReset()
logging::basicConfig(level = logLevel)
log <- logging::getLogger("simulate_traits")
if (!is.null(logFile)) {
filePath <- file.path(getwd(), logFile)
log$debug("Logging to file: %s", filePath)
log$addHandler(logging::writeToFile, file = filePath)
}
snp.ids <- 1:ncol(genotype_matrix)
maf <- colMeans(genotype_matrix) / 2
X <- scale(genotype_matrix)
maf_compliant <- (maf > maf_threshold) & (maf < 1 - maf_threshold)
# scale produces NaN when the columns have zero variance
snp.ids.filtered <- snp.ids[complete.cases(t(X)) & maf_compliant]
n_samples <- nrow(X) # number of genotype samples
n_snp <- length(snp.ids.filtered) # number of SNPs passing quality control
log$debug("Scaled genotype matrix: %d x %d", n_samples, n_snp)
log$debug(
"Disregard %d variants due to zero variance or small minor allele frequency.",
ncol(genotype_matrix) - length(snp.ids.filtered)
)
log$debug("Minor allele frequency threshold %f.", maf_threshold)
# divide groups into ratios
n_group1_trait <- ceiling(n_trait_specific / (1 + group_ratio_trait))
n_group1_pleiotropic <- ceiling(n_pleiotropic / (1 + group_ratio_pleiotropic))
coll <- makeAssertCollection()
assertInt(n_causal, lower = 0, upper = n_snp, add = coll)
assertInt(n_pleiotropic + n_trait_specific, lower = 0, upper = n_causal, add = coll)
assertInt(n_group1_trait, lower = 0, upper = n_trait_specific, add = coll)
assertInt(n_group1_pleiotropic, lower = 0, upper = n_pleiotropic, add = coll)
reportAssertions(coll)
# factor vectors for splitting the groups
f_trait <- get_factors(n_group1_trait, n_trait_specific)
f_pleiotropic <- get_factors(n_group1_pleiotropic, n_pleiotropic)
log$debug("Number of causal SNPs: %d", n_causal)
log$debug("Number of trait specific SNPs: %d", n_trait_specific)
log$debug("Number of pleiotropic SNPs: %d", n_pleiotropic)
log$debug("NA in raw genotype matrix: %d", sum(is.na(genotype_matrix)))
log$debug("NA in scaled genotype matrix: %d", sum(is.na(X)))
Y <- c()
causal_snps <- list()
pleiotropic_set <- sample(snp.ids.filtered, n_pleiotropic, replace = F) # declare peleiotropic SNPs before since they have to be present in every trait
pleio_split <- split(pleiotropic_set, f = f_pleiotropic)
X_pleio_group1 <- X[, pleio_split$group1, drop=FALSE]
X_pleio_group2 <- X[, pleio_split$group2, drop=FALSE]
i <- NULL
X_epi_pleio <- foreach(i = seq_len(n_group1_pleiotropic), .combine = cbind) %do% {
# this step fails if there are too little pleiotropic SNPs; i.e. <=
# 3?
X_pleio_group1[, i] * X_pleio_group2
}
if (!is.matrix(X_epi_pleio)) {
X_epi_pleio <- matrix(
NA, nrow = nrow(X),
ncol = 0
)
}
log$debug(
"Dimensions of pleiotropic interaction matrix: %s x %s", nrow(X_epi_pleio),
ncol(X_epi_pleio)
)
log$debug("Draw effects from multivariate normal with desired correlation.")
C_marginal <- matrix(marginal_correlation, ncol = d, nrow = d)
diag(C_marginal) <- 1
C_epistatic <- matrix(epistatic_correlation, ncol = d, nrow = d)
diag(C_epistatic) <- 1
C_error <- matrix(0, ncol = d, nrow = d)
diag(C_error) <- 1
log$debug("Desired marginal correlation: %f", marginal_correlation)
beta <- mvtnorm::rmvnorm(n_causal, sigma = C_marginal)
log$debug("Correlation of simulated marginal effects: %s", cor(beta))
n_epistatic_effects <- n_group1_trait * (n_trait_specific - n_group1_trait) +
ncol(X_epi_pleio)
log$debug("Number of epistatic effects: %s", n_epistatic_effects)
if (n_epistatic_effects > 0) {
log$debug("Desired epistatic correlation: %f", epistatic_correlation)
alpha <- mvtnorm::rmvnorm(n_epistatic_effects, sigma = C_epistatic)
log$debug("Correlation of simulated epistatic effects: %s", cor(alpha))
} else {
log$debug("Return empty effect matrix.")
alpha <- matrix(0, ncol = d, nrow = 0)
}
log$debug("Desired error correlation: %f", 0)
error <- mvtnorm::rmvnorm(n_samples, sigma = C_error)
log$debug("Correlation of simulated error: %s", cor(error))
snp.ids.trait <- setdiff(snp.ids.filtered, pleiotropic_set)
pleiotropic_interactions <- tibble(
group1 = rep(pleio_split$group1, each = length(pleio_split$group2)),
group2 = rep(pleio_split$group2, times = length(pleio_split$group1))
)
interactions <- tibble()
additive <- tibble()
colnames(genotype_matrix) <- seq_len(ncol(genotype_matrix)) %>%
sprintf(fmt = "snp_%05d") # column names names for SNPs
for (j in 1:d) {
## select causal SNPs
log$debug("Simulating trait %d", j)
causal_snps_j <- sample(snp.ids.trait, n_causal - n_pleiotropic, replace = F)
trait_specific_additive <- c(causal_snps_j, pleiotropic_set)
trait_specific_epistatic <- sample(causal_snps_j, n_trait_specific, replace = F)
trait_grouped <- split(trait_specific_epistatic, f_trait)
trait_specific_j_1 <- trait_grouped$group1
trait_specific_j_2 <- trait_grouped$group2
log$debug("Length causal set: %d", length(causal_snps_j))
log$debug("Length pleiotropic set: %d", length(pleiotropic_set))
log$debug("Length trait specific set 1: %d", length(trait_specific_j_1))
log$debug("Length trait specific set 2: %d", length(trait_specific_j_2))
log$debug("Head of causal SNPs: %s", head(trait_specific_additive))
log$debug("Head of trait specific SNPs group 1: %s", head(trait_specific_j_1))
log$debug("Head of trait specific SNPs group 2: %s", head(trait_specific_j_2))
# create trait_specific interaction matrix
X_causal_j <- X[, c(causal_snps_j, pleiotropic_set)] # all SNPs have additive effects
X_trait_specific_j_1 <- X[, trait_specific_j_1, drop = FALSE]
X_trait_specific_j_2 <- X[, trait_specific_j_2, drop = FALSE]
trait_specific_interactions <- tibble(
group1 = rep(trait_specific_j_1, each = length(trait_specific_j_2)),
group2 = rep(trait_specific_j_2, times = length(trait_specific_j_1))
)
start_interactions <- proc.time()
log$debug("Computing interactions. This may take a while.")
i <- NULL
X_epi <- foreach(i = seq_len(length(trait_specific_j_1)), .combine = cbind) %do% {
X_trait_specific_j_1[, i] * X_trait_specific_j_2
}
X_epi <- cbind(X_epi_pleio, X_epi)
time_interactions <- proc.time() - start_interactions
log$debug("Interactions X_epi computed in %f", time_interactions[3])
log$debug(
"Dimension of interaction matrix X_epi: %d x %d", nrow(X_epi),
ncol(X_epi)
)
# marginal effects
X_marginal <- X_causal_j
beta_j <- beta[, j]
y_marginal <- X_marginal %*% beta_j
y_marginal <- y_marginal * sqrt(heritability[j] * rho/c(var(y_marginal)))
log$debug("Variance scaled y_marginal: %f", var(y_marginal))
# pairwise epistatic effects
alpha_j <- alpha[, j]
if (n_epistatic_effects > 0) {
y_epi <- X_epi %*% alpha_j
y_epi <- y_epi * sqrt(heritability[j] * (1 - rho)/c(var(y_epi)))
log$debug("Variance scaled y_epi: %f", var(y_epi))
trait_interactions <- bind_rows(pleiotropic_interactions,
trait_specific_interactions) %>%
mutate(effect_size = alpha_j) %>%
mutate(trait = j)
interactions <- bind_rows(interactions, trait_interactions)
} else {
y_epi <- 0 * y_marginal
log$debug("y_epi vector of zeros size y_marginal: %s", y_epi)
}
trait_additive <- tibble(
id = trait_specific_additive,
effect_size = beta_j,
trait = j
)
additive <- bind_rows(additive, trait_additive)
epistatic_idx <- c(pleiotropic_set, trait_specific_epistatic)
additive_idx <- c(pleiotropic_set, trait_specific_additive)
colnames(genotype_matrix)[additive_idx] <- paste0(colnames(genotype_matrix[, additive_idx]),
rep(sprintf(j, fmt = "_p%02dadd"),
length(additive_idx)))
colnames(genotype_matrix)[epistatic_idx] <- paste0(colnames(genotype_matrix[, epistatic_idx]),
rep(sprintf(j, fmt = "_p%02depi"),
length(epistatic_idx)))
# unexplained phenotypic variation
y_err <- error[, j]
y_err <- y_err * sqrt((1 - heritability[j])/c(var(y_err)))
y <- y_marginal + y_epi + y_err
Y <- cbind(Y, y)
causal_snps[[paste0("trait_", j)]] <- list(
causal_snps = c(causal_snps_j, pleiotropic_set),
pleiotropic_groups = pleio_split, trait_specific_groups = trait_grouped,
alpha = alpha_j, beta = beta_j
)
}
if (n_epistatic_effects > 0) {
epistatic <- tidyr::pivot_longer(
interactions,
cols = c("group1", "group2"),
names_to = "group",
names_prefix = "group",
values_to = "id"
) %>%
mutate(name = sprintf("snp_%05d", id)) %>%
dplyr::group_by(.data[["trait"]], .data[["id"]], .data[["name"]]) %>%
summarise(total_effect = sum(abs(.data[["effect_size"]]))) %>%
mutate(pleiotropic = (id %in% pleiotropic_set))
} else {
epistatic <- NULL
interactions <- NULL
}
colnames(Y) <- seq_len(ncol(Y)) %>%
sprintf(fmt = "p_%02d") # column names names for traits
log$debug("trait data: %s", head(Y))
log$debug("trait correlation: %s", cor(Y))
log$debug("Correlation of simulated epistatic effects: %s", cor(alpha))
# return data
parameter_names <- c("number_samples",
"number_snps",
"number_traits",
"number_causal_snps",
"number_epistatic_effects",
"number_pleiotropic_snps",
"number_trait_specific_snps",
"heritability",
"rho",
"epistatic_correlation",
"marginal_correlation",
"group_ratio_pleiotropic",
"group_ratio_trait",
"seed"
)
parameters <- tibble()
for (j in seq_len(d)) {
parameter_values <- c(n_samples,
n_snp,
d,
n_causal,
n_epistatic_effects,
n_pleiotropic,
n_trait_specific,
heritability[j],
rho,
epistatic_correlation,
marginal_correlation,
group_ratio_pleiotropic,
group_ratio_trait,
seed)
parameters <- bind_rows(parameters,
tidyr::tibble(name = parameter_names,
value = parameter_values)
)
}
parameters <- parameters %>%
mutate(trait = rep(colnames(Y), each = length(parameter_names)))
simulated_pleiotropic_epistasis_data <- list(
parameters = parameters, trait = Y, genotype = genotype_matrix,
additive = additive, epistatic = epistatic, interactions = interactions,
snps.filtered = snp.ids.filtered
)
return(simulated_pleiotropic_epistasis_data)
}
get_factors <- function(n1, n) {
return(
c(
rep("group1", n1),
rep("group2", n - n1)
)
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.