R/variancecomponentFunctions.R

Defines functions correlatedBgEffects noiseBgEffects geneticBgEffects noiseFixedEffects geneticFixedEffects

Documented in correlatedBgEffects geneticBgEffects geneticFixedEffects noiseBgEffects noiseFixedEffects

#' Simulate genetic variant effects.
#'
#' geneticFixedEffects takes genetic variants which should be added as genetic 
#' variant effects to the phenotype. These variants can have the same effects 
#' across all traits (shared) or can be independent across traits (independent); 
#' in addition, only a certain proportion of traits can be affected by the 
#' genetic variants.
#' 
#' @param X_causal [N x NrCausalSNPs] Matrix of [NrCausalSNPs] SNPs from [N] 
#' samples.
#' @param N Number [integer] of samples to simulate; has to be provided as a 
#' dimnesionality check for X_causal and downstream analyses; nrow(X_causal) has 
#' to be equal to N.
#' @param P Number [integer] of phenotypes to simulate.
#' @param phenoID Prefix [string] for naming traits.
#' @param id_samples Vector of [NrSamples] sample IDs [string]; if not provided
#' colnames(X_causal) used.
#' @param id_phenos Vector of [NrTraits] phenotype IDs [string]; if not provided
#' constructed by paste(phenoID, 1:P, sep="").
#' @param pTraitsAffected Proportion [double] of traits affected by the genetic
#' effect. For non-integer results of pTraitsAffected*P, the ceiling of the 
#' result is used. Allows to simulate for instance different levels of 
#' pleiotropy.
#' @param pIndependentGenetic Proportion [double] of genetic effects (SNPs) to
#' have an independent fixed effect.
#' @param pTraitIndependentGenetic Proportion [double] of traits influenced by 
#' independent fixed genetic effects.
#' @param keepSameIndependent [boolean] If set to TRUE, the independent genetic 
#' effects always influence the same subset of traits.
#' @param distBeta Vector of name(s) [string] of distribution to use to simulate 
#' effect sizes of SNPs; one of "unif" or "norm".
#' @param mBeta Vector of mean/midpoint(s) [double] of normal/uniform 
#' distribution for effect sizes of SNPs.
#' @param sdBeta Vector of standard deviation/distance from midpoint [double] 
#' of normal/uniform distribution for effect sizes of SNPs.
#' @param verbose [boolean] If TRUE, progress info is printed to standard out
#' @return Named list of shared fixed genetic effects (shared: [N x P] matrix), 
#' independent fixed genetic effects (independent: [N x P] matrix), 
#' the causal SNPs labeled as shared or independent effect 
#' (cov: [NrCausalSNPs x N] matrix) and the simulated effect sizes of the causal 
#' SNPs (cov_effect: [P x NrCausalSNPs] dataframe).
#' @export
#' @examples
#' genotypes <- simulateGenotypes(N=100, NrSNP=20, verbose=FALSE)
#' causalSNPs <- getCausalSNPs(N=100, genotypes=genotypes$genotypes)
#' geneticFixed <- geneticFixedEffects(N=100, X_causal=causalSNPs, 
#' P=10)
geneticFixedEffects <- function(X_causal, P, N, phenoID="Trait_",
                                id_samples = rownames(X_causal),
                                id_phenos = NULL,
                                pTraitsAffected=1,
                                pIndependentGenetic=0.4, 
                                pTraitIndependentGenetic=0.2, 
                                keepSameIndependent=FALSE,
                                distBeta="norm", mBeta=0, sdBeta=1, 
                                verbose=FALSE) {
    numbers <- list(P=P, N=N, mBeta=mBeta, sdBeta=sdBeta,
                    pIndependentGenetic=pIndependentGenetic, 
                    pTraitIndependentGenetic=pTraitIndependentGenetic, 
                    pTraitsAffected=pTraitsAffected)
    positives <- list(P=P, N=N, sdBeta=sdBeta)
    proportions <- list(pIndependentGenetic=pIndependentGenetic, 
                        pTraitIndependentGenetic=pTraitIndependentGenetic, 
                        pTraitsAffected=pTraitsAffected)
    testNumerics(numbers=numbers, positives=positives, proportions=proportions)
    if (nrow(X_causal) != N){
        stop("Number of samples in SNP matrix (", nrow(X_causal), ") is 
             different from number of samples to be simulated")
    }
    if (length(id_samples) !=  nrow(X_causal)) {
        stop("Length of id_samples (", length(id_samples), ") is different ",
             "from number of samples in X_causal (", nrow(X_causal), "). Does ",
             "your X_causal have rownames (default to retrieve id_samples if ",
             "not provided)?")
    }
    if (!(is.character(phenoID) && length(phenoID) == 1)) {
        stop("phenoID has to be of length 1 and of type character")
    }
    if (!is.null(id_phenos) && length(id_phenos) !=  P) {
        stop("Length of id_phenos (", length(id_phenos), ") is different ",
             "from P (", P, ")")
    }
    if (is.null(id_phenos)) id_phenos <- paste(phenoID, 1:P, sep="")
    NrCausalSNPs <- ncol(X_causal)
    traitsAffected <- ceiling(P*pTraitsAffected)
    
    NrIndependentSNPs <- round(pIndependentGenetic * NrCausalSNPs)
    NrSharedSNPs <- NrCausalSNPs - NrIndependentSNPs
    
    vmessage(c("Out of", P, "total phenotypes,", traitsAffected, "traits will",
               "be affected by genetic variant effects"), verbose=verbose)
    
    if (traitsAffected == 1 && NrIndependentSNPs != 0) {
        vmessage(c("The total number of traits affected by genetic variant",
                 "effects is 1, so pTraitIndependentGenetic",
                 " will automatically be set to 1."), verbose=verbose)
        pTraitIndependentGenetic <- 1
    }
    
    if (NrIndependentSNPs != 0) {
        vmessage(c("Out of these affected traits (", traitsAffected, "), ", 
                   ceiling(pTraitIndependentGenetic * traitsAffected), 
                   " trait(s)",
                   " will have independent genetic variant effects"), sep="",
                 verbose=verbose)
    }
    
    Gshared <- NULL
    Gindependent <- NULL
    
    if (NrSharedSNPs != 0) {
        if (NrIndependentSNPs != 0) {
            shared <- sample(c(rep(TRUE, NrSharedSNPs), 
                               rep(FALSE, NrIndependentSNPs)), 
                             replace=FALSE)
            X_shared <-  X_causal[,shared]
            snpIDshared <- colnames(X_causal)[shared]
        } else {
            X_shared <- X_causal
            snpIDshared <- colnames(X_causal)
        }
        
        if (distBeta == "unif") {
            betaX_exp <- rexp(NrSharedSNPs) %*% t(rexp(traitsAffected))
            ## transfrom to uniform with mean=m, with m*x=0.5
            betaX_unif <- exp(-betaX_exp/mean(betaX_exp))
            multiplicative = 2*sdBeta/(max(betaX_unif)-min(betaX_unif))
            additive = mBeta + sdBeta - multiplicative * max(betaX_unif)
            betaX_shared  = multiplicative * betaX_unif + additive
        }
        if (distBeta == "norm") {
            betaX_shared <- simulateDist(NrSharedSNPs,  dist=distBeta, 
                                         m=mBeta, std=sdBeta) %*% 
                t(abs(simulateDist(traitsAffected, dist=distBeta, m=0, 
                                   std=1)))
        }
        
        if (P != traitsAffected) {
            betaX_shared <- cbind(betaX_shared, 
                                  matrix(0, ncol=P-traitsAffected, 
                                         nrow=NrSharedSNPs))
        }
        rownames(betaX_shared) <- paste("sharedEffect", 
                                        1:nrow(betaX_shared), sep="")
        
        cov <- data.frame(X_shared)
        colnames(cov) <- snpIDshared
        rownames(cov) <- id_samples
        
        cov_effect <- data.frame(t(betaX_shared))
        colnames(cov_effect) <- paste(rownames(betaX_shared), "_", 
                                      colnames(cov), sep="")
        rownames(cov_effect) <- id_phenos
        
        Gshared = X_shared %*% betaX_shared
        colnames(Gshared) <- id_phenos
        rownames(Gshared) <- id_samples
    }
    
    if (NrIndependentSNPs != 0) {
        if (NrSharedSNPs != 0) {
            independent <- !shared
            X_independent <- X_causal[,independent]
            snpIDindependent <- colnames(X_causal)[independent]
        } else {
            X_independent <- X_causal
            snpIDindependent <- colnames(X_causal)
        }
        
        betaX_independent <- matrix(simulateDist(traitsAffected * 
                                                     NrIndependentSNPs, 
                                                 dist=distBeta,
                                                 m=mBeta, std=sdBeta), 
                                    ncol=traitsAffected)
        
        TraitIndependentGenetic <- ceiling(pTraitIndependentGenetic * 
                                               traitsAffected)
        if (keepSameIndependent) {
            p_nongenetic <- sample(
                c(rep(FALSE, TraitIndependentGenetic), 
                  rep(TRUE, (traitsAffected - TraitIndependentGenetic))), 
                replace=FALSE)
            p_nongenetic <- matrix(rep(p_nongenetic, NrIndependentSNPs), 
                                   NrIndependentSNPs, byrow = TRUE)
        } else {
            p_nongenetic <- t(sapply(1:ncol(X_independent), function(x) {
                sample(c(rep(FALSE, TraitIndependentGenetic), 
                         rep(TRUE, (traitsAffected - TraitIndependentGenetic))), 
                       replace=FALSE)
            }))
        }
        
        betaX_independent[p_nongenetic] <- 0
        
        if (P != traitsAffected) {
            betaX_independent <- cbind(betaX_independent, 
                                       matrix(0, ncol=P-traitsAffected, 
                                              nrow=NrIndependentSNPs))
        }
        rownames(betaX_independent) <- paste("independentEffect", 
                                             1:nrow(betaX_independent), sep="") 
        cov <- data.frame(X_independent)
        colnames(cov) <- snpIDindependent
        rownames(cov) <- id_samples
        
        cov_effect <- data.frame(t(betaX_independent))
        colnames(cov_effect) <- paste(rownames(betaX_independent), "_", 
                                      colnames(cov), sep="")
        rownames(cov_effect) <- id_phenos
        
        Gindependent = X_independent %*% betaX_independent
        colnames(Gindependent) <- id_phenos
        rownames(Gindependent) <- id_samples
    }
    if (NrSharedSNPs != 0 && NrIndependentSNPs != 0) {
        cov = cbind(X_shared, X_independent)
        colnames(cov) <- c( snpIDshared, snpIDindependent)
        cov_effect = data.frame(betaX_shared=t(betaX_shared), 
                                betaX_independent=t(betaX_independent))
        colnames(cov_effect) <- paste(colnames(cov_effect), "_", 
                                      colnames(cov), sep="")
        rownames(cov_effect) <- id_phenos
    }
    
    return(list(shared=Gshared, 
                independent=Gindependent, 
                cov=cov, 
                cov_effect=cov_effect))
}

#' Simulate noise fixed effects.
#'
#' noiseFixedEffects simulates a number of non-genetic covariate effects 
#' (confounders). Confounders can have effects across all traits (shared) 
#' or to a number of traits only (independent); in addition, only a certain 
#' proportion of traits can be affected by the confounders.
#' Confounders can be simulated as categorical variables or following a binomial 
#' , uniform or normal distribution. Effect sizes for the noise effects can be 
#' simulated from a uniform or normal distribution. Multiple confounder sets 
#' drawn from different distributions/different parameters of the same 
#' distribution can be simulated by specifying NrFixedEffects and supplying the 
#' respective distribution parameters. 
#'
#' @param N Number [integer] of samples to simulate.
#' @param P Number [integer] of phenotypes to simulate.
#' @param sampleID Prefix [string] for naming samples.
#' @param phenoID Prefix [string] for naming traits.
#' @param id_samples Vector of [NrSamples] sample IDs [string]; if not provided
#' constructed by paste(sampleID, 1:N, sep="").
#' @param id_phenos Vector of [NrTraits] phenotype IDs [string]; if not provided
#' constructed by paste(phenoID, 1:P, sep="").
#' @param pTraitsAffected Vector of proportion(s) [double] of traits affected by 
#' the confounders. For non-integer results of pTraitsAffected*P, the ceiling of 
#' the result is used.
#' @param NrFixedEffects Number [integer] of different confounder effects to 
#' simulate; allows to simulate fixed effects from different distributions or 
#' with different parameters; if only one type of confounder distribution is 
#' wanted, set NrFixedEffects=1 and choose the number of confounders with eg 
#' NrConfounders=10.
#' @param NrConfounders Vector of number(s) [integer] of confounders from a 
#' specified distribution to simulate.
#' @param pIndependentConfounders Vector of proportion(s) [double] of 
#' confounders to have a trait-independent effect.
#' @param pTraitIndependentConfounders Vector of proportion(s) [double] of 
#' traits influenced by independent confounder effects.
#' @param keepSameIndependent [boolean] If set to TRUE, the independent genetic 
#' effects always influence the same subset of traits.
#' @param distConfounders Vector of name(s) [string] of distribution to use to 
#' simulate confounders; one of "unif", "norm", "bin", "cat_norm", "cat_unif".
#' @param distBeta Vector of name(s) [string] of distribution to use to simulate 
#' effect sizes of confounders; one of "unif" or "norm".
#' @param mConfounders Vector of mean/midpoint(s) [double] of normal/uniform 
#' distribution for confounders.
#' @param sdConfounders Vector of standard deviation(s)/distance from 
#' midpoint(s) [double] of normal/uniform distribution for confounders.
#' @param catConfounders Vector of number(s) of confounder categories [integer]; 
#' required if distConfounders "cat_norm" or "cat_unif".
#' @param probConfounders Vector of probability(s) [double] of binomial 
#' confounders (0/1); required if distConfounders "bin".
#' @param mBeta Vector of mean/midpoint [double] of normal/uniform distribution 
#' for effect sizes of confounders.
#' @param sdBeta Vector of standard deviation/distance from midpoint [double] 
#' of normal/uniform distribution for effect sizes of confounders.
#' @param verbose [boolean] If TRUE, progress info is printed to standard out
#' @return Named list of shared confounder effects (shared: [N x P] matrix), 
#' independent confoudner effects (independent: [N x P] matrix), 
#' the confounders labeled as shared or independent effect 
#' (cov: [NrConfounders x N] matrix) and the simulated effect sizes of the 
#' confounders (cov_effect: [P x NrConfounders] dataframe).
#' @seealso \code{\link{simulateDist}} 
#' @export
#' @examples
#' # fixed noise effect with default setting
#' noiseFE <- noiseFixedEffects(P=5, N=20)
#' 
#' # 1 categorical fixed noise effect with uniform distribution of the 
#' # categories
#' noiseFE_catUnif <- noiseFixedEffects(P=10, N=20, NrConfounders=1, 
#' distConfounders="cat_unif", catConfounders=3)
#' 
#' # 10 fixed noise effect with uniform distribution between 1 and 5 (3 +/- 2) 
#' # categories
#' noiseFE_uniformConfounders_normBetas <- noiseFixedEffects(P=10, N=20, 
#' NrConfounders=10, distConfounders="unif", mConfounders=3, sdConfounders=2, 
#' distBeta="norm",  sdBeta=2)
#' 
#'  # 4 fixed noise effect with binomial distribution with p=0.2 
#' noiseFE_binomialConfounders_uniformBetas <- noiseFixedEffects(P=10, N=20, 
#' NrConfounders=4, distConfounders="bin", probConfounders=0.2, distBeta="norm", 
#' sdBeta=2)
#' 
#'  # 2 fixed noise effect with 1 binomial confounders and 1 normally 
#'  # distributed confounder; the latter only affects 2 traits 
#'  noiseFE_binomialandNormalConfounders <- noiseFixedEffects(P=10, N=20, 
#'  NrFixedEffects=2, pTraitsAffected =c (1,0.2), NrConfounders=c(2,2), 
#'  distConfounders=c("bin", "norm"),  probConfounders=0.2)
noiseFixedEffects <- function(N, P, NrConfounders=10, sampleID="ID_",
                              phenoID="Trait_", 
                              id_samples=NULL,
                              id_phenos=NULL,
                              pTraitsAffected=1,
                              NrFixedEffects=1, 
                              pIndependentConfounders=0.4, 
                              pTraitIndependentConfounders=0.2, 
                              keepSameIndependent=FALSE,
                              distConfounders="norm", mConfounders=0, 
                              sdConfounders=1, catConfounders=NULL, 
                              probConfounders=NULL, distBeta="norm", mBeta=0, 
                              sdBeta=1,
                              verbose=FALSE) {
    numbers <- list(N=N, P=P, NrFixedEffects=NrFixedEffects, 
                    NrConfounders=NrConfounders)
    positives <- list(P=P, N=N, NrFixedEffects=NrFixedEffects, 
                      NrConfounders=NrConfounders)
    integers <- list(P=P, N=N, NrFixedEffects=NrFixedEffects, 
                     NrConfounders=NrConfounders)
    testNumerics(numbers=numbers, positives=positives, integers=integers)
    if (!is.null(id_samples) && length(id_samples) !=  N) {
        stop("Length of id_samples (", length(id_samples), ") is different ",
             "from N (", N, ")")
    }
    if (!is.null(id_phenos) && length(id_phenos) !=  P) {
        stop("Length of id_phenos (", length(id_phenos), ") is different ",
             "from P (", P, ")")
    }
    if (!(is.character(phenoID) && length(phenoID) == 1)) {
        stop("phenoID has to be of length 1 and of type character")
    }
    if (!(is.character(sampleID) && length(sampleID) == 1)) {
        stop("sampleID has to be of length 1 and of type character")
    }
    if (is.null(id_samples)) id_samples <- paste(sampleID, 1:N, sep="")
    if (is.null(id_phenos)) id_phenos <- paste(phenoID, 1:P, sep="")
    
    
    oneFixedEffectComponent <- function(N, P, NrOfEffect, NrConfounders, 
                                        pTraitsAffected,
                                        pIndependentConfounders, 
                                        pTraitIndependentConfounders,
                                        keepSameIndependent,
                                        distConfounders, mConfounders, 
                                        sdConfounders, catConfounders, 
                                        probConfounders, distBeta, mBeta, 
                                        sdBeta) {
        numbers <- list(mConfounders=mConfounders,
                        sdConfounders=sdConfounders, mBeta=mBeta, 
                        sdBeta=sdBeta,
                        pIndependentConfounders=pIndependentConfounders, 
                        pTraitIndependentConfounders=
                            pTraitIndependentConfounders, 
                        probConfounders=probConfounders,
                        pTraitsAffected=pTraitsAffected)
        positives <- list(sdBeta=sdBeta, sdConfounders=sdConfounders)
        proportions <- list(pIndependentConfounders=pIndependentConfounders, 
                            pTraitIndependentConfounders=
                                pTraitIndependentConfounders, 
                            probConfounders=probConfounders,
                            pTraitsAffected=pTraitsAffected)
        testNumerics(numbers=numbers, positives=positives, 
                     proportions=proportions)
        
        if (is.null(catConfounders) && grepl("cat", distConfounders)) {
            stop("Confounder distribution set to ", distConfounders, " but ",
                 "no categories provided")
        }
        if (is.null(probConfounders) && grepl("bin", distConfounders)) {
            stop("Confounder distribution set to ", distConfounders, " but ",
                 "no probabilities provided")
        }
        
        traitsAffected <- ceiling(P*pTraitsAffected)
        
        if(is.null(NrOfEffect)) {
            vmessage(c("Out of", P, "total phenotypes,", traitsAffected, 
                       "trait(s) will",
                       "be affected by the covariate effect"), 
                     verbose=verbose)
        } else {
            vmessage(c("Out of", P, "total phenotypes,", traitsAffected, 
                       "trait(s) will",
                       "be affected by the ", NrOfEffect ," covariate effect"), 
                     verbose=verbose)
        }
        
        NrIndependentConfounders <- round(pIndependentConfounders * 
                                              NrConfounders)
        NrSharedConfounders <- NrConfounders - NrIndependentConfounders
        
        if (traitsAffected == 1 && NrIndependentConfounders != 0) {
            vmessage(c("The total number of traits affected by covariate ",
                     "effects is 1, so pTraitIndependentConfounders",
                     " will automatically be set to 1."), verbose=verbose)
            pTraitIndependentConfounders <- 1
        }
        
        if (NrIndependentConfounders != 0) {
            vmessage(c("Out of these affected traits (", traitsAffected, "), ", 
                       ceiling(pTraitIndependentConfounders * traitsAffected), 
                       " trait(s)",
                       " will have independent covariate effects"), sep="",
                     verbose=verbose)
        }
        
        Cshared <- NULL
        Cindependent <- NULL
        
        if (NrSharedConfounders != 0) {
            shared <- matrix(simulateDist(N * NrSharedConfounders, 
                                          dist=distConfounders, m=mConfounders, 
                                          std=sdConfounders, 
                                          categories=catConfounders, 
                                          prob=probConfounders), 
                             ncol=NrSharedConfounders)
            colnames(shared) <- paste("sharedConfounder", NrOfEffect, "_", 
                                      distConfounders, 
                                      seq(1, NrSharedConfounders, 1), sep="")
            
            if (distBeta == "unif") {
                beta_exp <- rexp(NrSharedConfounders) %*% t(rexp(traitsAffected))
                ## transfrom to uniform with mean=m, with m*x=0.5
                beta_unif <- exp(-beta_exp/mean(beta_exp))
                multiplicative = 2*sdBeta/(max(beta_unif)-min(beta_unif))
                additive = mBeta + sdBeta - multiplicative * max(beta_unif)
                beta_shared  = multiplicative * beta_unif + additive
            }
            if (distBeta == "norm") {
                beta_shared <- simulateDist(NrSharedConfounders,  dist=distBeta, 
                                            m=mBeta, std=sdBeta) %*% 
                    t(abs(simulateDist(traitsAffected, dist=distBeta, m=0, 
                                       std=1)))
            }
            rownames(beta_shared) <- paste("sharedConfounder", NrOfEffect, "_", 
                                           distConfounders,
                                           "_Beta_", distBeta, 
                                           seq(1, NrSharedConfounders, 1), sep="")
            
            if (P != traitsAffected) {
                beta_shared <- cbind(beta_shared, 
                                     matrix(0, ncol=P-traitsAffected, 
                                            nrow=NrSharedConfounders))
            }
            
            Cshared <- shared %*% beta_shared
            colnames(Cshared) <- id_phenos
            rownames(Cshared) <- id_samples
            
            cov <- data.frame(shared)
            rownames(cov) <- id_samples
            cov_effect <- data.frame(t(beta_shared))
            rownames(cov_effect) <- id_phenos
        } 
        if (NrIndependentConfounders != 0) {
            independent <- matrix(simulateDist(N * NrIndependentConfounders, 
                                               dist=distConfounders, 
                                               m=mConfounders, 
                                               std=sdConfounders, 
                                               categories=catConfounders, 
                                               prob=probConfounders), 
                                  ncol=NrIndependentConfounders)
            colnames(independent) <- paste("independentConfounder", NrOfEffect, 
                                           "_", distConfounders, 
                                           seq(1, NrIndependentConfounders, 1), 
                                           sep="")
            beta_independent <- matrix(simulateDist(traitsAffected * 
                                                        NrIndependentConfounders, 
                                                    dist=distBeta, m=mBeta, 
                                                    std=sdBeta), 
                                       ncol=traitsAffected)
            rownames(beta_independent) <- paste("independentConfounder", 
                                                NrOfEffect, "_", 
                                                distConfounders,
                                                "_Beta_", distBeta, 
                                                seq(1, NrIndependentConfounders, 
                                                    1), 
                                                sep="")
            
            TraitIndependentConfounders <- ceiling(pTraitIndependentConfounders* 
                                                       traitsAffected)
            if (keepSameIndependent) {
                p_nonconfounders <- sample(
                    c(rep(FALSE, TraitIndependentConfounders), 
                      rep(TRUE, (traitsAffected - TraitIndependentConfounders))), 
                    replace=FALSE)
                p_nonconfounders <- matrix(rep(p_nonconfounders, 
                                               NrIndependentConfounders), 
                                           NrIndependentConfounders, 
                                           byrow = TRUE)
            } else {
                p_nonconfounders <- t(sapply(1:ncol(independent), function(x) {
                    sample(c(rep(FALSE, TraitIndependentConfounders), 
                             rep(TRUE, 
                                 (traitsAffected - TraitIndependentConfounders))
                    ), 
                    replace=FALSE)
                }))
            }
            
            beta_independent[p_nonconfounders] <- 0
            
            if (P != traitsAffected) {
                beta_independent <- cbind(beta_independent, 
                                          matrix(0, ncol=P-traitsAffected, 
                                                 nrow=NrIndependentConfounders))
            }
            
            
            Cindependent <- independent %*% beta_independent
            colnames(Cindependent) <- id_phenos
            rownames(Cindependent) <- id_samples
            
            cov <- data.frame(independent)
            rownames(cov) <- id_samples
            cov_effect <- data.frame(t(beta_independent))
            rownames(cov_effect) <- id_phenos
        }
        if (NrSharedConfounders != 0 && NrIndependentConfounders != 0) {
            cov <- data.frame(shared, independent)
            rownames(cov) <- id_samples
            cov_effect <- data.frame(t(beta_shared), t(beta_independent))
            rownames(cov_effect) <- id_phenos
        }
        return(list(shared=Cshared, independent=Cindependent, cov=cov, 
                    cov_effect=cov_effect))
    }
    
    param_length <- c(pTraitsAffected=length(pTraitsAffected), 
                      pIndependentConfounders=
                          length(pIndependentConfounders),
                      pTraitIndependentConfounders=
                          length(pTraitIndependentConfounders),
                      keepSameIndependent=length(keepSameIndependent),
                      distConfounders=length(distConfounders),
                      mConfounders=length(mConfounders),
                      sdConfounders=length(sdConfounders),
                      probConfounders=length(probConfounders),
                      catConfounders=length(catConfounders),
                      distBeta=length(distBeta),
                      mBeta=length(mBeta),
                      sdBeta=length(sdBeta))
    
    common <- c(pTraitsAffected=length(pTraitsAffected), 
                pIndependentConfounders=
                    length(pIndependentConfounders),
                pTraitIndependentConfounders=
                    length(pTraitIndependentConfounders),
                keepSameIndependent=length(keepSameIndependent),
                distConfounders=length(distConfounders),
                distBeta=length(distBeta),
                mBeta=length(mBeta),
                sdBeta=length(sdBeta))
    
    norm_unif <- c(mConfounders=length(mConfounders),
                   sdConfounders=length(sdConfounders))
    norm_unif_dist <- length(which(grepl("^unif", distConfounders))) + 
        length(which(grepl("^norm",distConfounders)))
    
    bin <- c(probConfounders=length(probConfounders))
    bin_dist <- length(which(grepl("bin", distConfounders)))
    
    categ <- c(catConfounders=length(catConfounders))
    categ_dist <- length(which(grepl("cat", distConfounders)))
    
    if (NrFixedEffects != 1 && all(param_length <= 1)) {
        stop("NrFixedEffects specified to greater than 1 (i.e. more than one ",
             "type of fixed effects), but not enough parameters specified for ",
             "multiple fixed effects")
    }
    check_common <- sapply(seq_along(common), function(x){
        if (common[x]  > 1 && common[x] != NrFixedEffects) {
            stop("Length of ", names(common)[x], " (", common[x], 
                 ") doesn't match NrFixedEffects (", NrFixedEffects, ")")
        }
    })
    check_nu <- sapply(seq_along(norm_unif), function(x){
        if (norm_unif[x]  > 1 && norm_unif[x] != norm_unif_dist) {
            stop("Length of ", names(norm_unif)[x], " (", norm_unif[x], 
                 ") doesn't match unif/norm distConfounder length (", 
                 norm_unif_dist, ")")
        }
    })
    if (bin > 1 && bin != bin_dist) {
        stop("Length of ", names(bin), " (", bin, 
             ") doesn't match bin distConfounder length (", 
             bin_dist, ")")
    }   
    if (categ > 1 && categ != categ_dist) {
        stop("Length of ", names(categ), " (", categ, 
             ") doesn't match cat_unif/cat_norm distConfounder length (", 
             categ_dist, ")")
    } 
    
    if (NrFixedEffects != 1)  { 
        un <- intersect(which(!grepl("cat", distConfounders)),
                        which(!grepl("bin", distConfounders)))
        if (length(un) != 0) {
            tmp_m_un <- rep(9, length(distConfounders))
            tmp_sd_un <- rep(9, length(distConfounders))
            tmp_m_un[un] <- mConfounders
            tmp_sd_un[un] <- sdConfounders
            mConfounders <- tmp_m_un
            sdConfounders <- tmp_sd_un
        }
        if (!is.null(catConfounders)) {
            tmp_cat <- rep(9, length(distConfounders))
            tmp_cat[which(grepl("cat", distConfounders))] <- catConfounders
            catConfounders <- tmp_cat
        }
        if (!is.null(probConfounders)) {
            tmp_bin <- rep(0, length(distConfounders))
            tmp_bin[which(grepl("bin", distConfounders))] <- probConfounders
            probConfounders <- tmp_bin
        }
        
        if (length(pTraitsAffected) == 1) {
            pTraitsAffected <- rep(pTraitsAffected, 
                                   NrFixedEffects)
        }
        if (length(pIndependentConfounders) == 1) {
            pIndependentConfounders <- rep(pIndependentConfounders, 
                                           NrFixedEffects)
        }
        if (length(pTraitIndependentConfounders) == 1) {
            pTraitIndependentConfounders <- rep(pTraitIndependentConfounders, 
                                                NrFixedEffects)
        }
        if (length(keepSameIndependent) == 1) {
            keepSameIndependent <- rep(keepSameIndependent, NrFixedEffects)
        }
        if (length(NrConfounders) == 1) {
            NrConfounders <- rep(NrConfounders, NrFixedEffects)
        }
        if (length(distConfounders) == 1) {
            distConfounders <- rep(distConfounders, NrFixedEffects)
        }
        if (length(mConfounders) == 1) {
            mConfounders <- rep(mConfounders, NrFixedEffects)
        }
        if (length(sdConfounders) == 1) {
            sdConfounders <- rep(sdConfounders, NrFixedEffects)
        }
        if (length(catConfounders) <= 1) {
            catConfounders <- rep(catConfounders, NrFixedEffects)
        }
        if (length(probConfounders) <= 1) {
            probConfounders <- rep(probConfounders, NrFixedEffects)
        }
        if (length(distBeta) == 1) {
            distBeta <- rep(distBeta, NrFixedEffects)
        }
        if (length(mBeta) == 1) {
            mBeta <- rep(mBeta, NrFixedEffects)
        }
        if (length(sdBeta) == 1) {
            sdBeta <- rep(sdBeta, NrFixedEffects)
        }
        
        tmp <- lapply(1:NrFixedEffects, function(x) {
            oneFixedEffectComponent(N=N, P=P, NrOfEffect=x, 
                                    NrConfounders=NrConfounders[x], 
                                    pTraitsAffected=pTraitsAffected[x],
                                    pIndependentConfounders=
                                        pIndependentConfounders[x], 
                                    pTraitIndependentConfounders=
                                        pTraitIndependentConfounders[x], 
                                    keepSameIndependent=
                                        keepSameIndependent[x],
                                    distConfounders=distConfounders[x], 
                                    mConfounders=mConfounders[x], 
                                    sdConfounders=sdConfounders[x], 
                                    catConfounders=catConfounders[x], 
                                    probConfounders=probConfounders[x], 
                                    distBeta=distBeta[x], mBeta=mBeta[x], 
                                    sdBeta=sdBeta[x])
        })
        shared <- addNonNulls(lapply(tmp, function(x) x$shared))
        independent <- addNonNulls(lapply(tmp, function(x) x$independent))
        cov <- do.call(cbind, lapply(tmp, function(x) x$cov))
        cov_effect <- do.call(cbind, lapply(tmp, function(x) x$cov_effect))
        return(list(shared=shared, independent=independent, cov=cov, 
                    cov_effect=cov_effect))
    } else {
        oneFixedEffectComponent(N=N, P=P, NrOfEffect=NULL, 
                                NrConfounders=NrConfounders, 
                                pTraitsAffected=pTraitsAffected,
                                pIndependentConfounders=
                                    pIndependentConfounders, 
                                pTraitIndependentConfounders=
                                    pTraitIndependentConfounders,
                                keepSameIndependent=keepSameIndependent,
                                distConfounders=distConfounders, 
                                mConfounders=mConfounders, 
                                sdConfounders=sdConfounders, 
                                catConfounders=catConfounders, 
                                probConfounders=probConfounders, 
                                distBeta=distBeta, mBeta=mBeta, sdBeta=sdBeta) 
    } 
}


#' Simulate infinitesimal genetic effects (reflecting sample kinship).
#'
#' geneticBgEffects simulates an infinitesimal genetic effects with a proportion 
#' of the effect shared across samples and a proportion independent across 
#' samples; they are based on the kinship estimates of the (simulated) samples.
#'
#' @param P Number [integer] of phenotypes to simulate .
#' @param N Number [integer] of samples to simulate; has to be provided as a 
#' dimnesionality check for kinship and downstream analyses; nrow(kinship) has 
#' to be equal to N.
#' @param kinship [N x N] Matrix of kinship estimates [double].
#' @param independent [bool] independent effect simulated if set to TRUE.
#' @param shared [bool] shared effect simulated if set to TRUE; at least one of 
#' shared or independent has to be set to TRUE.
#' @param phenoID Prefix [string] for naming traits.
#' @param id_samples Vector of [NrSamples] sample IDs [string]; if not provided
#' colnames(kinship) are used.
#' @param id_phenos Vector of [NrTraits] phenotype IDs [string]; if not provided
#' constructed by paste(phenoID, 1:P, sep="").
#' @return Named list of shared infinitesimal genetic effects (shared: [N x P] 
#' matrix) and independent infinitesimal genetic effects (independent: [N x P] 
#' matrix), the covariance term of the shared effect (cov_shared: [P x P] 
#' matrix) and the covariance term of the independent effect (cov_independent: 
#' [P x P] matrix).
#' @details For the simulation of the infinitesimal genetic effects, three 
#' matrix components are used: i) the kinship matrix K [N x N] which is treated 
#' as the sample design matrix, ii) matrix B [N x P] with vec(B) drawn from a 
#' normal distribution and iii) the trait design matrix A [P x P]. For the
#' independent effect, A is a diagonal matrix with normally distributed values.  
#' A for the shared effect is a matrix of rowrank one, with normally distributed 
#' entries in row 1 and zeros elsewhere. To construct the final effects, the 
#' three matrices are multiplied as: E = cholesky(K)BA^T. 
#' @export
#' @examples
#' genotypes <- simulateGenotypes(N=100, NrSNP=400, verbose=FALSE)
#' kinship <- getKinship(N=100, X=genotypes$genotypes, standardise=TRUE, 
#' verbose=FALSE)
#' geneticBg <- geneticBgEffects(N=100, P=10, kinship=kinship)
geneticBgEffects <- function(P, N, kinship, phenoID="Trait_",
                             id_samples = colnames(kinship), 
                             shared=TRUE, independent=TRUE,
                             id_phenos = NULL) {
    numbers <- list(N=N, P=P)
    positives <- list(P=P, N=N)
    integers <- list(P=P, N=N)
    testNumerics(numbers=numbers, positives=positives, integers=integers)
    if (diff(dim(kinship)) !=0) {
        stop ("Kinship matrix needs to be a square matrix ,",
              "however it has ", nrow(kinship), " rows and ", ncol(kinship), 
              " columns")
    }
    if (N != ncol(kinship)) {
        stop ("The number of samples specified (", N, ") and the number of ",
              "samples in the kinship matrix (", ncol(kinship), 
              ") are different.")
    }
    if (length(id_samples) !=  N) {
        stop("Length of id_samples (", length(id_samples), ") is different ",
             "from N (", N, "). Does you kinship matrix contain column names ",
             "(default for retrieving id_samples)?")
    }
    if (!is.null(id_phenos) && length(id_phenos) !=  P) {
        stop("Length of id_phenos (", length(id_phenos), ") is different ",
             "from P (", P, ")")
    }
    if (!(is.character(phenoID) && length(phenoID) == 1)) {
        stop("phenoID has to be of length 1 and of type character")
    }
    if (is.null(id_phenos)) id_phenos <- paste(phenoID, 1:P, sep="")
    
    kin_eigen <- as.complex(eigen(kinship, only.values = TRUE)$values)
    if (any(Re(kin_eigen) <= 0)) {
        stop("Kinship matrix is not positive semi-definite")
    }
    kinship_chol <- t(chol(kinship))
    N <- ncol(kinship)
    
    if (!shared && !independent) {
        stop("At least one geneticBgEffect has to be specified; set either ", 
             "shared or independent or both to TRUE")
    }
    # shared effect
    if (shared) {
        B <- matrix(rnorm(N * P), ncol=P)
        A <- matrix(rep(0, P * P), ncol=P)
        A[,1] <- rnorm(P)
        genBgShared <- kinship_chol %*% (B %*% t(A))
        colnames(genBgShared) <- id_phenos
        rownames(genBgShared) <- id_samples
        
        cov_shared <- tcrossprod(A)
        diag(cov_shared) <- diag(cov_shared) + 1e-4
    } else {
        genBgShared <- NULL
        cov_shared <- NULL
    }
    
    # independent effect
    if (independent) {
        D <- matrix(rnorm(N * P), ncol=P)
        C <- matrix(rep(0, P * P), ncol=P)
        diag(C) <- rnorm(P)
        genBgIndependent <- kinship_chol %*% (D %*% C)
        colnames(genBgIndependent) <- id_phenos
        rownames(genBgIndependent) <- id_samples
        
        cov_independent <- tcrossprod(C)
        diag(cov_independent) <- diag(cov_independent) + 1e-4
    } else {  
        genBgIndependent <- NULL
        cov_independent <- NULL
    }
    return(list(shared=genBgShared, independent=genBgIndependent, 
                cov_shared=cov_shared, cov_independent=cov_independent))
}

#' Simulate observational noise effects.
#'
#' noiseBgEffects simulates observational noise with a proportion of the effect 
#' shared across samples and a proportion independent across samples.
#'
#' @param P Number [integer] of phenotypes to simulate. 
#' @param N Number [integer] of samples to simulate.
#' @param mean Mean [double] of the normal distribution.
#' @param sd Standard deviation [double] of the normal distribution.
#' @param independent [bool] independent effect simulated if set to TRUE.
#' @param shared [bool] shared effect simulated if set to TRUE; at least one of 
#' shared or independent has to be set to TRUE.
#' @param sampleID Prefix [string] for naming samples.
#' @param phenoID Prefix [string] for naming traits.
#' @param id_samples Vector of [NrSamples] sample IDs [string]; if not provided
#' constructed by paste(sampleID, 1:N, sep="").
#' @param id_phenos Vector of [NrTraits] phenotype IDs [string]; if not provided
#' constructed by paste(phenoID, 1:P, sep="").
#' @return Named list of shared noise effects (shared: [N x P] matrix) and 
#' independent noise effects (independent: [N x P] matrix), the covariance term 
#' of the shared effect (cov_shared: [P x P] matrix) and the covariance term of 
#' the independent effect (cov_independent: [P x P] matrix).
#' @details For the simulation of the observational noise effects, two 
#' components are used: i) matrix B [N x P] with vec(B) drawn from a normal 
#' distribution with mean=mean and sd=sd and ii) the trait design matrix A 
#' [P x P]. For the independent effect, A is a diagonal matrix with normally 
#' distributed values.  A for the shared effect is a matrix of rowrank one, with 
#' normally distributed entries in row 1 and zeros elsewhere.  To construct the 
#' final effects, the two matrices are multiplied as: E = BA^T. 
#' @export
#' @examples
#' noiseBG <- noiseBgEffects(N=100, P=20, mean=2)
noiseBgEffects <- function(N, P, mean=0, sd=1, sampleID="ID_", phenoID="Trait_",
                           shared=TRUE, independent=TRUE,
                           id_samples = NULL,
                           id_phenos = NULL) {
    numbers <- list(N=N, P=P, sd)
    positives <- list(P=P, N=N, sd=sd)
    integers <- list(P=P, N=N)
    testNumerics(numbers=numbers, positives=positives, integers=integers)
    
    if (!is.null(id_samples) && length(id_samples) !=  N) {
        stop("Length of id_samples (", length(id_samples), ") is different ",
             "from N (", N, ")")
    }
    if (!is.null(id_phenos) && length(id_phenos) !=  P) {
        stop("Length of id_phenos (", length(id_phenos), ") is different ",
             "from P (", P, ")")
    }
    if (!(is.character(phenoID) && length(phenoID) == 1)) {
        stop("phenoID has to be of length 1 and of type character")
    }
    if (!(is.character(sampleID) && length(sampleID) == 1)) {
        stop("sampleID has to be of length 1 and of type character")
    }
    if (is.null(id_samples)) id_samples <- paste(sampleID, 1:N, sep="")
    if (is.null(id_phenos)) id_phenos <- paste(phenoID, 1:P, sep="")
    
    if (!shared && !independent) {
        stop("At least one noiseBgEffect has to be specified; set either ", 
             "shared or independent or both to TRUE")
    }
    # shared effect
    if (shared) {
        B <- matrix(rnorm(N * P, mean=mean, sd=sd), ncol=P)
        A <- matrix(rep(0, P * P), ncol=P)
        A[,1] <- rnorm(P, mean=mean, sd=sd)
        noiseBgShared <-  B %*% t(A)
        colnames(noiseBgShared) <- id_phenos
        rownames(noiseBgShared) <- id_samples
        
        cov_shared <- tcrossprod(A)
        diag(cov_shared) <- diag(cov_shared) + 1e-4
    } else {
        noiseBgShared <- NULL
        cov_shared <- NULL
    }
    
    # independent effect
    if (independent) {
        D <- matrix(rnorm(N * P, mean=mean, sd=sd), ncol=P)
        C <- matrix(rep(0, P * P), ncol=P)
        diag(C) <- rnorm(P, mean=mean, sd=sd)
        noiseBgIndependent <- D %*% C
        colnames(noiseBgIndependent) <- id_phenos
        rownames(noiseBgIndependent) <- id_samples
        
        cov_independent <- tcrossprod(C)
        diag(cov_independent) <- diag(cov_independent) + 1e-4
    } else {
        noiseBgIndependent <- NULL
        cov_independent <- NULL
    }
    return(list(shared=noiseBgShared, independent=noiseBgIndependent,
                cov_shared=cov_shared, cov_independent=cov_independent))
}

#' Simulate correlated background effects.
#'
#' correlatedBgEffects computes a background effect that simulates structured 
#' correlation between the phenotypes.
#' 
#' @param N Number [integer] of samples to simulate.
#' @param P Number [integer] of phenotypes to simulate. 
#' @param corr_mat [P x P] correlation matrix [double] as covariance 
#' component for the multivariate normal distribution. If not provided, pcorr is
#' used to construct the correlation matrix.
#' @param pcorr Initial strength of correlation [double] between neighbouring 
#' traits. Decreases by pcorr^(distance); distance from 0 to P-1. See details.
#' @param sampleID Prefix [string] for naming samples.
#' @param phenoID Prefix [string] for naming traits.
#' @param id_samples Vector of [NrSamples] sample IDs [string]; if not provided
#' constructed by paste(sampleID, 1:N, sep="").
#' @param id_phenos Vector of [NrTraits] phenotype IDs [string]; if not provided
#' constructed by paste(phenoID, 1:P, sep="").
#' @param verbose [boolean] If TRUE, progress info is printed to standard out.
#' @return Named list with [N x P] matrix of correlated background effects (
#' correlatedBg) and the correlation matrix (cov_correlated). If corr_mat 
#' provided corr_mat == cov_correlated.
#' @seealso \code{\link[mvtnorm]{rmvnorm}} which is used to simulate the 
#' multivariate normal distribution
#' @details correlatedBgEffects can be used to simulate phenotypes with a 
#' defined level of correlation between traits. If the corr_mat is not provided,
#' a simple correlation structure based on the distance of the traits will be 
#' constructed. Traits of distance d=1 (adjacent columns) will have correlation 
#' cor=\eqn{pcorr^1}{pcorr^1}, traits with d=2 have cor=\eqn{pcorr^2}{pcorr^2} 
#' up to traits with d=(P-1) cor=\eqn{pcorr^{(P-1)}}{pcorr^{(P-1)}} and 
#' 0 < pcorr < 1. The correlated background effect correlated is simulated based 
#' on this correlation structure C: 
#' \eqn{correlated ~ N_{NP}(0,C)}{correlated ~ N_{NP}(0,C)}.  
#' @export
#' @examples
#' correlatedBg <- correlatedBgEffects(N=100, P=20, pcorr=0.4)
correlatedBgEffects <- function(N, P, pcorr=NULL, corr_mat=NULL,
                                sampleID="ID_", phenoID="Trait_",
                                id_samples = NULL,
                                id_phenos = NULL,
                                verbose=FALSE) {
    numbers <- list(N=N, P=P)
    positives <- list(P=P, N=N)
    testNumerics(numbers=numbers, positives=positives)
    if (!is.null(id_samples) && length(id_samples) !=  N) {
        stop("Length of id_samples (", length(id_samples), ") is different ",
             "from N (", N, ")")
    }
    if (!is.null(id_phenos) && length(id_phenos) !=  P) {
        stop("Length of id_phenos (", length(id_phenos), ") is different ",
             "from P (", P, ")")
    }
    if (!(is.character(phenoID) && length(phenoID) == 1)) {
        stop("phenoID has to be of length 1 and of type character")
    }
    if (!(is.character(sampleID) && length(sampleID) == 1)) {
        stop("sampleID has to be of length 1 and of type character")
    }
    if (is.null(id_samples)) id_samples <- paste(sampleID, 1:N, sep="")
    if (is.null(id_phenos)) id_phenos <- paste(phenoID, 1:P, sep="")
    
    if(!is.null(corr_mat) && !is.null(pcorr)) {
        vmessage(c("Both pcorr (", pcorr, ") and corr_mat provided; corr_mat",
                   "will be used to simulate the correlatedBgEffects"), 
                 verbose=verbose)
    }
    if(is.null(corr_mat)) {
        if (is.null(pcorr)) {
            stop("At least one of pcorr or corr_mat have to be provided to", 
                 " simulated the correlatedBgEffects")
        }
        if (!is.numeric(pcorr)) {
            stop("pcorr has to be of type numeric")
        }
        if (length(pcorr) > 1) {
            stop("only a single pcorr value (double) can be specified")
        }
        if (pcorr <= 0 || pcorr >= 1) {
            stop("pcorr has to be greater than zero and less than 1")
        }
        corr_vec <- cumprod(rep(pcorr, P - 1))
        tri_corr_vec <- unlist(sapply(0:(length(corr_vec) -1), function(pos) {
            return(corr_vec[1:(length(corr_vec) - pos)])
        }))

        corr_mat <- diag(P)
        corr_mat[lower.tri(corr_mat, diag=FALSE)] <- tri_corr_vec
        corr_mat <- corr_mat + t(corr_mat) - diag(P)
    } else {
        corr_mat <- as.matrix(corr_mat)
        if (diff(dim(corr_mat)) != 0) {
            stop(paste("Correlation matrix needs to be a square matrix,",
                       "however it has", nrow(corr_mat), "rows and", 
                       ncol(corr_mat), "columns"))
        }
        if (ncol(corr_mat) != P) {
            stop("Dimensions of correlation matrix (", 
                 paste(dim(corr_mat), collapse=","), ") do ", 
                 "not match the number phenotypes (", P, ") to be simulated")
        }
    }
    pheno <- mvtnorm::rmvnorm(N, rep(0,P), corr_mat)
    colnames(pheno) <- id_phenos
    rownames(pheno) <- id_samples
    
    return(list(correlatedBg=pheno, cov_correlated=corr_mat))
}

Try the PhenotypeSimulator package in your browser

Any scripts or data that you put into this service are public.

PhenotypeSimulator documentation built on May 14, 2018, 1:04 a.m.