Simulation-and-LinearModel.R
In PhenotypeSimulator: Flexible Phenotype Simulation from Different Genetic and Noise Models

## ---- echo = FALSE, message=FALSE---------------------------------------------
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
library("PhenotypeSimulator")
library("ggplot2")

## ---- eval=FALSE--------------------------------------------------------------
#  # load kinship data, add small value to diagonal for numerical stability and
#  # do an eigendecomposition
#  indir <- "~/data/hmeyer/Supplementary/CEU.0908.impute.files"
#  
#  kinship <- read.table(paste(indir,
#                              "/genotypes_genome_hapgen.controls.grm.rel",sep=""),
#                        sep="\t", header=FALSE)
#  
#  kinship <- as.matrix(kinship)
#  diag(kinship) <- diag(kinship) + 1e-4
#  
#  kinship_decomposed <- eigen(kinship)
#  write.table(kinship,
#  			paste(indir, "/genotypes_genome_hapgen.controls.grm.rel",
#  				  sep=""),
#  			sep="\t", col.names=FALSE, row.names=FALSE)
#  
#  write.table(kinship_decomposed$values,
#  			paste(indir, "/genotypes_eigenvalues",
#  				  sep=""),
#  			sep="\t", col.names=FALSE, row.names=FALSE)
#  
#  write.table(kinship_decomposed$vectors,
#  			paste(indir, "/genotypes_eigenvectors",
#  				  sep=""),
#  			sep="\t", col.names=FALSE, row.names=FALSE)

## ---- eval=FALSE--------------------------------------------------------------
#  indir <- "/homes/hannah/data/hmeyer/Supplementary/CEU.0908.impute.files"
#  
#  # specify directory to save data; if it doesn't exist yet, create i.e. needs
#  # writing permissions
#  datadir <- '/homes/hannah/tmp/phenotypeSimulator'
#  if (!dir.exists(datadir)) dir.create(datadir, recursive=TRUE)
#  
#  # specify filenames and parameters
#  totalGeneticVar <- 0.4
#  totalSNPeffect <- 0.1
#  h2s <- totalSNPeffect/totalGeneticVar
#  kinshipfile <- paste(indir, "/genotypes_genome_hapgen.controls.grm.rel",
#                                  sep="")
#  
#  genoFilePrefix <- paste(indir, "/genotypes_", sep="")
#  genoFileSuffix <- "_hapgen.controls.gen"
#  
#  # simulate phenotype with three phenotype components
#  simulation <- runSimulation(N = 1000, P = 3, cNrSNP=10, seed=43,
#                             kinshipfile = kinshipfile,
#                             format = "oxgen",
#                             genoFilePrefix = genoFilePrefix,
#                             genoFileSuffix = genoFileSuffix,
#                             chr = c(5,7,11),
#                             mBetaGenetic = 0, sdBetaGenetic = 0.2,
#                             theta=1,
#                             NrSNPsOnChromosome=c(533966, 503129, 428863),
#                             genVar = totalGeneticVar, h2s = h2s,
#                             phi = 0.6, delta = 0.2, rho=0.2,
#                             NrFixedEffects = 2, NrConfounders = c(2, 2),
#                             distConfounders = c("bin", "norm"),
#                             probConfounders = 0.2,
#                             genoDelimiter=" ",
#                             kinshipDelimiter="\t",
#                             kinshipHeader=FALSE,
#                             verbose = TRUE )

## ---- eval=FALSE, echo=FALSE--------------------------------------------------
#  savedir <- paste(system.file("extdata", package="PhenotypeSimulator"),
#                              "/resultsSimulationAndLinearModel", sep="")
#  saveRDS(simulation$phenoComponentsFinal,
#          paste(savedir, "/simulation_phenoComponentsFinal.rds", sep=""))

## ---- echo=FALSE--------------------------------------------------------------
savedir <- paste(system.file("extdata", package="PhenotypeSimulator"), 
                            "/resultsSimulationAndLinearModel", sep="")
simulation <- list(
    phenoComponentsFinal=readRDS(paste(savedir, 
                                       "/simulation_phenoComponentsFinal.rds", 
                            sep="")))

## ----heatmaps, fig.keep='all', fig.height=3, fig.width=7, eval=TRUE, fig.cap="\\label{fig:heatmaps}Heatmaps of the trait-by-trait correlation (Pearson correlation) of the simulated phenotype and its five phenotype components."----
cor_phenotype <- reshape2::melt(cor(simulation$phenoComponentsFinal$Y))
cor_phenotype$type <- "Y" 

cor_genBg <- reshape2::melt(cor(simulation$phenoComponentsFinal$Y_genBg))
cor_genBg$type <- "U" 

cor_genFixed <- reshape2::melt(cor(simulation$phenoComponentsFinal$Y_genFixed))          
cor_genFixed$type <- "XB" 

cor_noiseFixed <- reshape2::melt(cor(simulation$phenoComponentsFinal$Y_noiseFixed))          
cor_noiseFixed$type <- "WA" 

cor_noiseBg <- reshape2::melt(cor(simulation$phenoComponentsFinal$Y_noiseBg))          
cor_noiseBg$type <- "Psi" 

cor_correlatedBg <- reshape2::melt(cor(simulation$phenoComponentsFinal$Y_correlatedBg))          
cor_correlatedBg$type <- "T" 

cor_components <- rbind(cor_phenotype, cor_genBg, cor_genFixed, cor_noiseBg, 
                        cor_noiseFixed, cor_correlatedBg)
cor_components$type <- factor(cor_components$type, levels=c("Y", 
                                                            "WA",
                                                            "XB", 
                                                            "U", 
                                                            "T",
															"Psi"),
													labels=c("bold(Y)", 
                                                            "bold(WA)",
                                                            "bold(XB)", 
                                                            "bold(U)", 
                                                            "bold(T)",
															"bold(Psi)"))
colnames(cor_components) <- c("TraitA", "TraitB", "Correlation", "type")

# For prettier plotting
cor_components$TraitA <- gsub("_", " ", cor_components$TraitA)
cor_components$TraitB <- gsub("_", " ", cor_components$TraitB)

p_corr <- ggplot(data=cor_components, aes(x=TraitA, y=TraitB, fill=Correlation))  
p_corr <- p_corr + geom_tile() +
    facet_grid(~ type, labeller = label_parsed) +
    scale_x_discrete(expand = c(0,0)) +
    scale_y_discrete(expand = c(0,0)) +
    scale_fill_gradient2(low="#F2AD00", high="#5BBCD6" , mid="white",
                    limits = c(-1,1), 
                    breaks = c(-1, -0.5, 0, 0.5, 1),
                    labels = c(-1, -0.5, 0, 0.5, 1),
                    guide=guide_colorbar(direction = "horizontal",
                                         title.position="top",
                                         title.hjust =0.5)) + 
    xlab(" ") +
    ylab(" ") +
    theme_bw() +
    coord_fixed() +
    theme(axis.text.x = element_text(angle=90, vjust=0.5),
			legend.position = "bottom",
			strip.background = element_rect(fill="white", color="white"),
            axis.text  =element_text(size=8),
            legend.title  =element_text(size=8),
            legend.text  =element_text(size=8),
            strip.text  =element_text(size=8))
print(p_corr)

## ---- echo=FALSE, eval=FALSE--------------------------------------------------
#  ggsave(plot=p_corr,
#          filename=paste(savedir, "/correlation_phenotype.pdf", sep=""),
#          units="mm", width=86, height=60)

## ---- eval=FALSE--------------------------------------------------------------
#  # Save phenotypes, kinship and non-genetic covariates in csv and
#  # GEMMA-specific format
#  outdirectory <- savePheno(simulation, directory = datadir,
#                              intercept_gemma=TRUE, format=c("csv", "gemma"),
#                              saveIntermediate=TRUE)

## ---- eval=FALSE--------------------------------------------------------------
#  datadir="/homes/hannah/tmp/phenotypeSimulator"
#  
#  causalSNPs <- fread(paste(datadir, "/SNP_NrSNP10.csv", sep=""),
#  					header=TRUE, sep=",", stringsAsFactors=FALSE,
#  					data.table=FALSE)
#  causalSNPnames <- colnames(causalSNPs)
#  
#  LMM_mt <- fread(paste(datadir, "/GWAS_gemma_LMM_mt_pvalues.txt", sep=""),
#  				data.table=FALSE, header=FALSE, stringsAsFactors=FALSE)
#  LMM_st_trait1 <- fread(paste(datadir, "/GWAS_gemma_LMM_st_trait1_pvalues.txt",
#                              sep=""),
#  				data.table=FALSE, header=FALSE, stringsAsFactors=FALSE)
#  LMM_st_trait2 <- fread(paste(datadir, "/GWAS_gemma_LMM_st_trait2_pvalues.txt",
#                              sep=""),
#  				data.table=FALSE, header=FALSE, stringsAsFactors=FALSE)
#  LMM_st_trait3 <- fread(paste(datadir, "/GWAS_gemma_LMM_st_trait3_pvalues.txt",
#                              sep=""),
#  				data.table=FALSE, header=FALSE, stringsAsFactors=FALSE)
#  
#  LMM_mt <- LMM_mt[order(LMM_mt[,2]),]
#  LMM_mt$expected <- stats::ppoints(nrow(LMM_mt))
#  LMM_mt$model <- "mvLMM"
#  LMM_mt$trait <- "all"
#  colnames(LMM_mt)[1:2] <- c("rsID", "observed")
#  
#  LMM_st_trait1 <- LMM_st_trait1[order(LMM_st_trait1[,2]),]
#  LMM_st_trait1$expected <- LMM_mt$expected
#  LMM_st_trait1$model <- "uvLMM"
#  LMM_st_trait1$trait <- "trait1"
#  colnames(LMM_st_trait1)[1:2] <- c("rsID", "observed")
#  
#  LMM_st_trait2 <- LMM_st_trait2[order(LMM_st_trait2[,2]),]
#  LMM_st_trait2$expected <- LMM_mt$expected
#  LMM_st_trait2$model <- "uvLMM"
#  LMM_st_trait2$trait <- "trait2"
#  colnames(LMM_st_trait2)[1:2] <- c("rsID", "observed")
#  
#  LMM_st_trait3 <- LMM_st_trait3[order(LMM_st_trait3[,2]),]
#  LMM_st_trait3$expected <- LMM_mt$expected
#  LMM_st_trait3$model <- "uvLMM"
#  LMM_st_trait3$trait <- "trait3"
#  colnames(LMM_st_trait3)[1:2] <- c("rsID", "observed")
#  
#  pvalues <- rbind(LMM_mt, LMM_st_trait1, LMM_st_trait2, LMM_st_trait3)
#  pvalues$causal <- "all"
#  pvalues$causal[which(pvalues$rsID %in% causalSNPnames)] <- "simulated causal"
#  pvalues$causal <- factor(pvalues$causal, levels=c("all", "simulated causal"))
#  
#  p_qq <- ggplot(data=pvalues,
#  				aes(x=-log10(expected), y=-log10(observed)))
#  p_qq <- p_qq + geom_point(aes(color=causal, shape=trait)) +
#      scale_color_manual(values=c('darkgrey', '#1b9e77'),
#                         name="SNPs",
#  					guide=guide_legend(nrow = 2, title.position="top",
#                                         title.hjust =0.5,
#                                         override.aes = list(shape=21))) +
#      scale_shape_manual(values=c(18, 15:17),
#                         name="Traits",
#  					guide=guide_legend(nrow = 2,
#                                         override.aes = list(colour='darkgrey'),
#                                         title.position="top",
#                                         title.hjust =0.5)) +
#      geom_abline(intercept=0, slope=1, col="black") +
#  	geom_point(data=dplyr::filter(pvalues, causal == "simulated causal"),
#  				color='#1b9e77', aes(shape=trait)) +
#      xlab(expression(Expected~~-log[10](italic(p))) ) +
#      ylab(expression(Observed~~-log[10](italic(p)))) +
#      facet_grid(~ model) +
#      theme_bw() +
#  	theme(strip.background = element_rect(fill="white"),
#  			axis.title = element_text(size=12),
#  			axis.text  =element_text(size=10),
#  			legend.title  =element_text(size=12),
#  			legend.text  =element_text(size=10),
#  			strip.text  =element_text(size=10),
#  			legend.position = "bottom",
#              legend.direction = "horizontal")

## ----echo=FALSE, eval=FALSE, out.width='100%'---------------------------------
#  ggsave(plot=p_qq,
#  		filename=paste(savedir, "/gwas_results_gemma_qqplot.png", sep=""),
#  		units="mm", width=86, height=100)

## ----qqplot, echo=FALSE, fig.cap="\\label{fig:qqplot}Quantile-quantile plots of p-values observed from the multivariate linear mixed model (mvLMM, traits:all) and the univariate linear mixed models (uvLMM, traits: trait1/trait2/trait3) fitted to each of about eight million genome-wide SNPs (grey), including the ten SNPs for which a phenotype effect was modelled (green)."----
#knitr::include_graphics("../docs/articles/Simulation-and-LinearModel_files/figure-html/qqplot-1.png")
knitr::include_graphics("gwas_results_gemma_qqplot.png")

## ---- eval=FALSE--------------------------------------------------------------
#  sigSNPs <- dplyr::filter(pvalues, causal == "simulated causal",
#                           observed < 5*10^-8)
#  sigSNPs$observed_adjusted <- sigSNPs$observed
#  sigSNPs$observed_adjusted[which(sigSNPs$trait != "all")] <-
#      3*sigSNPs$observed[which(sigSNPs$trait != "all")]
#  sigSNPs <- dplyr::filter(sigSNPs, observed_adjusted < 5*10^-8)

## ---- echo=FALSE, eval=FALSE--------------------------------------------------
#  write.table(sigSNPs,
#              paste(savedir, "/sigSNPs.csv", sep=""),
#              col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")

## ----sigSNPs, echo=FALSE------------------------------------------------------
                        
sigSNPs <- read.table(paste(savedir, "/sigSNPs.csv", sep=""), header=TRUE, 
                      sep=",")
knitr::kable(sigSNPs[, c(1,4,5,7)], 
             caption="\\label{tab:sigSNPs}P-values from multi-trait GWAS (model == mvLMM) and 
                        single-trait GWAS (model == uvLMM)",
             table.attr = "id=\"sigSNPs\"",
             digits=40)

## ---- eval=FALSE--------------------------------------------------------------
#  # Read the SNPs simulated to have an effect on the phenotype and their SNP
#  # effects
#  SNPs <- read.csv(paste(datadir,"/SNP_NrSNP10.csv", sep=""),
#                   row.names=1)
#  SNPeffect <- read.csv(paste(datadir,"/SNP_effects_NrSNP10.csv", sep=""),
#                        row.names=1)
#  
#  
#  # HapGen simulated SNPs might be encoded as 7-25481146 , which R converts to
#  # X7.25481146 via make.names in read.table; substitute X and . to get original
#  # names
#  SNPnames <- gsub("\\.", "-", gsub("X", "", colnames(SNPs)))
#  SNPnames <- SNPnames[order(SNPnames)]
#  simulated_withEffect <- dplyr::filter(pvalues, rsID %in% SNPnames)

## ---- echo=FALSE, eval=FALSE--------------------------------------------------
#  write.table(simulated_withEffect,
#              paste(savedir,"/simulated_withEffect.csv", sep=""), col.names=TRUE,
#              quote=FALSE,row.names=FALSE)

## ---- echo=FALSE, eval=TRUE---------------------------------------------------
SNPeffect <- read.csv(paste(savedir,"/SNP_effects_NrSNP10.csv", sep=""),
                      row.names=1)
SNPs <- read.csv(paste(savedir,"/SNP_NrSNP10.csv", sep=""),
                 row.names=1)
SNPnames <- gsub("\\.", "-", gsub("X", "", colnames(SNPs)))
SNPnames <- SNPnames[order(SNPnames)]
simulated_withEffect <- read.csv(paste(savedir,"/simulated_withEffect.csv", sep=""))

## ----freq-beta, fig.cap="\\label{fig:freq-beta}P-values, allele frequencies and simulated effect sizes of the ten SNPs simulated to have an effect on phenotype."----
# get the allele frequencies of the SNPs simulated to have an effect on the 
# phenotype
freq <- apply(SNPs, 2, getAlleleFrequencies)
minor <- apply(freq, 2, min)
minor <- minor[order(gsub("X", "", names(minor)))]

# format the effect size table and combine with p-value and frequency
# information
effects <- reshape2::melt(SNPeffect, value.name="beta", variable.name="name")
effects$type <- gsub('\\..*', '', effects$name)
effects$rsID <- gsub("\\.", "-", gsub('.*_', '', effects$name))
effects$trait <- rep(paste("trait", 1:3, sep=""), 10)
effects <- effects[order(effects$rsID),]
effects <- dplyr::filter(effects, rsID %in% simulated_withEffect$rsID)

simulated_withEffect <- simulated_withEffect[order(simulated_withEffect[, 1]),]
simulated_withEffect$beta <- NA
simulated_withEffect$beta[simulated_withEffect$trait != "all"] <- effects$beta
simulated_withEffect$freq <- rep(minor[which(SNPnames %in% effects$rsID)], each=4)

# plot association p-values in relation to the absolute value of the simulated 
# effect sizes and the SNPs allele frequencies
p <- ggplot(dplyr::filter(simulated_withEffect, trait != "all"))
p <- p +
    geom_point(aes(x=freq, y=-log10(observed), color=abs(beta), shape=trait)) +
    geom_hline(yintercept=-log10(5*10^(-8)), color="darkgrey") +
    scale_color_gradientn(colours=c('#f0f9e8','#ccebc5','#a8ddb5','#7bccc4',
                                    '#4eb3d3','#2b8cbe','#08589e'),
                          name=expression("| simulated" ~beta~ "|")) +
    scale_shape_discrete(name="Phenotype in GWAS") +
    xlab("Allele frequencies") +
    ylab(expression(-log[10](p-value))) +
    theme_bw()
print(p)

## ----echo=FALSE, eval=FALSE, out.width='100%'---------------------------------
#  ggsave(plot=p,
#  		filename=paste(savedir, "/effectsizes-freq-pvalues.pdf", sep=""),
#  		units="mm", width=150, height=120)
Any scripts or data that you put into this service are public.
PhenotypeSimulator documentation built on July 16, 2021, 5:06 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
PhenotypeSimulator
Flexible Phenotype Simulation from Different Genetic and Noise Models

inst/doc/Simulation-and-LinearModel.R
In PhenotypeSimulator: Flexible Phenotype Simulation from Different Genetic and Noise Models

Try the PhenotypeSimulator package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PhenotypeSimulator Flexible Phenotype Simulation from Different Genetic and Noise Models

inst/doc/Simulation-and-LinearModel.R In PhenotypeSimulator: Flexible Phenotype Simulation from Different Genetic and Noise Models

Try the PhenotypeSimulator package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PhenotypeSimulator
Flexible Phenotype Simulation from Different Genetic and Noise Models

inst/doc/Simulation-and-LinearModel.R
In PhenotypeSimulator: Flexible Phenotype Simulation from Different Genetic and Noise Models