inst/doc/Simulation-and-LinearModel.R

## ---- echo = FALSE, message=FALSE---------------------------------------------
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
library("PhenotypeSimulator")
library("ggplot2")

## ---- eval=FALSE--------------------------------------------------------------
#  # load kinship data, add small value to diagonal for numerical stability and
#  # do an eigendecomposition
#  indir <- "~/data/hmeyer/Supplementary/CEU.0908.impute.files"
#  
#  kinship <- read.table(paste(indir,
#                              "/genotypes_genome_hapgen.controls.grm.rel",sep=""),
#                        sep="\t", header=FALSE)
#  
#  kinship <- as.matrix(kinship)
#  diag(kinship) <- diag(kinship) + 1e-4
#  
#  kinship_decomposed <- eigen(kinship)
#  write.table(kinship,
#  			paste(indir, "/genotypes_genome_hapgen.controls.grm.rel",
#  				  sep=""),
#  			sep="\t", col.names=FALSE, row.names=FALSE)
#  
#  write.table(kinship_decomposed$values,
#  			paste(indir, "/genotypes_eigenvalues",
#  				  sep=""),
#  			sep="\t", col.names=FALSE, row.names=FALSE)
#  
#  write.table(kinship_decomposed$vectors,
#  			paste(indir, "/genotypes_eigenvectors",
#  				  sep=""),
#  			sep="\t", col.names=FALSE, row.names=FALSE)

## ---- eval=FALSE--------------------------------------------------------------
#  indir <- "/homes/hannah/data/hmeyer/Supplementary/CEU.0908.impute.files"
#  
#  # specify directory to save data; if it doesn't exist yet, create i.e. needs
#  # writing permissions
#  datadir <- '/homes/hannah/tmp/phenotypeSimulator'
#  if (!dir.exists(datadir)) dir.create(datadir, recursive=TRUE)
#  
#  # specify filenames and parameters
#  totalGeneticVar <- 0.4
#  totalSNPeffect <- 0.1
#  h2s <- totalSNPeffect/totalGeneticVar
#  kinshipfile <- paste(indir, "/genotypes_genome_hapgen.controls.grm.rel",
#                                  sep="")
#  
#  genoFilePrefix <- paste(indir, "/genotypes_", sep="")
#  genoFileSuffix <- "_hapgen.controls.gen"
#  
#  # simulate phenotype with three phenotype components
#  simulation <- runSimulation(N = 1000, P = 3, cNrSNP=10, seed=43,
#                             kinshipfile = kinshipfile,
#                             format = "oxgen",
#                             genoFilePrefix = genoFilePrefix,
#                             genoFileSuffix = genoFileSuffix,
#                             chr = c(5,7,11),
#                             mBetaGenetic = 0, sdBetaGenetic = 0.2,
#                             theta=1,
#                             NrSNPsOnChromosome=c(533966, 503129, 428863),
#                             genVar = totalGeneticVar, h2s = h2s,
#                             phi = 0.6, delta = 0.2, rho=0.2,
#                             NrFixedEffects = 2, NrConfounders = c(2, 2),
#                             distConfounders = c("bin", "norm"),
#                             probConfounders = 0.2,
#                             genoDelimiter=" ",
#                             kinshipDelimiter="\t",
#                             kinshipHeader=FALSE,
#                             verbose = TRUE )

## ---- eval=FALSE, echo=FALSE--------------------------------------------------
#  savedir <- paste(system.file("extdata", package="PhenotypeSimulator"),
#                              "/resultsSimulationAndLinearModel", sep="")
#  saveRDS(simulation$phenoComponentsFinal,
#          paste(savedir, "/simulation_phenoComponentsFinal.rds", sep=""))

## ---- echo=FALSE--------------------------------------------------------------
savedir <- paste(system.file("extdata", package="PhenotypeSimulator"), 
                            "/resultsSimulationAndLinearModel", sep="")
simulation <- list(
    phenoComponentsFinal=readRDS(paste(savedir, 
                                       "/simulation_phenoComponentsFinal.rds", 
                            sep="")))

## ----heatmaps, fig.keep='all', fig.height=3, fig.width=7, eval=TRUE, fig.cap="\\label{fig:heatmaps}Heatmaps of the trait-by-trait correlation (Pearson correlation) of the simulated phenotype and its five phenotype components."----
cor_phenotype <- reshape2::melt(cor(simulation$phenoComponentsFinal$Y))
cor_phenotype$type <- "Y" 

cor_genBg <- reshape2::melt(cor(simulation$phenoComponentsFinal$Y_genBg))
cor_genBg$type <- "U" 

cor_genFixed <- reshape2::melt(cor(simulation$phenoComponentsFinal$Y_genFixed))          
cor_genFixed$type <- "XB" 

cor_noiseFixed <- reshape2::melt(cor(simulation$phenoComponentsFinal$Y_noiseFixed))          
cor_noiseFixed$type <- "WA" 

cor_noiseBg <- reshape2::melt(cor(simulation$phenoComponentsFinal$Y_noiseBg))          
cor_noiseBg$type <- "Psi" 

cor_correlatedBg <- reshape2::melt(cor(simulation$phenoComponentsFinal$Y_correlatedBg))          
cor_correlatedBg$type <- "T" 

cor_components <- rbind(cor_phenotype, cor_genBg, cor_genFixed, cor_noiseBg, 
                        cor_noiseFixed, cor_correlatedBg)
cor_components$type <- factor(cor_components$type, levels=c("Y", 
                                                            "WA",
                                                            "XB", 
                                                            "U", 
                                                            "T",
															"Psi"),
													labels=c("bold(Y)", 
                                                            "bold(WA)",
                                                            "bold(XB)", 
                                                            "bold(U)", 
                                                            "bold(T)",
															"bold(Psi)"))
colnames(cor_components) <- c("TraitA", "TraitB", "Correlation", "type")

# For prettier plotting
cor_components$TraitA <- gsub("_", " ", cor_components$TraitA)
cor_components$TraitB <- gsub("_", " ", cor_components$TraitB)

p_corr <- ggplot(data=cor_components, aes(x=TraitA, y=TraitB, fill=Correlation))  
p_corr <- p_corr + geom_tile() +
    facet_grid(~ type, labeller = label_parsed) +
    scale_x_discrete(expand = c(0,0)) +
    scale_y_discrete(expand = c(0,0)) +
    scale_fill_gradient2(low="#F2AD00", high="#5BBCD6" , mid="white",
                    limits = c(-1,1), 
                    breaks = c(-1, -0.5, 0, 0.5, 1),
                    labels = c(-1, -0.5, 0, 0.5, 1),
                    guide=guide_colorbar(direction = "horizontal",
                                         title.position="top",
                                         title.hjust =0.5)) + 
    xlab(" ") +
    ylab(" ") +
    theme_bw() +
    coord_fixed() +
    theme(axis.text.x = element_text(angle=90, vjust=0.5),
			legend.position = "bottom",
			strip.background = element_rect(fill="white", color="white"),
            axis.text  =element_text(size=8),
            legend.title  =element_text(size=8),
            legend.text  =element_text(size=8),
            strip.text  =element_text(size=8))
print(p_corr)

## ---- echo=FALSE, eval=FALSE--------------------------------------------------
#  ggsave(plot=p_corr,
#          filename=paste(savedir, "/correlation_phenotype.pdf", sep=""),
#          units="mm", width=86, height=60)

## ---- eval=FALSE--------------------------------------------------------------
#  # Save phenotypes, kinship and non-genetic covariates in csv and
#  # GEMMA-specific format
#  outdirectory <- savePheno(simulation, directory = datadir,
#                              intercept_gemma=TRUE, format=c("csv", "gemma"),
#                              saveIntermediate=TRUE)

## ---- eval=FALSE--------------------------------------------------------------
#  datadir="/homes/hannah/tmp/phenotypeSimulator"
#  
#  causalSNPs <- fread(paste(datadir, "/SNP_NrSNP10.csv", sep=""),
#  					header=TRUE, sep=",", stringsAsFactors=FALSE,
#  					data.table=FALSE)
#  causalSNPnames <- colnames(causalSNPs)
#  
#  LMM_mt <- fread(paste(datadir, "/GWAS_gemma_LMM_mt_pvalues.txt", sep=""),
#  				data.table=FALSE, header=FALSE, stringsAsFactors=FALSE)
#  LMM_st_trait1 <- fread(paste(datadir, "/GWAS_gemma_LMM_st_trait1_pvalues.txt",
#                              sep=""),
#  				data.table=FALSE, header=FALSE, stringsAsFactors=FALSE)
#  LMM_st_trait2 <- fread(paste(datadir, "/GWAS_gemma_LMM_st_trait2_pvalues.txt",
#                              sep=""),
#  				data.table=FALSE, header=FALSE, stringsAsFactors=FALSE)
#  LMM_st_trait3 <- fread(paste(datadir, "/GWAS_gemma_LMM_st_trait3_pvalues.txt",
#                              sep=""),
#  				data.table=FALSE, header=FALSE, stringsAsFactors=FALSE)
#  
#  LMM_mt <- LMM_mt[order(LMM_mt[,2]),]
#  LMM_mt$expected <- stats::ppoints(nrow(LMM_mt))
#  LMM_mt$model <- "mvLMM"
#  LMM_mt$trait <- "all"
#  colnames(LMM_mt)[1:2] <- c("rsID", "observed")
#  
#  LMM_st_trait1 <- LMM_st_trait1[order(LMM_st_trait1[,2]),]
#  LMM_st_trait1$expected <- LMM_mt$expected
#  LMM_st_trait1$model <- "uvLMM"
#  LMM_st_trait1$trait <- "trait1"
#  colnames(LMM_st_trait1)[1:2] <- c("rsID", "observed")
#  
#  LMM_st_trait2 <- LMM_st_trait2[order(LMM_st_trait2[,2]),]
#  LMM_st_trait2$expected <- LMM_mt$expected
#  LMM_st_trait2$model <- "uvLMM"
#  LMM_st_trait2$trait <- "trait2"
#  colnames(LMM_st_trait2)[1:2] <- c("rsID", "observed")
#  
#  LMM_st_trait3 <- LMM_st_trait3[order(LMM_st_trait3[,2]),]
#  LMM_st_trait3$expected <- LMM_mt$expected
#  LMM_st_trait3$model <- "uvLMM"
#  LMM_st_trait3$trait <- "trait3"
#  colnames(LMM_st_trait3)[1:2] <- c("rsID", "observed")
#  
#  pvalues <- rbind(LMM_mt, LMM_st_trait1, LMM_st_trait2, LMM_st_trait3)
#  pvalues$causal <- "all"
#  pvalues$causal[which(pvalues$rsID %in% causalSNPnames)] <- "simulated causal"
#  pvalues$causal <- factor(pvalues$causal, levels=c("all", "simulated causal"))
#  
#  p_qq <- ggplot(data=pvalues,
#  				aes(x=-log10(expected), y=-log10(observed)))
#  p_qq <- p_qq + geom_point(aes(color=causal, shape=trait)) +
#      scale_color_manual(values=c('darkgrey', '#1b9e77'),
#                         name="SNPs",
#  					guide=guide_legend(nrow = 2, title.position="top",
#                                         title.hjust =0.5,
#                                         override.aes = list(shape=21))) +
#      scale_shape_manual(values=c(18, 15:17),
#                         name="Traits",
#  					guide=guide_legend(nrow = 2,
#                                         override.aes = list(colour='darkgrey'),
#                                         title.position="top",
#                                         title.hjust =0.5)) +
#      geom_abline(intercept=0, slope=1, col="black") +
#  	geom_point(data=dplyr::filter(pvalues, causal == "simulated causal"),
#  				color='#1b9e77', aes(shape=trait)) +
#      xlab(expression(Expected~~-log[10](italic(p))) ) +
#      ylab(expression(Observed~~-log[10](italic(p)))) +
#      facet_grid(~ model) +
#      theme_bw() +
#  	theme(strip.background = element_rect(fill="white"),
#  			axis.title = element_text(size=12),
#  			axis.text  =element_text(size=10),
#  			legend.title  =element_text(size=12),
#  			legend.text  =element_text(size=10),
#  			strip.text  =element_text(size=10),
#  			legend.position = "bottom",
#              legend.direction = "horizontal")

## ----echo=FALSE, eval=FALSE, out.width='100%'---------------------------------
#  ggsave(plot=p_qq,
#  		filename=paste(savedir, "/gwas_results_gemma_qqplot.png", sep=""),
#  		units="mm", width=86, height=100)

## ----qqplot, echo=FALSE, fig.cap="\\label{fig:qqplot}Quantile-quantile plots of p-values observed from the multivariate linear mixed model (mvLMM, traits:all) and the univariate linear mixed models (uvLMM, traits: trait1/trait2/trait3) fitted to each of about eight million genome-wide SNPs (grey), including the ten SNPs for which a phenotype effect was modelled (green)."----
#knitr::include_graphics("../docs/articles/Simulation-and-LinearModel_files/figure-html/qqplot-1.png")
knitr::include_graphics("gwas_results_gemma_qqplot.png")

## ---- eval=FALSE--------------------------------------------------------------
#  sigSNPs <- dplyr::filter(pvalues, causal == "simulated causal",
#                           observed < 5*10^-8)
#  sigSNPs$observed_adjusted <- sigSNPs$observed
#  sigSNPs$observed_adjusted[which(sigSNPs$trait != "all")] <-
#      3*sigSNPs$observed[which(sigSNPs$trait != "all")]
#  sigSNPs <- dplyr::filter(sigSNPs, observed_adjusted < 5*10^-8)

## ---- echo=FALSE, eval=FALSE--------------------------------------------------
#  write.table(sigSNPs,
#              paste(savedir, "/sigSNPs.csv", sep=""),
#              col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")

## ----sigSNPs, echo=FALSE------------------------------------------------------
                        
sigSNPs <- read.table(paste(savedir, "/sigSNPs.csv", sep=""), header=TRUE, 
                      sep=",")
knitr::kable(sigSNPs[, c(1,4,5,7)], 
             caption="\\label{tab:sigSNPs}P-values from multi-trait GWAS (model == mvLMM) and 
                        single-trait GWAS (model == uvLMM)",
             table.attr = "id=\"sigSNPs\"",
             digits=40)

## ---- eval=FALSE--------------------------------------------------------------
#  # Read the SNPs simulated to have an effect on the phenotype and their SNP
#  # effects
#  SNPs <- read.csv(paste(datadir,"/SNP_NrSNP10.csv", sep=""),
#                   row.names=1)
#  SNPeffect <- read.csv(paste(datadir,"/SNP_effects_NrSNP10.csv", sep=""),
#                        row.names=1)
#  
#  
#  # HapGen simulated SNPs might be encoded as 7-25481146 , which R converts to
#  # X7.25481146 via make.names in read.table; substitute X and . to get original
#  # names
#  SNPnames <- gsub("\\.", "-", gsub("X", "", colnames(SNPs)))
#  SNPnames <- SNPnames[order(SNPnames)]
#  simulated_withEffect <- dplyr::filter(pvalues, rsID %in% SNPnames)

## ---- echo=FALSE, eval=FALSE--------------------------------------------------
#  write.table(simulated_withEffect,
#              paste(savedir,"/simulated_withEffect.csv", sep=""), col.names=TRUE,
#              quote=FALSE,row.names=FALSE)

## ---- echo=FALSE, eval=TRUE---------------------------------------------------
SNPeffect <- read.csv(paste(savedir,"/SNP_effects_NrSNP10.csv", sep=""),
                      row.names=1)
SNPs <- read.csv(paste(savedir,"/SNP_NrSNP10.csv", sep=""),
                 row.names=1)
SNPnames <- gsub("\\.", "-", gsub("X", "", colnames(SNPs)))
SNPnames <- SNPnames[order(SNPnames)]
simulated_withEffect <- read.csv(paste(savedir,"/simulated_withEffect.csv", sep=""))

## ----freq-beta, fig.cap="\\label{fig:freq-beta}P-values, allele frequencies and simulated effect sizes of the ten SNPs simulated to have an effect on phenotype."----
# get the allele frequencies of the SNPs simulated to have an effect on the 
# phenotype
freq <- apply(SNPs, 2, getAlleleFrequencies)
minor <- apply(freq, 2, min)
minor <- minor[order(gsub("X", "", names(minor)))]

# format the effect size table and combine with p-value and frequency
# information
effects <- reshape2::melt(SNPeffect, value.name="beta", variable.name="name")
effects$type <- gsub('\\..*', '', effects$name)
effects$rsID <- gsub("\\.", "-", gsub('.*_', '', effects$name))
effects$trait <- rep(paste("trait", 1:3, sep=""), 10)
effects <- effects[order(effects$rsID),]
effects <- dplyr::filter(effects, rsID %in% simulated_withEffect$rsID)

simulated_withEffect <- simulated_withEffect[order(simulated_withEffect[, 1]),]
simulated_withEffect$beta <- NA
simulated_withEffect$beta[simulated_withEffect$trait != "all"] <- effects$beta
simulated_withEffect$freq <- rep(minor[which(SNPnames %in% effects$rsID)], each=4)

# plot association p-values in relation to the absolute value of the simulated 
# effect sizes and the SNPs allele frequencies
p <- ggplot(dplyr::filter(simulated_withEffect, trait != "all"))
p <- p +
    geom_point(aes(x=freq, y=-log10(observed), color=abs(beta), shape=trait)) +
    geom_hline(yintercept=-log10(5*10^(-8)), color="darkgrey") +
    scale_color_gradientn(colours=c('#f0f9e8','#ccebc5','#a8ddb5','#7bccc4',
                                    '#4eb3d3','#2b8cbe','#08589e'),
                          name=expression("| simulated" ~beta~ "|")) +
    scale_shape_discrete(name="Phenotype in GWAS") +
    xlab("Allele frequencies") +
    ylab(expression(-log[10](p-value))) +
    theme_bw()
print(p)

## ----echo=FALSE, eval=FALSE, out.width='100%'---------------------------------
#  ggsave(plot=p,
#  		filename=paste(savedir, "/effectsizes-freq-pvalues.pdf", sep=""),
#  		units="mm", width=150, height=120)

Try the PhenotypeSimulator package in your browser

Any scripts or data that you put into this service are public.

PhenotypeSimulator documentation built on July 16, 2021, 5:06 p.m.