sim1000G allows for easy simulation of unrelated individuals starting from sequencing 1000 genomes varians.
The following, somewhat complicated example, showcases the following:
The original 1000 genomes VCF files are obtained from 1000 genomes ftp site, at the location:
http://hgdownload.cse.ucsc.edu/gbdb/hg19/1000Genomes/phase3/
```{R echo=FALSE, results='hide'}
cat("")
### Generating IDs of individuals to extract ```{R message=FALSE, warning=FALSE, include=TRUE} sink(tempfile()) ped_file_1000genomes = system.file("examples", "20130606_g1k.ped", package = "sim1000G") ped = read.table(ped_file_1000genomes,h=T,as=T,sep="\t") pop1 = c("CEU","TSI","GBR") id1 = ped$Individual.ID [ ped$Population %in% pop1 ] id2 = ped$Individual.ID [ ped$Population == "ASW" ] pop_map = ped$Population names(pop_map) = ped$Individual.ID write_sample_files = 0 if(write_sample_files == 1) { cat(c(id1,id2),file="/tmp/samples1.txt",sep="\n") # cat(c(id2),file="/tmp/samples2.txt",sep="\n") }
We extract the CEU,TSI,GBR and ASW samples from a region of chromosome 4 from 73MBp to 74MBp using bcftools. The following command are run in the shell:
```{sh, eval=F, message=FALSE, warning=FALSE}
INPUT_VCF=ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz
bcftools view -S /tmp/samples1.txt -r 4:73000000-74000000 --force-samples $INPUT_VCF > /tmp/chr4-80.vcf
bcftools filter -i 'AF>0 && EUR_AF>0 && AFR_AF>0' < /tmp/chr4-80.vcf | gzip > /tmp/chr4-80-filt.vcf.gz
### Reading the vcf file ```{R echo=TRUE, fig.height=12, fig.width=12, message=FALSE, warning=FALSE , results = 'hide' } library(sim1000G) vcf_file = "/tmp/chr4-80-filt.vcf.gz" if(1) { examples_dir = system.file("examples", package = "sim1000G") vcf_file = file.path(examples_dir, "region-chr4-93-TMEM156.vcf.gz" ) } vcf = readVCF(vcf_file, maxNumberOfVariants = 200 , min_maf = 0.02, max_maf = 0.32) table( pop_map[ vcf$individual_ids ]) C = cor(t( vcf$gt1+vcf$gt2))^2 gplots::heatmap.2(C,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none",breaks=seq(0,1,l=101)) #gplots::heatmap.2(cor(t(vcf2$gt1))^2,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none") if(0) { ids = vcf$individual_ids id_pop1 = which(ids %in% id1) id_pop2 = which(ids %in% id2) gplots::heatmap.2(cor(t( vcf$gt1[,id_pop1]+vcf$gt2[,id_pop1]))^2,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none",breaks=seq(0,1,l=101)) gplots::heatmap.2(cor(t( vcf$gt1[,id_pop2]+vcf$gt2[,id_pop2]))^2,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none",breaks=seq(0,1,l=101)) }
```{R echo=TRUE, eval=FALSE, fig.width=12, fig.height=12}
startSimulation(vcf, subset = id1)
saveSimulation("pop1")
N = 200 id = c() for(i in 1:N) id[i] = SIM$addUnrelatedIndividual()
genotypes = retrieveGenotypes(id)
gplots::heatmap.2(cor( genotypes )^2,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none",breaks=seq(0,1,l=101))
startSimulation(vcf, subset = id2)
saveSimulation("pop2")
N = 200 id = c() for(i in 1:N) id[i] = SIM$addUnrelatedIndividual()
genotypes2 = retrieveGenotypes(id)
gplots::heatmap.2(cor( genotypes2 )^2,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none",breaks=seq(0,1,l=101))
### Combining the two datasets and running SKAT ```r library(SKAT) loadSimulation("pop1") plot(apply(genotypes,2,mean), apply(genotypes2,2,mean)) gt = rbind(genotypes,genotypes2) #gt = genotypes dim(gt) maf = apply(gt,2,mean,na.rm=T)/2 apply(gt,2,function(x) sum(is.na(x))) flip = which(maf > 0.5) ; gt[,flip] = 2 - gt[,flip] #gt = genotypes dim(gt) maf = apply(gt,2,mean,na.rm=T)/2 plot(maf) sum(maf==0) apply(gt,2,function(x) sum(is.na(x))) flip = which(maf > 0.5) gt[,flip] = 2 - gt[,flip] dim(gt) effect_sizes = rep(0, ncol(gt)) nvar = length(effect_sizes) s = sample(1:nvar, 33) effect_sizes[s] = 5 apply(gt[,s],1,sum) predictor2 = function(b, geno) { x = b[1] for(i in 1:ncol(geno)) { x = x + b[i+1] * ( geno[,i] > 0) + b[i+1] * ( geno[,i] > 1) } exp(x) / (1+exp(x) ) } p =predictor2 ( c(-1.5,effect_sizes) , gt) phenotype = rbinom( length(p) , 1 , p ) #phenotype = sample(phenotype) obj<-SKAT_Null_Model(phenotype ~ 1, out_type="D") library(SKAT) SKATBinary((gt),obj)$p.value
Through the functions readGeneticMap and downloadGeneticMap, we provide the functionality to automatically download genetic maps for GRCH37 build of the human genome.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.