sim1000G - Extracting regions from 1000 genomes vcf files for use with sim1000G

Introduction

sim1000G allows for easy simulation of unrelated individuals starting from sequencing 1000 genomes varians.

The following, somewhat complicated example, showcases the following:

The original 1000 genomes VCF files are obtained from 1000 genomes ftp site, at the location:

http://hgdownload.cse.ucsc.edu/gbdb/hg19/1000Genomes/phase3/

```{R echo=FALSE, results='hide'}

cat("")

### Generating IDs of individuals to extract

```{R message=FALSE, warning=FALSE, include=TRUE}

sink(tempfile())
ped_file_1000genomes = system.file("examples", "20130606_g1k.ped", package = "sim1000G")

ped = read.table(ped_file_1000genomes,h=T,as=T,sep="\t")

pop1 = c("CEU","TSI","GBR")
id1 = ped$Individual.ID [ ped$Population %in% pop1 ]



id2 = ped$Individual.ID [ ped$Population == "ASW" ]


pop_map = ped$Population
names(pop_map) = ped$Individual.ID



write_sample_files = 0

if(write_sample_files == 1) {
  cat(c(id1,id2),file="/tmp/samples1.txt",sep="\n")
  # cat(c(id2),file="/tmp/samples2.txt",sep="\n")
}

Extracting variant sets using bcftools

We extract the CEU,TSI,GBR and ASW samples from a region of chromosome 4 from 73MBp to 74MBp using bcftools. The following command are run in the shell:

```{sh, eval=F, message=FALSE, warning=FALSE}

77356278-77703432

INPUT_VCF=ALL.chr4.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz

bcftools view -S /tmp/samples1.txt -r 4:73000000-74000000 --force-samples $INPUT_VCF > /tmp/chr4-80.vcf

bcftools filter -i 'AF>0 && EUR_AF>0 && AFR_AF>0' < /tmp/chr4-80.vcf | gzip > /tmp/chr4-80-filt.vcf.gz

### Reading the vcf file

```{R echo=TRUE, fig.height=12, fig.width=12, message=FALSE, warning=FALSE , results = 'hide' }
library(sim1000G)


vcf_file = "/tmp/chr4-80-filt.vcf.gz"

if(1) {
examples_dir = system.file("examples", package = "sim1000G")
vcf_file = file.path(examples_dir, "region-chr4-93-TMEM156.vcf.gz" )
}


vcf  = readVCF(vcf_file,   maxNumberOfVariants = 200 , min_maf = 0.02, max_maf = 0.32)

table( pop_map[ vcf$individual_ids ])

C = cor(t( vcf$gt1+vcf$gt2))^2
gplots::heatmap.2(C,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none",breaks=seq(0,1,l=101))


#gplots::heatmap.2(cor(t(vcf2$gt1))^2,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none")



if(0) {
ids = vcf$individual_ids


id_pop1 = which(ids %in% id1)
id_pop2 = which(ids %in% id2)



gplots::heatmap.2(cor(t( vcf$gt1[,id_pop1]+vcf$gt2[,id_pop1]))^2,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none",breaks=seq(0,1,l=101))


gplots::heatmap.2(cor(t( vcf$gt1[,id_pop2]+vcf$gt2[,id_pop2]))^2,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none",breaks=seq(0,1,l=101))

}

Starting two distinct simulations for the two sample sets and simulating unrelated individuals

```{R echo=TRUE, eval=FALSE, fig.width=12, fig.height=12}

startSimulation(vcf, subset = id1)

saveSimulation("pop1")

N = 200 id = c() for(i in 1:N) id[i] = SIM$addUnrelatedIndividual()

genotypes = retrieveGenotypes(id)

gplots::heatmap.2(cor( genotypes )^2,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none",breaks=seq(0,1,l=101))

startSimulation(vcf, subset = id2)

saveSimulation("pop2")

N = 200 id = c() for(i in 1:N) id[i] = SIM$addUnrelatedIndividual()

genotypes2 = retrieveGenotypes(id)

gplots::heatmap.2(cor( genotypes2 )^2,col=rev( heat.colors(100) ) ,Rowv=F,Colv=F,trace="none",breaks=seq(0,1,l=101))

### Combining the two datasets and running SKAT


```r
library(SKAT)

loadSimulation("pop1")

plot(apply(genotypes,2,mean), apply(genotypes2,2,mean))

gt = rbind(genotypes,genotypes2)

#gt = genotypes

dim(gt)

maf = apply(gt,2,mean,na.rm=T)/2
apply(gt,2,function(x) sum(is.na(x)))

flip  = which(maf > 0.5) ; gt[,flip] = 2 - gt[,flip]


#gt = genotypes

dim(gt)

maf = apply(gt,2,mean,na.rm=T)/2
plot(maf)

sum(maf==0)


apply(gt,2,function(x) sum(is.na(x)))



flip  = which(maf > 0.5)
gt[,flip] = 2 - gt[,flip]


dim(gt)


effect_sizes = rep(0, ncol(gt))
nvar = length(effect_sizes)

s = sample(1:nvar, 33)
effect_sizes[s] = 5


apply(gt[,s],1,sum)




predictor2 = function(b, geno) {
    x = b[1] 
    for(i in 1:ncol(geno)) { x = x  + b[i+1] * ( geno[,i] > 0) + b[i+1] * ( geno[,i] > 1)   }
    exp(x) / (1+exp(x) )
}

p =predictor2 (  c(-1.5,effect_sizes) ,  gt)



phenotype = rbinom( length(p) , 1 , p ) 

#phenotype = sample(phenotype)
obj<-SKAT_Null_Model(phenotype ~ 1, out_type="D")


library(SKAT)
SKATBinary((gt),obj)$p.value

Genetic maps in sim1000G

Through the functions readGeneticMap and downloadGeneticMap, we provide the functionality to automatically download genetic maps for GRCH37 build of the human genome.



Try the sim1000G package in your browser

Any scripts or data that you put into this service are public.

sim1000G documentation built on June 10, 2019, 1:01 a.m.