inst/examples/example4-population-stratification.R

# Example 4 , read a region from 1000 genomes, simulate individuals from two populations

library(sim1000G)
library(gplots)

simulatePopulation = function(num_individuals = 300)

{
    ids = generateUnrelatedIndividuals(num_individuals)
    genotypes = retrieveGenotypes(ids)
}


## Read two VCF file with regions from two different populations
## vcf1: variants from european population
## vcf2: variants from african populations

vcf1 = readVCF("/tmp/fs/tmp/CEU-TSI-GBR/CEU-TSI-GBR-region-chr4-205-MAML3.vcf.gz",
               maxNumberOfVariants = 5000,  min_maf =  1e-6, max_maf = 0.02)


vcf2 = readVCF("/tmp/fs/tmp/AFR/ASW-LWK-YRI-region-chr4-205-MAML3.vcf.gz",
               maxNumberOfVariants = 5000,  min_maf = 1e-6, max_maf = 0.05)


## Since the filtering was performed seperately in each file,
## we select only the common variants between the to VCF files
##

common = intersect(vcf1$varid,vcf2$varid)

vcf1 = subsetVCF(vcf1, var_id = common)
vcf2 = subsetVCF(vcf2, var_id = common)

table(vcf1$varid == vcf2$varid)


startSimulation(vcf1, totalNumberOfIndividuals = 2000)
saveSimulation("pop1")

startSimulation(vcf2, totalNumberOfIndividuals = 2000)
saveSimulation("pop2")






ped = read.table("20130606_g1k.ped",h=T,as=T,sep="\t")
pops = split(ped$Individual.ID,list(ped$Population))

loadSimulation("pop1")
gt1 = simulatePopulation(100)

loadSimulation("pop2")
gt2 = simulatePopulation(30)






loadSimulation("pop1")

maf1 = apply(simulatePopulation(200) , 2 , mean)/2
maf1[maf1>0.5] = 1 - maf1[maf1>0.5]

loadSimulation("pop2")

maf2 = apply(simulatePopulation(200) , 2 , mean)/2
maf2[maf2>0.5] = 1 - maf2[maf2>0.5]

plot(maf1,maf2)









v1 = list.files(path = "~/fs/tmp/ASW/")
id = sub(".*chr4-","",v1)
id = sub(".vcf.gz","",id)
id

v2 = list.files(path = "~/fs/tmp/CEU-TSI-GBR//")

v1

data.frame(v1,v2)


readAllVCF = function() {
    v1 = list.files(path = "~/fs/tmp/AFR/", full.names = TRUE)
    v2 = list.files(path = "~/fs/tmp/CEU-TSI-GBR/", full.names = TRUE)



    id = sub(".*chr4-","",v1)
    id = sub(".vcf.gz","",id)
    id


    vcf_pop1 = list()
    vcf_pop2 = list()


        parallel::mclapply(mc.cores=10, 1:length(v1), function(i)
        {


        geneid = id[i]

        vcf1 = readVCF( v1  [ grep(geneid, v1)  ] ,
                       maxNumberOfVariants = 5000,  min_maf =  1e-6, max_maf = 0.01)


        vcf2 = readVCF( v2  [ grep(geneid, v2)  ] ,
                       maxNumberOfVariants = 5000,  min_maf = 1e-6, max_maf = 0.05)


        if(is.na(vcf1)) return(1);
        if(is.na(vcf2)) return(1);

        common = intersect(vcf1$varid,vcf2$varid)

        if(length(common) < 10) return(1);

        vcf1 = subsetVCF(vcf1, var_id = common)
        vcf2 = subsetVCF(vcf2, var_id = common)

        cat(geneid, common," \n");


        file = sprintf("/tmp/rd-%s",geneid)

        save(vcf1,vcf2,file=file)



      # vcf_pop1[[ geneid ]] = vcf1
      # vcf_pop2[[ geneid ]] = vcf2


    })


}



sapply(vcf_pop1,function(x) length(x$maf))

vcf1 = vcf_pop1[["149-PTPN13"]]

vcf2 = vcf_pop2[["149-PTPN13"]]

Try the sim1000G package in your browser

Any scripts or data that you put into this service are public.

sim1000G documentation built on June 10, 2019, 1:01 a.m.