example2-population-stratification.R
In sim1000G: Genotype Simulations for Rare or Common Variants Using Haplotypes from 1000 Genomes

#
#for j in 1 2 3 4 5 6 7 8 9 10; do  for i in 1 2 3 4 5 6 7 8 9 10; do
#submit j3-$j-$i Rscript example2-population-stratification.R ; done ; done
#

library(sim1000G)
library(parallel)
library(SKAT)



par = commandArgs(T)

NREP = 15

run_parallel = 1

numcpus = 10

#### Initialize the simulation ####


# List all available gene region vcf files
all_vcf_files = list.files(path="pop1",  full.names = T)
selected_vcf_file = sample(all_vcf_files , 1)


# Find corresponding vcf file of second population
selected_vcf_file2 = gsub("pop1","pop2", selected_vcf_file)



cat("Reading the two vcf files: " , selected_vcf_file,selected_vcf_file2 ,"\n")




initSimulation = function() {


    ## Read two VCF file with regions from two different populations
    ## vcf1: variants from european population
    ## vcf2: variants from african populations


    gn = strsplit(selected_vcf_file,"-")
    gn = sapply(gn, function(x) x[4])

    genename <<- gn
    geneid = gn

    vcf1 <<- readVCF(selected_vcf_file ,
                    maxNumberOfVariants = 800,  min_maf =  1e-6, max_maf = 0.02)


    if(class(vcf1) != "environment") stop(1);


    vcf2 <<- readVCF( selected_vcf_file2  ,
                    maxNumberOfVariants = 800,  min_maf = 1e-6, max_maf = 0.02)


    print(class(vcf1))
    print(class(vcf2))


    if(class(vcf1) != "environment") stop(1);
    if(class(vcf2) != "environment") stop(1);




    ## We select only the common variants between the 2 VCF files
    ##

    common = intersect(vcf1$varid,vcf2$varid)
    print(length(common))

    if(length(common) < 10) stop(0);

    vcf1 <<- subsetVCF(vcf1, common)
    vcf2 <<- subsetVCF(vcf2, common)

    cat(geneid, length(common)," \n");


    common_causal_pool <<- 1:length(vcf1$maf)

    cat("start sim\n");


    startSimulation(vcf1, totalNumberOfIndividuals = 10000)
    saveSimulation("pop1")

    startSimulation(vcf2, totalNumberOfIndividuals = 10000)
    saveSimulation("pop2")


    print(length(vcf1$varid))

}





SEED = 1

#### Function to generate one simulation replicate ####




predictor_logistic = function(b, geno) {
    x = b[1]
    for(i in 1:ncol(geno)) { x = x  + b[i+1] * ( geno[,i] > 0) + b[i+1] * ( geno[,i] > 1)   }
    exp(x) / (1+exp(x) )
}





replicate1 = function(seed=NA,
                     numcausal = 10,
                     effect_size = 10,
                     pop_strat = 0,
                     N_pop1 = 400,
                     N_pop2 = 200
                    )
{

        # if(!is.na(seed)) set.seed(seed)



        loadSimulation("pop1")

        id = generateUnrelatedIndividuals(N_pop1)

        genotypes = retrieveGenotypes(id)
        rownames(genotypes)= rep(  "CEU" , nrow(genotypes) )


        genotypes2 = NA
        gt = genotypes

        if(N_pop2 > 0) {
            loadSimulation("pop2")
            id = generateUnrelatedIndividuals(N_pop2)


            genotypes2 = retrieveGenotypes(id)
            rownames(genotypes2)= rep(  "ASW" , nrow(genotypes2) )

            gt = rbind(genotypes,genotypes2)
        }



        pca = prcomp(gt)


        maf = apply(gt,2,mean,na.rm=T)/2
        apply(gt,2,function(x) sum(is.na(x)))
        flip  = which(maf > 0.5) ; gt[,flip] = 2 - gt[,flip]
        maf = apply(gt,2,mean,na.rm=T)/2




        #EFFECT_SIZE = 10


        maf_gt0 = which(maf > 0)

        effect_sizes = rep(0, ncol(gt))
        nvar = length(effect_sizes)

        s = sample(common_causal_pool, numcausal)
        effect_sizes[s] = effect_size




        S=0

        while(1) {
                p = rep(NA, nrow(gt) )

                s = rownames(gt) != "ASW"

                p[s] = predictor_logistic (  c(S,effect_sizes) ,  gt[s,])
                p[!s] = predictor_logistic (  c(S+pop_strat,effect_sizes) ,  gt[!s,])

                phenotype = rbinom( length(p) , 1 , p )


                ncases = sum(phenotype==0)
                ncontrols = sum(phenotype==1)

                # cat(ncases,ncontrols,"\n")
                if( abs(ncases-ncontrols) < 5 ) break();

                if(ncases > ncontrols) S = S + 0.1
                if(ncases < ncontrols) S = S - 0.1
        }



        t=( table(rownames(gt)  , phenotype ) )
        print(t)




        #phenotype = sample(phenotype)

        obj<-SKAT_Null_Model(phenotype ~ 1, out_type="D")

        pv = SKAT((gt),obj)$p.value




        if(sum(rownames(gt) != "CEU") > 0) {

            population = factor(rownames(gt))

            # obj<-SKAT_Null_Model(phenotype ~ population, out_type="D")

            obj<-SKAT_Null_Model(phenotype ~
                 pca$x[,1] + pca$x[,2] + pca$x[,3] + pca$x[,4] + pca$x[,5] + pca$x[,6], out_type="D")

            pv_cov = SKAT((gt),obj)$p.value


        } else {
            pv_cov = -1
        }



        cat("EFF= ", effect_size,
            "POP1=",N_pop1,
            " POP_STRAT=" , pop_strat,
            " SKATPV=", pv,
            " SKATPV_COV=", pv_cov,
            " GENE=", genename,
            # " t1= ", table(rownames(gt)  , phenotype  ),

            "\n");


        tbl = table(rownames(gt),phenotype)
        data.frame(ID="EFFZZ", eff=effect_size,strat=pop_strat,pop1=N_pop1, pop2=N_pop2,
                   pv1=pv,pv2=pv_cov,
                   gene=genename

                   )
}


# replicate1(NA,10,0, 2,1000,1000)


#### Generate multiple simulation replicates ####

rand_name = function() {

    x = sprintf("%04d",round(runif(10,1,1000)) )
    paste(x,sep="",collapse="")

}


seed=NA
numcausal=7
effect_size = 0
pop_strat = 10
N_pop1 = 400
N_pop2 = 200





par = as.numeric(par)
EFF = par[1]
p2 = par[2]
pop_strat = par[3]

nsim = par[4]

if(0) {
    EFF=2
    p2 = 100
    pop_strat = 2
    nsim=2
}



base_seed = round( runif(1,0,1e9) )

initSimulation()


if(run_parallel == 1) {


    library(parallel)

    cat("Cluster setup\n");
    cl <- makeCluster(getOption("cl.cores", numcpus))

    clusterExport(cl, c(
            "selected_vcf_file","selected_vcf_file2",
            "initSimulation","replicate1",
            "par","EFF","p2","pop_strat","nsim",
            "predictor_logistic",
            "base_seed"
       ) )

    cat("exportDone\n");

    clusterEvalQ(cl, library(sim1000G))
    clusterEvalQ(cl, library(SKAT))
    cat("initSim\n");

    clusterEvalQ(cl, initSimulation())

    cat("initSim Done\n");



}





clusterRep = function(x) {

    replicate1(seed=base_seed + 101*x, numcausal = 10, effect_size = EFF,
               N_pop1 = 2000-p2,
               N_pop2 = p2,
               pop_strat = pop_strat
    )
}


results = list()



pop_strat=4; EFF=0; p2=1400; clusterRep()





pop_strat = 0
EFF = log(3)
p2 = 0


effect_sizes = c(0,log(1.5), log(1.8) , log(3), log(5)  )

# pop_strat_list = c(0,0.5,1,2)
pop_strat_list = c(2)

for(pop_strat in pop_strat_list)
    for(EFF in effect_sizes)
        for(p2 in c(0,100,200,400))

{


        cat(pop_strat, EFF, p2,"\n");

        if(run_parallel == 1) {
            clusterExport(cl,c("p2","EFF","pop_strat","genename"))

             v = clusterApply(cl,1:NREP, clusterRep)
        }

        if(run_parallel == 0) v = lapply(1:NREP, clusterRep)

        results = c( results  ,  v   )
}





pz = plyr::ldply(results)
pz$MASTER_ID = "REPLICATE"

options(width = 3000)
print(pz)








pz$pv2 [pz$pv2 < 0] = NA

reshape2::dcast(pz, eff + factor(pop2) ~ 1,value.var="pv1",function(x) mean(x<0.05))
reshape2::dcast(pz, eff + factor(pop2) ~ 1,value.var="pv2",function(x) mean(x<0.05))


save(results,pz,file=sprintf("sim-results-%s.rdata",rand_name()) )

write.table(pz,  row=F,quote=F,sep=" ", col=F, file=sprintf("out-%s.txt", rand_name()) )

Any scripts or data that you put into this service are public.

sim1000G documentation built on June 10, 2019, 1:01 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

sim1000G
Genotype Simulations for Rare or Common Variants Using Haplotypes from 1000 Genomes

inst/examples/example2-population-stratification.R
In sim1000G: Genotype Simulations for Rare or Common Variants Using Haplotypes from 1000 Genomes

Try the sim1000G package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

sim1000G Genotype Simulations for Rare or Common Variants Using Haplotypes from 1000 Genomes

inst/examples/example2-population-stratification.R In sim1000G: Genotype Simulations for Rare or Common Variants Using Haplotypes from 1000 Genomes

Try the sim1000G package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

sim1000G
Genotype Simulations for Rare or Common Variants Using Haplotypes from 1000 Genomes

inst/examples/example2-population-stratification.R
In sim1000G: Genotype Simulations for Rare or Common Variants Using Haplotypes from 1000 Genomes