R/readvcf.R
In sim1000G: Genotype Simulations for Rare or Common Variants Using Haplotypes from 1000 Genomes

#' Creates a regional vcf file using bcftools to extract a region from 1000 genomes vcf files
#'
#'
#' @return none
#' @export
createVCF = function() {

    # node ../fq-thin-vcf-file.js --out vt.vcf --in chr2.vcf --thin 0.7
    # bcftools view -s NA19904,NA19913,NA20317,NA20318,NA20334,NA20355,NA20359,NA20362 vt.vcf|grep -v '^##' > vt2.vcf

    #4 90645249 90759446 strand -1 id SNCA name SNCA bioType protein_coding
    '



CHROM=4
VCF=~/1000genomes/ALL.chr"$CHROM".*.vcf.gz
REGION=4:90645249-90759446
REGION=4:60995249-61569446


# 4 77356252 77704404 strand 1 id SHROOM3 name SHROOM3 bioType protein_coding
#REGION=4:77356252-77704404
#REGION=5:77356252-77704404



bcftools view -S /tmp/CEU.txt --force-samples -r $REGION  $VCF > /tmp/1.vcf
grep -v "^##" /tmp/1.vcf > /tmp/2.vcf
cp /tmp/2.vcf ~/tmp



'

}



#' Read a vcf file, with options to filter out low or high frequency markers.
#'
#'
#' @param filename Input VCF file
#' @param thin How much to thin markers
#' @param maxNumberOfVariants Maximum number of variants to keep from region
#' @param min_maf Minimum allele frequency of markers to keep. If NA skip min_maf filtering.
#' @param max_maf Maximum allele frequency of markers to keep. If NA skip max_maf filtering.
#' @param region_start Extract a region from a vcf files with this starting basepair position
#' @param region_end Extract a region from a vcf files with this ending basepair position
#'
#' @return VCF object to be used by startSimulation function.
#'
#' @examples
#'
#' examples_dir = system.file("examples", package = "sim1000G")
#' vcf_file = file.path(examples_dir,
#'   "region-chr4-93-TMEM156.vcf.gz")
#'
#' vcf = readVCF( vcf_file, maxNumberOfVariants = 500 ,
#'                min_maf = 0.02 ,max_maf = NA)
#'
#' str(as.list(vcf))
#' @export
readVCF = function(filename = "data.vcf",
                   thin = NA,
                   maxNumberOfVariants = 400,
                   min_maf = 0.02,
                   max_maf = NA,
                   region_start = NA,
                   region_end = NA
                   )
{


    if(  grepl(".rdata", filename) ) {

        cat("[#.......] Loading VCF environment from rdata file\n")

        e1 = new.env()
        load(filename, envir=e1)

        vcf = e1$vcf
        cat("[##......] Number of variants in VCF: ", nrow(vcf$vcf) , "\n");
        cat("[##......] Number of individuals in VCF: ", ncol(vcf$gt1) , "\n");

        return(vcf);


    }




    cat("[#.......] Reading VCF file..\n")



    # vcf = read.table(filename, sep="\t", comment.char="", as.is=T, header=T)


    readr_locale = locale(date_names = "en", date_format = "%AD", time_format = "%AT",
           decimal_mark = ".", grouping_mark = ",", tz = "",
           encoding = "UTF-8", asciify = FALSE)


    # in the futre we would like to use data.table as it is faster
    #data.table::fread(filename,"\t",comment="##")

    #print(readr_locale)


    vcf = read_delim(filename, "\t", comment = "##", progress = FALSE, locale = readr_locale)
    vcf = as.data.frame(vcf)




    # Keep bi-allelic markers only
    #vcf = vcf[str_length(vcf[,5]) == 1 ,]
    vcf = vcf[ !grepl(",",vcf[,5]) ,]




    if( !is.na( region_start ) ) vcf = vcf[ vcf[,2] >= region_start  , ]
    if( !is.na( region_end   ) ) vcf = vcf[ vcf[,2] <= region_end    , ]

    if( nrow(vcf) < 5 ) {
         cat("Too few variants in VCF file ( < 5 )\n");
         return(NA);
    }



    if(is.na(maxNumberOfVariants) ) maxNumberOfVariants = 1e10


    # Reduce number of variants
    if( !is.na(thin) )
        vcf = vcf[seq(1,nrow(vcf),by = thin) ,]


    cat("[##......] Chromosome:  ", unique(vcf[,1]),
        " Mbp: " , min(vcf[,2])/1e6,
        " Region Size: ", 1e-3 * ( max(vcf[,2])-min(vcf[,2]) ) ,"kb ",
        "Num of individuals:", ncol(vcf)-9,
        "\n");


    cat("[##......] Before filtering ",
        "Num of variants:", dim(vcf)[1] ,
        "Num of individuals:", ncol(vcf)-9,
        "\n");


    individual_ids = colnames(vcf)[-(1:9)]
    gt = vcf[,-(1:9) ]
    vcf = vcf[,1:9]


    gt1 = apply( gt, 2,  function(x) as.numeric( str_sub(x,1,1) )  )
    gt2 = apply( gt, 2,  function(x) as.numeric( str_sub(x,3,3) )  )



    #### --- Filter by MAF ---- ####


    #cat("[###.....] Filtering and thinning variants\n");

    maf = apply( (gt1>0)+(gt2>0) ,1,function(x) mean(x,na.rm=T))/2

    #sites = apply(gt1+gt2,1,function(x) sum(!is.na(x)) )

    #maf = maf / ( 2*sites )

    #print(range(maf))
    #print(length(maf))

    #maf[maf>0.5] = 1-maf[maf>0.5]

    #maf2 = apply(gt2,1,function(x) mean(x,na.rm=T))
    #maf2[maf2>0.5] = 1 - maf2[maf2>0.5]

    ok = apply(gt1,1,function(x) max(x,na.rm=T))
    s = which(ok < 2)



    if( ! is.na(min_maf) ) {  s = intersect( s  ,   which(maf >= min_maf ) ) }

    if( ! is.na(max_maf) ) {  s = intersect( s  ,   which(maf <= max_maf ) ) }

    total_number_of_variants_within_maf = length(s)


    if(length(s) > maxNumberOfVariants) s = sort( sample(s,maxNumberOfVariants) )




    gt1 = gt1[s,]
    gt2 = gt2[s,]
    gt = gt[s,]
    vcf = vcf[s,]
    maf = maf[s]




    # cat("Q: flip genotypes if maf > 0.5???????????\n")


    cat("[###.....] After filtering ",
        "Num of variants:", dim(vcf)[1] ,
        "Num of individuals:", ncol(gt1),
        "\n");

    ##


    R = new.env()

    R$vcf = vcf
    R$gt1 = gt1
    R$gt2 = gt2
    R$individual_ids = individual_ids
    R$maf = maf

    R$varid = paste(R$vcf[,1],R$vcf[,2],R$vcf[,3],R$vcf[,4],R$vcf[,5] )

    R$total_number_of_variants_within_maf = total_number_of_variants_within_maf

    # R$gt = gt

    R

}














#' Generate a market subset of a vcf file
#'
#'
#' @param vcf VCF data as created by function readVCF
#' @param var_id id  of markers to subset. Should be a selection from vcf$varid. NA if no filtering on id to be performed.
#' @param var_index index of number to subset. Should be in the range 1..length(vcf$varid)
#' @param individual_id IDs of individuals to subset. Should be a selection from vcf$individual_id
#'
#'
#' @return VCF object to be used by startSimulation function.
#'
#' @examples
#'
#' examples_dir = system.file("examples", package = "sim1000G")
#'
#' vcf_file = file.path(examples_dir, "region-chr4-93-TMEM156.vcf.gz")
#'
#' vcf = readVCF( vcf_file, maxNumberOfVariants = 500 ,
#'                min_maf = 0.02 ,max_maf = NA)
#'
#' vcf2 = subsetVCF(vcf, var_index = 1:50)
#'
#' @export
subsetVCF = function(vcf , var_index = NA , var_id = NA, individual_id = NA )
{

    R = new.env()

    if(!is.na(var_id[1]) && !is.na(var_index[1])) { stop("only one of var_id and var_index should be given")}

    if(!is.na(var_id[1])) var_subset = which(vcf$varid %in% var_id)

    if(!is.na(var_index[1])) var_subset = var_index

    if(length(var_subset) == 0) stop("no variants found")

    s = var_subset

    vcf2 = as.list(vcf)


    vcf2$maf = vcf2$maf [s]
    vcf2$varid = vcf2$varid [s]

    vcf2$gt1 = vcf2$gt1 [s,]
    vcf2$gt2 = vcf2$gt2 [s,]

    vcf2$vcf = vcf2$vcf [s,]


    if(!is.na(individual_id)) {

        s = which(vcf$individual_ids %in% individual_id)

        if(length(s) == 0) stop("no individuals found")

        vcf2$individual_ids = vcf2$individual_ids[s]
        vcf2$gt1 = vcf2$gt1[,s]
        vcf2$gt2 = vcf2$gt2[,s]

        # vcf$maf = apply(vcf2$gt1+vcf2$gt2, mean, na.rm=T) / 2
    }

    as.environment(vcf2)
}

Any scripts or data that you put into this service are public.

sim1000G documentation built on June 10, 2019, 1:01 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

sim1000G
Genotype Simulations for Rare or Common Variants Using Haplotypes from 1000 Genomes

R/readvcf.R
In sim1000G: Genotype Simulations for Rare or Common Variants Using Haplotypes from 1000 Genomes

Try the sim1000G package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

sim1000G Genotype Simulations for Rare or Common Variants Using Haplotypes from 1000 Genomes

R/readvcf.R In sim1000G: Genotype Simulations for Rare or Common Variants Using Haplotypes from 1000 Genomes

Try the sim1000G package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

sim1000G
Genotype Simulations for Rare or Common Variants Using Haplotypes from 1000 Genomes

R/readvcf.R
In sim1000G: Genotype Simulations for Rare or Common Variants Using Haplotypes from 1000 Genomes