R/bambaf_from_vcf.R

Defines functions bam.baf bambaf_from_vcf

Documented in bambaf_from_vcf

#' Get BAM baf information from vcf
#' 
#' If your vcf follow the format in the example, you could use this function to 
#' extract NGS baf from vcf files. Remember to load library before hands.
#' Save 6 lists, each list has N entry. N = # of individuals (or vcf file)
#' ngs_baf.nm: name of the bamfiles; ngs_baf.chr: the chromosome; ngs_baf.pos: the position of the variants; 
#' ngs_baf: the BAF of the variants; ngs_baf.id: the ID of the variants; filenm:the file name
#' 
#' @importFrom grDevices colorRampPalette dev.off pdf
#' @importFrom graphics axis grid legend par plot points
#' @importFrom stats aggregate dnorm dunif kmeans sd
#' @importFrom utils read.table write.table
#' @importFrom rlang .data
#' @param dir The directory to all the vcf stored; default is right in this folder. Type character. Defualt '.'
#' @param vcf_list All the vcf names stored in vcf.list; could use command:"ls *.vcf > vcf.list" to generate. Type character. 
#' @param chr Specify the chromosome you want to generate. Must be of int from 1-22. If not specify, this function will generate all chromosomes. Defualt NULL
#' @param projname Name of the project. Type character. Default ''
#' @return void 
#' @examples
#' dir <- system.file("extdata", package="iCNV")
#' bambaf_from_vcf(dir,'bam_vcf.list',projname='icnv.demo.')
#' bambaf_from_vcf(dir,'bam_vcf.list',chr=22,projname='icnv.demo.')
#' @export
bambaf_from_vcf = function(dir='.',vcf_list,chr=NULL,projname=''){
    stopifnot(is.character(dir))
    stopifnot(is.character(vcf_list))
    stopifnot(is.character(projname))
    `%>%`=tidyr::`%>%`
    filenm=file.path(dir,read.table(file.path(dir,vcf_list),
        header=FALSE,as.is=TRUE)[[1]])

    # Read all the VCF data
    baf.all=lapply(filenm,bam.baf)

    # Seperate the data into different chromosome
    if(is.null(chr)){
        for (chr in seq_len(22)){
            baf.all.chr=lapply(baf.all,function(x){x1=x %>% dplyr::filter(.data$`#CHROM`==chr); return(x1)})
            ngs_baf.nm=list()
            ngs_baf.chr=list()
            ngs_baf.pos=list()
            ngs_baf.id=list()
            ngs_baf=list()
            for (i in seq_along(baf.all.chr)){
                x=baf.all.chr[[i]]
                ngs_baf.nm[[i]]=names(x)[4]
                ngs_baf.chr[[i]]=x$`#CHROM`
                ngs_baf.pos[[i]]=x$POS
                ngs_baf.id[[i]]=x$ID
                ngs_baf[[i]]=x[[4]]
            }
            save(chr,ngs_baf.nm,ngs_baf.chr,ngs_baf.pos,ngs_baf,
                ngs_baf.id,filenm,file=paste0(projname,'bambaf_',chr,'.rda'))
        }
    }else{
        baf.all.chr=lapply(baf.all,function(x){return(x %>% dplyr::filter(.data$`#CHROM`==chr))})
        ngs_baf.nm=list()
        ngs_baf.chr=list()
        ngs_baf.pos=list()
        ngs_baf.id=list()
        ngs_baf=list()
        for (i in seq_along(baf.all.chr)){
            x=baf.all.chr[[i]]
            ngs_baf.nm[[i]]=names(x)[4]
            ngs_baf.chr[[i]]=x$`#CHROM`
            ngs_baf.pos[[i]]=x$POS
            ngs_baf.id[[i]]=x$ID
            ngs_baf[[i]]=x[[4]]
        }
        save(ngs_baf.nm,ngs_baf.chr,ngs_baf.pos,ngs_baf,ngs_baf.id,filenm,
            file=paste0(projname,'bambaf_',chr,'.rda'))
    }
}

bam.baf=function(filenm){
    `%>%`=tidyr::`%>%`
    cat('load ',filenm,'...\n')
    dt = data.table::fread(filenm,skip='#CHROM')
    nm=names(dt)[length(dt)]
    cat('individual ',nm,'\n')
    dt <- dt %>% dplyr::filter(.data$QUAL!='.') %>% dplyr::select(-.data$REF,-.data$ALT,-.data$QUAL,-.data$FILTER,-.data$INFO)
    ind_AD=match('AD',strsplit(dt[[4]],':')[[1]])
    ind_DP=match('DP',strsplit(dt[[4]],':')[[1]])
    bafi <- vapply(strsplit(dt[[5]],':'),
        function(x){a=as.numeric(strsplit(x[ind_AD],',')[[1]]);return(a[1]/(a[1]+a[2]))},0.0)
    dt[nm] <- bafi
    dt <- dt %>% dplyr::select(-.data$FORMAT)
    return(dt)
}

Try the iCNV package in your browser

Any scripts or data that you put into this service are public.

iCNV documentation built on Nov. 8, 2020, 11:12 p.m.