R/F2.3-get_vcf_info.R

# 02-get genotype by querying vcf files
## ==================================================================================
## Function: get_vcf_info_main
###  Aim: To get the genotypes of input SNPs by vcf files
###  INPUT: input snp file (.csv)
###  OUTPUT: input snp file with information of genotypes of vcf files
## ==================================================================================

# outline ---------------------------------------------------------------------------------------------------------

# # 1. read snp file into data frame
# # snp_list should contain both the df and GRange object
# snp_list = read_snpFile()
#
# # 2. read peak files into a list of GRange object
# vcf_list = gen_vcf_list(vcf_dir, snp_info_gr)
#
# # 3. add overlapping info to original snp table
# snp_df_proc = add_vcf_info(snp_info_df, vcf_list)

# 0. load packages ------------------------------------------------------------------------------------------------

load_packages_2.3 = function(){
        packages = c('dplyr', 'VariantAnnotation','GenomicRanges', 'Biostrings')
        load = lapply(packages, require, character.only = T)
}

# function: write.csv0
# source("./src/scripts/T1-toolbox.R")

# toolbox: get short names

get_short_fileName = function(fileName_fullPath) {
        str_vec = strsplit(fileName_fullPath, "/")[[1]]
        str_vec[length(str_vec)]
}

# 0. generate output file name ------------------------------------------------------------------------------------

gen_output_file_vcfInfo = function(snp_info_file, output_dir = "./", sample_name = "") {
        # Aim: to get output file name
        snp_info_vec = strsplit(snp_info_file, "/")[[1]]
        snp_batch_id = gsub(".csv", "", snp_info_vec[length(snp_info_vec)])
        output_file = paste0(output_dir, "/", snp_batch_id, "_", sample_name, "_genotypeInfo.csv")

        return (output_file)

}

# 1. read snp file ------------------------------------------------------------------------------------------------

# source ("./R/F2.1-get_alleleDist_info.R")
# same as read_inputSNP_file

# Test #
# snp_info_list = read_inputSNP_file("./data/haploreg_files/LUC_Index+LD_SNPs_20160607.csv")


# 2. read vcf files into a list -----------------------------------------------------------------------------------

gen_vcf_list = function(vcf_dir = NA, vcf_file = NA, snp_info_gr) {
# Aim: to generate vcf list by reading vcf files
# Input: 1. vcf dir; 2. snp_info_gr
# Output: vcf list

        # Test #
        # vcf_dir = "./data/samples/DDBJ_A549/vcf_files/"

        if (!is.na(vcf_dir)) {
                vcf_files = list.files(vcf_dir, ".vcf$", full.names=T)
                vcf_files_short = list.files(vcf_dir, ".vcf$", full.names=F)
        }
        if (!is.na(vcf_file)) {
                vcf_files = vcf_file
                vcf_files_short = get_short_fileName(vcf_file)
        }

        vcf_list = replicate(length(vcf_files), list())
        snp_param = ScanVcfParam(which= snp_info_gr)

        # read vcf files
        for (i in 1:length(vcf_files)) {
                cat("reading", vcf_files[i], "\n")
                # read vcf files
                vcf_obj_i = read_vcfFile(vcf_files[i], snp_param)
                vcf_list[[i]] = vcf_obj_i
        }

        names(vcf_list) = vcf_files_short

        return (vcf_list)
}

# helper function
read_vcfFile = function(vcf_file, param){
        compressVcf <- bgzip(vcf_file, tempfile())
        idx <- indexTabix(compressVcf, "vcf")
        tab <- TabixFile(compressVcf, idx)
        vcf <- readVcf(tab, "hg19", param)
}

# Test #
# vcf_list = gen_vcf_list(snp_info_gr = snp_info_list$snp_info_gr)

# 3. process vcf list data -------------------------------------------------------------------------------------
process_vcf_list = function(vcf_list) {
# To process vcf list
# Output: to get a processed df for each vcf elemnt in the list
# - row: SNPs
# - col: seqnames start end width strand paramRangeID REF ALT QUAL FILTER ref_count alt_count
# helper functions: get_vcf_software, sel_hetSnps_vcf, process_vcf_data

        # Test #
        # vcf_file = "./data/samples/DDBJ_A549/vcf_files/A549_snv.GATK.formatcor.vcf"

        vcf_df_list = replicate(length(vcf_list), list())
        names(vcf_df_list) = names(vcf_list)

        for (i in 1:length(vcf_list)) {
                vcf_data_i = vcf_list[[i]]
                # get the software that generates the vcf file
                vcf_software_i = get_vcf_software (names(vcf_list)[[i]])
                # subset vcf with het genotypes
                vcf_data_i_het = sel_hetSnps_vcf(vcf_data_i)
                # process vcf data to data.frame format
                vcf_data_i_df = process_vcf_data(vcf_data_i_het, vcf_software_i)
                # assign vcf_df_list with processed data.frame
                vcf_df_list[[i]] = vcf_data_i_df
        }

        # return to our df
        return (vcf_df_list)
}


# 3-1 helper function
get_vcf_software = function(vcf_fileName) {
# Aim: to get the software that was used to generate the vcf file
        if (grepl("[Gg][Aa][Tt][Kk]", vcf_fileName)) {
                return ("GATK")
        }
        if (grepl("[Ss][Aa][Mm][Tt][Oo][Oo][Ll][Ss]", vcf_fileName)) {
                return ("Samtools")
        }
        if (grepl("[Bb][Ii][Ss][Ss][Nn][Pp]", vcf_fileName)) {
                return ("BisSNP")
        }
        else {
                stop("Please add the software name you used in generating vcf file to your vcf file name.
                     for example: A549.GATK.vcf")
        }
}

# 3-2 helper function
sel_hetSnps_vcf = function(vcf_data){
# Aim: to select heterozygous SNPs from vcf data

        genotype = geno(vcf_data)$GT
        vcf_data_het = vcf_data[genotype[,1] == "0/1"]

        return(vcf_data_het)
}

# 3-3 helper function
process_vcf_data = function(vcf_data, vcf_software = "GATK") {
# Aim: to transform vcf data to data.frame format
# Input: vcf_data, (vcf annotation format)
# Output: vcf data frame, with genotype, counts of ref and alt alleles added
# helper function: extract_allelic_dist
        ### Test ###
        # vcf_data = vcf_data_i_het
        # vcf_software = vcf_software_i

        # calculate ref and alt allele counts
        vcf_allelic_dist = extract_allelic_dist(vcf_data, vcf_software)

        ref_count_vec = sapply(vcf_allelic_dist, function(x) {
                as.numeric(strsplit(x, ",")[[1]][1])
        })
        alt_count_vec = sapply(vcf_allelic_dist, function(x) {
                as.numeric(strsplit(x, ",")[[1]][2])
        })

        # add ref and alt allele counts information
        vcf_data_gr = GenomicRanges::rowRanges(vcf_data)
        names(vcf_data_gr) = 1:length(vcf_data_gr)
        # vcf_data_df = as.data.frame(vcf_data_gr) # This might not work in R package. Don't know why
        vcf_data_df = data.frame(seqnames = vcf_data_gr@seqnames, vcf_data_gr@ranges, vcf_data_gr@elementMetadata)
        vcf_data_df = dplyr::mutate(vcf_data_df, ref_count = ref_count_vec, alt_count = alt_count_vec)

        # modify the data format (from DNAString to character)
        vcf_data_df$ALT = sapply(vcf_data_df$ALT, function(x) as.character(unlist(x)))
        # vcf_data_df$ALT = sapply(vcf_data_df$ALT, function(x) toString(x))

        return(vcf_data_df)
}

# 3-3-1 helper function
extract_allelic_dist = function(vcf_data, vcf_software = "GATK"){
# Aim: to extract allelic distribution in vcf file generated by various softwares

        if (vcf_software == "GATK") {
                AD = geno(vcf_data)$AD
                AD_1 = gsub("c\\(","",paste(AD))
                AD_2 = gsub("\\)","", AD_1)
                AD_3 = gsub(":",", ",AD_2)
                AD_4 = gsub(", ", ",", AD_3)
                return(AD_4)
        }

        if (vcf_software == "Samtools") {
                DP4 = info(vcf_data)$DP4
                AD = sapply(DP4, function(x) paste0(x[1] + x[2], ",", x[3] + x[4]))
                return(AD)
        }

        if (vcf_software == "BisSNP") {
                DP4 = geno(vcf_data)$DP4[ , , c(1,2,3,4)]
                AD = apply(DP4, 1, function(x){paste0(x[1]+x[2],',',x[3]+x[4])})
                return(AD)
        }
}

# Test #
# vcf_df_list = process_vcf_list(vcf_list)

# 4. to add genotype information to snp info data frame -----------------------------------------------------------

add_vcf_info = function(snp_info_df, vcf_df_list) {
# Aim: to get genotype information added

        # add snp genotype information
        for (i in 1:length(vcf_df_list)) {
                vcf_df_i = vcf_df_list[[i]]
                vcf_file_i = names(vcf_df_list)[i]
                # select SNPs with het genotypes
                sel_rows = as.character(snp_info_df$rsID) %in% as.character(vcf_df_i$paramRangeID)

                # add vcf information
                snp_info_df[, vcf_file_i] = sel_rows
                snp_info_df[sel_rows, paste0(vcf_file_i, "_ref_count")] = vcf_df_i$ref_count
                snp_info_df[sel_rows, paste0(vcf_file_i, "_alt_count")] = vcf_df_i$alt_count
        }

        return(snp_info_df)
}



# main function ---------------------------------------------------------------------------------------------------

get_vcf_info_main = function(snp_info_file, vcf_dir = NA, vcf_file = NA, output_dir = "./", sample_name = "", output_file = NA) {
        ### Test ###
        # snp_info_file = snp_file_loc
        # vcf_dir = "./inst/extdata/sample/A549/vcf_files/"
        # vcf_file = NA
        # output_dir = "./"
        # sample_name = ""
        # output_file = NA


        # 0. load packages
        load_packages_2.3()
        cat("get vcf information for SNPs ... \n")

        # 0. generate output file name
        if (is.na(output_file)) {
                output_file = gen_output_file_vcfInfo(snp_info_file = snp_info_file,
                                                      output_dir = output_dir,
                                                      sample_name = sample_name)
        }
        cat("    output file name:", output_file, '\n')
        # 1. read snp file into data frame
        snp_info_list = read_inputSNP_file(snp_info_file)

        # 2. read vcf file into a list
        vcf_list = gen_vcf_list(vcf_dir = vcf_dir,
                                vcf_file = vcf_file,
                                snp_info_gr = snp_info_list$snp_info_gr)

        # 3. process vcf data
        vcf_df_list = process_vcf_list(vcf_list)

        # 4. add overlapping info to original snp table
        snp_info_addVcf_df = add_vcf_info (snp_info_df = snp_info_list$snp_info_df,
                                           vcf_df_list = vcf_df_list)

        if (output_file  != F) { # if output_file == F, do not write down file
                write.csv0(snp_info_addVcf_df, output_file)
        }

        cat("vcf information added ... \n")
        return(list(snp_info_addVcf_df = snp_info_addVcf_df, output_file = output_file))
}

# Test #
# snp_file_loc = "./input_snp_example2_A549_assnp/input_snp_example2_riskPop_0.5.csv"
# get_vcf_info_main(snp_info_file = snp_file_loc, vcf_dir = "./inst/extdata/sample/A549/vcf_files/")
foreverycc/AlleleSNP_Package documentation built on May 16, 2019, 1:50 p.m.