R/F2.4.1-get_loci_cnv_byEthic.R

# 1. prepare SNP files --------------------------------------------------------------------------------------------

# 1-1 prepare SNP files for cnv inference
prepare_files_for_cnvInfer = function(index_snp_file, r2_cutoff = 0.5, sample_ethic = "EUR",
                                      output_dir = "./", sample_name = "",
                                      vcf_file = "data/samples/DDBJ_A549/vcf_files/A549_snv.GATK.formatcor.vcf") {
        # Aim: to generate wgs info added snp file
        # 1. generate ld snp info df from index snp file
        # 2. add wgs info
        # Input: 1. index snp file ; 2. the ethnicity of the sample (not the SNPs)

        # Test #
        # index_snp_file = "./data/input_snps/LUC_Index_SNPs_20160607.csv"
        # r2_cutoff = 0.5
        # sample_ethic = "EUR"
        # output_dir = "./"
        # sample_name = ""
        # vcf_file_for_cnv = "/Volumes/Macintosh_HD_2/Research/00-PROJECTS/00-Allele_Specific_SNP_Finding/data/Samples/DDBJ_A427/vcf_files/A427_snv.GATK.formatcor.vcf"

        # source("./src/scripts/F1.1-get_ldsnp_info.R")
        # source("./src/scripts/F2.3-get_vcf_info.R")

        ldsnp_info_list = get_ldsnp_info_main(index_snp_file = index_snp_file,
                                              population = sample_ethic,
                                              output_dir = output_dir,
                                              r2_cutoff = r2_cutoff,
                                              for_cnv_call = T)
        ldsnp_info_addVcf_list = get_vcf_info_main(snp_info_file = ldsnp_info_list$output_file,
                                                   vcf_file = vcf_file,
                                                   output_dir = output_dir,
                                                   sample_name = sample_name)

        return (ldsnp_info_addVcf_list$snp_info_addVcf_df)

}

# 1-2. add locus information
add_locus_info = function(snp_info_addVcf_df) {
        # Aim: to add locus information (index SNPs in high LD are considered as one locus)
        ### Test ###
        # snp_info_addVcf_df = snp_info_addVcf_df_raw

        # source the indexSNP class
        # source("./src/scripts/F2.4.1.1-indexSNP_class.R")

        # generate {indexSNP object} for each index SNPs
        index_snps = unique(as.character(snp_info_addVcf_df$query_snp))
        snp_list = replicate(length(index_snps), list())
        names(snp_list) = index_snps

        for (i in 1:length(index_snps)) {
                snp_info_addVcf_df_sel_indexSnp = snp_info_addVcf_df[snp_info_addVcf_df$query_snp == index_snps[i], ]
                snp_list[[i]] = new("indexSNP", rsID = index_snps[i],
                                    ldSNPs = as.character(snp_info_addVcf_df[snp_info_addVcf_df$rsID %in% snp_info_addVcf_df$query_snp &
                                                                        snp_info_addVcf_df$query_snp == index_snps[i], "rsID"]),
                                    ldInfo = snp_info_addVcf_df[snp_info_addVcf_df$rsID %in% snp_info_addVcf_df$query_snp &
                                                                        snp_info_addVcf_df$query_snp == index_snps[i], "D."])
        }
        # calculate locus and locus D'
        index_snp_mat = matrix(nrow = length(snp_list), ncol = length(snp_list))

        for (i in 1:length(snp_list)) {
                for (j in 1:length(snp_list)) {
                        index_snp_mat[i, j] = ldRelation(snp_list[[i]], snp_list[[j]])
                }
        }

        index_snp_info_df = data.frame(query_snp = index_snps)
        rownames(index_snp_info_df) = index_snps
        index_snp_info_df$locus = NA
        index_snp_info_df$locus_D = NA

        for (i in 1:nrow(index_snp_info_df)) {
                locus_snp_idx = which(!is.na(index_snp_mat[i, ]))[1]
                index_snp_info_df$locus[i] = index_snps[locus_snp_idx]
                index_snp_info_df$locus_D[i] = index_snp_mat[i, locus_snp_idx]
        }

        # add locus information
        snp_info_addVcf_df$locus = index_snp_info_df[as.character(snp_info_addVcf_df$query_snp), "locus"]
        snp_info_addVcf_df$locus_D = index_snp_info_df[as.character(snp_info_addVcf_df$query_snp), "locus_D"]

        return (list(snp_info_addVcf_df = snp_info_addVcf_df,
                     index_snp_info_df = index_snp_info_df))
}


# 1-3. modify vcf information for further analysis
mod_vcf_info = function(snp_info_addVcf_df) {
        # Aim: to modify vcf df to make it more suitable for further analysis

        # Test #
        # snp_info_addVcf_df = snp_info_addVcf_list$snp_info_addVcf_df

        snp_info_addVcf_df$allele_1_count = NA
        snp_info_addVcf_df$allele_2_count = NA

        # 1. change column names
        names(snp_info_addVcf_df)[grepl("ref_count", names(snp_info_addVcf_df))] = "ref_count"
        names(snp_info_addVcf_df)[grepl("alt_count", names(snp_info_addVcf_df))] = "alt_count"
        # head(snp_info_addVcf_df)

        # 2. sort df by query snp / locus
        if ("locus" %in% names(snp_info_addVcf_df)) {
                snp_info_addVcf_df = arrange(snp_info_addVcf_df, locus)
                D_prime_sign = as.numeric(as.character(snp_info_addVcf_df$D.)) * snp_info_addVcf_df$locus_D > 0
        } else {
                snp_info_addVcf_df = arrange(snp_info_addVcf_df, query_snp)
                D_prime_sign = as.numeric(as.character(snp_info_addVcf_df$D.)) > 0
        }

        # 3. modify vcf information by D'
        snp_info_addVcf_df$allele_1_count[D_prime_sign] = snp_info_addVcf_df$ref_count[D_prime_sign]
        snp_info_addVcf_df$allele_2_count[D_prime_sign] = snp_info_addVcf_df$alt_count[D_prime_sign]

        snp_info_addVcf_df$allele_1_count[!D_prime_sign] = snp_info_addVcf_df$alt_count[!D_prime_sign]
        snp_info_addVcf_df$allele_2_count[!D_prime_sign] = snp_info_addVcf_df$ref_count[!D_prime_sign]

        return(snp_info_addVcf_df)
}


# 2. get loci cnv raw ---------------------------------------------------------------------------------------------
get_het_locus_summary_df = function(snp_info_addVcf_df, sample_ethic = "EUR", r2_cutoff = 0.5,
                                    distance_threshold = 50, min_ldsnp_num = 3, read_count_cutoff = 500,
                                    index_snp_info_df) {
        # To calculate CNV number for each locus
        # step 1: collect distritbution total SNPs for each locus

        # ----- Test ----- #
        # snp_info_addVcf_df = snp_info_addVcf_df
        # read_count_cutoff = 200
        # index_snp_info_df = snp_info_addVcf_list$index_snp_info_df
        # ---------------- #

        # calculate het-SNP count by LD
        snp_info_addVcf_df_narm = snp_info_addVcf_df[complete.cases(snp_info_addVcf_df) &
                                                             snp_info_addVcf_df$r2 >= r2_cutoff, ]
        snp_info_addVcf_df_narm_mk = mark_close_snp(snp_info_addVcf_df_narm, distance_threshold = distance_threshold)
        snp_info_addVcf_df_narm_rm_close_snp = filter(snp_info_addVcf_df_narm_mk, for_cnv_calculation == "Y")
        het_snp_summary_df_locus = snp_info_addVcf_df_narm_rm_close_snp %>% group_by(locus) %>%
                summarise(sum(allele_1_count), sum(allele_2_count))
        colnames(het_snp_summary_df_locus) = c("locus", "sum_allele_1_count", "sum_allele_2_count")
        # filter low count het SNPs
        sel_high_count_locus = table(snp_info_addVcf_df_narm_rm_close_snp$locus) > min_ldsnp_num # an input_SNP must have >= 3 het SNPs in LD
        sel_locus = names(table(snp_info_addVcf_df_narm_rm_close_snp$locus)[sel_high_count_locus])
        het_snp_summary_df_locus = filter(het_snp_summary_df_locus,
                                    sum_allele_1_count + sum_allele_2_count > read_count_cutoff &
                                            locus %in% sel_locus)

        # add SNPs in the same locus
        het_snp_summary_df = add_locus_snp(het_snp_summary_df_locus, index_snp_info_df)
        return (het_snp_summary_df)
}

# 2-1. Mark SNPs that are too close
mark_close_snp = function(snp_info_addVcf_df, distance_threshold = 50) {
        snp_info_addVcf_df$for_cnv_calculation = NA
        for (i in 1: nrow(snp_info_addVcf_df)) {
                if (i == 1)  {
                        snp_info_addVcf_df$for_cnv_calculation[i] = "Y"
                } else if (snp_info_addVcf_df$chr[i] == snp_info_addVcf_df$chr[i-1] &
                           abs(snp_info_addVcf_df$pos[i] - snp_info_addVcf_df$pos[i-1]) < distance_threshold ) {
                        snp_info_addVcf_df$for_cnv_calculation[i] = "N"
                } else {
                        snp_info_addVcf_df$for_cnv_calculation[i] = "Y"
                }
        }

        return (snp_info_addVcf_df)
}

# 2-2. add SNPs in the same locus
add_locus_snp = function(het_snp_summary_df_locus, index_snp_info_df) {
        index_snp_info_df$sum_allele_2_count =  index_snp_info_df$sum_allele_1_count = NA
        for (i in 1:nrow(index_snp_info_df)) {
                locus_i = index_snp_info_df$locus[i]
                if (locus_i %in% as.character(het_snp_summary_df_locus$locus)) {
                        j = which(as.character(het_snp_summary_df_locus$locus) == locus_i)
                        if (index_snp_info_df$locus_D[i] > 0) {
                                index_snp_info_df$sum_allele_1_count[i] = het_snp_summary_df_locus$sum_allele_1_count[j]
                                index_snp_info_df$sum_allele_2_count[i] = het_snp_summary_df_locus$sum_allele_2_count[j]
                        } else {
                                index_snp_info_df$sum_allele_1_count[i] = het_snp_summary_df_locus$sum_allele_2_count[j]
                                index_snp_info_df$sum_allele_2_count[i] = het_snp_summary_df_locus$sum_allele_1_count[j]
                        }
                }
        }

        het_snp_summary_df_querySNP = index_snp_info_df[complete.cases(index_snp_info_df), ]
        rownames(het_snp_summary_df_querySNP) = 1:nrow(het_snp_summary_df_querySNP)

        return (het_snp_summary_df_querySNP)
}

# get CNV info ---------------------------------------------------------------------------------------------------

get_loci_cnv_byEthic = function(index_snp_file, cnv_param_list,
                                sample_name = "", sample_ethic = "EUR", output_dir = "./") {
# Aim: to get loci cnv information of a particular ethic group
# Input: index snp file, vcf file, ethic
# Output: het_snp_summary_df_ethic (summarized info of loci cnv based on a particular ethic group)

        # Test #
        # index_snp_file = "./data/input_snps/LUC_Index_SNPs_20160607.csv"
        # sample_name = ""
        # sample_ethic = "ASN"
        # output_dir = "./"

        # 1. process snp data
        ## 1) get ld snp table; 2) add vcf information
        snp_info_addVcf_df_raw = prepare_files_for_cnvInfer(index_snp_file = index_snp_file,
                                                            r2_cutoff = cnv_param_list$r2_cutoff,
                                                            sample_name = sample_name,
                                                            sample_ethic = sample_ethic,
                                                            output_dir = output_dir,
                                                            vcf_file = cnv_param_list$vcf_file)
        ## modify vcf information for further analysis
        snp_info_addVcf_list = add_locus_info(snp_info_addVcf_df_raw)
        snp_info_addVcf_df = mod_vcf_info(snp_info_addVcf_list$snp_info_addVcf_df)

        # 2. get loci cnv by calculating summing up r2 SNP distribution
        het_snp_summary_df_r2 = get_het_locus_summary_df(snp_info_addVcf_df = snp_info_addVcf_df,
                                                         sample_ethic = sample_ethic,
                                                         r2_cutoff = cnv_param_list$r2_cutoff,
                                                         distance_threshold = cnv_param_list$distance_threshold,
                                                         min_ldsnp_num = cnv_param_list$min_ldsnp_num,
                                                         read_count_cutoff = cnv_param_list$read_count_cutoff,
                                                         index_snp_info_df = snp_info_addVcf_list$index_snp_info_df)

        return(het_snp_summary_df_r2)

}
foreverycc/AlleleSNP_Package documentation built on May 16, 2019, 1:50 p.m.