# R/gather_vcf_functions.R

# ## Functions for loading data into sqlite db
.cbtsv_tosql <- function (tsv_file, db_con, tbl_name) {
  # Load a tab-separated VCF export into a permanent table of db_con.
  #
  # tsv_file: path to a tsv generated using vcf2tsv in vcflib; its first line
  #           is read as the column header
  # db_con:   dplyr sqlite db connection
  # tbl_name: name of the table to create in the sqlite db
  #
  # Returns the value of dplyr::copy_to().

  header_line <- readLines(tsv_file, n = 1)
  if (length(header_line) == 0) {
    stop("No header line found in ", tsv_file, call. = FALSE)
  }

  ## DP is used twice in the vcf header, so the first column named exactly
  ## "DP" is renamed to PLATDP. Matching whole column names (rather than the
  ## first "DP" substring of the raw header line) keeps columns such as DP4
  ## from being renamed by accident.
  vcf_cols <- stringr::str_split(header_line, "\t")[[1]]
  first_dp <- match("DP", vcf_cols)
  if (!is.na(first_dp)) {
    vcf_cols[first_dp] <- "PLATDP"
  }

  ## the header row is skipped on read and the adjusted names applied after
  vcf <- data.table::fread(tsv_file, sep = "\t",
                           header = FALSE, skip = 1,
                           stringsAsFactors = FALSE)
  data.table::setnames(vcf, colnames(vcf), vcf_cols)

  dplyr::copy_to(db_con, vcf, name = tbl_name, temporary = FALSE,
                 indexes = list("CHROM", "POS", "SAMPLE"))
}

# Calculating purity and filtering indels
.pur_tbl <- function (vcf_tbl, db_con, tbl_name) {
  # Build a per-row read-count / purity table from a vcf table and store it
  # as a permanent table in the sqlite db.
  #
  # vcf_tbl:  name of the sqlite table generated from a vcf file using cbtsv;
  #           must provide CHROM, POS, SAMPLE, INDEL, DP, DP4 and SP columns
  # db_con:   dplyr sqlite db connection
  # tbl_name: name of the table to create in the sqlite db
  #
  # Returns the value of dplyr::copy_to().

  dp4_parts <- c("Ref_For", "Ref_Rev", "Alt_For", "Alt_Rev")

  ## keep only the needed columns, drop indel rows (INDEL != 0), then pull
  ## the rows into memory
  snp_rows <- dplyr::tbl(src = db_con, from = vcf_tbl)
  snp_rows <- dplyr::select(snp_rows, CHROM, POS, SAMPLE, INDEL, DP, DP4, SP)
  snp_rows <- dplyr::filter(snp_rows, INDEL == 0)
  snp_rows <- dplyr::collect(snp_rows)

  ## split DP4 into its four parts; separate() yields character columns, so
  ## each one is converted to numeric before any arithmetic
  counts <- tidyr::separate(snp_rows, DP4, dp4_parts)
  counts <- dplyr::mutate(counts,
                          Ref_For = as.numeric(Ref_For),
                          Ref_Rev = as.numeric(Ref_Rev),
                          Alt_For = as.numeric(Alt_For),
                          Alt_Rev = as.numeric(Alt_Rev))

  ## Ref and Alt sum the two directions; Pur is the Ref share of Ref + Alt
  purity <- dplyr::mutate(counts,
                          Ref = Ref_For + Ref_Rev,
                          Alt = Alt_For + Alt_Rev,
                          Pur = Ref / (Ref + Alt))

  dplyr::copy_to(db_con, purity, name = tbl_name, temporary = FALSE,
                 indexes = list("CHROM", "POS", "SAMPLE"))
}

# Generating purity by platform summary
.pur_plat <- function(pur_plat_tbl, db_con, tbl_name) {
  # Summarise a purity table by position and store the result as a permanent
  # table in the sqlite db.
  #
  # pur_plat_tbl: name of a sqlite table with CHROM, POS, Ref and Alt columns
  # db_con:       dplyr sqlite db connection
  # tbl_name:     name of the table to create in the sqlite db
  #
  # Returns the value of dplyr::compute().

  per_row <- dplyr::tbl(src = db_con, from = pur_plat_tbl)
  by_position <- dplyr::group_by(per_row, CHROM, POS)

  ## total Ref and Alt for every CHROM/POS pair
  totals <- dplyr::summarize(by_position, Ref = sum(Ref), Alt = sum(Alt))

  ## purity recomputed from the summed counts
  with_purity <- dplyr::mutate(totals, Pur = Ref / (Ref + Alt))

  dplyr::compute(with_purity, name = tbl_name, temporary = FALSE)
}

# Joining two purity platform tables
.pur_plat_join <- function(pur_plat_tbl1, pur_plat_tbl2, db_con, tbl_name,
                           plat1_name = "plat1", plat2_name = "plat2"){
  # Join the Pur columns of two purity platform tables on CHROM and POS and
  # store the result as a permanent table in the sqlite db.
  #
  # pur_plat_tbl1, pur_plat_tbl2: names of sqlite tables with CHROM, POS and
  #                               Pur columns
  # db_con:     dplyr sqlite db connection
  # tbl_name:   name of the table to create in the sqlite db; when not
  #             supplied the previously hard-coded name "pur_join" is used
  # plat1_name: column name given to Pur from pur_plat_tbl1
  # plat2_name: column name given to Pur from pur_plat_tbl2
  #
  # Returns the value of dplyr::compute().

  ## earlier versions ignored tbl_name and always wrote "pur_join"; keep that
  ## name as the fallback so calls that omit tbl_name still work
  if (missing(tbl_name)) {
    tbl_name <- "pur_join"
  }

  ## CHROM, POS and the Pur column (renamed to col_name) of one table; the
  ## grouping by CHROM and POS of the original implementation is retained
  platform_purity <- function(src_tbl, col_name) {
    dplyr::tbl(src = db_con, from = src_tbl) %>%
      dplyr::group_by(CHROM, POS) %>%
      dplyr::rename(!!col_name := Pur) %>%
      dplyr::select(CHROM, POS, !!col_name)
  }

  plat1_tbl <- platform_purity(pur_plat_tbl1, plat1_name)
  plat2_tbl <- platform_purity(pur_plat_tbl2, plat2_name)

  # may want to change to outer_join later
  ## explicit keys so the join never depends on the platform column names
  dplyr::inner_join(plat1_tbl, plat2_tbl, by = c("CHROM", "POS")) %>%
    dplyr::compute(name = tbl_name, temporary = FALSE)
}
# usnistgov/peprr documentation built on May 3, 2019, 2:38 p.m.