R/prepareVariants.R

Defines functions getVcfs getSNVandINDEL VariantSummaryReport plotSNVReplicateOverlap

Documented in getSNVandINDEL getVcfs plotSNVReplicateOverlap VariantSummaryReport

#' Read Vcf files to GRanges
#' @details Takes VCF output files from variant calling pipeline and combines
#'  a GRanges object ready for fasta generation.
#' @param vcf_dir Path to directory with Vcf files of all replicate samples
#' @param pattern Pattern to grep the specific Vcf files in the directory
#' @param share_num Number of samples that must contain the variant.
#' Two options, percentage format or sample number.
#' @import customProDB
#' @import GenomeInfoDb
#' @export


getVcfs <- function(vcf_dir, pattern =".vcf", share_num = "100%"){

  vcfFile_path <- paste(vcf_dir,list.files(vcf_dir, pattern = pattern),sep="/")
  vcfs <- lapply(vcfFile_path, function(x) InputVcf(x))

  for (i in 1:length(vcfs)) {
    seqlevelsStyle(vcfs[[i]][[1]])<-"NCBI"
  }

  shared <- Multiple_VCF(vcfs, share_num=share_num)

  # shared_out <- getSNVandINDEL(shared, exon_annotation, procodingseq)

  return(shared)
}

#' Annotate Variants
#' @details Annotates the variants in a granges object produced by getVcfs
#' @param granges_variants GRanges object produced by getVcfs
#' @param exon_annotation Exon annotation from annotation folder
#' @param procodingseq Coding sequences from annotation folder
#' @import customProDB
#' @import rtracklayer
#' @export

getSNVandINDEL <- function(granges_variants, exon_annotation, procodingseq){

  variants_df <- values(granges_variants)
  indel_index <- sapply(variants_df$REF,nchar) != sapply(variants_df$ALT,nchar)

  indelvcf <- granges_variants[indel_index]
  snvvcf <- granges_variants[!indel_index]

  postable_snv <- Positionincoding(snvvcf, exon_annotation)
  postable_indel <- Positionincoding(indelvcf, exon_annotation)

  txlist <- unique(postable_snv[, 'txid'])
  codingseq <- procodingseq[procodingseq[, 'tx_id'] %in% txlist,]
  snvtab <- aaVariation (postable_snv, codingseq)

  txlist_indel <- unique(postable_indel[, 'txid'])
  codingseq_indel <- procodingseq[procodingseq[, 'tx_id'] %in% txlist_indel, ]

  return(list(postable_snv = postable_snv, SNV_tab = snvtab,
              postable_indel = postable_indel, codingseq_indel = codingseq_indel,
              snvvcf = snvvcf, indelvcf = indelvcf))
}

#' Generate a Variant Summary Report
#' @details Prints a table of the location of the variants. Caution:
#' Because the Varlocation function of CustomProDB is very slow, this takes a long time
#' to compute.
#' @param snv_and_indel List object produced by getSNVandINDEL
#' @param ids Id mapping from annotation folder
#' @param txdb Txdb object from annotation folder
#' @param print boolean, should the result be printed in working directory as pdf
#' or be returned as a list.
#' @import customProDB
#' @import ggplot2
#' @export


VariantSummaryReport <- function(snv_and_indel, ids, txdb, print = TRUE){

  SNVloc <- Varlocation(snv_and_indel$snvvcf,txdb,ids)
  indelloc <- Varlocation(snv_and_indel$indelvcf,txdb,ids)
  output <- list(SNV = table(SNVloc[,'location']),
                 INDEL = table(indelloc[,'location']))

  if(print){
    pdf("Variant_summary_report.pdf", width = 6, height = 4)
    # SNV's
    plot_df <- data.table(Position = names(output[[1]][-3]), SNVs = as.numeric(output[[1]][-3]), AA_Impact = FALSE)
    coding_df <- data.table(Position = "Coding",
                            SNVs =c(nrow(unique(snv_and_indel$SNV_tab[snv_and_indel$SNV_tab$vartype == "synonymous",c("pos", "chr")])),
                                    nrow(unique(snv_and_indel$SNV_tab[snv_and_indel$SNV_tab$vartype == "non-synonymous",c("pos", "chr")]))),
                            AA_Impact = c(FALSE, TRUE))
    plot_df <- rbind(plot_df, coding_df)
    p <- ggplot(plot_df) +
      geom_bar(aes(x=Position, y=SNVs, fill = AA_Impact), stat = "identity") +
      ggtitle(paste0("Location of ", names(output)[1],"s")) +
      theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1)) +
      ylab(paste0("# of ", names(output[1])))
    print(p)
    # Indels
    plot_df2 <- data.table(Position = names(output[[2]]), SNVs = as.numeric(output[[2]]),
                          AA_Impact = c(F,F,T,F,F,F,F))
    p2 <- ggplot(plot_df2) +
      geom_p(aes(x=Position, y=SNVs, fill = AA_Impact), stat = "identity") +
      ggtitle(paste0("Location of ", names(output)[2],"s")) +
      theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1)) +
      ylab(paste0("# of ", names(output[2])))

    # pl <- rbind(plot_df[, Variation_type := "SNV"], plot_df2[, Variation_type := "INDEL"])
    # p <- ggplot(pl) +
    #   geom_bar(aes(x=Position, y=SNVs, fill = AA_Impact), stat = "identity") +
    #   ggtitle(paste0("Location of ", names(output)[2],"s")) +
    #   theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1)) +
    #   ylab(paste0("# of ", names(output[2]))) +
    #   facet_grid(. ~ Variation_type)
    print(p2)
    dev.off()
  } else{
    return(output)
  }

  table(SNVloc[,'location'])
  table(indelloc[,'location'])

}
#' Plot the overlap of identified Variants between replicates
#' @param vcf_dir Path to folder with all replicate vcf files
#' @param pattern pattern to match
#' @import data.table
#' @import VennDiagram
#' @export

plotSNVReplicateOverlap <- function(vcf_dir = vcf_directory, pattern = ".vcf$", outfile = "SNVReplicate_overlap.tiff"){
  files <- list.files(vcf_dir, pattern = pattern)
  vcfs <- lapply(paste0(vcf_dir,"/",files), function(x) fread(x, sep = "\t", na.strings = "#", skip = "CHROM"))
  vennlist <- lapply(vcfs, function(x) paste0(x$'#CHROM', x$POS))
  names(vennlist) <- paste0("Rep. ", seq_along(vennlist))
  venn.diagram(vennlist,
               filename = outfile,
               print.mode = "raw", fill = c("red", "green"), alpha = 0.5,
               cex = 2.5, margin = 0.06, cat.cex = 2.5, cat.dist = c(0.06,0.06), lty ="blank")
}
mffrank/RNAseqToCustomFASTA documentation built on May 22, 2019, 7:55 p.m.