R/correctInversionPhasing.R

Defines functions sliceRanges correctHetInv correctHomInv phaseHETinversion correctInvertedRegionPhasing

Documented in correctHetInv correctHomInv correctInvertedRegionPhasing phaseHETinversion sliceRanges

#' Correct StrandPhaseR phasing over inverted regions
#' 
#' This function attempts to correct phasing over inverted regions using both \pkg{\link{breakpointR}} and \pkg{\link{StrandPhaseR}} results.
#' 
#' @param input.bams A path to a VCF file to be loaded.
#' @param outputfolder A path to a folder where all summary and intermediate results will be stored.
#' @param inv.bed A BED formatted file that contains likely inverted regions to evaluate.
#' @param recall.phased If set to \code{TRUE},inversion signatures will be re-called using phased Strand-seq reads (Useful to redefine HET inversion boundaries).
#' @param het.genotype If set to 'strict' heterozygous inversion has to be supported by both 'breakpointR.data' and 'strandphaseR.data',
#' if set to 'lenient' support from either 'breakpointR.data' only is sufficient.
#' @param chromosomes Limit analysis to a certain chromosomes only.
#' @param snv.positions A path to a VCF files containing SNV positions to phase.
#' @param composite.file ...
#' @param breakpointR.data A path to results obtained by running \code{\link{breakpointR}} package on a given sample.
#' @param strandphaseR.data A path to results obtained by running \code{\link{StrandPhaseR}} package on a given sample.
#' @param vcfs.files A path to a folder where all VCF files to be corrected are stored (sample specific).
#' @param overwrite.results Set to \code{FALSE} if existing corrected VCF files should be used instead of creating a new copy of original VCF file. 
#' @importFrom breakpointR synchronizeReadDir removeReadPileupSpikes runBreakpointr genotype.binom
#' @importFrom GenomeInfoDb seqnames
#' @importFrom IRanges subsetByOverlaps
#' @importFrom utils read.table write.table
#' @inheritParams bamregion2GRanges
#' @inheritParams phaseChromosome
#' @inheritParams phaseHETinversion
#' @inheritParams vcf2vranges
#' @return \code{NULL} or \code{data.frame} depending if 'outputfolder' is defined.
#' @author David Porubsky
#' @export
#'
correctInvertedRegionPhasing <- function(input.bams, outputfolder=NULL, inv.bed=NULL, recall.phased=FALSE, het.genotype='strict', chromosomes=NULL, snv.positions=NULL, composite.file=NULL, breakpointR.data=NULL, strandphaseR.data=NULL, pairedEndReads=TRUE, min.mapq=10, background=0.05, vcfs.files=NULL, lookup.bp=1000000, lookup.blacklist=NULL, bsGenome=NULL, ref.fasta=NULL, assume.biallelic=TRUE, overwrite.results=TRUE) {
  
  ## Check user input ##
  ######################
  ## Check if set of inverted regions is defined
  if (!is.null(inv.bed)) {
    if  (file.exists(inv.bed)) {
      ## Load set of inverted regions
      inv.df <- utils::read.table(file = inv.bed, header = FALSE, stringsAsFactors = FALSE)
      colnames(inv.df)[c(1:3)] <- c('seqnames', 'start', 'end')
      inv.gr <- makeGRangesFromDataFrame(df = inv.df)
      call.inversion <- FALSE
    } else {
      message("Parameter 'inv.bed' is not specified or given file doesn't exists.\nLikely inverted regions will be called de novo.")
      call.inversion <- TRUE
      recall.phased <- TRUE
    }
  } else {
    call.inversion <- TRUE
    recall.phased <- TRUE
  }  
  
  ## Check input format of blacklisted regions and load a BED file if needed
  if (!is.null(lookup.blacklist) & !class(lookup.blacklist) == 'GRanges') {
    if (file.exists(lookup.blacklist)) {
      lookup.blacklist.df <- utils::read.table(file = lookup.blacklist, header = FALSE, sep = '\t', stringsAsFactors = FALSE)
      if (ncol(lookup.blacklist.df) >= 3) {
        lookup.blacklist.df <- lookup.blacklist.df[,c(1:3)]
        colnames(lookup.blacklist.df) <-  c('seqnames', 'start', 'end')
        lookup.blacklist <- GenomicRanges::makeGRangesFromDataFrame(lookup.blacklist.df)
      } else {
        lookup.blacklist <- NULL
        warning("The BED file, '", lookup.blacklist, "' doesn't contain required fields ('chr.name', 'start', 'end').")
      }
    } else {
      lookup.blacklist <- NULL
      warning("The BED file, '", lookup.blacklist, "' doesn't exists.")
    }
  }
  
  ## Abort function execution if breakpointR.data are not defined
  if (!is.null(composite.file)) {
    if (file.exists(composite.file)) {
      message("Loading composite file: ", composite.file)
      dir.reads <- get(load(composite.file))
    } else {
      warning("User defined 'composite.file' doesn't exists !!!")
    }  
  } else if (!is.null(breakpointR.data) & file.exists(breakpointR.data)) {
    ## Load breakpointR.data and create a composite file
    breakp.data <- list.files(breakpointR.data, full.names = TRUE)
    dir.reads <- breakpointR::synchronizeReadDir(files2sync = breakp.data)
  } else {
    stop("Required parameters 'composite.file' or 'breakpointR.data' are not defined or files doesn't exists, aborting ...")
  }
  ## Filter by mapping quality
  if (min.mapq > 0) {
    dir.reads <- dir.reads[dir.reads$mapq >= min.mapq]
  }
  
  ## Abort function execution if strandphaseR.data are not defined
  if (is.null(strandphaseR.data) & !file.exists(strandphaseR.data)) {
    stop("Required parameter 'strandphaseR.data' is not defined or file doesn't exists, aborting ...")
  } else {
    phased.data <- list.files(strandphaseR.data, pattern = "reads.RData", full.names = TRUE)
    phased.files <- basename(basename(phased.data))
    phased.chroms <- sapply(phased.files, function(x) strsplit(x, '_')[[1]][1])
  }
  
  ## Get chromosomes to process
  if (is.null(chromosomes)) {
    chromosomes <- phased.chroms
  }
  chroms2use <- intersect(chromosomes, phased.chroms)
  
  ## Create outputfolder if it doesn't exists
  if (!is.null(outputfolder)) {
    if (!dir.exists(outputfolder)) {
      dir.create(outputfolder)
    }
  }
  
  ## Call inversions in composite file if inverted regions not defined ##
  #######################################################################
  if (call.inversion) {
    ## Remove pileup spikes in coverage
    dir.reads <- breakpointR::removeReadPileupSpikes(dir.reads, max.pileup = 10)
    ## Analyze breakpoints in composite files
    breakpoints <- breakpointR::runBreakpointr(bamfile = dir.reads, 
                                               pairedEndReads = pairedEndReads, 
                                               chromosomes = chromosomes,
                                               windowsize = 10000, 
                                               peakTh = 0.33, 
                                               binMethod = "size", 
                                               genoT = 'binom',
                                               background = background, 
                                               minReads = 50)
    break.regions <- breakpoints$counts
    inv.gr <- break.regions[break.regions$states != 'cc']
  }
  
  ## Process each chromosome separately ##
  ########################################
  correction.summary <- list()
  for (chr in chroms2use) {
    ## Initialize corrected VCF file
    if (!is.null(vcfs.files)) {
      vcf.filename <- paste0(chr, '_phased.vcf')
      vcf.file <- file.path(vcfs.files, vcf.filename)
      if (file.exists(vcf.file)) {
        ## Make a copy of an input VCF file
        vcf.file.corr <- file.path(dirname(vcf.file), gsub(vcf.filename, pattern = '\\.vcf', replacement = '_INVcorr.vcf', ignore.case = TRUE))
        if (!file.exists(vcf.file.corr) | overwrite.results == TRUE) {
          message("Writing a VCF file copy, '", vcf.file.corr,"' for inversion correction.")
          file.copy(vcf.file, vcf.file.corr, overwrite = TRUE)
        } else {
          message("Using existing VCF file, '", vcf.file.corr,"' for inversion correction.")
        } 
      } else {
        warning(paste0("VCF file: ", vcf.file, " doesn't exists, skipping correction ..."))
      }  
    } else {
      warning("Parameter 'vcfs.files' not defined, skipping correction ...")
    }
    
    ## Load strandphaseR.data and create a composite phased file (per chromosome)
    phased.data.chr <- file.path(strandphaseR.data, paste0(chr, '_reads.RData'))
    reads <- get(load(phased.data.chr))
    hap1.reads <- reads[['hap1']]
    hap2.reads <- reads[['hap2']]
    GenomicRanges::strand(hap1.reads) <- "+" ## HAP1 == CRICK
    GenomicRanges::strand(hap2.reads) <- "-" ## HAP2 == WATSON
    phased.reads <- c(hap1.reads, hap2.reads)
    phased.reads <- GenomicRanges::sort(phased.reads, ignore.strand=TRUE)
    ## Filter by mapping quality
    if (min.mapq > 0) {
      phased.reads <- phased.reads[phased.reads$mapq >= min.mapq]
    }
    
    if (recall.phased) {
      breakp.phased <- breakpointR::runBreakpointr(bamfile = phased.reads, 
                                                   pairedEndReads = FALSE, 
                                                   chromosomes = chr,
                                                   windowsize = 10000, 
                                                   peakTh = 0.33, 
                                                   binMethod = "size", 
                                                   genoT = 'binom',
                                                   background = background, 
                                                   minReads = 50)
      detected.ranges <- breakp.phased$counts
      ## Report regions that are genotyped as purely WW or CC as those point to likely HET inversions
      het.inv.gr <- detected.ranges[detected.ranges$states %in% c('ww', 'cc')]
    }
    
    ## Subset inversions for a given chromosome
    inv.gr.chr <- inv.gr[GenomeInfoDb::seqnames(inv.gr) == chr] 
    if (length(inv.gr.chr) > 0) {
      message("Correcting phasing for inversions on chromosome: ", chr)
    } else {
      message("No inversions reported for chromosome: ", chr)
      next
    }
    
    ## Process each inversion separately ##
    #######################################
    for (j in seq_along(inv.gr.chr)) {
      roi.gr <- inv.gr.chr[j]
      roi.gr <- GenomeInfoDb::keepSeqlevels(roi.gr, value = GenomeInfoDb::seqnames(roi.gr))
      ## Get directional and phased reads for a given region
      roi.dir.reads <- IRanges::subsetByOverlaps(dir.reads, roi.gr)
      ## If 'recall.phased' set to TRUE try to use ranges defined by breakpoint calling on phased reads
      if (recall.phased) {
        gr <- IRanges::subsetByOverlaps(het.inv.gr, roi.gr)
        ## Calculate what portion of the original range 
        shared.width <- max(GenomicRanges::width(GenomicRanges::intersect(roi.gr, gr)) / GenomicRanges::width(roi.gr), 0)
        if (length(gr) > 0 & shared.width >= 0.9) {
          roi.phased.reads <- IRanges::subsetByOverlaps(phased.reads, range(gr))
        } else {
          roi.phased.reads <- IRanges::subsetByOverlaps(phased.reads, roi.gr)
        }
      } else {
        roi.phased.reads <- IRanges::subsetByOverlaps(phased.reads, roi.gr)
      }  
      ## Genotype region of interest (ROI) ##
      ## Remove pileup spikes in coverage
      roi.dir.reads <- breakpointR::removeReadPileupSpikes(gr = roi.dir.reads, max.pileup = 10)
      ## Genotype roi directional reads
      wReads <- roi.dir.reads[as.character(GenomicRanges::strand(roi.dir.reads)) == '-']
      cReads <- roi.dir.reads[as.character(GenomicRanges::strand(roi.dir.reads)) == '+']
      dir.reads.genot <- breakpointR::genotype.binom(wReads=length(wReads), cReads=length(cReads), background=background, minReads=10, log=TRUE)
      ## Genotype roi phased reads
      wReads <- roi.phased.reads[as.character(GenomicRanges::strand(roi.phased.reads)) == '-']
      cReads <- roi.phased.reads[as.character(GenomicRanges::strand(roi.phased.reads)) == '+']
      phased.reads.genot <- breakpointR::genotype.binom(wReads=length(wReads), cReads=length(cReads), background=background, minReads=10, log=TRUE)
      ## Get inversion genotype
      if (!is.na(dir.reads.genot$bestFit) & !is.na(phased.reads.genot$bestFit)) {
        ## Defined if inversion is homozygous (HOM)
        if (dir.reads.genot$bestFit == 'ww' & phased.reads.genot$bestFit == 'wc') {
          inv.genot <- 'HOM'
        } else if (dir.reads.genot$bestFit == 'wc' & phased.reads.genot$bestFit == 'wc') {
          inv.genot <- 'complex'
        } else {
          inv.genot <- 'REF'
        }
        ## Defined if inversion is heterozygous (HET)
        if (het.genotype == 'strict') {
          if (dir.reads.genot$bestFit == 'wc' & phased.reads.genot$bestFit %in% c('ww', 'cc')) {
            inv.genot <- 'HET'
          }
        } else if (het.genotype == 'lenient') {
          if (dir.reads.genot$bestFit == 'wc') {
            inv.genot <- 'HET'
          }
        }  
      } else {
        inv.genot <- 'lowReads'
      }  
      
      ## Phase HET inversion
      if (inv.genot == 'HET') {
        ## Set range to be phased
        if (recall.phased) {
          het.gr <- IRanges::subsetByOverlaps(het.inv.gr, roi.gr) 
          if (length(gr) > 0) {
            het.range <- range(het.gr)
            ## Calculate what portion of the original range 
            shared.width <- max(GenomicRanges::width(GenomicRanges::intersect(roi.gr, het.range)) / GenomicRanges::width(roi.gr), 0)
            if (length(het.range) > 0 & shared.width >= 0.9) {
              het.gr <- range(het.gr)
            } else {
              het.gr <- roi.gr
            }  
          } else {
            het.gr <- roi.gr
          }
        } else {
          het.gr <- roi.gr
        }
        
        het.haps <- phaseHETinversion(input.bams = input.bams, snv.positions=snv.positions, phase.gr=het.gr, lookup.bp = lookup.bp, lookup.blacklist = lookup.blacklist, pairedEndReads = pairedEndReads, min.mapq = min.mapq, bsGenome = bsGenome, ref.fasta = ref.fasta, assume.biallelic = assume.biallelic)
        ## Assign inverted haplotype to either H1 or H2 based on 'phased.reads.genot'
        if (phased.reads.genot$bestFit == 'cc') {
          if (!is.null(het.haps)) {
            het.haps$H1 <- het.haps$ref.phase
            het.haps$H2 <- het.haps$inv.phase
          }
          H1 <- 'ref'
          H2 <- 'inv'
        } else if (phased.reads.genot$bestFit == 'ww') {
          if (!is.null(het.haps)) {
            het.haps$H1 <- het.haps$inv.phase
            het.haps$H2 <- het.haps$ref.phase
          }  
          H1 <- 'inv'
          H2 <- 'ref'
        } else {
          ## Not able to determine keep H1 as REF and H2 as ALT
          if (!is.null(het.haps)) {
            het.haps$H1 <- het.haps$ref.phase
            het.haps$H2 <- het.haps$inv.phase
          }  
          H1 <- '.'
          H2 <- '.'
        }
      }
      
      ## Correct haplotypes overlapping with inverted region (roi.gr)
      #if (!is.null(vcfs.files)) {
      #vcf.filename <- paste0(chr, '_phased.vcf')
      #vcf.file <- file.path(vcfs.files, vcf.filename)
      if (file.exists(vcf.file.corr)) {
        if (inv.genot == 'HOM') {
          correctHomInv(correct.gr = roi.gr, vcf.file = vcf.file.corr)
          H1 <- 'inv'
          H2 <- 'inv'
        } else if (inv.genot == 'HET') {
          if (!is.null(het.haps)) {
            correctHetInv(correct.gr = het.gr, vcf.file = vcf.file.corr, het.haps = het.haps)
          }
        } else {
          H1 <- inv.genot
          H2 <- inv.genot
        }
      } else {
        warning(paste0("VCF file: ", vcf.file, " doesn't exists, skipping correction ..."))
      }  
      #} else {
      #  warning("Parameter 'vcfs.files' not defined, skipping correction ...")
      #}
      ## Report INV correction summary
      if (inv.genot == 'HET') { 
        het.region <- as.character(het.gr)
      } else {
        het.region <- 'NA'
      }
      report.df <- data.frame(bed.region=as.character(roi.gr), 
                              dir.reads.gt=dir.reads.genot$bestFit, 
                              phased.reads.gt=phased.reads.genot$bestFit, 
                              predict.inv.gt=inv.genot,
                              het.region=het.region, 
                              H1=H1, 
                              H2=H2)
      correction.summary[[length(correction.summary) + 1]] <- report.df
    } ## End of INV loop
  } ## End of CHR loop
  ## Report correction summary table
  correction.summary.df <- do.call(rbind, correction.summary)
  destination <- file.path(outputfolder, 'invPhasingCorrection.summary.tsv')
  ## Reuse existing file if available
  if (overwrite.results == FALSE) {
    if (file.exists(destination)) {
      message("Reporting inversions correction summary into an exisiting file, ", destination)
      ## Read in existing report file
      correction.summary.old <- utils::read.table(destination, header = TRUE)
      ## Replace matching bed.regions with new results or append new results
      replace.idx <- which(correction.summary.old$bed.region %in% correction.summary.df$bed.region)
      toReplace <- which(correction.summary.df$bed.region %in% correction.summary.old$bed.region)
      correction.summary.old[replace.idx,] <- correction.summary.df[toReplace,]
      correction.summary.old <- rbind(correction.summary.old, correction.summary.df[-toReplace,])
      correction.summary.df <- correction.summary.old
    }
  }
  
  if (!is.null(outputfolder)) {
    utils::write.table(correction.summary.df, file = destination, quote = FALSE, sep = '\t', row.names = FALSE)
    return(NULL)
  } else {
    return(correction.summary.df)
  }  
} ## End of function  


#' Phase a heterozygous inversion using Strand-seq data
#' 
#' This function takes as an input region deemed to be a heterozygous inversion and attempts to phase SNVs inside this region using
#' haplotype informative Strand-seq reads from multiple single cells.
#' 
#' @param phase.gr A \code{\link{GRanges}} object containing a single region to be phased. This region should point to a heterozygous inversion site.
#' @param lookup.bp A number of nucleotides, downstream and upstream, from the heterozygous inversion site ('phase.gr') to be genotyped.
#' @param lookup.blacklist A \code{\link{GRanges}} object or a path to a BED file containing a set of ranges to be excluded 
#' when extending 'phase.gr' by 'lookup.bp'. The total size of 'lookup.bp' is kept after filtering.
#' @param assume.biallelic If set to \code{TRUE} parameter 'snv.positions' is expected to contain biallelic loci (0/1, 1/0) and thus
#' gaps in haplotypes will be filled accordingly.
#' @param verbose Is set to \code{TRUE} function will provide a more detailed messaging of ongoing analysis steps.
#' @importFrom bamsignals bamCount
#' @importFrom Biostrings alphabetFrequency Views
#' @importFrom IRanges findOverlaps
#' @importFrom breakpointR genotype.binom
#' @importFrom GenomeInfoDb keepSeqlevels
#' @inheritParams correctInvertedRegionPhasing
#' @inheritParams bamregion2GRanges
#' @inheritParams phaseChromosome
#' @inheritParams vcf2vranges
#' @inheritParams exportVCF
#' @return A \code{data.frame} with phased alleles per inverted and reference haplotype.
#' @author David Porubsky
#' @export
#'
phaseHETinversion <- function(input.bams=NULL, snv.positions=NULL, phase.gr=NULL, lookup.bp=1000000, lookup.blacklist=NULL, pairedEndReads=TRUE, min.mapq=10, bsGenome=NULL, ref.fasta=NULL, assume.biallelic=TRUE, verbose=FALSE) {
  
  ## Load BSgenome
  if (class(bsGenome) != 'BSgenome') {
    if (is.character(bsGenome)) {
      suppressPackageStartupMessages(library(bsGenome, character.only=TRUE))
      bsGenome <- eval(parse(text=bsGenome)) # replacing string by object
    } else {
      bsGenome <- NULL
    }
  }
  
  ## Check if ref.fasta file is in a valid FASTA format
  if (!is.null(ref.fasta)) {
    if (class(Rsamtools::FaFile(ref.fasta)) != "FaFile") {
      ref.fasta <- NULL
      warning("User defined reference FASTA file, '", ref.fasta, "' is not in proper FASTA format!!!")
    }
  }
  
  ## Get chromosome name from 'phase.gr' object
  chromosome <- as.character(unique(GenomeInfoDb::seqnames(phase.gr)))
  
  ## Get SNV positions for a region defined in 'phase.gr'
  if (!is.null(snv.positions)) {
    if (grepl(snv.positions, pattern = 'vcf')) {
      snvs <- suppressMessages( vcf2ranges(vcfFile=snv.positions, genotypeField=1, chromosome = chromosome) )
    } else if (grepl(snv.positions, pattern = 'bed')) {
      snvs <- read.table(snv.positions, sep = '\t', header = FALSE, stringsAsFactors = FALSE)
      snvs <- GenomicRanges::GRanges(seqnames=snvs$V1, ranges=IRanges::IRanges(start=snvs$V2, end=snvs$V2))
    } else {
      stop("Required parameter 'snv.positions' not defined in one of the allowed formats [vcf, bed] !!!")
    } 
    region.snvs <- IRanges::subsetByOverlaps(snvs, phase.gr)
    region.snvs <- GenomeInfoDb::keepSeqlevels(region.snvs, value = as.character(unique(GenomeInfoDb::seqnames(region.snvs))))
    ## Remove duplicated SNV positions
    region.snvs <- region.snvs[!duplicated(region.snvs)]
  } else {
    stop("Required parameter 'snv.positions' not defined!!!")
  }
  ## Report empty data.frame object if there are no SNVs overlapping region of interest
  if (length(region.snvs) > 0) {
    
    ## Loop over all BAM files and select cells with WC genotype over region of interest
    file.list <- list.files(path = input.bams, pattern = '\\.bam$', full.names = TRUE) 
    
    inv.reads <- list()
    ref.reads <- list()
    for (i in seq_along(file.list)) {
      bamfile <- file.list[i]
      bamfile.name <- basename(bamfile)
      if (verbose) {
        message("Processing bamfile: ", bamfile.name) 
      }  
      ## Genotype inversions flanks ##
      lookup.gr <- GenomicRanges::resize(phase.gr, width = (2 * lookup.bp) + width(phase.gr), fix = 'center')
      ## Make sure lookup.gr is no larger than set seqlength
      lookup.gr <- GenomicRanges::trim(lookup.gr)
      flanks.gr <- GenomicRanges::setdiff(lookup.gr, phase.gr)
      ## Account for blacklisted regions
      if (!is.null(lookup.blacklist)) {
        downANDupsteam <- GenomicRanges::gaps(phase.gr)
        downANDupsteam <- downANDupsteam[GenomicRanges::strand(downANDupsteam ) == '*']
        unique <- suppressWarnings( GenomicRanges::setdiff(downANDupsteam, lookup.blacklist) )
        hits <- IRanges::findOverlaps(unique, downANDupsteam)
        unique.grl <- GenomicRanges::split(unique, subjectHits(hits))
        flanks.grl <- GenomicRanges::GRangesList()
        for (j in seq_along(unique.grl)) {
          gr <- unique.grl[[j]]
          if (j == 1) {
            flanks.grl[[j]] <- sliceRanges(gr, slice.size = lookup.bp, from = 'end')
          } else {
            flanks.grl[[j]] <- sliceRanges(gr, slice.size = lookup.bp, from = 'start')
          }
        }
        flanks.gr <- unlist(flanks.grl)
      }
      ## Set parameter for bamsignals counts
      paired.end <- 'ignore'
      if (pairedEndReads) {
        paired.end <- 'filter'
      }
      max.frag <- 1000
      counts <- bamsignals::bamCount(bamfile, flanks.gr, mapq=min.mapq, filteredFlag=1024, paired.end=paired.end, tlenFilter=c(0, max.frag), verbose=FALSE, ss=TRUE)
      flanks.genot <- breakpointR::genotype.binom(wReads=sum(counts['antisense',]), cReads=sum(counts['sense',]), background=0.05, minReads=50, log=TRUE)
      
      if (flanks.genot$bestFit %in% c('ww','cc')) {
        ## Load reads from inverted region
        bam.reads <- bamregion2GRanges(bamfile, region=phase.gr, pairedEndReads=pairedEndReads, min.mapq=min.mapq)
      } else {
        if (verbose) {
          message("    Uninformative cell, skipping ...")
        }  
        next
      } 
      
      ## Store phased reads
      if (flanks.genot$bestFit == 'ww') {
        inv.reads[[length(inv.reads) + 1]] <- bam.reads[GenomicRanges::strand(bam.reads) == '+']
        ref.reads[[length(ref.reads) + 1]] <- bam.reads[GenomicRanges::strand(bam.reads) == '-']
      } else if (flanks.genot$bestFit == 'cc') {
        inv.reads[[length(inv.reads) + 1]] <- bam.reads[GenomicRanges::strand(bam.reads) == '-']
        ref.reads[[length(ref.reads) + 1]] <- bam.reads[GenomicRanges::strand(bam.reads) == '+']
      }
    }
    #inv.reads <- unlist(inv.reads, use.names = FALSE)
    #ref.reads <- unlist(ref.reads, use.names = FALSE)
    inv.reads <- do.call(c, inv.reads)
    ref.reads <- do.call(c, ref.reads)
    
    if (length(inv.reads) != 0 & length(ref.reads) != 0) {
      ## Make sure only reads from a chromosome in phase.gr are kept
      inv.reads <- GenomeInfoDb::keepSeqlevels(inv.reads, value = chromosome, pruning.mode = 'coarse')
      ref.reads <- GenomeInfoDb::keepSeqlevels(ref.reads, value = chromosome, pruning.mode = 'coarse')
      
      ## Extract alleles at variable positions ##
      ## Extract read sequences for watson and crick reads
      inv.reads.seq <- GenomicRanges::mcols(inv.reads)$seq
      ref.reads.seq <- GenomicRanges::mcols(ref.reads)$seq
      ## Extract base qualities for watson and crick reads
      #inv.reads.qual <- mcols(inv.reads)$qual
      #ref.reads.qual <- mcols(ref.reads)$qual
      
      ## get piles of bases at each variable position
      piles.inv.reads <- GenomicAlignments::pileLettersAt(inv.reads.seq, GenomeInfoDb::seqnames(inv.reads), start(inv.reads), mcols(inv.reads)$cigar, region.snvs)
      piles.ref.reads <- GenomicAlignments::pileLettersAt(ref.reads.seq, GenomeInfoDb::seqnames(ref.reads), start(ref.reads), mcols(ref.reads)$cigar, region.snvs)
      
      ## get piles of qualities at each variable position
      #quals.inv.reads <- GenomicAlignments::pileLettersAt(inv.reads.qual, seqnames(inv.reads), start(inv.reads), mcols(inv.reads)$cigar, region.snvs)
      #quals.ref.reads <- GenomicAlignments::pileLettersAt(ref.reads.qual, seqnames(ref.reads), start(ref.reads), mcols(ref.reads)$cigar, region.snvs)
      
      if (sum(width(piles.inv.reads)) > 0) {
        ## Get inverted alleles indices
        inv.bases.freq <- Biostrings::alphabetFrequency(piles.inv.reads, baseOnly=TRUE)
        ## Remove empty sites
        inv.bases.idx <- which(rowSums(inv.bases.freq) > 0)
        inv.bases.freq <- inv.bases.freq[drop=FALSE, inv.bases.idx,]
        # Remove positions with ambiguous allele counts (max and second max equally abundant)
        filt.ambig <- apply(inv.bases.freq, 1, function(x) (sum(x) - max(x)) < max(x))
        inv.bases.freq <- inv.bases.freq[drop=FALSE, filt.ambig,]
        inv.bases.idx <-  inv.bases.idx[filt.ambig]
        # Get max covered alleles
        max.idx <- apply(inv.bases.freq, 1, which.max)
        # Keep only standard nucleotides
        filt.nonstand <- max.idx <= 4
        max.idx <- max.idx[filt.nonstand]
        inv.bases.idx <- inv.bases.idx[filt.nonstand]
        inv.bases <- chartr("1234", "ACGT", max.idx)
      } else {
        inv.bases <- 'N'
        inv.bases.idx <- 1
      }  
      
      # if (sum(width(piles.ref.reads)) > 0) {
      #   ## Get reference alleles indices
      #   ref.bases.freq <- Biostrings::alphabetFrequency(piles.ref.reads, baseOnly=TRUE)
      #   ref.bases.idx <- which( ref.bases.freq > 0, arr.ind=TRUE)
      #   ref.bases.idx <- ref.bases.idx[drop=FALSE, order(ref.bases.idx[,1]),] # sort by SNV position
      #   ref.bases.idx <- ref.bases.idx[drop=FALSE, ref.bases.idx[, 2] <= 4, ] # keep only standard nucleotides
      #   ref.bases <- chartr("1234", "ACGT", ref.bases.idx[, 2])
      # } else {
      #   ref.bases <- 'N'
      #   ref.bases.idx <- matrix(c(1,1), nrow = 1)
      # }  
      
      if (sum(width(piles.ref.reads)) > 0) {
        ## Get inverted alleles indices
        ref.bases.freq <- Biostrings::alphabetFrequency(piles.ref.reads, baseOnly=TRUE)
        ## Remove empty sites
        ref.bases.idx <- which(rowSums(ref.bases.freq) > 0)
        ref.bases.freq <- ref.bases.freq[drop=FALSE, ref.bases.idx,]
        # Remove positions with ambiguous allele counts (max and second max equally abundant)
        filt.ambig <- apply(ref.bases.freq, 1, function(x) (sum(x) - max(x)) < max(x))
        ref.bases.freq <- ref.bases.freq[drop=FALSE, filt.ambig,]
        ref.bases.idx <-  ref.bases.idx[filt.ambig]
        # Get max covered alleles
        max.idx <- apply(ref.bases.freq, 1, which.max)
        # Keep only standard nucleotides
        filt.nonstand <- max.idx <= 4
        max.idx <- max.idx[filt.nonstand]
        ref.bases.idx <- ref.bases.idx[filt.nonstand]
        ref.bases <- chartr("1234", "ACGT", max.idx)
      } else {
        ref.bases <- 'N'
        ref.bases.idx <- 1
      }
      
      ## Assemble haplotypes
      # pos.idx <- unique(sort(c(inv.bases.idx[, 1], ref.bases.idx[, 1])))
      # pos.gen <- start(region.snvs)[pos.idx]
      # inv.allele <- rep('.', times=length(pos.gen))
      # inv.allele[match(inv.bases.idx[, 1], pos.idx)] <- inv.bases
      # ref.allele <- rep('.', times=length(pos.gen))
      # ref.allele[match(ref.bases.idx[, 1], pos.idx)] <- ref.bases
      
      pos.idx <- unique(sort(c(inv.bases.idx, ref.bases.idx)))
      pos.gen <- start(region.snvs)[pos.idx]
      inv.allele <- rep('.', times=length(pos.gen))
      inv.allele[match(inv.bases.idx, pos.idx)] <- inv.bases
      ref.allele <- rep('.', times=length(pos.gen))
      ref.allele[match(ref.bases.idx, pos.idx)] <- ref.bases
      
      ## Extract reference alleles from the reference bsgenome object if available
      snv.ranges <- GenomicRanges::GRanges(seqnames=chromosome, ranges=IRanges::IRanges(start=pos.gen, end=pos.gen))
      ## Assign reference and alternative alleles
      if (!is.null(bsGenome)) {
        ref.base <- Biostrings::Views(bsGenome, snv.ranges)
        ref.base <- as(ref.base, "DNAStringSet")
        ref.base <- as(ref.base, "vector")
      } else if (!is.null(ref.fasta)) {
        ## Extract SNV bases from user defined reference FASTA file
        fa.file <- open(Rsamtools::FaFile(ref.fasta))
        snv.seq <- Rsamtools::scanFa(file = fa.file, param = snv.ranges, as = "DNAStringSet")     
        names(snv.seq) <- NULL
        ref.base <- as.character(snv.seq)
      } else {
        ref.base <- rep('N', length(pos.gen))
      }
        
      ## Report inverted and reference haplotypes
      inv.phase <- rep('.', times=length(pos.gen))
      ref.phase <- rep('.', times=length(pos.gen))
      inv.phase[inv.allele == ref.base] <- 0
      inv.phase[inv.allele != ref.base & inv.allele != '.'] <- 1
      ref.phase[ref.allele == ref.base] <- 0
      ref.phase[ref.allele != ref.base & ref.allele != '.'] <- 1
      inv.allele[inv.allele == '.'] <- 'N'
      ref.allele[ref.allele == '.'] <- 'N'
      
      if (assume.biallelic) {
        inv.phase.gaps <- which(inv.phase == '.')
        inv.phase[inv.phase.gaps] <- dplyr::recode(ref.phase[inv.phase.gaps], '0' = '1', '1' = '0')
        ref.phase.gaps <- which(ref.phase == '.')
        ref.phase[ref.phase.gaps] <- dplyr::recode(inv.phase[ref.phase.gaps], '0' = '1', '1' = '0')
        inv.allele[inv.phase == '0'] <- ref.base[inv.phase == '0']
        ref.allele[ref.phase == '0'] <- ref.base[ref.phase == '0']
      }  
      phased.df <- data.frame(pos.gen=pos.gen, ref.allele=ref.allele, inv.allele=inv.allele, ref.phase=ref.phase, inv.phase=inv.phase, ref.base=ref.base, 
                              stringsAsFactors = FALSE)
    } else {
      phased.df <- NULL
    }
  } else {
    phased.df <- NULL
  }
  return(phased.df)
}


#' Correct homozygous inversion in StrandPhaseR VCF file.
#' 
#' This function takes as an input region deemed to be a homozygous inversion and flips H1 and H2 haplotypes for SNVs that 
#' overlap with this region.
#' 
#' @param correct.gr A \code{\link{GRanges}} object of homozygous inversion site to be corrected.
#' @param vcf.file A \pkg{\link{StrandPhaseR}} formatted VCF file to be corrected for inversion phasing.
#' @param ID A unique id to be appended at the end of each corrected VCF file.
#' @importFrom VariantAnnotation readVcfAsVRanges ScanVcfParam
#' @return \code{NULL}
#' @author David Porubsky
#' @export
#'
correctHomInv <- function(correct.gr=NULL, vcf.file=NULL, ID='') {
  ## Read VCF file header
  file.conn <- file(vcf.file, "r")
  line <- readLines(file.conn, 1)
  header <- list()
  while(grepl("#", line)) {
    header[[length(header ) + 1]] <- line
    line <- readLines(file.conn, 1)
  }
  #header <- cat(unlist(header), sep = '\n')
  
  ## Read VCF file
  #vcf.data <- read.table(vcf.file, header = FALSE, comment.char = "#")
  suppressWarnings( vcf.data <- VariantAnnotation::readVcfAsVRanges(x = vcf.file, 
                                                                    param = VariantAnnotation::ScanVcfParam(fixed=c('ALT'), info = NA, geno = c('GT','Q1','Q2','P1','P2') )) )
  ## Filter positions with no reference allele and two alternative alleles
  vcf.data <- vcf.data[!vcf.data$GT %in% c('1|2', '1/2')]
  ## Make sure that missing values are converted from NAs to dots
  vcf.data$Q1[is.na(vcf.data$Q1)] <- '0'
  vcf.data$Q2[is.na(vcf.data$Q2)] <- '0'
  vcf.data$P1[is.na(vcf.data$P1)] <- '0'
  vcf.data$P2[is.na(vcf.data$P2)] <- '0'
  
  ## Flip haplotypes overlapping HOM inversion
  seq.len <- c(seqlengths(correct.gr), seqlengths(vcf.data))
  seqlengths(correct.gr) <- max(seq.len[!is.na(seq.len)])
  seqlengths(vcf.data) <- max(seq.len[!is.na(seq.len)])
  hits <- IRanges::findOverlaps(vcf.data, correct.gr, type = 'within')
  snv.idx <- S4Vectors::queryHits(hits)
  gt.toFlip <- vcf.data$GT[snv.idx]
  gt.toFlip[vcf.data$GT[snv.idx] == '1|0'] <- '0|1'
  gt.toFlip[vcf.data$GT[snv.idx] == '0|1'] <- '1|0'
  vcf.data$GT[snv.idx] <- gt.toFlip
  
  ## Print VCF ##
  if (is.character(ID) & nchar(ID) > 0) {
    destination <- gsub(vcf.file, pattern = '\\.vcf', replacement = paste0('_', ID, '\\.vcf'))
  } else {
    destination <- vcf.file
  }  
  ## Print header
  cat(unlist(header), sep = '\n', file = destination)
  ## Print corrected phasing
  gt <- apply(mcols(vcf.data), 1, function(x) paste(x, collapse = ':'))
  print.df <- data.frame(chr=unique(GenomeInfoDb::seqnames(vcf.data)),
                         pos=start(vcf.data),
                         id=rep('.', length(vcf.data)),
                         ref=VariantAnnotation::ref(vcf.data),
                         alt=VariantAnnotation::alt(vcf.data),
                         qual=rep('.', length(vcf.data)),
                         filter=rep('.', length(vcf.data)),
                         info=rep('.', length(vcf.data)),
                         format='GT:Q1:Q2:P1:P2',
                         gt=gt)
  utils::write.table(print.df, file=destination, row.names=FALSE, col.names=FALSE, quote=FALSE, append=TRUE, sep='\t')
  return(NULL)
}


#' Correct heterozygous inversion in StrandPhaseR VCF file.
#' 
#' This function takes as an input region deemed to be a heterozygous inversion along with phased SNVs within a region and uses
#' this information to add/correct into the \pkg{\link{StrandPhaseR}} formatted VCF file.
#' 
#' @param het.haps A \code{data.frame} with phased alleles per inverted and reference haplotype.
#' @importFrom VariantAnnotation readVcfAsVRanges ScanVcfParam
#' @inheritParams correctInvertedRegionPhasing
#' @inheritParams correctHomInv
#' @return \code{NULL}
#' @author David Porubsky
#' @export
#'
correctHetInv <- function(correct.gr=NULL, vcf.file=NULL, het.haps=NULL, ID='') {
  ## Check if het.haps parameter is defined and that it contains phase information
  if (!is.null(het.haps) & any(grepl(colnames(het.haps), pattern = 'phase'))) {
    ## Read VCF file header
    file.conn <- file(vcf.file, "r")
    line <- readLines(file.conn, 1)
    header <- list()
    while(grepl("#", line)) {
      header[[length(header ) + 1]] <- line
      line <- readLines(file.conn, 1)
    }
    #header <- cat(unlist(header), sep = '\n')
    
    ## Read VCF file
    suppressWarnings( vcf.data <- VariantAnnotation::readVcfAsVRanges(x = vcf.file, 
                                                                      param = VariantAnnotation::ScanVcfParam(fixed=c('ALT'), info = NA, geno = c('GT','Q1','Q2','P1','P2') )) )
    ## Filter positions with no reference allele and two alternative alleles
    vcf.data <- vcf.data[!vcf.data$GT %in% c('1|2', '1/2')]
    ## Make sure that missing values are converted from NAs to zeroes
    vcf.data$Q1[is.na(vcf.data$Q1)] <- '0'
    vcf.data$Q2[is.na(vcf.data$Q2)] <- '0'
    vcf.data$P1[is.na(vcf.data$P1)] <- '0'
    vcf.data$P2[is.na(vcf.data$P2)] <- '0'
    
    ## Remove SNVs overlapping with HET inversion
    seq.len <- c(seqlengths(correct.gr), seqlengths(vcf.data))
    seqlengths(correct.gr) <- max(seq.len[!is.na(seq.len)])
    seqlengths(vcf.data) <- max(seq.len[!is.na(seq.len)])
    hits <- IRanges::findOverlaps(vcf.data, correct.gr, type = 'within')
    snv.idx <- S4Vectors::queryHits(hits)
    if (length(snv.idx) > 0) {
      vcf.data <- vcf.data[-snv.idx]
    }
    
    #gt <- paste0(het.haps$ref.phase, '|', het.haps$inv.phase) ## Wrongly assuming H1 is always a ref.phase ... !!!
    gt <- paste0(het.haps$H1, '|', het.haps$H2)
    gt <- paste0(gt, ':0:0:0:0')
    ## Get REF and ALT alleles
    REF <- het.haps$ref.base
    ALT <- rep('N', length(gt))
    ALT[het.haps$ref.phase == 1] <- het.haps$ref.allele[het.haps$ref.phase == 1]
    ALT[het.haps$inv.phase == 1] <- het.haps$inv.allele[het.haps$inv.phase == 1]  
    add.df <- data.frame(chr=unique(GenomeInfoDb::seqnames(vcf.data)),
                         pos=het.haps$pos.gen,
                         id=rep('.', nrow(het.haps)),
                         ref=REF,
                         alt=ALT,
                         qual=rep('.', nrow(het.haps)),
                         filter=rep('.', nrow(het.haps)),
                         info=rep('.', nrow(het.haps)),
                         format='GT:Q1:Q2:P1:P2',
                         gt=gt)
    
    ## Print VCF ##
    if (is.character(ID) & nchar(ID) > 0) {
      destination <- gsub(vcf.file, pattern = '\\.vcf', replacement = paste0('_', ID, '\\.vcf'))
    } else {
      destination <- vcf.file
    }  
    ## Print header
    cat(unlist(header), sep = '\n', file = destination)
    ## Print corrected phasing
    gt <- apply(mcols(vcf.data), 1, function(x) paste(x, collapse = ':'))
    print.df <- data.frame(chr=unique(GenomeInfoDb::seqnames(vcf.data)),
                           pos=start(vcf.data),
                           id=rep('.', length(vcf.data)),
                           ref=VariantAnnotation::ref(vcf.data),
                           alt=VariantAnnotation::alt(vcf.data),
                           qual=rep('.', length(vcf.data)),
                           filter=rep('.', length(vcf.data)),
                           info=rep('.', length(vcf.data)),
                           format='GT:Q1:Q2:P1:P2',
                           gt=gt)
    ## Add newly phased variants from HET inversion
    print.df <- rbind(print.df, add.df)
    print.df <- print.df[order(print.df$pos),]
    utils::write.table(print.df, file=destination, row.names=FALSE, col.names=FALSE, quote=FALSE, append=TRUE, sep='\t')
  } else {
    stop("Parameter 'het.haps' is not defined or it doesn't contain phase information expected to be in column 4 and 5 !!!")
  }  
}  

## Outside functions ##
# genotype.binom <- function(wReads, cReads, background=0.05, minReads=10, log=FALSE) {
#   ## Set parameters
#   roiReads <- wReads + cReads
#   alpha <- background
#   ## Calculate binomial probabilities for given read counts
#   result <- list(bestFit=NA, pval=NA)
#   if (length(roiReads)==0) {
#     return(result)
#   }
#   if (is.na(roiReads)) {
#     return(result)
#   }
#   if ( roiReads >= minReads ) {
#     ## Test if a given read counts are WW
#     WWpVal <- stats::dbinom(wReads, size = roiReads, prob = 1-alpha, log = log)
#     ## Test if a given read counts are CC
#     CCpVal <- stats::dbinom(wReads, size = roiReads, prob = alpha, log = log)
#     ## Test if a given read counts are WC
#     WCpVal <- stats::dbinom(wReads, size = roiReads, prob = 0.5, log = log)
#     ## Export results
#     pVal <- c(wc = WCpVal, cc = CCpVal, ww = WWpVal)
#     result <- list(bestFit = names(pVal)[which.max(pVal)], pval = max(pVal))
#     return(result)
#   } else {
#     return(result)
#   }
# }


#' Slice a certain length from a set of genomic ranges. 
#' 
#' This function takes as an input a set of ranges stored in \code{\link{GRanges}} object and export slice of certain length
#' starting either from the first or last range.
#' 
#' @param gr A \code{\link{GRanges}} object containing a set of genomic ranges.
#' @param slice.size A total length of sliced ranges to be reported given the starting range defined by 'from'.
#' @param from Set to 'start' or 'end' if to start counting bases to slice from the first or the last range.
#' @return A \code{\link{GRanges}} object.
#' @author David Porubsky
#' @export
#'
sliceRanges <- function(gr, slice.size=1000000, from='start') {
  ## Make sure submitted set of ranges is sorted by position
  gr <- GenomicRanges::sort(gr)
  if (sum(width(gr)) > slice.size) {
    if (from == 'start') {
      ## Get cumulative sum of ranges width
      cs.width <- cumsum(as.numeric(width(gr)))
      cut.idx <- findInterval(cs.width, slice.size)
      cut.idx <- which(cut.idx == 1)[1]
      gr.sub <- gr[1:cut.idx]
      ## Get cut position
      pos <- mapply(seq, start(gr.sub), end(gr.sub))
      pos <- unlist(pos)[1:slice.size]
      cut.pos <- pos[length(pos)]
      ## Adjust size of the last range
      end(gr.sub[length(gr.sub)]) <- cut.pos
    } else if (from == 'end') {
      ## Get cumulative sum of ranges width
      gr.rev <- rev(gr)
      cs.width <- cumsum(as.numeric(width(gr.rev)))
      cut.idx <- findInterval(cs.width, slice.size)
      cut.idx <- which(cut.idx == 1)[1]
      gr.sub <- gr.rev[1:cut.idx]
      ## Get cut position
      pos <- mapply(seq, end(gr.sub), start(gr.sub))
      pos <- unlist(pos)[1:slice.size]
      cut.pos <- pos[length(pos)]
      ## Adjust size of the last range
      start(gr.sub[length(gr.sub)]) <- cut.pos
    } else {
      stop("Set parameter 'from' to either 'start' or 'end'!!!")
    }
  } else {
    gr.sub <- gr
  }  
  return(gr.sub)
}
daewoooo/StrandPhaseR documentation built on April 7, 2024, 7:13 p.m.