funcivar: Calculate overlaps and enrichment between genomic variants and genomic features or segmentations

Documented in CalcLD CalculateEnrichment GetBioFeatures GetSegmentations GetVariantsInWindow PlotEnrichment SetOverlaps SetPopulation ShowOverlaps SplitVcfLd

#' Get all variants in a specified genomic window
#' @description Import variants from VCF or BED files, optionally restricted to a specific window.
#' @param file A character vector describing a file to read
#'
#' @param position An optional GRanges object defining the coordinates of the
#'   file to import
#' @param type A character vector stating that the filetype is either 'vcf' or
#'   'bed'
#' @param genome A character vector describing the genome build against which
#'   the variants were mapped e.g. "hg19", "b37", "hg38", "mm10".
#'
#' @return an object of class VCF or GRanges representing the variants
#' @importFrom VariantAnnotation ScanVcfParam
#' @importFrom Rsamtools TabixFile
#' @importFrom rtracklayer import.bed
#' @importFrom GenomeInfoDb seqlevelsStyle seqlevelsStyle<-
#' @export
GetVariantsInWindow <- function(file, position, genome = "hg19", type = "vcf") {
  if (tolower(type) == "vcf") {
    if(!missing(position)) {
      if(is(position, "GRanges")) {
        params <- ScanVcfParam(which = position)
      } else if(position == "all") {
        params <- ScanVcfParam()
      } else if(is(position, "character")) {
        position <- as(position, "GRanges")
        params <- ScanVcfParam(which = position)
      } else {
        stop("I don't understand the position argument, must be unset, GRanges, or 'all'")
      }
    } else {
      stop("without a position argument, the full vcf will be imported into memory,\n",
           "this can be a very expensive operation. Set the position arg to 'all' to allow")
    }
    vcf <- TabixFile(file)
    variants <- getFILE(file, GetVariantsInWindowVCF, params, genome, N.TRIES = 3L)
    return(variants)
  } else if (tolower(type) == "bed") {
    if(!missing(position) & position != 'all') {
      variants <- import.bed(file, which = position, genome = genome)
    } else {
      variants <- import.bed(file, genome = genome)
    }
    return(variants)
  } else {
    stop("type ", type, " is not yet implemented")
  }
}

#' Set Population in VCF
#' @description Add population specific data to your VCF
#' @param vcf An object of class VCF
#'
#' @param sample_sheet a data.frame with columns describing the sample and
#'   population that you may wish to filter on
#' @return returns the vcf that was passed as an argument with it's colData()
#'   modified to include the population data
#' @importFrom S4Vectors merge
#' @importFrom SummarizedExperiment colData colData<- rowData rowData<-
#' @export
SetPopulation <- function(vcf, sample_sheet) {
  population <- colData(vcf)
  if (!("Samples" %in% colnames(population))) {
    stop("vcf does not contain sample information")
  }
  population$Samples <- rownames(population)
  sample.col <- which(grepl(pattern = "sample",
                            x = colnames(sample_sheet),
                            ignore.case = TRUE))
  if (length(sample.col) > 0L) {
    sample_sheet <- DataFrame(sample_sheet)
    colnames(sample_sheet)[sample.col] <- "Samples"
    population <- S4Vectors::merge(population, sample_sheet, all.x = TRUE, all.y = FALSE, by = "Samples")
    rownames(population) <- population$Samples
    colData(vcf) <- population
    metadata(vcf)$source <- "funciVar"
    metadata(vcf)$ld <- "none"
    return(vcf)
  } else {
    stop("sample sheet does not contain sample column")
  }
}

#' Calculate LD
#' @description Calculate the Linkage Disequilibrium between an index SNP and the rest of the
#' SNPs in the VCF
#' @param vcf An object of class VCF, optionally processed with SetPopulation()
#'   to allow for calculating LD with a specific population.
#'
#' @param index A character vector that contains a SNP that is present in the
#'   vcf
#' @param population An optional character vector specifying the population upon
#'   which to filter the samples in the vcf
#' @param return A character vector, either 'valid' or 'all'. 'valid' returns
#'   valid SNPs from the vcf only, 'all' attempts to return all variants,
#'   including indels. In the case of 'all', still, only SNPs are used for
#'   calculating LD.
#' @param force Logical indicating that even though LD has already been
#'   calculated upon this VCF, you wish to overwrite that information with the
#'   LD calculation of a new index.
#' @return returns the vcf that was passed as an argument with it's rowRanges()
#'   modified to include the information relative to the index. Including ref,
#'   and alt alleles, allele frequencies (in the population queried), distance
#'   to the index, D prime and R squared.
#' @importFrom VariantAnnotation snpSummary isSNV genotypeToSnpMatrix
#' @importFrom SummarizedExperiment rowRanges rowRanges<-
#' @importFrom snpStats ld
#' @importFrom BiocGenerics start end strand
#' @import methods
#' @export
CalcLD <- function(vcf, index, population, return = "valid", force = TRUE) {
  ## check input
  if(!is(vcf, "VCF")) {
    stop("vcf object must be of the class VCF")
  }
  if (index %in% rownames(vcf)) {
    if (!isSNV(vcf[rownames(vcf) %in% index, ])) {
      stop("funciVar can only calculate LD for SNVs at this time")
    }
  } else {
    stop("index snp ", index, " not found in current vcf")
  }
  if(missing(population)) {
    population <- "ALL"
    vcf.snv <- isSNV(vcf)
    if (return == "valid") {
      if (any(!vcf.snv)) {
        vcf <- vcf[vcf.snv, ]
        warning("CalcLD only operates on SNVs some of your variants were dropped at this step, set return = 'all', to override this behaviour")
      }
      vcf <- vcf[isSNV(vcf, singleAltOnly = TRUE), ]
    }
  } else {
    samples <- as.data.frame(colData(vcf))
    samples.col <- which(samples == population, arr.ind = TRUE)
    if (nrow(samples.col) > 0L) {
      ## metadata
      samples.col <- samples.col[1, "col"]
      pop.samples <- rownames(samples)[grepl(population, samples[, samples.col])]
      vcf <- vcf[, pop.samples]
      vcf.snv <- isSNV(vcf)
      if (return == "valid") {
        if (any(!vcf.snv)) {
          vcf <- vcf[vcf.snv, ]
          warning("CalcLD only operates on SNVs some of your variants were dropped at this step, set return = 'all', to override this behaviour")
        }
        vcf <- vcf[isSNV(vcf, singleAltOnly = TRUE), ]
      }
    } else {
      stop("your population was not found in your vcf.\nUse setPopulation() to add your sample descriptions, and double check that ",
           population, " is present")
    }
  }
  if(nrow(vcf) >= 1L) {
    vcf.ranges <- rowRanges(vcf)[, c("REF", "ALT")]
    mcols(vcf.ranges) <- c(mcols(vcf.ranges), snpSummary(vcf)[, c("a0Freq", "a1Freq", "HWEpvalue")])
    colnames(mcols(vcf.ranges)) <- c("ref", "alt", "refAlleleFreq", "altAlleleFreq", "HWEpvalue")
    mcols(vcf.ranges)[, "indexSNP"] <- index
    mcols(vcf.ranges)[, "population"] <- population
    mcols(vcf.ranges)[, "distanceToIndex"] <- abs(start(vcf.ranges[index, ]) - start(vcf.ranges))
    ## genotype
    vcf.geno <- genotypeToSnpMatrix(vcf)$genotypes
    vcf.ld <- ld(vcf.geno, vcf.geno[, index], stats = c('D.prime', 'R.squared'))
    mcols(vcf.ranges)$D.prime <- vcf.ld$D.prime[, 1]
    mcols(vcf.ranges)$R.squared <- vcf.ld$R.squared[, 1]
    rowRanges(vcf) <- vcf.ranges
    ## xXx rowData(vcf) <- cbind(rowData(vcf), mcols(vcf.ranges))
    if ("none" %in% metadata(vcf)$ld) {
      metadata(vcf)$ld <- index
    } else if(force) {
      metadata(vcf)$ld <- index
    } else if(!("ld" %in% names(metadata(vcf)))) {
      metadata(vcf)$source <- "funciVar"
      metadata(vcf)$ld <- index
    } else {
      stop("ld has already been calculated on this object for index snp: ", index)
    }
  }
  return(vcf)
}

#' Import Biofeatures
#' @description Imports a set of biofeatures in the form of BED files or derivitive formats
#' for the purpose of annotating variants
#' @param files A character vector containing a list of files to
#'
#' @param genome A character vector describing the genome build against which
#'   the biofeatures were created, e.g. "hg19", "b37", "hg38", "mm10".
#'
#' @return A GRangesList containing the biofeatures imported.
#'
#' @importFrom data.table fread
#' @importFrom stringr str_replace
#' @importFrom GenomeInfoDb Seqinfo
#' @importFrom GenomicRanges GRangesList GRanges
#' @importFrom IRanges IRanges
#' @importFrom S4Vectors mcols mcols<- metadata metadata<- DataFrame
#' @importFrom readr read_tsv cols_only col_character col_integer col_number
#' @importFrom utils head
#' @export
GetBioFeatures <- function(files, genome) {
  if (length(files) < 1L) return(NULL)
  good.files <- sapply(files, function(x) file.exists(x))
  if (sum(good.files) < length(files)) {
    if (sum(!good.files) <= 5L) {
      stop(paste("cannot find the following", sum(!good.files), "files:\n"),
           paste0(files[!good.files], "\n"))
    } else {
      stop(paste("cannot find", sum(!good.files), "files. Some of which are:\n"),
           paste0(head(files[!good.files], n = 5L), "\n"))
    }
  } else {
    genome <- tryCatch(Seqinfo(genome = genome), error = NULL)
    bed.list <- lapply(files,
                       function(file, s.info) {
                         if (grepl(".narrowPeak", file, ignore.case = TRUE)) {
                           col.types <- cols_only(chr = col_character(),
                                                  start = col_integer(),
                                                  end = col_integer(),
                                                  name = col_character(),
                                                  score = col_integer(),
                                                  strand = col_character(),
                                                  signalValue = col_number())
                           col.names <- c("chr", "start", "end", "name", "score", "strand", "signalValue")
                           col.numbers <- c(1:7)
                         } else {
                           col.types <- cols_only(chr = col_character(),
                                                  start = col_integer(),
                                                  end = col_integer())
                           col.names <- c("chr", "start", "end")
                           col.numbers <- c(1:3)
                         }
                         if (any(grepl(".gz$", file))) {
                           xf <- suppressMessages(suppressWarnings(read_tsv(file = file, col_names = col.names,
                                                                            col_types = col.types,
                                                                            progress = FALSE)))
                         } else {
                           xf <- fread(input = file, sep = "\t", header = FALSE,
                                       select = col.numbers, skip = "chr",
                                       col.names = col.names,
                                       encoding = "UTF-8",
                                       stringsAsFactors = FALSE,
                                       data.table = FALSE, showProgress = FALSE)
                         }
                         if (ncol(xf) < 7) xf$signalValue <- NA
                         xf <- try(GRanges(seqnames = xf$chr,
                                           ranges = IRanges(start = xf$start + 1L,
                                                            end = xf$end),
                                           strand = "*",
                                           feature = base::rep.int(basename(file), nrow(xf)),
                                           signalValue = xf$signalValue,
                                           seqinfo = s.info))
                         return(xf)
                       }, s.info = genome)
  }
  if (is.null(bed.list)) {
    return(GRangesList())
  } else {
    bed.list <- GRangesList(bed.list)
    names(bed.list) <- basename(files)
    return(bed.list)
  }
}

#' Imports a set of genome segmentation files, particularly those generated by
#' StatePaintR, for the purpose of annotating variants
#' @param files A character vector containing a list of files to import. These
#'   files need to be BED9 format. see
#'   \url{https://genome.ucsc.edu/FAQ/FAQformat.html#format1}. And they need to
#'   be completely disjoint intervals.
#'
#' @param genome A character vector describing the genome build against which
#'   the segmentations were created, e.g. "hg19", "b37", "hg38", "mm10".
#'
#' @return A GRangesList containing the segmentations imported.
#' @importFrom rtracklayer import.bed
#' @export
GetSegmentations <- function(files) {
  bed.list <- sapply(files, function(file) {
    bed <- import.bed(file)
  })
  bed.names <- sapply(bed.list, function(bed) {
    if (.hasSlot(bed, 'trackLine')) {
      bed.name <- bed@trackLine@name
    } else {
      bed.name <- NA
    }
  })
  bed.names[is.na(bed.names)] <- files[is.na(bed.names)]
  bed.list <- Map(format.bed, bed.list, bed.names)
  if (is.null(bed.list)) {
    return(GRangesList())
  } else {
    bed.list <- GRangesList(bed.list)
    names(bed.list) <- bed.names
    return(bed.list)
  }
}

format.bed <- function(bed, name) {
  bed.m <- mcols(bed)
  mcols(bed) <- NULL
  mcols(bed)$sample <- name
  mcols(bed)$state <- bed.m$name
  return(bed)
}

#' Split VCF by Linkage Disequilibrium
#' @description Splits a VCF file into foreground and background variant sets based upon a
#' Linkage Disequilibrium cutoff
#'
#' @param vcf an object of class VCF, particularly one that has been processed
#'   by CalcLD()
#'
#' @param ld a list containing three fields. These include: \itemize{ \item
#'   metric, either 'R.squared' or 'D.prime' \item cutoff, numeric between 0 and
#'   1, the value of the metric upon which to perform the cutoff \item maf,
#'   numeric between 0 and 0.5, the minor allele frequency considered to be
#'   included in both foreground and background }
#'
#' @param strict.subset boolean indicating if one wishes the foreground  to be a
#'   strict subset of the background, or an independant set.
#' @return a list of length 2 containing in the 'fg' slot the VCF object of the
#'   foreground SNPs, and the bg slot containing the VCF object of the
#'   background SNPs
#' @export
SplitVcfLd <- function(vcf, ld = list(metric = "R.squared", cutoff = 0.8, maf = 0.01), strict.subset = TRUE) {
  if (!is(vcf, "VCF")) {
    stop("parameter vcf must be a VCF object")
  }
  if (!all(names(ld) %in% c("metric", "cutoff", "maf"))) {
    stop("parameter ld must contain the fields 'metric', 'cutoff', and 'maf'")
  }
  if (!(ld[["metric"]] %in% colnames(mcols(rowRanges(vcf))))) {
    stop("in argument ld$metric: '", ld[['metric']],"' must be present in rowRanges(vcf)")
  }
  if (!(ld[["maf"]] >= 0 && ld[["maf"]] <= 0.5)) {
    stop("in argument ld$maf: '", ld[['maf']], "' must be between the values of 0 and 0.5")
  }
  orient <- mcols(rowRanges(vcf))[, "altAlleleFreq"] < mcols(rowRanges(vcf))[, "refAlleleFreq"]
  filter <- mcols(rowRanges(vcf))[, "altAlleleFreq"] >= ld[["maf"]]
  filter[!orient] <- mcols(rowRanges(vcf))[, "refAlleleFreq"][!orient] >= ld[["maf"]]
  fg.filter <- filter & mcols(rowRanges(vcf))[, ld[["metric"]]] >= ld[["cutoff"]]
  if (strict.subset) {
    bg.filter <- fg.filter | mcols(rowRanges(vcf))[, ld[["metric"]]] < ld[["cutoff"]]
  } else {
    bg.filter <- filter & !mcols(rowRanges(vcf))[, ld[["metric"]]] >= ld[["cutoff"]]
  }
  metadata(vcf)$strict.subset <- strict.subset
  bg.filter <- fg.filter | mcols(rowRanges(vcf))[, ld[["metric"]]] < ld[["cutoff"]]
  return(list(fg = vcf[fg.filter & !is.na(fg.filter), ], bg = vcf[bg.filter & !is.na(bg.filter), ]))
}


#' Calculate Enrichment
#' @description Calculates the enrichment of foreground variants vs background
#'   variants against a list of biofeatures.
#' @param variants a list of two GRanges or VCF objects representing the
#'   foreground and background variants to be compared. In the format list(fg =
#'   object, bg = object).
#'
#' @param features A GRangesList of biofeatures or segmentations to calculate
#'   enrichment of the variants against. This field is optional if feature.type
#'   = "biofeatures", and the variants objects have already had SetOverlaps
#'   performed against them.
#' @param feature.type a character vector, either "biofeatures" for generic
#'   feature overlaps, or "segmentations" specifying that the features slot is
#'   populated with disjoint genomic segmentations imported by
#'   GetSegmentations()
#' @param CI numeric, the width of the credible interval for the calculation of
#'   the beta-binomal test for enrichment
#' @param prior numeric vector of length two setting the parameters of the prior
#'   of the beta distribution, in the form, e.g., c(a = 0.5, b = 0.5)
#' @param strict.subset either a boolean value, or "guess" indicating if the
#'   foreground is a strict subset of the background, or an independant set.
#' @param return.overlaps boolean value indicating if, in addition to the
#'   enrichment data.frame, a GRanges object representing the overlaps of the
#'   variants and features be returned.
#'
#' @return either a data.frame indicating the enrichment statistics of the
#'   variant set in the features, or a list including the overlaps of the the
#'   variants and features, in addition to the enrichment data.frame. log1p(oddsratio) are reported from fisher.test
#'
#' @importFrom stats fisher.test formula as.formula quantile rbeta
#' @importFrom GenomicRanges gaps
#' @importMethodsFrom GenomicRanges range
#' @importMethodsFrom IRanges subsetByOverlaps
#' @export
CalculateEnrichment <- function(variants, features, feature.type = "biofeatures",
                            CI = 0.95, prior = c(a = 0.5, b = 0.5),
                            strict.subset = "guess", return.overlaps = FALSE) {
  # set strict.subset
  if (strict.subset == "guess") {
    test.sub <- c(metadata(variants$fg)$strict.subset, metadata(variants$bg)$strict.subset)
    if (all(test.sub, !is.null(test.sub))) {
      strict.subset <- TRUE
    } else if (any(metadata(variants$fg)$strict.subset, metadata(variants$bg)$strict.subset)) {
      stop("foreground and background disagree about whether fg is a strict subset of bg\n",
           "create foreground and background with SplitVcfLD")
    } else if (all(rownames(variants$fg) %in% rownames(variants$bg))) {
      strict.subset <- TRUE
    } else {
      strict.subset <- FALSE
    }
  } else if (!is.logical(strict.subset)) {
    stop("strict.subset must be one of 'guess', TRUE, or FALSE")
  }
  # features
  if(!is(variants, "list")) stop("variants must be an object of class list")
  if (!missing(features)) {
    if (feature.type == "biofeatures") {
      if(inherits(features, "GRangesList")) features <- unlist(features, use.names = FALSE)
      variants$fg <- SetOverlaps(variants$fg, features)
      variants$bg <- SetOverlaps(variants$bg, features)
      fg.features <- colnames(mcols(ShowOverlaps(variants$fg)))
      bg.features <- colnames(mcols(ShowOverlaps(variants$bg)))
      all.features <- union(fg.features, bg.features)
      ## first fg
      if (any(!(is.element(all.features, fg.features)))) {
        if (is(variants$fg, "VCF")) {
          mcols(rowRanges(variants$fg))[, all.features[!(is.element(all.features, fg.features))]] <- 0L
        } else if (is(variants$fg, "GRanges")) {
          mcols(variants$fg)[, all.features[!(is.element(all.features, fg.features))]] <- 0L
        }
      }
      if (is(variants$bg, "VCF")) {
        mcols(rowRanges(variants$fg)) <- cbind(mcols(rowRanges(variants$fg))[, 1:metadata(variants$fg)$overlap.offset-1], mcols(rowRanges(variants$fg))[, all.features, drop = FALSE])
      } else if (is(variants$fg, "GRanges")) {
        mcols(variants$fg) <- cbind(mcols(variants$fg)[, 1:attributes(variants$fg)$metadata$overlap.offset-1], mcols(variants$fg)[, all.features, drop = FALSE])
      }
      ## then bg
      if (any(!(is.element(all.features, bg.features)))) {
        if (is(variants$bg, "VCF")) {
          mcols(rowRanges(variants$bg))[, all.features[!(is.element(all.features, bg.features))]] <- 0L
        } else if (is(variants$bg, "GRanges")) {
          mcols(variants$bg)[, all.features[!(is.element(all.features, bg.features))]] <- 0L
        }
      }
      if (is(variants$bg, "VCF")) {
        mcols(rowRanges(variants$bg)) <- cbind(mcols(rowRanges(variants$bg))[, 1:metadata(variants$bg)$overlap.offset-1], mcols(rowRanges(variants$bg))[, all.features, drop = FALSE])
      } else if (is(variants$bg, "GRanges")) {
        mcols(variants$bg) <- cbind(mcols(variants$bg)[, 1:attributes(variants$bg)$metadata$overlap.offset-1], mcols(variants$bg)[, all.features, drop = FALSE])
      }
    }
  }
  # feature.type = "biofeatures"
  if (feature.type == "biofeatures") {
    enrichment <- enrich.features(fg = variants$fg, bg = variants$bg, CI = CI, prior = prior, strict.subset = strict.subset)
    if(return.overlaps) {
      if(all(names(mcols(variants$fg)) == names(mcols(variants$bg)))) {
        list.fun <- GenomicRanges::GRangesList
      } else {
        list.fun <- S4Vectors::List
      }
      if (is(variants$fg, "GRanges") & is(variants$bg, "GRanges")) {
        overlaps <- list.fun(foreground.overlaps = variants$fg, background.overlaps = variants$bg)
      } else {
        overlaps <- list.fun(foreground.overlaps = rowRanges(variants$fg), background.overlaps = rowRanges(variants$bg))
      }
      return(list(overlaps = overlaps, enrichment = enrichment))
    } else {
      return(enrichment)
    }
  } else if (feature.type == "segmentations") {
    if (missing(features)) {
      stop("include segmentations as the 'features' argument")
    } else {
      if (is(variants$fg, "GRanges") & is(variants$bg, "GRanges")) {
        search.range <- GenomicRanges::union(range(variants$fg), range(variants$bg))
      } else {
        search.range <- GenomicRanges::union(range(rowRanges(variants$fg)), range(rowRanges(variants$bg)))
      }
      if (is(features, "GRangesList")) {
        features <- unlist(features, use.names = FALSE)
      }
      # nfeatures <- gaps(features)
      # nfeatures <- nfeatures[strand(nfeatures) == "*"]
      # mcols(nfeatures)$sample <- "none"
      # mcols(nfeatures)$state <- "unclassified"
      # features <- c(features, nfeatures)
      features <- keepSeqlevels(subsetByOverlaps(features, search.range), seqlevelsInUse(search.range))
      enrichment <- enrich.segments(fg = variants$fg,
                                    bg = variants$bg,
                                    features = features,
                                    CI = CI,
                                    prior = prior,
                                    strict.subset = strict.subset,
                                    return.overlaps = return.overlaps)
    }
  } else {
    stop("feature.type: ", feature.type, "is not availible; try 'biofeatures' or 'segmentations'")
  }
  return(enrichment)
}


#' Assign variants to features
#' @param variants a VCF or GRanges object representing variants to annotate
#'
#' @param features a GRanges or GRangesList object representing features used to
#'   annotate variants.
#'
#' @return returns the variants object that was passed as an argument, VCF
#'   objects have overlaps stored in rowRanges(), GRanges have overlaps stored
#'   in mcols(), ShowOverlaps() can be used to reveal this data.
#' @importFrom S4Vectors to from List
#' @importFrom GenomicRanges findOverlaps
#' @importFrom dplyr left_join
#' @export
SetOverlaps <- function(variants, features) {
  nfeatures <- gaps(features)
  nfeatures <- nfeatures[strand(nfeatures) == "*"]
  if(all(names(mcols(features)) == c("sample", "state"))) {
    mcols(nfeatures)$sample <- "none"
    mcols(nfeatures)$state <- "unclassified"
    id.name <- "sample"
  } else if (all(names(mcols(features)) == c("feature", "signalValue"))) {
    mcols(nfeatures)$feature <- "none"
    mcols(nfeatures)$signalValue <- 0L
    id.name <- "feature"
  }
  features <- c(features, nfeatures)
  if (is.null(names(variants))) {
    names(variants) <- make.unique(as.character(variants))
  }
  overlaps <- findOverlaps(variants, features, ignore.strand = TRUE)
  overlaps <- data.frame(from = names(variants)[from(overlaps)],
                         to = mcols(features)[to(overlaps), id.name],
                         stringsAsFactors = FALSE)
  resmatrix <- make.overlap.matrix(overlaps)
  resmatrix <- resmatrix[names(variants), !grepl("^none$",
                                                 colnames(resmatrix)), drop = FALSE]
  if(ncol(resmatrix) > 0L) {
    if(is(variants, "GRanges")) {
      overlap.offset <- ncol(mcols(variants)) + 1
      mcols(variants) <- cbind(mcols(variants), DataFrame(resmatrix))
      attributes(variants)$metadata$overlap.offset <- overlap.offset
    } else if (is(variants, "VCF")) {
      overlap.offset <- (ncol(mcols(rowRanges(variants))) + 1) - 4
      mcols(rowRanges(variants)) <- cbind(mcols(rowRanges(variants)), DataFrame(resmatrix))
      metadata(variants)$overlap.offset <- overlap.offset
    } else {
      stop("variants must be either a GRanges or VCF object, your object is: ", class(variants))
    }
  } else {
    if(is(variants, "GRanges")) {
      metadata(variants)$overlap.offset <- ncol(mcols(variants)) + 1
    } else if (is(variants, "VCF")) {
      metadata(variants)$overlap.offset <- ncol(mcols(rowRanges(variants))) + 1
    } else {
      stop("variants must be either a GRanges or VCF object, your object is: ", class(variants))
    }
    warning("No overlaps were found for this variant/feature combination")
  }
  return(variants)
}

#' Show the overlaps between variants and features
#' @param variants a VCF or GRanges object representing annotated variants
#'
#' @param feature a character list of features about which to display overlaps, this is optional, if set as NULL, all features are returned.
#'
#' @return a GRanges list representing the overlap between variants and features.
#' @export
ShowOverlaps <- function(variants, feature = NULL) {
  if (is(variants, "GRanges")) {
    offset <- attributes(variants)$metadata$overlap.offset
  } else if (is(variants, "VCF")) {
    offset <- metadata(variants)$overlap.offset
    variants <- rowRanges(variants)
	variants <- variants[, 1:(ncol(mcols(variants)) - 4)]
  } else {
    stop("variants must be either a VCF or GRanges object")
  }
  if (ncol(mcols(variants)) < offset) {
    return(GRanges())
  }
  if (is.null(feature)) {
    if (!is.null(offset)) {
	  return(variants[, offset:ncol(mcols(variants))])
    } else {
      stop("no overlap data is availible, run SetOverlaps() first")
    }
  } else {
    if (any(feature %in% colnames(mcols(rowRanges(variants))))) {
      return(variants[, feature])
    } else {
      stop("no overlap data is availible for feature search: ", feature)
    }
  }
}

#' Plot Enrichment
#' @description a flexible plotting tool of the enrichment data.frame generated
#'   by CalculateEnrichment()
#' @param variant.enrichment a data.frame of enrichment data generated by
#'   CalculateEnrichment()
#'
#' @param value one of 'difference' or 'oddsratio', thereby plotting the
#'   difference in beta distributions between foreground and background, or the
#'   oddsratio of the comparison.
#' @param block1 character string defining the column of variant.enrichment to
#'   be used as a factor by which to separate the enrichment into blocks
#'   (separate panels) in the plot. Default is NULL, in which case there is no
#'   blocking.
#' @param block2 character string defining the column of variant.enrichment to
#'   be used as a factor by which to separate the enrichment into blocks
#'   (separate panels) in the plot. Default is NULL, in which case there is no
#'   blocking.
#' @param color.by optional character string defining the column of
#'   variant.enrichment to be used to color the values. If this column has hex
#'   color values, in the format #ff0000 they will be used.
#' @param colors optional character vector defining the colors to use for each
#'   value. Must be the same length as the the number of rows in the
#'   variante.enrichment data.frame.
#' @param ncol number of columns to use for facet_wrap if only one block is
#'   defined.
#'
#' @return a ggplot plot object
#'
#' @import ggplot2
#' @importFrom RColorBrewer brewer.pal
#' @importFrom grDevices colorRampPalette
#' @importFrom dplyr n_distinct
#' @importFrom scales alpha
#' @importFrom graphics plot
#' @importFrom stringr str_length
#' @export
PlotEnrichment <- function(variant.enrichment, value = "difference", block1 = NULL, block2 = NULL, color.by = NULL, colors = NULL, ncol = 1, auto.plot=TRUE) {

  ## check args
  if (!is(variant.enrichment, "data.frame"))
    stop("variant.enrichment must be a data.frame")
  if (!is.null(block1)) {
    if (!(block1 %in% colnames(variant.enrichment)))
      stop("The block1 argument must either be NULL or a column of variant.enrichment.")
  }
  if (!is.null(block2)) {
    if (!(block2 %in% colnames(variant.enrichment)))
      stop("The block2 argument must either be NULL or a column of variant.enrichment.")
  }
  if (!is.null(color.by)) {
    if (!(color.by %in% colnames(variant.enrichment)))
      stop("The color.by argument must either be NULL or a column of variant.enrichment.")
  }
  if (!(value %in% c("log.odds.ratio", "difference"))) {
    stop("The y argument must be either 'log.odds.ratio' or 'difference'")
  }
  if(value == "log.odds.ratio") value.name <- "log1p(odds ratio)"
  if(value == "difference") value.name <- "difference of foreground \nadbackground distributions"
  if (is.null(color.by)) {
    variant.enrichment$color <- "#ececec"
    variant.enrichment[variant.enrichment$significant, "color"] <- "#ff0000"
    color.is <- "self"
  } else {
    color.vals <- variant.enrichment[, color.by]
    names(color.vals) <- rownames(variant.enrichment)
    color.name <- color.by
    if(!is.null(colors)) {
      if(length(color.vals) == length(colors)) {
        variant.enrichment$color <- colors
        color.is <- "self"
      } else {
        stop("arg 'colors' is not the same length as color.by")
      }
    }
    if (all(grepl("^#", color.vals) & (str_length(color.vals) == 7L))) {
      variant.enrichment$color <- color.vals
      variant.enrichment[!variant.enrichment$significant, "color"] <- alpha(variant.enrichment[!variant.enrichment$significant, "color"], 0.5)
      color.is <- "self"
    } else if (is.character(color.vals) | is.factor(color.vals) | is.logical(color.vals)) {
      color.vals <- as.factor(color.vals)
      if(nlevels(color.vals) < 10L) {
        levels(color.vals) <- brewer.pal(nlevels(color.vals), "Set1")
      } else if(nlevels(color.vals) < 20L) {
        kelly.colours <- c("gray95", "gray13", "gold2", "plum4",
                           "darkorange1", "lightskyblue2", "firebrick",
                           "burlywood3", "gray51", "springgreen4", "lightpink2",
                           "deepskyblue4", "lightsalmon2", "mediumpurple4",
                           "orange", "maroon", "yellow3", "brown4",
                           "yellow4", "sienna4", "chocolate", "gray19")
        levels(color.vals) <- kelly.colours[3:(nlevels(color.vals)+2)]
      } else {
        levels(color.vals) <- colorRampPalette(brewer.pal(9, "Spectral"))(nlevels(color.vals))
      }
      variant.enrichment$color <- as.character(color.vals)
      variant.enrichment[!variant.enrichment$significant, "color"] <- alpha(variant.enrichment[!variant.enrichment$significant, "color"], 0.5)
      color.is <- "self"
    } else {
      variant.enrichment$color <- variant.enrichment[, color.by]
      color.is <- "cont"
    }
  }

  ep <- ggplot(variant.enrichment, aes_string(x = "sample", y = value, group = "color")) + theme_minimal()

  if (n_distinct(variant.enrichment$sample) <= 7L) {
    if(value == "log.odds.ratio") {
      ep <- ep + geom_crossbar(aes(ymin = log.odds.lower, ymax = log.odds.upper, color = color), fatten = 2, width = 0.8)
    } else {
      ep <- ep + geom_crossbar(aes(ymin = lower, ymax = upper, color = color), fatten = 2, width = 0.8)
    }
  } else {
    if(value == "log.odds.ratio") {
      ep <- ep + geom_pointrange(aes(ymin = log.odds.lower, ymax = log.odds.upper, color = color), fatten = 0.5)
    } else {
      ep <- ep + geom_pointrange(aes(ymin = lower, ymax = upper, color = color), fatten = 0.5)
    }
  }
  ep <- ep + scale_x_discrete(name = "Sample")
  if(value == "log.odds.ratio") {
    my.max <- round(max(variant.enrichment$odds.upper), 2) + 0.01
    my.min <- round(min(variant.enrichment$odds.lower), 2) - 0.01
    ep <- ep + scale_y_continuous(name = value.name)
    ep <- ep + geom_hline(yintercept = 0, color = "#c4c4c4", alpha = 0.5) +
      annotate("rect", xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=log1p(1), alpha=0.1, fill="black")
  } else {
    my.max <- round(max(variant.enrichment$upper), 2) + 0.01
    my.min <- round(min(variant.enrichment$lower), 2) - 0.01
    ep <- ep + scale_y_continuous(name = value.name, breaks = round(seq(from = my.min, to = my.max, length.out = 5), 2))
    ep <- ep + geom_hline(yintercept = 0, color = "#c4c4c4", alpha = 0.5) +
      annotate("rect", xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=0, alpha=0.1, fill="black")
  }



  if (all(!is.null(block1), !is.null(block2))) {
    ep <- ep + theme(axis.text.x = element_blank(),
                     legend.position="bottom",
                     panel.grid.major.x = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(),
                     strip.text.y = element_text(angle = 0, hjust = 0, vjust = 0.5, size = rel(1.5)),
                     strip.text.x = element_text(angle = 0, hjust = 0.5, vjust = 1, size = rel(1.5)))
  } else {
    ep <- ep + theme(axis.text.x = element_text(angle = 270, hjust = 0, vjust = 0.5, size = rel(1.5)),
                     legend.position="bottom",
                     panel.grid.major.x = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(),
                     strip.text.x = element_text(angle = 0, hjust = 0.5, vjust = 1, size = rel(1.5)))
  }
  if(!is.null(block1) & !is.null(block2)) {
    my.form <- as.formula(paste(block1, "~", block2))
    ep <- ep + facet_grid(my.form, scales = "free_x", space = "free_x", switch = "x")
  } else if (!is.null(block1) & is.null(block2)) {
    my.form <- as.formula(paste("~", block1))
    ep <- ep + facet_wrap(my.form, ncol = ncol, strip.position = ifelse(ncol > 1L, "top", "right"))
  } else if (is.null(block1) & !is.null(block2)) {
    my.form <- as.formula(paste("~", block2))
    ep <- ep + facet_wrap(my.form, ncol = ncol, strip.position = ifelse(ncol > 1L, "top", "right"))
  }

  if(color.is == "cont") {
    ep <- ep + scale_color_brewer(palette = "Greens")
  } else {
    ep <- ep + scale_color_identity()
  }
  if (auto.plot == TRUE) {
    plot(ep)
  }
  return(invisible(ep))
}


###############
### Helpers ###
###############



#' @importFrom GenomeInfoDb seqlevelsStyle keepSeqlevels seqlevelsInUse
#' @importFrom VariantAnnotation readVcf
GetVariantsInWindowVCF <- function(file, param, genome) {
  vcf <- readVcf(file = file, genome = genome, param = param)
  if (nrow(vcf) < 1L) stop("no variants found in interval; \n this is sometimes an error in fetching remote file, try with a local vcf")
  vcf <- keepSeqlevels(vcf, seqlevelsInUse(vcf))
  seqlevelsStyle(vcf) <- "UCSC"
  metadata(vcf)$overlap.offset <- NA
  return(vcf)
}

getFILE <- function(FILE, FUN, ..., N.TRIES=1L) {
  N.TRIES <- as.integer(N.TRIES)
  stopifnot(length(N.TRIES) == 1L, !is.na(N.TRIES))

  while (N.TRIES > 0L) {
    result <- tryCatch(FUN(FILE, ...), error = identity)
    if (!inherits(result, "error"))
      break
    N.TRIES <- N.TRIES - 1L
  }

  if (N.TRIES == 0L) {
    stop("'getFILE()' failed:",
         "\n  FILE: ", FILE,
         "\n  error: ", conditionMessage(result))
  }

  result
}

make.overlap.matrix <- function(overlaps) {
  overlap.table <- table(overlaps)
  output.matrix <- matrix(overlap.table,
                          nrow = nrow(overlap.table),
                          dimnames = list(rownames(overlap.table),
                                          colnames(overlap.table)))

  if (!is(output.matrix, "matrix")) {
    dim(output.matrix) <- c(length(output.matrix), 1)
    dimnames(output.matrix) <- list(rownames(overlap.table),
                                    colnames(overlap.table))
  }
  output.matrix[output.matrix > 1L] <- 1L
  return(output.matrix)
}

#' @importFrom pbapply pblapply
enrich.segments <- function(fg, bg, features, CI, prior, strict.subset, return.overlaps) {
  if (is(features, "GRangesList")) {
    features <- unlist(features, use.names = FALSE)
  }
  if (all(c("sample", "state") %in% colnames(mcols(features)))) {
    states <- unique(mcols(features)$state)
    if (length(states) > 1L) {
      myapply <- pblapply
    } else {
      myapply <- lapply
    }
    enrichment <- myapply(states, function(state,
                                           local.features = features,
                                           local.fg = fg,
                                           local.bg = bg,
                                           local.CI = CI,
                                           local.prior = prior,
                                           local.strict.subset = strict.subset,
                                           local.return.overlaps = return.overlaps) {
      local.features <- local.features[mcols(local.features)$state %in% state, ]
      local.fg <- SetOverlaps(local.fg, local.features)
      local.bg <- SetOverlaps(local.bg, local.features)
      ## Equalize feature columns
      fg.features <- colnames(mcols(ShowOverlaps(local.fg)))
      bg.features <- colnames(mcols(ShowOverlaps(local.bg)))
      all.features <- union(fg.features, bg.features)
      ## first fg
      if (is(local.fg, "VCF")) {
        if(any(!(is.element(all.features, fg.features)))) {
          mcols(rowRanges(local.fg))[, all.features[!(is.element(all.features, fg.features))]] <- 0L
        }
        mcols(rowRanges(local.fg)) <- cbind(mcols(rowRanges(local.fg))[, 1:metadata(local.fg)$overlap.offset - 1],
                                            mcols(rowRanges(local.fg))[, all.features, drop = FALSE])
      } else if (is(local.fg, "GRanges")) {
        if (any(!(is.element(all.features, fg.features)))) {
          mcols(local.fg)[, all.features[!(is.element(all.features, fg.features))]] <- 0L
        }
        mcols(local.fg) <- cbind(mcols(local.fg)[, 1:attributes(local.fg)$metadata$overlap.offset - 1],
                                 mcols(local.fg)[, all.features, drop = FALSE])
      }
      ## then bg
      if (is(local.bg, "VCF")) {
        if(any(!(is.element(all.features, bg.features)))) {
          mcols(rowRanges(local.bg))[, all.features[!(is.element(all.features, bg.features))]] <- 0L
        }
        mcols(rowRanges(local.bg)) <- cbind(mcols(rowRanges(local.bg))[, 1:metadata(local.bg)$overlap.offset - 1],
                                            mcols(rowRanges(local.bg))[, all.features, drop = FALSE])
      } else if (is(local.bg, "GRanges")) {
        if (any(!(is.element(all.features, bg.features)))) {
          mcols(local.bg)[, all.features[!(is.element(all.features, bg.features))]] <- 0L
        }
        mcols(local.bg) <- cbind(mcols(local.bg)[, 1:attributes(local.bg)$metadata$overlap.offset - 1],
                                 mcols(local.bg)[, all.features, drop = FALSE])
      }
      enrich <- enrich.features(fg = local.fg,
                                bg = local.bg,
                                CI = local.CI,
                                prior = local.prior,
                                strict.subset = local.strict.subset)
      if (local.return.overlaps) {
        return(list(e = enrich, fg.o = ShowOverlaps(local.fg), bg.o = ShowOverlaps(local.bg)))
      } else {
        return(enrich)
      }
    })
    if(return.overlaps) {
      overlaps <- lapply(enrichment, function(x) {
        overlaps <- GRangesList(foreground.overlaps = x$fg.o, background.overlaps = x$bg.o)
        return(overlaps)
      })
      names(overlaps) <- states
      enrichment <- lapply(enrichment, function(x) {
        return(x$e)
      })
    }
    names(enrichment) <- states
    for (state in states) {
      enrichment[[state]]$state <- state
    }
    enrichment <- do.call("rbind", enrichment)
  } else {
    stop("Segmentations must have mcols with fields 'sample' and 'state'")
  }
  if(return.overlaps) {
    return(list(overlaps = overlaps, enrichment = enrichment))
  } else {
    return(enrichment)
  }
}


enrich.features <- function(fg, bg, CI, prior, strict.subset) {
  ## foreground overlaps
  fg.over.matrix <- as.matrix(mcols(ShowOverlaps(fg)))
  rownames(fg.over.matrix) <- rownames(fg)
  ## background overlaps
  bg.over.matrix <- as.matrix(mcols(ShowOverlaps(bg)))
  rownames(bg.over.matrix) <- rownames(bg)

  ## start enrichment
  sample.size <- dim(fg.over.matrix)[[1]]
  sample.stats <- colSums(fg.over.matrix)

  if (strict.subset) {
    total.size <- dim(bg.over.matrix)[[1]]
    total.stats <- colSums(bg.over.matrix)
  } else {
    total.size <- dim(bg.over.matrix)[[1]] + sample.size
    total.stats <- colSums(bg.over.matrix) + sample.stats
  }
  if (exists("prior") & !is.null(prior)) {
    a <- prior[["a"]]
    b <- prior[["b"]]
  } else {
    stop("no argument present for prior, needed for simulation")
  }
  enrichment <- data.frame(sample = character(),
                           fg.ratio = numeric(),
                           bg.ratio = numeric(),
                           probability = numeric(),
                           difference = numeric(),
                           lower = numeric(),
                           upper = numeric(),
                           fg.success = numeric(),
                           fg.total = numeric(),
                           bg.success = numeric(),
                           bg.total = numeric(),
                           stringsAsFactors = FALSE)
  CI <- (1 - CI)/2
  for (my.sample in seq_along(colnames(bg.over.matrix))) {
    total.success <- total.stats[my.sample]
    sample.success <- sample.stats[my.sample]

    n1 <- sample.size
    y1 <- sample.success
    n2 <- total.size - sample.size
    y2 <- total.success - sample.success
    # SIMULATION
    I = 100000 # simulations
    theta1 = rbeta(I, y1 + a, (n1 - y1) + b)
    theta2 = rbeta(I, y2 + a, (n2 - y2) + b)
    diff = theta1 - theta2

    # OUTPUT
    quantiles <- try(quantile(diff,c(CI, 0.5, 1 - CI)))
    #if(inherits(quantiles, "try-error")) browser()
    fisher.res <- fisher.test(matrix(c(sample.success,
                                       total.success - sample.success,
                                       sample.size - sample.success,
                                       (total.size - total.success) - sample.size + sample.success),
                                     2, 2),
                              conf.level = CI)

    result <- data.frame(sample = colnames(bg.over.matrix)[my.sample],
                         fg.ratio = sample.success/sample.size,
                         bg.ratio = total.success/total.size,
                         probability = mean(theta1 > theta2),
                         log.odds.ratio = log1p(fisher.res$estimate),
                         log.odds.lower = log1p(fisher.res$conf.int[[1]]),
                         log.odds.upper = log1p(fisher.res$conf.int[[2]]),
                         difference = quantiles[2],
                         lower = quantiles[1],
                         upper = quantiles[3],
                         fg.success = sample.success,
                         fg.total = sample.size,
                         bg.success = total.success,
                         bg.total = total.size,
                         stringsAsFactors = FALSE)
    enrichment <- rbind(enrichment, result)
  }
  if(nrow(enrichment) < 1) {
    return(enrichment)
  } else {
    enrichment$significant <- FALSE
    enrichment[enrichment$probability > (1 - CI) | enrichment$probability < CI, "significant"] <- TRUE
    return(enrichment)
  }
}
Simon-Coetzee/funcivar documentation built on Aug. 2, 2021, 9:55 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
Simon-Coetzee/funcivar
Calculate overlaps and enrichment between genomic variants and genomic features or segmentations

R/funcivar.R
In Simon-Coetzee/funcivar: Calculate overlaps and enrichment between genomic variants and genomic features or segmentations

Defines functions enrich.features enrich.segments make.overlap.matrix getFILE GetVariantsInWindowVCF PlotEnrichment ShowOverlaps SetOverlaps CalculateEnrichment SplitVcfLd format.bed GetSegmentations GetBioFeatures CalcLD SetPopulation GetVariantsInWindow

Documented in CalcLD CalculateEnrichment GetBioFeatures GetSegmentations GetVariantsInWindow PlotEnrichment SetOverlaps SetPopulation ShowOverlaps SplitVcfLd

R Package Documentation

Browse R Packages

We want your feedback!

Simon-Coetzee/funcivar Calculate overlaps and enrichment between genomic variants and genomic features or segmentations

R/funcivar.R In Simon-Coetzee/funcivar: Calculate overlaps and enrichment between genomic variants and genomic features or segmentations

Defines functions enrich.features enrich.segments make.overlap.matrix getFILE GetVariantsInWindowVCF PlotEnrichment ShowOverlaps SetOverlaps CalculateEnrichment SplitVcfLd format.bed GetSegmentations GetBioFeatures CalcLD SetPopulation GetVariantsInWindow

Documented in CalcLD CalculateEnrichment GetBioFeatures GetSegmentations GetVariantsInWindow PlotEnrichment SetOverlaps SetPopulation ShowOverlaps SplitVcfLd

R Package Documentation

Browse R Packages

We want your feedback!

Simon-Coetzee/funcivar
Calculate overlaps and enrichment between genomic variants and genomic features or segmentations

R/funcivar.R
In Simon-Coetzee/funcivar: Calculate overlaps and enrichment between genomic variants and genomic features or segmentations