R/compare.R

Defines functions read_manta_vcf check_comparison_input compare_purple_gene_files read_purple_gene_file get_circos manta_isec_stats manta_isec read_sv_eval_file read_sv_fpfn_file read_snv_eval_file read_snv_count_file merge_umccrise_outputs umccrise_outputs

Documented in check_comparison_input compare_purple_gene_files get_circos manta_isec manta_isec_stats merge_umccrise_outputs read_manta_vcf read_purple_gene_file read_snv_count_file read_snv_eval_file read_sv_eval_file read_sv_fpfn_file umccrise_outputs

#' Get umccrise final output files
#'
#' Generates a tibble containing absolute paths to umccrise final files.
#'
#' @param d Path to `<umccrised/sampleid__tumorid>` umccrise directory.
#' @return A tibble with the following columns:
#'   - ftype: file type. Can be one of:
#'     - snv (single-nucleotide/indel variants VCF)
#'     - sv (Manta structural variants VCF)
#'     - cnv (PURPLE copy number variants)
#'     - multiqc (MultiQC JSON)
#'     - hrd (HRDetect or CHORD TSV results)
#'     - canrep_qcsum (QC summary from Cancer Report Rmd)
#'   - flabel: file label (e.g. multiqc, sage, purple etc.)
#'   - fpath: path to file
#'
#' @examples
#' \dontrun{
#' um <- "path/to/umccrised/sample"
#' umccrise_outputs(um)
#' }
#' @export
umccrise_outputs <- function(d) {
  assertthat::assert_that(dir.exists(d))
  d <- normalizePath(d)
  sid_tid <- basename(d)
  assertthat::assert_that(grepl("__", sid_tid))
  sid <- sub("(.*)__(.*)", "\\1", sid_tid)
  tid <- sub("(.*)__(.*)", "\\2", sid_tid)
  nid <- "" # keep it empty so file.exists gives FALSE
  # only way to grab the normal id is via the germline SNV file
  germ_snv <- list.files(
    file.path(d, "small_variants"),
    full.names = TRUE,
    pattern = "germline\\.predispose_genes\\.vcf\\.gz$"
  )
  if (length(germ_snv) == 1 && file.exists(germ_snv)) {
    nid <- sub(".*__(.*)-germline\\.predispose_genes\\.vcf\\.gz", "\\1", germ_snv)
  }
  p <- sid_tid # for brevity

  tibble::tribble(
    ~flabel, ~ftype, ~fpath,
    "mqc_um", "multiqc", glue::glue("{p}-multiqc_report_data/multiqc_data.json"),
    "canrep_qcsum", "canrep_qcsum", glue::glue("cancer_report_tables/{p}-qc_summary.tsv.gz"),
    "chord", "hrd", glue::glue("cancer_report_tables/hrd/{p}-chord.tsv.gz"),
    "hrdetect", "hrd", glue::glue("cancer_report_tables/hrd/{p}-hrdetect.tsv.gz"),
    "som", "snv", glue::glue("small_variants/{p}-somatic.vcf.gz"),
    "germ_um", "snv", glue::glue("small_variants/{sid}__{nid}-germline.predispose_genes.vcf.gz"),
    "sage", "snv", glue::glue("small_variants/sage1/{p}-sage.vcf.gz"),
    "manta", "sv", glue::glue("structural/{p}-manta.vcf.gz"),
    "purple", "cnv", glue::glue("purple/{p}.purple.cnv.gene.tsv"),
    "pcgr_tsv", "pcgr_tsv", glue::glue("small_variants/{p}-somatic.pcgr.snvs_indels.tiers.tsv")
  ) |>
    dplyr::mutate(
      fpath = file.path(d, .data$fpath),
      fexists = file.exists(.data$fpath)) |>
    dplyr::filter(.data$fexists) |>
    dplyr::select("ftype", "flabel", "fpath")
}

#' Gather umccrise filepaths from two umccrise final directories into a single tibble
#'
#' Generates a tibble containing absolute paths to (common) files from two umccrise final directories.
#'
#' @param d1 Path to first <umccrised/sample> directory.
#' @param d2 Path to second <umccrised/sample> directory.
#' @param sample Sample name.
#' @return A tibble with the following columns:
#'   - sample name
#'   - variant type (e.g. SNV, SV, CNV)
#'   - file label (e.g. pcgr, cpsr, purple etc.)
#'   - run1 file path
#'   - run2 file path
#'
#' @examples
#' \dontrun{
#' um1 <- "path/to/umccrised/sample1"
#' um2 <- "path/to/umccrised/sample2"
#' merge_umccrise_outputs(um1, um2)
#' }
#' @export
merge_umccrise_outputs <- function(d1, d2, sample) {
  um1 <- umccrise_outputs(d1)
  um2 <- umccrise_outputs(d2)

  dplyr::inner_join(um1, um2, by = c("flabel", "ftype")) %>%
    dplyr::mutate(sample_nm = sample) %>%
    dplyr::select("sample_nm", "ftype", "flabel", "fpath.x", "fpath.y") %>%
    utils::write.table(file = "", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
}


#' Read SNV count file output by woof
#'
#' Reads SNV count file output by woof
#'
#' @param x Path to file with counts
#' @return A single integer
#'
#' @examples
#' x <- system.file("extdata", "snv/count_vars.txt", package = "woofr")
#' read_snv_count_file(x)
#'
#' @export
read_snv_count_file <- function(x) {
  column_nms <- c("sample", "flabel", "count")
  if (!file.exists(x)) {
    # return tibble of NAs
    res <- rep(NA, length(column_nms)) %>%
      purrr::set_names(column_nms) %>%
      as.list() %>%
      dplyr::as_tibble()

    return(res)
  }
  res <- readr::read_tsv(x, col_types = "cci")
  stopifnot(names(res) == column_nms)
  res
}

#' Read SNV eval file output by woof
#'
#' Reads SNV eval file output by woof
#'
#' @param x Path to file with eval results
#' @return A tibble with a single row of evaluation metrics
#'
#' @examples
#'
#' x <- system.file("extdata", "snv/eval_stats.tsv", package = "woofr")
#' read_snv_eval_file(x)
#'
#' @export
read_snv_eval_file <- function(x) {
  column_nms <- c(
    "sample", "flabel", "subset",
    "SNP_Truth", "SNP_TP", "SNP_FP", "SNP_FN", "SNP_Recall", "SNP_Precision",
    "SNP_f1", "SNP_f2", "SNP_f3", "IND_Truth", "IND_TP", "IND_FP",
    "IND_FN", "IND_Recall", "IND_Precision", "IND_f1", "IND_f2", "IND_f3"
  )
  if (!file.exists(x)) {
    # return tibble of NAs
    res <- rep(NA, length(column_nms)) %>%
      purrr::set_names(column_nms) %>%
      as.list() %>%
      dplyr::as_tibble()

    return(res)
  }
  res <- readr::read_tsv(x, col_types = readr::cols(
    .default = "d", sample = "c", flabel = "c", subset = "c"
  ))
  stopifnot(names(res) == column_nms)
  res
}

#' Read SV FP/FN file output by woof
#'
#' Reads SV FP/FN file output by woof
#'
#' @param x Path to file with FP/FN variants
#' @return A tibble containing FP/FN SV variants
#'
#' @examples
#'
#' \dontrun{
#' x <- system.file("extdata", "sv/fpfn.tsv", package = "woofr")
#' read_sv_fpfn_file(x)
#' }
#'
#' @export
read_sv_fpfn_file <- function(x) {
  column_nms <- c("FP_or_FN", "sample", "flabel", "chrom1", "pos1", "chrom2", "pos2", "svtype", "af", "fmt_map", "fmt_val")
  if (!file.exists(x)) {
    res <- rep(NA, length(column_nms)) %>%
      purrr::set_names(column_nms) %>%
      as.list() %>%
      dplyr::as_tibble()
    return(res)
  }
  res <- readr::read_tsv(x, col_types = "ccccicicccc")
  stopifnot(names(res) == column_nms)
  res |>
    tidyr::separate(.data$fmt_val, into = c("PR_REF", "PR_ALT", "SR_REF", "SR_ALT"), convert = TRUE) |>
    dplyr::mutate(`SR>PR` = .data$SR_ALT > .data$PR_ALT) |>
    dplyr::select(-c("PR_REF", "SR_REF"))
}

#' Read SV eval metrics file output by woof
#'
#' Reads SV eval metrics file output by woof
#'
#' @param x Path to file with SV evaluation metrics
#' @return A tibble with a single row of evaluation metrics
#'
#' @examples
#'
#' x <- system.file("extdata", "sv/eval_metrics.tsv", package = "woofr")
#' read_sv_eval_file(x)
#'
#' @export
read_sv_eval_file <- function(x) {
  column_nms <- c(
    "sample", "flabel", "run1_count", "run2_count", "Recall",
    "Precision", "Truth", "TP", "FP", "FN"
  )
  if (!file.exists(x)) {
    res <- rep(NA, length(column_nms)) %>%
      purrr::set_names(column_nms) %>%
      as.list() %>%
      dplyr::as_tibble()
    return(res)
  }
  res <- readr::read_tsv(x, col_types = "cciiddiiii")
  stopifnot(names(res) == column_nms)
  res
}


#' Intersect two Manta VCF files
#'
#' Intersects two Manta VCF files for evaluation.
#'
#' @param f1 Path to first Manta file
#' @param f2 Path to second ('truthset') Manta file
#' @param samplename Sample name (used for labelling).
#' @param flab File name (used for labelling).
#' @param bnd_switch Logical. Switch BND pairs for more matches (default: TRUE).
#' @return A list with the following elements:
#'   - tot_vars: total variants for file1 and file2
#'   - fp: tibble with False Positive calls i.e. variants in f1 that are not in f2
#'   - fn: tibble with False Negative calls i.e. variants in f2 that are not in f1
#'
#' @examples
#' \dontrun{
#' f1 <- "path/to/run1/manta.vcf.gz"
#' f2 <- "path/to/run2/manta.vcf.gz"
#' mi <- manta_isec(f1, f2)
#' }
#'
#' @export
manta_isec <- function(f1, f2, samplename, flab, bnd_switch = TRUE) {
  read_manta_both <- function(f1, f2) {
    vcf1 <- prep_manta_vcf(f1, filter_pass = TRUE)$sv
    vcf2 <- prep_manta_vcf(f2, filter_pass = TRUE)$sv
    list(f1 = vcf1, f2 = vcf2)
  }

  switch_bnd <- function(d) {
    stopifnot(all(colnames(d) %in% c("chrom1", "pos1", "chrom2", "pos2", "svtype", "af", "fmt_map", "fmt_val")))
    no_bnd <- dplyr::filter(d, !.data$svtype %in% "BND")
    bnd <-
      dplyr::filter(d, .data$svtype %in% "BND") %>%
      dplyr::select(
        chrom1 = "chrom2", pos1 = "pos2",
        chrom2 = "chrom1", pos2 = "pos1", "svtype",
        "af", "fmt_map", "fmt_val"
      )

    dplyr::bind_rows(no_bnd, bnd)
  }

  mb <- read_manta_both(f1, f2)
  tot_vars <- list(run1 = nrow(mb$f1), run2 = nrow(mb$f2))
  # col_nms <- colnames(mb$f1)
  col_nms <- c("chrom1", "pos1", "chrom2", "pos2")
  fp <- dplyr::anti_join(mb$f1, mb$f2, by = col_nms)
  fn <- dplyr::anti_join(mb$f2, mb$f1, by = col_nms)

  if (bnd_switch) {
    # some real matching cases simply have switched BND mates
    fp_new <- dplyr::anti_join(switch_bnd(fp), fn, by = col_nms) %>% switch_bnd()
    fn_new <- dplyr::anti_join(switch_bnd(fn), fp, by = col_nms) %>% switch_bnd()
    fp <- fp_new
    fn <- fn_new
  }

  fp <- fp %>%
    dplyr::mutate(sample = samplename, flabel = flab) %>%
    dplyr::select("sample", "flabel", dplyr::everything())
  fn <- fn %>%
    dplyr::mutate(sample = samplename, flabel = flab) %>%
    dplyr::select("sample", "flabel", dplyr::everything())

  return(list(
    tot_vars = tot_vars,
    fp = fp,
    fn = fn
  ))
}

#' Manta Comparison Metrics
#'
#' Returns evaluation metrics from comparing two Manta call sets.
#'
#' @param mi Output from \code{\link{manta_isec}}.
#' @param samplename Sample name (used for labelling).
#' @param flab File name (used for labelling).
#' @return A single row tibble with the following columns:
#'   - sample: samplename
#'   - flabel: flab
#'   - run1_count: total variants in first file
#'   - run2_count: total variants in second file
#'   - TP: in first and second
#'   - FP: in first but not second
#'   - FN: in second but not first
#'   - Recall: TP / (TP + FN)
#'   - Precision: TP / (TP + FP)
#'   - Truth: TP + FN
#'
#' @examples
#' \dontrun{
#' f1 <- "path/to/run1/manta.vcf.gz"
#' f2 <- "path/to/run2/manta.vcf.gz"
#' mi <- manta_isec(f1, f2, "sampleA", "manta1")
#' manta_isec_stats(mi, "sampleA", "manta1")
#' }
#'
#' @export
manta_isec_stats <- function(mi, samplename, flab) {
  fn <- nrow(mi$fn)
  fp <- nrow(mi$fp)
  tp <- mi$tot_vars$run2 - fn
  truth <- tp + fn
  recall <- tp / truth
  precision <- tp / (tp + fp)

  dplyr::tibble(
    sample = samplename, flabel = flab,
    run1_count = mi$tot_vars$run1, run2_count = mi$tot_vars$run2,
    Recall = round(recall, 3), Precision = round(precision, 3), Truth = truth,
    TP = tp, FP = fp, FN = fn
  )
}

#' Get Manta Circos with FP/FN calls
#'
#' Returns Circos plot highlighting FP/FN calls from a Manta comparison.
#'
#' @param mi Output from \code{\link{manta_isec}}.
#' @param samplename Sample name (used for labelling).
#' @param outdir Directory to write circos to.
#' @return Path to circos plot highlighting FP (green) and FN (red) calls from a
#'   Manta comparison.
#'
#' @examples
#' \dontrun{
#' f1 <- "path/to/run1/manta.vcf.gz"
#' f2 <- "path/to/run2/manta.vcf.gz"
#' mi <- manta_isec(f1, f2)
#' get_circos(mi, "sampleA", "outdir1")
#' }
#'
#' @export
get_circos <- function(mi, samplename, outdir) {
  prep_svs_circos <- function() {
    sv <- dplyr::bind_rows(list(fp = mi$fp, fn = mi$fn), .id = "fp_or_fn")
    links_coloured <- sv %>%
      dplyr::mutate(
        chrom1 = paste0("hs", .data$chrom1),
        chrom2 = paste0("hs", .data$chrom2),
        col = dplyr::case_when(
          fp_or_fn == "fp" ~ "(0,255,0)",
          fp_or_fn == "fn" ~ "(255,0,0)",
          TRUE ~ "Oops"
        ),
        pos1b = .data$pos1,
        pos2b = .data$pos2,
        col = paste0("color=", col)
      ) %>%
      dplyr::select("chrom1", "pos1", "pos1b", "chrom2", "pos2", "pos2b", "col")

    return(links_coloured)
  }

  write_circos_configs <- function() {
    dir.create(outdir, recursive = TRUE)
    links <- prep_svs_circos()
    message(glue::glue("Writing circos links file to '{outdir}'."))
    readr::write_tsv(links, file.path(outdir, "SAMPLE.link.circos"), col_names = FALSE)

    message(glue::glue("Copying circos templates to '{outdir}'"))
    file.copy(system.file("extdata/circos/circos_sv.conf", package = "woofr"),
      file.path(outdir, "circos.conf"),
      overwrite = TRUE
    )
    file.copy(system.file("extdata/circos/gaps.txt", package = "woofr"),
      outdir,
      overwrite = TRUE
    )
    file.copy(system.file("extdata/circos/ideogram.conf", package = "woofr"),
      outdir,
      overwrite = TRUE
    )
  }

  run_circos <- function() {
    circos_png <- glue::glue("circos_{samplename}.png")
    cmd <- glue::glue("echo '$(date) running circos on {samplename}' && circos -nosvg -conf {outdir}/circos.conf -outputdir {outdir} -outputfile {circos_png}")

    if (Sys.which("circos") != "") {
      system(cmd, ignore.stdout = TRUE)
      return(circos_png)
    } else {
      stop("Can't find 'circos' in your PATH. Exiting.")
    }
  }

  write_circos_configs()
  circos_png <- run_circos()
  circos_png
}


#' Read PURPLE gene segments
#'
#' Reads the gene.cnv file output by PURPLE
#'
#' @param x Path to PURPLE `gene.cnv` (or `cnv.gene.tsv`) file.
#' @return A tibble with the main columns of interest from the PURPLE `gene.cnv` file i.e.
#'   chrom, start, end, gene, min_cn and max_cn.
#'
#' @examples
#'
#' x <- system.file("extdata", "cnv/sample_A.purple.gene.cnv", package = "woofr")
#' y <- system.file("extdata", "cnv/sample_B.purple.cnv.gene.tsv", package = "woofr")
#' read_purple_gene_file(x)
#' read_purple_gene_file(y)
#'
#' @export
read_purple_gene_file <- function(x) {
  column_nms <- c("chromosome", "start", "end", "gene", "mincopynumber", "maxcopynumber")
  d <- readr::read_tsv(x, col_types = readr::cols(.default = "c")) %>%
    dplyr::select(1:6)
  names(d) <- tolower(names(d))
  stopifnot(names(d) == column_nms)
  d %>%
    dplyr::mutate(
      start = as.integer(.data$start),
      end = as.integer(.data$end),
      min_cn = round(as.numeric(.data$mincopynumber), 1),
      max_cn = round(as.numeric(.data$maxcopynumber), 1)
    ) %>%
    dplyr::select(
      chrom = "chromosome", "start", "end", "gene", "min_cn", "max_cn"
    )
}

#' Compare two PURPLE gene CNV files
#'
#' Compares two PURPLE `gene.cnv` (or `cnv.gene.tsv`) files.
#'
#' @param cnv1 Path to first PURPLE `gene.cnv` (or `cnv.gene.tsv`) file.
#' @param cnv2 Path to second PURPLE `gene.cnv` (or `cnv.gene.tsv`) file ('TRUTHSET').
#' @param out_cn_diff Path to write genes with copy number differences between
#'   the two runs - needs to be a writable directory.
#' @param out_coord_diff Path to write genes with different coordinates between
#'   the two runs - needs to be a writable directory.
#' @param threshold Numeric. Copy number differences smaller than this value are not reported.
#' @return Writes results from the comparison to `out_cn_diff` and `out_coord_diff`.
#'
#'
#' @examples
#'
#' cnv1 <- system.file("extdata", "cnv/sample_A.purple.gene.cnv", package = "woofr")
#' cnv2 <- system.file("extdata", "cnv/sample_B.purple.cnv.gene.tsv", package = "woofr")
#' out1 <- tempfile()
#' out2 <- tempfile()
#' compare_purple_gene_files(cnv1, cnv2, out1, out2, threshold = 0.1)
#'
#' @export
compare_purple_gene_files <- function(cnv1, cnv2, out_cn_diff, out_coord_diff, threshold = 0.1) {
  stopifnot(is.numeric(threshold), threshold >= 0, length(threshold) == 1)

  x1 <- read_purple_gene_file(cnv1)
  x2 <- read_purple_gene_file(cnv2)
  # compare chrom,start,end,gene
  coord_diff <-
    dplyr::bind_rows(
      fp = dplyr::anti_join(x1, x2, by = c("chrom", "start", "end", "gene")),
      fn = dplyr::anti_join(x2, x1, by = c("chrom", "start", "end", "gene")),
      .id = "fp_or_fn"
    )

  # compare min/max cn
  cn_diff <-
    dplyr::left_join(x1, x2, by = c("chrom", "start", "end", "gene"), suffix = paste0(".run", 1:2)) %>%
    dplyr::mutate(
      min_diff = abs(.data$min_cn.run1 - .data$min_cn.run2) > threshold,
      max_diff = abs(.data$max_cn.run1 - .data$max_cn.run2) > threshold
    ) %>%
    dplyr::filter(.data$min_diff | .data$max_diff)

  utils::write.table(cn_diff, file = out_cn_diff, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
  utils::write.table(coord_diff, file = out_coord_diff, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE)
}

#' Check Comparison Input File
#'
#' Checks that the input file is suitable for running 'woof compare'.
#'
#' @param x Path to tab-separated comparison input file.
#'       Must contain the following five fields per file (NOTE: do not include column names):
#'       sample name, variant type (SNV, SV or CNV), file label, run1 & run2 file paths (can be S3 paths).
#' @return If the file has issues, an error message. If there
#' are no issues, simply returns file path.
#'
#' @examples
#' \dontrun{
#' x <- "path/to/cromwell_inputs.tsv"
#' check_comparison_input(x)
#' }
#' @export
check_comparison_input <- function(x) {
  d <- readr::read_tsv(x,
    col_types = readr::cols(.default = "c"),
    col_names = c("sample", "vartype", "flabel", "run1", "run2")
  )
  assertthat::assert_that(
    all(d$vartype %in% c("SNV", "SV", "CNV")),
    all(startsWith(d$run1, "/") | startsWith(d$run1, "s3")),
    all(startsWith(d$run2, "/") | startsWith(d$run2, "s3"))
  )

  x
}

#' Read Manta VCF
#'
#' Read main columns of interest from Manta VCF using bcftools or bedr
#'
#' Uses bcftools (https://samtools.github.io/bcftools/bcftools.html)
#' or bedr (https://cran.r-project.org/web/packages/bedr/index.html) to read
#' in the VCF file.
#'
#' @param vcf Path to Manta VCF file (`.vcf.gz` or `.vcf`).
#' @return A dataframe (`tibble`) with the following fields from the VCF:
#'   * chrom1: `CHROM` (remove `chr` prefix (if any) for hg38 compatibility)
#'   * pos1: `POS` | `INFO/BPI_START`
#'   * pos2: `INFO/END` | `INFO/BPI_END`
#'   * id: `ID`
#'   * mateid: `INFO/MATEID`
#'   * svtype: `INFO/SVTYPE`
#'   * filter: `FILTER`
#'   * af: `BPI_AF`
#'   * fmt_map: `FORMAT`
#'   * fmt_val: `SAMPLE`
read_manta_vcf <- function(vcf) {
  stopifnot(file.exists(vcf), length(vcf) == 1)

  # You have two options: use bcftools (first choice) or bedr
  if (Sys.which("bcftools") == "") {
    # use bedr
    x <- bedr::read.vcf(vcf, split.info = TRUE, verbose = FALSE)
    DF <- tibble::tibble(
      chrom1 = as.character(x$vcf$CHROM),
      pos1 = "dummy1",
      pos2 = "dummy2",
      id = x$vcf$ID,
      mateid = x$vcf$MATEID,
      svtype = x$vcf$SVTYPE,
      filter = x$vcf$FILTER,
      af = x$vcf$BPI_AF,
      fmt_map = x$vcf$FORMAT,
      fmt_val = x$vcf[[length(x$vcf)]]
    )

    if (any(grepl("BPI_START", x$header$INFO[, "ID"]))) {
      # use BPI fields
      DF <- dplyr::mutate(DF,
        pos1 = as.integer(x$vcf$BPI_START),
        pos2 = as.integer(x$vcf$BPI_END)
      )
    } else {
      # use typical fields
      DF <- dplyr::mutate(DF,
        pos1 = as.integer(x$vcf$POS),
        pos2 = as.integer(x$vcf$END)
      )
    }
  } else {
    # use bcftools
    if (system(paste0("bcftools view -h ", vcf, " | grep 'BPI_START'"), ignore.stdout = TRUE) == 0) {
      # use BPI fields
      bcftools_query <- "bcftools query -f '%CHROM\t%INFO/BPI_START\t%INFO/BPI_END\t%ID\t%INFO/MATEID\t%INFO/SVTYPE\t%FILTER\t%INFO/BPI_AF\t%FORMAT\n'"
    } else {
      # use typical fields, no BPI_AF --- TODO: FIX
      bcftools_query <- "bcftools query -f '%CHROM\t%POS\t%INFO/END\t%ID\t%INFO/MATEID\t%INFO/SVTYPE\t%FILTER\t%FORMAT\n'"
    }

    DF <- system(paste(bcftools_query, vcf), intern = TRUE) %>%
      tibble::tibble(all_cols = .) %>%
      tidyr::separate(
        col = .data$all_cols,
        into = c("chrom1", "pos1", "pos2", "id", "mateid", "svtype", "filter", "af", "fmt_map", "fmt_val"),
        sep = "\t", convert = TRUE
      ) %>%
      dplyr::mutate(chrom1 = as.character(.data$chrom1))
  }

  DF %>%
    dplyr::mutate(
      chrom1 = as.character(sub("chr", "", .data$chrom1)),
      pos1 = char2num(.data$pos1),
      pos2 = char2num(.data$pos2)
    ) # sometimes get '.' in pos2 (e.g. from purple)
}

#' Prepare Manta VCF for Circos
#'
#' Prepares a Manta VCF for display in a Circos plot.
#'
#' @param vcf Path to Manta VCF file. Can be compressed or not.
#' @param filter_pass Keep only variants annotated with a PASS FILTER?
#'   (default: FALSE).
#' @return A dataframe (`tibble`) with the following fields from the VCF:
#'   * chrom1: `CHROM`
#'   * pos1: `POS` | `INFO/BPI_START`
#'   * chrom2: `CHROM` (for mate2 if BND)
#'   * pos2: `INFO/END` | `INFO/BPI_END` (for mate1 if BND)
#'   * svtype: `INFO/SVTYPE`. Used for plotting.
#'
#' @export
prep_manta_vcf <- function(vcf, filter_pass = FALSE) {
  DF <- read_manta_vcf(vcf)

  # BNDs

  # We have POS of mate2 through INFO/END or INFO/BPI_END, just need CHROM.
  # If no BPI, Manta doesn't annotate BNDs with END. Let's grab it from the mate's POS.
  # Sometimes with post-processing a mate might get filtered out.
  # In these cases just filter out the orphan mate.
  #
  # Keep BND mate with index 1 (i.e. discard duplicated information)
  # see <https://github.com/Illumina/manta/blob/master/docs/developerGuide/ID.md>
  df_bnd1 <- DF %>%
    dplyr::filter(.data$svtype == "BND")

  match_id2mateid <- match(df_bnd1$id, df_bnd1$mateid)
  df_bnd2 <-
    df_bnd1[match_id2mateid, c("chrom1", "pos1")] %>%
    dplyr::rename(
      chrom11 = .data$chrom1,
      pos11 = .data$pos1
    )
  df_bnd <-
    dplyr::bind_cols(df_bnd1, df_bnd2) %>%
    dplyr::rename(chrom2 = .data$chrom11) %>%
    dplyr::mutate(
      pos2 = ifelse(is.na(.data$pos2), .data$pos11, .data$pos2),
      bndid = substring(.data$id, nchar(.data$id))
    )

  orphan_mates <- df_bnd %>%
    dplyr::filter(.data$chrom2 %in% NA) %>%
    dplyr::mutate(orphan = paste0(.data$chrom1, ":", .data$pos1)) %>%
    dplyr::pull(.data$orphan)

  df_bnd <- df_bnd %>%
    dplyr::filter(!is.na(.data$chrom2)) %>%
    dplyr::filter(.data$bndid == "1") %>%
    dplyr::select(
      "chrom1", "pos1", "chrom2",
      "pos2", "id", "mateid", "svtype", "filter",
      "af", "fmt_map", "fmt_val"
    )

  if (length(orphan_mates) > 0) {
    warning(glue::glue(
      "The following {length(orphan_mates)} orphan BND mates are removed:\n",
      paste(orphan_mates, collapse = ", ")
    ))
  }

  stopifnot(.manta_proper_pairs(df_bnd$id, df_bnd$mateid))

  # Non-BNDs
  df_other <- DF %>%
    dplyr::filter(.data$svtype != "BND") %>%
    dplyr::mutate(chrom2 = .data$chrom1) %>%
    dplyr::select(
      "chrom1", "pos1", "chrom2", "pos2",
      "id", "mateid", "svtype", "filter",
      "af", "fmt_map", "fmt_val"
    )

  # All together now
  sv <- df_other %>%
    dplyr::bind_rows(df_bnd)

  if (filter_pass) {
    sv <- sv %>%
      dplyr::filter(.data$filter == "PASS")
  }

  sv <- sv %>%
    dplyr::select(
      "chrom1", "pos1",
      "chrom2", "pos2", "svtype",
      "af", "fmt_map", "fmt_val"
    )

  structure(list(sv = sv), class = "sv")
}


# Check if Manta BND mates are properly paired
.manta_proper_pairs <- function(id, mid) {
  ext1 <- substring(id, nchar(id))
  ext2 <- substring(mid, nchar(mid))
  pre1 <- substring(id, 1, nchar(id) - 1)
  pre2 <- substring(mid, 1, nchar(mid) - 1)

  # id should end in 1; mateid in 0
  if (all(ext1 == "1") & all(ext2 == "0") & all(pre1 == pre2)) {
    return(TRUE)
  }
  return(FALSE)
}
pdiakumis/woofr documentation built on Jan. 4, 2024, 11:22 a.m.