R/query_vcf.R

Defines functions query_vcf

Documented in query_vcf

#' Query VCF Data
#'
#' \code{query_vcf} enables you to query variants within a VCF
#'
#' @param ... Gene names, regions, or wormbase identifiers to query.
#' @param info Info columns to output. If an \code{ANN} (annotation) column is available it is automatically fetched. [\strong{Default} \code{c()}]
#' @param format Format columns to output. A \code{"GT"} or \code{"TGT"} column must be specified to retrieve genotypes. \itemize{
#' @param impact A vector of impact levels to filter on (LOW, MODERATE, HIGH, MODIFIER). "ALL" can be used to return ALL variants. [\strong{Default} \code{c('MODERATE', 'HIGH')}]
#'     \item \code{GT} uses a numeric represetnation (0=REF, 1=ALT) and outputs g1, g2, and genotype (0=REF homozygous, 1=HET, 2=ALT homozygous).
#'     \item \code{TGT} uses the base representation (ATGC) and outputs two columns: a1, a2.
#' }
#' [\strong{Default} \code{c("TGT")}]
#' @param samples A set of samples to subset on [\strong{default:} \code{"ALL"}]
#' @param long Return dataset in long or wide format. [\strong{Default} \code{TRUE}]
#' @param vcf Use a custom VCF.
#' @return Dataframe with variant data
#'
#' @examples query_vcf("pot-2","II:1-10000","WBGene00010785")
#' @importFrom dplyr %>%
#' @export

query_vcf <- function(...,
                      info = c(),
                      format = c("TGT"),
                      impact = c("MODERATE", "HIGH"),
                      samples="ALL",
                      long = TRUE,
                      vcf = get_vcf()) {

  regions <- unlist(list(...))

  impact_options <- c(NA,
                      "LOW",
                      "MODERATE",
                      "HIGH",
                      "MODIFIER")

  # Allow user to specify 'ALL'
  if (impact[[1]] == "ALL") {
    impact <- impact_options
  } else {
    # Check that impact is properly specified
    assertthat::assert_that(all(impact %in% impact_options),
                            msg = "Invalid impact specified. Must be LOW, MODERATE, HIGH, or MODIFIER")
  }

  # Ensure that bcftools is available:
  bcftools_version <- as.double(stringr::str_replace(stringr::str_extract(readLines(pipe("bcftools --version"))[1], "[0-9]+\\.[0-9]+"), "\\.", ""))
  if(is.na(bcftools_version) | bcftools_version < 12) {
    stop("bcftools 1.2+ required for this function")
  }

  # Samples
  sample_query <- glue::glue("bcftools query -l {vcf}")
  sample_names <- readr::read_lines(suppressWarnings(pipe(sample_query)))

  # Check that VCF has annotation field
  vcf_header_query <- glue::glue("bcftools view -h {vcf}")
  vcf_header <- readr::read_lines(suppressWarnings(pipe(vcf_header_query)))

  # Pull out info columns
  info_set <- stringr::str_match(vcf_header, '##INFO=<ID=([^,>]+).*Type=([^,>]+).*Description=\"([^,>]+)\"') # nolint
  colnames(info_set) <- c("g", "ID", "Type", "Description")
  info_columns <- purrr::discard(info_set[, 2], is.na)
  info_column_types <- purrr::discard(info_set[, 3], is.na)

  # Add ANN to info if present but not specified.
  if ("ANN" %in% info_columns) {
    info <- c(info, "ANN")
  }

  # Pull out format columns
  format_set <- stringr::str_match(vcf_header, '##FORMAT=<ID=([^,>]+).*Type=([^,>]+).*Description=\"([^,>]+)\"') # nolint
  colnames(format_set) <- c("g", "ID", "Type", "Description")
  format_columns <- c(purrr::discard(format_set[, 2], is.na), "TGT")
  format_column_types <- c(purrr::discard(format_set[, 3], is.na), "TGT")

  if (is.null(regions)) {
    # Begin Exclude Linting
    cat(crayon::bold("\nVCF:"), vcf, "\n")
    cat(crayon::bold("VCF Samples:"), length(sample_names), "\n\n")
    cat(crayon::bold("Info\n"))
    utils::write.table(
      dplyr::tbl_df(
        info_set[, 2:4]) %>%
        dplyr::filter(stats::complete.cases(.)),
      sep = "\t",
      row.names = F,
      quote = F)
    cat("\n")
    cat(crayon::bold("Format"), "\n")
    utils::write.table(
      dplyr::tbl_df(
        format_set[, 2:4]) %>%
        dplyr::filter(stats::complete.cases(.)),
      sep = "\t",
      row.names = F,
      quote = F)
    cat("\n")
    return(invisible(NULL))
    # End Exclude Linting
  }


  assertthat::assert_that(all(samples %in% sample_names) | (samples[[1]] == "ALL"),
                          msg = {
                            unknown_samples <- paste(samples[!samples %in% sample_names], collapse = ", ")
                            glue::glue("Specified samples are not in the VCF: {unknown_samples}")
                          })


  assertthat::assert_that(all(info %in% info_columns),
                          msg = {
                            missing_info_columns <- paste0(info[!(info %in% info_columns)], collapse = ",")
                            glue::glue("VCF does not have specified info fields: {missing_info_columns}")
                          })

  assertthat::assert_that(all(format %in% format_columns),
                          msg = {
                            missing_format_columns <- paste0(format[!(format %in% format_columns)], collapse = ",")
                            glue::glue("VCF does not have specified format fields: {missing_format_columns}")
                          })


  results <- suppressWarnings(lapply(regions, function(query) {
    # Save region as query

    # Fix region specifications
    query <- gsub("\\.\\.", "-", query)
    query <- gsub(",", "", query)

    # Resolve region names
    if (!grepl("(I|II|III|IV|V|X|MtDNA).*", query)) {
      elegans_gff <- get_db()
      # Pull out regions by element type.
      region <- dplyr::collect(dplyr::filter(elegans_gff,
                                             locus == query |
                                               gene_id == query |
                                               transcript_id == query) %>%
                                 dplyr::select(chrom, start, end, gene_id, locus, transcript_id) %>%
                                 dplyr::distinct(.keep_all = TRUE)) %>%
        dplyr::summarize(chrom = chrom[1], start = min(start), end = max(end)) %>%
        dplyr::mutate(region_format = paste0(chrom, ":", start, "-", end)) %>%
        dplyr::select(region_format) %>%
        dplyr::distinct(.keep_all = TRUE)

      region <- paste(region$region_format, collapse = ",")

      query_type <- "locus"

      if (stringr::str_length(regions[[1]]) == 0) {
        message(paste0(query, " not found."))
        region <- NA
      }
    } else {
      region <- query
      query_type <- "region"
    }

    base_header <- c("CHROM",
                     "POS",
                     "REF",
                     "ALT",
                     "FILTER")

    if ("ANN" %in% info) {
      ann_header <- c("allele",
                      "effect",
                      "impact",
                      "gene_name",
                      "gene_id",
                      "feature_type",
                      "feature_id",
                      "transcript_biotype",
                      "exon_intron_rank",
                      "nt_change",
                      "aa_change",
                      "cdna_position_or_len",
                      "protein_position",
                      "distance_to_feature",
                      "error",
                      "extra")
    } else {
      ann_header <- c()
    }

    info_query <- paste0(info, collapse = "\\t%")
    format_query <- paste0(format, collapse = "!%")
    # If using long format provide additional information.
    query_string <- glue::glue("'%CHROM\\t%POS\\t%REF\\t%ALT\\t%FILTER\\t%{info_query}[\\t%{format_query}]\\n'")
    if (long == T) {
      query_string <- glue::glue("'%CHROM\\t%POS\\t%REF\\t%ALT\\t%FILTER\\t%{info_query}[\\t%{format_query}]\\n'")
    }

    if (samples != "ALL") {
      sample_query <- glue::glue("--samples ", paste(samples, collapse = ","))
    } else {
      sample_query <- ""
      samples <- sample_names
    }

    # Grep impacts to speed up intake
    if (impact != "ALL" & !is.na(impact)) {
      impact_grep <- paste(purrr::discard(impact, is.na), collapse = "|")
      impact_grep <- glue::glue("| egrep \"({impact_grep})\" - ")
    } else {
      impact_grep <- ""
    }

    output_file <- tempfile()
    command <- paste("bcftools",
                     "query",
                     sample_query,
                     "--regions",
                     region,
                     "-f",
                     query_string,
                     vcf,
                     impact_grep,
                     ">",
                     output_file)
    if (!is.na(region)) {
      message(glue::glue("Query: {query}"))
      conn <- system(command)
      result <- try(dplyr::tbl_df(data.table::fread(output_file,
                                                    col.names = c(base_header,
                                                                  info,
                                                                  samples),
                                                    sep = "\t",
                                                    na.strings = c("", "."))),
                    silent = FALSE)
      try(file.remove(output_file))
      if (!grepl("^Error.*", result[[1]][1])) {
        tsv <- result %>%
          dplyr::mutate(REF = ifelse(REF == TRUE, "T", REF), # T nucleotides are converted to 'true'
                        ALT = ifelse(ALT == TRUE, "T", ALT))
      } else {
        tsv <- as.data.frame(NULL)
      }

      # If no results are returned, stop.
      if (typeof(tsv) == "character" | nrow(tsv) == 0) {
        warning("No Variants")
        NA
      } else {
        # If no ANN column exists; Create a dummy one to unnest on.
        if (!("ANN" %in% colnames(tsv))) {
          tsv <- dplyr::mutate(tsv, ANN = "")
        }
        tsv <- tsv %>%
          dplyr::mutate(ANN=stringr::str_split(ANN, ",")) %>%
          tidyr::unnest(ANN) %>%
          {
            if (!is.null(ann_header))
              tidyr::separate(., ANN, into = ann_header, sep = "\\|") %>%
              dplyr::select(dplyr::one_of(c(base_header, ann_header)), dplyr::everything(), -extra)
            else
              dplyr::select(., dplyr::one_of(c(base_header)), dplyr::everything())
          } %>%
          dplyr::mutate(query = query, region = region) %>%
          dplyr::select(CHROM, POS, query, region, dplyr::everything())

        # For locus queries, filter out non-matching genes.
        if (query_type == "locus") {
          tsv <- dplyr::filter(tsv,
                               (query == gene_id) | (query == gene_name) | (query == feature_id))
        }

        # Filter impact (only if ANN present)
        if ("impact" %in% names(tsv)) {
          tsv <- tsv[tsv$impact %in% impact, ]
        } else if (impact != "ALL" | !is.na(impact)) {
          warning("Warning: No ANN column specified; Variants will not be filtered on impact.")
        }

        if (nrow(tsv) == 0) {
          message(paste("No Results for", region, "after filtering"))
        }

        if (long == FALSE) {
          tsv
        } else {
          tsv <- tidyr::gather_(tsv, "SAMPLE", "FORMAT_COLUMN", samples)  %>%
            tidyr::separate(FORMAT_COLUMN,
                            into = format,
                            sep = "\\!",
                            convert = TRUE,
                            remove = T) %>%
                            {
                              if ("DP" %in% format) dplyr::mutate(., DP = as.integer(ifelse( (DP == ".") | is.na(DP), 0, DP))) else .
                            } %>%
                            {
                              if ("TGT" %in% format)
                                tidyr::separate(.,
                                                TGT,
                                                sep = "\\/|\\|",
                                                into = c("a1", "a2"),
                                                convert = T,
                                                remove = T)
                              else
                                .
                            } %>%
                            {
                              if ("GT" %in% format)
                                tidyr::separate(.,
                                                GT,
                                                sep = "\\/|\\|",
                                                into = c("g1", "g2"),
                                                convert = T,
                                                remove = T) %>%
                                dplyr::mutate_at(as.integer, .vars = c("g1", "g2")) %>%
                                # Why this weird way? It's faster.
                                dplyr::mutate(genotype = as.integer(rowSums(.[, c("g1", "g2")])))
                              else
                                .
                            }

          # Fix NAs and convert columns
          tsv <- tsv %>%
            dplyr::mutate_all(function(x) { ifelse((x == ".") | (x == ""), NA, x)}) %>%
            # Info columns
            dplyr::mutate_at(.vars = info_columns[info_column_types == "Integer" & info_columns %in% info],
                             as.integer) %>%
            dplyr::mutate_at(.vars = info_columns[info_column_types == "Float" & info_columns %in% info],
                             as.numeric) %>%
            dplyr::mutate_at(.vars = info_columns[info_column_types == "Flag" & info_columns %in% info],
                             as.logical) %>%
            # Format Columns
            dplyr::mutate_at(.vars = format_columns[format_column_types == "Integer" & format_columns %in% format],
                             as.integer) %>%
            dplyr::mutate_at(.vars = format_columns[format_column_types == "Float" & format_columns %in% format],
                             as.numeric) %>%
            dplyr::mutate_at(.vars = format_columns[format_column_types == "Flag" & format_columns %in% format],
                             as.logical)

          column_order <- c("CHROM",
                            "POS",
                            "REF",
                            "ALT",
                            "SAMPLE",
                            "FILTER",
                            "FT",
                            "a1",
                            "a2",
                            "g1",
                            "g2",
                            "genotype",
                            "query",
                            "region")
          column_order_use <- c(column_order[column_order %in% names(tsv)],
                                names(tsv)[!names(tsv) %in% column_order])

          tsv <- tsv %>%
            dplyr::select_at(.vars = column_order_use) %>%
            dplyr::arrange(CHROM, POS)

          # Remove ANN field if its empty
          if (!("ANN" %in% info_columns)) {
            tsv <- tsv %>% dplyr::select(-ANN)
          }
        }
        tsv
      }
    }
  }
  ))
  results <- do.call(rbind, results)
  if (!"CHROM" %in% names(results)){
    warning("No Results")
    return(NA)
  }
  results <- results %>% dplyr::filter(!is.na(CHROM))
  results
}
AndersenLab/cegwas documentation built on March 6, 2020, 1:10 a.m.