R/ref_alt.R

Defines functions view_HGVS_C parse_ref_alt

Documented in parse_ref_alt view_HGVS_C

#' Parse REF and ALT AA Characters
#'
#' The HGVS_P is parsed according to 11 different rules, depending on the EFFECT,
#' and if there is a deletion, insertion or duplication.
#'
#' @param df A dataframe of imported snpSIFT data from mappgene output
#'
#' @export

parse_ref_alt <- function(df) {
  patterns <- c(
    "p\\.", "Phe", "Leu", "Ile", "Met", "Val", "Ser", "Pro", "Thr",
    "Ala", "Tyr", "His", "Gln", "Asn", "Lys", "Asp", "Glu", "Cys",
    "Trp", "Arg", "Gly"
  )

  replacement <- c(
    "", "F", "L", "I", "M", "V", "S", "P", "T", "A", "Y", "H",
    "Q", "N", "K", "D", "E", "C", "W", "R", "G"
  )

  df <- df |>
    dplyr::mutate(HGVS_P_modified = HGVS_P)

  a <- df |>
    dplyr::filter(EFFECT == "missense_variant") |>
    dplyr::mutate(HGVS_P_modified = stringr::str_replace_all(HGVS_P_modified, setNames(replacement, patterns))) |>
    tidyr::separate(HGVS_P_modified, sep = "[0-9]+", into = c("REF_AA", "ALT_AA"), remove = F) |>
    dplyr::filter(REF_AA != ALT_AA)

  b <- df |>
    dplyr::filter(EFFECT %in% c("conservative_inframe_deletion", "disruptive_inframe_deletion")) |>
    tidyr::separate(HGVS_P_modified, sep = "del", into = c("REF_AA", "ALT_AA"), remove = F) |>
    dplyr::mutate(REF_AA = stringr::str_replace_all(REF_AA, setNames(replacement, patterns))) |>
    dplyr::mutate(ALT_AA = stringr::str_replace_all(ALT_AA, setNames(replacement, patterns))) |>
    dplyr::mutate(ALT_AA = ifelse(ALT_AA == "", "del", ALT_AA))

  c <- df |>
    dplyr::filter(EFFECT %in% c("conservative_inframe_insertion", "disruptive_inframe_insertion")) |>
    dplyr::filter(grepl("dup", HGVS_P_modified)) |>
    dplyr::mutate(HGVS_P_modified = stringr::str_replace_all(HGVS_P_modified, setNames(replacement, patterns))) |>
    tidyr::separate(HGVS_P_modified, sep = "dup", into = c("REF_AA", "ALT_AA"), remove = F) |>
    dplyr::mutate(ALT_AA = "dup")

  d <- df |>
    dplyr::filter(EFFECT == "conservative_inframe_insertion") |>
    dplyr::filter(!grepl("dup", HGVS_P_modified)) |>
    tidyr::separate(HGVS_P_modified, sep = "ins", into = c("REF_AA", "ALT_AA"), remove = F) |>
    dplyr::mutate(REF_AA = stringr::str_replace_all(REF_AA, setNames(replacement, patterns))) |>
    dplyr::mutate(ALT_AA = stringr::str_replace_all(ALT_AA, setNames(replacement, patterns))) |>
    dplyr::mutate(ALT_AA = paste("ins", ALT_AA, sep = ""))

  e1 <- df |>
    dplyr::filter(EFFECT == "disruptive_inframe_insertion") |>
    dplyr::filter(!grepl("dup", HGVS_P_modified)) |>
    tidyr::separate(HGVS_P_modified, sep = "ins", into = c("REF_AA", "ALT_AA"), remove = F) |>
    # dplyr::filter(grepl("ins", ALT_AA)) |>
    dplyr::mutate(REF_AA = stringr::str_replace_all(REF_AA, setNames(replacement, patterns))) |>
    dplyr::mutate(ALT_AA = stringr::str_replace_all(ALT_AA, setNames(replacement, patterns)))

  e2 <- df |>
    dplyr::filter(EFFECT == "disruptive_inframe_insertion") |>
    dplyr::filter(!grepl("dup", HGVS_P_modified)) |>
    dplyr::filter(!grepl("del", HGVS_P_modified)) |>
    tidyr::separate(HGVS_P_modified, sep = "ins", into = c("REF_AA", "ALT_AA"), remove = F) |>
    dplyr::mutate(REF_AA = stringr::str_replace_all(REF_AA, setNames(replacement, patterns))) |>
    dplyr::mutate(ALT_AA = stringr::str_replace_all(ALT_AA, setNames(replacement, patterns))) |>
    dplyr::mutate(ALT_AA = paste("ins", ALT_AA, sep = ""))

  f <- df |>
    dplyr::filter(EFFECT == "stop_gained") |>
    tidyr::separate(HGVS_P_modified, sep = "[0-9]+", into = c("REF_AA", "ALT_AA"), remove = F) |>
    dplyr::mutate(REF_AA = stringr::str_replace_all(REF_AA, setNames(replacement, patterns)))

  g <- dplyr::bind_rows(lapply(list(a, b, c, d, e1, e2, f), function(x) if (nrow(x) == 0) NULL else x)) |>
    dplyr::arrange(POS) |>
    dplyr::select(-HGVS_P_modified)

  return(g)
}



#' View HGVS_C Effects
#'
#' Plot the consequences of HGVS_C against the wildtype (NC_045512.2) gene sequence
#'
#' @param HGVS_C A string from snpSIFT output
#' @param gene The gene name
#' @param window An approximate upstream and downstream window size for plotting
#'
#' @export

view_HGVS_C <- function(HGVS_C, gene = "S", window = 10) {
  genome_file <- system.file("extdata", "NC_045512.2.gbk.ffn", package = "MappgeneSummary")
  seqs <- Biostrings::readDNAStringSet(genome_file, "fasta")

  # logic to do substitutions
  if (grepl("del", HGVS_C)) {
    n <- stringr::str_extract_all(HGVS_C, "\\d+")
    start <- as.numeric(n[[1]][1])
    stop <- as.numeric(n[[1]][2])
    range <- IRanges::IRanges(start, stop)

    window_right <- floor(start(range) / 3) * 3 + window * 3

    S <- XVector::subseq(seqs[[gene]], 1, window_right)
    S_modified <- Biostrings::replaceAt(S, range, value = strrep("N", width(range)))
  } else if (grepl(">", HGVS_C)) {
    n <- stringr::str_extract(HGVS_C, "\\d+")
    start <- as.numeric(n[[1]][1])
    range <- IRanges::IRanges(start, start)

    window_right <- floor(start(range) / 3) * 3 + window * 3

    S <- XVector::subseq(seqs[[gene]], 1, window_right)

    S_modified <- Biostrings::replaceAt(S, range, value = str_sub(HGVS_C, -1))
  } else if (grepl("ins", HGVS_C)) {
    a <- stringr::str_split(HGVS_C, pattern = "ins|dup")[[1]]
    insertion <- a[2]
    n <- stringr::str_extract_all(a[1], "\\d+")
    start <- as.numeric(n[[1]][1])
    range <- IRanges::IRanges(start)
    y_range <- IRanges::IRanges(start, start + (nchar(insertion)))

    window_right <- floor(start(range) / 3) * 3 + window * 3

    S <- XVector::subseq(seqs[[gene]], 1, window_right)

    S <- Biostrings::replaceAt(S, range, value = paste0(as.character(XVector::subseq(S, start(range), end(range))), strrep("N", nchar(insertion))))
    S_modified <- Biostrings::replaceAt(S, y_range, value = paste0(as.character(XVector::subseq(S, start(range), end(range))), insertion))

    range <- y_range # just to get the window size right
  } else if (grepl("dup", HGVS_C)) {
    a <- stringr::str_split(HGVS_C, pattern = "ins|dup")[[1]]
    insertion <- a[2]
    n <- stringr::str_extract_all(a[1], "\\d+")
    start <- as.numeric(n[[1]][2])
    range <- IRanges::IRanges(start)
    y_range <- IRanges::IRanges(start, start + (nchar(insertion)))

    window_right <- floor(start(range) / 3) * 3 + window * 3

    S <- XVector::subseq(seqs[[gene]], 1, window_right)
    # S = seqs[[gene]]
    S <- Biostrings::replaceAt(S, range, value = paste0(as.character(XVector::subseq(S, start(range), end(range))), strrep("N", nchar(insertion))))
    S_modified <- Biostrings::replaceAt(S, y_range, value = paste0(as.character(XVector::subseq(S, start(range), end(range))), insertion))

    range <- y_range # just to get the window size right
  }

  aa_left <- floor(start(range) / 3) - window
  if (aa_left < 1) {
    aa_left <- 1
  }

  aa_right <- ceiling(end(range) / 3) + window
  if (aa_right > length(seqs[[gene]])) {
    aa_right <- length(seqs[[gene]])
  }

  gene_list <- list()
  gene_list[[paste(gene, "(NC_045512.2)")]] <- S
  gene_list[[paste(gene, "(modified)")]] <- S_modified

  dna <- Biostrings::DNAStringSet(x = gene_list)
  aa <- Biostrings::translate(dna, if.fuzzy.codon = "X")

  dna_aln <- Biostrings::DNAMultipleAlignment(msa(dna, gapExtension = 0))
  aa_aln <- Biostrings::AAMultipleAlignment(msa(aa, gapExtension = 0))


  a <- ggmsa::ggmsa(dna, aa_left * 3 - 2, aa_right * 3, color = "Chemistry_NT", font = "DroidSansMono", char_width = 0.5, seq_name = TRUE) + geom_msaBar()
  b <- ggmsa::ggmsa(aa_aln, aa_left, aa_right, color = "Chemistry_AA", font = "DroidSansMono", char_width = 0.5, seq_name = TRUE) + geom_msaBar()

  patchwork::wrap_plots(
    a$plotlist[[2]] / b$plotlist[[2]] +
      plot_annotation(
        title = paste(gene, HGVS_C)
      )
  )
}
jeffkimbrel/MappgeneSummary documentation built on Dec. 24, 2024, 3:12 p.m.