R/standard_tables.R

Defines functions .count_repeat .micro_left .micro_right .count2_del .count2_ins .count1 .get_indel_motifs create_ind83_table .table_exists_warning build_standard_table rc .gg_color_hue create_dbs78_table create_sbs192_table create_sbs96_table

Documented in build_standard_table create_dbs78_table create_sbs192_table create_sbs96_table rc

#' Uses a genome object to find context and generate standard SBS96 tables
#'
#' @param musica A \code{\linkS4class{musica}} object.
#' @param g A \linkS4class{BSgenome} object indicating which genome
#' reference the variants and their coordinates were derived from.
#' @param overwrite Overwrite existing count table
#' @return Returns the created SBS96 count table object
#' @keywords internal
create_sbs96_table <- function(musica, g, overwrite = FALSE) {
  dat <- subset_variant_by_type(variants(musica), type = "SBS")
  ref <- as.character(dat$ref)
  alt <- as.character(dat$alt)
  mut_type <- paste(ref, ">", alt, sep = "")

  # Mutation Context
  context <- BSgenome::getSeq(
    x = g, names = as.character(dat$chr),
    start = dat$start - 1,
    end = dat$end + 1,
    as.character = TRUE
  )

  final_mut_type <- rep(NA, length(ref))
  final_mut_context <- rep(NA, length(ref))

  # Get mutation context info for those on "+" strand
  forward_change <- c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G")
  ind <- mut_type %in% forward_change
  final_mut_type[ind] <- paste(as.character(ref[ind]), ">",
    as.character(alt[ind]),
    sep = ""
  )
  final_mut_context[ind] <- context[ind]

  # Get mutation context info for those on "-" strand
  rev_change <- c("A>G", "A>T", "A>C", "G>T", "G>C", "G>A")
  ind <- mut_type %in% rev_change

  # Reverse complement the context so only 6 mutation categories instead of 12
  rev_context <- as.character(Biostrings::reverseComplement(
    Biostrings::DNAStringSet(context[ind])
  ))
  rev_refbase <- as.character(Biostrings::reverseComplement(
    Biostrings::DNAStringSet(ref[ind])
  ))
  rev_altbase <- as.character(Biostrings::reverseComplement(
    Biostrings::DNAStringSet(alt[ind])
  ))

  final_mut_type[ind] <- paste0(rev_refbase, ">", rev_altbase)
  final_mut_context[ind] <- rev_context
  final_motif <- paste0(final_mut_type, "_", final_mut_context)

  ###### Now we separate into samples
  ## Define all mutation types for 96 substitution scheme
  b1 <- rep(rep(c("A", "C", "G", "T"), each = 4), 6)
  b2 <- rep(c("C", "T"), each = 48)
  b3 <- rep(c("A", "C", "G", "T"), 24)
  mut_trinuc <- apply(cbind(b1, b2, b3), 1, paste, collapse = "")
  mut <- rep(forward_change, each = 16)
  annotation <- data.frame(
    "motif" = paste0(mut, "_", mut_trinuc),
    "mutation" = mut,
    "context" = mut_trinuc
  )
  rownames(annotation) <- annotation$motif


  mut_id <- apply(cbind(mut, mut_trinuc), 1, paste,
    collapse = "_"
  )
  mutation <- factor(final_motif, levels = annotation$motif)

  mut_table <- as.matrix(as.data.frame.matrix(xtabs(~ mutation + dat$sample)))
  # xtabs adds dat$sample as dimname[2] so we remove it
  dimnames(mut_table) <- list(rownames(mut_table), colnames(mut_table))

  # Use COSMIC color scheme
  color_mapping <- c(
    "C>A" = "#5ABCEBFF",
    "C>G" = "#050708FF",
    "C>T" = "#D33C32FF",
    "T>A" = "#CBCACBFF",
    "T>C" = "#ABCD72FF",
    "T>G" = "#E7C9C6FF"
  )

  # Need to think about this more carefully - what happens if indel are zero but
  #  not SNV and the user wants to combine them?
  #  zero_samps <- which(colSums(mut_table) == 0)
  #  if (length(zero_samps) > 0) {
  #    warning(paste0("Dropping the following zero count samples: ",
  #                   paste(names(zero_samps), collapse = ", ")))
  #    mut_table <- mut_table[, -zero_samps, drop = FALSE]
  #  }
  tab <- .create_count_table(
    musica = musica, name = "SBS96",
    count_table = mut_table,
    annotation = annotation,
    features = data.frame(mutation = final_motif),
    type = as.character(dat$Variant_Type),
    color_variable = "mutation",
    color_mapping = color_mapping,
    description = paste0(
      "Single Base Substitution table with",
      " one base upstream and downstream"
    ), return_table = TRUE,
    overwrite = overwrite
  )
  return(tab)
}

#' Uses a genome object to find context and generate standard SBS192 table
#' using transcript strand
#'
#' @param musica Input samples
#' @param g A \linkS4class{BSgenome} object indicating which genome
#' reference the variants and their coordinates were derived from.
#' @param strand_type Transcript_Strand or Replication_Strand
#' @param overwrite Overwrite existing count table
#' @return Returns the created SBS192 count table object built using either
#' transcript strand or replication strand
#' @keywords internal
create_sbs192_table <- function(musica, g, strand_type, overwrite = FALSE) {
  if (!strand_type %in% c("Transcript_Strand", "Replication_Strand")) {
    stop("Please select either Transcript_Strand or Replication_Strand")
  }
  dat <- variants(musica)
  dat <- subset_variant_by_type(dat, "SBS")
  dat <- drop_na_variants(dat, strand_type)

  chr <- dat$chr
  range_start <- dat$start
  range_end <- dat$end
  lflank <- VariantAnnotation::getSeq(g, chr, range_start - 1, range_start - 1,
    as.character = TRUE
  )
  rflank <- VariantAnnotation::getSeq(g, chr, range_end + 1, range_end + 1,
    as.character = TRUE
  )
  ref_context <- paste(lflank, dat$ref, rflank, sep = "")

  final_mut_type <- rep(NA, nrow(dat))
  final_mut_context <- rep(NA, nrow(dat))

  ## Get mutation type
  initial_maf_type <- paste(dat$ref, ">", dat$alt,
    sep = ""
  )

  ## Get mutation context info for those on "+" strand
  forward_change <- c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G")
  ind <- dat$Variant_Type == "SBS" & initial_maf_type %in% forward_change

  final_mut_type[ind] <- initial_maf_type[ind]
  final_mut_context[ind] <- ref_context[ind]

  ## Get mutation context info for those on "-" strand
  rev_change <- c("A>G", "A>T", "A>C", "G>T", "G>C", "G>A")
  ind <- dat$Variant_Type == "SBS" & initial_maf_type %in% rev_change

  ## Reverse complement the context so only 6 mutation categories instead of 12
  rev_context <- Biostrings::reverseComplement(Biostrings::DNAStringSet(
    ref_context[ind]
  ))
  rev_refbase <- Biostrings::reverseComplement(Biostrings::DNAStringSet(
    dat$ref[ind]
  ))
  rev_altbase <- Biostrings::reverseComplement(Biostrings::DNAStringSet(
    dat$alt[ind]
  ))

  final_mut_type[ind] <- paste(as.character(rev_refbase), ">",
    as.character(rev_altbase),
    sep = ""
  )
  final_mut_context[ind] <- rev_context

  maf_mut_id <- paste(final_mut_type, final_mut_context, dat[[strand_type]],
    sep = "_"
  )
  tumor_id <- dat$sample

  ## Define all mutation types for 196 substitution scheme
  b1 <- rep(rep(c("A", "C", "G", "T"), each = 24), 2)
  b2 <- rep(rep(c("C", "T"), each = 12), 8)
  b3 <- rep(c("A", "C", "G", "T"), 48)
  mut_trinuc <- apply(cbind(b1, b2, b3), 1, paste, collapse = "")
  mut_type <- rep(rep(rep(forward_change, each = 4), 4), 2)
  if (strand_type == "Transcript_Strand") {
    mut_strand <- rep(c("T", "U"), each = 96)
  } else if (strand_type == "Replication_Strand") {
    mut_strand <- rep(c("leading", "lagging"), each = 96)
  }
  mut_id <- apply(cbind(mut_type, mut_trinuc, mut_strand), 1, paste,
    collapse = "_"
  )

  mutation <- factor(maf_mut_id, levels = mut_id)

  mut_table <- xtabs(~ mutation + tumor_id)

  # Convert to table by dropping xtabs class and call
  attr(mut_table, "call") <- NULL
  attr(mut_table, "class") <- NULL

  annotation <- data.frame(
    motif = mut_id, mutation =
      unlist(lapply(strsplit(mut_id, "_"), "[[", 1)),
    context = paste(
      unlist(lapply(
        strsplit(mut_id, "_"),
        "[[", 2
      )),
      unlist(lapply(
        strsplit(mut_id, "_"),
        "[[", 3
      )),
      sep = "_"
    ),
    row.names = mut_id
  )
  color_mapping <- .gg_color_hue(length(unique(annotation$mutation)))
  names(color_mapping) <- unique(annotation$mutation)
  tab <- .create_count_table(
    musica = musica, name = paste0("SBS192_", ifelse(
      strand_type == "Transcript_Strand", "Trans", "Rep"
    )),
    count_table = mut_table,
    annotation = annotation,
    features = data.frame(mutation = maf_mut_id),
    type = paste(as.character(dat$Variant_Type),
                 ifelse(strand_type ==
                          "Transcript_Strand", "Trans", "Rep"), sep = "_"),
    color_variable = "mutation",
    color_mapping = color_mapping,
    description = paste0(
      "Single Base Substitution ",
      "table with one base ",
      "upstream and downstream and",
      " transcript strand"
    ),
    return_table = TRUE, overwrite = overwrite
  )
  return(tab)
}

#' Creates and adds a table for standard doublet base subsitution (DBS)
#'
#' @param musica A \code{\linkS4class{musica}} object.
#' @param overwrite Overwrite existing count table
#' @return Returns the created DBS table object
#' @keywords internal
create_dbs78_table <- function(musica, overwrite = overwrite, verbose) {
  dbs <- subset_variant_by_type(variants(musica), "DBS")

  ref <- dbs$ref
  alt <- dbs$alt

  # Reverse Complement broad categories to the other strand
  rc_ref <- which(ref %in% c("GT", "GG", "AG", "GA", "CA", "AA"))
  ref[rc_ref] <- rc(ref[rc_ref])
  alt[rc_ref] <- rc(alt[rc_ref])

  # Reverse complement more specific categories to the other strand
  rc_alt <- NULL
  rc_ref_AT <- which(ref == "AT")
  rc_alt <- c(rc_alt, rc_ref_AT[which(alt[rc_ref_AT] %in% c("TG", "GG", "TC"))])

  rc_ref_CG <- which(ref == "CG")
  rc_alt <- c(rc_alt, rc_ref_CG[which(alt[rc_ref_CG] %in% c("AC", "GA", "AA"))])

  rc_ref_GC <- which(ref == "GC")
  rc_alt <- c(rc_alt, rc_ref_GC[which(alt[rc_ref_GC] %in% c("TT", "CT", "TG"))])

  rc_ref_TA <- which(ref == "TA")
  rc_alt <- c(rc_alt, rc_ref_TA[which(alt[rc_ref_TA] %in% c("AG", "CC", "AC"))])

  if (length(rc_alt) > 0) {
    alt[rc_alt] <- rc(alt[rc_alt])
  }

  full <- paste(ref, ">NN_", alt, sep = "")
  full_motif <- c(
    paste0("AC>NN", "_", c(
      "CA", "CG", "CT", "GA", "GG", "GT",
      "TA", "TG", "TT"
    )),
    paste0("AT>NN", "_", c("CA", "CC", "CG", "GA", "GC", "TA")),
    paste0("CC>NN", "_", c(
      "AA", "AG", "AT", "GA", "GG", "GT", "TA",
      "TG", "TT"
    )),
    paste0("CG>NN", "_", c("AT", "GC", "GT", "TA", "TC", "TT")),
    paste0("CT>NN", "_", c(
      "AA", "AC", "AG", "GA", "GC", "GG", "TA",
      "TC", "TG"
    )),
    paste0("GC>NN", "_", c("AA", "AG", "AT", "CA", "CG", "TA")),
    paste0("TA>NN", "_", c("AT", "CG", "CT", "GC", "GG", "GT")),
    paste0("TC>NN", "_", c(
      "AA", "AG", "AT", "CA", "CG", "CT", "GA",
      "GG", "GT"
    )),
    paste0("TG>NN", "_", c(
      "AA", "AC", "AT", "CA", "CC", "CT", "GA",
      "GC", "GT"
    )),
    paste0("TT>NN", "_", c(
      "AA", "AC", "AG", "CA", "CC", "CG", "GA",
      "GC", "GG"
    ))
  )

  sample_names <- sample_names(musica)
  num_samples <- length(sample_names)
  variant_tables <- vector("list", length = num_samples)
  if (isTRUE(verbose)) {
    pb <- utils::txtProgressBar(
      min = 0, max = num_samples, initial = 0,
      style = 3
    )
    i <- 0
  }
  for (i in seq_len(num_samples)) {
    sample_index <- which(dbs$sample == sample_names[i])
    if (length(sample_index) > 0) {
      variant_tables[[i]] <- table(factor(full[sample_index],
        levels = full_motif
      ))
    } else {
      variant_tables[[i]] <- rep(0, length(full_motif))
    }
    if (isTRUE(verbose)) {
      i <- i + 1
      utils::setTxtProgressBar(pb, i)
    }
  }
  mut_table <- do.call(cbind, variant_tables)
  colnames(mut_table) <- sample_names

  full_tab <- matrix(
    data = 0, nrow = length(full_motif),
    ncol = length(levels(dbs$sample)),
    dimnames = list(full_motif, levels(dbs$sample))
  )
  full_tab[, colnames(mut_table)] <- mut_table

  annotation <- data.frame(
    motif = full_motif, mutation =
      unlist(lapply(strsplit(full_motif, "_"), "[[", 1)),
    context = unlist(lapply(
      strsplit(full_motif, "_"),
      "[[", 2
    )),
    row.names = full_motif
  )
  color_mapping <- .gg_color_hue(length(unique(annotation$mutation)))
  names(color_mapping) <- unique(annotation$mutation)
  tab <- .create_count_table(
    musica = musica, name = "DBS78",
    count_table = full_tab,
    annotation = annotation,
    features = data.frame(mutation = full),
    type = as.character(dbs$Variant_Type),
    color_variable = "mutation",
    color_mapping = color_mapping,
    description = paste0(
      "Standard count table for ",
      "double base substitutions",
      "using COSMIC v3 schema"
    ),
    return_table = TRUE, overwrite = overwrite
  )
  return(tab)
}

.gg_color_hue <- function(n) {
  hues <- base::seq(15, 375, length = n + 1)
  return(grDevices::hcl(h = hues, l = 65, c = 100)[seq_len(n)])
}

#' Reverse complement of a string using biostrings
#'
#' @param dna Input DNA string
#' @return Returns the reverse compliment of the input DNA string
#' @examples
#' rc("ATGC")
#' @export
rc <- function(dna) {
  if (is(dna, "character") && length(dna) == 1) {
    rev_com <- as.character(Biostrings::reverseComplement(
      Biostrings::DNAString(dna)
    ))
  } else if (is(dna, "character") && length(dna) > 1) {
    rev_com <- vapply(dna, rc, FUN.VALUE = character(1))
    names(rev_com) <- NULL
  } else {
    stop("Must be character or character vector")
  }
  return(rev_com)
}

#' Builds count tables using various mutation type schemas
#'
#' Generates count tables for different mutation type schemas which can be
#' used as input to the mutational signature discovery or prediction functions.
#' \code{"SBS96"} generates a table for single base substitutions following the
#' standard 96 mutation types derived from the trinucleotide context.
#' \code{"SBS192"} is the 96 mutation type schema with the addition of
#' transcriptional strand or replication strand information added to each base.
#' \code{"DBS"} generates a table for the double base substitution schema
#' used in COSMIC V3. \code{"Indel"} generates a table for insertions and
#' deletions following the schema used in COSMIC V3.
#'
#' @param musica A \code{\linkS4class{musica}} object.
#' @param g A \linkS4class{BSgenome} object indicating which genome
#' reference the variants and their coordinates were derived from.
#' @param modality Modality of table to build. One of \code{"SBS96"},
#' \code{"SBS192"}, \code{"DBS"}, or \code{"Indel"}.
#' @param strand_type Strand type to use in SBS192 schema. One of
#' \code{"Transcript_Strand"} or \code{"Replication_Strand"}.
#' Only used if \code{modality = SBS192}.
#' @param overwrite If \code{TRUE}, any existing count table with the same
#' name will be overwritten. If \code{FALSE}, then an error will be thrown
#' if a table with the same name exists within the \code{musica} object.
#' @param verbose Show progress bar for processed samples
#' @param table_name Use modality instead
#' @return No object will be returned. The count tables will be automatically
#' added to the \code{musica} object.
#' @examples
#' g <- select_genome("19")
#'
#' data(musica)
#' build_standard_table(musica, g, "SBS96", overwrite = TRUE)
#'
#' data(musica)
#' annotate_transcript_strand(musica, "19")
#' build_standard_table(musica, g, "SBS192", "Transcript_Strand")
#'
#' data(musica)
#' data(rep_range)
#' annotate_replication_strand(musica, rep_range)
#' build_standard_table(musica, g, "SBS192", "Replication_Strand")
#'
#' data(dbs_musica)
#' build_standard_table(dbs_musica, g, "DBS", overwrite = TRUE)
#'
#' data(indel_musica)
#' build_standard_table(indel_musica, g, modality = "INDEL")
#' @export
build_standard_table <- function(musica, g, modality, strand_type = NULL,
                                 overwrite = FALSE, verbose = FALSE,
                                 table_name = NULL) {
  if (!is.null(table_name)) {
    warning("table_name argument deprecated. Use modality instead.")
    modality <- table_name
  }

  if (modality %in% c("SNV96", "SNV", "96", "SBS", "SBS96")) {
    message("Building count table from SBS with SBS96 schema")
    .table_exists_warning(musica, "SBS96", overwrite)
    tab <- create_sbs96_table(musica, g, overwrite)
  } else if (modality %in% c("SBS192", "192")) {
    message(
      "Building count table from SBS and ", strand_type,
      " with SBS192 schema"
    )
    .table_exists_warning(musica, "SBS192", overwrite)
    tab <- create_sbs192_table(musica, g, strand_type, overwrite)
  } else if (modality %in% c("DBS78", "DBS", "doublet")) {
    message("Building count table from DBS with DBS78 schema")
    .table_exists_warning(musica, "DBS78", overwrite)
    tab <- create_dbs78_table(musica, overwrite, verbose)
  } else if (modality %in% c(
    "IND83", "INDEL83", "INDEL", "IND", "indel",
    "Indel"
  )) {
    message("Building count table from INDELs with IND83 schema")
    .table_exists_warning(musica, "IND83", overwrite)
    tab <- create_ind83_table(musica, g, overwrite, verbose)
  } else {
    stop(paste0(
      "There is no standard table named: ", modality,
      " please select from SBS96, SBS192, DBS78, IND83."
    ))
  }
  tab_list <- tables(musica)
  tab_list[[get_tab_name(tab)]] <- tab
  eval.parent(substitute(tables(musica) <- tab_list))
}

.table_exists_warning <- function(musica, modality, overwrite = FALSE) {
  if (modality %in% names(musica@count_tables)) {
    if (!overwrite) {
      stop(paste0(
        "Table: ", modality,
        " already exists, use overwrite to continue."
      ))
    } else {
      warning(paste0("Overwriting counts table: ", modality))
    }
  }
}

create_ind83_table <- function(musica, g, overwrite = FALSE,
                               verbose = FALSE) {
  var <- variants(musica)
  all_ins <- as.data.frame(subset_variant_by_type(var, "INS"))
  all_del <- as.data.frame(subset_variant_by_type(var, "DEL"))
  # samples <- unique(c(as.character(all_ins$sample),
  #                    as.character(all_del$sample)))
  all_samples <- sample_names(musica)

  ins_len <- nchar(all_ins$alt)
  del_len <- nchar(all_del$ref)
  ins1 <- all_ins[which(ins_len == 1), ]
  ins2 <- all_ins[which(ins_len > 1), ]

  del1 <- all_del[which(del_len == 1), ]
  del2 <- all_del[which(del_len > 1), ]

  if (nrow(del1) == 0) {
    m <- .get_indel_motifs("bp1", 0, 0)
    del1_counts <- matrix(0,
      nrow = length(m), ncol = length(all_samples),
      dimnames = list(m, all_samples)
    )
  } else {
    del1_counts <- .count1(mut = del1, type = del1$ref, ins = FALSE, g = g)
    del1_ta <- table(del1_counts, del1$sample)
  }

  if (nrow(ins1) == 0) {
    m <- .get_indel_motifs("bp1", 1, 0)
    ins1_counts <- matrix(0,
      nrow = length(m), ncol = length(all_samples),
      dimnames = list(m, all_samples)
    )
  } else {
    ins1_counts <- .count1(mut = ins1, type = ins1$alt, ins = TRUE, g = g)
    ins1_ta <- table(ins1_counts, ins1$sample)
  }

  if (nrow(ins2) == 0) {
    m <- .get_indel_motifs("ins", NA, NA)
    ins2_counts <- matrix(0,
      nrow = length(m), ncol = length(all_samples),
      dimnames = list(m, all_samples)
    )
  } else {
    ins2_counts <- .count2_ins(mut = ins2, type = ins2$alt, g = g)
    ins2_ta <- table(ins2_counts, ins2$sample)
  }

  if (nrow(del2) == 0) {
    m1 <- .get_indel_motifs("del", NA, NA)
    m2 <- .get_indel_motifs("micro", NA, NA)
    del2_counts <- list(
      del =
        matrix(0,
          nrow = length(m1),
          ncol = length(all_samples),
          dimnames = list(m1, all_samples)
        ),
      micro =
        matrix(0,
          nrow = length(m2),
          ncol = length(all_samples),
          dimnames = list(m2, all_samples)
        )
    )
  } else {
    del2_counts <- .count2_del(mut = del2, type = del2$ref, g)
  }

  dimlist <- list(
    row_names = c(
      .get_indel_motifs("bp1", 0, 0),
      .get_indel_motifs("bp1", 1, 0),
      .get_indel_motifs("del", NA, NA),
      .get_indel_motifs("ins", NA, NA),
      .get_indel_motifs("micro", NA, NA)
    ),
    column_names = levels(all_samples)
  )
  mut_table <- matrix(
    rbind(
      del1_ta, ins1_ta, del2_counts$del,
      ins2_ta, del2_counts$micro
    ),
    nrow = 83,
    ncol = length(levels(all_samples)),
    dimnames = dimlist
  )
  motif <- rownames(mut_table)

  mutation <- c(
    substr(motif[seq_len(24)], 1, 5),
    paste(unlist(lapply(strsplit(motif[25:83], "_"), "[[", 1)),
      unlist(lapply(strsplit(motif[25:83], "_"), "[[", 2)),
      unlist(lapply(strsplit(motif[25:83], "_"), "[[", 3)),
      sep = "_"
    )
  )
  context <- c(
    substr(motif[seq_len(24)], 7, 9),
    unlist(lapply(strsplit(motif[25:83], "_"), "[[", 3))
  )
  annotation <- data.frame(
    motif = motif, mutation = mutation,
    context = context
  )

  color_mapping <- .gg_color_hue(length(unique(annotation$mutation)))
  names(color_mapping) <- unique(annotation$mutation)

  # TODO error in counting, we're missing some
  # incorrect_features <- length(rep(rownames(mut_table), rowSums(mut_table)))
  dummy_features <- rep(NA, length(which(var$Variant_Type %in% c(
    "INS",
    "DEL"
  ))))

  tab <- .create_count_table(
    musica = musica, name = "IND83",
    count_table = mut_table,
    annotation = annotation,
    features = data.frame(mutation = dummy_features),
    type = Rle(rep("INDEL", length(which(
      var$Variant_Type %in% c("INS", "DEL")
    )))),
    color_variable = "mutation",
    color_mapping = color_mapping,
    description = paste0(
      "Standard count table for ",
      "small insertions and",
      " deletions"
    ),
    return_table = TRUE, overwrite = overwrite
  )
  return(tab)
}

.get_indel_motifs <- function(indel, ins, plus) {
  if (indel == "bp1") {
    return(paste(ifelse(ins, "INS", "DEL"), c(
      "C", "C", "C", "C", "C", "C", "T",
      "T", "T", "T", "T", "T"
    ), 1,
    c(c(
      0, 1, 2, 3, 4, paste0(5 + plus, "+"), 0, 1, 2, 3, 4,
      paste0(5 + plus, "+")
    )),
    sep = "_"
    ))
  } else if (indel == "ins") {
    return(paste(paste("INS_repeats", c(
      "2", "2", "2", "2", "2", "2", "3", "3", "3", "3",
      "3", "3", "4", "4", "4", "4", "4", "4", "5+",
      "5+", "5+", "5+", "5+", "5+"
    ),
    c(
      "0", "1", "2", "3", "4", "5+", "0", "1", "2", "3", "4",
      "5+", "0", "1", "2", "3", "4", "5+", "0", "1", "2", "3",
      "4", "5+"
    ),
    sep = "_"
    )))
  } else if (indel == "micro") {
    return(paste("DEL_MH", c(
      "2", "3", "3", "4", "4", "4", "5+", "5+",
      "5+", "5+", "5+"
    ),
    c(
      "1", "1", "2", "1", "2", "3", "1", "2",
      "3", "4", "5+"
    ),
    sep = "_"
    ))
  } else if (indel == "del") {
    return(paste("DEL_repeats", c(
      "2", "2", "2", "2", "2", "2", "3", "3", "3", "3",
      "3", "3", "4", "4", "4", "4", "4", "4", "5+",
      "5+", "5+", "5+", "5+", "5+"
    ),
    c(
      "0", "1", "2", "3", "4", "5+", "0", "1", "2", "3", "4",
      "5+", "0", "1", "2", "3", "4", "5+", "0", "1", "2", "3",
      "4", "5+"
    ),
    sep = "_"
    ))
  } else {
    stop("Unrecognized indel type")
  }
}

.count1 <- function(mut, type, ins, g) {
  # ifelse(ins, plus <- 0, plus <- 0)
  ifelse(ins, plus <- 0, plus <- 0)
  chr <- mut$chr
  range_start <- mut$start
  range_end <- mut$end
  lflank <- VariantAnnotation::getSeq(g, chr, range_start - 7, range_start - 1,
    as.character = TRUE
  )
  rflank <- VariantAnnotation::getSeq(g, chr, range_end + 1, range_end + 7,
    as.character = TRUE
  )
  final_lflank <- lflank
  final_rflank <- rflank
  ind <- which(type %in% c("A", "G"))
  final_lflank[ind] <- rflank[ind] %>%
    Biostrings::DNAStringSet() %>%
    Biostrings::reverseComplement() %>%
    as.character()
  final_rflank[ind] <- lflank[ind] %>%
    Biostrings::DNAStringSet() %>%
    Biostrings::reverseComplement() %>%
    as.character()
  final_type <- type
  final_type[ind] <- final_type[ind] %>%
    Biostrings::DNAStringSet() %>%
    Biostrings::reverseComplement() %>%
    as.character()
  repeats <- rep(NA, length(final_type))
  for (i in seq_len(length(type))) {
    repeats[i] <-
      .count_repeat(letter = final_type[i], string = final_rflank[i]) +
      .count_repeat(final_type[i], stringi::stri_reverse(final_lflank[i])) +
      plus
  }
  repeats[repeats >= 5 + plus] <- paste0(5 + plus, "+")
  bp1_motif <- .get_indel_motifs("bp1", ins, plus)
  return(factor(paste(ifelse(ins, "INS", "DEL"), final_type, 1,
                      repeats, sep = "_"), levels = bp1_motif))
}

.count2_ins <- function(mut, type, g) {
  chr <- mut$chr
  range_start <- mut$start
  range_end <- mut$end
  lflank <- VariantAnnotation::getSeq(g, chr, range_start - 10, range_start - 1,
    as.character = TRUE
  )
  rflank <- VariantAnnotation::getSeq(g, chr, range_end + 1, range_end + 10,
    as.character = TRUE
  )
  repeats <- rep(NA, length(type))
  for (i in seq_len(length(type))) {
    repeats[i] <- .count_repeat(type[i], rflank[i]) +
      .count_repeat(type[i], stringi::stri_reverse(lflank[i]))
  }
  repeats[repeats >= 5] <- paste0(5, "+")
  len <- nchar(type)
  len[which(len >= 5)] <- "5+"
  ins_motif <- .get_indel_motifs("ins", NA, NA)
  return(factor(paste("INS_repeats", len, repeats, sep = "_"),
    levels = ins_motif
  ))
}

.count2_del <- function(mut, type, g) {
  chr <- mut$chr
  range_start <- mut$start
  range_end <- mut$end

  len <- nchar(type)
  lflank <- VariantAnnotation::getSeq(g, chr, range_start - len * 5,
    range_start - 1,
    as.character = TRUE
  )
  rflank <- VariantAnnotation::getSeq(g, chr, range_end + 1,
    range_end + len * 5,
    as.character = TRUE
  )
  # has_repeat <- type == lflank | type == rflank #could be used for a speedup
  # maybe_micro <- which(!has_repeat) #doesn't get used from some reason #TODO

  micro <- rep(NA, length(type))
  for (i in seq_len(length(type))) {
    micro[i] <- max(
      .micro_left(type[i], lflank[i]),
      .micro_right(type[i], rflank[i])
    )
  }

  repeats <- rep(NA, length(type))
  for (i in seq_len(length(type))) {
    # TEST removing the +1 for dels to bring it in line with Alexandrov counting
    # repeats[i] <- .count_repeat(type[i], rflank[i]) +
    #  .count_repeat(type[i], rev(lflank[i]))
    repeats[i] <- .count_repeat(type[i], rflank[i]) +
      .count_repeat(type[i], stringi::stri_reverse(lflank[i]))
  }
  # micro_ind <- which(repeats == 0 & micro > 0)
  micro_ind <- which(repeats == 0 & micro > 0)
  repeat_ind <- which(repeats > 0 | (repeats == 0 & micro == 0))
  final_micro <- micro[micro_ind]
  final_repeats <- repeats[repeat_ind]
  # final_repeats[final_repeats >= 5] <- paste0(5, "+")
  final_repeats[final_repeats >= 5] <- paste0(5, "+")
  final_len <- len
  final_len[which(final_len >= 5)] <- "5+"
  final_micro[which(final_micro >= 5)] <- "5+"
  micro_motif <- .get_indel_motifs("micro", NA, NA)
  del_motif <- .get_indel_motifs("del", NA, NA)
  del_temp <- factor(paste("DEL_repeats", final_len[repeat_ind],
                           final_repeats, sep = "_"), levels = del_motif)
  del_tab <- table(del_temp, mut$sample[repeat_ind])
  micro_temp <- factor(paste("DEL_MH", final_len[micro_ind], final_micro,
                             sep = "_"), levels = micro_motif)
  micro_tab <- table(micro_temp, mut$sample[micro_ind])
  return(list(del = del_tab, micro = micro_tab))
}

.micro_right <- function(letter, string) {
  len <- nchar(letter)
  elem <- len - 1
  while (elem > 0) {
    if (substr(letter, 1, elem) == substr(string, 1, elem)) {
      return(elem)
    } else {
      elem <- elem - 1
    }
  }
  return(0)
}

.micro_left <- function(letter, string) {
  len <- nchar(letter)
  elem <- 1
  while (elem < len) {
    if (substr(letter, elem, len) == substr(string, elem, len)) {
      return(elem)
    } else {
      elem <- elem + 1
    }
  }
  return(0)
}

.count_repeat <- function(letter, string) {
  len <- nchar(letter)
  if (letter != substr(string, 1, len)) {
    return(0)
  } else {
    next_matches <- TRUE
    counts <- 1
    while (next_matches) {
      # message(paste0("Counts: ", counts))
      if (letter != substr(string, counts * len + 1, counts * len + len)) {
        return(counts)
      } else {
        counts <- counts + 1
      }
    }
  }
}
campbio/musicatk documentation built on Dec. 25, 2024, 9:34 p.m.