R/generate_matrices.R

Defines functions generate_matrix_SBS

# SBS ---------------------------------------------------------------------

generate_matrix_SBS <- function(query, ref_genome, genome_build = "hg19", add_trans_bias = FALSE) {
  send_info("SBS matrix generation - start.")
  ## T: and U: are basically same to sigprofiler, the other two types (N: and B:) are
  ## a bit different, may be caused by different annotations.
  query <- query[query$Variant_Type == "SNP"]
  if (nrow(query) == 0) {
    stop("Zero SNPs to analyze!")
  }

  extract.tbl <- data.table::data.table(
    Chromosome = query$Chromosome, Start = query$Start_Position, End = query$End_Position,
    Reference_Allele = query$Reference_Allele, Tumor_Seq_Allele2 = query$Tumor_Seq_Allele2,
    Tumor_Sample_Barcode = query$Tumor_Sample_Barcode, upstream = query$Start_Position - 20,
    downstream = query$End_Position + 20
  )

  send_info("Extracting 5' and 3' adjacent bases.")
  ss <- BSgenome::getSeq(
    x = ref_genome,
    names = extract.tbl$Chromosome,
    start = pmax(extract.tbl$Start - 2, 1L),
    end = extract.tbl$End + 2,
    as.character = TRUE
  )

  send_info("Extracting +/- 20bp around mutated bases for background C>T estimation.")
  updwn <- BSgenome::getSeq(
    x = ref_genome, names = extract.tbl$Chromosome, start = pmax(extract.tbl$upstream, 1L),
    end = extract.tbl$downstream, as.character = FALSE
  )
  updwn.alphFreq <- data.table::as.data.table(BSgenome::alphabetFrequency(x = updwn))[, c("A", "T", "G", "C")] # Nucleotide frequency
  updwn.tnmFreq <- data.table::as.data.table(Biostrings::trinucleotideFrequency(x = updwn, step = 1))

  extract.tbl$pentanucleotide <- as.character(ss)
  extract.tbl$trinucleotide <- substr(extract.tbl$pentanucleotide, 2, 4)
  extract.tbl$updown <- as.character(updwn)

  extract.tbl <- cbind(extract.tbl, updwn.alphFreq)
  extract.tbl <- cbind(extract.tbl, updwn.tnmFreq[, c("TCA", "TCT", "AGA", "TGA")])
  extract.tbl[, tcw := rowSums(extract.tbl[, c("TCA", "TCT")])]
  extract.tbl[, wga := rowSums(extract.tbl[, c("TGA", "AGA")])]

  ## All combinations
  extract.tbl[, Substitution := paste(extract.tbl$Reference_Allele, extract.tbl$Tumor_Seq_Allele2, sep = ">")]
  extract.tbl$SubstitutionMotif <- paste0(
    substr(x = as.character(extract.tbl$pentanucleotide), 1, 2),
    "[", extract.tbl$Substitution, "]",
    substr(as.character(extract.tbl$pentanucleotide), 4, 5)
  )
  extract.tbl$TriSubstitutionMotif <- substr(extract.tbl$SubstitutionMotif, 2, 8)

  # substitutions are referred to by the pyrimidine of the mutated Watson-Crick base pair
  conv <- c("T>C", "T>C", "C>T", "C>T", "T>A", "T>A", "T>G", "T>G", "C>A", "C>A", "C>G", "C>G")
  names(conv) <- c("A>G", "T>C", "C>T", "G>A", "A>T", "T>A", "A>C", "T>G", "C>A", "G>T", "C>G", "G>C")
  complement <- c("A", "C", "G", "T")
  names(complement) <- c("T", "G", "C", "A")


  extract.tbl$SubstitutionType <- conv[extract.tbl$Substitution]
  # need to reverse-complement triplet for mutated purines (not just the middle base)
  extract.tbl$should_reverse <- extract.tbl$Substitution != extract.tbl$SubstitutionType
  extract.tbl$SubstitutionTypeMotif <- ifelse(extract.tbl$should_reverse,
    paste0(
      complement[substr(x = extract.tbl$pentanucleotide, 5, 5)],
      complement[substr(x = extract.tbl$pentanucleotide, 4, 4)],
      "[", extract.tbl$SubstitutionType, "]",
      complement[substr(x = extract.tbl$pentanucleotide, 2, 2)],
      complement[substr(x = extract.tbl$pentanucleotide, 1, 1)]
    ),
    paste0(
      substr(x = as.character(extract.tbl$pentanucleotide), 1, 2),
      "[", extract.tbl$SubstitutionType, "]",
      substr(as.character(extract.tbl$pentanucleotide), 4, 5)
    )
  )
  extract.tbl$TriSubstitutionTypeMotif <- substr(extract.tbl$SubstitutionTypeMotif, 2, 8)


  # Possible substitution types after being referred to by the pyrimidine of the mutated Watson-Crick base pair

  # penta_comb <- expand.grid(
  #   complement,
  #   complement,
  #   "[",
  #   unique(as.character(conv)),
  #   "]",
  #   complement,
  #   complement,
  #   stringsAsFactors = FALSE
  # ) %>%
  #   apply(1, paste0, collapse = "") %>%
  #   unique()

  penta_comb <- vector_to_combination(
    complement,
    complement,
    "[",
    unique(as.character(conv)),
    "]",
    complement,
    complement
  )

  tri_comb <- substr(penta_comb, 2, 8) %>%
    unique()

  tri_comb2 <- vector_to_combination(
    complement,
    "[",
    unique(c(as.character(conv), names(conv))),
    "]",
    complement
  )

  # Set levels for type (mainly component)
  extract.tbl$SubstitutionType <- factor(extract.tbl$SubstitutionType, levels = unique(as.character(conv)))
  extract.tbl$TriSubstitutionTypeMotif <- factor(extract.tbl$TriSubstitutionTypeMotif, levels = tri_comb)
  extract.tbl$SubstitutionTypeMotif <- factor(extract.tbl$SubstitutionTypeMotif, levels = penta_comb)

  extract.tbl$TriSubstitutionMotif <- factor(extract.tbl$TriSubstitutionMotif, levels = tri_comb2)
  extract.tbl$Substitution <- factor(extract.tbl$Substitution, levels = unique(c(as.character(conv), names(conv))))

  # Compile data
  ## This is nucleotide frequcny and motif frequency across 41 bp in C>T and C>G context.
  apobecSummary <- extract.tbl[
    as.character(extract.tbl$SubstitutionType) %in% c("C>T", "C>G"),
    .(
      A = sum(A), T = sum(T), G = sum(G), C = sum(C), tcw = sum(tcw),
      wga = sum(wga), bases = sum(A, T, G, C)
    ), Tumor_Sample_Barcode
  ]
  ## The by operation may remove some samples here
  ## should it be added??

  ## This is per sample conversion events
  sub.tbl <- extract.tbl[, .N, list(Tumor_Sample_Barcode, Substitution)]
  sub.tbl <- data.table::dcast(data = sub.tbl, formula = Tumor_Sample_Barcode ~ Substitution, fill = 0, value.var = "N", drop = FALSE)
  sub.tbl[, n_A := rowSums(sub.tbl[, c("A>C", "A>G", "A>T")], na.rm = TRUE)][, n_T := rowSums(sub.tbl[, c("T>A", "T>C", "T>G")], na.rm = TRUE)][, n_G := rowSums(sub.tbl[, c("G>A", "G>C", "G>T")], na.rm = TRUE)][, n_C := rowSums(sub.tbl[, c("C>A", "C>G", "C>T")], na.rm = TRUE)]
  sub.tbl[, n_mutations := rowSums(sub.tbl[, c("n_A", "n_T", "n_G", "n_C")], na.rm = TRUE)]
  sub.tbl[, "n_C>G_and_C>T" := rowSums(sub.tbl[, c("C>G", "G>C", "C>T", "G>A")], na.rm = TRUE)] # number of APOBEC type mutations (C>G and C>T type)

  ## This is per substitution type events
  subType.tbl <- extract.tbl[, .N, .(Tumor_Sample_Barcode, TriSubstitutionMotif)]
  subType.tbl <- data.table::dcast(data = subType.tbl, formula = Tumor_Sample_Barcode ~ TriSubstitutionMotif, fill = 0, value.var = "N", drop = FALSE)

  ### tCw events
  subType.tbl[, tCw_to_A := rowSums(subType.tbl[, .(`T[C>A]A`, `T[C>A]T`)], na.rm = TRUE)]
  subType.tbl[, tCw_to_G := rowSums(subType.tbl[, .(`T[C>G]A`, `T[C>G]T`)], na.rm = TRUE)]
  subType.tbl[, tCw_to_T := rowSums(subType.tbl[, .(`T[C>T]A`, `T[C>T]T`)], na.rm = TRUE)]
  subType.tbl[, tCw := rowSums(subType.tbl[, .(tCw_to_A, tCw_to_G, tCw_to_T)], na.rm = TRUE)]

  ### wGa events
  subType.tbl[, wGa_to_C := rowSums(subType.tbl[, .(`A[G>C]A`, `T[G>C]A`)], na.rm = TRUE)]
  subType.tbl[, wGa_to_T := rowSums(subType.tbl[, .(`A[G>T]A`, `T[G>T]A`)], na.rm = TRUE)]
  subType.tbl[, wGa_to_A := rowSums(subType.tbl[, .(`A[G>A]A`, `T[G>A]A`)], na.rm = TRUE)]
  subType.tbl[, wGa := rowSums(subType.tbl[, .(wGa_to_C, wGa_to_T, wGa_to_A)], na.rm = TRUE)]

  ## tCw_to_G+tCw_to_T
  subType.tbl[, "tCw_to_G+tCw_to_T" := rowSums(subType.tbl[, .(`T[C>G]T`, `T[C>G]A`, `T[C>T]T`, `T[C>T]A`, `T[G>C]A`, `A[G>C]A`, `T[G>A]A`, `A[G>A]A`)], na.rm = TRUE)]

  ### Merge data
  sub.tbl <- merge(sub.tbl, subType.tbl[, .(
    tCw_to_A, tCw_to_T, tCw_to_G, tCw, wGa_to_C, wGa_to_T, wGa_to_A,
    wGa, `tCw_to_G+tCw_to_T`, Tumor_Sample_Barcode
  )],
  by = "Tumor_Sample_Barcode"
  )
  sub.tbl <- merge(sub.tbl, apobecSummary, by = "Tumor_Sample_Barcode")

  ### Estimate APOBEC enrichment
  sub.tbl[, APOBEC_Enrichment := (`tCw_to_G+tCw_to_T` / `n_C>G_and_C>T`) / ((tcw + wga) / (C + G))]
  sub.tbl[, non_APOBEC_mutations := n_mutations - `tCw_to_G+tCw_to_T`]
  sub.tbl[, fraction_APOBEC_mutations := round((n_mutations - non_APOBEC_mutations) / n_mutations, digits = 3)]
  data.table::setDF(sub.tbl)

  send_info("Estimating APOBEC enrichment scores.")
  apobec.fisher.dat <- sub.tbl[, c(19, 28, 32, 33, 34)]
  if (nrow(apobec.fisher.dat) == 1) {
    apobec.fisher.dat <- t(as.matrix(apply(X = apobec.fisher.dat, 2, as.numeric)))
  } else {
    apobec.fisher.dat <- apply(X = apobec.fisher.dat, 2, as.numeric)
  }

  ### One way Fisher test to estimate over representation og APOBEC associated tcw mutations
  send_info("Performing one-way Fisher's test for APOBEC enrichment.")
  sub.tbl <- cbind(sub.tbl, data.table::rbindlist(apply(X = apobec.fisher.dat, 1, function(x) {
    xf <- fisher.test(matrix(c(x[2], sum(x[3], x[4]), x[1] - x[2], x[3] - x[4]), nrow = 2), alternative = "g")
    data.table::data.table(fisher_pvalue = xf$p.value, or = xf$estimate, ci.up = xf$conf.int[1], ci.low = xf$conf.int[2])
  })))

  data.table::setDT(sub.tbl)
  colnames(sub.tbl)[29:35] <- paste0("n_bg_", colnames(sub.tbl)[29:35])
  sub.tbl <- sub.tbl[order(sub.tbl$fisher_pvalue)]

  ## Choosing APOBEC Enrichment scores > 2 as cutoff
  sub.tbl$APOBEC_Enriched <- ifelse(test = sub.tbl$APOBEC_Enrichment > 2, yes = "yes", no = "no")
  sub.tbl[, fdr := p.adjust(sub.tbl$fisher_pvalue, method = "fdr")] # Adjusted p-values

  send_success(
    paste0("APOBEC related mutations are enriched in "),
    round(nrow(sub.tbl[APOBEC_Enriched %in% "yes"]) / nrow(sub.tbl) * 100, digits = 3),
    "% of samples (APOBEC enrichment score > 2; ",
    nrow(sub.tbl[APOBEC_Enriched %in% "yes"]), " of ", nrow(sub.tbl), " samples)"
  )

  send_info("Creating SBS sample-by-component matrices.")

  SBS_6 <- records_to_matrix(extract.tbl, "Tumor_Sample_Barcode", "SubstitutionType")
  SBS_6 <- SBS_6[, c("T>C", "C>T", "T>A", "T>G", "C>A", "C>G")] %>% as.matrix()
  send_success("SBS-6 matrix created.")

  SBS_96 <- records_to_matrix(extract.tbl, "Tumor_Sample_Barcode", "TriSubstitutionTypeMotif")
  SBS_96 <- SBS_96[, tri_comb] %>% as.matrix()
  send_success("SBS-96 matrix created.")

  SBS_1536 <- records_to_matrix(extract.tbl, "Tumor_Sample_Barcode", "SubstitutionTypeMotif")
  SBS_1536 <- SBS_1536[, penta_comb] %>% as.matrix()
  send_success("SBS-1536 matrix created.")

  if (add_trans_bias) {
    t_labels <- c("T:", "U:", "B:", "N:")

    SBS_24 <- records_to_matrix(extract.tbl, "Tumor_Sample_Barcode", "SubstitutionType",
      add_trans_bias = TRUE, build = genome_build
    )
    SBS_24 <- SBS_24[, vector_to_combination(
      t_labels,
      c("T>C", "C>T", "T>A", "T>G", "C>A", "C>G")
    )] %>% as.matrix()
    send_success("SBS-24 (6x4) matrix created.")

    SBS_384 <- records_to_matrix(extract.tbl, "Tumor_Sample_Barcode", "TriSubstitutionTypeMotif",
      add_trans_bias = TRUE, build = genome_build
    )
    SBS_384 <- SBS_384[, vector_to_combination(t_labels, tri_comb)] %>% as.matrix()
    send_success("SBS-384 (96x4) matrix created.")

    SBS_6144 <- records_to_matrix(extract.tbl, "Tumor_Sample_Barcode", "SubstitutionTypeMotif",
      add_trans_bias = TRUE, build = genome_build
    )
    SBS_6144 <- SBS_6144[, vector_to_combination(t_labels, penta_comb)] %>% as.matrix()
    send_success("SBS-6144 (1536x4) matrix created.")
  }

  if (add_trans_bias) {
    send_info("Return SBS-192 as major matrix.")
    SBS_192 <- SBS_384[, grepl("T:|U:", colnames(SBS_384)), drop = FALSE]
    res <- list(
      nmf_matrix = SBS_192,
      all_matrices = list(
        SBS_6 = SBS_6,
        SBS_24 = SBS_24,
        SBS_96 = SBS_96,
        SBS_192 = SBS_192,
        SBS_384 = SBS_384,
        SBS_1536 = SBS_1536,
        SBS_6144 = SBS_6144
      ),
      APOBEC_scores = sub.tbl
    )
  } else {
    send_info("Return SBS-96 as major matrix.")
    res <- list(
      nmf_matrix = SBS_96,
      all_matrices = list(
        SBS_6 = SBS_6,
        SBS_96 = SBS_96,
        SBS_1536 = SBS_1536
      ),
      APOBEC_scores = sub.tbl
    )
  }

  # Reorder mutation types
  res$all_matrices = lapply(res$all_matrices, function(x) {
    y = x[, sort(colnames(x)), drop = FALSE]
    y
  })
  res$nmf_matrix = res$nmf_matrix[, sort(colnames(res$nmf_matrix)), drop = FALSE]

  # Return
  res
}


# DBS ---------------------------------------------------------------------

generate_matrix_DBS <- function(query, ref_genome, genome_build = "hg19", add_trans_bias = FALSE) {
  send_info("DBS matrix generation - start.")

  query <- query[query$Variant_Type %in% c("SNP", "DNP")]
  if (nrow(query) == 0) {
    send_stop("Zero SNP/DNPs to analyze!")
  }

  ## Generate DBS catalogues
  conv <- c(
    "AC>CA", "AC>CG", "AC>CT", "AC>GA", "AC>GG", "AC>GT",
    "AC>TA", "AC>TG", "AC>TT", "AT>CA", "AT>CC", "AT>CG",
    "AT>GA", "AT>GC", "AT>TA", "CC>AA", "CC>AG", "CC>AT",
    "CC>GA", "CC>GG", "CC>GT", "CC>TA", "CC>TG", "CC>TT",
    "CG>AT", "CG>GC", "CG>GT", "CG>TA", "CG>TC", "CG>TT",
    "CT>AA", "CT>AC", "CT>AG", "CT>GA", "CT>GC", "CT>GG",
    "CT>TA", "CT>TC", "CT>TG", "GC>AA", "GC>AG", "GC>AT",
    "GC>CA", "GC>CG", "GC>TA", "TA>AT", "TA>CG", "TA>CT",
    "TA>GC", "TA>GG", "TA>GT", "TC>AA", "TC>AG", "TC>AT",
    "TC>CA", "TC>CG", "TC>CT", "TC>GA", "TC>GG", "TC>GT",
    "TG>AA", "TG>AC", "TG>AT", "TG>CA", "TG>CC", "TG>CT",
    "TG>GA", "TG>GC", "TG>GT", "TT>AA", "TT>AC", "TT>AG",
    "TT>CA", "TT>CC", "TT>CG", "TT>GA", "TT>GC", "TT>GG"
  )
  names(conv) <- c(
    "GT>TG", "GT>CG", "GT>AG", "GT>TC", "GT>CC", "GT>AC",
    "GT>TA", "GT>CA", "GT>AA", "AT>TG", "AT>GG", "AT>CG",
    "AT>TC", "AT>GC", "AT>TA", "GG>TT", "GG>CT", "GG>AT",
    "GG>TC", "GG>CC", "GG>AC", "GG>TA", "GG>CA", "GG>AA",
    "CG>AT", "CG>GC", "CG>AC", "CG>TA", "CG>GA", "CG>AA",
    "AG>TT", "AG>GT", "AG>CT", "AG>TC", "AG>GC", "AG>CC",
    "AG>TA", "AG>GA", "AG>CA", "GC>TT", "GC>CT", "GC>AT",
    "GC>TG", "GC>CG", "GC>TA", "TA>AT", "TA>CG", "TA>AG",
    "TA>GC", "TA>CC", "TA>AC", "GA>TT", "GA>CT", "GA>AT",
    "GA>TG", "GA>CG", "GA>AG", "GA>TC", "GA>CC", "GA>AC",
    "CA>TT", "CA>GT", "CA>AT", "CA>TG", "CA>GG", "CA>AG",
    "CA>TC", "CA>GC", "CA>AC", "AA>TT", "AA>GT", "AA>CT",
    "AA>TG", "AA>GG", "AA>CG", "AA>TC", "AA>GC", "AA>CC"
  )

  ## Search for DBS
  send_info("Searching DBS records...")
  query_DNP <- query[
    query$Variant_Type == "DNP",
    c(
      "Tumor_Sample_Barcode", "Hugo_Symbol", "Chromosome",
      "Start_Position", "Reference_Allele", "Tumor_Seq_Allele2"
    )
  ]
  query <- query[, search_DBS(.SD), by = Tumor_Sample_Barcode]
  query <- rbind(query, query_DNP, fill = TRUE)
  send_success("Done.")

  if (nrow(query) == 0) {
    send_stop("Zero DBSs to analyze!")
  }

  query[, dbs := paste(Reference_Allele, Tumor_Seq_Allele2, sep = ">")]
  query[, dbsMotif := ifelse(dbs %in% names(conv), conv[dbs], dbs)]
  query[, dbsMotif := factor(dbsMotif, levels = as.character(conv))]
  query[, should_reverse := substr(dbsMotif, 1, 2) != substr(dbs, 1, 2)]

  ## Query sequence
  query[, upstream := as.character(
    BSgenome::getSeq(
      x = ref_genome,
      names = Chromosome,
      start = pmax(Start_Position - 1, 1L),
      end = pmax(Start_Position - 1, 1L)
    )
  )]
  query[, downstream := as.character(
    BSgenome::getSeq(
      x = ref_genome,
      names = Chromosome,
      start = Start_Position + 2,
      end = Start_Position + 2
    )
  )]
  send_success("Reference sequences queried from genome.")

  query$complexMotif <- paste0(query$upstream, "[", query$dbsMotif, "]", query$downstream)
  query$complexMotif <- factor(query$complexMotif,
    levels = vector_to_combination(
      c("A", "C", "G", "T"),
      "[",
      levels(query$dbsMotif),
      "]",
      c("A", "C", "G", "T")
    )
  )

  DBS_78 <- records_to_matrix(query, "Tumor_Sample_Barcode", "dbsMotif") %>% as.matrix()
  send_success("DBS-78 matrix created.")

  DBS_1248 <- records_to_matrix(query, "Tumor_Sample_Barcode", "complexMotif") %>% as.matrix()
  send_success("DBS-1248 matrix created.")

  if (add_trans_bias) {
    query$End_Position <- query$Start_Position + 1
    DBS_186 <- records_to_matrix(query, "Tumor_Sample_Barcode", "dbsMotif",
      add_trans_bias = TRUE, build = genome_build, mode = "DBS"
    )
    DBS_186 <- DBS_186 %>% as.matrix()
    send_success("DBS-186 matrix created.")

    send_info("Return DBS-186 as major matrix.")
    res <- list(
      nmf_matrix = DBS_186,
      all_matrices = list(
        DBS_78 = DBS_78,
        DBS_186 = DBS_186,
        DBS_1248 = DBS_1248
      )
    )
  } else {
    send_info("Return SBS-78 as major matrix.")
    res <- list(
      nmf_matrix = DBS_78,
      all_matrices = list(
        DBS_78 = DBS_78,
        DBS_1248 = DBS_1248
      )
    )
  }

  # Reorder mutation types
  res$all_matrices = lapply(res$all_matrices, function(x) {
    y = x[, sort(colnames(x))]
    y
  })
  res$nmf_matrix = res$nmf_matrix[, sort(colnames(res$nmf_matrix))]

  res
}

## Search DBS in SNV records
search_DBS <- function(x) {
  # x = data.table::as.data.table(x)
  ## Set a position data.table to cover all combinations
  pos_dt <- rbind(
    data.table::data.table(
      chr = x$Chromosome,
      start = x$Start_Position - 1,
      end = x$Start_Position
    ),
    data.table::data.table(
      chr = x$Chromosome,
      start = x$Start_Position,
      end = x$Start_Position
    )
  ) %>%
    unique()

  data.table::setkey(pos_dt, chr, start, end)

  ## Find the overlaps
  x$chr <- x$Chromosome
  x$start <- x$Start_Position
  x$end <- x$start

  overlap_dt <- data.table::foverlaps(x, pos_dt, type = "within")
  overlap_dt[, region := paste0(chr, ":", start, "-", end)]

  freq_dt <- overlap_dt[, .N, by = region]

  DBS_index <- freq_dt$N > 1
  if (sum(DBS_index) < 1) {
    return(data.table::data.table())
  } else {
    DBS_regions <- freq_dt$region[DBS_index]
    res_dt <- overlap_dt[region %in% DBS_regions][order(Start_Position)] # Mutation position should be ordered
    return(
      res_dt[, list(
        Hugo_Symbol = paste(unique(Hugo_Symbol), collapse = ","),
        Chromosome = paste(unique(Chromosome), collapse = ","),
        Start_Position = Start_Position[1],
        Reference_Allele = paste(Reference_Allele, collapse = ""),
        Tumor_Seq_Allele2 = paste(Tumor_Seq_Allele2, collapse = "")
      ), by = region]
    )
  }
}

# INDELs (ID) -------------------------------------------------------------

adjust_indels <- function(query) {
  query %>%
    dplyr::rowwise() %>%
    dplyr::mutate(cN = nchar(Biobase::lcPrefixC(c(.data$Reference_Allele, .data$Tumor_Seq_Allele2), ignore.case = TRUE))) %>%
    dplyr::mutate(
      Start_Position = .data$Start_Position + .data$cN,
      Reference_Allele = substring(.data$Reference_Allele, .data$cN + 1L),
      Tumor_Seq_Allele2 = substring(.data$Tumor_Seq_Allele2, .data$cN + 1L)
    ) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(
      Reference_Allele = ifelse(.data$Reference_Allele == "", "-", .data$Reference_Allele),
      Tumor_Seq_Allele2 = ifelse(.data$Tumor_Seq_Allele2 == "", "-", .data$Tumor_Seq_Allele2),
      End_Position = .data$Start_Position + nchar(.data$Reference_Allele) - 1L
    ) %>%
    data.table::as.data.table()
}

generate_matrix_INDEL <- function(query, ref_genome, genome_build = "hg19", add_trans_bias = FALSE) {
  send_info("INDEL matrix generation - start.")

  query <- query[query$Variant_Type %in% c("INS", "DEL")]
  if (nrow(query) == 0) {
    send_stop("Zero INDELs to analyze!")
  }

  indel_types <- c(
    "1:Del:C:0", "1:Del:C:1", "1:Del:C:2", "1:Del:C:3", "1:Del:C:4", "1:Del:C:5",
    "1:Del:T:0", "1:Del:T:1", "1:Del:T:2", "1:Del:T:3", "1:Del:T:4", "1:Del:T:5",
    "1:Ins:C:0", "1:Ins:C:1", "1:Ins:C:2", "1:Ins:C:3", "1:Ins:C:4", "1:Ins:C:5",
    "1:Ins:T:0", "1:Ins:T:1", "1:Ins:T:2", "1:Ins:T:3", "1:Ins:T:4", "1:Ins:T:5",
    # >1bp INDELS
    "2:Del:R:0", "2:Del:R:1", "2:Del:R:2", "2:Del:R:3", "2:Del:R:4", "2:Del:R:5",
    "3:Del:R:0", "3:Del:R:1", "3:Del:R:2", "3:Del:R:3", "3:Del:R:4", "3:Del:R:5",
    "4:Del:R:0", "4:Del:R:1", "4:Del:R:2", "4:Del:R:3", "4:Del:R:4", "4:Del:R:5",
    "5:Del:R:0", "5:Del:R:1", "5:Del:R:2", "5:Del:R:3", "5:Del:R:4", "5:Del:R:5",
    "2:Ins:R:0", "2:Ins:R:1", "2:Ins:R:2", "2:Ins:R:3", "2:Ins:R:4", "2:Ins:R:5",
    "3:Ins:R:0", "3:Ins:R:1", "3:Ins:R:2", "3:Ins:R:3", "3:Ins:R:4", "3:Ins:R:5",
    "4:Ins:R:0", "4:Ins:R:1", "4:Ins:R:2", "4:Ins:R:3", "4:Ins:R:4", "4:Ins:R:5",
    "5:Ins:R:0", "5:Ins:R:1", "5:Ins:R:2", "5:Ins:R:3", "5:Ins:R:4", "5:Ins:R:5",
    # MicroHomology INDELS
    "2:Del:M:1", "3:Del:M:1", "3:Del:M:2", "4:Del:M:1", "4:Del:M:2", "4:Del:M:3",
    "5:Del:M:1", "5:Del:M:2", "5:Del:M:3", "5:Del:M:4", "5:Del:M:5",
    "complex"
    # , "non_matching"
  )

  ## Adjust un-standard variant formats
  query <- adjust_indels(query)
  ## Update Variant_Type
  query[, Variant_Type := ifelse(Variant_Type == "INS", "Ins", "Del")]
  ## Set ID type
  query[, ID_type := ifelse(Reference_Allele == "-",
    Tumor_Seq_Allele2,
    Reference_Allele
  )]
  ## Seach 'complex' motif
  query[, ID_motif := ifelse(Reference_Allele != "-" & Tumor_Seq_Allele2 != "-",
    "complex", NA_character_
  )]

  ## Drop records with 'complex' motif
  query_comp <- query[ID_motif == "complex"]
  query <- query[is.na(ID_motif)]

  if (nrow(query) == 0) {
    send_stop("Zero INDELs to analyze after dropping 'complex' labeled records!")
  }

  ## Get first INDEL base
  query[, mut_base := substr(ID_type, 1, 1)]

  ## Query sequence
  query[, upstream := as.character(
    BSgenome::getSeq(
      x = ref_genome,
      names = Chromosome,
      start = pmax(Start_Position - 50, 1L),
      end = pmax(Start_Position - 1, 1L)
    )
  )]

  # NOTE: deletion variants should start from Start_Position
  query[, downstream := as.character(
    BSgenome::getSeq(
      x = ref_genome,
      names = Chromosome,
      start = ifelse(Variant_Type == "Ins", Start_Position, End_Position + 1),
      end = ifelse(Variant_Type == "Ins", Start_Position + 249, End_Position + 250)
    )
  )]
  send_success("Reference sequences queried from genome.")

  ## Length of INDEL
  query[, ID_len := ifelse(nchar(ID_type) < 5, nchar(ID_type), 5)]
  send_success("INDEL length extracted.")

  ## Adjacent repeats (copies) count
  query[, count_repeat := mapply(count_repeat, ID_type, downstream) + mapply(count_repeat, ID_type, upstream, is_downstream = FALSE)]
  send_success("Adjacent copies counted.")
  ## Micro-homology count
  query[, count_homosize := mapply(
    count_homology_size,
    ID_type, upstream, downstream
  ) %>% as.integer()]
  send_success("Microhomology size calculated.")

  ## Set maximum repeat/homology-size count
  query[, count_repeat := ifelse(count_repeat > 5, 5, count_repeat)]
  query[, count_homosize := ifelse(count_homosize > 5, 5, count_homosize)]

  ## For length-1 INDEL
  conv <- c("C", "T")
  names(conv) <- c("G", "A")
  query[, should_reverse := ID_len == 1 & mut_base %in% c("G", "A")]
  query[, mut_base := ifelse(should_reverse, conv[mut_base], mut_base)]
  ## For length-n (n>1) INDEL
  query[, mut_base := ifelse(ID_len > 1, "R", mut_base)]

  ## 这里可能有问题 M 标记
  query[, mut_base := ifelse(Variant_Type == "Del" &
                               ID_len > 1 & count_repeat == 0 &
                               count_homosize > 0, "M", mut_base)]

  ## Generate variables to handle strand bias labeling
  query[, should_reverse := sapply(ID_type, is_all_purine)]
  query[, ID_type := ifelse(should_reverse, purine2pyrimidine(ID_type), ID_type)]
  # determine pyrimidine situaiton
  query$all_pyrimidine <- sapply(query$ID_type, is_all_pyrimidine)

  ## Generate all factors
  sim_types <- c(indel_types[1:24], "long_Del", "long_Ins", "MH", "complex")
  query <- rbind(query, query_comp, fill = TRUE)

  ## Generate Motifs
  ## ifelse() has problems for assign values here
  ## use case_when
  ##
  ## NOTE: according to reference paper says
  ## "Since almost no insertions with microhomologies
  ## were identified across more than 20,000 tumors"
  ## so records like this will classified into 'non_matching' in ID_motif
  ## and 'MH' in ID_motif_sp
  query <- query %>%
    dplyr::as_tibble() %>%
    dplyr::mutate(
      ID_motif = dplyr::case_when(
        ID_len == 1 ~ paste(ID_len, Variant_Type, mut_base, count_repeat, sep = ":"),
        mut_base == "R" ~ paste(ID_len, Variant_Type, mut_base, count_repeat, sep = ":"),
        mut_base == "M" ~ paste(ID_len, Variant_Type, mut_base, count_homosize, sep = ":"),
        ID_motif == "complex" ~ "complex",
        TRUE ~ "non_matching"
      ),
      ID_motif_sp = ID_motif
    ) %>%
    dplyr::mutate(
      ID_motif_sp = dplyr::case_when(
        ID_motif_sp %in% c(indel_types[1:24], "complex") ~ ID_motif_sp,
        ID_motif_sp %in% indel_types[25:48] ~ "long_Del",
        ID_motif_sp %in% indel_types[49:72] ~ "long_Ins",
        TRUE ~ "MH"
      )
    ) %>%
    data.table::as.data.table()

  ## "complex" type will be dropped off from ID-83
  query[, ID_motif := factor(ID_motif, levels = indel_types[-84])]
  query[, ID_motif_sp := factor(ID_motif_sp, levels = sim_types)]
  send_success("INDEL records classified into different components (types).")

  ID_28 <- records_to_matrix(query, "Tumor_Sample_Barcode", "ID_motif_sp") %>% as.matrix()
  send_success("ID-28 matrix created.")
  ID_83 <- records_to_matrix(query, "Tumor_Sample_Barcode", "ID_motif") %>% as.matrix()
  send_success("ID-83 matrix created.")

  if (add_trans_bias) {
    ID_415 <- records_to_matrix(query, "Tumor_Sample_Barcode", "ID_motif",
      add_trans_bias = TRUE, build = genome_build, mode = "ID"
    )
    send_success("ID-415 matrix created.")

    send_info("Return ID-415 as major matrix.")
    res <- list(
      nmf_matrix = ID_415,
      all_matrices = list(
        ID_28 = ID_28,
        ID_83 = ID_83,
        ID_415 = ID_415
      )
    )
  } else {
    send_info("Return ID-83 as major matrix.")
    res <- list(
      nmf_matrix = ID_83,
      all_matrices = list(
        ID_28 = ID_28,
        ID_83 = ID_83
      )
    )
  }
  # Reorder mutation types
  res$all_matrices = lapply(res$all_matrices, function(x) {
    y = x[, sort(colnames(x))]
    y
  })
  res$nmf_matrix = res$nmf_matrix[, sort(colnames(res$nmf_matrix))]

  res
}


# Utils -------------------------------------------------------------------

records_to_matrix <- function(dt, samp_col, component_col, add_trans_bias = FALSE, build = "hg19", mode = "SBS") {
  if (add_trans_bias) {
    transcript_dt <- get_genome_annotation(data_type = "transcript", genome_build = build)
    data.table::setkey(transcript_dt, chrom, start, end)

    if ("Start" %in% colnames(dt)) {
      loc_dt <- dt[, .(Chromosome, Start, End)]
    } else {
      loc_dt <- dt[, .(Chromosome, Start_Position, End_Position)]
    }
    colnames(loc_dt)[1:3] <- c("chrom", "start", "end")
    loc_dt$MutIndex <- 1:nrow(loc_dt)

    m_dt <- data.table::foverlaps(loc_dt, transcript_dt, type = "any")[
      , .(MatchCount = .N, strand = paste0(unique(strand), collapse = "/")),
      by = list(MutIndex)
    ]

    ## Actually, the MatchCount should only be 0, 1, 2
    if (any(m_dt$MatchCount > 2)) {
      send_stop("More than 2 regions counted, please report your data and code to developer!")
    }

    if (mode == "SBS") {
      dt$transcript_bias_label <- ifelse(
        m_dt$MatchCount == 2, "B:",
        ifelse(m_dt$MatchCount == 0, "N:",
          ifelse(xor(dt$should_reverse, m_dt$strand == "-"),
            # (dt$should_reverse & m_dt$strand == "+") | (!dt$should_reverse & m_dt$strand == "-"),
            "T:", "U:"
          )
        ) ## If should reverse, the base switch to template strand from coding strand
      )
      new_levels <- vector_to_combination(
        c("T:", "U:", "B:", "N:"),
        levels(dt[[component_col]])
      )
      dt[[component_col]] <- paste0(dt$transcript_bias_label, dt[[component_col]])
      dt[[component_col]] <- factor(dt[[component_col]], levels = new_levels)
    } else if (mode == "DBS") {

      # dt <- dt %>%
      #   dplyr::as_tibble() %>%
      #   #dplyr::bind_cols(m_dt %>% dplyr::as_tibble()) %>%
      #   dplyr::mutate(
      #     transcript_bias_label = dplyr::case_when(
      #       !substr(.data[[component_col]], 1, 2) %in% c("TT", "TC", "CT", "CC") ~ "Q:",
      #       m_dt$MatchCount == 2 ~ "B:",
      #       m_dt$MatchCount == 0 ~ "N:",
      #       xor(dt$should_reverse, m_dt$strand == "-") ~ "T:",
      #       TRUE ~ "U:"
      #     )
      #   ) %>%
      #   data.table::as.data.table()

      dt$transcript_bias_label <- ifelse(
        substr(dt[[component_col]], 1, 2) %in% c("TT", "TC", "CT", "CC"),
        ifelse(
          m_dt$MatchCount == 2, "B:",
          ifelse(m_dt$MatchCount == 0, "N:",
            ifelse(xor(dt$should_reverse, m_dt$strand == "-"),
              "T:", "U:"
            )
          )
        ), "Q:"
      )

      new_levels <- vector_to_combination(
        c("T:", "U:", "B:", "N:", "Q:"),
        levels(dt[[component_col]])
      )
      new_levels <- ifelse(substr(new_levels, 3, 4) %in% c("TT", "TC", "CT", "CC") & substr(new_levels, 1, 1) != "Q", new_levels,
        ifelse(substr(new_levels, 3, 4) %in% c("TT", "TC", "CT", "CC") & substr(new_levels, 1, 1) == "Q",
          NA, gsub("[A-Z]\\:", "Q:", new_levels)
        )
      ) %>%
        na.omit() %>%
        unique()

      dt[[component_col]] <- paste0(dt$transcript_bias_label, dt[[component_col]])
      dt[[component_col]] <- factor(dt[[component_col]], levels = new_levels)
    } else if (mode == "ID") {
      dt$transcript_bias_label <- ifelse(
        dt$all_pyrimidine == TRUE,
        ifelse(
          m_dt$MatchCount == 2, "B:",
          ifelse(m_dt$MatchCount == 0, "N:",
            ifelse(xor(dt$should_reverse, m_dt$strand == "-"),
              "T:", "U:"
            )
          )
        ),
        "Q:"
      )
      new_levels <- vector_to_combination(
        c("T:", "U:", "B:", "N:", "Q:"),
        levels(dt[[component_col]])
      )
      dt[[component_col]] <- paste0(dt$transcript_bias_label, dt[[component_col]])
      dt[[component_col]] <- factor(dt[[component_col]], levels = new_levels)
    }
  }

  dt.summary <- dt[, .N, by = c(samp_col, component_col)]
  mat <- as.data.frame(data.table::dcast(dt.summary,
    formula = as.formula(paste(samp_col, "~", component_col)),
    fill = 0, value.var = "N", drop = FALSE
  ))

  rownames(mat) <- mat[, 1]
  mat <- mat[, -1]
  mat
}

vector_to_combination <- function(..., c_string = "") {
  expand.grid(
    ...,
    stringsAsFactors = FALSE
  ) %>%
    apply(1, paste0, collapse = c_string) %>%
    unique()
}

count_repeat <- function(x, sequence, is_downstream = TRUE) {
  if (is_downstream) {
    if (startsWith(sequence, x)) {
      pattern <- paste0(
        "^((",
        x,
        ")+).*$"
      )
      nchar(sub(pattern, "\\1", sequence)) / nchar(x)
    } else {
      return(0L)
    }
  } else {
    if (endsWith(sequence, x)) {
      return(count_repeat(
        Biostrings::reverse(x),
        Biostrings::reverse(sequence)
      ))
    } else {
      return(0L)
    }
  }
}


## Examples:
## ACCAA|TC|TAGCGGC or ACAAC|TC|AAGCGGC
## ACCCA|TATC|TATAGCGGC or ACCCATC|TATC|AAGCGGC
## ACCCA|TAGCCTC|TAGCCTAGCGGC or ACCCAGCCTC|TAGCCTC|AAGCGGC
##
## 1
## count_homology_size("TC", "ACCAA", "TAGCGGC")
## count_homology_size("TC", "ACCAC", "AAGCGGC")
## 3
## count_homology_size("TATC", "ACCCA", "TATAGCGGC")
## count_homology_size("TATC", "ACCCATC", "AAGCGGC")
## 5+
## count_homology_size("TAGCCTC", "ACCCA", "TAGCCTAGCGGC")
## count_homology_size("TAGCCTC", "ACCCAGCCTC", "AAGCGGC")
count_homology_size <- function(x, upstream, downstream) {
  size <- nchar(as.character(x))
  if (size >= 2) {
    x_down <- substr(x, 1, size - 1)
    size_down <- 0
    while (nchar(x_down) > 0) {
      if (startsWith(downstream, x_down)) {
        size_down <- nchar(x_down)
        break()
      } else {
        x_down <- substr(x_down, 1, nchar(x_down) - 1)
      }
    }
    x_up <- substring(x, 2)
    size_up <- 0
    while (nchar(x_up) > 0) {
      if (endsWith(upstream, x_up)) {
        size_up <- nchar(x_up)
        break()
      } else {
        x_up <- substring(x_up, 2)
      }
    }
    max(size_up, size_down)
  } else {
    return(0L)
  }
}

is_all_pyrimidine <- function(x) {
  !isTRUE(nchar(gsub("[CT]", "", x)) > 0)
}

is_all_purine <- function(x) {
  !isTRUE(nchar(gsub("[AG]", "", x)) > 0)
}

purine2pyrimidine <- function(x) {
  x <- chartr("A", "T", x)
  x <- chartr("G", "C", x)
  return(x)
}

utils::globalVariables(
  c(
    "A", "APOBEC_Enriched", "APOBEC_Enrichment",
    "A[G>A]A", "A[G>C]A", "A[G>T]A", "G",
    "Substitution",
    "T[C>A]A", "T[C>A]T",
    "T[C>G]A", "T[C>G]T",
    "T[C>T]A", "T[C>T]T", "T[G>A]A",
    "T[G>C]A", "T[G>T]A",
    "TriSubstitutionMotif", "Tumor_Sample_Barcode", "fdr",
    "fraction_APOBEC_mutations", "n_A", "n_C", "n_C>G_and_C>T",
    "n_G", "n_T", "n_mutations", "non_APOBEC_mutations",
    "tCw", "tCw_to_A", "tCw_to_G", "tCw_to_G+tCw_to_T",
    "tCw_to_T", "tcw", "wGa", "wGa_to_A", "wGa_to_C", "wGa_to_T", "wga",
    "Start", "End", "MutIndex",
    ".SD", "dbs", "dbsMotif", "Reference_Allele", "Tumor_Seq_Allele2",
    "chr", "region", "Start_Position", "Hugo_Symbol",
    "End_Position", "ID_len", "ID_motif", "ID_motif_sp", "ID_type",
    "Variant_Type", "count_homosize", "downstream", "mut_base", "should_reverse",
    "upstream"
  )
)

Try the sigminer package in your browser

Any scripts or data that you put into this service are public.

sigminer documentation built on Aug. 21, 2023, 9:08 a.m.