R/utilities.R

Defines functions .get_transcripts_if_not_there .is_nucleotide_amount_valid .is_character_parameter_valid .is_parameter_boolean .is_ylim_valid .is_which_transcripts_valid .is_bamfile_bedgraphfile_valid .is_bedgraphfile_valid .is_bedfile_valid .is_bamfile_valid .is_bamfile_valid_allow_null .is_edb_valid .is_fusion_valid down_shift .check_that_transcripts_have_breakpoint_exons .check_that_breakpoints_are_within_transcripts select_transcript fusion_to_data_frame add_fusion_reads_alignment get_transcripts_ensembl_db decide_transcript_category split_on_utr_and_add_feature get_ensembl_ids write_fusion_reference fetch_reads_from_fastq

Documented in add_fusion_reads_alignment decide_transcript_category down_shift fetch_reads_from_fastq fusion_to_data_frame get_ensembl_ids get_transcripts_ensembl_db select_transcript split_on_utr_and_add_feature write_fusion_reference

#' Fetch reads from fastq files
#'
#' This function will fetch read sequences from fastq files and put them into
#' new fastq files.
#'
#' Note: This function runs (read only) bash commands on your system. Therefore
#' the function will only work on a unix system.
#'
#' @param reads List of read IDs that is to be fetched.
#'
#' @param fastq_file_in1 First fastq file to search in.
#' @param fastq_file_in2 Second fastq file to seach in.
#'
#' @param fastq_file_out1 First fastq file with results.
#' @param fastq_file_out2 Second fastq file with results.
#'
#' @return The files fastqFileOut1 and fastqFileOut2 populated with the
#' specified reads.
#'
#' @examples
#' \dontrun{
#' # fastq files that has the supporting reads
#' fastq1 <- system.file("extdata", "reads.1.fq", package="chimeraviz")
#' fastq2 <- system.file("extdata", "reads.2.fq", package="chimeraviz")
#' # Which read ids to extract
#' reads <- c(
#'   "13422259", "19375605", "29755061",
#'   "31632876", "32141428", "33857245")
#' # Extract the actual reads and put them in the tmp files "fastqFileOut1" and
#' # "fastqFileOut2"
#' fastqFileOut1 <- tempfile(pattern = "fq1", tmpdir = tempdir())
#' fastqFileOut2 <- tempfile(pattern = "fq2", tmpdir = tempdir())
#' fetch_reads_from_fastq(reads, fastq1, fastq2,
#'     fastqFileOut1, fastqFileOut2)
#' # We now have the reads supporting fusion 5267 in the two files.
#' }
#'
#' @export
fetch_reads_from_fastq <- function(
  reads,
  fastq_file_in1,
  fastq_file_in2,
  fastq_file_out1,
  fastq_file_out2
  ) {

  # Since this function use the bash function egrep to extract reads, give a
  # warning if we're running on windows
  if (Sys.info()["sysname"] == "Windows") {
    stop(paste("This function uses the bash function egrep. It looks like",
               "you're running on windows, so this function will terminate."))
  }

  if (is.vector(reads, mode = "character") == FALSE) {
    stop("reads should be a character vector of read ids")
  }

  if (file.exists(fastq_file_in1) == FALSE ||
      file.exists(fastq_file_in2) == FALSE) {
    stop("Invalid fastq input files")
  }

  # The command below will extract the sequences for ids 11, 22, and 33 from the
  # fastq file reads.fastq

  # $ egrep -A 3 '@11|22|33' reads.fastq | sed '/^--$/d'

  # By default, the -A parameter inserts a "--" between matches, so the sed part
  # above removes that

  # First build the query with the read ids
  query <- paste("@",
                 paste(reads, collapse = "|"),
                 sep = "")

  # Then create commands for each fastq file
  command1 <- paste("egrep -A 3 '",
                    query,
                    "' ",
                    shQuote(fastq_file_in1),
                    " | sed '/^--$/d'", # Exclude Linting
                    sep = "")
  command2 <- paste("egrep -A 3 '",
                    query,
                    "' ",
                    shQuote(fastq_file_in2),
                    " | sed '/^--$/d'", # Exclude Linting
                    sep = "")

  # We now have two commands:
  # egrep -A 1 '@11|22|33' reads.1.fastq | sed '/^--$/d'
  # egrep -A 1 '@11|22|33' reads.2.fastq | sed '/^--$/d'

  # Run the command and capture output
  supporting_reads_fq1 <- system(command1, intern = TRUE)
  supporting_reads_fq2 <- system(command2, intern = TRUE)

  # Write these to each their own files
  write(supporting_reads_fq1, file = fastq_file_out1)
  write(supporting_reads_fq2, file = fastq_file_out2)

  # Check file sizes
  if (file.info(fastq_file_out1)$size <= 10) {
    warning(paste("It looks like we couldn't find any of the reads in the",
                  "fastq file. Check your arguments."))
  }
}

#' Write fusion junction sequence to a fasta file
#'
#' This function will write the fusion sequence to a fasta file, using
#' Biostring::writeXStringSet() .
#'
#' @param fusion The Fusion object we want to create a fasta file from.
#' @param filename The filename to write to.
#'
#' @return Writes the fusion junction sequence to the given filename.
#'
#' @examples
#' # Import the filtered defuse results
#' defuse833keFiltered <- system.file(
#'   "extdata",
#'   "defuse_833ke_results.filtered.tsv",
#'   package="chimeraviz")
#' fusions <- import_defuse(defuse833keFiltered, "hg19", 1)
#' # Get a specific fusion
#' fusion <- get_fusion_by_id(fusions, 5267)
#' # Create temporary file to hold the fusion sequence
#' fastaFileOut <- tempfile(pattern = "fusionSequence", tmpdir = tempdir())
#' # Write fusion sequence to file
#' write_fusion_reference(fusion, fastaFileOut)
#'
#' @export
write_fusion_reference <- function(fusion, filename) {

  # Check if we got a fusion object
  if (class(fusion) != "Fusion") {
    stop("fusion argument must be an object of type Fusion")
  }

  # First put the fusion junction sequence in a DNAStringSet object
  fusion_sequence <- Biostrings::DNAStringSet(
    x = c(fusion@gene_upstream@junction_sequence,
          fusion@gene_downstream@junction_sequence))

  # Give an error if the length of the fusionSequence is 0:
  if (nchar(fusion_sequence) == 0) {
    stop(
      paste0(
        "The fusion sequence length is zero, so the fusion reference sequence",
        " cannot be written."
      )
    )
  }

  # Set sequence name to chrNA, since this is a sequence created from a fusion
  # event (i.e. not a sequence from a real chromosome). The "chrNA" name will
  # make Gviz happy.
  names(fusion_sequence) <- "chrNA"
  # Write to file
  Biostrings::writeXStringSet(fusion_sequence, filename)
}

#' Get ensembl ids for a fusion object
#'
#' This function will get the ensembl ids from the org.Hs.eg.db/org.Mm.eg.db
#' package given the gene names of the fusion event.
#'
#' @param fusion The Fusion object we want to get ensembl ids for.
#'
#' @return The Fusion object with Ensembl ids set.
#'
#' @import org.Hs.eg.db org.Mm.eg.db
#'
#' @examples
#' # Import the filtered defuse results
#' defuse833keFiltered <- system.file(
#'   "extdata",
#'   "defuse_833ke_results.filtered.tsv",
#'   package="chimeraviz")
#' fusions <- import_defuse(defuse833keFiltered, "hg19", 1)
#' # Get a specific fusion
#' fusion <- get_fusion_by_id(fusions, 5267)
#' # See the ensembl ids:
#' partner_gene_ensembl_id(upstream_partner_gene(fusion))
#' # [1] "ENSG00000180198"
#' partner_gene_ensembl_id(downstream_partner_gene(fusion))
#' # [1] "ENSG00000162639"
#' # Reset the fusion objects ensembl ids
#' partner_gene_ensembl_id(upstream_partner_gene(fusion)) <- ""
#' partner_gene_ensembl_id(downstream_partner_gene(fusion))  <- ""
#' # Get the ensembl ids
#' fusion <- get_ensembl_ids(fusion)
#' # See that we now have the same ensembl ids again:
#' partner_gene_ensembl_id(upstream_partner_gene(fusion))
#' # [1] "ENSG00000180198"
#' partner_gene_ensembl_id(downstream_partner_gene(fusion))
#' # [1] "ENSG00000162639"
#'
#' @export
get_ensembl_ids <- function(fusion) {

  # Check if we got a fusion object
  if (class(fusion) != "Fusion") {
    stop("fusion argument must be an object of type Fusion")
  }

  if (startsWith(fusion@genome_version, "hg")) {
    annotation_db <- org.Hs.eg.db # Exclude Linting
  } else if (startsWith(fusion@genome_version, "mm")) {
    annotation_db <- org.Mm.eg.db # Exclude Linting
  } else {
    stop("Unsupported genome version")
  }

  result <- AnnotationDbi::select(
    annotation_db,
    keys = c(fusion@gene_upstream@name, fusion@gene_downstream@name),
    keytype = "ALIAS",
    columns = c("ALIAS", "ENSEMBL"))

  upstream_gene_result <-
    result[which(result$ALIAS == fusion@gene_upstream@name), ]
  downstream_gene_result <-
    result[which(result$ALIAS == fusion@gene_downstream@name), ]

  # Stop execution if no results
  if (any(is.na(upstream_gene_result$ENSEMBL))) {
    stop(
      paste(
        "Could not find Ensembl id for ",
        fusion@gene_upstream@name,
        ". ",
        "If you know the id, add it manually with ",
        "fusion@gene_upstream@ensembl_id <- \"ensemblId\"",
        sep = ""
      )
    )
  }
  if (any(is.na(downstream_gene_result$ENSEMBL))) {
    stop(
      paste(
        "Could not find Ensembl id for ",
        fusion@gene_downstream@name,
        ". ",
        "If you know the id, add it manually with ",
        "fusion@gene_downstream@ensembl_id <- \"ensemblId\"",
        sep = ""
      )
    )
  }

  # Store ensembl ids in fusion object
  fusion@gene_upstream@ensembl_id <- upstream_gene_result[, 2]
  fusion@gene_downstream@ensembl_id <- downstream_gene_result[, 2]

  # Return updated fusion object
  fusion
}

#' Split GRanges object based on cds
#'
#' This function will look for ranges (exons) in the GRanges object that has the
#' coding DNA sequence starting or stopping within it. If found, these exons are
#' split, and each exon in the GRanges object will be tagged as either
#' "protein_coding", "5utr", or "3utr". The returned GRanges object will have
#' feature values set in mcols(gr)$feature reflecting this.
#'
#' @param gr The GRanges object we want to split and tag with feature info.
#'
#' @return An updated GRanges object with feature values set.
#'
#' @examples
#' # Load fusion data and choose a fusion object:
#' defuseData <- system.file(
#'   "extdata",
#'   "defuse_833ke_results.filtered.tsv",
#'   package="chimeraviz")
#' fusions <- import_defuse(defuseData, "hg19", 1)
#' fusion <- get_fusion_by_id(fusions, 5267)
#' # Create edb object
#' edbSqliteFile <- system.file(
#'   "extdata",
#'   "Homo_sapiens.GRCh37.74.sqlite",
#'   package="chimeraviz")
#' edb <- ensembldb::EnsDb(edbSqliteFile)
#' # Get all exons for all transcripts in the genes in the fusion transcript
#' allTranscripts <- ensembldb::exonsBy(
#'   edb,
#'   filter = list(
#'     AnnotationFilter::GeneIdFilter(
#'       c(
#'         partner_gene_ensembl_id(upstream_partner_gene(fusion)),
#'         partner_gene_ensembl_id(downstream_partner_gene(fusion))))),
#'   columns = c(
#'     "gene_id",
#'     "gene_name",
#'     "tx_id",
#'     "tx_cds_seq_start",
#'     "tx_cds_seq_end",
#'     "exon_id"))
#' # Extract one of the GRanges objects
#' gr <- allTranscripts[[1]]
#' # Check how many ranges there are here
#' length(gr)
#' # Should be 9 ranges
#' # Split the ranges containing the cds start/stop positions and add feature
#' # values:
#' gr <- split_on_utr_and_add_feature(gr)
#' # Check the length again
#' length(gr)
#' # Should be 11 now, as the range containing the cds_strat position and the
#' # range containing the cds_stop position has been split into separate ranges
#'
#' @importFrom S4Vectors mcols
#'
#' @export
split_on_utr_and_add_feature <- function(gr) {

  # Check if we got a valid GRanges object
  if (class(gr) != "GRanges") {
    stop("gr argument must be an object of type GRanges")
  }

  # Check that we have the tx_cds_seq_start and tx_cds_seq_end values


  # check that gr has a tx_cds_seq_start and tx_cds_seq_end mcols value

  # Get cds start and cds end
  cds_start <- S4Vectors::mcols(gr)$tx_cds_seq_start[[1]]
  cds_end <- S4Vectors::mcols(gr)$tx_cds_seq_end[[1]]

  # find exon containing start
  cds_start_exon <- cds_start > start(gr) & cds_start < end(gr)

  if (any(cds_start_exon)) {
    # Create two copies of the exon in which the cds starts
    first <- gr[cds_start_exon]
    second <- gr[cds_start_exon]
    # Update ranges for the new objects
    end(first) <- cds_start - 1
    start(second) <- cds_start
    # Remove the original range
    gr <- gr[!cds_start_exon]
    # Add new ranges
    gr <- append(gr, first)
    gr <- append(gr, second)
    # Sort again
    gr <- sort(gr)
  }

  # find exon containing end
  cds_end_exon <- cds_end > start(gr) & cds_end < end(gr)

  if (any(cds_end_exon)) {
    # Create two copies of the exon in which the cds ends
    first <- gr[cds_end_exon]
    second <- gr[cds_end_exon]
    # Update ranges for the new objects
    end(first) <- cds_end
    start(second) <- cds_end + 1
    # Remove the original range
    gr <- gr[!cds_end_exon]
    # Add new ranges
    gr <- append(gr, first)
    gr <- append(gr, second)
    # Sort again
    gr <- sort(gr)
  }

  # add features
  S4Vectors::mcols(gr)$feature <- "protein_coding"
  # 5utr:
  #   plus strand:
  #     end(gr) < cds_start # Exclude Linting
  #   minus strand:
  #     start(gr) > cds_end # Exclude Linting
  if (length(gr[as.character(strand(gr)) == "+" & end(gr) < cds_start |
                as.character(strand(gr)) == "-" & start(gr) > cds_end])) {
    S4Vectors::mcols(
      gr[as.character(strand(gr)) == "+" & end(gr) < cds_start |
           as.character(strand(gr)) == "-" & start(gr) > cds_end]
      )$feature <- "5utr"
  }
  # 3utr:
  #   plus strand:
  #     start(gr) > cds_end # Exclude Linting
  #   minus strand:
  #     end(gr) < cds_start # Exclude Linting
  if (length(gr[as.character(strand(gr)) == "+" & start(gr) > cds_end |
                as.character(strand(gr)) == "-" & end(gr) < cds_start])) {
    S4Vectors::mcols(
      gr[as.character(strand(gr)) == "+" & start(gr) > cds_end |
           as.character(strand(gr)) == "-" & end(gr) < cds_start]
      )$feature <- "3utr"
  }

  gr
}

#' Retrieves transcripts for partner genes in a Fusion object using Ensembldb
#'
#' This function will check where in the transcript (the GRanges object) the
#' fusion breakpoint is located, and return either "exonBoundary", "withinExon",
#' "withinIntron", or "intergenic".
#'
#' @param gr The GRanges object containing the transcript to be checked.
#' @param fusion The fusion object used to check the transcript.
#'
#' @return Either "exonBoundary", "withinExon", "withinIntron", or "intergenic"
#' depending on where in the transcript the breakpoint hits.
#'
#' @examples
#' # Load fusion data and choose a fusion object:
#' defuseData <- system.file(
#'   "extdata",
#'   "defuse_833ke_results.filtered.tsv",
#'   package="chimeraviz")
#' fusions <- import_defuse(defuseData, "hg19", 1)
#' fusion <- get_fusion_by_id(fusions, 5267)
#' # Create edb object
#' edbSqliteFile <- system.file(
#'   "extdata",
#'   "Homo_sapiens.GRCh37.74.sqlite",
#'   package="chimeraviz")
#' edb <- ensembldb::EnsDb(edbSqliteFile)
#' # Get all exons for all transcripts in the genes in the fusion transcript
#' allTranscripts <- ensembldb::exonsBy(
#'   edb,
#'   filter = list(
#'     AnnotationFilter::GeneIdFilter(
#'       c(
#'         partner_gene_ensembl_id(upstream_partner_gene(fusion)),
#'         partner_gene_ensembl_id(downstream_partner_gene(fusion))))),
#'   columns = c(
#'     "gene_id",
#'     "gene_name",
#'     "tx_id",
#'     "tx_cds_seq_start",
#'     "tx_cds_seq_end",
#'     "exon_id"))
#' # Extract one of the GRanges objects
#' gr <- allTranscripts[[1]]
#' # Check where in the transcript the fusion breakpoint hits
#' decide_transcript_category(gr, fusion)
#' # "exonBoundary"
#' # Check another case
#' gr <- allTranscripts[[3]]
#' decide_transcript_category(gr, fusion)
#' # "withinIntron"
#'
#' @importFrom S4Vectors mcols
#'
#' @export
decide_transcript_category <- function(gr, fusion) {

  # Check if we got a valid GRanges object
  if (class(gr) != "GRanges") {
    stop("gr argument must be an object of type GRanges")
  }

  # Check if we got a fusion object
  if (class(fusion) != "Fusion") {
    stop("fusion argument must be an object of type Fusion")
  }

  # 4 possible cases:
  #
  # 1: geneA is on the + strand
  #    Exon boundary event if breakpointA equals an exon_end position
  # 2: geneA is on the - strand
  #    Exon boundary event if breakpointA equals an exon_start position
  # 3: geneB is on the + strand
  #    Exon boundary event if breakpointB equals an exon_start position
  # 4: geneB is on the - strand
  #    Exon boundary event if breakpointB equals an exon_end position

  # Helper variables
  upstream_gene <-
    S4Vectors::mcols(gr)$gene_id[[1]] == fusion@gene_upstream@ensembl_id
  breakpoint <-
    if (all(S4Vectors::mcols(gr)$gene_id == fusion@gene_upstream@ensembl_id)) {
      fusion@gene_upstream@breakpoint
    } else {
      fusion@gene_downstream@breakpoint
    }
  exon_start_positions <- GenomicRanges::start(gr)
  exon_end_positions <- GenomicRanges::end(gr)

  # Check whether or not the breakpoint is within the transcript
  if (all(breakpoint < exon_start_positions) ||
      all(exon_end_positions < breakpoint)) {
    return("intergenic")
  }

  # Assuming only single-stranded transcripts
  if (as.character(strand(gr)[1]) == "+") {

    # To decide whether or not the breakpoint occurs at an exon boundary, we
    # have to be vary careful about both whether this is the upstream/downstrem
    # fusion partner gene
    if (upstream_gene) {
      # GeneA
      if (any(exon_end_positions == breakpoint)) {
        # exon boundary
        return("exonBoundary")
      }
    } else {
      # GeneB
      if (any(exon_start_positions == breakpoint)) {
        # exon boundary
        return("exonBoundary")
      }
    }

  } else {

    # To decide whether or not the breakpoint occurs at an exon boundary, we
    # have to be vary careful about both whether this is the upstream/downstrem
    # fusion partner gene
    if (all(upstream_gene)) {
      # GeneA
      if (any(exon_start_positions == breakpoint)) {
        # exon boundary
        return("exonBoundary")
      }
    } else {
      # GeneB
      if (any(exon_end_positions == breakpoint)) {
        # exon boundary
        return("exonBoundary")
      }
    }

  }

  # Within exon if end < breakpoint & breakpoint < start
  if (any(start(gr) < breakpoint & breakpoint < end(gr))) {
    return("withinExon")
  } else {
    return("withinIntron")
  }
}

#' Retrieves transcripts for partner genes in a Fusion object using Ensembldb
#'
#' This function will retrieve transcripts for both genes in a fusion. It will
#' check all transcripts and decide for each transcript if the fusion breakpoint
#' happens at 1) an exon boundary, 2) within an exon, or 3) within an intron.
#' This is done because fusions happening at exon boundaries are more likely to
#' produce biologically interesting gene products. The function returns an
#' updated Fusion object, where the fusion@gene_upstream@transcriptsX slots are set with
#' transcript information.
#'
#' @param fusion The fusion object to find transcripts for.
#' @param edb The edb object used to fetch data from.
#'
#' @return An updated fusion object with transcript data stored.
#'
#' @examples
#' # Load fusion data and choose a fusion object:
#' defuseData <- system.file(
#'   "extdata",
#'   "defuse_833ke_results.filtered.tsv",
#'   package="chimeraviz")
#' fusions <- import_defuse(defuseData, "hg19", 1)
#' fusion <- get_fusion_by_id(fusions, 5267)
#' # Create edb object
#' edbSqliteFile <- system.file(
#'   "extdata",
#'   "Homo_sapiens.GRCh37.74.sqlite",
#'   package="chimeraviz")
#' edb <- ensembldb::EnsDb(edbSqliteFile)
#' # Add transcripts data to fusion object
#' fusion <- get_transcripts_ensembl_db(fusion, edb)
#' # The transcripts are now accessible through fusion@gene_upstream@transcripts and
#' # fusion@gene_downstream@transcripts .
#'
#' @importFrom S4Vectors mcols
#' @importFrom ensembldb exonsBy
#' @importFrom AnnotationFilter GeneIdFilter
#'
#' @export
get_transcripts_ensembl_db <- function(fusion, edb) {

  # Check if we got a fusion object
  if (class(fusion) != "Fusion") {
    stop("fusion argument must be an object of type Fusion")
  }

  # Check if we got a valid edb
  if (class(edb) != "EnsDb") {
    stop("edb argument must be an object of type EnsDb")
  }

  # Fetch ensembl gene ids if we don't already have them
  if (
    is.na(fusion@gene_upstream@ensembl_id) ||
    is.na(fusion@gene_downstream@ensembl_id)
  ) {
    fusion <- get_ensembl_ids(fusion)
  }

  # Get all exon information
  all_transcripts <- ensembldb::exonsBy(
    edb,
    filter = list(
      AnnotationFilter::GeneIdFilter(
        c(
          fusion@gene_upstream@ensembl_id,
          fusion@gene_downstream@ensembl_id))),
    columns = c(
      "gene_id",
      "gene_name",
      "tx_id",
      "tx_cds_seq_start",
      "tx_cds_seq_end",
      "exon_id"))

  # Fail if no transcripts were found
  if (length(all_transcripts) == 0) {
    stop(paste(
      "No transcripts available for the genes ",
      fusion@gene_upstream@name,
      " and ",
      fusion@gene_downstream@name,
      ".",
      sep = ""))
  }

  # Go through each transcript in the GRangesList and
  grangeslist_upstream <- GRangesList()
  grangeslist_downstream <- GRangesList()
  for (i in seq_along(all_transcripts)) {

    # Extract the GRanges object
    gr <- all_transcripts[[i]]

    # Add $transcript as a metadata column
    S4Vectors::mcols(gr)$transcript <- names(all_transcripts[i])
    # Add $symbol as a metadata column
    S4Vectors::mcols(gr)$symbol <- names(all_transcripts[i])

    # Add utr/coding S4Vectors::mcols$feature data for each region
    if (is.na(S4Vectors::mcols(gr)$tx_cds_seq_start[[1]])) {
      # The transcript is not coding. Set all to "utr"
      S4Vectors::mcols(gr)$feature <- "utr"
    }else {
      # The transcript is coding. The function below will set
      # S4Vectors::mcols(gr)$feature = "utr5"/"protein_coding"/"utr3" for each
      # region
      gr <- split_on_utr_and_add_feature(gr)
    }

    # Create new GRangesList
    grl <- GRangesList(gr)

    # Tag each transcript with either "exonBoundary", "withinExon",
    # "withinIntron", or "intergenic", depending on where in the transcript the
    # fusion breakpoint hits
    S4Vectors::mcols(grl)$transcript_category <- decide_transcript_category(
      gr,
      fusion)

    # Set S4Vectors::mcols$transcript and name for the grl
    S4Vectors::mcols(grl)$transcript <- names(all_transcripts[i])
    names(grl) <- names(all_transcripts[i])

    # Add S4Vectors::mcols$coding to signify whether it is a coding transcript
    # at all
    S4Vectors::mcols(grl)$coding <-
      !all(is.na(S4Vectors::mcols(gr)$tx_cds_seq_start))

    # Append the GRanges object to the correct GRangesList
    if (S4Vectors::mcols(gr)$gene_id[[1]] == fusion@gene_upstream@ensembl_id) {
      grangeslist_upstream <- append(grangeslist_upstream, grl)
    } else {
      grangeslist_downstream <- append(grangeslist_downstream, grl)
    }
  }

  # Add transcripts to fusion object
  fusion@gene_upstream@transcripts <- grangeslist_upstream
  fusion@gene_downstream@transcripts <- grangeslist_downstream

  # Warn if no transcripts were found for one of the genes
  if (length(fusion@gene_upstream@transcripts) == 0) {
    warning(paste(
      "No transcripts available for the upstream gene ",
      fusion@gene_upstream@name,
      " available.",
      sep = ""))
  }
  if (length(fusion@gene_upstream@transcripts) == 0) {
    warning(paste(
      "No transcripts available for the downstream gene ",
      fusion@gene_downstream@name,
      " available.",
      sep = ""))
  }

  # In case the fusion object doesn't have the strands set (as is the case for
  # JAFFA), set the strands now:
  if (
    fusion@gene_upstream@strand == "*" || fusion@gene_downstream@strand == "*"
  ) {
    fusion@gene_upstream@strand <- as.character(
      strand(fusion@gene_upstream@transcripts[[1]][1]))
    fusion@gene_downstream@strand <- as.character(
      strand(fusion@gene_downstream@transcripts[[1]][1]))
  }

  # Return fusion object
  fusion

}

#' Add fusion reads alignment to fusion object
#'
#' This function lets you add a fusion read alignment file to a fusion object.
#' If you've mapped the reads supporting a fusion against the fusion junction
#' sequence, and have the resulting bamfile, use this function to add the
#' information (as a Gviz::GAlignmentPairs object) to the fusion object.
#'
#' @param fusion The fusion object to add a genomic alignment to.
#' @param bamfile The bam file containing the fusion reads plotted to the fusion
#' sequence.
#'
#' @return An updated fusion object with fusion@fusion_reads_alignment set.
#'
#' @examples
#' # Load data
#' defuse833ke <- system.file(
#'   "extdata",
#'   "defuse_833ke_results.filtered.tsv",
#'   package="chimeraviz")
#' fusions <- import_defuse(defuse833ke, "hg19", 1)
#' # Find the specific fusion we have aligned reads for
#' fusion <- get_fusion_by_id(fusions, 5267)
#' # Get reference to the bamfile with the alignment data
#' bamfile5267 <- system.file(
#'   "extdata",
#'   "5267readsAligned.bam",
#'   package="chimeraviz")
#' # Add the bam file of aligned fusion reads to the fusion object
#' fusion <- add_fusion_reads_alignment(fusion, bamfile5267)
#'
#' @export
add_fusion_reads_alignment <- function(fusion, bamfile) {

  # Check if we got a fusion object
  if (class(fusion) != "Fusion") {
    stop("fusion argument must be an object of type Fusion")
  }

  fusion_reads_alignment <- Gviz::AlignmentsTrack(
    bamfile,
    isPaired = TRUE,
    # Setting chromosome to chrNA because this is a fusion sequence not found in
    # any reference genome.
    chromosome = "chrNA",
    name = "Fusion Reads",
    genome = fusion@genome_version)

  # Return new fusion object, now with the fusion read alignment
  fusion@fusion_reads_alignment <- fusion_reads_alignment
  fusion
}

#' Coerce Fusion object to data.frame
#'
#' This function is used in create_fusion_report() to convert Fusion objects to a
#' data.frame-format.
#'
#' @param fusion The Fusion object to coerce.
#'
#' @return A data.frame with the fusion object.
#'
#' @seealso create_fusion_report
#'
#' @examples
#' # Load data
#' defuse833ke <- system.file(
#'   "extdata",
#'   "defuse_833ke_results.filtered.tsv",
#'   package="chimeraviz")
#' fusions <- import_defuse(defuse833ke, "hg19", 1)
#' # Find the fusion object to create a data frame from
#' fusion <- get_fusion_by_id(fusions, 5267)
#' # Create the data frame
#' dfFusion <- fusion_to_data_frame(fusion)
#'
#' @export
fusion_to_data_frame <- function(fusion) {

  # Check if we got a list of fusion objects
  if (class(fusion) != "Fusion") {
    stop("fusions argument must be a list of Fusion objects")
  }

  df <- data.frame(
    fusion@id,
    fusion@gene_upstream@name,
    fusion@gene_upstream@ensembl_id,
    fusion@gene_upstream@breakpoint,
    fusion@gene_downstream@name,
    fusion@gene_downstream@ensembl_id,
    fusion@gene_downstream@breakpoint,
    fusion@split_reads_count,
    fusion@spanning_reads_count)
  names(df) <- c(
    "id",
    "gene_upstream",
    "ensembl_upstream",
    "breakpoint_upstrea,",
    "gene_downstream",
    "ensembl_downstream",
    "breakpoint_downstream",
    "Split Reads",
    "Spanning Reads")
  df
}

#' Select which transcript to use (for plotting) for a GenePartner object
#'
#' This function takes a GenePartner object and creates a transcript data.frame
#' with transcript information, including only the transcripts given by the
#' parameter which_transcripts
#'
#' select_transcript() selects which transcript to create by this prioritization:
#'
#' 1. Exon boundary transcripts.
#' 2. Within exon transcripts.
#' 3. Within intron transcripts.
#' 4. Intergenic transcripts.
#'
#' @param gene_partner The GenePartner object to select a transcript for.
#' @param which_transcripts This character vector decides which transcripts are
#' to be plotted. Can be "exonBoundary", "withinExon", "withinIntron",
#' "intergenic", or a character vector with specific transcript ids. Default
#' value is "exonBoundary".
#'
#' @return A data.frame with transcript data.
#'
#' @examples
#' # Load data and example fusion event
#' defuse833ke <- system.file(
#'   "extdata",
#'   "defuse_833ke_results.filtered.tsv",
#'   package="chimeraviz")
#' fusions <- import_defuse(defuse833ke, "hg19", 1)
#' fusion <- get_fusion_by_id(fusions, 5267)
#' # Load edb
#' edbSqliteFile <- system.file(
#'   "extdata",
#'   "Homo_sapiens.GRCh37.74.sqlite",
#'   package="chimeraviz")
#' edb <- ensembldb::EnsDb(edbSqliteFile)
#' # Get transcripts
#' fusion <- get_transcripts_ensembl_db(fusion, edb)
#' # Select transcript
#' transcriptsA <- select_transcript(upstream_partner_gene(fusion))
#'
#' @export
select_transcript <- function(
  gene_partner,
  which_transcripts = "exonBoundary") {

  # Check if we got a PartnerGene object
  if (class(gene_partner) != "PartnerGene") {
    stop("genePartner argument must be an object of type PartnerGene")
  }

  # Does the PartnerGene have any transcripts?
  if (isEmpty(gene_partner@transcripts)) {
    stop("genePartner has no transcripts. See get_transcripts_ensembl_db()")
  }

  # Either select from one of these four categories
  # 1. Exon boundary transcripts.
  # 2. Within exon transcripts.
  # 3. Within intron transcripts.
  # 4. Intergenic transcripts.
  #
  # or select the specific transcripts given by which_transcripts.

  transcript_categories <- c(
    "exonBoundary",
    "withinExon",
    "withinIntron",
    "intergenic")

  if (which_transcripts[[1]] %in% transcript_categories) {

    message(paste0("Selecting transcripts for ", gene_partner@name, ".."))

    # If the user has chosen one of the four transcript categories, then we
    # want to check whether or not such transcripts exist. If they exist,
    # simply return them. If they don't exist, go on to try the other
    # categories. Try the wanted category first:
    transcripts_of_wanted_category <-
      gene_partner@transcripts[
        mcols(gene_partner@transcripts)$
          transcript_category == which_transcripts[[1]]
      ]
    if (length(transcripts_of_wanted_category) > 0) {
      message(paste0("..found transcripts of type ", which_transcripts[[1]]))
      return(transcripts_of_wanted_category)
    }
    # Check the remaining categories
    remaining_categories <- transcript_categories[
      transcript_categories != which_transcripts[[1]]
    ]
    for (transcript_category in remaining_categories) {
      transcripts_in_this_category <- gene_partner@transcripts[
        mcols(gene_partner@transcripts)$
          transcript_category == transcript_category
      ]
      if (length(transcripts_in_this_category) > 0) {
        message(paste0("..found transcripts of type ", transcript_category))
        return(transcripts_in_this_category)
      }
    }
  }

  # At this point the user wants specific transcripts. Get the transcripts that
  # we have for this genePartner.

  specific_transcripts <- gene_partner@transcripts[
    names(gene_partner@transcripts) %in% which_transcripts]
  if (length(specific_transcripts) > 0) {
    return(specific_transcripts)
  }

  stop("The specific transcripts could not be found")
}

# Check that there's at least one transcript that has the fusion breakpoint
# within the transcript.
.check_that_breakpoints_are_within_transcripts <- function(
  fusion,
  transcripts_upstream,
  transcripts_downstream) {
  if (class(transcripts_upstream) == "CompressedGRangesList") {
    if (!any(start(transcripts_upstream)@unlistData <
             fusion@gene_upstream@breakpoint) &
        any(fusion@gene_upstream@breakpoint <
            end(transcripts_upstream)@unlistData)) {
      stop(paste0(
        "None of the transcripts given for gene A has the fusion breakpoint ",
        "within them. This plot cannot be created with the given transcripts."))
    }
  } else if (class(transcripts_upstream) == "GRanges") {
    if (!any(start(transcripts_upstream) <
             fusion@gene_upstream@breakpoint) &
        any(fusion@gene_upstream@breakpoint <
            end(transcripts_upstream))) {
      stop(paste0(
        "None of the transcripts given for gene A has the fusion breakpoint ",
        "within them. This plot cannot be created with the given transcripts."))
    }
  } else {
    stop(paste0(
      "Unknown input to .check_that_breakpoints_are_within_transcripts."))
  }
  if (class(transcripts_downstream) == "CompressedGRangesList") {
    if (!any(start(transcripts_downstream)@unlistData <
             fusion@gene_downstream@breakpoint) &
        any(fusion@gene_downstream@breakpoint <
            end(transcripts_downstream)@unlistData)) {
      stop(paste0(
        "None of the transcripts given for gene B has the fusion breakpoint ",
        "within them. This plot cannot be created with the given transcripts."))
    }
  } else if (class(transcripts_downstream) == "GRanges") {
    if (!any(start(transcripts_downstream) <
             fusion@gene_downstream@breakpoint) &
        any(fusion@gene_downstream@breakpoint <
            end(transcripts_downstream))) {
      stop(paste0(
        "None of the transcripts given for gene B has the fusion breakpoint ",
        "within them. This plot cannot be created with the given transcripts."))
    }
  } else {
    stop(paste0(
      "Unknown input to .check_that_breakpoints_are_within_transcripts."))
  }
}

# Check that the transcripts have a breakpoint exon
.check_that_transcripts_have_breakpoint_exons <- function(
  fusion,
  transcript_upstream,
  transcript_downstream) {

  if (fusion@gene_upstream@strand == "+") {
    if (
      length(
        transcript_upstream[
          end(ranges(transcript_upstream)) == fusion@gene_upstream@breakpoint
        ]
      ) == 0
    ) {
      stop(paste(
        "The transcript for gene A doesn't have an exon boundary matching the",
        "fusion breakpoint. The plot cannot be created."))
    }
  } else {
    if (
      length(
        transcript_upstream[
          start(ranges(transcript_upstream)) == fusion@gene_upstream@breakpoint
        ]
      ) == 0
    ) {
      stop(paste(
        "The transcript for gene A doesn't have an exon boundary matching the",
        "fusion breakpoint. The plot cannot be created."))
    }
  }
  if (fusion@gene_downstream@strand == "+") {
    if (
      length(
        transcript_downstream[
          start(ranges(transcript_downstream)) ==
          fusion@gene_downstream@breakpoint
        ]
      ) == 0
    ) {
      stop(paste(
        "The transcript for gene B doesn't have an exon boundary matching the",
        "fusion breakpoint. The plot cannot be created."))
    }
  } else {
    if (
      length(
        transcript_downstream[
          end(ranges(transcript_downstream)) ==
          fusion@gene_downstream@breakpoint
        ]
      ) == 0
    ) {
      stop(paste(
        "The transcript for gene B doesn't have an exon boundary matching the",
        "fusion breakpoint. The plot cannot be created."))
    }
  }
}

#' Remove introns and shift exons leftward
#'
#' This function takes a GRanges object and moves each IRanges object within
#' next to each other starting at 1. This effectively removes the introns from
#' the GRanges object.
#'
#' @param transcript The GRanges object to remove introns from.
#'
#' @return A GRanges object with introns removed.
#'
#' @examples
#' # Create a simple GRanges object:
#' gr <- IRanges::IRanges(
#'   start = c(13, 40, 100),
#'   end = c(20, 53, 110))
#' # Downshift it and see the introns are removed:
#' down_shift(gr)
#'
#' @export
down_shift <- function(transcript) {

  # Check if we got a GRanges object
  if (!class(transcript) %in% c("GRanges", "IRanges")) {
    stop("transcript argument must be an object of type GRanges")
  }

  for (i in seq_along(transcript)) {
    if (i == 1) {
      transcript[i] <- IRanges::shift(
        transcript[i],
        shift = 1 - start(transcript[i])
      )
    } else {
      shift_down_by <- - (start(transcript[i]) - end(transcript[i - 1])) + 1
      transcript[i] <- IRanges::shift(
        transcript[i],
        shift = shift_down_by
      )
    }
  }

  transcript
}

# -----------------------------------------------------------------------------
# Functions that validate parameters passed to functions in chimeraviz

.is_fusion_valid <- function(argument_checker, fusion) {
  # Check if we got a fusion object
  checkmate::assert_class(x = fusion,
                          classes = "Fusion",
                          add = argument_checker)
  argument_checker
}

.is_edb_valid <- function(argument_checker, edb, fusion) {
  if (!is.null(edb)){
    checkmate::assert_class(x = edb,
                            classes = "EnsDb",
                            add = argument_checker)
  } else {
    if (isEmpty(fusion@gene_upstream@transcripts)) {
      argument_checker$push(
        paste0("There are no transcipts for gene A. Please provide an ",
                     "EnsDb object to the edb parameter, or see",
               "get_transcripts_ensembl_db().")
      )
    } else if (isEmpty(fusion@gene_downstream@transcripts)) {
      argument_checker$push(
        paste0("There are no transcipts for gene B. Please provide an ",
               "EnsDb object to the edb parameter, or see",
               "get_transcripts_ensembl_db().")
      )
    }
  }
  argument_checker
}

.is_bamfile_valid_allow_null <- function(argument_checker, bamfile) {
  if (!is.null(bamfile)){
    checkmate::assert_file_exists(x = bamfile,
                                  add = argument_checker)
  }
  argument_checker
}

.is_bamfile_valid <- function(argument_checker, bamfile) {
  # Check that the argument is given
  if (is.null(bamfile)) {
    argument_checker$push(
      "'bamfile' must be the path to a .BAM file."
    )
  }
  # Check that the file exists
  if (!is.null(bamfile)) {
    checkmate::assert_file_exists(x = bamfile,
                                  add = argument_checker)
  }
  argument_checker
}

.is_bedfile_valid <- function(argument_checker, bedfile) {
  # Check that the argument is given
  if (is.null(bedfile)) {
    argument_checker$push("'bedfile' must be the path to a .BED file.")
  }
  # Check that the file exists
  if (!is.null(bedfile)) {
    checkmate::assert_file_exists(x = bedfile,
                                  add = argument_checker)
  }
  argument_checker
}

.is_bedgraphfile_valid <- function(argument_checker, bedgraphfile) {
  # Check that the argument is given
  if (is.null(bedgraphfile)) {
    argument_checker$push("'bedgraphfile' must be the path to a .bedGraph file.")
  }
  # Check that the file exists
  if (!is.null(bedgraphfile)) {
    checkmate::assert_file_exists(x = bedgraphfile,
                                  add = argument_checker)
  }
  argument_checker
}

.is_bamfile_bedgraphfile_valid <- function(
  argument_checker,
  bamfile,
  bedgraphfile) {
  # Either bamfile or bedgraphfile can be given, not both
  bamfile_given <- !is.null(bamfile)
  bedgraphfile_given <- !is.null(bedgraphfile)
  if (bamfile_given && bedgraphfile_given) {
    argument_checker$push(
      "Either 'bamfile' or 'bedgraphfile' must be given, not both."
    )
  } else if (!bamfile_given && !bedgraphfile_given) {
    # It is OK to not give any of them
  } else if (bamfile_given) {
    .is_bamfile_valid(argument_checker, bamfile)
  } else {
    .is_bedgraphfile_valid(argument_checker, bedgraphfile)
  }
  argument_checker
}

.is_which_transcripts_valid <- function(
  argument_checker,
  which_transcripts,
  fusion
) {
  if (class(which_transcripts) != "character") {
    argument_checker$push(
      paste0("'which_transcripts' must be a character (or a character ",
             "vector) holding the desired transcript category or the ",
             "names of specific transcripts.")
    )
  }
  # Is which_transcripts valid?
  transcript_categories <- c(
    "exonBoundary",
    "withinExon",
    "withinIntron",
    "intergenic"
  )
  # Check if the transcript(s) given actually exist, either in
  # transcript_categories, in geneA, or in geneB
  for (i in seq_along(which_transcripts)) {
    if (
      !which_transcripts[[i]] %in% transcript_categories &&
      !which_transcripts[[i]] %in% names(fusion@gene_upstream@transcripts) &&
      !which_transcripts[[i]] %in% names(fusion@gene_downstream@transcripts)
    ) {
      argument_checker$push(
        paste0("No transcript with name ", which_transcripts[[i]],
               " was found.")
      )
    }
  }
  argument_checker
}

.is_ylim_valid <- function(argument_checker, ylim) {
  if (class(ylim) != "numeric" || length(ylim) != 2) {
    argument_checker$push(
      "'ylim' must be a numeric vector of length 2"
    )
  }
  argument_checker
}

.is_parameter_boolean <- function(
  argument_checker,
  parameter,
  parameter_name
) {
  checkmate::assert_logical(x = parameter,
                            .var.name = parameter_name,
                            add = argument_checker)
  argument_checker
}

.is_character_parameter_valid <- function(
  argument_checker,
  parameter,
  parameter_name
) {
  checkmate::assert_character(x = parameter,
                              .var.name = parameter_name,
                              add = argument_checker)
  argument_checker
}

.is_nucleotide_amount_valid <- function(
  argument_checker,
  nucleotide_amount,
  fusion
) {

  fusion_junction_sequence_length <-
    length(fusion@gene_upstream@junction_sequence) +
    length(fusion@gene_downstream@junction_sequence)

  if (class(nucleotide_amount) != "numeric" ||
      nucleotide_amount <= 0 ||
      nucleotide_amount > fusion_junction_sequence_length) {
    argument_checker$push(
      paste0(
        "'nucleotide_amount' must be a numeric bigger than or equal ",
        "to 0 and less than or equal to the fusion junction ",
        "sequence length."
      )
    )
  }

  if (fusion_junction_sequence_length == 0) {
    argument_checker$push(
      msg = paste0(
        "length of junction sequence is 0, have they been determined?"
      )
    )
  }
  argument_checker
}

# End of functions that validate parameters passed to functions in chimeraviz
# -----------------------------------------------------------------------------

.get_transcripts_if_not_there <- function(fusion, edb) {
  # Establish a new 'checkmate' object
  argument_checker <- checkmate::makeAssertCollection()
  # Check parameters
  argument_checker <- .is_fusion_valid(argument_checker, fusion)
  argument_checker <- .is_edb_valid(argument_checker, edb, fusion)
  # Return errors and warnings (if any)
  checkmate::reportAssertions(argument_checker)

  if (
    isEmpty(fusion@gene_upstream@transcripts) ||
    isEmpty(fusion@gene_downstream@transcripts)
  ) {
    message("Fetching transcripts for gene partners..")
    fusion <- get_transcripts_ensembl_db(fusion, edb)
    message("..transcripts fetched.")
  }
  fusion
}
stianlagstad/chimeraviz documentation built on Dec. 3, 2023, 8:11 p.m.