R/Ularcirc_standalone.R

Defines functions Junction_Sequence_from_Genome bsj_fastq_generate sequence_from_exon_coords bsj_to_circRNA_sequence BSJ_details

Documented in BSJ_details bsj_fastq_generate bsj_to_circRNA_sequence Junction_Sequence_from_Genome sequence_from_exon_coords

## Could make a function that returns a couple of circRNA sequences? Could be a resource
circRNA_seq_example <- "GGAAGAGGAAGAACGTCTGAGAAATAAAATTCGAGCTGATCATGAGAAGGCCTTGGAAGAAGCAAAAGAAAAATTAAGAAAGTCAAGAGAGGAAATTCGAGCAGAAATTCAGACAGAGAAAAATAAGGTAGTCCAAGAAATGAAGATAAAAGAGAACAAGCCACTGCCACCAGTCCCTATTCCCAACCTTGTAGGAATACGTGGTGGAGACCCAGAAGATAATGACATAAGAGAGAAAAGGGAAAAAATTAAAGAGATGATGAAACATGCTTGGGATAACTATAGGACATATGGGTGGGGACATAATGAACTCAGACCTATTGCAAGGAAAGGACACTCCCCTAACATATTTGGAAGTTCACAAATGGGTGCTACCATAGTAGATGCTTTGGATACCCTTTATATCATGGGACTTCATGATGAATTCCTAGATGGGCAAAGATGGATTGAAGACAACCTTGATTTCAGTGTGAATTCAGAGGTGTCTGTGTTTGAAGTCAACATTCGATTTATTGGAGGCCTACTTGCAGCATATTACCTATCAGGAGAGGAG"

###############################################################################################
#'
#' BSJ_details
#' This function returns details of a BSJ string and returns a list of coordinates. Can accept two
#' different formats, Ularcirc or  generic.
#'
#' @param BSJ : backsplice junction as a string. See details below for example formats
#'
#' @examples
#' bsj <-  'chr14_99465814_chr14_99458278'   # Historic Ularcirc format
#'
#' bsj <- c("chr14_99465814_chr14_99458278","chr22_20933778_chr22_20934245",
#'          "chr12_120155720_chr12_120154969", "chr4_143543508_chr4_143543973",
#'          "chr10_7285955_chr10_7276891")
#'
#' BSJ_details(bsj)
#'
#' bsj <- 'chr10:100923974-100926020:+'     # generic format
#' BSJ_details(bsj)
#'
#' @export
#'
BSJ_details <- function(BSJ)
{

  if (length(gregexpr("_",BSJ)[[1]]) == 3) # Ularcirc format  eg chr14_99465814_chr14_99458278
  {
    BSjuncDetails <- strsplit(BSJ, split = "_")
    BSJ_acceptor <- as.numeric(unlist(lapply(BSjuncDetails, FUN = function(x){min(x[c(2,4)])})))
    BSJ_donor <- as.numeric(unlist(lapply(BSjuncDetails, FUN = function(x){max(x[c(2,4)])})))
    BSJ_chrom <- unlist(lapply(BSjuncDetails, FUN = function(x) return(x[1]) ))
    strand <- unlist(lapply(BSjuncDetails, FUN = function(x)
                                      {  BSJ_strand <- '+'

                                         if (x[2] > x[4])
                                            BSJ_strand <- '-'
                                          return(BSJ_strand)
                                           }))

  }
  else if (length(gregexpr(":",BSJ)[[1]]) == 2) # generic format eg chr10:100923974-100926020:+
  {
    BSjuncDetails <- strsplit(BSJ, split = ":")
    BSJ_chrom <- lapply(BSjuncDetails,FUN = function(x) { x[1]})
    coordinates <- lapply(BSjuncDetails,FUN = function(x) { strsplit(x[2],split="-")})
    BSJ_acceptor <- as.numeric(lapply(coordinates, FUN = function(x) { min(unlist(x)) }))
    BSJ_donor <- as.numeric(lapply(coordinates, FUN = function(x) { max(unlist(x)) }))
    strand <- lapply(BSjuncDetails,FUN = function(x) { x[3]})
  }
  else
  {
    warning("BSJ not in the correct format. Did not detect separating characters.")
    return(NULL)
  }
  if ((length(BSJ_donor) == 0) || (length(BSJ_acceptor) ==0 ))
  {
    warning("Could not extract coordinates from BSJ (please check). Aborting")
    return(NULL)
  }
  return(list(BSJ_chrom = BSJ_chrom, BSJ_donor=BSJ_donor, BSJ_acceptor=BSJ_acceptor, strand=strand))
}




################################################################################################
#' bsj_to_circRNA_sequence
#'
#' Takes one BSJ coordinate and generates a predicted circular RNA sequence.
#' @param BSJ : BSJ coordinate in the format of chr_coordinate_chr_coorindate OR chr:coordinate-coorindate:strand.
#' @param geneID : The gene ID that the BSJ aligns to. Not essential as this can
#' be identified from the BSJ coordinate, however time performance of function improved
#' if this information can be provided.
#' @param genome : Is the length f the library fragment
#' @param TxDb : The sequence read length
#' @param annotationLibrary : annotation database. See details for example.
#' @param reduce_candidates : IF multiple exon entries align to a single BSJ then either return
#' longest entry (TRUE) or all entries (FALSE)
#' @param shiny : If TRUE then will setup shiny progress bars. Default is FALSE where a standard
#'  text progress bar is used.
#' @return Returns a DNAstring object.
#' @details
#' Backsplice junction coordinates are typically reported as a character string. Two formats
#' are recognised, ":" delimited (eg circExplorer, CIRI) or "_" delimited (Ularcirc). The
#' BSJ genomic coordinates are compared against the supplied gene model and exonic sequences
#' from matching splice junctions are concatenated. This means the BSJ is the first and last
#' nucleotide of the returned sequence. The current implementation will automatically check 0 or
#' 1 base coordinates and any match is returned.
#'
#' In some cases one BSJ will match multiple exon combinations. The default setting is to return
#' the longest sequence. Alternatively all possibilities can be returned by setting
#' reduce_candidates to FALSE. BSJ candidates that align to multiple exon combinations are
#' added to duplicated list.
#'
#' BSJ that do not align to any canonical junctions are returned as failed.
#'
#' @examples
#'
#' library('Ularcirc')
#' TxDb <- TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene
#' genome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
#' annotationLibrary <- org.Hs.eg.db::org.Hs.eg.db
#'
#' # Define BSJ. Following two formats are accepted
#' BSJ <- 'chr2:40430305-40428472:-'       # SLC8A1
#' BSJ  <- 'chr2_40430305_chr2_40428472'   # SLC8A1
#'
#' circRNA_sequence <- bsj_to_circRNA_sequence(BSJ, "SLC8A1", genome,TxDb, annotationLibrary)
#'
#' # You can also retrieve sequence without passing gene annotation - but this is slower
#' # circRNA_sequence <- bsj_to_circRNA_sequence(BSJ, NULL, genome,TxDb, annotationLibrary)
#'
#' TxDb <- TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene
#' genome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
#' # EXAMPLE1 (3 fail and 2 will produce sequences)
#' BSJ <- c("chr14_99465814_chr14_99458278","chr22_20933778_chr22_20934245",
#'          "chr12_120155720_chr12_120154969", "chr4_143543508_chr4_143543973",
#'          "chr10_7285955_chr10_7276891")
#' GeneIDs <- c("SMARCA5","MSLN","RNF138","KIAA0368","CRKL")
#' circRNA_sequence <- bsj_to_circRNA_sequence(BSJ, GeneIDs, genome,TxDb, annotationLibrary)
#'
#' # Returns a list with three items:
#' # (1) "identified" is a list of DNA strings from BSJ that aligned to FSJ coordinates of the gene model
#' # (2) "failed" is a character object of BSJ that did not align to FSJ coordinates of gene model. Each entry is
#' # named with gene ID.
#' # (3) "duplicates" (not implemented yet) identifies which BSJ returned multiple sequences
#' @export
#'
# load(file="c:/TEMP/TestData.RData")  # BSJ and GeneID in circExplorer2 format
#  bsj_to_circRNA_sequence(BSJ[1:100], as.character(GeneID[1:100]), genome=genome_hg38, TxDb=TxDb_hg38,annotationLibrary = annotationLibrary)
#  Does not work on unannotated data.
#
#
#  CURRENTLY ANNOTATION FOR MULTIPLE ENTRIES DOES NOT WORK
bsj_to_circRNA_sequence <- function(BSJ, geneID=NULL, genome, TxDb,
                                    annotationLibrary, reduce_candidates = TRUE, shiny=FALSE)
{
  lookupID <- {}
  BSJ_donor <- {}
  BSJ_acceptor <- {}

  # Need to distinguish the following formats
  # chr10:100923974-100926020:+
  # chr11_33286412_chr11_33287512

  if (length(gregexpr("_",BSJ)[[1]]) == 3) # Ularcirc format  eg chr14_99465814_chr14_99458278
  {
    BSjuncDetails <- strsplit(BSJ, split = "_")
    BSJ_donor <- as.numeric(unlist(lapply(BSjuncDetails, FUN = function(x){min(x[c(2,4)])})))
    BSJ_acceptor <- as.numeric(unlist(lapply(BSjuncDetails, FUN = function(x){max(x[c(2,4)])})))
  }
  else if (length(gregexpr(":",BSJ)[[1]]) == 2) # generic format eg chr10:100923974-100926020:+
  {
    BSjuncDetails <- strsplit(BSJ, split = ":")
    coordinates <- lapply(BSjuncDetails,FUN = function(x) { strsplit(x[2],split="-")})
    BSJ_donor <- as.numeric(lapply(coordinates, FUN = function(x) { min(unlist(x)) }))
    BSJ_acceptor <- as.numeric(lapply(coordinates, FUN = function(x) { max(unlist(x)) }))
  }
  else
  {
    warning("BSJ not in the correct format. Did not detect separating characters.")
    return(NULL)
  }
  if ((length(BSJ_donor) == 0) || (length(BSJ_acceptor) ==0 ))
  {
    warning("Could not extract coordinates from BSJ (please check). Aborting")
    return(NULL)
  }


  if (is.null(geneID))  # Identify potential gene coordinate
  { warning("This function currently requires annotated data. Please ensure you use parameter geneID")
    return(NULL)
    g_GR <- GenomicFeatures::genes(TxDb)
    strand <- "*"
    bs_junc_gr <- GenomicRanges::GRanges(seqnames=BSjuncDetails[[1]][1], ranges = as.numeric(min(BSjuncDetails[[1]][c(2,4)]),min(BSjuncDetails[[1]][c(2,4)])),strand = strand)
    t_start <- GenomicAlignments::findOverlaps(BiocGenerics::invertStrand(bs_junc_gr),g_GR, type=c("within"))
    bs_junc_gr <- GenomicRanges::GRanges(seqnames=BSjuncDetails[[1]][1], ranges = as.numeric(max(BSjuncDetails[[1]][c(2,4)]),max(BSjuncDetails[[1]][c(2,4)])),strand = strand)
    t_end <- GenomicAlignments::findOverlaps(BiocGenerics::invertStrand(bs_junc_gr),g_GR, type=c("within"))

    entrezID <- c("Novel")
    #	  browser()
    if ((length(t_start) > 0) && (length(t_end) > 0))
    {	entrezID_start <- g_GR[S4Vectors::subjectHits(t_start)]$gene_id
      entrezID_end <- g_GR[S4Vectors::subjectHits(t_end)]$gene_id
      entrezID <- intersect(entrezID_start, entrezID_end)
      lookupID <- entrezID
    }

  }


  if (! is.null(geneID))  # Gene name was provided. Trust this and obtain entrezID
  { message("Extracting entrez lookup ID")
    if (shiny)
    { shiny::incProgress(1/4, detail = paste("Extracting entrez lookup ID")) }

    a <- try(AnnotationDbi::select(annotationLibrary, keys = geneID, columns=c("ENTREZID", "SYMBOL", "ENSEMBL"),keytype="SYMBOL"),silent=TRUE)
    if(length(grep(pattern = "Error", x = a)))
    {   # cannot continue
      warning(paste(a,"Error obtaining annotation information.","\n"))
      return(-1)
    }
    lookupID <- a$ENTREZID   # Default lookup
  }

  if (length(BSJ) != length(geneID))
  {
    warning("Length of BSJ is not same as length of geneID, please check.")
    return(NULL)
  }
  else
  { names(BSJ) <- geneID }


  genes_to_consider <- length(geneID)
  if(genes_to_consider > 1)
    reduce_candidates <- FALSE

  if (! is.null(lookupID))
  { message("Extracting exon coordinates")
    if (shiny)
    { shiny::incProgress(1/4, detail = paste("Extracting exon coordinates")) }

    # Create exon table
    b <- try(AnnotationDbi::select(TxDb, keys = lookupID, columns=c('GENEID', 'TXCHROM', 'EXONSTART',  'EXONEND','TXID', 'EXONSTRAND'),keytype="GENEID"),silent=TRUE)
    if(length(grep(pattern = "Error", x = b)))
    {   # cannot continue
      warning(paste(b,"Error obtaining annotation information.","\n"))
      return(-1)
    }

    b <- b[,c("GENEID","TXCHROM","EXONSTART","EXONEND","TXID","EXONSTRAND")]
    names(b) <- c('gene', 'chrom', 'start', 'stop', 'txid','strand')
    b <- data.table::as.data.table(b)

    # Need to work through all entries (list) to extract possible exon boundaries
    if (shiny)
    { shiny::incProgress(1/4, detail = paste("Short listing exons")) }

    message("short listing exons")
    pb <- R.utils::ProgressBar(max=length(BSJ_donor))
    R.utils::reset(pb)
    circRNA_exons <- {}
    failed_BSJ <- {}
    for (i in seq_along(1:length(BSJ_donor)))
    {
      idx <- b$start >= BSJ_donor[i] & b$stop <= BSJ_acceptor[i]
      possible_exons <- b[idx,]

      # select candidates. In some situations coordinates may be entered in 0 or 1 base.
      true_candidates_stop <- which(abs(possible_exons$start - BSJ_donor[i]) == 1 |
                                      abs(possible_exons$start - BSJ_donor[i]) == 0)
      true_candidates_start <- which(abs(possible_exons$stop - BSJ_acceptor[i]) == 1 |
                                       abs(possible_exons$stop - BSJ_acceptor[i]) == 0 )

      true_candidate_IDs <- intersect(possible_exons$gene[true_candidates_start],
                                      possible_exons$gene[true_candidates_stop])
      exon_idx <- possible_exons$gene %in% true_candidate_IDs

      if (length(exon_idx) == 0)
      { failed_BSJ <- c(failed_BSJ, BSJ[i]) }

      if (length(circRNA_exons) == 0)
      { circRNA_exons <- possible_exons[exon_idx,]}
      else
      {
        circRNA_exons <- rbind(circRNA_exons, possible_exons[exon_idx,])
      }
      R.utils::increase(pb)
    }  # for (i in seq_along(1:length(BSJ_donor)))

    if (nrow(circRNA_exons) == 0)
    {
      warning("No exon junctions aligned with BSJ coorindates, therfore no sequences recovered")
      return(NULL)
    }
    ### Now to work out maximum length by sifting through tx entries and adding up exon lengths
    circRNA_exon_lengths <-by(circRNA_exons,circRNA_exons$gene,identity )  # This makes a list of all transcripts
    circRNA_sizes <- lapply(X = circRNA_exon_lengths,FUN = function(x) {  return(abs(sum(x$start-x$stop)))})
    circRNA_sizes_idx <- order(unlist(circRNA_sizes), decreasing = TRUE)


    # Sometimes a single BSJ can align to multiple exons within a transcript.
    # In these situations will choose the longest entry as the candidate circRNA entry.
    Exons_of_Interest <- circRNA_exons  # default is to select all entries
    duplicated_BSJ <- {}
    if (reduce_candidates)
    { largest_transcript_ID <- names(circRNA_sizes[circRNA_sizes_idx])[1]
      transcript_ID_idx <- which(names(circRNA_exon_lengths) == largest_transcript_ID)  # Get Index of transcript ID name
      transcript_ID_idx <- which(circRNA_exons$gene == names(circRNA_exon_lengths)[transcript_ID_idx])  # Get Row index(es) corresponding to transcript
      Exons_of_Interest <- circRNA_exons[transcript_ID_idx,]
      Exons_of_Interest <- Exons_of_Interest[ ! duplicated(Exons_of_Interest)]  # Sometimes there is duplicated records.
    }
    else # Will return all candidates
    { # would be nice to record how many possible duplicate entries
    #  all_entries <- table(names(circRNA_sizes))


    }

    message("Extracting sequence")
    if (shiny)
    { shiny::incProgress(1/4, detail = paste("Extracting sequence")) }
    else
    {
      pb <- R.utils::ProgressBar(max=length(circRNA_exon_lengths))
      R.utils::reset(pb)
    }

     all_seq <- Biostrings::getSeq(genome,Exons_of_Interest$chrom,start=Exons_of_Interest$start, end=Exons_of_Interest$stop, strand=Exons_of_Interest$strand)
     names(all_seq) <- Exons_of_Interest$gene
     exons_to_stitch <- table(Exons_of_Interest$gene)
     temp <- by(data.frame(as.character(all_seq), names(all_seq)), names(all_seq), identity)
     all_circRNA_seq <- lapply(temp, FUN= function(x) {  R.utils::increase(pb);
          paste(as.character(x[,1]) ,sep="",collapse = "")})

#    message("Extracting sequence - method 2")
#    R.utils::reset(pb)
#    circRNA_exon_lengths <- by(circRNA_exons,Exons_of_Interest$gene,identity )
#    circRNA_sequence <- lapply(X = circRNA_exon_lengths, FUN= function(x) {     R.utils::increase(pb); sequence_from_exon_coords(genome, x) })

    return(list(identified=all_circRNA_seq, failed=failed_BSJ, duplicates=duplicated_BSJ))
  } # if (! is.null(lookupID))

  warning("Cannot find or match gene ID")
  return(NULL)

}

#################################################################333
#' sequence_from_exon_coords
#'
#' @param genome : genome object
#' @param exon_df : data frame of exons. Must have column with names "chrom", "start",
#'    "stop", "strand"
#'
sequence_from_exon_coords <- function(genome, exon_df)
{
  seq <- ''
  if (nrow(exon_df) < 1)
  { return(seq) }


  tmp <- as.character(Biostrings::getSeq(genome,exon_df$chrom,
                                           start=exon_df$start,
                                           end=exon_df$stop,
                                           strand = exon_df$strand)  )
  seq <- paste(tmp,sep="",collapse = "")

  return(seq)
}


####
#' bsj_fastq_generate
#'
#' Takes a circRNA predicted sequence and generates synthetic short sequence reads
#' @param circRNA_Sequence : Linear sequence of a circRNA. i.e. the backspice junction
#'               is the first and last base of this sequence
#' @param fragmentLength : Is the length the library fragment
#' @param readLength : The sequence read length
#' @param variations : Number of sequences returned for each read type. Note each
#' sequence variation will start at a unique location (where possible)
#' @param headerID : Character identifier that will be incorporated into sequence header
#' @return Returns a list of two DNAstring sets labelled "read1" and "read2" which correspond
#' to forward and reverse read pairs.
#'
#' @examples
#'
#' library('Ularcirc')
#'
#'
#' # Generate a 500nt sequence containing A and which is flanked with GG and CC.
#' circRNA_Sequence <- paste(rep('A',500),collapse='')
#' circRNA_Sequence <- paste('GG',circRNA_Sequence, 'CC', sep='')
#' # The GG and CC ends of sequence represent ends of linear exons that are circularised.
#' # Therefore the backsplice junction (BSJ) is GGCC.
#' # Generate reads that alternate over this BSJ
#'
#' fastqReads <- bsj_fastq_generate(circRNA_Sequence, fragmentLength=300, readLength=100,
#'                variations = 4,   # Four type I , II, III, and IV reads generated
#'                headerID='circRNA_example')  # Identifier incorporated in name of each sequence
#' # The following will indicate 12 sequences are present in each list entry
#' length(fastqReads$read1)
#' length(fastqReads$read2)
#'
#' # Can create fastq file as follows
#' Biostrings::writeXStringSet( fastqReads$read1,"circRNA_Sample_R1.fastq.gz",
#'                               compress = TRUE, format="fastq")
#' Biostrings::writeXStringSet( fastqReads$read2,"circRNA_Sample_R2.fastq.gz",
#'                               compress = TRUE, format="fastq")
#' @import Biostrings
#'
#' @export
bsj_fastq_generate <- function(circRNA_Sequence, fragmentLength=300, readLength=100, variations = 4, headerID='')
{
  if (variations < 1)
  { warning("Number of fragment variations must be 1 or more. Resetting to 1")
    variations <- 1
  }
  # Variations: Number of read variations prepared for each read type.
  #
  circ_length <- nchar(circRNA_Sequence)
  if (fragmentLength > circ_length)
  {
    warning("Fragment length is larger than circRNA length. Please check input sequence. Returning NULL.")
    return(NULL)
  }

  circRNA_Sequence <- paste(circRNA_Sequence, circRNA_Sequence, sep = "")
  typeII_III_offset_step_size <- round(readLength/(variations + 1))
  typeIV_gap_size <- fragmentLength - (2 * readLength)
  typeIV_offset_step_size <- round(typeIV_gap_size / (variations + 1))



  if (fragmentLength <= readLength)
  { warning("fragment length is shorter than or equal to read length.
            Increasing fragment length to read length + 1")
    fragmentLength <- readLength + 1
  }

  # Calculate offset to generate the appropriate alignment types.
  typeII_offset  <- readLength - typeII_III_offset_step_size
  typeIII_offset <- fragmentLength - typeII_III_offset_step_size
  typeIV_offset  <- readLength + typeIV_gap_size - typeIV_offset_step_size
  common_label <- paste("_F",fragmentLength,"_R",readLength, sep="")

  read_one <- {};  read_two <- {};
  # Generate Type II reads
  start_pos <- circ_length - typeII_offset
  for (i in 1:variations)
  { read_one <- c(read_one, substr(x = circRNA_Sequence, start = start_pos, stop = start_pos + readLength))
  read_two <- c(read_two, substr(x = circRNA_Sequence, start = start_pos+fragmentLength-readLength, stop = start_pos + fragmentLength))
  names(read_one)[i] <- paste(headerID,"_typeII_",start_pos, common_label,sep="")
  start_pos <- start_pos + typeII_III_offset_step_size
  }

  # Generate Type III reads
  start_pos <- circ_length - typeIII_offset
  for (i in 1:variations)
  { read_one <- c(read_one, substr(x = circRNA_Sequence, start = start_pos, stop = start_pos + readLength))
    read_two <- c(read_two, substr(x = circRNA_Sequence, start = start_pos+fragmentLength-readLength, stop = start_pos + fragmentLength))
    names(read_one)[length(read_one)] <- paste(headerID,"_typeIII_",start_pos,common_label,sep="")
    start_pos <- start_pos + typeII_III_offset_step_size
  }


  if (fragmentLength > (readLength*2))
  {  # Generate typeIV reads

    if (typeIV_offset_step_size < 1)  # Ensure we have a step increment.
    {   typeIV_offset_step_size <- 1      }
    else if (round(typeIV_gap_size/typeIV_offset_step_size) < variations)
    { warning("Requested number of fastq are more than what is possible, downsizing number of generated entries")
      variations <- round(typeIV_gap_size/typeIV_offset_step_size)
    }

    start_pos <- circ_length - typeIV_offset
    for (i in 1:variations)
    { read_one <- c(read_one, substr(x = circRNA_Sequence, start = start_pos, stop = start_pos + readLength))
    read_two <- c(read_two, substr(x = circRNA_Sequence, start = start_pos+fragmentLength-readLength, stop = start_pos + fragmentLength))
    names(read_one)[length(read_one)] <- paste(headerID,"_typeIV_",start_pos,common_label,sep="")
    start_pos <- start_pos + typeIV_offset_step_size
    }
  }

  names(read_two) <- names(read_one)
  read_one <- Biostrings::DNAStringSet(x=read_one)
  read_two <- Biostrings::reverseComplement(Biostrings::DNAStringSet(x=read_two))
  return(list(read1 =read_one, read2 = read_two))
}



######################################################################################################
#' Grab_BS_Junc_Sequence
#'
#' @description 
#'  This function extracts genomic sequence that is likely to capture BSJ. Function does not cross
#'  validate to gene models.
#'  
#' @param SelectUniqueJunct_value : a dataframe with columns names startDonor, strandDonor, startAcceptor
#' @param GeneList : GeneList
#'
#'
##
## SelectUniqueJunct_value should be a dataframe with columns names startDonor, strandDonor, startAcceptor.
##        Currently only extracts information from the first row, but in theory could be rewritten to work on multiple entries
##
## NOTE: This was originally designed purely for backsplice junction analysis and because of this the column names are from
## STAR junction output tables. It is now also used for canonical splice junctions.
##
Junction_Sequence_from_Genome <- function(SelectUniqueJunct_Value, GeneList)
{
  toDisplay <- c("Error, No coordinate data to extract or no BSJ selected")
  #browser()
  if ((nrow(SelectUniqueJunct_Value) >= 1) && (SelectUniqueJunct_Value[1] != "ACTION REQUIRED"))
  { # Initially assume we have backsplice junciton
    Seq_length <- 125


    Donor <- list()
    Acceptor <- list()
    # Following if statements work for illumina Truseq data. Need to identify if this works for same stranded libraries
    TranscriptStrand <- SelectUniqueJunct_Value$strandDonor[1]
    Transcriptchrom <- SelectUniqueJunct_Value$chromDonor[1]   # This should be same for donor and acceptor

    if ((TranscriptStrand == '-') && (SelectUniqueJunct_Value$type[1] == 'bs'))  # Need to swap donor and acceptor for BS junctions only
    {
      tmp <- SelectUniqueJunct_Value$startDonor[1]
      SelectUniqueJunct_Value$startDonor[1] <- SelectUniqueJunct_Value$startAcceptor[1]
      SelectUniqueJunct_Value$startAcceptor[1] <- tmp
    }


    if (TranscriptStrand == '+')
    { Donor$start <- SelectUniqueJunct_Value$startDonor[1] - Seq_length -1
    Donor$end   <- SelectUniqueJunct_Value$startDonor[1]  - 1    # +1 ensures start in exon as STAR passes coordinates of intron donor sequence
    }
    if (TranscriptStrand == '-')
    {  Donor$end   <- SelectUniqueJunct_Value$startDonor[1] -1
    Donor$start <- SelectUniqueJunct_Value$startDonor[1]  - Seq_length -1
    }

    if (TranscriptStrand == '+')
    { Acceptor$start <- SelectUniqueJunct_Value$startAcceptor[1] +  1    # -1 ensures start in exon as STAR passes coordinates of intron donor sequence
    Acceptor$end   <- SelectUniqueJunct_Value$startAcceptor[1] + Seq_length +1
    }
    if (TranscriptStrand == '-')
    {  Acceptor$end   <- SelectUniqueJunct_Value$startAcceptor[1] + Seq_length + 1
    Acceptor$start <- SelectUniqueJunct_Value$startAcceptor[1] +1
    }

    # Need to test for kit type . i.e. Same strand or opposing.
    if (TranscriptStrand =="+")
      TranscriptStrand <- "-"
    else
      TranscriptStrand <- "+"

    DonorSequence <- extractGenomeSequence(Transcriptchrom, Donor$start, Donor$end, TranscriptStrand, GeneList = GeneList)
    AcceptorSequence <- extractGenomeSequence(Transcriptchrom, Acceptor$start, Acceptor$end, TranscriptStrand, GeneList = GeneList)


    if  (SelectUniqueJunct_Value$type[1] == 'c') # Canonical junction
    { toDisplay <- paste(DonorSequence, AcceptorSequence, sep=".") # Positive strand
    if (TranscriptStrand =="-")
      toDisplay <- paste(AcceptorSequence, DonorSequence, sep=".") # Negative strand
    }

    if (SelectUniqueJunct_Value$type[1] == 'bs')  # backsplice junction
    { toDisplay <- paste(DonorSequence,AcceptorSequence,sep = ".") # Positive strand  <== currently back to front
    if (TranscriptStrand =="-")
      toDisplay <- paste(AcceptorSequence, DonorSequence, sep=".") # Negative strand
    }
    toDisplay <- gsub(pattern=" ",replacement="", x=toDisplay)              # Clean up space characters
  }

  return(toDisplay)

}
VCCRI/Ularcirc documentation built on April 8, 2022, 5:17 p.m.