R/file_io.R

Defines functions SelectUniqueJunctions chimericFilters FilterChimericJuncs FilterChimeric_Ularcirc loadSTAR_chimeric chimericStats

Documented in chimericFilters chimericStats FilterChimericJuncs FilterChimeric_Ularcirc loadSTAR_chimeric SelectUniqueJunctions

##################################################################
#' chimericStats
#'
#' @description
#' Simple function that returns a list of basic stats obtained from a STAR
#' chimeric file
#'
#' @param chimericDT : Data table of chimeric junctions as provided by
#' STAR aligner
#'
#'
#' @examples
#' extdata_path <- system.file("extdata",package = "Ularcirc")
#' chimeric.file <- paste0(extdata_path,"/SRR444655_subset.Chimeric.out.junction.gz")
#' chimericDT <- Ularcirc::loadSTAR_chimeric(chimeric.file,returnColIdx = 1:14)
#' Ularcirc::chimericStats(chimericDT$data_set)
#' chimericDT$filtered <- Ularcirc::FilterChimericJuncs(chimericDT$data_set, canonicalJuncs = TRUE)
#' Ularcirc::chimericStats(chimericDT$filtered)
#'
#'
#' @seealso
#' FilterChimericJuncs
#' @export
#
# TO DO
#   Number of chimeric entries
#   Number of unique junctions
#   Frequency of most abundant junction
#   Number of backsplice vs other junctions
#
chimericStats <- function(chimericDT)
{

  total_rows <- nrow(chimericDT)
  # Number of unique junctions
  tableSummary <- table(chimericDT$BSjuncName)
  n_UniqueJunctions <- length(tableSummary)

 return(list(totalRows= total_rows, uniqueJunctionNum = n_UniqueJunctions))
}


####################################################################
#' loadSTAR_chimeric
#'
#' @description
#' Loads chimeric output file from the STAR aligner and returns a list containing
#' three items (a data table, alignment stats and command line).
#'
#' @param filename :  filename of the STAR chimeric output file.
#'             Can be gzipped
#' @param ID_index :  An index (single integer) which will be added
#' as a separate column in the returned data table. Useful when
#' collating multiple files into one large matrix like object.
#' @param returnColIdx : Numeric index of columns to return. Default 1:15
#'
#' @details  :
#'
#' Reads in a text or gzipped chimeric output file generated by the STAR aligner. Function
#' automatically detects if the last two lines contains meta-data (produced from STAR 2.7)
#' onwards.
#'
#' Returns a list of containing three items: (1) data_set (2) alignmentStats and (3) commandLine.
#'
#' The column names of data_set are defined as
#' c("chromDonor","startDonor","strandDonor", "chromAcceptor",
#' "startAcceptor","strandAcceptor","JuncType", "RepeatLength_L",
#' "RepeatLength_R",  "ReadName","FirstBase_1stSeq","CIGAR_1stSeg",
#' "FirstBase_2ndSeq","CIGAR_2ndSeg", "Multimapping")
#'
#' If ID_index is set to a value greater than 0 then an additional column called
#' "DataSet" is created.
#'
#' Columns can be subsetted by defining returnColIdx with an integer value that correspond
#' to order of column names listed above.
#'
#'
#' @export
loadSTAR_chimeric <- function(filename=NULL, ID_index = 0, returnColIdx=1:21)
{ messg <- NULL
  if (is.null(filename))
  { messg <- "No file name provided. Cannot continue"  }

  if(length(setdiff(returnColIdx, 1:21)) > 0)
  { messg <- "returnColIdx contains out of bound index. Please check and try again"  }

  if (! is.numeric(version) && ! is.numeric(ID_index))
  { messg <- "version or ID_index is not numeric. Please modify and try again" }

  if (! is.null(messg))
  {
    warning(messg)
    return(NULL)
  }

  # read in file and check has correct number of columns
  data_set <- data.table::fread(filename, sep="\t", fill=TRUE)
  total_rows <- nrow(data_set)
  columnNumber <- ncol(data_set)
  if ((columnNumber < 14) || (columnNumber > 21))
  {
    warning(paste0("Unexpected number of columns in ", filename," cannot process."))
    return(NULL)
  }
  # Check what columns can be accessed, which is defined by number of columns
  maxColRange <- 1:columnNumber


  if(length(setdiff(returnColIdx, maxColRange)) > 0)
  { messg <- paste0("Number of columns available is smaller than what was specified. ",
        "Suspect using data from old version of STAR aligner (pre 2.7.3). ",
        "Please try a smaller range in returnColIdx (perhaps 1:14 or 1:15).")
    warning(messg)
    return(NULL)
  }


  # prepare column labels
  colnameIDs <- c("chromDonor","startDonor","strandDonor", "chromAcceptor",
                "startAcceptor","strandAcceptor","JuncType", "RepeatLength_L",
                "RepeatLength_R",  "ReadName","FirstBase_1stSeq","CIGAR_1stSeg",
                "FirstBase_2ndSeq","CIGAR_2ndSeg", "Num_chim_aln",
                "max_poss_aln_score", "non_chim_aln_score", "this_chim_aln_score",
                "bestall_chim_aln_score", "PEmerged_bool", "readgrp"
                )
  numericColIDs <- colnameIDs[c(2,5,7,8,9,11,13,15:20)]
  colnameIDs <- colnameIDs[returnColIdx]


  # Check last two rows to determine if data generated from STAR 2.7 or above.
  # If so need to re-format data
  alignmentStats = NULL
  commandLine = NULL

  if (length(grep(pattern = "Nreads",x = data_set[total_rows,1])) > 0)
  { # Collect meta data from last two rows and then remove
    alignmentStats <- paste(data_set[total_rows,],collapse = " ")
    commandLine <- paste(data_set[total_rows -1,],collapse = " ")
    total_rows <- total_rows - 2
    data_set <- data_set[1:total_rows,]
  }
  # See if data has header, if so remove. This was introduced in STAR 2.7.3
  if(data_set[1,1] == "chr_donorA")
  { data_set <- data_set[-1,]

  }

  data_set <- data_set[,..returnColIdx]
  data.table::setnames(data_set,returnColIdx,colnameIDs)

  # Fix up numeric columns which are converted to characters if alignmentStats not null
  if (! is.null(alignmentStats))
  { colsToConvert<- intersect(colnameIDs, numericColIDs)
    for (i in colsToConvert)
    { data_set[[i]] <- as.numeric(data_set[[i]])  }
  }

  # General unique lookup ID
  data_set$BSjuncName <- paste(data_set$chromDonor,data_set$startDonor,data_set$chromAcceptor, data_set$startAcceptor,sep="_")
  if (ID_index > 0)
  {  data_set$DataSet <-  ID_index }	# This adds an index to identify the file
  return(list(data_set=data_set, alignmentStats=alignmentStats, commandLine=commandLine))

}


####################################################################################################
#' Wrapper function for Ularcirc shiny app which expects a list of objects to be returned
#'
#'  NEED to ensure that  unstranded boolean value is passed to this function
#' Not tested via shiny app yet.
#'
#' @param All_junctions : data.table of chimeric reads from STAR aligner
#' @param chromFilter : when TRUE (default) both chimera parts have to align to same chromosome
#' @param strandFilter : when TRUE (default) both chimera parts have to align to same strand
#' @param genomicDistance : minimum and maximum distance filters of chimeric reads on chromosome. Only is applied
#'  if ChromFilter is TRUE and StrandFilter is TRUE
#' @param canonicalJuncs : Will include any canonical junctions (default TRUE). Note STAR keeps canonical junctions that do not conform to aligner rules.
#' @param fileID : Specify a file index. Useful if planing to concatenating all data sets into a single table.
#' @param chrM_Filter : Filter out mitochondrial chimeric reads (default TRUE)
#' @param invertReads : Boolean that specifies in read strand should be inverted (default FALSE).
#' @param unstranded : Boolen for if reads are unstranded
#' @param summaryNumber : Number (Integer) of records to display in shiny app
#'
#'
#'
#'
FilterChimeric_Ularcirc <- function(All_junctions, chromFilter=TRUE, strandFilter=TRUE,
                                    genomicDistance=c(200,100000),  canonicalJuncs=TRUE,
                                    fileID= c(-1), chrM_Filter=TRUE, invertReads = FALSE,
                                    unstranded=FALSE, summaryNumber = 50)
{
  filtered_Rawdata <- FilterChimericJuncs(All_junctions, chromFilter=chromFilter,
                                          strandFilter=strandFilter,
                                          genomicDistance=genomicDistance,
                                          canonicalJuncs=canonicalJuncs,
                                          fileID= c(-1), chrM_Filter=chrM_Filter,
                                          invertReads = invertReads)

  filterlist <- list(BSjuncName=NULL, SortDir="Descending", IndexNumber=1, DisplayNumber=summaryNumber,
                     DisplayRAD_score= FALSE, Apply_FSJ_Filter=FALSE)

  SummaryData <- SelectUniqueJunctions(All_junctions, filterlist = filterlist, unstranded=unstranded)

  return(list(RawData=All_junctions,  SummaryData= SummaryData))
}



#' FilterChimericJuncs
#'
#' @description
#'	A generic function that filters STAR chimeric junction files on certain genomic criteria (eg strand, same chromosome etc).
#'	Useful filter to remove the most obvious false positives. The default filter settings are suitable for circRNA
#'  discovery in humans / mice data sets.
#'
#' @param All_junctions : data.table of chimeric reads from STAR aligner
#' @param chromFilter : when TRUE (default) both chimera parts have to align to same chromosome
#' @param strandFilter : when TRUE (default) both chimera parts have to align to same strand
#' @param genomicDistance : minimum and maximum distance filters of chimeric reads on chromosome. Only is applied
#'  if ChromFilter is TRUE and StrandFilter is TRUE
#' @param canonicalJuncs : Will include any canonical junctions (default TRUE). Note STAR keeps canonical junctions that do not conform to aligner rules.
#' @param fileID : Specify a file index. Useful if planing to concatenating all data sets into a single table.
#' @param chrM_Filter : Filter out mitochondrial chimeric reads (default TRUE)
#' @param invertReads : Boolean that specifies in read strand should be inverted (default FALSE).
#'
#'
#'
#' @seealso
#' SelectUniqueJunctions, loadSTAR_chimeric
#'
#' @examples
#' extdata_path <- system.file("extdata",package = "Ularcirc")
#' chimeric.file <- paste0(extdata_path,"/SRR444655_subset.Chimeric.out.junction.gz")
#' chimericsDT <- Ularcirc::loadSTAR_chimeric(chimeric.file,returnColIdx = 1:14)
#' chimericsDT$filtered <- Ularcirc::FilterChimericJuncs(chimericsDT$data_set, canonicalJuncs = TRUE)
#'
#' @import data.table
#'
#' @export
FilterChimericJuncs <- function(All_junctions, chromFilter=TRUE, strandFilter=TRUE,
                                genomicDistance=c(200,100000),  canonicalJuncs=TRUE,
                                fileID= c(-1), chrM_Filter=TRUE, invertReads = FALSE)
{
  # Setup colmn name ID lookups. Setting up this way to remove warning on package build.
  cn <- c(chromDonor="chromDonor", chromAcceptor="chromAcceptor",
          strandDonor="strandDonor",strandAcceptor="strandAcceptor" )

  if (fileID[1] != -1)  # Select data from files that user has selected
  {	All_junctions <- Filter_by_Data_Set(fileID, All_junctions)	}

  All_junctions <- data.table::data.table(All_junctions)
  if (chromFilter == TRUE)
  {	All_junctions = All_junctions[get(cn["chromDonor"]) == get(cn["chromAcceptor"]),]		}

  if (chrM_Filter == TRUE)
  { All_junctions = All_junctions[get(cn["chromDonor"]) != "chrM",]	}


  if (strandFilter == TRUE)
  {	All_junctions = All_junctions[get(cn["strandDonor"]) == get(cn["strandAcceptor"]),]	}

  if (invertReads == TRUE)
  {
    pos_strand_idx <- All_junctions$strandDonor == "+"
    All_junctions$strandDonor[pos_strand_idx] = "-"
    All_junctions$strandDonor[! pos_strand_idx] = "+"
    tmp <- All_junctions$startDonor[pos_strand_idx]
    All_junctions$startDonor[pos_strand_idx]  <- All_junctions$startAcceptor[pos_strand_idx]
    All_junctions$startAcceptor[pos_strand_idx] <- tmp
  }


  if ((chromFilter == TRUE) && (strandFilter == TRUE)) # Apply genomic distance filter
  {	# Classical BSjunctions are defined as:
    # (strandDonor=='-' & (startAcceptor > startDonor))  | (strandDonor=='+' & (startDonor > startAcceptor) )

    BS_junctions = All_junctions[((strandDonor == "+" ) &
                                    ((startDonor - startAcceptor) > genomicDistance[1]) & ((startDonor - startAcceptor) < genomicDistance[2])) |
                                   ((strandDonor == "-" ) &
                                      ((startAcceptor - startDonor) > genomicDistance[1]) & ((startAcceptor - startDonor) < genomicDistance[2])),]
    BS_junctions$type = "bs"

    if (canonicalJuncs == TRUE)
    {	# Merge both canonical and BS junctions together
      Canonical_junctions =  All_junctions[((strandDonor == "-" ) &
                                              ((startDonor - startAcceptor) > genomicDistance[1]) & ((startDonor - startAcceptor) < genomicDistance[2])) |
                                             ((strandDonor == "+" ) &
                                                ((startAcceptor - startDonor) > genomicDistance[1]) & ((startAcceptor - startDonor) < genomicDistance[2])),]
      Canonical_junctions$type = "c"
      All_junctions <- data.table::rbindlist(list(BS_junctions, Canonical_junctions))
    }
    if (canonicalJuncs != TRUE)
      All_junctions <- BS_junctions
  }
  else # No strand filter or chromosome filter
  {
    BS_junctions <- All_junctions
    BS_junctions <- "Any"
  }


  return(RawData=All_junctions)

}


##################################################################################################
#'chimericFilters
#'
#' @description
#' A wrapper function that prepares a list of filters that can be passed
#'
#' @param BSjuncName : A character string that represents a backsplice junction ID. Set when
#' needing to extract a specific junction. Default NULL.
#' @param sortDir : Specifies how data is sorted, either "Descending" (default) or "Ascending".
#' @param indexNumber : Filter data according to this file index
#' @param displayNumber : Number of records to display in an shiny app
#' @param displayRADscore : Boolean. If TRUE then will apply/calculate RAD score
#' @param RADcountThreshold : Integer. The minimum count threshold required to calculate RAD score.
#' i.e. A default RAD score of -1 will be applied to any BSJ with a count less than this score
#' @param applyFSJfilter : Boolean of whether to apply FSJ filter
#'
#'
chimericFilters <- function(BSjuncName=NULL, sortDir="Descending", indexNumber=1, displayNumber=10,
                            displayRADscore= FALSE, RADcountThreshold = 10, applyFSJfilter=FALSE)
{

  filterlist = list(BSjuncName, sortDir, indexNumber, displayNumber,
                    displayRADscore, RADcountThreshold,
                    applyFSJfilter)
  return(filterlist)
}

################################################################################################
#' SelectUniqueJunctions
#'
#' This is the workhorse for collated BSJ junctions from the input data. It will return
#'    selected rows of data (annotated) that will enable enhanced browsing of raw data on the fly.
#'
#'		Filter options: Junction abundance. Sort
#'
#' @description
#' Builds a summary table from chimeric data obtained from the STAR aligner. Assembles table with
#' the requested number of top entries. Populates with RAD score and FSJ score.
#'
#'
#' @param BSJ_junctions : Junction to display
#' @param filterlist : filterlist
#' @param unstranded : If TRUE will match reads from both strands.
#' @param FSJ_Junctions : Junction to display.
#' @param shinyapp : Boolean. If true used to setup control status bars in shiny app.
#'
#' @export
SelectUniqueJunctions <- function(BSJ_junctions,  filterlist = chimericFilters(),
                                  #filterlist = list(BSjuncName=NULL, SortDir="Descending",
                                  # IndexNumber=1, DisplayNumber=10, DisplayRAD_score= FALSE, Apply_FSJ_Filter=FALSE),
#                                  library_strand = "Opposing strand", FSJ_Junctions=NULL, shinyapp=FALSE)
                                  unstranded = FALSE, FSJ_Junctions=NULL, shinyapp=FALSE)
{

  if (! is.null(filterlist$BSjuncName))
  {   # Check to see if a specific junction has been requested. If so only need to return these candidates
    toReturn <- BSJ_junctions[BSJ_junctions$BSjuncName == filterlist$BSjuncName]
    return(toReturn)
  }

  filterlist$DisplayNumber <- as.numeric(filterlist$DisplayNumber)

  junc_count <- table(BSJ_junctions$BSjuncName)				# Quantify the number of the backsplice junction
  # "table is similar to following which is faster)
  # test <- BSJ_junctions[,.N, by=BSjuncName]
  # test <- test[order(-N)]   # decreasing order


  junc_number <- length(junc_count)
  if (junc_number < filterlist$DisplayNumber)
  {	filterlist$DisplayNumber <- junc_number	}


  # Making a table with following columns:
  #		"BSjuncName" (eg chr9_46241521_chr9_46242431), "type" (canonical/BS), "JuncType" (0 1 2), Count, PCR duplicate score.

  First_CIGAR <- {}
  Second_CIGAR <- {}


  # Sort table
  if (filterlist$SortDir == "Descending")
  {	junc_count <- junc_count[names(sort(junc_count, decreasing = TRUE))]	}
  else
  {	junc_count <- junc_count[names(sort(junc_count, decreasing = FALSE))]	}

  # The following loop calculates PCR duplicate frequency of backsplice junction.
  First_CIGAR <- {}
  Second_CIGAR <- {}
  UniqueJunctions <- {}


  filtersteps <- function(shinyapp=TRUE, BSjuncName, JuncType, strandDonor)
  {
    Max_Display <- filterlist$IndexNumber + filterlist$DisplayNumber

    for(i in filterlist$IndexNumber : Max_Display)
    {
      if (shinyapp)
      { shiny::incProgress(1/Max_Display, detail = paste("Updating Row ",i)) }


      # Check to make sure the loop increment has not jumped past the max numbers of entries
      if (i > length(junc_count))
        break

      # Collate all data that relate to current BSJ
      BS_Junc_ID <- names(junc_count)[i]
      OneJunctionReads <- (BSJ_junctions[BSJ_junctions$BSjuncName == BS_Junc_ID,])

      # Re-organise table
      if (length(grep(pattern = 'type', x = colnames(OneJunctionReads))))
      {    temp <- OneJunctionReads[1,.(BSjuncName, type, JuncType, strandDonor)]	  }
      else  # On very rare occations when applying NO filter for BSJ will enter this block
      {
        temp <- OneJunctionReads[1,.(BSjuncName, JuncType, strandDonor)]
        temp$type <- "unknown"
      }
      ## If data is unstranded we need to collapse alignments from both strands
      additional_counts <- 0
      if (unstranded)  # Need to append data from palindrome ID
      {  # Only keep IDs that have a larger coordinate in first position of ID
        decode_juncID <- strsplit(BS_Junc_ID  , "_")
        if (as.numeric(decode_juncID[[1]][4]) > as.numeric(decode_juncID[[1]][2]))
          next

        # Extract matching coordinate on other strand and add value
        # Some times the other strand is off by a couple of bases. Work through
        # a 10nt window to find most likely candidate
        search_pattern <- c(0,1,-1,2,-2,3,-3,4,-4,5,-5) # ranked likely positions to identify opposing strand equivalent
        for(j in seq_along(search_pattern))
        { sp <- search_pattern[j]
        matching_coord_ID <- paste(decode_juncID[[1]][1],as.numeric(decode_juncID[[1]][4])+sp,decode_juncID[[1]][1],as.numeric(decode_juncID[[1]][2])+sp,sep = "_")
        if (! is.na(junc_count[matching_coord_ID]))  # Great! Found the junctions from opposing reads
        { additional_counts <- junc_count[matching_coord_ID]
        how_similar <- junc_count[i]/additional_counts
        if ((how_similar < 0.5) || (how_similar > 1.5))
        {  next  }  # Making sure count is within range of other strand

        # Extract all junctions so that the RAD score can be calculated
        matching_coord_entries <- (BSJ_junctions[BSJ_junctions$BSjuncName == matching_coord_ID,])
        # swap type II and type III junctions around to ensure accurate RAD score is calculated
        CIGAR_temp <- matching_coord_entries$CIGAR_1stSeg
        matching_coord_entries$CIGAR_1stSeg <- matching_coord_entries$CIGAR_2ndSeg
        matching_coord_entries$CIGAR_2ndSeg <- CIGAR_temp

        if (dim(matching_coord_entries)[1])
	{ OneJunctionReads <- rbind(OneJunctionReads, matching_coord_entries) }

        break
        } # if (! is.na(junc_count[matching_coord_ID]))
        } # for(j in seq_along(search_pattern))
      } # if (unstranded)



      temp$Freq <- junc_count[i] + additional_counts

      temp$TypeII_TypeIII <- -1
      if (filterlist$DisplayRAD_score)
      {  # FAD <- Fragment_Alignment_Distribution(BSJ_table = OneJunctionReads)
        # Think I can delete the next two lines.
        temp$PCR_dup_A <- 0 #round(FAD$FirstSeg_RAD,2)
        temp$PCR_dup_B <- 0 # round(FAD$SecondSeg_RAD, 2)

        # Calculate the RAD score
        temp$TypeII_TypeIII <- RAD_score(CIGAR_1stSeg = OneJunctionReads$CIGAR_1stSeg,
                                         CIGAR_2ndSeg = OneJunctionReads$CIGAR_2ndSeg,
                                         RADcountThreshold = filterlist$RADcountThreshold
        )


      } # if (filterlist$DisplayRAD_score)

      # Cross compare to FSJ. Looking for perfect matches and annotate if FSJ exist.
      temp$FSJ_support <- 0

      if (typeof(FSJ_Junctions) != 'NULL')
      {
        BSJ_details <- unlist(strsplit(BS_Junc_ID,split = "_"))
        BSJ_idx <- c(2,4)
        if (temp$strandDonor == "-") { BSJ_idx <- c(4,2) }

        FSJ_score <- FSJ_Junctions[start==as.numeric(BSJ_details[BSJ_idx[1]])]$score
        if (length(FSJ_score) )
          temp$FSJ_support <- temp$FSJ_support + 1

        FSJ_score <- FSJ_Junctions[end==as.numeric(BSJ_details[BSJ_idx[2]])]$score
        if (length(FSJ_score) )
          temp$FSJ_support <- temp$FSJ_support+ 1
      }


      if (length(UniqueJunctions)[1] == 0)
      {	UniqueJunctions <- temp	}
      else
      {	UniqueJunctions <- try(rbind(UniqueJunctions, temp)  )		}
    }  # for(i in filterlist$IndexNumber : Max_Display)
    return(UniqueJunctions)
  } # filtersteps <- function(shinyapp=TRUE)

  # Setup code for either shiny or other progress bar status
  if (shinyapp)
  { # Display shiny progress bars
    shiny::withProgress(message="Calculating read alignment distributions (RAD)", value=0, {
      UniqueJunctions <- filtersteps(shinyapp = shinyapp, BSjuncName=BSjuncName,
                                     JuncType=JuncType, strandDonor=strandDonor)
    }) # withProgress(message="Calculating RAD scores", value=0, {
  }
  else
  { # No progress bars
    UniqueJunctions <- filtersteps(shinyapp = shinyapp)
  }


  return(UniqueJunctions)
}

####################################################################################
#' RAD_score
#'
#' @description
#' Theoretically the position of backsplice junctions should be distributed randomly
#' across a amplicon. This function calculates the read alignment distribution (RAD)
#' of backsplice junctions between forward and reverse read pairs. The RAD score
#' is calculated from CIGAR strings which can be used to identify type II and type III
#' alignments.
#'
#'
#' @param CIGAR_1stSeg : CIGAR string of the first segment.
#' @param CIGAR_2ndSeg : CIGAR string of the second segment
#' @param RADcountThreshold : Minimum count threshold required to apply RAD score. If there
#' are less than this many entries in CIGAR list then -1 is returned.
#' @param digits : rounding of the RAD score to this many digits (default 2)
#'
#' @export
RAD_score <- function(CIGAR_1stSeg = NULL, CIGAR_2ndSeg = NULL, RADcountThreshold = 10, digits = 2)
{
  if ((is.null(CIGAR_1stSeg)) || (is.null(CIGAR_2ndSeg)))
  { warning("No CIGAR data provided for either 1st or 2nd segment. Returning NULL")
    return(NULL)
  }
  CIGARlength <- length(CIGAR_1stSeg)
  if (CIGARlength != length(CIGAR_2ndSeg))
  { warning("Unequal amounts of 1st and 2nd CIGAR segments passed. Returning NULL")
    return(NULL)
  }
  # If number of data points below threshold. Return -1
  if (CIGARlength < RADcountThreshold)
  { return(-1)  }


  # Calculate RAD score. This requires identifying type II and III alignments
  # Type II and III alignments are identified by length of match within CIGAR string
  FirstSeg_Match <- lapply(gsubfn::strapplyc(as.character(CIGAR_1stSeg),
                                             pattern="([-0-9]+)M"),
                           FUN = function(x) {sum(as.numeric(x))})
  SecondSeg_Match <- lapply(gsubfn::strapplyc(as.character(CIGAR_2ndSeg),
                                              pattern="([-0-9]+)M"),
                            FUN = function(x) {sum(as.numeric(x))})
  # Extract padding lengths
  FirstSeg_Pad <- lapply(gsubfn::strapplyc(as.character(CIGAR_1stSeg),
                                           pattern="(-[0-9]+)p"),
                         FUN = function(x) {sum(as.numeric(x))})
  SecondSeg_Pad <- lapply(gsubfn::strapplyc(as.character(CIGAR_2ndSeg),
                                            pattern="(-[0-9]+)p"),
                          FUN = function(x) {sum(as.numeric(x))})

  # Correct match length with padding length
  FirstSeg_Match <- unlist(FirstSeg_Match) - unlist(FirstSeg_Pad)
  SecondSeg_Match <- unlist(SecondSeg_Match) - unlist(SecondSeg_Pad)

  TypeIII_from_CIGAR <- length(which(FirstSeg_Match > SecondSeg_Match))
  TypeII_from_CIGAR <- length(which(FirstSeg_Match < SecondSeg_Match))

  return(round(TypeII_from_CIGAR/ (TypeII_from_CIGAR + TypeIII_from_CIGAR),digits))

}
VCCRI/Ularcirc documentation built on April 8, 2022, 5:17 p.m.