R/hiReadsProcessor.R

#' Functions to process LM-PCR reads from 454/Illumina data
#'
#' hiReadsProcessor contains a set of functions that allow users to process 
#' LM-PCR products sequenced on any platform. Given an Excel/txt file 
#' containing demultiplexing parameters and sample metadata, the functions 
#' automate trimming of adaptors and identification of the genomic product.
#' Genomic products are further processed for QC and abundance quantification.
#'
#' @import BiocParallel Biostrings GenomicAlignments hiAnnotator 
#' sonicLength BiocGenerics GenomicRanges
#' @importFrom rSFFreader sread readSff
#' @importFrom readxl read_excel
#' @importFrom dplyr count arrange summarise rename mutate select ungroup
#' group_by bind_rows left_join desc n %>% contains
#' @importFrom methods as is
#' @importFrom stats aggregate ave filter na.omit runif
#' @importFrom utils count.fields read.delim write.table
#' @docType package
#' @name hiReadsProcessor
#' @author Nirav V Malani
NULL
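
## A sketch of the typical end-to-end workflow built from the functions in
## this file (the run folder, file pattern, and extra arguments below are
## illustrative assumptions, not package defaults):
if(FALSE) {
  seqProps <- read.SeqFolder("Run_1/", seqfilePattern = ".+fna.gz$")
  seqProps <- findBarcodes(seqProps, sector = "all", showStats = TRUE)
  seqProps <- findPrimers(seqProps, showStats = TRUE)
  seqProps <- findLTRs(seqProps, showStats = TRUE)
  seqProps <- findLinkers(seqProps, showStats = TRUE)
  ## findIntegrations() generally also needs BLAT genome indices; see its docs
  seqProps <- findIntegrations(seqProps)
}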

#' Sample Integration Sites Sequencing Data
#' 
#' This is a processed data object containing raw sequences and respective 
#' alignments to UCSC freeze hg18 from 112 integration site samples. The object
#' is of SimpleList class and follows a certain structural hierarchy explained
#' by the Introductory vignette.
#' 
#' @docType data
#' @keywords datasets
#' @format A SimpleList object 
#' @name seqProps
NULL

#' PSL file output
#' 
#' Sample BLAT PSL file output from samples included Integration Sites 
#' Sequencing Data \code{\link{seqProps}}
#' 
#' @docType data
#' @keywords datasets
#' @format a data frame of 1000 rows and 21 columns
#' @name psl
NULL

#' Read contents of a sequencing folder and make a SimpleList object
#'
#' Given a sequencing folder path, sample information file path, and sequence 
#' file extension pattern, the function returns a list of variables required to 
#' process the data. The function also calls \code{\link{read.sampleInfo}} 
#' which reads in sample processing metadata and formats it if needed.
#'
#' @param sequencingFolderPath full or relative path to the sequencing folder
#' @param sampleInfoFilePath full or relative path to the sample information 
#' file, which holds sample to quadrant/lane associations along with other 
#' metadata required to trim or process sequences. Defaults to NULL, in which 
#' case the function looks for an xls/xlsx or tab-delimited txt file in the 
#' sequencing folder whose name resembles 'sampleinfo' and presents you with 
#' a choice of files to select from.
#' @param seqfilePattern regex/string to describe sequence file endings. 
#' See examples. Default is NULL.
#'
#' @return a SimpleList list which is used by other functions to process and 
#' decode the data.
#'
#' @note 
#' \itemize{
#'   \item One must make sure that each sequencing file has sector name/number 
#'   prefixed at the beginning, else \code{\link{findBarcodes}} will fail trying 
#'   to find the filename.
#'   \item For paired end Illumina runs, make sure the filenames include R1, R2, 
#'   and I1 somewhere in the name denoting pair1, pair2, and index/barcode 
#'   reads, respectively. 
#' }
#'
#' @seealso \code{\link{read.sampleInfo}}, \code{\link{findBarcodes}}, 
#' \code{\link{splitByBarcode}}
#'
#' @export
#'
#' @examples
#' runData <- system.file("extdata/FLX_sample_run/", 
#' package = "hiReadsProcessor")
#' read.SeqFolder(runData, seqfilePattern=".+fna.gz$")
#' \dontrun{
#' read.SeqFolder(".", seqfilePattern = "\\.TCA.454Reads.fna$")
#' read.SeqFolder(".", seqfilePattern = ".+fastq$")
#' read.SeqFolder(".", seqfilePattern = ".+sff$")
#' }
read.SeqFolder <- function(sequencingFolderPath = NULL, 
                           sampleInfoFilePath = NULL,
                           seqfilePattern = NULL) {
  if(is.null(sequencingFolderPath)) {
    stop("No Sequencing Folder Path provided.")
  }
  
  ## get the sequencingFolderPath right!
  sequencingFolderPath <- normalizePath(sequencingFolderPath, mustWork = TRUE)
  
  seqFilePaths <- list.files(path = sequencingFolderPath, recursive = TRUE,
                             full.names = TRUE, pattern = seqfilePattern)
  if(length(seqFilePaths) == 0) {
    stop("No files found in the folder ", sequencingFolderPath,
         "matching following pattern: ", seqfilePattern)
  }
  
  ## read the sample info file
  if(is.null(sampleInfoFilePath)) {
    possibleFiles <- list.files(path = sequencingFolderPath,
                                recursive = TRUE, full.names = TRUE,
                                pattern = ".*sampleinfo.+", ignore.case = TRUE)
    if (length(possibleFiles) == 0) {
      stop("No sample information file found in folder: ", sequencingFolderPath)
    } else {
      if(interactive() & length(possibleFiles)>1) {
        message("Please choose a sample information file to read the meta data 
                from:\n",
                paste(1:length(possibleFiles), possibleFiles,
                      sep = ": ", collapse = "\n"))
        choice <- scan(what = integer(0), n = 1, quiet = TRUE, 
                       multi.line = FALSE)
      } else {
        choice <- 1            
      }
      message("Choosing ", possibleFiles[choice], 
              " as sample information file.")
    }
    sampleInfoFilePath <- possibleFiles[choice]
  }
  sampleInfo <- read.sampleInfo(sampleInfoFilePath)
  
  ## do a quick test of filenames if any samples are from paired end illumina
  if(any(sampleInfo$pairedend)) {    
    sectors <- unique(sampleInfo$sector[sampleInfo$pairedend])
    for(sector in sectors) {
      vals <- grep(gsub("I1|R1|R2", "", sector), seqFilePaths, value = TRUE)
      if(length(vals)!=3) {
        stop("Sector ",sector," is missing one of the files: R1, R2, or I1.")
      }
    }
  }
  
  if(length(sampleInfo) != length(unique(gsub("I1|R1|R2", "", seqFilePaths)))) {
    warning("Number of sectors (", length(sampleInfo),
            ") in sample information file doesn't match # of sector files (", 
            length(unique(gsub("I1|R1|R2", "", seqFilePaths))), 
            ") found in the folder.")
  }
  
  return(SimpleList("sequencingFolderPath" = sequencingFolderPath,
                    "seqFilePaths" = seqFilePaths,
                    "seqfilePattern" = seqfilePattern,
                    "sampleInfoFilePath" = sampleInfoFilePath,
                    "sectors" = sampleInfo, "callHistory" = match.call()))
}
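
## The returned SimpleList mirrors the slots assembled above; a quick sketch
## of inspecting it, using the run bundled with the package:
if(FALSE) {
  runData <- system.file("extdata/FLX_sample_run/",
                         package = "hiReadsProcessor")
  seqProps <- read.SeqFolder(runData, seqfilePattern = ".+fna.gz$")
  names(seqProps)          # sequencingFolderPath, seqFilePaths, ..., callHistory
  names(seqProps$sectors)  # one element per sector/quadrant/lane
}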

#' Read a sample information file and format appropriate metadata.
#'
#' Given a sample information file, the function checks if it includes required 
#' information to process samples present on each sector/quadrant/region/lane. 
#' The function also adds other columns required for processing with default 
#' values if not already defined ahead of time.
#'
#' @param sampleInfoPath full or relative path to the sample information file, 
#' which holds samples to quadrant/lane associations along with other metadata 
#' required to trim sequences or process it. 
#' @param splitBySector split the data frame into a list by sector column. 
#' Default is TRUE.
#'
#' @details
#' \itemize{
#'  \item Required Column Description:
#'    \itemize{
#'  	\item sector => region/quadrant/lane of the sequencing plate the sample 
#'    comes from. If files have been split by sample a priori, then the filename
#'    associated with each sample, without the extension. If this is a 
#'    filename, then be sure to enable the 'alreadyDecoded' parameter in 
#'    \code{\link{findBarcodes}}, since the contents of this column are pasted 
#'    together with the 'seqfilePattern' parameter in 
#'    \code{\link{read.SeqFolder}} to find the appropriate file 
#'    needed. For paired end data, this is basename of the FASTA/Q file holding 
#'    the sample data from the LTR side. For example, files such as 
#'    Lib3_L001_R2_001.fastq.gz or Lib3_L001_R2_001.fastq would be 
#'    Lib3_L001_R2_001, and consequently Lib3_L001_R1_001 would be used as the 
#'    second pair!
#'  	\item barcode => unique 4-12bp DNA sequence which identifies the sample. 
#'    If providing filename as sector, then leave this blank since it is assumed 
#'    that the data is already demultiplexed.
#'  	\item primerltrsequence => DNA sequence of the viral LTR primer 
#'    with/without the viral LTR sequence following the primer landing site. 
#'    If already trimmed, then mark this as SKIP.
#'  	\item sampleName => Name of the sample associated with the barcode
#'  	\item sampleDescription => Detailed description of the sample
#'  	\item gender => sex of the sample: male or female or NA
#'  	\item species => species of the sample: homo sapien, mus musculus, etc.
#'  	\item freeze => UCSC freeze to which the sample should be aligned.
#'  	\item linkerSequence => DNA sequence of the linker adaptor following the 
#'    genomic sequence. If already trimmed, then mark this as SKIP.
#'  	\item restrictionEnzyme => Restriction enzyme used for digestion and 
#'    sample recovery. Can also be one of: Fragmentase or Sonication!
#' 		}
#'  \item Metadata Parameter Column Description:
#'   \itemize{
#'  	\item ltrBitSequence => DNA sequence of the viral LTR following the 
#'    primer landing site. Default is the last 7 bp of the primerltrsequence.
#'  	\item ltrBitIdentity => percent of LTR bit sequence to match during the 
#'    alignment. Default is 1.
#'  	\item primerLTRidentity => percent of primer to match during the 
#'    alignment. Default is 0.85.
#'  	\item linkerIdentity => percent of linker sequence to match during the 
#'    alignment. Default is 0.55. Only applies to non-primerID/random tag based 
#'    linker search.
#'  	\item primerIdInLinker => whether the linker adaptor used has 
#'    primerID/random tag in it? Default is FALSE.
#'  	\item primerIdInLinkerIdentity1 => percent of sequence to match before 
#'    the random tag. Default is 0.75. Only applies to primerID/random tag 
#'    based linker search and when primeridinlinker is TRUE.
#'  	\item primerIdInLinkerIdentity2 => percent of sequence to match after the 
#'    random tag. Default is 0.50. Only applies to primerID/random tag based 
#'    linker search and when primeridinlinker is TRUE.
#'  	\item celltype => celltype information associated with the sample
#'  	\item user => name of the user who prepared or processed the sample
#'  	\item pairedEnd => is the data paired end? Default is FALSE.
#'    \item vectorFile => fasta file containing the vector sequence
#' 		}
#'  \item Processing Parameter Column Description:
#'   \itemize{
#'  	\item startWithin => upper bound limit of where the alignment should 
#'    start within the query. Default is 3.
#'  	\item alignRatioThreshold => cutoff for (alignment span/read length). 
#'    Default is 0.7.
#'  	\item genomicPercentIdentity => cutoff for (1-(misMatches/matches)). 
#'    Default is 0.98.
#'  	\item clusterSitesWithin => cluster integration sites within a defined 
#'    window size based on frequency which corrects for any sequencing errors. 
#'    Default is 5.
#'  	\item keepMultiHits => whether to keep sequences/reads that return 
#'    multiple best hits, aka ambiguous locations. Default is TRUE.
#'  	\item processingDate => the date of processing 
#' 		}
#' }
#'
#' @return if splitBySector=TRUE, then an object of SimpleList named by 
#' quadrant/lane information defined in sampleInfo file, else a data frame.
#'
#' @seealso \code{\link{read.SeqFolder}}, \code{\link{findBarcodes}}, 
#' \code{\link{splitByBarcode}}
#'
#' @export
#'
#' @examples  
#' runData <- system.file(file.path("extdata", "FLX_sample_run"), 
#' package = "hiReadsProcessor")
#' read.sampleInfo(file.path(runData, "sampleInfo.xlsx"))
read.sampleInfo <- function(sampleInfoPath = NULL, splitBySector = TRUE) {
  ## read file and make sampleInfo object with sample to metadata associations
  if(is.null(sampleInfoPath)) {
    stop("No sample information file path provided.")
  }
  
  sampleInfoPath <- normalizePath(sampleInfoPath, mustWork = TRUE)
  
  requiredCols <- c('sector', 'barcode', 'primerltrsequence', 'samplename',
                    'sampledescription', 'gender', 'species', 'freeze',
                    'linkersequence', 'restrictionenzyme')
  
  metaDataCols <- c('ltrbitsequence' = '', 'ltrbitidentity' = 1,
                    'primerltridentity' = .85, 'linkeridentity' = .55,
                    'primeridinlinker' = FALSE, 
                    'primeridinlinkeridentity1' = .75,
                    'primeridinlinkeridentity2' = .50, 'celltype' = '',
                    'user' = Sys.getenv("USER"), 'startwithin' = 3,
                    'alignratiothreshold' = .7, 'clustersiteswithin' = 5,
                    'keepmultihits' = TRUE , 'genomicpercentidentity' = .98,
                    'processingdate' = format(Sys.time(), "%Y-%m-%d"),
                    'pairedend' = FALSE, 'vectorFile' = '')
  
  if(grepl('.xls.?$', sampleInfoPath)) {
    sampleInfo <- unique(readxl::read_excel(sampleInfoPath))
  } else {
    sampleInfo <- unique(read.delim(sampleInfoPath, stringsAsFactors = FALSE))
  }
  names(sampleInfo) <- tolower(gsub("[[:punct:]]|[[:space:]]", "", 
                                    names(sampleInfo)))
  
  # check for required columns
  ColsNotThere <- !requiredCols %in% names(sampleInfo)
  if (any(ColsNotThere)) {
    absentCols <- requiredCols[ColsNotThere]
    stop("Following required column(s) is absent from the Sample Info file: ",
         paste(absentCols, sep = "", collapse = ", ")
    )
  }
  
  # add missing meta data columns
  metaColsNotThere <- !names(metaDataCols) %in% names(sampleInfo)
  if (any(metaColsNotThere)) {
    sampleInfo <- cbind(sampleInfo,
                        as.data.frame(t(metaDataCols[metaColsNotThere]),
                                      stringsAsFactors = FALSE))
  }
  
  # do some formatting to avoid later hassles!
  for(column in c('sector', 'barcode', 'primerltrsequence', 'ltrbitsequence', 
                  'samplename', 'linkersequence', 'restrictionenzyme')) {
    sampleInfo[,column] <- gsub("[[:space:]]", "", sampleInfo[,column])
    if(column %in% c('barcode', 'primerltrsequence', 'ltrbitsequence', 
                     'linkersequence', 'restrictionenzyme')) {
      sampleInfo[,column] <- toupper(sampleInfo[,column])
    }
  }
  
  for(column in c('pairedend', 'keepmultihits', 'primeridinlinker')) {
    sampleInfo[,column] <- as.logical(sampleInfo[,column])
  }
  
  # confirm ltr bit is correct
  ltrbitTest <- sampleInfo$primerltrsequence == "SKIP"
  if(any(ltrbitTest)) { 
    ## add SKIP to ltrbit as well if primerltrsequence has been trimmed
    tofix <- which(ltrbitTest)
    message("adding SKIP to ltrbitsequence to ", length(tofix),
            " sample since primerltrsequence has been trimmed.")
    sampleInfo$ltrbitsequence[tofix] <- "SKIP"
  }
  
  ltrbitTest <- with(sampleInfo, nchar(ltrbitsequence) == 0 | 
                       ltrbitsequence == "")
  if(any(ltrbitTest)) {
    tofix <- which(ltrbitTest)
    if(interactive()) {
      message("LTR bit not found for ", length(tofix), " samples.
              Use last 7 bases of the LTR primer as the LTR bit? (y/n)")
      choice <- scan(what = character(0), n = 1, quiet = TRUE, 
                     multi.line = FALSE)
    } else {
      message("LTR bit not found for ", length(tofix),
              " samples. Using last 7 bases of the LTR primer as the LTR bit.")
      choice <- "y"
    }
    
    if(tolower(choice) == "y") {
      sampleInfo$ltrbitsequence <- 
        substr(sampleInfo$primerltrsequence,
               nchar(sampleInfo$primerltrsequence) - 6,
               nchar(sampleInfo$primerltrsequence))
      sampleInfo$primerltrsequence <- 
        substr(sampleInfo$primerltrsequence, 1,
               nchar(sampleInfo$primerltrsequence) - 7)
    } else {
      warning("No LTR bit sequence found for following samples: ",
              paste(sampleInfo$samplename[tofix], sep = "", collapse = ", "),
              immediate. = TRUE)
    }        
  }
  
  # check if samplenames are up to the expectations
  samplenametest <- with(sampleInfo, nchar(samplename) == 0 | samplename == "")
  if(any(samplenametest)) {
    stop("No sample names found in following rows of the sample information file ",
         sampleInfoPath, " : ", paste(which(samplenametest),
                                      sep = "", collapse = ", "))
  }
  
  # check for sectors and their usage
  sectortest <- with(sampleInfo, nchar(sector) == 0 | sector == "" | 
                       is.na(sector))
  if(any(sectortest)) {
    tofix <- which(sectortest)
    
    if(interactive()) {
      message("Sector information not found for ", length(tofix),
              " samples. Which sector are they from? (1,2,4,etc)")
      choice <- scan(what = character(0), quiet = TRUE, multi.line = FALSE)
    } else {
      message("Sector information not found for ", length(tofix),
              " samples. Assuming they are from sector 1.")
      choice <- "1"
    }
    
    if(length(choice) > 0) {
      sampleInfo$sector[tofix] <- unlist(strsplit(choice, ","))
    } else {
      stop("No Sector information found for following samples: ",
           paste(sampleInfo$samplename[tofix], sep = "", collapse = ", "))
    }
  }
  
  ## excel sometimes converts integers to doubles...
  ## make sure to remove the trailing 0
  sampleInfo$sector <- gsub("\\.0$", "", as.character(sampleInfo$sector))
  
  sampleSectorTest <- table(paste(sampleInfo$samplename, sampleInfo$sector))
  if(any(sampleSectorTest > 1)) {
    stop("Duplicate sample names found on same quadrant in the ",
         "sample information file ", sampleInfoPath, " : ", 
         paste(names(sampleSectorTest)[sampleSectorTest > 1], sep = "", 
               collapse = ", "))
  }
  
  # prepare the sample info object!
  if(splitBySector) {            
    sampleInfo <- SimpleList(split(sampleInfo, sampleInfo$sector))
    for(sector in 1:length(sampleInfo)) { 
      sampleData <- SimpleList(split(sampleInfo[[sector]], 
                                     sampleInfo[[sector]]$samplename))
      for(sample.i in 1:length(sampleData)) { 
        sampleData[[sample.i]] <- SimpleList(as.list(sampleData[[sample.i]])) 
      }
      sampleInfo[[sector]] <- SimpleList("samples" = sampleData) 
    }
  }
  return(sampleInfo)
}
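
## A minimal sketch of a sample information file containing only the required
## columns (all values below are illustrative, not real primers/linkers):
if(FALSE) {
  info <- data.frame(
    sector = 1, barcode = "ACATCCAT",
    primerltrsequence = "GAAAATCTCTAGCAGTGGC", samplename = "Sample1",
    sampledescription = "test sample", gender = "male",
    species = "homo sapien", freeze = "hg18",
    linkersequence = "GTCCCTTAAGCGGAGCC", restrictionenzyme = "MseI",
    stringsAsFactors = FALSE
  )
  tmpFile <- file.path(tempdir(), "sampleInfo.txt")
  write.table(info, tmpFile, sep = "\t", row.names = FALSE, quote = FALSE)
  read.sampleInfo(tmpFile)  # metadata/processing columns get default values
}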

#' Removes duplicate sequences from DNAStringSet object.
#'
#' Given a DNAStringSet object, the function dereplicates reads and 
#' adds counts=X to the definition line to indicate replication. 
#'
#' @param dnaSet DNAStringSet object to dereplicate. 
#'
#' @return DNAStringSet object with names describing frequency of repeat.
#'
#' @seealso \code{\link{replicateReads}}, \code{\link{removeReadsWithNs}}, 
#' \code{\link{findBarcodes}}, \code{\link{splitByBarcode}}
#'
#' @export
#'
#' @examples 
#' dnaSet <- c("CCTGAATCCTGGCAATGTCATCATC", "ATCCTGGCAATGTCATCATCAATGG", 
#' "ATCAGTTGTCAACGGCTAATACGCG", "ATCAATGGCGATTGCCGCGTCTGCA", 
#' "CCGCGTCTGCAATGTGAGGGCCTAA", "GAAGGATGCCAGTTGAAGTTCACAC", 
#' "CCTGAATCCTGGCAATGTCATCATC", "ATCCTGGCAATGTCATCATCAATGG", 
#' "ATCAGTTGTCAACGGCTAATACGCG", "ATCAATGGCGATTGCCGCGTCTGCA", 
#' "CCGCGTCTGCAATGTGAGGGCCTAA", "GAAGGATGCCAGTTGAAGTTCACAC") 
#' dereplicateReads(dnaSet)
dereplicateReads <- function(dnaSet) {
  if(!is(dnaSet,"DNAStringSet")) {
    dnaSet <- DNAStringSet(dnaSet)
  }
  
  if(is.null(names(dnaSet))) {
    message("No names attribute found in dnaSet object...", 
            "using artifically generated names")
    names(dnaSet) <- paste("read", 1:length(dnaSet), sep="-")
  }
  
  dnaSet <- dnaSet[order(dnaSet)]
  counts <- BiocGenerics::table(dnaSet)
  dnaSet <- unique(dnaSet)
  rows <- match(dnaSet, names(counts))
  names(dnaSet) <- paste0(names(dnaSet), 
                          "counts=", as.integer(counts[rows]))
  return(dnaSet)
}

#' Replicate sequences from DNAStringSet object using counts identifier or vector
#'
#' Given a DNAStringSet object, the function replicates reads using counts=X 
#' marker at the end of definition line. 
#'
#' @param dnaSet DNAStringSet object to replicate. 
#' @param counts an integer or a numeric vector of length length(dnaSet) 
#' indicating how many times to repeat each sequence. Default is NULL, 
#' in which it uses counts=X notation from the definition line to 
#' replicate reads.
#'
#' @return DNAStringSet object.
#'
#' @seealso \code{\link{dereplicateReads}}, \code{\link{removeReadsWithNs}},
#' \code{\link{findBarcodes}}, \code{\link{splitByBarcode}}
#'
#' @export
#'
#' @examples  
#' dnaSet <- c("CCTGAATCCTGGCAATGTCATCATC", "ATCCTGGCAATGTCATCATCAATGG", 
#' "ATCAGTTGTCAACGGCTAATACGCG", "ATCAATGGCGATTGCCGCGTCTGCA", 
#' "CCGCGTCTGCAATGTGAGGGCCTAA", "GAAGGATGCCAGTTGAAGTTCACAC", 
#' "CCTGAATCCTGGCAATGTCATCATC", "ATCCTGGCAATGTCATCATCAATGG", 
#' "ATCAGTTGTCAACGGCTAATACGCG", "ATCAATGGCGATTGCCGCGTCTGCA", 
#' "CCGCGTCTGCAATGTGAGGGCCTAA", "GAAGGATGCCAGTTGAAGTTCACAC") 
#' dnaSet <- dereplicateReads(dnaSet)
#' replicateReads(dnaSet)
replicateReads <- function(dnaSet, counts = NULL) {
  
  stopifnot(is(dnaSet,"DNAStringSet"))
  if(is.null(counts)) {
    if(is.null(names(dnaSet))) {
      stop("No names attribute found in dnaSet object")
    }
    counts <- as.numeric(sub(".+counts=(\\d+)", "\\1", names(dnaSet)))
    if(all(is.na(counts))) {
      warning("No counts=\\d+ marker found at the end of definition line or ",
              "names attribute in dnaSet object. Defaulting to count=1")
      counts <- 1
    }
  }
  
  if(length(counts) == 1) {
    counts <- rep(counts, length(dnaSet))
  }
  
  ids <- unlist(sapply(counts, seq_len))
  deflines <- paste0(rep(sub("(.+)counts=.+", "\\1",
                             names(dnaSet)), times = counts), "_", ids)
  dnaSet <- rep(dnaSet, times = counts)
  names(dnaSet) <- deflines
  return(dnaSet)
}
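
## dereplicateReads() and replicateReads() are inverses up to read names; a
## small sketch of the round trip via the counts=X marker:
if(FALSE) {
  reads <- DNAStringSet(rep(c("CCTGAATCCTGGCAATGTCATCATC",
                              "ATCAGTTGTCAACGGCTAATACGCG"), times = c(3, 2)))
  derep <- dereplicateReads(reads)  # 2 unique sequences: counts=3 and counts=2
  rerep <- replicateReads(derep)    # back to 5 reads
  length(rerep) == length(reads)    # TRUE
}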

#' Remove sequences with ambiguous nucleotides.
#'
#' Given a DNAStringSet object, the function removes any read whose number of 
#' consecutive or total Ns exceeds the maxNs threshold.
#'
#' @param dnaSet DNAStringSet object to evaluate. 
#' @param maxNs integer value denoting the threshold of maximum allowed Ns. 
#' Default is 5.
#' @param consecutive boolean flag denoting whether the Ns counted should be 
#' consecutive or total. Default is TRUE.
#'
#' @return DNAStringSet object.
#'
#' @seealso \code{\link{dereplicateReads}}, \code{\link{replicateReads}},
#' \code{\link{findBarcodes}}, \code{\link{splitByBarcode}}
#'
#' @export
#'
#' @examples  
#' dnaSet <- c("CCTGAATCCTNNCAATGTCATCATC", "ATCCTGGCNATGTCATCATCAATGG", 
#' "ATCAGTTGTCAACGGCTAATACGCG", "ATCAATGGCGATTGCCGCGTCTGCA", 
#' "CCGNNTCTGCAATGTGNGGNCCTAN", "GAAGNNNNNNGTTGAAGTTCACAC") 
#' removeReadsWithNs(dnaSet)
#' removeReadsWithNs(dnaSet, maxNs = 4, consecutive = FALSE)
removeReadsWithNs <- function(dnaSet, maxNs = 5, consecutive = TRUE) {
  if(!is(dnaSet,"DNAStringSet")) {
    dnaSet <- DNAStringSet(dnaSet)
  }
  if(consecutive) {
    good.row <- !grepl(paste(rep("N", maxNs + 1), collapse = ""), dnaSet, 
                       fixed = TRUE)
  } else {
    res <- alphabetFrequency(dnaSet)
    good.row <- res[,"N"] <= maxNs
  }
  return(dnaSet[good.row])
}

#' Breaks an object into chunks of N size.
#'
#' Given a linear/vector-like object, the function breaks it into equal-sized chunks of roughly chunkSize elements. This is a helper function used by the functions in the 'See Also' section, where each chunk is sent to a parallel node for processing.
#'
#' @param x a linear object.
#' @param chunkSize number of rows to use per chunk of query. 
#' Defaults to length(x)/detectCores() or length(x)/bpworkers() 
#' depending on parallel backend registered. 
#'
#' @return a list of object split into chunks.
#'
#' @seealso \code{\link{primerIDAlignSeqs}}, \code{\link{vpairwiseAlignSeqs}}, 
#' \code{\link{pairwiseAlignSeqs}}
#'
#' @export
#'
#' @examples
#' x <- c("GAGGCTGTCACCGACAAGGTTCTGA", "AATAGCGTGGTGACAGCCCACATGC", 
#'        "GGTCTTCTAGGGAACCTACGCCACA", "TTTCCGGCGGCAGTCAGAGCCAAAG", 
#'        "TCCTGTCAACTCGTAGATCCAATCA", "ATGGTCACCTACACACAACGGCAAT", 
#'        "GTCAGGACACCTAATCACAAACGGA", "AGACGCAGGTTCAGGCTGGACTAGA", 
#'        "ATCGTTTCCGGAATTCGTGCACTGG", "CAATGCGGGCACACGCTCTTACAGT")
#' chunkize(DNAStringSet(x), 5)
chunkize <- function(x, chunkSize = NULL) {
  chunks <- breakInChunks(length(x), 
                          ifelse(!is.null(chunkSize),
                                 length(x)/chunkSize, 
                                 ifelse(!is.null(bpworkers()), 
                                        length(x)/bpworkers(), 
                                        length(x)/detectCores())))
  mapply(function(z, y) x[z:y], start(chunks), end(chunks), 
         SIMPLIFY = FALSE, USE.NAMES = FALSE)
}
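
## chunkize() feeds the chunk-and-apply pattern used by the aligners in this
## file; a minimal sketch with BiocParallel (the backend chosen here is an
## assumption, any registered backend works):
if(FALSE) {
  library(BiocParallel)
  register(MulticoreParam(workers = 2))
  x <- DNAStringSet(rep("GAGGCTGTCACCGACAAGGTTCTGA", 100))
  chunks <- chunkize(x, chunkSize = 25)       # 4 chunks of 25 reads
  res <- bplapply(chunks, alphabetFrequency)  # one chunk per worker
  res <- do.call(rbind, res)                  # reassemble 100 rows
}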

#' Split DNAStringSet object using first X number of bases defined by a vector.
#'
#' Given a character vector of barcode/MID to sample associations and a DNAStringSet object, the function splits/demultiplexes the DNAStringSet object by its first few bases, as dictated by the length of the barcodes/MIDs supplied. This is an accessory function used by \code{\link{findBarcodes}}.
#'
#' @param barcodesSample a character vector of barcodes to sample name associations. 
#' Ex: c("ACATCCAT"="Sample1", "GAATGGAT"="Sample2",...)
#' @param dnaSet DNAStringSet object to evaluate. 
#' @param trimFrom integer value serving as the start point to trim the sequences 
#' from. This is calculated internally as barcode length + 1. Default is NULL.
#' @param showStats boolean flag denoting whether to show decoding statistics 
#' per sample & barcode. Default is FALSE.
#' @param returnUnmatched boolean flag denoting whether to return unmatched reads. 
#' Default is FALSE.
#'
#' @return DNAStringSet object split by sample name found in barcodesSample.
#'
#' @seealso \code{\link{findBarcodes}}, \code{\link{dereplicateReads}},
#' \code{\link{replicateReads}}
#'
#' @export
#'
#' @examples 
#' dnaSet <- DNAStringSet(c("read1" = "ACATCCATAGAGCTACGACGACATCGACATA",
#' "read2"="GAATGGATGACGACTACAGCACGACGAGCAGCTACT",
#' "read3"="GAATGGATGCGCTAAGAAGAGA", "read4"="ACATCCATTCTACACATCT"))
#' splitByBarcode(c("ACATCCAT" = "Sample1", "GAATGGAT" = "Sample2"), dnaSet,
#' showStats=TRUE)
splitByBarcode <- function(barcodesSample, dnaSet, trimFrom = NULL,
                           showStats = FALSE, returnUnmatched = FALSE) {
  if(is.null(barcodesSample) | length(barcodesSample)==0) {
    stop("No barcodes to samples association vector provided in parameter ",
         "barcodesSample.")
  }
  
  if(is.null(dnaSet) | length(dnaSet)==0) {
    stop("No sequences provided in parameter dnaSet.")
  }
  
  if(is.null(names(dnaSet))) {
    stop("No names attribute found in dnaSet object")
  }
  
  message("Using following schema for barcode to sample associations")
  print(as.data.frame(barcodesSample))
  
  ## subset barcode string from the rest of sequence ##
  barcodelen <- unique(nchar(names(barcodesSample)))
  seqbarcodes <- substr(dnaSet, 1, barcodelen)
  
  ## index rows that match your list of barcodes ##
  good.row <- seqbarcodes %in% names(barcodesSample)
  if(!any(good.row)) {
    stop("No matching barcoded sequences found on this quadrant.")
  }
  
  sampleNames <- barcodesSample[seqbarcodes[good.row]]    
  deflines <- sub("^(\\S+) .+$", "\\1", names(dnaSet)[good.row], perl = TRUE)
  
  ## if primer bases were utilized for tiebreaker, use the original length 
  ## instead of modified for trimming.
  if(is.null(trimFrom)) {
    trimFrom <- barcodelen + 1
  }
  
  ## remove sequences with unknown barcode and trim barcode itself ##
  unmatched <- DNAStringSet(dnaSet[!good.row])
  dnaSet <- DNAStringSet(dnaSet[good.row], start = trimFrom)
  names(dnaSet) <- deflines
  
  if(showStats) {
    message("Number of Sequences with no matching barcode: ",
            as.numeric(table(good.row)['FALSE']))
    message("Number of Sequences decoded:")
    print(as.data.frame(table(sampleNames)))
  }
  
  dnaSet <- as.list(split(dnaSet, as.character(sampleNames)))
  
  if(returnUnmatched) {
    dnaSet <- c(dnaSet, "unDecodedSeqs" = unmatched)
  }
  
  return(dnaSet)
}
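
## A sketch of what the split returns (dnaSet as in the example above): reads
## are trimmed past the barcode, and with returnUnmatched=TRUE the undecoded
## reads are kept aside under "unDecodedSeqs".
if(FALSE) {
  res <- splitByBarcode(c("ACATCCAT" = "Sample1", "GAATGGAT" = "Sample2"),
                        dnaSet, returnUnmatched = TRUE)
  sapply(res, length)      # reads per sample plus unDecodedSeqs
  width(res[["Sample1"]])  # original widths minus the 8 bp barcode
}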

#' Demultiplex reads by their barcodes
#'
#' Given a sample information object, the function reads in the raw fasta/fastq file, demultiplexes reads by their barcodes, and appends them back to the sampleInfo object. Calls \code{\link{splitByBarcode}} to perform the actual splitting of the file by barcode sequences. If supplied with a character vector and the reads themselves, the function behaves a bit differently. See the examples.
#'
#' @param sampleInfo sample information SimpleList object created using 
#' \code{\link{read.sampleInfo}}, which holds barcodes and sample names per 
#' sector/quadrant/lane or a character vector of barcodes to sample name 
#' associations. Ex: c("ACATCCAT"="Sample1", "GAATGGAT"="Sample2",...)
#' @param sector If sampleInfo is a SimpleList object, then a numeric/character 
#' value or vector representing sector(s) in sampleInfo. Optionally if on high 
#' memory machine sector='all' will decode/demultiplex sequences from all 
#' sectors/quadrants. This option is ignored if sampleInfo is a character vector. 
#' Default is NULL. 
#' @param dnaSet DNAStringSet object containing sequences to be decoded or 
#' demultiplexed. Default is NULL. If sampleInfo is a SimpleList object, then 
#' reads are automatically extracted using \code{\link{read.seqsFromSector}} 
#' and parameters defined in sampleInfo object.
#' @param showStats toggle output of search statistics. Default is FALSE.
#' @param returnUnmatched return unmatched sequences. Returns results as a list 
#' where x[["unDecodedSeqs"]] has culprits. Default is FALSE.
#' @param dereplicate return dereplicated sequences. Calls 
#' \code{\link{dereplicateReads}}, which appends counts=X to sequence 
#' names/deflines. Default is FALSE. Not applicable for paired end data since 
#' it can put the pairs out of sync.
#' @param alreadyDecoded if reads have already been decoded and split into 
#' respective files per sample, and the 'seqfilePattern' parameter in 
#' \code{\link{read.SeqFolder}} is set to read sample files rather than 
#' sector files, then set this to TRUE. Default is FALSE. Enabling this 
#' parameter skips the barcode detection step and loads the sequence file as 
#' is into the sampleInfo object. 
#'
#' @return If sampleInfo is an object of SimpleList then decoded sequences are 
#' appended to respective sample slots, else a named list of DNAStringSet 
#' object. If returnUnmatched=TRUE, then x[["unDecodedSeqs"]] has the 
#' unmatched sequences.
#'
#' @seealso \code{\link{splitByBarcode}}, \code{\link{dereplicateReads}},
#' \code{\link{replicateReads}}
#'
#' @export
#'
#' @aliases decodeByBarcode
#' 
#' @examples 
#' dnaSet <- DNAStringSet(c("read1" = "ACATCCATAGAGCTACGACGACATCGACATA",
#' "read2"="GAATGGATGACGACTACAGCACGACGAGCAGCTACT",
#' "read3"="GAATGGATGCGCTAAGAAGAGA", "read4"="ACATCCATTCTACACATCT"))
#' findBarcodes(sampleInfo = c("ACATCCAT" = "Sample1", "GAATGGAT" = "Sample2"),
#' dnaSet=dnaSet, showStats=TRUE, returnUnmatched=TRUE)
#' \dontrun{
#' load(file.path(system.file("data", package = "hiReadsProcessor"),
#' "FLX_seqProps.RData"))
#' findBarcodes(seqProps, sector = "all", showStats = TRUE)
#' }
findBarcodes <- function(sampleInfo, sector = NULL, dnaSet = NULL,
                         showStats = FALSE, returnUnmatched = FALSE,
                         dereplicate = FALSE, alreadyDecoded = FALSE) {
  
  ## tried PDict()...and its slower than this awesome code! ##    
  if(is(sampleInfo,"SimpleList")) {
    if(is.null(sector)) {
      stop("No sector provided in parameter sector.")
    }
    
    sectors <- sector <- as.character(sector)
    if(length(sectors) == 1 & tolower(sectors) == "all") {
      sectors <- names(sampleInfo$sectors)
    }
    
    if(any(!sectors %in% names(sampleInfo$sectors))) {
      stop("Following sectors not found in names(sampleInfo$sectors): ",
           sectors[!sectors %in% names(sampleInfo$sectors)])
    }
    
    for(sector in sectors) {
      ## check everything is cool with the provided barcodes first before 
      ## reading the sequences!
      message("Decoding sector: ",sector) 
      isPaired <- any(as.logical(extractFeature(sampleInfo, sector, 
                                                feature='pairedend')[[sector]]))
      
      ## prepare a vector of barcode to sample associations ##
      sampleBarcodes <- toupper(extractFeature(sampleInfo, sector = sector,
                                               feature = "barcode")[[sector]])
      barcodesSample <- structure(names(sampleBarcodes),
                                  names = as.character(sampleBarcodes))
      
      if(length(table(nchar(as.character(sampleBarcodes)))) > 1) {
        stop("Multiple barcode lengths found.")
      }
      
      ## length of barcodes before any modifications done later if any! ##
      realbarcodelen <- unique(nchar(as.character(sampleBarcodes)))
      
      if(any(table(as.character(sampleBarcodes)) > 1)) {
        message("Duplicate barcode found on this sector.\n",
                "Please choose from one of the options below:\n",
                "\t1: Pick first few bases of primer for tiebreaker? ",
                "(This could be dangerous if the run has too many errors!)\n",
                "\t2: Use the last sample associated with the duplicate ",
                "as the primary sample?\n", 
                "\t3: Do not do anything.")
        
        choice <- scan(what = integer(0), n = 1, quiet = TRUE, 
                       multi.line = FALSE)
        
        if(choice == 1) {  
          message("Enter # of bases to use from primer:")
          howmany <- scan(what = integer(0), n = 1, quiet = TRUE,
                          multi.line = FALSE)
          samplePrimers <- extractFeature(sampleInfo,
                                          sector = sector,
                                          feature = "primerltrsequence")[[sector]]
          samplePrimers <- toupper(samplePrimers)
          newBarcodes <- toupper(paste0(sampleBarcodes,
                                        substr(samplePrimers, 1, howmany))) 
          counts <- table(newBarcodes)
          ## only take 1 to 1 associations!
          rows <- newBarcodes %in% names(which(counts == 1)) 
          
          if(any(counts > 1)) {            
            message("Tie breaking failed...",
                    "try choosing high number of bases from primer possibly? ",
                    "Here are the failed barcodes: ",
                    paste(names(which(counts > 1)), collapse = ", "))
            
            message("Ignore samples associated with those barcodes and ",
                    "continue processing? (y/n)")
            whatsup <- scan(what = character(0), n = 1, quiet = TRUE,
                            multi.line = FALSE)            
            if(whatsup == 'n') {
              stop("Aborting processing due to ambiguous barcode ",
                   "association for samples: ",
                   paste(names(sampleBarcodes[!rows]), collapse = ", "))
            } else {              
              message("Ignoring following samples due to duplicate barcodes: ",
                      paste(names(sampleBarcodes[!rows]), collapse = ", "))
            }
          }
          
          barcodesSample <- structure(names(sampleBarcodes[rows]),
                                      names = newBarcodes[rows])
          
        } else if(choice == 2) {
          message("Overwriting duplicate samples associated with the same barcode...")
        } else {
          stop("Aborting due to duplicate barcode found on this sector")
        }
      }
      
      dnaSet <- read.seqsFromSector(sampleInfo, sector, isPaired)
      
      if(alreadyDecoded) {
        if(length(barcodesSample) > 1) {
          stop("alreadyDecoded parameter is set to TRUE. There shouldn't be more ",
               "than one sample associated to a sequence file.")
        }
        
        if(is.list(dnaSet)) {
          dnaSet <- sapply(dnaSet, function(x) {
            names(x) <- sub("^\\S+-(\\S+) .+$", "\\1", names(x), perl = TRUE)
            x
          })          
        } else {
          names(dnaSet) <- sub("^\\S+-(\\S+) .+$", "\\1",
                               names(dnaSet), perl = TRUE) 
        }        
        
        if(isPaired) {
          ## no need to store barcode/index reads if alreadyDecoded!
          dnaSet <- list(dnaSet[c("pair1","pair2")])
        } else {          
          if(dereplicate) {
            dnaSet <- list(dereplicateReads(dnaSet))
          } else {
            dnaSet <- list(dnaSet)
          }
        }
        names(dnaSet) <- as.character(barcodesSample)
      } else {
        if(isPaired) {
          bc <- splitByBarcode(barcodesSample, dnaSet[["barcode"]],
                               trimFrom = realbarcodelen + 1, 
                               showStats = showStats,
                               returnUnmatched = returnUnmatched)
          p1 <- sapply(bc, function(x) dnaSet[['pair1']][names(x)])
          p2 <- sapply(bc, function(x) dnaSet[['pair2']][names(x)])
          stopifnot(identical(sapply(bc, length), sapply(p1, length)))
          stopifnot(identical(sapply(bc, length), sapply(p2, length)))
          dnaSet <- mapply(function(x, y) list("pair1" = x, "pair2" = y),
                           p1, p2, SIMPLIFY = FALSE)
          rm("bc","p1","p2")
        } else {
          dnaSet <- splitByBarcode(barcodesSample, dnaSet,
                                   trimFrom = realbarcodelen + 1,
                                   showStats = showStats,
                                   returnUnmatched = returnUnmatched)
          if(dereplicate) {
            dnaSet <- dereplicateReads(dnaSet)
          }
        }
      }
      
      for(samplename in names(dnaSet)) {
        if(samplename == "unDecodedSeqs") {
          metadata(sampleInfo$sectors[[sector]]) <-
            append(metadata(sampleInfo$sectors[[sector]]),
                   list("unDecodedSeqs" = dnaSet[[samplename]]))
        } else {
          sampleInfo$sectors[[sector]]$samples[[samplename]]$decoded <- 
            dnaSet[[samplename]]
        }
      }
      metadata(sampleInfo$sectors[[sector]]) <- 
        append(metadata(sampleInfo$sectors[[sector]]),
               list("decodedBy" = barcodesSample))
    }
    sampleInfo$callHistory <- append(sampleInfo$callHistory, match.call())
    decoded <- sampleInfo
    cleanit <- gc()
  } else {
    decoded <- splitByBarcode(sampleInfo, dnaSet, trimFrom = NULL,
                              showStats = showStats,
                              returnUnmatched = returnUnmatched)
    cleanit <- gc()
  }
  return(decoded)
}

#' @export
decodeByBarcode <- findBarcodes

#' Align a short pattern to variable length target sequences.
#'
#' Align a fixed length short pattern sequence (i.e. primers or adaptors) to subject sequences using \code{\link{pairwiseAlignment}}. This function uses defaults of type="overlap", gapOpening=-1, and gapExtension=-1 to align the patternSeq against subjectSeqs. One can adjust these parameters if preferred, but it is not recommended. This function is meant for aligning a short pattern onto a large collection of subjects. If you are looking to align a vector sequence to subjects, then please use BLAT or see one of the following: \code{\link{blatSeqs}}, \code{\link{findAndRemoveVector}}
#'
#' @param subjectSeqs DNAStringSet object containing sequences to be searched for the pattern. This is generally bigger than patternSeq, and cases where subjectSeqs is smaller than patternSeq will be ignored in the alignment.
#' @param patternSeq DNAString object or a sequence containing the query sequence to search. This is generally smaller than subjectSeqs. 
#' @param side which side of the sequence to perform the search: left, right or middle. Default is 'left'.
#' @param qualityThreshold percent of patternSeq to match. Default is 1, full match.
#' @param showStats toggle output of search statistics. Default is FALSE.
#' @param bufferBases use x number of bases in addition to patternSeq length to perform the search. Beneficial in cases where the pattern has homopolymers or indels compared to the subject. Default is 5. Doesn't apply when side='middle'.
#' @param doRC perform reverse complement search of the defined pattern. Default is TRUE.
#' @param returnUnmatched return sequences which had no match, or less than a 
#' 5\% match, to the patternSeq. Default is FALSE.
#' @param returnLowScored return sequences which had quality score less than 
#' the defined qualityThreshold. Default is FALSE.
#' @param parallel use parallel backend to perform calculation with 
#' \code{\link{BiocParallel}}. Defaults to FALSE. If no parallel backend is 
#' registered, then a serial version is run using \code{\link{SerialParam}}.
#' @param ... extra parameters for \code{\link{pairwiseAlignment}}
#' @note 
#' \itemize{
#'  \item For qualityThreshold, the alignment score is calculated by 
#'  (matches*2)-(mismatches+gaps) which programatically translates to 
#'  round(nchar(patternSeq)*qualityThreshold)*2.
#'  \item Gaps and mismatches are weighed equally with value of -1 which can 
#'  be overridden by defining extra parameters 'gapOpening' & 'gapExtension'.
#'  \item If qualityThreshold is 1, then it is a full match, if 0, then any 
#'  match is accepted which is useful in searching linker sequences at 3' end. 
#'  Beware, this function only searches for the pattern sequence in one 
#'  orientation. If you are expecting to find the pattern in both orientation, 
#'  you might be better off using BLAST/BLAT!
#'  \item If parallel=TRUE, then be sure to have a parallel backend registered 
#'  before running the function. One can use any of the following 
#'  \code{\link{MulticoreParam}} \code{\link{SnowParam}}
#' }
#'
#' @return 
#' \itemize{
#'  \item IRanges object with starts, stops, and names of the aligned sequences. 
#'  \item If returnLowScored or returnUnmatched = TRUE, then a CompressedIRangesList 
#'  where x[["hits"]] has the good scoring hits, x[["Rejected"]] has the failed 
#'  to match qualityThreshold hits, and x[["Absent"]] has the hits where the 
#'  aligned bit is <=10\% match to the patternSeq.
#' }
#'
#' @seealso \code{\link{primerIDAlignSeqs}}, \code{\link{vpairwiseAlignSeqs}}, 
#' \code{\link{doRCtest}}, \code{\link{findAndTrimSeq}}, \code{\link{blatSeqs}},
#' \code{\link{findAndRemoveVector}}
#'
#' @export
#'
#' @examples 
#' subjectSeqs <- c("CCTGAATCCTGGCAATGTCATCATC", "ATCCTGGCAATGTCATCATCAATGG", 
#' "ATCAGTTGTCAACGGCTAATACGCG", "ATCAATGGCGATTGCCGCGTCTGCA", 
#' "CCGCGTCTGCAATGTGAGGGCCTAA", "GAAGGATGCCAGTTGAAGTTCACAC")
#' subjectSeqs <- DNAStringSet(xscat("AAAAAAAAAA", subjectSeqs))
#' pairwiseAlignSeqs(subjectSeqs, "AAAAAAAAAA", showStats=TRUE)
#' pairwiseAlignSeqs(subjectSeqs, "AAATAATAAA", showStats=TRUE, 
#' qualityThreshold=0.5)
#'
pairwiseAlignSeqs <- function(subjectSeqs = NULL, patternSeq = NULL, 
                              side = "left", qualityThreshold = 1, 
                              showStats = FALSE, bufferBases = 5, doRC = TRUE, 
                              returnUnmatched = FALSE, returnLowScored = FALSE, 
                              parallel = FALSE, ...) {
  dp <- NULL
  
  .checkArgs_SEQed()
  
  if(parallel) {
    subjectSeqs2 <- chunkize(subjectSeqs)
    hits <- bplapply(subjectSeqs2, function(x) 
      pairwiseAlignSeqs(x, patternSeq, side, qualityThreshold, showStats = FALSE,
                        bufferBases, doRC, returnUnmatched,  returnLowScored,
                        parallel = FALSE, ...), BPPARAM = dp)    
    hits <- do.call(c, hits)
    if(is(hits,"CompressedIRangesList")) {
      attrs <- unique(names(hits))
      hits <- sapply(attrs, 
                     function(x) unlist(hits[names(hits) == x], 
                                        use.names = FALSE))
      IRangesList(hits)
    } else {
      hits
    }
  } else {
    qualityThreshold <- as.numeric(qualityThreshold)
    
    ## only get the relevant side of subject sequence with extra bufferBases to 
    ## account for indels/mismatches & save memory while searching and avoid 
    ## searching elsewhere in the sequence
    if(tolower(side) == "left") {
      badSeqs <- DNAStringSet()
      culprits <- width(subjectSeqs) < (nchar(patternSeq) + bufferBases)
      if(any(culprits)) {
        badSeqs <- subjectSeqs[culprits]
        message(length(badSeqs),
                " sequences were removed from aligning since they were",
                " shorter than pattern getting aligned: ",
                (nchar(patternSeq) + bufferBases),"bp")            
        subjectSeqs <- subjectSeqs[!culprits]            
      }
      subjectSeqs2 <- subseq(subjectSeqs, start = 1,
                             end = (nchar(patternSeq) + bufferBases))
      overFromLeft <- rep(0, length(subjectSeqs))
    } else if (tolower(side) == "right") {
      overFromLeft <- width(subjectSeqs) - (nchar(patternSeq) + bufferBases)
      overFromLeft[overFromLeft < 1] <- 1
      subjectSeqs2 <- subseq(subjectSeqs, start = overFromLeft)
    } else {
      subjectSeqs2 <- subjectSeqs
      overFromLeft <- rep(0, length(subjectSeqs))
    }
    
    ## search both ways to test which side yields more hits!
    if(doRC) {
      patternSeq <- tryCatch(doRCtest(subjectSeqs2, patternSeq, 
                                      qualityThreshold),
                             error = function(e) patternSeq)
    }
    
    ## type=overlap is best for primer trimming...see Biostrings Alignment vignette
    if(any(names(match.call()) %in% c("type", "gapOpening", "gapExtension"))) {
      hits <- pairwiseAlignment(subjectSeqs2, patternSeq, ...)        
    } else {
      hits <- pairwiseAlignment(subjectSeqs2, patternSeq, type = "overlap",
                                gapOpening = -1, gapExtension = -1, ...)
    }
    stopifnot(length(hits) == length(subjectSeqs2))
    
    scores <- round(score(hits))
    highscored <- scores >= round(nchar(patternSeq) * qualityThreshold) * 2
    
    if(!any(highscored)) {
      stop("No hits found which passed the qualityThreshold")
    }
    
    ## effectively absent matches: aligned bit is <=10% of the pattern length
    unmatched <- nchar(hits) <= round(nchar(patternSeq) * .1) 
    
    # no point in showing stats if all sequences are a potential match! #
    if(showStats & qualityThreshold != 0) {
      bore <- as.numeric(table(highscored)['FALSE'])
      message("Total of ", ifelse(is.na(bore), 0, bore),
              " did not have the defined pattern sequence (", patternSeq,
              ") that passed qualityThreshold on the ", side, " side")
    }
    
    ## extract starts-stops of the entire pattern hit ##
    starts <- start(pattern(hits))
    ends <- end(pattern(hits))
    namesq <- names(subjectSeqs)
    hits <- IRanges(start = starts + overFromLeft - ifelse(side == "right", 2, 0),
                    end = ends + overFromLeft - ifelse(side == "right", 2, 0),
                    names = namesq)
    rm("scores","subjectSeqs2","subjectSeqs","starts","ends","namesq")
    
    ## no need to test if there were any multiple hits since pairwiseAlignment 
    ## will only output one optimal alignment...see the man page.
    if(!returnLowScored & !returnUnmatched) {      
      hits <- hits[highscored]
    } else {
      hitstoreturn <- IRangesList("hits" = hits[highscored])
      if(returnLowScored & length(hits[!highscored])>0) {
        hitstoreturn <- append(hitstoreturn,
                               IRangesList("Rejected" = hits[!highscored]))
      }
      
      if(returnUnmatched & length(hits[unmatched]) > 0) {
        hitstoreturn <- append(hitstoreturn,
                               IRangesList("Absent" = hits[unmatched]))
      }
      hits <- hitstoreturn
      rm(hitstoreturn)
    }             
    cleanit <- gc()
    return(hits)
  }
}
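
## The returned IRanges are named by read, so the genomic portion downstream
## of the matched pattern falls right out (a sketch; subjectSeqs as built in
## the roxygen example above, with names added since xscat() drops them):
if(FALSE) {
  names(subjectSeqs) <- paste0("read", seq_along(subjectSeqs))
  primerHits <- pairwiseAlignSeqs(subjectSeqs, "AAAAAAAAAA")
  genomic <- subseq(subjectSeqs[names(primerHits)],
                    start = end(primerHits) + 1)
}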

#' Align a short pattern with PrimerID to variable length target sequences.
#'
#' Align a fixed length short pattern sequence containing a primerID to variable length subject sequences using \code{\link{pairwiseAlignment}}. This function uses defaults of type="overlap", gapOpening=-1, and gapExtension=-1 to align the patternSeq against subjectSeqs. The search is broken up into as many pieces +1 as there are primerIDs and then compared against subjectSeqs. For example, patternSeq="AGCATCAGCANNNNNNNNNACGATCTACGCC" will launch two search jobs, one per either side of the Ns. For each search, qualityThreshold is used to filter out candidate alignments, and the area in between is chosen to be the primerID. This strategy is beneficial because of indels introduced through homopolymer errors. Most likely the length of the primerID(s) won't be the same as you expected!
#'
#' @param subjectSeqs DNAStringSet object containing sequences to be searched for the pattern. 
#' @param patternSeq DNAString object or a sequence containing the query sequence to search with the primerID.
#' @param qualityThreshold1 percent of first part of patternSeq to match. Default is 0.75.
#' @param qualityThreshold2 percent of second part of patternSeq to match. Default is 0.50.
#' @param doAnchored for primerID based patternSeq, use the bases before and after the primerID in patternSeq as anchors. Default is FALSE.
#' @param doRC perform reverse complement search of the defined pattern. Default is TRUE.
#' @param returnUnmatched return sequences that had no match, or less than a 5\% match, to the first part of patternSeq before the primerID. Default is FALSE.
#' @param returnRejected return sequences that only match one side of patternSeq, or whose primerID length does not match # of Ns +/-2 in the pattern. Default is FALSE.
#' @param showStats toggle output of search statistics. Default is FALSE.
#' @param ... extra parameters for \code{\link{pairwiseAlignment}}
#'
#' @note 
#' \itemize{
#'  \item For qualityThreshold1 & qualityThreshold2, the alignment score is calculated by (matches*2)-(mismatches+gaps) which programmatically translates to round(nchar(patternSeq)*qualityThreshold)*2
#'  \item Gaps and mismatches are weighed equally with value of -1 which can be overridden by defining extra parameters 'gapOpening' & 'gapExtension'.
#'  \item If qualityThreshold is 1, then it is a full match, if 0, then any match is accepted which is useful in searching linker sequences at 3' end. Beware, this function only searches for the pattern sequence in one orientation. If you are expecting to find the pattern in both orientation, you might be better off using BLAST/BLAT!
#' }
#'
#' @return 
#' \itemize{
#'  \item A CompressedIRangesList of length two, where x[["hits"]] is hits covering the entire patternSeq, and x[["primerIDs"]] is the potential primerID region. 
#'  \item If returnUnmatched = T, then x[["Absent"]] is appended which includes reads not matching the first part of patternSeq. 
#'  \item If returnRejected=TRUE, then x[["Rejected"]] includes reads that only matched one part of patternSeq or places where no primerID was found in between two part of patternSeq, and x[["RejectedprimerIDs"]] includes primerIDs that didn't match the correct length. 
#'  \item If doAnchored=TRUE, then x[["unAnchoredprimerIDs"]] includes reads that didn't match the base before and after primer ID on patternSeq.
#' }
#'
#' @seealso \code{\link{vpairwiseAlignSeqs}}, \code{\link{pairwiseAlignSeqs}},
#' \code{\link{doRCtest}}, \code{\link{blatSeqs}}, 
#' \code{\link{findAndRemoveVector}}
#'
#' @export
#'
#' @examples 
#' subjectSeqs <- c("CCTGAATCCTGGCAATGTCATCATC", "ATCCTGGCAATGTCATCATCAATGG", 
#' "ATCAGTTGTCAACGGCTAATACGCG", "ATCAATGGCGATTGCCGCGTCTGCA", 
#' "CCGCGTCTGCAATGTGAGGGCCTAA", "GAAGGATGCCAGTTGAAGTTCACAC")
#' ids <- c("GGTTCTACGT", "AGGAGTATGA", "TGTCGGTATA", "GTTATAAAAC", 
#' "AGGCTATATC", "ATGGTTTGTT")
#' subjectSeqs <- xscat(subjectSeqs, xscat("AAGCGGAGCCC",ids,"TTTTTTTTTTT"))
#' patternSeq <- "AAGCGGAGCCCNNNNNNNNNNTTTTTTTTTTT"
#' primerIDAlignSeqs(DNAStringSet(subjectSeqs), patternSeq, doAnchored = TRUE)
primerIDAlignSeqs <- function(subjectSeqs = NULL, patternSeq = NULL,
                              qualityThreshold1 = 0.75, 
                              qualityThreshold2 = 0.50,
                              doAnchored = FALSE, doRC = TRUE,
                              returnUnmatched = FALSE, returnRejected = FALSE,
                              showStats = FALSE, ...) {
  
  .checkArgs_SEQed()
  
  qualityThreshold1 <- as.numeric(qualityThreshold1)
  qualityThreshold2 <- as.numeric(qualityThreshold2)
  
  ## make sure there are Ns in the patternSeq for considering primerIDs
  if(length(unlist(gregexpr("N", patternSeq))) < 4) {
    stop("There should be at least 4 Ns in patternSeq for it to be ",
         "considered a primerID sequence.")
  }
  
  ## Get the right orientation of the supplied patternSeq to perform a proper 
  ## search in the later two-step search phase. 
  if(doRC) {
    patternSeq <- tryCatch(doRCtest(subjectSeqs, patternSeq),
                           error = function(e) patternSeq)
  }
  
  primerIDpos <- unlist(gregexpr("N", patternSeq))
  
  ## Perform primerID extraction by breaking the pattern into two parts...
  ## for sanity sakes due to homopolymers ##
  pattern1 <- as.character(subseq(DNAString(patternSeq), 1, primerIDpos[1] - 1))
  pattern2 <- as.character(subseq(DNAString(patternSeq),
                                  primerIDpos[length(primerIDpos)] + 1))
  
  pattern1.hits <- pairwiseAlignSeqs(subjectSeqs, pattern1, "middle",
                                     qualityThreshold = qualityThreshold1,
                                     doRC = FALSE, returnUnmatched = TRUE, ...)
  pattern2.hits <- pairwiseAlignSeqs(subjectSeqs, pattern2, "middle",
                                     qualityThreshold = qualityThreshold2,
                                     doRC = FALSE, ...)
  
  ## Set aside reads which have no match to pattern1...
  ## most likely mispriming if primerID is on 5' or read was too long if on 3'
  ## Hits returned from pairwiseAlignSeqs will be filtered for low scored hits...
  ## so no need to check for those from subjectSeqs
  unmatched <- pattern1.hits[["Absent"]]
  pattern1.hits <- pattern1.hits[["hits"]]
  
  ## remove reads which only have a match to one of the patterns...crossover most likely!
  rejected1 <- setdiff(names(pattern1.hits), names(pattern2.hits))
  rejected2 <- setdiff(names(pattern2.hits), names(pattern1.hits))
  rejected <- c(pattern1.hits[names(pattern1.hits) %in% rejected1], 
                pattern2.hits[names(pattern2.hits) %in% rejected2])
  rm("rejected1","rejected2")
  if(showStats) { 
    message("Removed ", length(rejected),
            " read(s) for only matching one of pattern1 or pattern2") 
  }
  
  ## use only reads which match to both sides of the patterns.
  good.rows <- intersect(names(pattern1.hits), names(pattern2.hits))
  pattern1.hits <- pattern1.hits[names(pattern1.hits) %in% good.rows]
  pattern2.hits <- pattern2.hits[names(pattern2.hits) %in% good.rows]
  
  stopifnot(identical(names(pattern1.hits), names(pattern2.hits)))
  stopifnot(identical(names(pattern1.hits), good.rows))
  
  ## make sure there is no overlap of ranges between pattern1.hits & 
  ## pattern2.hits if there is, then no primerID was found...remove it
  badAss <- end(pattern1.hits) >= start(pattern2.hits)
  if(any(badAss)) {
    rejected <- c(rejected, pattern1.hits[badAss], pattern2.hits[badAss])
    pattern1.hits <- pattern1.hits[!badAss]
    pattern2.hits <- pattern2.hits[!badAss]
    good.rows <- good.rows[!badAss]
    bore <- table(badAss)["TRUE"]
    message("Removed ", ifelse(is.na(bore),0,bore),
            " read(s) for not having primerID present between pattern 1 & 2")
  }
  
  hits <- IRanges(start = start(pattern1.hits),
                  end = end(pattern2.hits),
                  names = good.rows)        
  
  primerIDs <- IRanges(start = end(pattern1.hits) + 1,
                       end = start(pattern2.hits) - 1,
                       names = good.rows)        
  
  if(length(hits) == 0) {
    stop("No hits found that matched both sides of patternSeq with ",
         "primerID in the middle.")
  }    
  
  ## do anchored search for only sequences that matched both sides of patternSeq
  if(doAnchored) {
    message("Found ", length(primerIDs), 
            " total primerIDs before anchored filter.")
    
    ## get anchors of bases flanking Ns
    anchorBase.s <- substr(patternSeq, primerIDpos[1] - 1, primerIDpos[1] - 1)
    anchorBase.e <- substr(patternSeq, primerIDpos[length(primerIDpos)] + 1,
                           primerIDpos[length(primerIDpos)] + 1)
    
    anchorBase.s.i <- trimSeqs(subjectSeqs,
                               resize(pattern1.hits, width = 1, fix = "end"))
    anchorBase.e.i <- trimSeqs(subjectSeqs,
                               resize(pattern2.hits, width = 1, fix = "start"))
    rows <- anchorBase.s == as.character(anchorBase.s.i) &
      anchorBase.e == as.character(anchorBase.e.i)
    
    unAnchored <- hits[!rows]
    primerIDs <- primerIDs[rows]
    hits <- hits[rows]
    message("Found ", length(primerIDs), " total primerIDs after anchored filter.")
    rm("rows","anchorBase.s.i","anchorBase.e.i")
  }
  rm("good.rows", "pattern1.hits", "pattern2.hits")
  cleanit <- gc()
  
  ## also remove any primerIDs that were shorter or longer than intended.
  nSize <- 2
  badAss <- !width(primerIDs) %in% 
    (length(primerIDpos) - nSize):(length(primerIDpos) + nSize)
  if(any(badAss)) {
    rejectedprimerIDs <- hits[badAss]
    hits <- hits[!badAss]
    primerIDs <- primerIDs[!badAss]
    message("Removed ", table(badAss)["TRUE"],
            " read(s) for not having right primerID length")
  }
  
  hits <- IRangesList("hits" = hits, "primerIDs" = primerIDs)
  
  if(exists("unAnchored")) {
    if(length(unAnchored) > 0) { 
      hits <- append(hits, IRangesList("unAnchoredprimerIDs" = unAnchored)) 
    }
  }
  
  if(returnUnmatched & length(unmatched) > 0) {
    hits <- append(hits, IRangesList("Absent" = unmatched))
  }
  
  if(returnRejected) {
    if(length(rejected) > 0) { 
      hits <- append(hits, IRangesList("Rejected" = rejected)) 
    }
    if(exists("rejectedprimerIDs")) { 
      if(length(rejectedprimerIDs) > 0) { 
        hits <- append(hits, IRangesList("RejectedprimerIDs" = rejectedprimerIDs)) 
      } 
    }
  }    
  return(hits)
}
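
## An illustrative sketch (comments only, not executed) of the two-part split
## performed above: for a patternSeq such as "AAGCGGAGCCCNNNNNNNNNNTTTTTTTTTTT",
## the search effectively operates on the two flanks of the N-run, i.e.
##   pos <- unlist(gregexpr("N", patternSeq))
##   subseq(DNAString(patternSeq), 1, pos[1] - 1)        # 5' flank "AAGCGGAGCCC"
##   subseq(DNAString(patternSeq), pos[length(pos)] + 1) # 3' flank "TTTTTTTTTTT"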

#' Align a short pattern to variable length target sequences.
#'
#' Align a fixed length short pattern sequence to subject sequences using \code{\link{vmatchPattern}}. This function is meant for aligning a short pattern onto a large collection of subjects. If you are looking to align a vector sequence to subjects, then please use BLAT.
#'
#' @param subjectSeqs DNAStringSet object containing sequences to be searched for the pattern. These are generally longer than patternSeq, and cases where subjectSeqs is shorter than patternSeq will be ignored in the alignment.
#' @param patternSeq DNAString object or a sequence containing the query sequence to search. This is generally shorter than subjectSeqs. 
#' @param side which side of the sequence to perform the search: left, right, or middle. Default is 'left'.
#' @param qualityThreshold percent of patternSeq to match. Default is 1, full match. This is supplied to max.mismatch parameter of \code{\link{vmatchPattern}} as round(nchar(patternSeq)*(1-qualityThreshold)).
#' @param showStats toggle output of search statistics. Default is FALSE.
#' @param bufferBases use this many bases in addition to patternSeq length to perform the search. Beneficial in cases where the pattern has homopolymers or indels compared to the subject. Default is 5. Doesn't apply when side='middle'.
#' @param doRC perform reverse complement search of the defined pattern. Default is TRUE.
#' @param parallel use parallel backend to perform calculation with 
#' \code{\link{BiocParallel}}. Defaults to FALSE. If no parallel backend is 
#' registered, then a serial version is run using \code{\link{SerialParam}}.
#' @param ... extra parameters for \code{\link{vmatchPattern}} except for 'max.mismatch' since it's calculated internally using the 'qualityThreshold' parameter.
#'
#'
#' @note 
#' \itemize{
#'  \item For qualityThreshold, the alignment score is calculated by (matches*2)-(mismatches+gaps) which programmatically translates to round(nchar(patternSeq)*qualityThreshold)*2.
#'  \item No indels are allowed in the function, if expecting indels then use \code{\link{pairwiseAlignSeqs}}.
#'  \item If qualityThreshold is 1, then a full match is required; if 0, then any match is accepted, which is useful when searching for linker sequences at the 3' end. Beware, this function only searches for the pattern sequence in one orientation. If you are expecting to find the pattern in both orientations, you might be better off using BLAST/BLAT!
#'  \item If parallel=TRUE, then be sure to have a parallel backend registered 
#'  before running the function. One can use any of the following: 
#'  \code{\link{MulticoreParam}}, \code{\link{SnowParam}}
#' }
#' 
#' @return IRanges object with starts, stops, and names of the aligned sequences.
#'
#' @seealso \code{\link{pairwiseAlignSeqs}}, \code{\link{primerIDAlignSeqs}}, 
#' \code{\link{doRCtest}}, \code{\link{findAndTrimSeq}}, \code{\link{blatSeqs}}, 
#' \code{\link{findAndRemoveVector}}
#'
#' @export
#'
#' @examples 
#' subjectSeqs <- c("CCTGAATCCTGGCAATGTCATCATC", "ATCCTGGCAATGTCATCATCAATGG", 
#' "ATCAGTTGTCAACGGCTAATACGCG", "ATCAATGGCGATTGCCGCGTCTGCA", 
#' "CCGCGTCTGCAATGTGAGGGCCTAA", "GAAGGATGCCAGTTGAAGTTCACAC")
#' subjectSeqs <- DNAStringSet(xscat("AAAAAAAAAA", subjectSeqs))
#' vpairwiseAlignSeqs(subjectSeqs, "AAAAAAAAAA", showStats=TRUE)
#' vpairwiseAlignSeqs(subjectSeqs, "AAAAAAAAAA", showStats=TRUE,
#' qualityThreshold=0.5)
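#'
#' ## a hedged sketch: the same adapter appended at the 3' end instead, searched
#' ## with side="right"; with a registered BiocParallel backend, parallel=TRUE
#' ## would chunk the subjects via chunkize()
#' subjectSeqs3p <- DNAStringSet(xscat(subjectSeqs, "TTTTTTTTTT"))
#' vpairwiseAlignSeqs(subjectSeqs3p, "TTTTTTTTTT", side = "right",
#' showStats = TRUE)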
vpairwiseAlignSeqs <- function(subjectSeqs = NULL, patternSeq = NULL,
                               side = "left", qualityThreshold = 1, 
                               showStats = FALSE, bufferBases = 5, doRC = TRUE, 
                               parallel = FALSE, ...) {
  dp <- NULL
  
  .checkArgs_SEQed()
  
  qualityThreshold <- as.numeric(qualityThreshold)
  
  ## Only get the relevant side of subject sequence with extra bufferBases to...
  ## account for indels/mismatches & save memory while searching & avoid searching...
  ## elsewhere in the sequence
  if(tolower(side) == "left") {
    badSeqs <- DNAStringSet()
    culprits <- width(subjectSeqs) < (nchar(patternSeq)+bufferBases)
    if(any(culprits)) {
      badSeqs <- subjectSeqs[culprits]
      message(length(badSeqs),
              " sequences were removed from the alignment since they were",
              " shorter than the pattern being aligned: ",
              (nchar(patternSeq) + bufferBases), " bp")
      subjectSeqs <- subjectSeqs[!culprits]            
    }
    subjectSeqs2 <- subseq(subjectSeqs, start = 1,
                           end = (nchar(patternSeq) + bufferBases))
    overFromLeft <- rep(0, length(subjectSeqs))
  } else if (tolower(side) == "right") { 
    overFromLeft <- width(subjectSeqs) - (nchar(patternSeq) + bufferBases)
    overFromLeft[overFromLeft < 1] <- 1
    subjectSeqs2 <- subseq(subjectSeqs, start = overFromLeft)
  } else {
    subjectSeqs2 <- subjectSeqs
    overFromLeft <- rep(0, length(subjectSeqs))
  }
  
  ## search both ways to test which orientation yields more hits!        
  if(doRC) {        
    patternSeq <- tryCatch(doRCtest(subjectSeqs2, patternSeq, qualityThreshold),
                           error = function(e) patternSeq)
  }
  
  if(parallel) {
    subjectSeqs2 <- chunkize(subjectSeqs2)
    maxmis <- round(nchar(patternSeq) * (1 - qualityThreshold))
    hits <- bplapply(subjectSeqs2, function(x) {
      bore <- vmatchPattern(patternSeq, x, max.mismatch = maxmis, ...)
      unlist(bore, recursive = TRUE, use.names = TRUE)
    }, BPPARAM = dp) 
    hits <- do.call(c, hits)
  } else {
    max.mm <- round(nchar(patternSeq) * (1 - qualityThreshold))
    hits <- vmatchPattern(patternSeq, subjectSeqs2, max.mismatch = max.mm, ...)
    hits <- unlist(hits, recursive = TRUE, use.names = TRUE)
  }
  
  ## test if there are any multiple overlapping hits, and if so, reduce them
  counts <- Rle(names(hits))
  if(any(runLength(counts) > 1)) {
    reduced <- reduce(GRanges(seqnames = names(hits),
                              IRanges(start = start(hits), end = end(hits))))
    counts <- seqnames(reduced)
    hits <- ranges(reduced)
    names(hits) <- as.character(seqnames(reduced))
    rm(reduced)
    if(any(runLength(counts) > 1)) {
      message("More than 1 pattern (", patternSeq, ") match found for:",
              paste(runValue(counts)[runLength(counts) > 1], collapse = ","),
              "\nUsing the latter occuring hit as the dominant for each read.")
      toremove <- c()
      for(culprits in as.character(runValue(counts)[runLength(counts) > 1])) {
        rows <- which(names(hits) %in% culprits)
        toremove <- c(toremove, rows[1:length(rows) - 1])
      }
      hits <- hits[-toremove]
      counts <- Rle(names(hits))
      if(any(runLength(counts) > 1)) {
        stop("More than 1 pattern unresolved (", patternSeq, ") match found for:",
             paste(runValue(counts)[runLength(counts) > 1], collapse = ","))
      }
    }
  }
  
  good.row <- names(subjectSeqs2) %in% names(hits)
  
  if(showStats) {
    bore <- as.numeric(table(good.row)['FALSE'])    
    message("Total of ", ifelse(is.na(bore),0,bore),
            " did not have the defined pattern sequence (", patternSeq,
            ") that passed qualityThreshold on the ", side, " side")
  }  
  
  starts <- start(hits)
  ends <- end(hits)
  namesq <- names(hits)
  rm("hits", "subjectSeqs2")     
  
  hits <- IRanges(start = starts + overFromLeft[good.row] - 
                    ifelse(side == "right", 2, 0),
                  end = ends + overFromLeft[good.row] - 
                    ifelse(side == "right", 2, 0),
                  names = namesq)
  
  cleanit <- gc()
  return(hits)
}

#' Test if pattern aligns better in +/- orientation.
#'
#' Given a fixed length pattern sequence and variable length subject sequences, the function roughly finds which orientation of the pattern yields the most hits. The function doing the heavy lifting is \code{\link{vcountPattern}}. This is an accessory function used in the functions listed under the See Also section below. 
#'
#' @param subjectSeqs DNAStringSet object containing sequences to be searched for the pattern. 
#' @param patternSeq DNAString object or a sequence containing the query sequence to search.
#' @param qualityThreshold percent of patternSeq to match. Default is 0.50, half match. This is supplied to max.mismatch parameter of \code{\link{vcountPattern}} as round(nchar(patternSeq)*(1-qualityThreshold)).
#' @param parallel use parallel backend to perform calculation with 
#' \code{\link{BiocParallel}}. Defaults to TRUE. If no parallel backend is 
#' registered, then a serial version is run using \code{\link{SerialParam}}.
#'
#' @return patternSeq that aligned the best. 
#'
#' @seealso \code{\link{pairwiseAlignSeqs}}, \code{\link{vpairwiseAlignSeqs}}, \code{\link{primerIDAlignSeqs}}
#'
#' @export
#'
#' @examples 
#' subjectSeqs <- c("CCTGAATCCTGGCAATGTCATCATC", "ATCCTGGCAATGTCATCATCAATGG", 
#' "ATCAGTTGTCAACGGCTAATACGCG", "ATCAATGGCGATTGCCGCGTCTGCA", 
#' "CCGCGTCTGCAATGTGAGGGCCTAA", "GAAGGATGCCAGTTGAAGTTCACAC")
#' subjectSeqs <- xscat("AAAAAAAAAA", subjectSeqs)
#' doRCtest(subjectSeqs, "TTTTTTTTT")
doRCtest <- function(subjectSeqs = NULL, patternSeq = NULL,
                     qualityThreshold = 0.5, parallel = TRUE) {
  
  dp <- NULL
  
  .checkArgs_SEQed()
    
  patternSeq.rc <- as.character(reverseComplement(DNAString(patternSeq)))
  hits <- bplapply(c(patternSeq, patternSeq.rc), function(x) {
    max.mm <- round(nchar(x) * (1 - qualityThreshold))
    counts <- pmin(vcountPattern(x, subjectSeqs, max.mismatch = max.mm), 1)
    sum(counts)
  }, BPPARAM = dp)
  
  if(all(hits == 0)) {
    stop("No hits found")
  }
  
  if(hits[[1]] < hits[[2]]) {
    message("There were less/no good hits found for pattern ", patternSeq,
            " than its reverse complement...", patternSeq.rc,
            "\nUsing the latter to perform the searching.")
    patternSeq <- patternSeq.rc
  }
  
  return(patternSeq)
}

# This function generates alignment statistics for the findXXXXXX functions.
# The expressions below are evaluated in the calling function's frame.
.showFindStats <- function() {
  checks <- expression(
    isFindLinker <- grepl("linker", trimmedObj, ignore.case = TRUE) |
      grepl("linker", featureTrim, ignore.case = TRUE),
    listed <- sapply(get(trimmedObj), is.list),
    if(any(listed)) {
      totals <- if(isFindLinker) {        
        sapply(get(trimmedObj)[listed], sapply, function(x) length(x$hits))
      } else {
        sapply(get(trimmedObj)[listed], sapply, length)
      }
      counts <- sapply(get(rawObj)[names(get(trimmedObj)[listed])], 
                       sapply, length)
      stopifnot(identical(colnames(totals), colnames(counts)))
      toprint <- cbind(data.frame("Total" = t(totals)),
                       data.frame("valueTempCol" = t(100 * (totals / counts))))
      names(toprint)[3:4] <- gsub("valueTempCol", valueColname,
                                  names(toprint)[3:4])
      mean.test <- mean(toprint[, paste0(valueColname,
                                         ifelse(isFindLinker,
                                                ".pair2", ".pair1"))]) <= 5      
    } else {
      if(isFindLinker) {
        toprint <- as.data.frame(sapply(sapply(linkerTrimmed[!listed], 
                                               "[[", "hits"), length))
      } else {
        toprint <- as.data.frame(sapply(get(trimmedObj)[!listed], length))
      }
      names(toprint) <- "Total"            
      counts <- sapply(get(rawObj)[names(get(trimmedObj)[!listed])], length)
      toprint[,valueColname] <- 100 * (toprint$Total / counts[rownames(toprint)])
      mean.test <- mean(toprint[, valueColname]) <= 5
    },
    
    toprint$SampleName <- rownames(toprint),
    rownames(toprint) <- NULL,
    
    if(showStats) {            
      message("Sequence Stats after ", featureTrim, " alignment:")
      print(toprint)        
    },
    
    ## if <= 5% of sequences found primers...then something is wrong with the 
    ## primer sequences provided???
    if(doTest & mean.test) {
      stop("Something seems to be wrong with the ", featureTrim,
           "s provided for each sample. On average <= 5% of sequences found ",
           featureTrim, " match for the entire run!!!")
    }
  )
  
  eval.parent(checks)
}
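
# A minimal sketch of the calling contract (illustrative, mirroring what
# findPrimers() does below): the caller defines these variables in its own
# frame before invoking the helper.
#   eval(expression(trimmedObj <- "primerTrimmed", rawObj <- "decoded",
#                   featureTrim <- "Primer", valueColname <- "PercOfDecoded",
#                   doTest <- TRUE))
#   .showFindStats()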

#' Find the 5' primers and add results to SampleInfo object. 
#'
#' Given a sampleInfo object, the function finds 5' primers for each sample per 
#' sector and adds the results back to the object. This is a specialized 
#' function which depends on many other functions shown in 'see also section' 
#' to perform specialized trimming of the 5' primer/adaptor found in the 
#' sampleInfo object. The sequence itself is never trimmed; rather, coordinates 
#' of the primer portion are recorded back to the object and used subsequently 
#' by the \code{\link{extractSeqs}} function to perform the trimming.
#'
#' @param sampleInfo sample information SimpleList object outputted from 
#' \code{\link{findBarcodes}}, which holds decoded sequences for samples 
#' per sector/quadrant along with information of sample to primer associations.
#' @param alignWay method to utilize for detecting the primers. One of 
#' following: "slow" (Default), or "fast". Fast, calls 
#' \code{\link{vpairwiseAlignSeqs}} and uses \code{\link{vmatchPattern}} at its 
#' core, which is less accurate with indels and mismatches but much faster. 
#' Slow, calls \code{\link{pairwiseAlignSeqs}} and uses 
#' \code{\link{pairwiseAlignment}} at its core, which is accurate with indels 
#' and mismatches but slower.
#' @param showStats toggle output of search statistics. Default is FALSE.
#' @param doRC perform reverse complement search of the defined pattern/primer. 
#' Default is FALSE.
#' @param parallel use parallel backend to perform calculation with 
#' \code{\link{BiocParallel}}. Defaults to TRUE. If no parallel backend is 
#' registered, then a serial version is run using \code{\link{SerialParam}}. 
#' Parallelization is done at the sample level per sector. Use parallel2 for 
#' parallelization at the sequence level.
#' @param samplenames a vector of samplenames to process. Default is NULL, which
#' processes all samples from sampleInfo object.
#' @param bypassChecks skip checkpoints which detect if something was odd with 
#' the data? Default is FALSE.
#' @param parallel2 perform parallelization at the sequence level. Default is FALSE.
#' Useful in cases where each sector has only one sample with numerous sequences.
#' @param ... extra parameters to be passed to either \code{\link{vmatchPattern}} 
#' or \code{\link{pairwiseAlignment}} depending on 'alignWay' parameter.
#'
#' @return a SimpleList object similar to the sampleInfo parameter supplied, with 
#' new data added under each sector and sample. New data attributes include: primed
#'
#' @note 
#' \itemize{
#'  \item For paired end data, qualityThreshold for pair 2 is decreased by 
#'  0.10 to increase chances of matching primer sequence.
#'  \item If parallel=TRUE, then be sure to have a parallel backend registered 
#'  before running the function. One can use any of the following: 
#'  \code{\link{MulticoreParam}}, \code{\link{SnowParam}}
#' }
#'
#' @seealso \code{\link{pairwiseAlignSeqs}}, \code{\link{vpairwiseAlignSeqs}},
#' \code{\link{extractFeature}}, \code{\link{extractSeqs}}, 
#' \code{\link{primerIDAlignSeqs}}, \code{\link{findLTRs}}, 
#' \code{\link{findLinkers}}, \code{\link{findAndTrimSeq}}
#'
#' @export
#'
#' @examples 
#' 
#' \donttest{
#' load(file.path(system.file("data", package = "hiReadsProcessor"),
#' "FLX_seqProps.RData"))
#' findPrimers(seqProps, showStats=TRUE)
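#' 
#' ## a hedged sketch: trade alignment accuracy for speed with the
#' ## vmatchPattern-based matcher
#' findPrimers(seqProps, alignWay = "fast", showStats = TRUE)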
#' }
findPrimers <- function(sampleInfo, alignWay = "slow", showStats = FALSE,
                        doRC = FALSE, parallel = TRUE, samplenames = NULL,
                        bypassChecks = FALSE, parallel2 = FALSE, ...) {    
  dp <- NULL
  
  .checkArgs_SEQed()
  
  ## test if there are decoded sequences in the sampleinfo object ##
  decoded <- extractFeature(sampleInfo, feature = "decoded")
  samplesDecoded <- sapply(decoded, names, simplify = FALSE)
  sectorsDecoded <- names(which(sapply(sapply(decoded,length), ">", 0)))
  rm(decoded)
  cleanit <- gc()
  
  if(length(sectorsDecoded) == 0) {
    stop("No decoded information found in sampleInfo...",
         "did you run findBarcodes()?")
  }
  
  for(sector in sectorsDecoded) {
    message("Processing sector ", sector)
    
    ## refine sample list if specific samples are supplied ##
    samplesToProcess <- samplesDecoded[[sector]]
    if(!is.null(samplenames)) {
      samplesToProcess <- samplesToProcess[samplesToProcess %in% samplenames]
    }
    
    ## prepare sample to primer association ##
    ltrPrimers <- toupper(extractFeature(sampleInfo, sector, samplesToProcess,
                                         feature = "primerltrsequence")[[sector]])
    
    ## find any samples which need to be skipped
    skippers <- ltrPrimers == "SKIP"
    if(!all(skippers) & (length(ltrPrimers[!skippers]) == 0 |
                         mean(nchar(ltrPrimers[!skippers])) <= 5 |
                         all(is.na(ltrPrimers[!skippers])))) {
      stop("Either the primer size is too short (<=5) or ",
           "no primers are found in sample information object.")
    }

    skip.samples <- names(which(skippers))
    if(length(skip.samples) > 0) {
      samplesToProcess <- samplesToProcess[!samplesToProcess %in% skip.samples]
      message("Skipping samples ", paste(skip.samples, collapse = ","))
      sampleInfo <- addFeature(sampleInfo, sector, skip.samples,
                               feature = "primed",
                               value = structure(rep("SKIPPED",
                                                     length(skip.samples)),
                                                 names = skip.samples))
    }

    ## don't bother searching if no samples are to be processed! ##
    if(length(samplesToProcess)>0) {			
      ## get the decoded reads ##
      decoded <- extractSeqs(sampleInfo, sector, samplesToProcess,
                             feature = "decoded")[[sector]]
      
      ## find paired end samples
      isPaired <- extractFeature(sampleInfo, sector,
                                 feature = 'pairedend')[[sector]]
      
      ## trim the primers ##
      message("\tFinding Primers.")
      primerIdentity <- extractFeature(sampleInfo, sector,
                                       feature = "primerltridentity")[[sector]]
      stopifnot(length(primerIdentity) > 0)
      
      primerTrimmed <- bpmapply(function(subjectSeqs, patternSeq, paired,
                                         qualT, ...) {
        if(paired) {
          if(alignWay == "fast") {
            p1 <- vpairwiseAlignSeqs(subjectSeqs$pair1, patternSeq, "left",
                                     (qualT - 0.05), doRC = doRC,
                                     parallel = parallel2, ...)
          } else if (alignWay == "slow") {
            p1 <- tryCatch(pairwiseAlignSeqs(subjectSeqs$pair1, patternSeq,
                                             "left", qualT, doRC = doRC,
                                             parallel = parallel2, ...),
                           error = function(e) geterrmessage())
          }                                   
          
          ## side needs to be middle since we don't 
          ## know where in pair2 the primer can be
          if(alignWay == "fast") {
            p2 <- vpairwiseAlignSeqs(subjectSeqs$pair2, patternSeq, "middle",
                                     (qualT - 0.15), doRC = doRC,
                                     parallel = parallel2, ...)
          } else if (alignWay == "slow") {
            p2 <- tryCatch(pairwiseAlignSeqs(subjectSeqs$pair2, patternSeq,
                                             "middle", (qualT - 0.1), doRC = doRC,
                                             parallel = parallel2, ...),
                           error = function(e) geterrmessage())
          }
          list("pair1" = p1, "pair2" = p2)
        } else {
          if(alignWay == "fast") {
            vpairwiseAlignSeqs(subjectSeqs, patternSeq, "left", (qualT - 0.05),
                               doRC = doRC, parallel = parallel2, ...)
          } else if (alignWay == "slow") {
            tryCatch(pairwiseAlignSeqs(subjectSeqs, patternSeq, "left", qualT,
                                       doRC = doRC, parallel = parallel2, ...),
                     error = function(e) geterrmessage())
          }                                   
        }
      }, decoded[samplesToProcess], ltrPrimers[samplesToProcess],
      isPaired[samplesToProcess], primerIdentity[samplesToProcess],
      SIMPLIFY = FALSE, USE.NAMES = FALSE, BPPARAM = dp)
      names(primerTrimmed) <- samplesToProcess
      
      ## check if any error occured during alignments ##
      if(any(grepl("simpleError", primerTrimmed))) {
        stop("Error encountered in LTR Trimming function",
             paste(names(primerTrimmed[grepl("simpleError", primerTrimmed)]),
                   collapse = ", "))
      }        
      
      ## remove samples with no primer hits from further processing ##
      culprits <- grep("No hits found", primerTrimmed)
      if(length(culprits) > 0) {
        message("Following sample(s) had no hits for primer alignment: ",
                paste(samplesToProcess[culprits],collapse=", "))
        samplesToProcess <- samplesToProcess[-c(culprits)]
        primerTrimmed <- primerTrimmed[-c(culprits)]
      }
      
      cleanit <- gc()
      
      if(length(primerTrimmed) > 0) {
        if(!bypassChecks | showStats) {
          eval(expression(trimmedObj <- "primerTrimmed", rawObj <- "decoded", 
                          featureTrim <- "Primer", 
                          valueColname <- "PercOfDecoded",
                          doTest <- TRUE))
          .showFindStats()
        }
        
        ## modify metadata attribute, write primer coordinates back to 
        ## sampleInfo object & trim
        message("Adding primer info back to the object")
        sampleInfo <- addFeature(sampleInfo, sector, names(primerTrimmed),
                                 feature = "primed", value = primerTrimmed)
      }
      
      rm(primerTrimmed)
      cleanit <- gc()
    }
  }
  sampleInfo$callHistory <- append(sampleInfo$callHistory, match.call())
  return(sampleInfo)
}

#' Find the 5' LTRs and add results to SampleInfo object. 
#'
#' Given a sampleInfo object, the function finds the 5' LTR following the primer
#' for each sample per sector and adds the results back to the object. This is a 
#' specialized function which depends on many other functions shown in 'see also
#' section' to perform specialized trimming of 5' viral LTRs found in the 
#' sampleInfo object. The sequence itself is never trimmed; rather, coordinates 
#' of the LTR portion are added to the primer coordinates and recorded back 
#' to the object and used subsequently by the \code{\link{extractSeqs}} function 
#' to perform the trimming. This function heavily relies on 
#' \code{\link{pairwiseAlignSeqs}}.
#'
#' @param sampleInfo sample information SimpleList object outputted from 
#' \code{\link{findPrimers}}, which holds decoded and primed sequences for 
#' samples per sector/quadrant along with information of sample to LTR 
#' associations.
#' @param showStats toggle output of search statistics. Default is FALSE. 
#' For paired end data, stats for "pair2" is relative to decoded and/or 
#' primed reads.
#' @param doRC perform reverse complement search of the defined pattern/LTR 
#' sequence. Default is FALSE.
#' @param parallel use parallel backend to perform calculation with 
#' \code{\link{BiocParallel}}. Defaults to TRUE. If no parallel backend is 
#' registered, then a serial version is run using 
#' \code{\link{SerialParam}}. Parallelization is done at the sample level per 
#' sector.
#' @param samplenames a vector of samplenames to process. Default is NULL, 
#' which processes all samples from sampleInfo object.
#' @param bypassChecks skip checkpoints which detect if something was odd with 
#' the data? Default is FALSE.
#' @param parallel2 perform parallelization at the sequence level. Default is FALSE. 
#' Useful in cases where each sector has only one sample with numerous sequences.
#'
#' @param ... extra parameters to be passed to \code{\link{pairwiseAlignment}}.
#' @return a SimpleList object similar to the sampleInfo parameter supplied, with 
#' new data added under each sector and sample. New data attributes include: LTRed
#'
#' @note 
#' \itemize{
#'  \item For paired end data, qualityThreshold for pair 2 is decreased by 
#'  0.05 to increase chances of matching LTR sequence.
#'  \item If parallel=TRUE, then be sure to have a parallel backend registered 
#'  before running the function. One can use any of the following: 
#'  \code{\link{MulticoreParam}}, \code{\link{SnowParam}}
#' }
#'
#' @seealso \code{\link{pairwiseAlignSeqs}}, \code{\link{vpairwiseAlignSeqs}},
#' \code{\link{extractFeature}}, \code{\link{extractSeqs}}, 
#' \code{\link{primerIDAlignSeqs}}, \code{\link{findPrimers}}, 
#' \code{\link{findLinkers}}, \code{\link{findAndTrimSeq}}
#'
#' @export
#'
#' @examples
#'  
#' \donttest{
#' load(file.path(system.file("data", package = "hiReadsProcessor"),
#' "FLX_seqProps.RData"))
#' findLTRs(seqProps, showStats=TRUE)
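#' 
#' ## a hedged sketch of the usual chaining: the LTR search consumes the
#' ## output of findPrimers()
#' findLTRs(findPrimers(seqProps), showStats = TRUE)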
#' }
findLTRs <- function(sampleInfo, showStats = FALSE, doRC = FALSE,
                     parallel = TRUE, samplenames = NULL, bypassChecks = FALSE,
                     parallel2 = FALSE, ...) {    
  dp <- NULL
  
  .checkArgs_SEQed()
  
  ## test if there are primed sequences in the sampleinfo object ##   
  primed <- extractFeature(sampleInfo, feature = "primed")
  samplesprimed <- sapply(primed, names, simplify = FALSE)
  sectorsPrimed <- names(which(sapply(sapply(primed, length), ">", 0)))
  rm(primed)
  cleanit <- gc()
  
  if(length(sectorsPrimed) == 0) {
    stop("No primed information found in sampleInfo. 
         Did you run findPrimers()?")
  }
  
  for(sector in sectorsPrimed) {
    message("Processing sector ", sector)
    
    ## refine sample list if specific samples are supplied ##
    samplesToProcess <- samplesprimed[[sector]]
    if(!is.null(samplenames)) {
      samplesToProcess <- samplesToProcess[samplesToProcess %in% samplenames]
    }
    
    ## prepare sample to LTR bit associations ##
    sampleLTRbits <- toupper(extractFeature(sampleInfo, sector, samplesToProcess,
                                            feature = "ltrbitsequence")[[sector]])
    
    # find any samples which need to be skipped
    skippers <- sampleLTRbits == "SKIP"
    if(!all(skippers) & (length(sampleLTRbits[!skippers]) == 0 |
                         mean(nchar(sampleLTRbits[!skippers])) <= 1 |
                         all(is.na(sampleLTRbits[!skippers])))) {
      stop("Either LTR bit sequence is too short (<=1) or no LTR bits ",
           "found in sample information object.")
    }

    skip.samples <- names(which(skippers))
    if(length(skip.samples) > 0) {
      samplesToProcess <- samplesToProcess[!samplesToProcess %in% skip.samples]
      message("Skipping samples ", paste(skip.samples, collapse = ","))
      sampleInfo <- addFeature(sampleInfo, sector, skip.samples,
                               feature = "LTRed",
                               value = structure(rep("SKIPPED",
                                                     length(skip.samples)),
                                                 names = skip.samples))
    }

    ## don't bother searching if no samples are to be processed! ##
    if(length(samplesToProcess) > 0) {
      ## get the primer trimmed reads ##
      primerTrimmed <- extractSeqs(sampleInfo, sector, samplesToProcess,
                                   feature = "primed")[[sector]]
      
      ## find paired end samples...
      ## add reads which aren't primed in pair2 for cases where reads were long
      isPaired <- extractFeature(sampleInfo, sector, samplesToProcess,
                                 feature = 'pairedend')[[sector]]
      if(any(isPaired)) {
        message("Getting reads from pair2 which weren't primed...")
        decoded <- extractSeqs(sampleInfo, sector, names(which(isPaired)),
                               feature = "decoded", pairReturn = 'pair2')[[sector]]
        rows <- intersect(names(decoded), names(primerTrimmed))
        
        bore <- mapply(function(x,y) {
          x$pair2 <- c(x$pair2, y[!names(y) %in% names(x$pair2)])
          x
        }, primerTrimmed[rows], decoded[rows])
        primerTrimmed[rows] <- as(bore, "DataFrame")        
        rm(bore)
      }
      
      ## trim LTR bits using the slow method since it's the most accurate! ##
      message("\tFinding LTR bits.")
      ltrBitIdentity <- extractFeature(sampleInfo, sector = sector,
                                       feature = "ltrbitidentity")[[sector]]
      
      ltrTrimmed <- bpmapply(function(subjectSeqs, patternSeq, paired,
                                      qualT, ...) {
        if(paired) {
          p1 <- tryCatch(pairwiseAlignSeqs(subjectSeqs$pair1, patternSeq,
                                           "left", qualityThreshold = qualT,
                                           doRC = doRC, parallel = parallel2, ...),
                         error = function(e) geterrmessage())
          
          p2 <- tryCatch(pairwiseAlignSeqs(subjectSeqs$pair2, patternSeq,
                                           "middle", qualityThreshold = (qualT - 0.05),
                                           doRC = doRC, parallel = parallel2, ...),
                         error = function(e) geterrmessage())
          
          list("pair1" = p1, "pair2" = p2)          
        } else {
          tryCatch(pairwiseAlignSeqs(subjectSeqs, patternSeq, "left",
                                     qualityThreshold = qualT, doRC = doRC,
                                     parallel = parallel2, ...),
                   error = function(e) geterrmessage())
        }        
      }, primerTrimmed[samplesToProcess], sampleLTRbits[samplesToProcess],
      isPaired[samplesToProcess], ltrBitIdentity[samplesToProcess],
      SIMPLIFY = FALSE, USE.NAMES = FALSE, BPPARAM = dp)
      names(ltrTrimmed) <- samplesToProcess
      
      ## check if any error occured during alignments ##
      if(any(grepl("simpleError", ltrTrimmed))) {
        stop("Error encountered in LTR Trimming function",
             paste(names(ltrTrimmed[grepl("simpleError", ltrTrimmed)]),
                   collapse = ", "))
      }
      
      ## remove samples with no LTR hits from further processing ##
      culprits <- grep("No hits found", ltrTrimmed)
      if(length(culprits) > 0) {
        message("Following sample(s) had no hits for LTR bit alignment: ",
                paste(samplesToProcess[culprits], collapse = ", "))
        samplesToProcess <- samplesToProcess[-c(culprits)]
        ltrTrimmed <- ltrTrimmed[-c(culprits)]
      }    
      
      cleanit <- gc()
      
      if(length(ltrTrimmed) > 0) {
        if(!bypassChecks | showStats) {
          eval(expression(trimmedObj <- "ltrTrimmed", rawObj <- "primerTrimmed", 
                          featureTrim <- "LTR", valueColname <- "PercOfPrimed",
                          doTest <- TRUE))
          .showFindStats()
        }
        
        ## modify metadata attribute, add LTR bit coordinates to primer and write 
        ## back to sampleInfo object & trim...remember everything is relative to 
        ## the entire read length!
        message("Adding LTR info back to the object")
        for(x in names(ltrTrimmed)) {
          cat(".")
          if(isPaired[[x]]) {
            worked <- sapply(ltrTrimmed[[x]], length) > 0
            if(any(worked)) {
              for(pair in names(which(worked))) {
                primed <- sampleInfo$sectors[[sector]]$samples[[x]]$primed[[pair]]
                rows <- names(ltrTrimmed[[x]][[pair]]) %in% names(primed)
                primed.end <- end(primed[names(ltrTrimmed[[x]][[pair]][rows])])
                rm(primed)
                
                end(ltrTrimmed[[x]][[pair]][rows]) <- 
                  end(ltrTrimmed[[x]][[pair]][rows]) + primed.end              
                start(ltrTrimmed[[x]][[pair]][rows]) <- 
                  start(ltrTrimmed[[x]][[pair]][rows]) + primed.end              
                rm(primed.end)
              }
            }
            sampleInfo$sectors[[sector]]$samples[[x]]$LTRed <- ltrTrimmed[[x]]
          } else {
            if(length(ltrTrimmed[[x]]) > 0) {
              primed <- sampleInfo$sectors[[sector]]$samples[[x]]$primed
              primed.end <- end(primed[names(ltrTrimmed[[x]])])
              rm(primed)
              
              end(ltrTrimmed[[x]]) <- end(ltrTrimmed[[x]]) + primed.end
              start(ltrTrimmed[[x]]) <- start(ltrTrimmed[[x]]) + primed.end
              rm(primed.end)
              
              sampleInfo$sectors[[sector]]$samples[[x]]$LTRed <- ltrTrimmed[[x]]
            }
          }        
        }
      }
      rm("ltrTrimmed", "primerTrimmed")
      cleanit <- gc()
    }
  }
  sampleInfo$callHistory <- append(sampleInfo$callHistory, match.call())
  return(sampleInfo)
}
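
# An illustrative sketch (comments only) of the coordinate bookkeeping above:
# an LTR bit found at bases 1-20 of a primer-trimmed read is stored relative
# to the full read by offsetting both ends with the primer end, equivalent to
#   ltr <- IRanges(start = 1, end = 20, names = "read1")
#   shift(ltr, 25)  # hypothetical primer ending at base 25 of the raw read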

#' Find the 3' linkers and add results to SampleInfo object. 
#'
#' Given a sampleInfo object, the function finds 3' linkers for each sample per 
#' sector and adds the results back to the object. This is a specialized 
#' function which depends on many other functions shown in 'see also section' to
#' perform specialized trimming of the 3' primer/linker adaptor sequence found in 
#' the sampleInfo object. The sequence itself is never trimmed; rather, 
#' coordinates of the linker portion are recorded back to the object and used 
#' subsequently by the \code{\link{extractSeqs}} function to perform the trimming. 
#' This function heavily relies on either \code{\link{pairwiseAlignSeqs}} or 
#' \code{\link{primerIDAlignSeqs}}, depending upon whether the linkers being 
#' aligned contain a primerID or not.
#'
#' @param sampleInfo sample information SimpleList object outputted from 
#' \code{\link{findPrimers}} or \code{\link{findLTRs}}, which holds decoded 
#' sequences for samples per sector/quadrant along with information of sample 
#' to primer associations.
#' @param showStats toggle output of search statistics. Default is FALSE.
#' @param doRC perform reverse complement search of the defined pattern/linker 
#' sequence. Default is FALSE.
#' @param parallel use parallel backend to perform calculation with
#'  \code{\link{BiocParallel}}. Defaults to TRUE. If no parallel backend is 
#'  registered, then a serial version is run using 
#'  \code{\link{SerialParam}}. Parallelization is done at the sample level 
#'  per sector.
#' @param samplenames a vector of samplenames to process. Default is NULL, 
#' which processes all samples from sampleInfo object.
#' @param bypassChecks skip checkpoints which detect if something was odd 
#' with the data? Default is FALSE.
#' @param parallel2 perform parallelization at the sequence level. Default is FALSE. 
#' Useful in cases where each sector has only one sample with numerous sequences.
#'
#' @param ... extra parameters to be passed to \code{\link{pairwiseAlignment}}.
#' @return a SimpleList object similar to the sampleInfo parameter supplied, with 
#' new data added under each sector and sample. New data attributes include: 
#' linkered. If the linkers have a primerID, then a primerIDs attribute is 
#' appended as well. 
#'
#' @note 
#' \itemize{
#'  \item For paired end data, qualityThreshold for pair 2 is increased by 
#'  0.25, or set to 1, whichever is lower, to increase quality & ensure a 
#'  full match to the linker sequence.
#'  \item If no linker matches are found with default options, then try 
#'  doRC=TRUE. 
#'  \item If parallel=TRUE, then be sure to have a parallel backend registered 
#'  before running the function. One can use any of the following: 
#'  \code{\link{MulticoreParam}}, \code{\link{SnowParam}}
#' }
#'
#' @seealso \code{\link{pairwiseAlignSeqs}}, \code{\link{vpairwiseAlignSeqs}},
#' \code{\link{primerIDAlignSeqs}}, \code{\link{findLTRs}}, 
#' \code{\link{findPrimers}}, \code{\link{extractFeature}}, 
#' \code{\link{extractSeqs}}, \code{\link{findAndTrimSeq}},
#' \code{\link{findIntegrations}}
#'
#' @export
#'
#' @examples
#'  
#' \donttest{
#' load(file.path(system.file("data", package = "hiReadsProcessor"),
#' "FLX_seqProps.RData"))
#' findLinkers(seqProps, showStats=TRUE, doRC=TRUE)
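#' 
#' ## a hedged sketch: restrict the search to samples of interest
#' ## (the sample name here is hypothetical)
#' findLinkers(seqProps, samplenames = "SampleA", doRC = TRUE)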
#' }
findLinkers <- function(sampleInfo, showStats = FALSE, doRC = FALSE, 
                        parallel = TRUE, samplenames = NULL, 
                        bypassChecks = FALSE, parallel2 = FALSE, ...) {    
  dp <- NULL
  
  .checkArgs_SEQed()
  
  ## test if there are decoded sequences in the sampleinfo object ##
  decoded <- extractFeature(sampleInfo, feature = "decoded")
  toProcessSamples <- sapply(decoded, names, simplify = FALSE)
  sectorsDecoded <- names(which(sapply(sapply(decoded,length), ">", 0)))
  rm(decoded)
  cleanit <- gc()
  
  if(length(sectorsDecoded) == 0) {
    stop("No decoded information found in sampleInfo...",
         "did you run findBarcodes()?")
  }
  
  for(sector in sectorsDecoded) {
    message("Processing sector ", sector)
    
    ## subset samplenames from all samples if defined ##
    samplesToProcess <- toProcessSamples[[sector]]
    if(!is.null(samplenames)) {
      samplesToProcess <- samplesToProcess[samplesToProcess %in% samplenames]
    }
    
    ## prepare sample to linker association ##
    sampleLinkers <- toupper(extractFeature(sampleInfo, sector, samplesToProcess,
                                            feature = "linkersequence")[[sector]])
    
    # find any samples which need to be skipped
    skippers <- sampleLinkers == "SKIP"
    if(!all(skippers) & (length(sampleLinkers[!skippers]) == 0 |
                         mean(nchar(sampleLinkers[!skippers])) <= 10 |
                         all(is.na(sampleLinkers[!skippers])))) {
      stop("Either Linker sequence is too short (<=10) or ",
           "no Linkers found in sample information object.")
    }

    skip.samples <- names(which(skippers))
    if(length(skip.samples) > 0) {
      samplesToProcess <- samplesToProcess[!samplesToProcess %in% skip.samples]
      message("Skipping samples ", paste(skip.samples, collapse = ","))
      sampleInfo <- addFeature(sampleInfo, sector, skip.samples,
                               feature = "linkered",
                               value = structure(rep("SKIPPED",
                                                     length(skip.samples)),
                                                 names = skip.samples))
    }

    ## get the linker quality thresholds for non primerID based samples ##
    linkerIdentity <- extractFeature(sampleInfo, sector, samplesToProcess,
                                     feature = "linkeridentity")[[sector]]
    stopifnot(length(linkerIdentity) > 0)
    
    ## check if any are primerIDed and get their identity thresholds ##
    primerIded <- extractFeature(sampleInfo, sector, samplesToProcess,
                                 feature = "primeridinlinker")[[sector]]        
    primerIded.threshold1 <- 
      extractFeature(sampleInfo, sector, samplesToProcess,
                     feature = "primeridinlinkeridentity1")[[sector]]        
    primerIded.threshold2 <- 
      extractFeature(sampleInfo, sector, samplesToProcess,
                     feature = "primeridinlinkeridentity2")[[sector]]        

    ## don't bother searching if no samples are to be processed! ##
    if(length(samplesToProcess) > 0) {
      
      toProcess <- extractSeqs(sampleInfo, sector, samplesToProcess,
                               feature = "decoded")[[sector]]
      
      ## find paired end samples
      isPaired <- extractFeature(sampleInfo, sector, 
                                 feature = 'pairedend')[[sector]]
      
      ## trim the Linkers ##
      message("\tFinding Linkers.")
      linkerTrimmed <- bpmapply(function(subjectSeqs, patternSeq, qualT,
                                         paired, pIded, qualT1, qualT2, ...) {        
        if(pIded) {
          if(paired) {
            p1 <- tryCatch(primerIDAlignSeqs(subjectSeqs$pair1, patternSeq,
                                             doAnchored = TRUE, 
                                             returnUnmatched = TRUE,
                                             returnRejected = TRUE, doRC = doRC,
                                             qualityThreshold1 = qualT1,
                                             qualityThreshold2 = qualT2,
                                             parallel = parallel2, ...),
                           error = function(e) geterrmessage())
            
            p2 <- tryCatch(primerIDAlignSeqs(subjectSeqs$pair2, patternSeq,
                                             doAnchored = TRUE,
                                             returnUnmatched = TRUE,
                                             returnRejected = TRUE, doRC = doRC,
                                             qualityThreshold1 =
                                               pmin(qualT1 + .25, 1),
                                             qualityThreshold2 =
                                               pmin(qualT2 + .25, 1),
                                             parallel = parallel2, ...),
                           error = function(e) geterrmessage())
            
            list("pair1"=p1, "pair2"=p2)
          } else {
            tryCatch(primerIDAlignSeqs(subjectSeqs, patternSeq, doAnchored = TRUE,
                                       returnUnmatched = TRUE, 
                                       returnRejected = TRUE,
                                       doRC = doRC, qualityThreshold1 = qualT1,
                                       qualityThreshold2 = qualT2,
                                       parallel = parallel2, ...),
                     error = function(e) geterrmessage())
          }          
        } else {
          ## use side="middle" since more junk sequence can be present after 
          ## linker which would fail pairwiseAlignSeqs if side='right' for
          ## single end reads or "pair1"
          if(paired) {
            p1 <- tryCatch(pairwiseAlignSeqs(subjectSeqs$pair1, patternSeq,
                                             "middle", qualityThreshold = qualT,
                                             returnUnmatched = TRUE,
                                             returnLowScored = TRUE, doRC = doRC,
                                             parallel = parallel2, ...),
                           error = function(e) geterrmessage())
            
            # pair2 should end with linker, hence side='right'            
            p2 <- tryCatch(pairwiseAlignSeqs(subjectSeqs$pair2, patternSeq,
                                             "right",
                                             qualityThreshold = 
                                               pmin(qualT + .25, 1),
                                             returnUnmatched = TRUE,
                                             returnLowScored = TRUE, doRC = doRC,
                                             parallel = parallel2, ...),
                           error = function(e) geterrmessage())
            
            list("pair1" = p1, "pair2" = p2)
          } else {
            tryCatch(pairwiseAlignSeqs(subjectSeqs, patternSeq, "middle",
                                       qualityThreshold = qualT,
                                       returnUnmatched = TRUE,
                                       returnLowScored = TRUE, doRC = doRC,
                                       parallel = parallel2, ...),
                     error = function(e) geterrmessage())
          }
        }        
      },
      toProcess[samplesToProcess], sampleLinkers[samplesToProcess],
      linkerIdentity[samplesToProcess], isPaired[samplesToProcess],
      primerIded[samplesToProcess], primerIded.threshold1[samplesToProcess],
      primerIded.threshold2[samplesToProcess], SIMPLIFY = FALSE, USE.NAMES = FALSE,
      BPPARAM = dp)      
      names(linkerTrimmed) <- samplesToProcess
      
      ## check if any error occured during alignments ##
      if(any(grepl("simpleError", linkerTrimmed))) {
        stop("Error encountered in Linker Trimming functions",
             paste(names(linkerTrimmed[grepl("simpleError", linkerTrimmed)]),
                   collapse = ", "))
      }
      
      ## remove samples with no linker hits from further processing ##
      culprits <- grep("No hits found", linkerTrimmed)
      if(length(culprits) > 0) {
        message("Following sample(s) had no hits for Linker alignment: ",
                paste(samplesToProcess[culprits], collapse = ", "))
        samplesToProcess <- samplesToProcess[-c(culprits)]
        linkerTrimmed <- linkerTrimmed[-c(culprits)]
      }        
      
      cleanit <- gc()
      if(length(linkerTrimmed) > 0) {
        if(!bypassChecks | showStats) {
          eval(expression(trimmedObj <- "linkerTrimmed", rawObj <- "toProcess", 
                          featureTrim <- "Linker", 
                          valueColname <- "PercOfDecoded",
                          doTest <- TRUE))
          .showFindStats()
        }
        
        message("Adding linker info back to the object")
        ## modify metadata attribute and write back to sampleInfo object;
        ## for primerID based samples, write back all the returned attributes
        for(x in names(linkerTrimmed)) {
          cat(".") 
          if(isPaired[[x]]) {
            for(y in unique(unlist(sapply(linkerTrimmed[[x]], names)))) {
              bore <- sapply(linkerTrimmed[[x]], "[[", y)
              ## "hits" ranges get the plain "linkered" attribute; auxiliary
              ## sets get prefixed names, e.g. "primerIDslinkered"
              newAttrName <- paste0(ifelse(y == "hits", "", y), "linkered")
              sampleInfo$sectors[[sector]]$samples[[x]][[newAttrName]] <- bore
              rm(bore)
            }
          } else {
            for(y in names(linkerTrimmed[[x]])) {
              newAttrName <- paste0(ifelse(y == "hits", "", y), "linkered")
              sampleInfo$sectors[[sector]]$samples[[x]][[newAttrName]] <- 
                linkerTrimmed[[x]][[y]]
            }
          }
        }
      }
      rm(linkerTrimmed)
      cleanit <- gc()
    }
  }
  sampleInfo$callHistory <- append(sampleInfo$callHistory, match.call())
  return(sampleInfo)
}

#' Find vector DNA in reads and add results to SampleInfo object. 
#'
#' Given a sampleInfo object, the function finds vector fragments following the 
#' LTR piece for each sample per sector and adds the results back to the object. 
#' This is a specialized function which depends on many other functions shown
#' in 'see also section' to perform specialized trimming of vector sequence 
#' found after the 5' viral LTR recorded in the sampleInfo object. The sequence 
#' itself is never trimmed; rather, coordinates of the vector portion are added 
#' to the LTR coordinates and recorded back to the object and used subsequently 
#' by the \code{\link{extractSeqs}} function to perform the trimming. This 
#' function heavily relies on \code{\link{blatSeqs}}. In order for this 
#' function to work, it needs the vector sequence, which is read in using the 
#' 'vectorFile' metadata supplied in the sample information file in 
#' \code{\link{read.sampleInfo}}.
#'
#' @param sampleInfo sample information SimpleList object outputted from 
#' \code{\link{findLTRs}}, which holds decoded, primed, and LTRed sequences for 
#' samples per sector/quadrant.
#' @param showStats toggle output of search statistics. Default is FALSE.
#' @param parallel use parallel backend to perform calculation with 
#' \code{\link{BiocParallel}}. Defaults to TRUE. If no parallel backend is  
#' registered, then a serial version is run using \code{\link{SerialParam}}.
#' Parallelization is done at the sample level per sector.
#' @param samplenames a vector of samplenames to process. Default is NULL, which
#' processes all samples from sampleInfo object.
#'
#' @return a SimpleList object similar to the sampleInfo parameter supplied, with 
#' new data added under each sector and sample. New data attributes include: vectored
#'
#' @note 
#' \itemize{
#'  \item If parallel=TRUE, then be sure to have a parallel backend registered 
#'  before running the function. One can use any of the following: 
#'  \code{\link{MulticoreParam}}, \code{\link{SnowParam}}
#' }
#'
#' @seealso \code{\link{pairwiseAlignSeqs}}, \code{\link{blatSeqs}},
#' \code{\link{extractFeature}}, \code{\link{extractSeqs}}, 
#' \code{\link{findPrimers}}, \code{\link{findLTRs}}, 
#' \code{\link{findLinkers}}, \code{\link{findAndTrimSeq}},
#' \code{\link{findAndRemoveVector}}
#'
#' @export
#'
#' @examples 
#' 
#' \donttest{
#' load(file.path(system.file("data", package = "hiReadsProcessor"),
#' "FLX_seqProps.RData"))
#' findVector(seqProps, showStats=TRUE)
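#' 
#' ## a hedged sketch of the full upstream chain feeding vector detection
#' findVector(findLTRs(findPrimers(seqProps)), showStats = TRUE)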
#' }
findVector <- function(sampleInfo, showStats = FALSE, parallel = TRUE,
                       samplenames = NULL) {    
  
  dp <- NULL

  .checkArgs_SEQed()

  ## test if there are primed sequences in the sampleinfo object ##   
  primed <- extractFeature(sampleInfo, feature = "primed")
  samplesprimed <- sapply(primed, names, simplify = FALSE)
  sectorsPrimed <- names(which(sapply(sapply(primed, length), ">", 0)))
  rm(primed)
  cleanit <- gc()
  
  if(length(sectorsPrimed) == 0) {
    stop("No primed information found in sampleInfo. Did you run findPrimers()?")
  }
  
  for(sector in sectorsPrimed) {
    message("Processing sector ", sector)
    
    ## refine sample list if specific samples are supplied ##
    samplesToProcess <- samplesprimed[[sector]]
    if(!is.null(samplenames)) {
      samplesToProcess <- samplesToProcess[samplesToProcess %in% samplenames]
    }
    
    ## get the vector files ##
    vectorFiles <- extractFeature(sampleInfo, sector, samplesToProcess,
                                  feature = 'vectorfile')[[sector]]
    vectorFiles <- structure(file.path(sampleInfo$sequencingFolderPath,
                                       vectorFiles),
                             names = names(vectorFiles))
    
    # find any samples which need to be skipped #
    # grep is needed here because the value could be a file path #
    skippers <- grepl("SKIP$", vectorFiles)
    names(skippers) <- names(vectorFiles)
    if(!all(skippers) & (length(vectorFiles[!skippers]) == 0 |
                         mean(nchar(vectorFiles[!skippers])) <= 1 |
                         all(is.na(vectorFiles[!skippers])))) {
      stop("Problem detecting 'vectorFile' association to samples in the
           supplied sampleInfo object.")
    }
    
    skip.samples <- names(which(skippers))
    if(length(skip.samples) > 0) {
      samplesToProcess <- samplesToProcess[!samplesToProcess %in% skip.samples]
      message("Skipping samples ", paste(skip.samples, collapse = ","))
      sampleInfo <- addFeature(sampleInfo, sector, skip.samples,
                               feature = "vectored",
                               value = structure(rep("SKIPPED",
                                                     length(skip.samples)),
                                                 names = skip.samples))
    }

    ## don't bother searching if no samples are to be processed! ##
    if(length(samplesToProcess) > 0) {
      ## get the LTR trimmed reads ##
      ltrTrimmed <- extractSeqs(sampleInfo, sector, samplesToProcess,
                                feature = "LTRed")[[sector]]
      samplesToProcess <- names(ltrTrimmed)
      
      ## find paired end samples...
      ## add reads which aren't LTRed in pair2 for cases where reads were long
      isPaired <- extractFeature(sampleInfo, sector, samplesToProcess,
                                 feature = 'pairedend')[[sector]]
      if(any(isPaired)) {
        message("Getting reads from pair2 which weren't LTRed...")
        decoded <- extractSeqs(sampleInfo, sector, names(which(isPaired)),
                               feature = "decoded", pairReturn = 'pair2')[[sector]]
        rows <- intersect(names(decoded), names(ltrTrimmed))
        
        bore <- mapply(function(x,y) {
          x$pair2 <- c(x$pair2, y[!names(y) %in% names(x$pair2)])
          x
        }, ltrTrimmed[rows], decoded[rows])
        ltrTrimmed[rows] <- as(bore, "DataFrame")        
        rm(bore)
      }

      ## find vector bits using BLAT! ##
      message("\tFinding Vector bits.")
      
      vecTrimmed <- mapply(function(subjectSeqs, paired, vectorFile) {
        Vector <- readDNAStringSet(vectorFile)
        if(paired) {
          reads <- subjectSeqs$pair1
          p1 <- findAndRemoveVector(reads, Vector, 10, TRUE, parallel)$hits
          p1 <- if(is(p1, "GRanges")) 
          { IRanges(ranges(p1), names = p1$qNames) } else { p1 }
          
          reads <- subjectSeqs$pair2
          p2 <- findAndRemoveVector(reads, Vector, 10, TRUE, parallel)$hits
          p2 <- if(is(p2, "GRanges"))
          { IRanges(ranges(p2), names = p2$qNames) } else { p2 }
          
          list("pair1"=p1, "pair2"=p2)          
        } else {
          p <- findAndRemoveVector(subjectSeqs, Vector, 10, TRUE, parallel)$hits
          if(is(p,"GRanges"))
          { IRanges(ranges(p), names = p$qNames) } else { p }
        }        
      }, ltrTrimmed[samplesToProcess], isPaired[samplesToProcess], 
      vectorFiles[samplesToProcess], SIMPLIFY=FALSE, USE.NAMES=FALSE)
      names(vecTrimmed) <- samplesToProcess
      
      ## check if any error occured during alignments ##
      if(any(grepl("simpleError", vecTrimmed))) {
        stop("Error encountered in LTR Trimming function",
             paste(names(vecTrimmed[grepl("simpleError", vecTrimmed)]),
                   collapse = ", "))
      }
      
      ## remove samples with no vector hits from further processing ##
      culprits <- grep("No hits found", vecTrimmed)
      if(length(culprits) > 0) {
        message("Following sample(s) had no hits for vector alignment: ",
                paste(samplesToProcess[culprits], collapse = ", "))
        samplesToProcess <- samplesToProcess[-c(culprits)]
        vecTrimmed <- vecTrimmed[-c(culprits)]
      }    
      
      cleanit <- gc()
      
      if(length(vecTrimmed) > 0) {
        if(showStats) {
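          ## variables below are picked up by the internal .showFindStats() helper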
          eval(expression(trimmedObj <- "vecTrimmed", 
                          rawObj <- "ltrTrimmed", 
                          featureTrim <- "Vector", 
                          valueColname <- "PercOfLTRed",
                          doTest <- FALSE))
          .showFindStats()
        }
        
        ## modify metadata attribute, add vector bit coordinates to LTR 
        # and write back to sampleInfo object & trim...
        # remember everything is relative to the entire read length!
        message("Adding vector info back to the object")
        for(x in names(vecTrimmed)) {
          cat(".")
          if(isPaired[[x]]) {
            worked <- sapply(vecTrimmed[[x]], length) > 0
            if(any(worked)) {
              for(pair in names(which(worked))) {
                ltred <- sampleInfo$sectors[[sector]]$samples[[x]]$LTRed[[pair]]
                rows <- names(vecTrimmed[[x]][[pair]]) %in% names(ltred)
                ltred.end <- end(ltred[names(vecTrimmed[[x]][[pair]][rows])])
                rm(ltred)
                
                end(vecTrimmed[[x]][[pair]][rows]) <- 
                  end(vecTrimmed[[x]][[pair]][rows]) + ltred.end              
                start(vecTrimmed[[x]][[pair]][rows]) <- 
                  start(vecTrimmed[[x]][[pair]][rows]) + ltred.end              
                rm(ltred.end)
              }
            }
            sampleInfo$sectors[[sector]]$samples[[x]]$vectored <- 
              vecTrimmed[[x]]
          } else {
            if(length(vecTrimmed[[x]]) > 0) {
              ltred <- sampleInfo$sectors[[sector]]$samples[[x]]$LTRed
              ltred.end <- end(ltred[names(vecTrimmed[[x]])])
              rm(ltred)
              
              end(vecTrimmed[[x]]) <- end(vecTrimmed[[x]]) + ltred.end
              start(vecTrimmed[[x]]) <- start(vecTrimmed[[x]]) + ltred.end
              rm(ltred.end)
              
              sampleInfo$sectors[[sector]]$samples[[x]]$vectored <- vecTrimmed[[x]]
            }
          }        
        }
      }
      rm("vecTrimmed")
      cleanit <- gc()
    }
  }
  sampleInfo$callHistory <- append(sampleInfo$callHistory, match.call())
  return(sampleInfo)
}

#' Find the integration sites and add results to SampleInfo object. 
#'
#' Given a SampleInfo object, the function finds integration sites for each 
#' sample using their respective settings and adds the results back to the 
#' object. This is an all-in-one function which aligns, finds the best hit 
#' per read per sample, clusters sites, and assigns ISU IDs. It calls 
#' \code{\link{blatSeqs}}, \code{\link{read.psl}}, 
#' \code{\link{getIntegrationSites}}, \code{\link{clusterSites}}, and 
#' \code{\link{otuSites}}. There must be linkered reads within the sampleInfo 
#' object in order to use this function with the default parameters. If you 
#' plan on BLATing non-linkered reads, then change seqType to one of the 
#' accepted options for the 'feature' 
#' parameter of \code{\link{extractSeqs}}, except for '!' based features.
#'
#' @param sampleInfo sample information SimpleList object outputted from 
#' \code{\link{findLinkers}}, which holds decoded, primed, LTRed, and Linkered 
#' sequences for samples per sector/quadrant along with metadata.
#' @param seqType which type of sequence to align and find integration sites. 
#' Default is NULL and determined automatically based on type of restriction 
#' enzyme or isolation method used. If restriction enzyme is Fragmentase, MuA, 
#' Sonication, or Sheared then this parameter is set to genomicLinkered, else 
#' it is genomic. Any one of following options are valid: genomic, 
#' genomicLinkered, decoded, primed, LTRed, linkered.
#' @param genomeIndices an associative character vector mapping each genome 
#' freeze to the full or relative path of the respective BLAT-indexed genome 
#' (.nib or .2bit files). For example: 
#' c("hg18"="/usr/local/blatSuite34/hg18.2bit", 
#' "mm8"="/usr/local/blatSuite34/mm8.2bit"). Be sure to supply an index per 
#' freeze present in the sampleInfo object. Default is NULL.
#' @param samplenames a vector of samplenames to process. Default is NULL, 
#' which processes all samples from sampleInfo object.
#' @param parallel use parallel backend to perform calculation with 
#' \code{\link{BiocParallel}}. Defaults to TRUE. If no parallel backend is 
#' registered, then a serial version is run using \code{\link{SerialParam}}.
#' @param autoOptimize if aligner='BLAT', then should the blatParameters be 
#' automatically optimized based on the reads? Default is FALSE. When TRUE, 
#' the following parameters are adjusted within the supplied blatParameters vector: 
#' stepSize, tileSize, minScore, minIdentity. This parameter is useful 
#' when aligning reads of various lengths to the genome. 
#' Optimization is done using only read lengths. In beta phase!
#' @param doSonic calculate integration sites abundance using breakpoints. 
#' See \code{\link{getSonicAbund}} for more details. Default is FALSE.
#' @param doISU calculate integration site unit for multihits. 
#' See \code{\link{isuSites}} for more details. Default is FALSE.
#' @param ... additional parameters to be passed to \code{\link{blatSeqs}}.
#'
#' @return a SimpleList object similar to sampleInfo parameter supplied with 
#' new data added under each sector and sample. New data attributes include: 
#' psl, and sites. The psl attributes holds the genomic hits per read along 
#' with QC information. The sites attribute holds the condensed integration 
#' sites where genomic hits have been clustered by the Position column and 
#' cherry picked to have each site pass all the QC steps. 
#'
#' @note If parallel=TRUE, then be sure to have a parallel backend registered 
#' before running the function. One can use any of the following 
#' \code{\link{MulticoreParam}} or \code{\link{SnowParam}}.
#'
#' @seealso \code{\link{findPrimers}}, \code{\link{findLTRs}}, 
#' \code{\link{findLinkers}}, \code{\link{startgfServer}}, 
#' \code{\link{read.psl}}, \code{\link{blatSeqs}}, \code{\link{blatListedSet}}, 
#' \code{\link{pslToRangedObject}}, \code{\link{clusterSites}}, 
#' \code{\link{isuSites}}, \code{\link{crossOverCheck}}, 
#' \code{\link{getIntegrationSites}}, \code{\link{getSonicAbund}},
#' \code{\link{annotateSites}}
#'
#' @export
#'
#' @examples
#'  
#' \donttest{
#' load(file.path(system.file("data", package = "hiReadsProcessor"),
#' "FLX_seqProps.RData"))
#' findIntegrations(seqProps, 
#' genomeIndices=c("hg18"="/usr/local/genomeIndexes/hg18.noRandom.2bit"), 
#' numServers=2)
#' }
findIntegrations <- function(sampleInfo, seqType = NULL,
                             genomeIndices = NULL, samplenames = NULL,
                             parallel = TRUE, autoOptimize = FALSE,
                             doSonic = FALSE, doISU = FALSE, ...) {
  
  ## to avoid 'no visible binding for global variable' during checks ##
  freeze <- restrictionenzyme <- startwithin <- alignratiothreshold <- NULL
  genomicpercentidentity <- keepmultihits <- clustersiteswithin <- dp <- NULL
  
  .checkArgs_SEQed()
  
  if(is.null(genomeIndices)) {
    stop("No genomeIndices provided")
  }
  
  ## test if there are linkered sequences in the sampleinfo object if 
  ## specific feature/seqType is not defined ##   
  feature <- ifelse(is.null(seqType), "linkered", seqType)
  message("Checking for ", feature, " reads.")  	
  featured <- extractFeature(sampleInfo, feature = feature)
  samplesfeatured <- sapply(featured, names, simplify = FALSE)
  sectorsfeatured <- names(which(sapply(sapply(featured, length), ">", 0)))
  rm(featured)
  cleanit <- gc()
  
  if(length(sectorsfeatured) == 0) {
    stop("No ", feature, " reads found in sampleInfo object provided.")
  }
  
  ## subset specific samples if defined ##
  samplesToProcess <- unlist(samplesfeatured, use.names = FALSE)
  if(!is.null(samplenames)) {
    samplesToProcess <- samplesToProcess[samplesToProcess %in% samplenames]
  }
  
  message("Creating hashes of settings for aligning and processing.")
  
  ## setup settings hashes for aligning the genomic seqs by species & 
  ## processing hits later
  for(setting in c("restrictionenzyme", "freeze", "startwithin",
                   "alignratiothreshold", "genomicpercentidentity",
                   "clustersiteswithin", "keepmultihits")) {
    setter <- extractFeature(sampleInfo, samplename = samplesToProcess,
                             feature = setting)
    names(setter) <- NULL
    setter <- unlist(setter)
    assign(setting, setter)
  }
  
  minReadLength <- 10

  ## Align by respective species ##
  alignedFiles <- c()        
  for(f in unique(freeze)) {
    message("Aligning to: ", f)
    
    message("Getting sequences to align")
    # get sequences to align #
    if(is.null(seqType)) {
      wanted <- names(restrictionenzyme[!grepl("FRAG|SONIC|MU|SHEAR",
                                               restrictionenzyme,
                                               ignore.case = TRUE)])
      wanted <- wanted[wanted %in% names(freeze[freeze == f])]
      seqs <- extractSeqs(sampleInfo, samplename = wanted,
                          feature = "genomic", strict = TRUE,
                          minReadLength = minReadLength)
      if(any(as.numeric(sapply(seqs, length)) > 0)) {
        write.listedDNAStringSet(seqs, filePrefix = paste0("processed", f))
      }
      
      wanted <- names(restrictionenzyme[grepl("FRAG|SONIC|MU|SHEAR",
                                              restrictionenzyme,
                                              ignore.case = TRUE)])
      wanted <- wanted[wanted %in% names(freeze[freeze == f])]
      seqs <- extractSeqs(sampleInfo, samplename = wanted,
                          feature = "genomicLinkered", strict = TRUE,
                          minReadLength = minReadLength)
      if(any(as.numeric(sapply(seqs, length))>0)) {
        write.listedDNAStringSet(seqs, filePrefix = paste0("processed", f))
      }
    } else {
      seqs <- extractSeqs(sampleInfo, samplename = names(freeze[freeze == f]),
                          feature = seqType, minReadLength = minReadLength)
      if(any(as.numeric(sapply(seqs, length)) > 0)) {
        write.listedDNAStringSet(seqs, filePrefix = paste0("processed", f))
      }
    }

    ## Align seqs ##
    
    queryFiles <- list.files(pattern = paste0("^processed", f))
    
    # merge the extra args with blatSeq() params #
    dots <- list(...)
    blatArgs <- as.list(args(blatSeqs))
    blatArgs <- head(blatArgs, -1) ## remove the last NULL
    
    if(length(dots) > 0) {
      for(a in names(dots)) {
        blatArgs[[a]] <- dots[[a]]
      }
    }
    
    blatParams <- eval(blatArgs$blatParameters)
    
    ## autoOptimize BLAT settings if enabled ##
    if(autoOptimize) {
      # BLAT formula: 2*stepSize+tileSize-1
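      # derive settings from the read-length distribution; summary() yields
      # the Min., 1st Qu., and Median values used below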
      queryLengths <- summary(fasta.info(queryFiles, use.names = FALSE))
      blatParams[["tileSize"]] <- pmax(pmin(queryLengths[["Min."]], 15), 8)
      blatParams[["stepSize"]] <- pmax(round((queryLengths[["1st Qu."]] / 4)), 5)
      blatParams[["minScore"]] <- queryLengths[["Min."]]
      blatParams[["minIdentity"]] <- round(queryLengths[["Median"]] * .95)
      blatArgs[["blatParameters"]] <- blatParams
    }
    
    ## start the requested number of gfServers ##
    numServers <- blatArgs$numServers
    blatArgs[["numServers"]] <- 1L
    gfServerOpts <- c("tileSize", "stepSize", "minMatch", "maxGap", "trans", "log",
                      "seqLog", "syslog", "logFacility", "mask", "repMatch",
                      "maxDnaHits", "maxTransHits", "maxNtSize", "maxAsSize",
                      "canStop")
    port <- blatArgs$port + 0:(numServers - 1)
    for(n in 1:numServers) {
      searchCMD <- sprintf("gfServer status %s %s", blatArgs$host, port[n])
      if(system(searchCMD, ignore.stderr = TRUE) != 0) {
        message(sprintf("Starting gfServer # %s.", n))
        startgfServer(seqDir = genomeIndices[[f]], host = blatArgs$host,
                      port = port[n], gfServerOpts = blatParams[names(blatParams)
                                                                %in% gfServerOpts])
      }
    }
    
    ## align read files in parallel! ##
    blatArgs[["standaloneBlat"]] <- FALSE
    aFile <- bpmapply(function(port, x) {
      blatArgs[["query"]] <- x
      blatArgs[["subject"]] <- genomeIndices[[f]]
      blatArgs[["port"]] <- port
      do.call("blatSeqs", blatArgs)
    }, rep(port, length = length(queryFiles)), queryFiles, 
    SIMPLIFY = FALSE, USE.NAMES = FALSE, BPPARAM = dp)
    
    aFile <- unlist(aFile)
    
    ## stop the requested number of gfServers ##
    sapply(port, function(x) stopgfServer(port = x))

    alignedFiles <- c(alignedFiles, aFile)
    message("Cleaning!")
    cleanit <- gc()
    system(paste("rm", paste0("processed", f, ".*.fa")))    
  }
  
  ## read all hits and split by samples ##
  message("Reading aligned files.")  
  psl <- read.psl(alignedFiles, bestScoring = TRUE, asGRanges = TRUE,
                  removeFile = TRUE, parallel = FALSE)    

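  ## qName is assumed to encode "samplename-readID"; recover the samplename prefix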
  mcols(psl)$samplename <- sub("^(.+)-(.+)$", "\\1", mcols(psl)$qName)
  psl <- split(psl, mcols(psl)$samplename)
  cleanit <- gc()
  
  ## pair up alignments if sample==paired end ##
  isPaired <- extractFeature(sampleInfo, samplename = samplesToProcess,
                             feature = 'pairedend')
  names(isPaired) <- NULL
  isPaired <- unlist(isPaired)
  if(any(isPaired)) {
    for(x in names(which(isPaired))) {
      psl[[x]] <- pairUpAlignments(psl[[x]], parallel = parallel)  
    }    
  }
  
  ## begin processing hits ##
  psl.hits <- bplapply(names(psl), function(x) {
    message("Processing ", x)
    
    # add qc info for bestscoring hits #
    psl.x <- 
      getIntegrationSites(psl[[x]], startWithin = startwithin[[x]],
                          alignRatioThreshold = alignratiothreshold[[x]],
                          genomicPercentIdentity = genomicpercentidentity[[x]],
                          oneBased = TRUE)
    
    # filter multihits if applicable #
    if(!as.logical(keepmultihits[[x]])) {
      psl.x <- psl.x[!mcols(psl.x)$isMultiHit, ]
    }
    
    # cluster sites by positions #
    psl.x <- clusterSites(psl.rd = psl.x, windowSize = clustersiteswithin[[x]])
    
    # get sonicAbund #
    if(doSonic) {
      psl.x <- getSonicAbund(psl.rd = psl.x)
    }

    # get sites ISU for tagging multihits #
    if(doISU) {        
      psl.x <- isuSites(psl.rd = psl.x)
    }
    
    psl.x
  }, BPPARAM = dp)
  names(psl.hits) <- names(psl)
  
  message("Adding PSL hits back to the object.")
  sampleInfo <- addFeature(sampleInfo, sector = NULL, samplename = names(psl.hits),
                           feature = "psl", value = psl.hits)
  
  message("Adding sites back to the object.")
  psl.hits <- sapply(psl.hits, function(x) {
    x <- subset(x, mcols(x)$clusterTopHit & mcols(x)$pass.allQC)
    ranges(x) <- IRanges(mcols(x)$clusteredPosition, width = 1) 
    x[, setdiff(colnames(mcols(x)), 
                c('matches', 'misMatches', 'repMatches', 'nCount', 'qNumInsert',
                  'qBaseInsert', 'tNumInsert', 'tBaseInsert', 'tSize',
                  'blockCount', 'blockSizes', 'qStarts', 'score', 'tStarts',
                  'pass.startWithin', 'alignRatio', 'pass.alignRatio',
                  'percIdentity', 'pass.percIdentity', 'pass.allQC',
                  'clusterTopHit', 'width', 'Position', 'clusteredPosition'))]
  })
  
  sampleInfo <- addFeature(sampleInfo, sector = NULL, samplename = names(psl.hits),
                           feature = "sites", value = psl.hits)
  
  cleanit <- gc()
  
  sampleInfo$callHistory <- append(sampleInfo$callHistory, match.call())
  return(sampleInfo)
}

#' Annotate integration sites and add results to SampleInfo object. 
#'
#' Given a sampleInfo object containing integration sites, the function 
#' annotates the sites of each sample per sector using the supplied 
#' annotation objects and adds the results back to the object. Each element 
#' of the 'annots' parameter is passed to \code{\link{doAnnotation}}, and the 
#' name of that element is used as the annotation column name.
#'
#' @param sampleInfo sample information SimpleList object outputted from 
#' \code{\link{findIntegrations}}, which holds genomic integration sites.
#' @param annots a named list of GRanges objects holding features for 
#' annotating integration sites. The name attribute of the list is used as 
#' the column name.
#' @param samplenames a vector of samplenames to process. Default is NULL, which
#' processes all samples from sampleInfo object.
#' @param parallel use parallel backend to perform calculation. Defaults to TRUE. 
#' If no parallel backend is registered, then a serial version is run using
#' \code{\link{SerialParam}}. Parallelization is done at sample level 
#' per sector. Use parallel2 for parallelization at sequence level.
#' @param ... additional parameters to be passed to \code{\link{doAnnotation}}
#' except for sites.rd, features.rd, and colnam.
#'
#' @return a SimpleList object similar to sampleInfo parameter supplied with 
#' new data added under each sector and sample. The 'sites' attribute is 
#' updated with the annotation columns.
#'
#' @seealso \code{\link{clusterSites}}, \code{\link{isuSites}}, 
#' \code{\link{crossOverCheck}}, \code{\link{findIntegrations}}, 
#' \code{\link{getIntegrationSites}}, \code{\link{pslToRangedObject}}
#'
#' @export
#'
#' @examples
#'  
#' \donttest{
#' load(file.path(system.file("data", package = "hiReadsProcessor"),
#' "FLX_seqProps.RData"))
#' data(genes)
#' genes <- makeGRanges(genes)
#' cpgs <- getUCSCtable("cpgIslandExt","CpG Islands")
#' cpgs <- makeGRanges(cbind(cpgs,strand="*"), chromCol = "chrom")
#' annots <- list("RefGenes"=genes,"CpG"=cpgs)
#' annotateSites(seqProps, annots, annotType="nearest", side="5p")
#' }
annotateSites <- function(sampleInfo, annots = NULL, samplenames = NULL,
                          parallel = TRUE, ...) {    
  dp <- NULL
  
  .checkArgs_SEQed()
  if(is.null(annots)) {
    stop("annots parameter is empty. Please provide an annotation object.")
  }
  
  ## test if there are sites in the sampleinfo object ##
  sites <- extractFeature(sampleInfo, feature = "sites")
  samplesDone <- sapply(sites, names, simplify = FALSE)
  sectorsDone <- names(which(sapply(sapply(sites, length), ">", 0)))
  rm(sites)
  cleanit <- gc()
  
  if(length(sectorsDone) == 0) {
    stop("No sites information found in sampleInfo...",
         "did you run findIntegrations()?")
  }
  
  for(sector in sectorsDone) {
    message("Processing sector ", sector)
    
    ## refine sample list if specific samples are supplied ##
    samplesToProcess <- samplesDone[[sector]]
    if(!is.null(samplenames)) {
      samplesToProcess <- samplesToProcess[samplesToProcess %in% samplenames]
    }

    ## don't bother searching if no samples are to be processed! ##
    if(length(samplesToProcess) > 0) {
      
      ## get the sites ##
      sites <- extractFeature(sampleInfo, sector, samplesToProcess,
                              feature = "sites")[[sector]]
      
      ## do Annotations ##
      message("\tAnnotating Sites.")
      
      annotated <- bplapply(sites[samplesToProcess], function(x) {
        for(f in names(annots)) {
          x <- doAnnotation(sites.rd = x, features.rd = annots[[f]], colnam = f, ...)
        }
        x
      }, BPPARAM = dp)
      stopifnot(identical(names(annotated), samplesToProcess))
      
      if(length(annotated) > 0) {                  
        ## Store annotated sites back to the object ##
        sampleInfo <- addFeature(sampleInfo, sector, names(annotated),
                                 feature = "sites", value = annotated)
      }
      
      rm(annotated)
      cleanit <- gc()
    }
  }
  sampleInfo$callHistory <- append(sampleInfo$callHistory, match.call())
  return(sampleInfo)
}

#' Compare LTRed/Primed sequences to all linkers. 
#'
#' Given a SampleInfo object, the function compares LTRed sequences from each 
#' sample per sector to all the linker sequences present in the run. The 
#' output is a summary table of counts of good matches to all the linkers 
#' per sample. 
#'
#' @param sampleInfo sample information SimpleList object outputted from 
#' \code{\link{findPrimers}} or \code{\link{findLTRs}}, which holds decoded 
#' sequences for samples per sector/quadrant along with information of sample 
#' to primer associations.
#' @param qualityThreshold percent of linker length to match, 
#' round(nchar(linker)*qualityThreshold). Default is 0.55. Only applies to 
#' non-primerID based linkers.
#' @param qualityThreshold1 percent of first part of patternSeq to match. 
#' Default is 0.75. Only applies to primerID based linker search.
#' @param qualityThreshold2 percent of second part of patternSeq to match. 
#' Default is 0.50. Only applies to primerID based linker search.
#' @param doRC perform reverse complement search of the linker sequence. 
#' Default is TRUE. Highly recommended!
#' @param parallel use parallel backend to perform calculation with 
#' \code{\link{BiocParallel}}. Defaults to TRUE. If no parallel backend is 
#' registered, then a serial version is run using \code{\link{SerialParam}}. 
#' Parallelization is done at sample level per sector.
#' @param samplenames a vector of samplenames to process. Default is NULL, 
#' which processes all samples from sampleInfo object.
#' @param ... extra parameters to be passed to \code{\link{pairwiseAlignment}}.
#'
#' @return a data frame of linker hit counts per sample and linker, with the 
#' percent of total reads and whether the linker is the one assigned to the 
#' sample (CorrectLinker). 
#'
#' @note If parallel=TRUE, then be sure to have a parallel backend registered 
#' before running the function. One can use any of the following 
#' \code{\link{MulticoreParam}} or \code{\link{SnowParam}}.
#'
#' @seealso \code{\link{pairwiseAlignSeqs}}, \code{\link{vpairwiseAlignSeqs}},
#' \code{\link{primerIDAlignSeqs}}, \code{\link{findLTRs}}, 
#' \code{\link{findPrimers}}, \code{\link{findAndTrimSeq}}
#'
#' @export
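#'
#' @examples
#'
#' \donttest{
#' ## a minimal sketch using the bundled sample object: tabulate how well 
#' ## each sample's decoded reads match every linker present in the run
#' load(file.path(system.file("data", package = "hiReadsProcessor"),
#' "FLX_seqProps.RData"))
#' troubleshootLinkers(seqProps)
#' }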
troubleshootLinkers <- function(sampleInfo, qualityThreshold = 0.55,
                                qualityThreshold1 = 0.75, 
                                qualityThreshold2 = 0.50,
                                doRC = TRUE, parallel = TRUE, 
                                samplenames = NULL, ...) {    
  dp <- NULL
  
  .checkArgs_SEQed()
  
  ## test if there are decoded sequences in the sampleinfo object ##
  toProcess <- extractFeature(sampleInfo, feature = "decoded")
  toProcessSamples <- sapply(toProcess, names, simplify = FALSE)
  sectors <- names(toProcess)   
  rm(toProcess)
  cleanit <- gc()
  
  results <- data.frame()
  for(sector in sectors) {
    message("Processing sector ", sector)
    
    ## prepare sample to linker association ##
    sampleLinkers <- toupper(extractFeature(sampleInfo, sector = sector,
                                            feature = "linkersequence")[[sector]])
    if(length(sampleLinkers) == 0 | mean(nchar(sampleLinkers)) <= 10 |
       all(is.na(sampleLinkers))) {
      stop("Either Linker sequence is too short (<=10) or ",
           "no Linkers found in sample information object.")
    }
    
    samplesToProcess <- toProcessSamples[[sector]]
    if(!is.null(samplenames)) {
      samplesToProcess <- samplesToProcess[samplesToProcess %in% samplenames]
    }
    
    toProcess.seqs <- extractSeqs(sampleInfo, sector, samplesToProcess,
                                  feature = "decoded")[[sector]]    
    
    ## find paired end samples
    isPaired <- extractFeature(sampleInfo, sector, feature = 'pairedend')[[sector]]
    
    ## do all by all comparison of linkers ##
    message("\tFinding Linkers.")
    for(linkerSeq in unique(as.character(sampleLinkers))) {
      message("Checking ",linkerSeq)
      
      linkerTrimmed <- bplapply(samplesToProcess, function(x) {
        if(length(unlist(gregexpr("N", linkerSeq))) > 3) {
          if(isPaired[[x]]) {
            p1 <- tryCatch(length(primerIDAlignSeqs(
              toProcess.seqs[[x]]$pair1, linkerSeq, qualityThreshold1,
              qualityThreshold2, doRC = doRC, ...)$hits), error = function(e) 0)
            
            p2 <- tryCatch(length(primerIDAlignSeqs( 
              toProcess.seqs[[x]]$pair2, linkerSeq, qualityThreshold1, 
              qualityThreshold2, doRC = doRC, ...)$hits), error = function(e) 0)
            
            list("pair1" = p1, "pair2" = p2)
          } else {
            tryCatch(length(primerIDAlignSeqs(
              toProcess.seqs[[x]], linkerSeq, qualityThreshold1, 
              qualityThreshold2, doRC = doRC, ...)$hits), error = function(e) 0)
          }
        } else {
          if(isPaired[[x]]) {
            p1 <- tryCatch(length(pairwiseAlignSeqs(
              toProcess.seqs[[x]]$pair1, linkerSeq, "middle", qualityThreshold,
              doRC = doRC, ...)), error = function(e) 0)
            
            p2 <- tryCatch(length(pairwiseAlignSeqs(
              toProcess.seqs[[x]]$pair2, linkerSeq, "right", qualityThreshold,
              doRC = doRC, ...)), error = function(e) 0)
            
            list("pair1" = p1, "pair2" = p2)
          } else {
            tryCatch(length(pairwiseAlignSeqs(
              toProcess.seqs[[x]], linkerSeq, "middle", qualityThreshold,
              doRC = doRC, ...)), error = function(e) 0)
          }
        }
      }, BPPARAM = dp)      
      names(linkerTrimmed) <- samplesToProcess
      
      isPaired <- sapply(linkerTrimmed, is.list)
      if(any(isPaired)) {
        totalSeqs <- sapply(toProcess.seqs[isPaired], sapply, length)
        linkerhits <- sapply(linkerTrimmed[isPaired], unlist)
        PercentOfTotal <- linkerhits / totalSeqs[, names(linkerTrimmed[isPaired])]
      } else {
        totalSeqs <- sapply(toProcess.seqs[!isPaired], length)
        linkerhits <- as.numeric(unlist(linkerTrimmed[!isPaired]))
        PercentOfTotal <- linkerhits / totalSeqs[names(linkerTrimmed[!isPaired])]
      }
      
      bore <- data.frame("linkerSeq" = linkerSeq,
                         "samplename" = names(linkerTrimmed),
                         "linkerhits" = linkerhits,
                         "PercentOfTotal" = PercentOfTotal,
                         stringsAsFactors = FALSE)
      rownames(bore) <- NULL
      results <- rbind(results, bore)
      
      cleanit <- gc()
    }        
  }  
  sampleLinkers <- extractFeature(sampleInfo, feature = "linkersequence")
  names(sampleLinkers) <- NULL
  sampleLinkers <- unlist(sampleLinkers)
  linkersample <- as.data.frame(sampleLinkers)
  linkersample <- tapply(rownames(linkersample), linkersample$sampleLinkers,
                         paste, collapse = ",")
  
  results$CorrectLinker <- with(results,
                                sampleLinkers[as.character(samplename)] ==
                                  as.character(linkerSeq))
  results$CorrectSample <- with(results, linkersample[as.character(linkerSeq)])
  return(results)
}

#' Find and trim a short pattern sequence from the subject. 
#'
#' This function facilitates finding and trimming of a short pattern sequence 
#' from a collection of subject sequences. The trimming is dictated by the 
#' 'side' parameter. For more information on the trimming process see the 
#' 'side' parameter documentation in \code{\link{trimSeqs}}. For information 
#' regarding the pattern alignment see the documentation for 
#' \code{\link{pairwiseAlignSeqs}}. This function is meant for aligning a 
#' short pattern onto a large collection of subjects. If you are looking to 
#' align a vector sequence to subjects, then please use BLAT.
#'
#' @param patternSeq DNAString object or a sequence containing the query 
#' sequence to search.
#' @param subjectSeqs DNAStringSet object containing sequences to be searched 
#' for the pattern.
#' @param side which side of the sequence to perform the search & trimming: 
#' left, right or middle. Default is 'left'.
#' @param offBy integer value dictating if the trimming base should be offset 
#' by X number of bases. Default is 0.
#' @param alignWay method to utilize for detecting the primers. One of the 
#' following: "slow" (Default), "fast", or "blat". Fast calls 
#' \code{\link{vpairwiseAlignSeqs}} and uses \code{\link{vmatchPattern}} at 
#' its core, which is less accurate with indels and mismatches but much 
#' faster. Slow calls \code{\link{pairwiseAlignSeqs}} and uses 
#' \code{\link{pairwiseAlignment}} at its core, which is accurate with indels 
#' and mismatches but slower. Blat will use \code{\link{blatSeqs}}.
#' @param ... parameters to be passed to \code{\link{pairwiseAlignment}}, 
#' \code{\link{vpairwiseAlignSeqs}} or \code{\link{blatSeqs}} depending on 
#' which method is defined in the 'alignWay' parameter.
#'
#' @return DNAStringSet object with pattern sequence removed from the subject sequences. 
#'
#' @note If parallel=TRUE, then be sure to have a parallel backend registered 
#' before running the function. One can use any of the following 
#' \code{\link{MulticoreParam}} or \code{\link{SnowParam}}.
#'
#' @seealso \code{\link{pairwiseAlignSeqs}}, \code{\link{vpairwiseAlignSeqs}}, 
#' \code{\link{extractFeature}}, \code{\link{extractSeqs}}, 
#' \code{\link{primerIDAlignSeqs}}, \code{\link{findPrimers}}, 
#' \code{\link{findLinkers}}
#'
#' @export
#'
#' @examples 
#' findAndTrimSeq(patternSeq="AGACCCTTTT",
#' subjectSeqs=DNAStringSet(c("AGACCCTTTTGAGCAGCAT","AGACCCTTGGTCGACTCA",
#' "AGACCCTTTTGACGAGCTAG")), qualityThreshold=.85, doRC=FALSE, side="left", 
#' offBy=1, alignWay = "slow")
#'
findAndTrimSeq <- function(patternSeq, subjectSeqs, side = "left", offBy = 0, 
                           alignWay = "slow", ...) {
  
  # to avoid NOTE during R cmd check #
  removeSubjectNamesAfter <- NULL
  
  .checkArgs_SEQed()
  
  coords <- switch(alignWay,
                   fast = vpairwiseAlignSeqs(subjectSeqs, patternSeq, side,...),
                   slow = pairwiseAlignSeqs(subjectSeqs, patternSeq, side, ...),
                   blat = with(read.psl(blatSeqs(subjectSeqs, patternSeq, ...)),
                               IRanges(qStart,qEnd))
  )
  
  res <- trimSeqs(subjectSeqs, coords, side, offBy)
  if(removeSubjectNamesAfter) {
    names(res) <- NULL
  }
  res
}

#' Find and trim vector sequence from reads. 
#'
#' This function facilitates finding and trimming of long/short fragments of 
#' vector present in LM-PCR products. The algorithm looks for vector sequence 
#' present anywhere within the read and trims according to the longest 
#' contiguous match on either end of the read. Alignment is done using BLAT.
#'
#' @param reads DNAStringSet object containing sequences to be trimmed for 
#' vector.
#' @param Vector DNAString object containing vector sequence to be searched 
#' in reads.
#' @param minLength integer value dictating minimum length of trimmed 
#' sequences to return. Default is 10.
#' @param returnCoords return the coordinates of vector start-stop for the 
#' matching reads. Defaults to FALSE.
#' @param parallel use parallel backend to perform calculation with 
#' \code{\link{BiocParallel}}. Defaults to TRUE. If no parallel backend is 
#' registered, then a serial version is ran using \code{\link{SerialParam}}.
#'
#' @return DNAStringSet object with Vector sequence removed from the reads. 
#' If returnCoords=TRUE, then a list of two named elements "hits" & "reads". 
#' The first element, "hits", is a GRanges object with properties of the 
#' matched region and whether it was considered valid, denoted by 'good.row'. 
#' The second element, "reads", is a DNAStringSet object with Vector sequence 
#' removed from the reads.
#'
#' @note If parallel=TRUE, then be sure to have a parallel backend registered 
#' before running the function. One can use any of the following 
#' \code{\link{MulticoreParam}} or \code{\link{SnowParam}}.
#'
#' @seealso \code{\link{pairwiseAlignSeqs}}, \code{\link{vpairwiseAlignSeqs}}, 
#' \code{\link{pslToRangedObject}}, \code{\link{blatSeqs}}, 
#' \code{\link{read.blast8}}, \code{\link{findAndTrimSeq}}
#' @export
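#'
#' @examples
#'
#' \dontrun{
#' ## a minimal sketch; requires a working BLAT installation. "vector.fa" is 
#' ## a hypothetical FASTA file holding the vector sequence and 'reads' any 
#' ## named DNAStringSet of reads to be cleaned of vector fragments.
#' Vector <- readDNAStringSet("vector.fa")[[1]]
#' res <- findAndRemoveVector(reads, Vector, minLength = 10,
#'                            returnCoords = TRUE)
#' res$hits
#' }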
findAndRemoveVector <- function(reads, Vector, minLength = 10,
                                returnCoords = FALSE, parallel = TRUE) {
  
  .checkArgs_SEQed()
  
  ## use tryCatch to be safe in case no reads were aligned; read.psl stops
  ## with "No hits found"
  hits <- tryCatch(read.psl(blatSeqs(query = reads, subject = Vector, 
                                     parallel = parallel, 
                                     blatParameters = c(
                                       stepSize = 3, tileSize = 8,
                                       minIdentity = 70, minScore = 5,
                                       repMatch = 112312, out = "psl")), 
                            bestScoring = FALSE), error = function(z) NULL)
  
  ## delete temporary files like subjectFile.fa.*.tempyS generated and not
  ## removed after the command above due to an abrupt stop ##
  toremove <- list.files(pattern = "subjectFile.fa.+.tempyS")
  if(length(toremove) > 0) {
    file.remove(toremove)
  }
  
  if(!is.null(hits)) {
    hits <- reduce(pslToRangedObject(hits, useTargetAsRef = FALSE),
                   min.gapwidth = 10, ignore.strand = TRUE)
    hits$qSize <- width(reads[as.character(seqnames(hits))])
    
    ## only consider hits that start or end with vector...
    ## others would be weirdos! ##
    tocheck <- start(hits) <= 10 | hits$qSize-end(hits) <= 11
    hits <- hits[tocheck]
  }
  
  if(length(hits) > 0) {
    ## queries where genomic sequence is surrounded by vector sequence (counts>1)...
    ## we will fix these later!
    hits$qNames <- as.character(seqnames(hits))
    counts <- table(hits$qNames)
    hits$toCure <- hits$qNames %in% names(counts[counts>1])
    rm(counts)
    
    ### queries composed of recombined forms of vector
    widthSum <- tapply(width(hits), hits$qNames, sum)
    hits$widthSum <- widthSum[hits$qNames]
    rm(widthSum)    
    hits$allCovered <- hits$qSize >= (hits$widthSum-3) & 
      hits$qSize <= (hits$widthSum+3)
    
    ### seqs w/ vector fragments which are not 'allCovered' with vector
    toTrim <- reads[names(reads) %in% hits$qNames[!hits$allCovered]]
    
    ### seqs w/o any vector sequences
    reads <- reads[!names(reads) %in% hits$qNames] 
    
    ### queries where genomic sequence is either at the beginning or the end
    hits$qEndDiff <- hits$qSize-end(hits)
    hits$StartDiff <- start(hits)-0
    cured <- hits[!hits$toCure]
    
    # take the side which has the higher unmatched bases to the vector sequence #
    cured$trueStart <- ifelse(cured$StartDiff > cured$qEndDiff, 0, end(cured) + 1)
    cured$trueEnd <- ifelse(cured$StartDiff > cured$qEndDiff,
                            start(cured) - 1, cured$qSize)
    cured$type <- "genomic either side"
    
    ### fix queries where genomic sequence is surrounded by vector sequence
    if(any(hits$toCure)) {
      hits.list <- split(hits[hits$toCure], hits$qNames[hits$toCure])
      bores <- bplapply(hits.list, function(bore) {        
        trueRange <- gaps(ranges(bore))
        bore$trueStart <- start(trueRange[width(trueRange) ==
                                            max(width(trueRange))])
        bore$trueEnd <- end(trueRange[width(trueRange) == max(width(trueRange))])
        bore$type <- "genomic in middle"
        bore$good.row <- TRUE
        bore[1]        
      })  
      bores <- unlist(GRangesList(bores), use.names = FALSE)
      cured <- c(cured, bores[,names(mcols(cured))])
      rm("hits.list", "bores")
    }
    rm("hits")
    
    atStart <- cured$StartDiff <= 10
    atEnd <- cured$qEndDiff <= 10
    
    ### remove sequences where vector sequence is in the middle 
    cured$toCure[!atStart & !atEnd] <- FALSE 
    cured$type[!atStart & !atEnd] <- "vector in middle" 
    
    ## extract sequence with trueStart & trueEnd 
    good.row <- !cured$toCure & !cured$allCovered
    trimmed <- subseq(toTrim[as.character(seqnames(cured))[good.row]],
                      start = cured$trueStart[good.row] + 1,
                      end = cured$trueEnd[good.row])
    
    ### combined seqs.good with trimmed ###
    reads <- c(reads, trimmed[width(trimmed) >= minLength])
    
    if(returnCoords) {
      list("hits" = cured, "reads" = reads)
    } else {
      reads
    }
  } else {
    message("No vector hits found")
    if(returnCoords) {
      list("hits" = "No hits found", "reads" = reads)
    } else {
      reads
    }
  }
}

#' Trim sequences from a specific side.
#'
#' This function trims a DNAStringSet object using the ranges from left, 
#' right, or middle of the sequence. This is a helper function utilized in 
#' \code{\link{primerIDAlignSeqs}} and \code{\link{extractSeqs}}. If dnaSet 
#' and coords are not the same length, then they are required to have a names 
#' attribute to perform the matched trimming. 
#'
#' @param dnaSet DNAStringSet object containing sequences to be trimmed.
#' @param coords IRanges object containing boundaries.
#' @param side either 'left','right',or the Default 'middle'.
#' @param offBy integer value dictating if the supplied coordinates should be offset by X number of bases. Default is 0.
#'
#' @return a DNAStringSet object with trimmed sequences. 
#'
#' @note If side is left, then any sequence following end of coords+offBy is 
#' returned. If side is right, then sequence preceding start of coords-offBy 
#' is returned. If side is middle, then sequence contained in coords is 
#' returned, where offBy is added to start and subtracted from end of coords.
#'
#' @seealso \code{\link{extractSeqs}}, \code{\link{primerIDAlignSeqs}}
#'
#' @export
#'
#' @examples 
#' dnaSet <- DNAStringSet(c("AAAAAAAAAACCTGAATCCTGGCAATGTCATCATC", 
#' "AAAAAAAAAAATCCTGGCAATGTCATCATCAATGG", "AAAAAAAAAAATCAGTTGTCAACGGCTAATACGCG",
#' "AAAAAAAAAAATCAATGGCGATTGCCGCGTCTGCA", "AAAAAAAAAACCGCGTCTGCAATGTGAGGGCCTAA",
#' "AAAAAAAAAAGAAGGATGCCAGTTGAAGTTCACAC"))
#' coords <- IRanges(start=1, width=rep(10,6))
#' trimSeqs(dnaSet, coords, side="left", offBy=1)
#' trimSeqs(dnaSet, coords, side="middle")
trimSeqs <- function(dnaSet, coords, side = "middle", offBy = 0) {
  stopifnot(class(dnaSet) %in% c("DNAStringSet", "DNAString"))
  stopifnot(class(coords) == "IRanges")
  
  if(length(dnaSet) == 0 | length(coords) == 0) {
    stop("dnaSet/coords is empty. Please supply reads/coords to be trimmed.")
  }
  
  # check if both dnaSet and coords have a 'names' attribute: 
  # if yes, match & reorder by names; else require matching lengths. 
  if(is.null(names(dnaSet)) | is.null(names(coords))) {
    stopifnot(length(dnaSet) == length(coords))
  } else {
    rows <- match(names(coords), names(dnaSet))
    if(any(is.na(rows))) {
      stop("Some of the names in coords are not present in dnaSet")
    }
    # subset & reorder dnaSet so its names align with coords
    dnaSet <- dnaSet[rows]
    if(!identical(names(dnaSet), names(coords))) {
      stop("Names are not identical between dnaSet and coords parameter")
    }
  }        
  
  # temp helper function to show messages #
  .showMessage <- function(x) {
    message("Following sequences were removed from trimming since their ",
            "coordinates+offBy were out of sequence length: ",
            paste(x, collapse = ", "))
  }
  
  # trim by side and check if any of the coords are off the sequence length in dnaSet
  if(tolower(side) == "left") {
    test <- end(coords) + offBy > width(dnaSet) | end(coords) + offBy < 1
    if(any(test)) {
      .showMessage(names(dnaSet)[test])
    }
    subseq(dnaSet[!test], start = end(coords[!test]) + offBy)
  } else if (tolower(side) == "right") {
    test <- start(coords) - offBy > width(dnaSet) | end(coords) + offBy < 1
    if(any(test)) {
      .showMessage(names(dnaSet)[test])
    }
    subseq(dnaSet[!test], end = start(coords[!test]) - offBy)
  } else {
    test <- start(coords) + offBy > width(dnaSet) |
      end(coords) - offBy > width(dnaSet) | start(coords) + offBy < 1
    if(any(test)) {
      .showMessage(names(dnaSet)[test])
    }
    subseq(dnaSet[!test],
           start = start(coords[!test]) + offBy,
           end = end(coords[!test]) - offBy)
  }    
}

#' Extract sequences for a feature in the sampleInfo object.
#'
#' Given a sampleInfo object, the function extracts sequences for a defined 
#' feature.
#'
#' @param sampleInfo sample information SimpleList object, which holds samples 
#' per sector/quadrant information along with other metadata.
#' @param sector specific sector to extract sequences from. Default is NULL, 
#' which extracts all sectors. 
#' @param samplename specific sample to extract sequences from. Default is NULL, 
#' which extracts all samples. 
#' @param feature which part of sequence to extract (case sensitive). Options 
#' include: primed, !primed, LTRed, !LTRed, linkered, !linkered, primerIDs, 
#' genomic, genomicLinkered, decoded, and unDecoded. If a sample was primerIDed 
#' and processed by \code{\link{primerIDAlignSeqs}}, then all the rejected and
#' unmatched attributes can be prepended to the feature. Example: vectored,
#' Rejectedlinkered, RejectedprimerIDslinkered, Absentlinkered, or 
#' unAnchoredprimerIDslinkered. When feature is genomic, it includes sequences 
#' which are primed, LTRed, linkered, and !linkered. The genomicLinkered is 
#' the same as genomic minus the !linkered. When feature is decoded, it 
#' includes everything that was demultiplexed. The '!' in front of a feature 
#' extracts the 
#' inverse. One can only get unDecoded sequences if returnUnmatched was TRUE in
#' \code{\link{findBarcodes}}. If \code{\link{findVector}} was run and 
#' "vectored" feature was found in the sampleInfo object, then genomic & 
#' genomicLinkered output will have vectored reads removed.
#' @param trim whether to trim the given feature from sequences or keep it. 
#' Default is TRUE. This option is ignored for feature with '!'.
#' @param minReadLength threshold for minimum length of trimmed sequences to return.
#' @param sideReturn if trim=TRUE, which side of the sequence to return: left, 
#' middle, or right. Defaults to NULL and determined automatically. Doesn't 
#' apply to features: decoded, genomic or genomicLinkered.
#' @param pairReturn if the data is paired end, then from which pair to return 
#' the feature. Options are "pair1", "pair2", or defaults to "both". Ignored if 
#' data is single end. 
#' @param strict this option is used when feature is either 'genomic' or
#' 'genomicLinkered'. When a sample has no LTRed reads, primer ends are used as 
#' starting points by default to extract the genomic part. Enabling this option
#' will strictly ensure that only reads with primer and LTR are trimmed for the
#' 'genomic' or 'genomicLinkered' feature. Default is FALSE.
#'
#' @return a listed DNAStringSet object structured by sector then sample. 
#' Note: when feature='genomic' or 'genomicLinkered' and the data is paired 
#' end, then "pair2" includes the union of reads from both pairs which found 
#' the LTR.
#'
#' @seealso \code{\link{findPrimers}}, \code{\link{findLTRs}}, 
#' \code{\link{findLinkers}}, \code{\link{trimSeqs}}, \code{\link{extractFeature}},
#' \code{\link{getSectorsForSamples}}
#'
#' @export
#'
#' @examples 
#' load(file.path(system.file("data", package = "hiReadsProcessor"),
#' "FLX_seqProps.RData"))
#' samples <- c('Roth-MLV3p-CD4TMLVWell6-Tsp509I', 
#' 'Roth-MLV3p-CD4TMLVWell6-MseI', 'Roth-MLV3p-CD4TMLVwell5-MuA')
#' extractSeqs(seqProps, sector='2', samplename=samples, feature="primed")
#' extractSeqs(seqProps, sector='2', samplename=samples, feature="!primed")
#' extractSeqs(seqProps, sector='2', samplename=samples, feature="linkered")
#' extractSeqs(seqProps, sector='2', samplename=samples, feature="genomic")
extractSeqs <- function(sampleInfo, sector = NULL, samplename = NULL,
                        feature = "genomic", trim = TRUE, minReadLength = 1,
                        sideReturn = NULL, pairReturn = "both", 
                        strict = FALSE) {
  
  .checkArgs_SEQed()
  
  # get all sectors and samplenames in each sector or a specific sector
  res <- getSectorsForSamples(sampleInfo, sector, samplename)
  sectors <- res[["sectors"]]
  samplenames <- res[["samplenames"]]
  
  if(tolower(feature) == "undecoded") {
    sapply(sectors, function(y) { 
      allmetadata <- metadata(sampleInfo$sectors[[y]])
      if("unDecodedSeqs" %in% names(allmetadata)) {
        allmetadata$unDecodedSeqs 
      } else {
        message("No unDecoded attribute found for the supplied ",
                "sampleInfo object and sector ", y)
      }
    })
  } else {        
    res <- sapply(sectors, function(y) {
      sapply(samplenames[[y]], function(x,y) { 
        decoded <- sampleInfo$sectors[[y]]$samples[[x]]$decoded
        isPaired <- sampleInfo$sectors[[y]]$samples[[x]]$pairedend
        if(isPaired & pairReturn != 'both') {
          decoded <- decoded[[pairReturn]]
        }
        
        if(tolower(feature) != "decoded") {
          # get rid of ! from feature, else R won't know what to do when 
          # making a new object with ! at the front.
          assign(gsub("!", "", feature),
                 sampleInfo$sectors[[y]]$samples[[x]][[gsub("!", "", feature)]])
        }
        
        if(tolower(feature) == "decoded") {
          decoded
        } else if (tolower(feature) %in% c("genomic", "genomiclinkered")) {
          primed <- sampleInfo$sectors[[y]]$samples[[x]]$primed
          LTRed <- sampleInfo$sectors[[y]]$samples[[x]]$LTRed
          linkered <- sampleInfo$sectors[[y]]$samples[[x]]$linkered
          vectored <- sampleInfo$sectors[[y]]$samples[[x]]$vectored
          
          if(isPaired & pairReturn != 'both') {
            primed <- primed[[pairReturn]]
            LTRed <- LTRed[[pairReturn]]
            linkered <- linkered[[pairReturn]]
            vectored <- vectored[[pairReturn]]
          }
          
          # if reads don't have LTRs...use primers instead...but throw a warning!
          LTRed.test <- if(is.list(LTRed)) {
            any(sapply(LTRed,is.null))
          } else {
            is.null(LTRed)
          }
          
          if(LTRed.test & strict) {
            warning("LTRed information not found for ", x, " skipping...",
                    immediate. = TRUE)
            return()
          } else if (LTRed.test & !strict) {
            warning("LTRed information not found for ", x,
                    "...using primer end as starting boundary.",
                    immediate. = TRUE)
          }
          
          # check vectored attribute is not null #
          vec.test <- if(is.list(vectored)) {
            any(sapply(vectored,is.null))
          } else {
            is.null(vectored)
          }
          if(vec.test) {
            rm(vectored)                                
          }
          
          # test if there are any primed and/or linkered reads
          p.l.test <- if(is.list(primed)) {
            any(sapply(primed, is.null), sapply(linkered, is.null))
          } else {
            any(is.null(primed), is.null(linkered))
          }
          
          if(p.l.test) { 
            message("No sequences found for requested feature (", feature,
                    ") for sample: ", x,"...skipping.") 
          } else {
            if(trim) {
              # get all LTRed reads and make ends = size of each read
              if(isPaired & pairReturn == 'both') {
                if(LTRed.test & !strict) {
                  LTRed <- primed
                }
                
                if(exists("vectored")) {
                  toremove <- unlist(sapply(vectored, names), use.names = FALSE)
                  message("Removing ", length(toremove), " vectored reads...")
                  LTRed <-
                    lapply(LTRed, function(p)
                      p[!names(p) %in% toremove])
                }
                
                starts <- lapply(LTRed, function(p)
                  structure(end(p), names = names(p)))
                ## add reads present in pair1 to pair2 where LTR wasn't found,
                ## which keeps things consistent with pair1 at least!
                loners <- setdiff(names(starts$pair1), names(starts$pair2))
                starts$pair2 <- c(starts$pair2,
                                  structure(rep(0L, length(loners)), 
                                            names = loners))
                ends <- lapply(decoded, function(p)
                  structure(width(p), names = names(p)))
                
                stopifnot(identical(names(starts), names(ends)))
                stopifnot(mapply(function(s,e) all(names(s) %in% names(e)),
                                 starts, ends))
                
                coords <- mapply(function(s, e) {
                  IRanges(start = s + 1, end = e[names(s)], names = names(s))
                }, starts, ends)
                
                # alter ends for reads where linker was present
                # fix cases where start of linker earlier than edge of LTR/primer end
                ends <- lapply(linkered, function(p) structure(start(p),
                                                               names = names(p)))
                stopifnot(identical(names(coords), names(ends)))
                
                coords <- mapply(function(p, e) {
                  there <- intersect(names(p), names(e))
                  e <- as.numeric(e[there]-1)
                  culprits <- start(p[there]) > e
                  end(p[there][!culprits]) <- e[!culprits]
                  end(p[there][culprits]) <- start(p[there][culprits])
                  p
                }, coords, ends)
                
                if(tolower(feature)=="genomiclinkered") { 
                  stopifnot(identical(names(coords), names(linkered)))
                  coords <- mapply(function(p, l) p[names(p) %in% names(l)], 
                                   coords, linkered)
                }
                
                # trim it and return non zero length sequences
                if(any(sapply(coords, length) > 0)) {
                  stopifnot(identical(names(coords), names(decoded)))
                  mapply(function(d, p) {
                    seqs <- trimSeqs(d, p)
                    seqs[width(seqs) >= minReadLength]
                  }, decoded, coords)
                } else {
                  message("No linkered reads found for sample: ", x, "...skipping.")
                }
              } else {
                if(LTRed.test & !strict) {
                  LTRed <- primed
                }
                
                if(exists("vectored")) {                  
                  message("Removing ", length(vectored), " vectored reads...")
                  LTRed <- LTRed[!names(LTRed) %in% names(vectored)]
                }
                
                starts <- structure(end(LTRed), names = names(LTRed))
                ends <- structure(width(decoded), names = names(decoded))
                
                stopifnot(identical(names(starts), names(ends[names(starts)])))
                coords <- IRanges(start = starts + 1,
                                  end = ends[names(starts)],
                                  names = names(starts))
                
                # alter ends for reads where linker was present
                # fix cases where start of linker earlier than edge of LTR/primer end
                ends <- structure(start(linkered), names = names(linkered))
                there <- intersect(names(coords), names(ends))
                ends <- as.numeric(ends[there]-1)
                culprits <- start(coords[there]) > ends
                end(coords[there][!culprits]) <- ends[!culprits]
                end(coords[there][culprits]) <- start(coords[there][culprits])
                
                if(tolower(feature) == "genomiclinkered") { 
                  coords <- coords[names(coords) %in% names(linkered)] 
                }
                
                # trim it and return non zero length sequences
                if(length(coords) > 0) {
                  seqs <- trimSeqs(decoded,coords)
                  seqs[width(seqs) >= minReadLength]
                } else {
                  message("No linkered reads found for sample: ", x, "...skipping.")
                }                
              }
            } else {
              # just returning ranges...simply match names from decoded to the request
              if(isPaired & pairReturn == 'both') {
                if(tolower(feature) == "genomiclinkered") { 
                  mapply(function(d,l) d[names(d) %in% names(l)],
                         decoded, linkered)
                } else { ## everything past primer and LTR
                  if(LTRed.test & !strict) {
                    mapply(function(d,l) d[names(d) %in% names(l)],
                           decoded, primed)
                  } else {
                    mapply(function(d,l) d[names(d) %in% names(l)],
                           decoded, LTRed)
                  }
                }
              } else {
                if(tolower(feature) == "genomiclinkered") { 
                  decoded[names(decoded) %in% names(linkered)]
                } else { ## everything past primer and LTR
                  if(is.null(LTRed)) {
                    decoded[names(decoded) %in% names(primed)]
                  } else {
                    decoded[names(decoded) %in% names(LTRed)]
                  }
                }
              }              
            }
          }
        } else {
          notFeature <- grepl("!",feature)
          ## no need to trim if looking for "not" based feature since there are 
          ## no coordinates for it
          if(notFeature) {
            if(isPaired & pairReturn == 'both') {
              stopifnot(identical(names(decoded),
                                  names(get(gsub("!", "", feature)))))
              toreturn <- list()
              toreturn[["pair1"]] <- 
                decoded$pair1[!names(decoded$pair1) %in% 
                                names(get(gsub("!", "", feature))$pair1)]
              toreturn[["pair2"]] <- 
                decoded$pair2[!names(decoded$pair2) %in% 
                                names(get(gsub("!", "", feature))$pair2)]
            } else {
              toreturn <- decoded[!names(decoded) %in% 
                                    names(get(gsub("!", "", feature)))]
            }
            
            if(length(toreturn)) {
              toreturn
            } else {
              message("No sequences found for requested feature (", feature,
                      ") for sample: ", x,"...skipping.") 
            }
          } else {                    
            if(is.null(get(feature))) { 
              message("No sequences found for requested feature (", feature,
                      ") for sample: ", x,"...skipping.") 
            } else {
              if(isPaired & pairReturn == 'both') {
                stopifnot(identical(names(decoded), names(get(feature))))                
                res.seq <- list("pair1" =
                                  decoded$pair1[names(decoded$pair1) %in%
                                                  names(get(feature)$pair1)],
                                "pair2" =
                                  decoded$pair2[names(decoded$pair2) %in%
                                                  names(get(feature)$pair2)])
              } else {
                res.seq <- decoded[names(decoded) %in% names(get(feature))]  
              }
              
              if(trim) {
                if(is.null(sideReturn)) {
                  sidetype <- ifelse(grepl("primerID", feature, ignore.case = TRUE), 
                                     "middle", 
                                     ifelse(grepl("primed|LTRed", feature,
                                                  ignore.case = TRUE), 
                                            "left", 
                                            ifelse(grepl("linkered", feature,
                                                         ignore.case = TRUE), 
                                                   "right", 
                                                   "middle")))
                } else {
                  sidetype <- tolower(sideReturn)
                }                            
                
                offByLength <- ifelse(sidetype == "middle", 0, 1)
                
                if(isPaired & pairReturn == 'both') {
                  seqs.p1 <- trimSeqs(res.seq$pair1, get(feature)$pair1,
                                      side = sidetype, offBy = offByLength)
                  seqs.p1 <- seqs.p1[width(seqs.p1) >= minReadLength]
                  
                  seqs.p2 <- trimSeqs(res.seq$pair2, get(feature)$pair2,
                                      side = sidetype, offBy = offByLength)
                  seqs.p2 <- seqs.p2[width(seqs.p2) >= minReadLength]
                  
                  list("pair1" = seqs.p1, "pair2" = seqs.p2)
                } else {
                  seqs <- trimSeqs(res.seq, get(feature),
                                   side = sidetype, offBy = offByLength)
                  seqs[width(seqs) >= minReadLength]
                }                
              } else {
                res.seq
              }
            }
          }
        }
      }, y = y)
    }, simplify = FALSE)
    
    simpletons <- sapply(res, class) != "matrix"
    if(any(simpletons)) {
      lengthTest <- lapply(lapply(lapply(res[simpletons], 
                                         function(x) sapply(x, length)),">",0),
                           which)
      res <- mapply(function(x, y) x[y], res[simpletons], lengthTest, 
                    SIMPLIFY = FALSE)  
    }
    
    if(any(!simpletons)) {
      res[!simpletons] <- sapply(res[!simpletons], as, 'DataFrame')
    }
    
    res
  }
}

#' Extract a specific feature/attribute of the sampleInfo object.
#'
#' Given a sampleInfo object, the function extracts defined feature(s) for a given sample or sector.
#'
#' @param sampleInfo sample information SimpleList object, which holds samples per sector/quadrant information along with other metadata.
#' @param sector a vector or specific sector to extract the feature from. Default is NULL, which extracts all sectors. 
#' @param samplename a character vector or a specific sample to extract feature from. Default is NULL, which extracts all samples. 
#' @param feature Options include: primed, LTRed, linkered, decoded, and any of the metadata. Default is NULL. When feature='metadata', the function returns names of all the metadata elements associated with the sample as a comma separated list.
#'
#' @return a list or list of lists depending upon which parameters were supplied.
#'
#' @seealso \code{\link{addFeature}}, \code{\link{findPrimers}},
#'  \code{\link{findLTRs}}, \code{\link{findLinkers}}, 
#'  \code{\link{extractSeqs}}, \code{\link{trimSeqs}}, 
#'  \code{\link{getSectorsForSamples}}
#'
#' @export
#'
#' @examples 
#' load(file.path(system.file("data", package = "hiReadsProcessor"),
#' "FLX_seqProps.RData"))
#' samples <- c('Roth-MLV3p-CD4TMLVWell6-Tsp509I', 
#' 'Roth-MLV3p-CD4TMLVWell6-MseI', 'Roth-MLV3p-CD4TMLVwell5-MuA')
#' extractFeature(seqProps, sector='2', samplename=samples, feature="primed")
#' extractFeature(seqProps, sector='2', samplename=samples, feature="linkered")
#' extractFeature(seqProps, sector='2', samplename=samples, feature="metadata")
extractFeature <- function(sampleInfo, sector = NULL, samplename = NULL,
                           feature = NULL) {
  
  .checkArgs_SEQed()
  
  # get all sectors and samplenames in each sector or a specific sector
  res <- getSectorsForSamples(sampleInfo, sector, samplename)
  sectors <- res[["sectors"]]
  samplenames <- res[["samplenames"]]
  
  res <- sapply(sectors, function(y) {
    sapply(samplenames[[y]], function(x, y) { 
      if(tolower(feature) == "metadata") {
        paste(names(sampleInfo$sectors[[y]]$samples[[x]]), collapse = ", ")
      } else {
        res <- sampleInfo$sectors[[y]]$samples[[x]][[feature]]
        ## convert any factor based vector to the appropriate regular vector
        if(class(res) == "factor") { 
          if(!any(is.na(suppressWarnings(as.numeric(levels(res)))))) { 
            as.numeric(as.character(res)) } 
          else { 
            as.character(res) 
          }    
        } else if (class(res) == "character"){ 
          ## if a numeric vector is stored as character, convert it back to numeric
          if(!any(is.na(suppressWarnings(as.numeric(res))))) { 
            as.numeric(res) 
          } else { 
            res 
          }
        } else {
          res
        }
      }
    }, y = y)
  }, simplify = FALSE)
  
  simpletons <- sapply(res, class) != "matrix"
  if(any(simpletons)) {
    lengthTest <- lapply(lapply(lapply(res[simpletons], 
                                       function(x) sapply(x, length)), ">", 0), 
                         which)
    res <- mapply(function(x,y) x[y], res[simpletons], lengthTest, 
                  SIMPLIFY = FALSE)  
  }
  
  if(any(!simpletons)) {
    res[!simpletons] <- sapply(res[!simpletons], as, 'DataFrame')
  }
  
  res
}

#' Add a specific feature/attribute to the sampleInfo object.
#'
#' Given a sampleInfo object, the function adds a new feature for the 
#' given samples & sectors.
#'
#' @param sampleInfo sample information SimpleList object, which holds samples 
#' per sector/quadrant information along with other metadata.
#' @param sector a vector or a specific sector to add the new feature(s) to. 
#' Default is NULL, in which case the sectors are searched from 
#' samplename parameter.
#' @param samplename a character vector or a specific sample to add the new 
#' feature(s) to. Default is NULL.
#' @param feature a string naming the new feature to add for the defined 
#' samplename and sector.
#' @param value named vector of values, where names are samplenames, to be 
#' assigned for the defined sector, samplename, and feature. 
#' Example: c("Sample1"="ACDTDASD")
#'
#' @return modified sampleInfo object with new feature(s) added.
#'
#' @seealso \code{\link{findPrimers}}, \code{\link{extractSeqs}}, 
#' \code{\link{trimSeqs}}, \code{\link{extractFeature}}, 
#' \code{\link{getSectorsForSamples}}
#'
#' @export
#'
#' @examples 
#' load(file.path(system.file("data", package = "hiReadsProcessor"),
#' "FLX_seqProps.RData"))
#' extractFeature(seqProps, sector="2", 
#' samplename="Roth-MLV3p-CD4TMLVWell6-MseI", feature="metadata")
#' seqProps <- addFeature(seqProps, sector="2", 
#' samplename="Roth-MLV3p-CD4TMLVWell6-MseI", feature="foo", 
#' value=c("Roth-MLV3p-CD4TMLVWell6-MseI"="woo"))
#' extractFeature(seqProps, sector="2", 
#' samplename="Roth-MLV3p-CD4TMLVWell6-MseI", feature="metadata")
addFeature <- function(sampleInfo, sector = NULL, samplename = NULL, 
                       feature = NULL, value = NULL) {
  
  .checkArgs_SEQed()
  
  if(is.null(value)) {
    stop("Please define the value parameter.")
  }
  
  if(is.null(samplename)) {
    stop("Please define the samplename to add the features to.")
  }
  
  if(!all(samplename %in% names(value))) {
    stop("Not all samplename(s) are found in value parameter")
  }

  # get all sectors and samplenames in each sector or a specific sector
  res <- getSectorsForSamples(sampleInfo, sector, samplename)
  sectors <- res[["sectors"]]
  samplenames <- res[["samplenames"]]
  
  for(y in sectors) {
    for(x in samplenames[[y]]) {
      cat(".")
      sampleInfo$sectors[[y]]$samples[[x]][[feature]] <- value[[x]]
    }
  }
  
  sampleInfo$callHistory <- append(sampleInfo$callHistory, match.call())
  return(sampleInfo)
}

#' Get sectors for samples defined in the sampleInfo object.
#'
#' Given a sampleInfo object, the function gets the sectors for each samplename. This is an accessory function utilized by other functions of this package to aid sector retrieval.
#'
#' @param sampleInfo sample information SimpleList object, which holds samples per sector/quadrant information along with other metadata.
#' @param sector a specific sector or vector of sectors if known ahead of time. Default is NULL, which extracts all sectors. 
#' @param samplename a specific sample or vector of samplenames to get sectors for. Default is NULL, which extracts all samples. 
#' @param returnDf return results in a dataframe. Default is FALSE.
#'
#' @return If returnDf=TRUE, a dataframe of the sector associated with each samplename; else a named list of length two: x[["sectors"]] and x[["samplenames"]]
#'
#' @seealso \code{\link{extractSeqs}}, \code{\link{extractFeature}}, 
#' \code{\link{addFeature}}
#'
#' @export
#'
#' @examples 
#' load(file.path(system.file("data", package = "hiReadsProcessor"),
#' "FLX_seqProps.RData"))
#' samples <- c('Roth-MLV3p-CD4TMLVWell6-Tsp509I', 
#' 'Roth-MLV3p-CD4TMLVWell6-MseI', 'Roth-MLV3p-CD4TMLVwell5-MuA')
#' getSectorsForSamples(seqProps, samplename=samples)
#' getSectorsForSamples(seqProps, samplename=samples, returnDf=TRUE)
getSectorsForSamples <- function(sampleInfo, sector = NULL, samplename = NULL,
                                 returnDf = FALSE) {
  
  .checkArgs_SEQed()
  
  # get all sectors and samplenames in each sector or a specific sector if defined
  if(is.null(sector)) {
    sectors <- names(sampleInfo$sectors)        
  } else {
    sectors <- sector
  }
  samplenames <- sapply(sectors,
                        function(x) names(sampleInfo$sectors[[x]]$samples),
                        simplify = FALSE)
  
  # if specific samplename(s) is defined, then search to find where they are
  if(is.null(samplename)) {
    samplename <- unlist(samplenames)     
  }
  
  if(!all(samplename %in% unlist(samplenames))) {            
    stop("Following sample(s) do not exist on given sector(s) (",
         paste(sectors, collapse = ", "),") in the supplied sampleInfo object: ",
         paste(samplename[!samplename %in% unlist(samplenames)], collapse = ", "))
  }
  sectors <- names(which(unlist(lapply(lapply(samplenames, "%in%", samplename),
                                       any)))) 
  if(returnDf) {
    return(do.call(rbind, lapply(sectors, function(x) { 
      data.frame(samplename = samplenames[[x]][samplenames[[x]] %in% samplename],
                 sector = x, stringsAsFactors = FALSE)
    })))
  } else {
    samplenames <- sapply(sectors,
                          function(x)
                            samplenames[[x]][samplenames[[x]] %in% samplename],
                          simplify = FALSE)
    return(list("sectors" = sectors, "samplenames" = samplenames))
  }
}

#' Read fasta/fastq/sff given the path or sampleInfo object.
#'
#' Given a sequence reads file path, the function returns a DNAStringSet object.
#'
#' @param seqFilePath a path to fasta/fastq/sff reads file or a sampleInfo 
#' object returned by \code{\link{read.SeqFolder}}
#' @param sector specific sector to read sequences from. Default is 1, and not 
#' required if seqFilePath is a direct file path rather than sampleInfo object.
#' @param isPaired does the sector contain paired end reads? Default is FALSE
#'
#' @return if isPaired is FALSE, then a DNAStringSet object, else a list of 
#' DNAStringSet objects of three elements corresponding to reads from 
#' "barcode", "pair1", and "pair2". Note: "pair2" is reverse complemented!
#'
#' @seealso \code{\link{findBarcodes}}, \code{\link{read.SeqFolder}}, 
#' \code{\link{extractSeqs}}
#'
#' @export
#'
#' @examples 
#' \donttest{
#' load(file.path(system.file("data", package = "hiReadsProcessor"),
#' "FLX_seqProps.RData"))
#' read.seqsFromSector(seqProps, sector="2")
#' }
read.seqsFromSector <- function(seqFilePath=NULL, sector=1, isPaired=FALSE) {
  if(is.null(seqFilePath)) {
    stop("Missing seqFilePath!")
  }
  
  if(is(seqFilePath, "SimpleList")) {
    seqfilePattern <- seqFilePath$seqfilePattern
    seqFilePaths <- seqFilePath$seqFilePaths
    
    if(isPaired) {
      filePath <- seqFilePaths[grep(paste0("^", gsub("R1|R2|I1", ".*", sector),
                                           seqfilePattern),
                                    basename(seqFilePaths))]
      filePath <- normalizePath(filePath, mustWork = TRUE)
      
      if(length(filePath) == 0) {
        stop("No sequence file found for sector: ", sector,
             " in seqFilePath variable (",
             paste(seqFilePaths, collapse = " \n"), ") using pattern (",
             paste0(sector, seqfilePattern), ")")
      }
      
      nobarcodes <- FALSE ## in case the data is already demultiplexed
      if(!any(grepl("I1", basename(filePath)))) {
        warning("No index/barcode file (I1) found.")
        nobarcodes <- TRUE
      }
      
      pair2 <- paste0(sub("(.*)R\\d.*", "\\1", sector),
                      ifelse(sub(".*(R\\d).*", "\\1", sector) == "R2", "R1", "R2"))
      if(!any(grepl(pair2, basename(filePath)))) {
        stop("Pair #2 or Linker end read file not present!")
      }
      seqFilePath <- filePath
      
    } else {
      filePath <- seqFilePaths[grep(paste0("^", sector, seqfilePattern),
                                    basename(seqFilePaths))]
      filePath <- normalizePath(filePath, mustWork = TRUE)
      
      if(length(filePath) == 0) {
        stop("No sequence file found for sector: ", sector,
             " in seqFilePath variable (",
             paste(seqFilePaths, collapse = " \n"), ") using pattern (",
             paste0(sector, seqfilePattern), ")")
      }
      
      if(length(filePath) > 1) {
        stop("Multiple sequence files found for sector: ", sector,
             " in seqFilePath variable (", paste(seqFilePaths, collapse = " \n"),
             ") using pattern (", paste0(sector, seqfilePattern), ")")
      }
      seqFilePath <- filePath
    }
  }
  
  doSingleEndCheck <- FALSE
  
  message("Reading:\n", paste(seqFilePath, collapse = "\n"))
  if(any(grepl("fastq", seqFilePath, ignore.case = TRUE))) {    
    dnaSet <- sapply(seqFilePath, function(x) {
      dnaSet <- readDNAStringSet(x, format = 'fastq')      
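      ## keep only the read ID portion of the definition line, e.g. a
      ## (hypothetical) "M00123:55:000000000-A1B2C 1:N:0:1" becomes "A1B2C"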
      names(dnaSet) <- sub("^\\S+-(\\S+) .+$", "\\1", names(dnaSet), perl = TRUE)
      if(any(duplicated(names(dnaSet)))) {
        stop("Duplicate definition lines found in file: ", x)
      }
      if(length(dnaSet) == 0) {
        stop("No sequences found in file: ", x)
      }
      dnaSet
    })
    
    if(isPaired & length(seqFilePath) > 1) {
      LTRside <- grep(sector, basename(names(dnaSet)))
      #names(dnaSet[[LTRside]]) <- paste0("@pair1@", names(dnaSet[[LTRside]]))
      
      linkerSide <- grep(pair2, basename(names(dnaSet)))      
      #names(dnaSet[[linkerSide]]) <- paste0("@pair2@", names(dnaSet[[linkerSide]]))
      
      if(!nobarcodes) {
        barcodes <- grep("I1", basename(names(dnaSet)))
        dnaSet <- list("barcode" = dnaSet[[barcodes]],
                       "pair1" = dnaSet[[LTRside]],
                       "pair2" = reverseComplement(dnaSet[[linkerSide]]))
      } else {
        dnaSet <- list("pair1" = dnaSet[[LTRside]],
                       "pair2" = reverseComplement(dnaSet[[linkerSide]]))
      }
      
    } else {
      ## for single-end data!
      dnaSet <- dnaSet[[1]]
    }
  } else if (any(grepl("sff", seqFilePath, ignore.case = TRUE))) {    
    dnaSet <- rSFFreader::sread(rSFFreader::readSff(seqFilePath,
                                                    use.qualities = FALSE,
                                                    verbose = FALSE))
    doSingleEndCheck <- TRUE    
  } else {
    dnaSet <- readDNAStringSet(seqFilePath)
    doSingleEndCheck <- TRUE    
  }
  
  if(doSingleEndCheck) {
    if(any(duplicated(names(dnaSet)))) {
      stop("Duplicate definition lines found in file(s): ",
           paste(seqFilePath, collapse = " * "))
    }
    
    if(length(dnaSet) == 0) {
      stop("No sequences found")
    }
  }
  
  return(dnaSet)
}

#' Write a fasta file per sample in parallel
#'
#' Given a listed DNAStringSet object returned from \code{\link{extractSeqs}}, the
#' function writes a fasta file for each sample as defined in filePath parameter.
#'
#' @param dnaSet listed DNAStringSet object containing sequences to be written.
#' @param filePath a path to write the fasta files to, one per sample. Default 
#' is the current working directory.
#' @param filePrefix prefix the filenames with a string. Default is 'processed' 
#' followed by samplename.
#' @param prependSamplenames Prepend definition lines with samplenames. 
#' Default is TRUE. Make sure the dnaSet parameter is a named list where 
#' names are used as samplenames.
#' @param format either fasta (the default) or fastq.
#' @param parallel use parallel backend to perform calculation with 
#' \code{\link{BiocParallel}}. Defaults to FALSE. If no parallel backend is 
#' registered, then a serial version is run using \code{\link{SerialParam}}.
#'
#' @seealso \code{\link{findBarcodes}}, \code{\link{read.SeqFolder}}, 
#' \code{\link{extractSeqs}}, \code{\link{addListNameToReads}}
#'
#' @note
#' \itemize{
#'   \item Writing of the files is done using \code{\link{writeXStringSet}} 
#'   with parameter append=TRUE. This is to aggregate reads from a sample 
#'   which might be present in more than one sector. 
#'   \item If data is paired end, then each pair will be written separately 
#'   with designations in the filename as well as in the definition line as 
#'   (at)pairX(at) appended at the end.
#'   \item If parallel=TRUE, then be sure to have a parallel backend registered 
#'   before running the function. One can use any of the following 
#'   \code{\link{MulticoreParam}} \code{\link{SnowParam}}
#' }
#'
#' @export
#' 
#' @examples
#'  
#' \donttest{
#' load(file.path(system.file("data", package = "hiReadsProcessor"),
#' "FLX_seqProps.RData"))
#' samples <- c('Roth-MLV3p-CD4TMLVWell6-Tsp509I', 
#' 'Roth-MLV3p-CD4TMLVWell6-MseI', 'Roth-MLV3p-CD4TMLVwell5-MuA')
#' seqs <- extractSeqs(seqProps, sector='2', samplename=samples, feature="primed")
#' write.listedDNAStringSet(seqs)
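#' ## sketch with a registered parallel backend (requires BiocParallel):
#' # BiocParallel::register(BiocParallel::MulticoreParam(workers = 2))
#' # write.listedDNAStringSet(seqs, parallel = TRUE)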
#' }
write.listedDNAStringSet <- function(dnaSet, filePath = ".",
                                     filePrefix = "processed",
                                     prependSamplenames = TRUE, 
                                     format = "fasta",
                                     parallel = FALSE) {
  stopifnot(class(dnaSet) == "list")
  
  if(filePrefix == "") { filePrefix <- NA }
  filePath <- normalizePath(filePath, mustWork = TRUE)
  
  dp <- if(parallel) { bpparam() } else { SerialParam() }
  for(x in seq_along(dnaSet)) {
    bpmapply(function(outputSeqs, samplename) {
      if(is.list(outputSeqs)) {
        list.test <- any(sapply(outputSeqs, length) > 0)
      } else {
        list.test <- length(outputSeqs) > 0
        outputSeqs <- list("tempy" = outputSeqs)
      }
      
      if(list.test) {
        for(p in names(outputSeqs)) {
          pairname <- ifelse(p == "tempy", NA, p)
          
          if(is.null(names(outputSeqs[[p]]))) {
            message("No names attribute found for ", samplename,
                    " ... using artifically generated names")
            names(outputSeqs[[p]]) <-
              paste("read", 1:length(outputSeqs[[p]]), sep = "-")
          }
          
          ## remove '.' at the beginning of the filename in case filePrefix is empty
          filename <- paste(na.omit(c(filePrefix, samplename, pairname, 
                                      ifelse(format == "fastq", "fastq", "fa"))), 
                            collapse = ".")
          filename <- file.path(filePath, filename)
          
          if(prependSamplenames) {
            names(outputSeqs[[p]]) <- paste(samplename, names(outputSeqs[[p]]),
                                            sep = "-")
          }
          
          if(p != "tempy") {
            names(outputSeqs[[p]]) <- paste0(names(outputSeqs[[p]]),
                                             "@", pairname, "@")
          }
          
          writeXStringSet(outputSeqs[[p]], filepath = filename, format = format,
                          append = TRUE) 
        } 
      } else {
        message("No reads written for ", samplename)
      }       
    }, as(dnaSet[[x]], "list"), names(dnaSet[[x]]), BPPARAM = dp)
  }
}

# Check args and set defaults for functions dealing with reads. This function
# checks all the arguments passed to a function related to aligning or trimming 
# a read and then sets default values for internal use. Evaluation of this 
# function happens in the parent function.
.checkArgs_SEQed <- function() {
  
  checks <- expression( 
    if("subjectSeqs" %in% names(formals())) { 
      if(is.null(subjectSeqs) | length(subjectSeqs)==0) {
        stop("subjectSeqs paramter is empty. Please supply reads to be aligned")
      }      
      
      ## give names if not there for troubleshooting purpose in later steps
      removeSubjectNamesAfter <- FALSE
      if(is.null(names(subjectSeqs))) {
        removeSubjectNamesAfter <- TRUE
        names(subjectSeqs) <- paste("read", 1:length(subjectSeqs))
      } 
    },
    
    if("patternSeq" %in% names(formals())) { 
      if(is.null(patternSeq) | length(patternSeq) == 0) {
        stop("patternSeq paramter is empty. Please supply reads to be aligned")
      } else if (length(patternSeq)>1) {
        stop("More than 1 patternSeq defined. Please only supply one pattern.")
      }
    },
    
    if("reads" %in% names(formals())) { 
      if(is.null(reads) | length(reads) == 0) {
        stop("reads paramter is empty. Please supply reads to be aligned")
      }
    },
    
    if("Vector" %in% names(formals())) { 
      if(is.null(Vector) | length(Vector) == 0) {
        stop("Vector paramter is empty. Please supply Vector to be aligned")
      }
      
      if(!grepl("DNAString", class(Vector))) {
        stop("Vector paramter is not of DNAString class")
      }
    },
    
    if("parallel" %in% names(formals())) { 
      dp <- if(parallel & .Platform$OS.type != "windows") 
        { bpparam() } else { SerialParam() }
    },
    
    if("sampleInfo" %in% names(formals())) { 
      stopifnot(is(sampleInfo, "SimpleList"))
    },
    
    if("feature" %in% names(formals())) {
      if(is.null(feature)) {
        stop("Please define a feature to extract.")
      }
    }      
  )
  
  eval.parent(checks)
}
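
# A minimal sketch (hypothetical caller) showing how the checks above run in
# the caller's frame via eval.parent(), seeing its arguments directly:
# myAligner <- function(subjectSeqs = NULL, patternSeq = NULL, parallel = FALSE) {
#   dp <- NULL            # reassigned by the "parallel" check
#   .checkArgs_SEQed()    # stops early if subjectSeqs/patternSeq are empty
#   ## ... align patternSeq against subjectSeqs using backend dp ...
# }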

# Check args and set defaults for functions dealing with hits of aligned reads.
# This function checks all the arguments passed to a function related to 
# checking hits of an aligned read and then sets default values for internal 
# use. Evaluation of this function happens in the parent function.
.checkArgsSetDefaults_ALIGNed <- function() {
  
  checks <- expression(
    if("pslFile" %in% names(formals()) | "files" %in% names(formals())) { 
      if(is.null(files) | length(files) == 0) {
        stop("files parameter empty. Please supply a filename to be read.")
      }
      
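      ## entries containing regex metacharacters are treated as patterns,
      ## e.g. files = "processed.*psl$" expands to all matching filenames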
      if(any(grepl("\\*|\\$|\\+|\\^", files))) {
        ## expand the pattern into a vector of filenames
        files <- list.files(path = dirname(files), pattern = basename(files),
                            full.names = TRUE)      
      }
      
      if(length(files) == 0) { 
        stop("No file(s) found with given paramter in files:", files) 
      }
    },

    if("psl.rd" %in% names(formals())) {
      if(!is.null(psl.rd)) {
        if(length(psl.rd) == 0 | !is(psl.rd, "GRanges")) {
          stop("psl.rd paramter is empty or not a GRanges object")
        }
      }
    },

    if("parallel" %in% names(formals())) { 
      dp <- if(parallel & .Platform$OS.type != "windows") 
        { bpparam() } else { SerialParam() }
    }
  )
  
  eval.parent(checks)
}

#' Reads a BAM/SAM file and converts it into a PSL like format. 
#'
#' Given filename(s), the function reads the BAM/SAM file and converts it into 
#' a PSL-like format. Any other file format will yield errors or erroneous results.
#' This is intended to be used independently with other short read aligners.
#'
#' @param bamFile BAM/SAM filename, or vector of filenames, or a pattern of 
#' files to import.
#' @param removeFile remove the file(s) supplied in bamFile parameter after 
#' importing. Default is TRUE.
#' @param asGRanges return a GRanges object. Default is TRUE
#'
#' @return a GRanges or GAlignments object reflecting psl file type.
#'
#' @seealso \code{\link{pairwiseAlignSeqs}}, \code{\link{blatSeqs}}, 
#' \code{\link{read.blast8}}, \code{\link{read.psl}}, 
#' \code{\link{pslToRangedObject}}, \code{\link{pairUpAlignments}}
#'
#' @export
#'
#' @examples
#'  
#' \donttest{
#' read.BAMasPSL(bamFile="processed.*.bam$")
#' read.BAMasPSL(bamFile=c("sample1hits.bam","sample2hits.bam"))
#' }
read.BAMasPSL <- function(bamFile = NULL, removeFile = TRUE, asGRanges = TRUE) {
  if(is.null(bamFile) | length(bamFile) == 0) {
    stop("bamFile parameter empty. Please supply a filename to be read.")
  }
  
  if (any(grepl("\\*|\\$|\\+|\\^",bamFile))) {
    ## expand the pattern into a vector of filenames
    bamFile <- list.files(path = dirname(bamFile),
                          pattern = basename(bamFile), full.names = TRUE)      
  }
  
  if(length(bamFile) == 0) { 
    stop("No file(s) found with given paramter in bamFile:", bamFile) 
  }
  
  if("hits" %in% names(bamFile)) {
    bamFile <- bamFile[["hits"]]
  }
  
  param <- ScanBamParam(what = c("qname"),
                        flag = scanBamFlag(
                          isPaired = NA,
                          isProperPair = NA,
                          isUnmappedQuery = FALSE,
                          hasUnmappedMate = NA,
                          isMinusStrand = NA,
                          isMateMinusStrand = NA,
                          isFirstMateRead = NA,
                          isSecondMateRead = NA,
                          isSecondaryAlignment = NA,
                          isDuplicate = NA,
                          isNotPassingQualityControls = NA
                        ),
                        tag = c("CC", "CT", "CP", "CG"))
  hits <- lapply(bamFile, readGAlignments, param = param)
  hits <- do.call(c, hits)
  
  if(length(hits) == 0) {
    if(removeFile) { file.remove(bamFile) }
    stop("No hits found")
  }
  
  ## remove extra tag columns if not present ##
  for(f in c("CC", "CT", "CP", "CG")) {
    if(all(is.na(mcols(hits)[,f]))) {
      mcols(hits)[[f]] <- NULL
    }
  }
  
  message("Ordering by qName")
  mcols(hits)$qName <- mcols(hits)$qname
  mcols(hits)$qname <- NULL
  hits <- hits[order(mcols(hits)$qName)]
  
  ## add few required columns present in PSL format which are crucial to filtering
  message("Adding relevent PSL columns")
  stopifnot(identical(cigarWidthAlongQuerySpace(cigar(hits)), qwidth(hits)))
  mcols(hits)$qSize <- qwidth(hits)
  
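  ## Illustrative example: a read with cigar "20M2I30M" yields
  ## matches = 50 (sum of M-op widths), qNumInsert = 1, qBaseInsert = 2;
  ## D ops (bases present only in the reference) feed tNumInsert/tBaseInsert.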
  bore <- cigarRangesAlongQuerySpace(cigar(hits), ops = "M")
  mcols(hits)$matches <- sapply(width(bore), sum)
  mcols(hits)$qStart <- min(start(bore))
  mcols(hits)$qEnd <- max(end(bore))
  
  bore <- cigarRangesAlongQuerySpace(cigar(hits), ops = "X")
  mcols(hits)$misMatches <- sapply(width(bore), sum)
  
  bore <- cigarRangesAlongQuerySpace(cigar(hits), ops = "I")
  mcols(hits)$qNumInsert <- sapply(bore, length)
  mcols(hits)$qBaseInsert <- sapply(width(bore), sum)
  
  ## deletions from the query ("D" ops) are insertions in the target
  bore <- cigarRangesAlongReferenceSpace(cigar(hits), ops = "D")
  mcols(hits)$tNumInsert <- sapply(bore, length)
  mcols(hits)$tBaseInsert <- sapply(width(bore), sum)
  
  if(asGRanges) {
    message("Converting to GRanges object...")
    hits.gr <- as(hits, "GRanges")
    mcols(hits.gr) <- cbind(DataFrame(cigar = cigar(hits), ngap = njunc(hits)),
                            mcols(hits))
    hits <- hits.gr
    rm(hits.gr)    
  }
  
  if(removeFile) { file.remove(bamFile) }
  
  return(hits)
}

#' Pair up alignments in a GRanges object
#'
#' Given a GRanges object, the function uses the maxGapLength parameter to 
#' pair up reads whose qName column ends with "atpersand pairname atpersand", 
#' as output by \code{\link{extractSeqs}}.
#'
#' @param psl.rd a GRanges object with qNames ending in "atpersand pairname atpersand".
#' @param maxGapLength maximum gap allowed between end of pair1 and start of 
#' pair2. Default is 2500 bp.
#' @param sameStrand should pairs be aligned to the same strand, i.e. in the 
#' same orientation, in the reference genome? Default is TRUE. This is TRUE 
#' because pair2 reads are reverse complemented when reading in data in 
#' \code{\link{findBarcodes}}
#' @param parallel use parallel backend to perform calculation with 
#' \code{\link{BiocParallel}}. Defaults to TRUE. If no parallel backend is 
#' registered, then a serial version is run using \code{\link{SerialParam}}.
#'
#' @return a GRanges object with reads paired up denoted by "paired" column. 
#' Improper pairs or unpaired reads are returned with "paired" column as FALSE.
#'
#' @seealso \code{\link{pairwiseAlignSeqs}}, \code{\link{blatSeqs}}, 
#' \code{\link{read.blast8}}, \code{\link{read.psl}}, 
#' \code{\link{getIntegrationSites}}, \code{\link{read.BAMasPSL}}
#'
#' @export
#'
#' @examples
#'  
#' \donttest{
#' psl.rd <- read.BAMasPSL(bamFile=c("sample1hits.bam","sample2hits.bam"))
#' pairUpAlignments(psl.rd)
#' }
pairUpAlignments <- function(psl.rd = NULL, maxGapLength = 2500,
                             sameStrand = TRUE, parallel = TRUE) {
  dp <- NULL
  
  .checkArgsSetDefaults_ALIGNed()
  stopifnot("qName" %in% names(mcols(psl.rd)))
  qName <- NULL
  
  ### identify pairs ###
  mcols(psl.rd)$pair <- sub(".+@(.+)@", "\\1", mcols(psl.rd)$qName)
  mcols(psl.rd)$qName <- sub("(.+)@.+@", "\\1", mcols(psl.rd)$qName)
  stopifnot(!any(is.na(mcols(psl.rd)$pair)))
  
  ### check pairs ###
  p1 <- mcols(psl.rd)$qName[mcols(psl.rd)$pair == "pair1"]
  p2 <- mcols(psl.rd)$qName[mcols(psl.rd)$pair == "pair2"]
  loners <- unique(c(setdiff(p1, p2), setdiff(p2, p1)))
  loners <- subset(psl.rd, mcols(psl.rd)$qName %in% loners)
  mcols(loners)$paired <- FALSE
  psl.rd <- subset(psl.rd, !mcols(psl.rd)$qName %in% mcols(loners)$qName)  
  rm("p1", "p2")
  
  ### pair up pairs ###
  # Strategy: collapse each read's pair1/pair2 alignments with reduce(), then
  # tag successfully merged pairs by checking the count of reduced regions.
  # Add the respective metadata back to successfully collapsed reads. For the
  # rest, find which pairs overlap each collapsed region and check whether
  # they are multihits; if not, tag them as paired=FALSE.
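  # Illustrative example: pair1 at chr1:100-200(+) and pair2 at
  # chr1:1500-1600(+) reduce to one region chr1:100-1600(+) when the
  # 1299 bp gap is below maxGapLength, flagging the read as paired = TRUE.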
  
  ## join seqname and qName with a separator token ("@!@") that should not
  ## occur in either, so they can be split apart again after reduce()
  bore <- GRanges(seqnames = paste(as.character(seqnames(psl.rd)),
                                   psl.rd$qName, sep = "@!@"),
                  ranges = ranges(psl.rd), strand = strand(psl.rd))
  reduced <- reduce(bore, min.gapwidth = maxGapLength,
                    ignore.strand = (!sameStrand)) 
  rm(bore)
  
  reduced <- GRanges(seqnames = sub("(.+)@!@.+", "\\1",
                                    as.character(seqnames(reduced))),
                     ranges = ranges(reduced), strand = strand(reduced),
                     qName = sub(".+@!@(.+)", "\\1",
                                 as.character(seqnames(reduced))))
  counts <- table(mcols(reduced)$qName)
  
  proper <- reduced[mcols(reduced)$qName %in% names(counts[counts == 1]), ]
  bore <- subset(psl.rd,
                 mcols(psl.rd)$qName %in% names(counts[counts == 1]) &
                   mcols(psl.rd)$pair == 'pair1')
  rows <- match(mcols(proper)$qName, mcols(bore)$qName)
  stopifnot(identical(mcols(proper)$qName, mcols(bore)$qName[rows]))
  mcols(proper) <- mcols(bore[rows])
  mcols(proper)$paired <- TRUE
  
  reduced <- reduced[mcols(reduced)$qName %in% names(counts[counts!=1]),]
  psl.rd <- subset(psl.rd, 
                   mcols(psl.rd)$qName %in% names(counts[counts!=1]))    
  rm(counts)
  
  reduced <- split(reduced, mcols(reduced)$qName)
  psl.rd <- split(psl.rd, mcols(psl.rd)$qName)
  psl.rd <- psl.rd[names(reduced)]
  stopifnot(identical(names(reduced), names(psl.rd)))
  
  pair <- bpmapply(function(x,y) {
    res <- as.data.frame(findOverlaps(x, y))
    res$pair <- mcols(y)$pair[res$subjectHits]
    res$qName <- mcols(y)$qName[res$subjectHits]
    test <- aggregate(pair ~ qName + queryHits, data = res,
      FUN = function(x) paste(sort(x), collapse = "")
    )
    
    # remove cases where a pair gives rise to multiple reduced regions,
    # where one region may contain a single mate while another contains both
    hasBoth <- subset(test, pair == "pair1pair2")
    test <- subset(test, pair != "pair1pair2" & !qName %in% hasBoth$qName)
    
    # despite the collapse, if each mate still yields its own region, then
    # chances are the mates are on different chromosomes, too far apart, or
    # on different strands!
    if (nrow(test) > 0) {
      unpaired <- y[subset(res, queryHits %in% test$queryHits)$subjectHits]
      mcols(unpaired)$paired <- FALSE
    }
    
    # reduced is >1 but each collapsed region has both mates...
    # hence a multihit #
    res <- subset(res, queryHits %in% hasBoth$queryHits)
    if (nrow(res) > 0) {
      paired <- x[unique(res$queryHits)]
      mcols(paired) <- mcols(y[res[res$pair == "pair1", "subjectHits"]])
      mcols(paired)$paired <- TRUE
    }
    
    rm("res", "test", "hasBoth")
    
    toReturn <- c()
    if (exists("unpaired")) {
      toReturn <- c(toReturn, unpai