R/qAlign.R

Defines functions determineSamplesFormat resolveCacheDir pathAsAbsoluteRedirected bamInfoOnlyBaseName qProjectBamInfo createReferenceSequenceIndices createQProject missingFilesMessage qAlign

Documented in qAlign

# bowtie alignments: --best --strata -m 1 (return one alignment, but only if it is a unique mapper)
# plus -q/-f [--phred33/64-quals] [-C] -S --quiet


# paired:
# valid values are NULL, "no", "fr", "rf", "ff"
# in case of NULL, QuasR determines the paired status automatically from the given sampleFile (2 columns:unpaired, 3 columns:paired).
# The orientation for paired end is set to "fr" by default. In case of bam input files, the user needs to specify manually the
# status ("no","fr","rf","ff")
# snpFile need four columns (chrom, pos, ref, alt) with no header
# geneAnnotation should be the path to either a gtf file or a TxDb sqlite file

qAlign <- function(sampleFile, genome, auxiliaryFile=NULL, aligner="Rbowtie", maxHits=1, paired=NULL,
                   splicedAlignment=FALSE, snpFile=NULL, bisulfite="no", alignmentParameter=NULL, projectName="qProject",
                   alignmentsDir=NULL, lib.loc=NULL, cacheDir=NULL, clObj=NULL, checkOnly=FALSE, geneAnnotation=NULL){

  # check if the user provided a samplefile and a genome
  if(missing(sampleFile)){stop("Missing 'sampleFile' parameter.")}
  if(missing(genome)){stop("Missing 'genome' parameter.")}

  # create a qProject, perform various tests, install required BSgenome and load aligner package
  # create fasta indices (.fai) files for all the reference sequences (genome & aux) as well as all md5 sum files
  proj <- createQProject(sampleFile, genome, auxiliaryFile, aligner, maxHits, paired, splicedAlignment, snpFile, bisulfite, alignmentParameter, projectName, alignmentsDir, lib.loc, cacheDir, geneAnnotation)

  # display the status of the project. in case of checkOnly=TRUE, throw an exception if there are alignment files missing
  missingFilesMessage(proj,checkOnly)

  # check if there are any genomic alignment files missing. if so, create index and align
  if(any(is.na(proj@alignments$FileName))){
    # build genome index
    if(is.na(proj@snpFile)){
      if(proj@genomeFormat=="file"){
        buildIndex(proj@genome, paste(proj@genome,proj@alnModeID,sep="."), proj@alnModeID, resolveCacheDir(proj@cacheDir))
      }else if(proj@genomeFormat=="BSgenome"){
        buildIndexPackage(proj@genome,proj@aligner,proj@alnModeID,resolveCacheDir(proj@cacheDir),proj@lib.loc)
      }else{stop("Fatal error 45906")}
    }else{
      # create indices for the two secondary genomes specified by snps
      buildIndexSNP(proj@snpFile,paste(proj@snpFile,proj@alnModeID,sep="."),proj@genome,proj@genomeFormat,proj@alnModeID,resolveCacheDir(proj@cacheDir))
    }

    # Rhisat2, generate splice site file
    if (proj@aligner == "Rhisat2" && !is.na(proj@geneAnnotation)) {
      buildSpliceSiteFile(proj@geneAnnotation, proj@geneAnnotationFormat)
    }

    # align to the genome (qProject gets updated with the alignment file names)
    proj <- createGenomicAlignments(proj, clObj)

  }

  # create indices for the auxiliary files if necessary (if there are any alignment files missing)
  if(any(is.na(proj@auxAlignments))){
    for(i in 1:nrow(proj@aux)){
      buildIndex(proj@aux$FileName[i], paste(proj@aux$FileName[i],proj@alnModeID,sep="."), proj@alnModeID, resolveCacheDir(proj@cacheDir),checkMD5=TRUE)
    }

    proj <- createAuxAlignments(proj, clObj)
  }

  return(proj)
}




# inspect the qProject and give an overview of all the files that need to be computed
missingFilesMessage <- function(proj, checkOnly){

  genomeIndexNeedsToBeCreated <- FALSE
  spliceSiteFileNeedsToBeCreated <- FALSE
  genomicAlignmentsNeedToBeCreated <- sum(is.na(proj@alignments$FileName))
  auxAlignmentsNeedToBeCreated <- sum(is.na(proj@auxAlignments))

  if(any(is.na(proj@alignments$FileName))){
    if(is.na(proj@snpFile)){
      if(proj@genomeFormat=="file"){
        indexPath <- paste(proj@genome,proj@alnModeID,sep=".")
        if(file.exists(indexPath)){
          if(!("ref_md5Sum.txt" %in% dir(indexPath))){
            genomeIndexNeedsToBeCreated <- TRUE
          }
        }else{
          genomeIndexNeedsToBeCreated <- TRUE
        }
      }else if(proj@genomeFormat=="BSgenome"){
       indexPackageName <- paste(proj@genome,proj@alnModeID,sep=".")
       if(!(indexPackageName %in% installed.packages()[,'Package'])){
         genomeIndexNeedsToBeCreated <- TRUE
       }
      }else{stop("Fatal error 45906")}
    }else{
      indexPathR <- paste(proj@snpFile,basename(proj@genome),"R","fa",proj@alnModeID,sep=".")
      indexPathA <- paste(proj@snpFile,basename(proj@genome),"A","fa",proj@alnModeID,sep=".")

      if(file.exists(indexPathR) & file.exists(indexPathA)){
        if( !("ref_md5Sum.txt" %in% dir(indexPathR)) | !("ref_md5Sum.txt" %in% dir(indexPathA)) ){
          genomeIndexNeedsToBeCreated <- TRUE
        }
      }else{
        genomeIndexNeedsToBeCreated <- TRUE
      }
    }

    ## Splice site file
    if (!is.null(proj@geneAnnotation) && !is.na(proj@geneAnnotation)) {
      if (!file.exists(paste0(proj@geneAnnotation, ".SpliceSites.txt"))) {
        spliceSiteFileNeedsToBeCreated <- TRUE
      }
    }
  }

  if(!genomeIndexNeedsToBeCreated & !spliceSiteFileNeedsToBeCreated & genomicAlignmentsNeedToBeCreated==0 & auxAlignmentsNeedToBeCreated==0){
    message("all necessary alignment files found")
  }else{
    message("alignment files missing - need to:")
    if(genomeIndexNeedsToBeCreated){message("    create alignment index for the genome")}
    if(spliceSiteFileNeedsToBeCreated){message("    create splice site file for gene annotation")}
    if(genomicAlignmentsNeedToBeCreated>0){message("    create ",genomicAlignmentsNeedToBeCreated," genomic alignment(s)")}
    if(auxAlignmentsNeedToBeCreated>0){message("    create ",auxAlignmentsNeedToBeCreated," auxiliary alignment(s)")}

    if(checkOnly){
      stop("alignment or index files are missing and checkOnly=TRUE. Aborting execution",call.=FALSE)
    }

    if(interactive()){
      message("will start in ", appendLF=FALSE); flush.console()
      for(i in 9:1) { message("..",i,"s",appendLF=FALSE); flush.console();
      Sys.sleep(1) }
      message("")
    }
  }
}



# read all the input information, perform various checks and compile the data into a qProject object
createQProject <- function(sampleFile, genome, auxiliaryFile, aligner, maxHits, paired, splicedAlignment,
                           snpFile, bisulfite, alignmentParameter, projectName, alignmentsDir,
                           lib.loc, cacheDir, geneAnnotation){

  # instantiate a qProject
  proj <-new("qProject")

  # -----------DETERMINE THE FORMAT OF THE GENOME AND DOWNLOAD FROM BIOCONDUCTOR IF NEEDED -------------

  if(length(genome) != 1){stop("The genome parameter should contain only a single entry",call.=FALSE);}

  # if there exists a / or \\ at the end of the genome string, remove it. this is required for the
  # consistent behavior of file.exists on windows systems. file.exists("dir/") != file.exists("dir").
  genome <- sub("(\\\\|/)$","",genome)

  if(file.exists(genome)){
    # genome is a directory or a file
    if(!(file.info(genome)$isdir)){
       # genome is a file, check if it is a fasta file
      if(consolidateFileExtensions(genome)=="fasta"){
        proj@genome <- tools::file_path_as_absolute(genome)
        proj@genomeFormat <- "file"
      }else{stop("The specified genome ",genome," does not have the extension of a fasta file (fa,fasta,fna)",call.=FALSE)}
    }else{
      # genome is a directory
      stop("The specified genome has to be a file and not a directory: ", genome,call.=FALSE)
    }
  }else{
    # check if the genome is a BSgenome.
    if(genome %in% installed.packages(lib.loc=lib.loc)[,'Package']){
      # BSgenome is already installed, load it
      if(!require(genome, character.only=TRUE, quietly=TRUE, lib.loc=lib.loc)){stop("The BSgenome ",genome," is installed but cannot be loaded. The version of the BSgenome might be too old",call.=FALSE)}

      proj@genome <- genome
      proj@genomeFormat <- "BSgenome"
    }else{
      message("The specified genome is not a fasta file or an installed BSgenome.")
      message("Connecting to Bioconductor and searching for a matching genome (internet connection required)...", appendLF = FALSE)

      if(testBioCConnection()){
        # Connection to Bioconductor OK
        message("OK")
        if(genome %in% available.genomes()){
          # The genome is available in Bioconductor, install it
          message("Downloading genome... ",genome," from Bioconductor")
          BiocManager::install(genome, update=FALSE, lib=lib.loc)

          # BSgenome has been installed, load it
          if(!require(genome, character.only=TRUE, quietly=TRUE, lib.loc=lib.loc)){stop("Fatal error 23445")}

          proj@genome <- genome
          proj@genomeFormat <- "BSgenome"
        }else{
          # The genome is not available in Bioconductor
          stop(genome," is not available in Bioconductor. Type available.genomes() for a complete list",call.=FALSE)
        }
      }else{
        # No connection to Bioconductor
        message("FAILED")
        stop("Could not find the specified genome: ",genome,call.=FALSE)
      }
    }
  }

  # --------------------------------- PROCESS THE GENOME ANNOTATION ---------------------------------
  if (!is.null(geneAnnotation)) {
    if (is(geneAnnotation, "character") && length(geneAnnotation) == 1 &&
        file.exists(geneAnnotation) &&
        tools::file_ext(geneAnnotation) %in% c("gtf", "gff", "gff3", "sqlite")) {
      if (tools::file_ext(geneAnnotation) == "sqlite") {
        proj@geneAnnotationFormat <- "TxDb"
      } else {
        proj@geneAnnotationFormat <- "gtf"
      }
    } else {
      stop("'geneAnnotation' must be a path to an existing sqlite, gtf or gff file.")
    }
    proj@geneAnnotation <- tools::file_path_as_absolute(geneAnnotation)
  } else {
    proj@geneAnnotation <- NA_character_
    proj@geneAnnotationFormat <- NA_character_
  }

  # ---------------------------------------- PARSE THE PAIRED PARAMETER ---------------------------------
  if(!is.null(paired)){
    if(!(paired %in% c("no","fr","rf","ff"))){
      stop("The parameter paired supports only NULL, 'no', 'fr', 'rf' and 'ff'",call.=FALSE)
    }
  }

  # ---------------------------------------- PARSE THE SAMPLE FILE -------------------------------------
  if(!file.exists(sampleFile)){stop("Cannot open ",sampleFile,call.=FALSE)}
  samples <- read.delim(sampleFile,header=TRUE,colClasses="character")

  if(nrow(samples)==0){stop(sampleFile," is either empty or there is a header missing",call.=FALSE)}

  # get absolute path of the sample.txt file
  sampleFileAbsolutePath <- dirname(tools::file_path_as_absolute(sampleFile))

  # Check if the reads are single end
  if(ncol(samples)==2){
    if(all(colnames(samples) == c("FileName","SampleName"))){
      # this is a valid single end samples file. check if listed files exist
      for(i in 1:nrow(samples)){
        pathRet <- pathAsAbsoluteRedirected(samples$FileName[i],sampleFileAbsolutePath)
        if(!is.null(pathRet)){
          if(samples$SampleName[i]==""){stop(samples$FileName[i]," listed in ",sampleFile," has no sample name",call.=FALSE)}
          samples$FileName[i] <- pathRet
        }else{stop(samples$FileName[i]," listed in ",sampleFile," does not exist",call.=FALSE)}
      }
      # determine format of the files (fa,fasta,fna)
      proj@samplesFormat <- determineSamplesFormat(samples$FileName)

      if(proj@samplesFormat == "bam"){
        # samples are provided as bam files
        if(is.null(paired)){stop("Paired status of the provided bam files cannot be determined automatically, please set the paired parameter",call.=FALSE)}

        if(length(unique(samples$FileName)) != nrow(samples)){
          stop("There are duplicate files in sampleFile. This is not allowed.",call.=FALSE)
        }

        if(paired=="no"){
          proj@reads <- samples
          proj@reads$FileName=NA_character_
        }else{
          proj@reads <- data.frame(FileName1=NA_character_,FileName2=NA_character_,SampleName=samples$SampleName,stringsAsFactors=FALSE)
        }

        proj@paired <- paired
        proj@alignments <- samples
      }else{
        # samples are provided as reads
        if(!is.null(paired)){
          if(paired!="no"){stop(sampleFile," contains 2 columns which represents unpaired end data. Please reset the paired parameter",call.=FALSE)}
        }
        if(length(unique(samples$FileName)) != nrow(samples)){
          stop("There are duplicate files in sampleFile. This is not allowed as it would result in non-unique alignment file names",call.=FALSE)
        }

        proj@paired <- "no"
        proj@reads <- samples
        proj@alignments <- samples
        proj@alignments$FileName=NA_character_
      }
    }else{stop(sampleFile," should contain column names FileName,SampleName (single end data)",call.=FALSE)} # incorrect format
  # Check if the reads are paired data
  }else if(ncol(samples)==3){
    if(all(colnames(samples) == c("FileName1","FileName2","SampleName"))){
      # this is a valid paired end samples file. check if listed files exist
      for(i in 1:nrow(samples)){
        pathRet <- pathAsAbsoluteRedirected(samples$FileName1[i],sampleFileAbsolutePath)
        if(!is.null(pathRet)){
          if(samples$SampleName[i]==""){stop(samples$FileName1[i]," listed in ",sampleFile," has no sample name",call.=FALSE)}
          samples$FileName1[i] <- pathRet
        }else{stop(samples$FileName1[i]," listed in ",sampleFile," does not exist",call.=FALSE)}
      }
      for(i in 1:nrow(samples)){
        pathRet <- pathAsAbsoluteRedirected(samples$FileName2[i],sampleFileAbsolutePath)
        if(!is.null(pathRet)){
          if(samples$SampleName[i]==""){stop(samples$FileName2[i]," listed in ",sampleFile," has no sample name",call.=FALSE)}

          samples$FileName2[i] <- pathRet
        }else{stop(samples$FileName2[i]," listed in ",sampleFile," does not exist",call.=FALSE)}
      }
      # determine format of the files (fa,fasta,fna)
      proj@samplesFormat <- determineSamplesFormat(c(samples$FileName1,samples$FileName2))

      if(proj@samplesFormat == "bam"){stop("Bam files need to be listed in a two column file: ",sampleFile,call.=FALSE)}
      if(!is.null(paired)){
        if(paired=="no"){stop(sampleFile," contains 3 columns which represents paired end data. Please reset the paired parameter",call.=FALSE)}
      }
      if(length(unique(c(samples$FileName1, samples$FileName2))) != (2 * nrow(samples))){
        stop("There are duplicate files in sampleFile. This is not allowed as it would result in non-unique alignment file names",call.=FALSE)
      }

      if(is.null(paired)){
        proj@paired <- "fr"
      }else{
        proj@paired <- paired
      }

      proj@reads <- samples
      proj@alignments <- data.frame(FileName=NA_character_,SampleName=samples$SampleName,stringsAsFactors=FALSE)

    }else{stop(sampleFile," should contain the column names FileName1,FileName2,SampleName (paired end data)",call.=FALSE)} # incorrect format
  # The format of the file is not supported
  }else{stop(sampleFile," should be a tab delimited file with either 2 or 3 columns",call.=FALSE)}


  # --------------- FOR FASTQ FILES, READ A SMALL CHUNK TO GUESS THE QUALITY FORMAT (phred33 or phred64) -------
  if(proj@samplesFormat == "fastq"){
    proj@reads <- data.frame(proj@reads,phred=NA_character_,stringsAsFactors=FALSE) # add an additional quality column to the reads table
    nthreads <- .Call(ShortRead:::.set_omp_threads, 1L) # switch off omp parallelization in ShortRead::FastqStreamer
    for(i in 1:nrow(proj@reads)){
      fastq_fs <- FastqStreamer(proj@reads[i,1], n=2000,readerBlockSize=5e5); # take only first column even for paired end data

      tryCatch({
        fastq_sq <- yield(fastq_fs)
      }, error = function(ex) {
        close(fastq_fs)
        stop("Incorrect format for the fastq file: ",proj@reads[i,1],call.=FALSE)
      })
      close(fastq_fs)
      # determine the format of the quality values
      if(inherits(quality(fastq_sq),"FastqQuality")){
        proj@reads$phred[i] <- "33"
      }else if(inherits(quality(fastq_sq),"SFastqQuality")){
        proj@reads$phred[i] <- "64"
      }else{stop("The quality values of the provided sequences files are not interpretable: ",proj@reads[i,1],call.=FALSE)}
    }
    .Call(ShortRead:::.set_omp_threads, nthreads) # switch back on omp parallelization in ShortRead::FastqStreamer
  }


  # -------------------- CALCULATE MD5 SUB SUMS FOR ALL READ FILES ----------------------------------------
  if(proj@samplesFormat != "bam"){
    if(proj@paired=="no"){
      proj@reads_md5subsum=data.frame(md5subsum=md5subsum(proj@reads$FileName),stringsAsFactors=FALSE)
    }else{
      proj@reads_md5subsum=data.frame(md5subsum1=md5subsum(proj@reads$FileName1),md5subsum2=md5subsum(proj@reads$FileName2),stringsAsFactors=FALSE)
    }
  }else{
    if(proj@paired=="no"){
      proj@reads_md5subsum=as.data.frame(matrix(NA_character_,nrow=nrow(proj@reads),ncol=1),stringsAsFactors=FALSE)
      colnames(proj@reads_md5subsum) <- "md5subsum"
    }else{
      proj@reads_md5subsum=as.data.frame(matrix(NA_character_,nrow=nrow(proj@reads),ncol=2),stringsAsFactors=FALSE)
      colnames(proj@reads_md5subsum) <- c("md5subsum1","md5subsum2")
    }
  }

  # ----------------------------------- PARSE THE SNP FILE ------------------------------------

  if(!is.null(snpFile)){
    if(!file.exists(snpFile)){stop("Cannot open ",snpFile,call.=FALSE)}
    proj@snpFile=tools::file_path_as_absolute(snpFile)
  }else{proj@snpFile=NA_character_}


  # ---------------------------------------- PARSE THE AUXILIARY FILE -------------------------------------
  if(!is.null(auxiliaryFile)){
    if(proj@samplesFormat == "bam"){stop("The option 'auxiliaryFile' is not supported for bam input files",call.=FALSE)}
    if(!is.na(proj@snpFile)){stop("The option 'auxiliaryFile' is not supported in allelic mode",call.=FALSE)}
    if(!file.exists(auxiliaryFile)){stop("Cannot open ",auxiliaryFile,call.=FALSE)}
    auxiliaries <- read.delim(auxiliaryFile,header=TRUE,colClasses="character")

    if(nrow(auxiliaries)==0){stop(auxiliaryFile," is either empty or there is a header missing",call.=FALSE)}

    # get absolute path of the auxiliary.txt file
    auxFileAbsolutePath <- dirname(tools::file_path_as_absolute(auxiliaryFile))

    if(ncol(auxiliaries)==2){
      if(all(colnames(auxiliaries) == c("FileName","AuxName"))){
        # this is a valid auxiliaries file. check if listed files exist
        for(i in 1:nrow(auxiliaries)){
          pathRet <- pathAsAbsoluteRedirected(auxiliaries$FileName[i],auxFileAbsolutePath)
          if(!is.null(pathRet)){
            if(auxiliaries$AuxName[i]==""){stop(auxiliaries$FileName[i]," listed in ",auxiliaryFile," has no name",call.=FALSE)}
            auxiliaries$FileName[i] <- pathRet
          }else{stop(auxiliaries$FileName[i]," listed in ",auxiliaryFile," does not exist",call.=FALSE)}
        }
        # test the file extensions
        allAuxFileExts <- unique(consolidateFileExtensions(auxiliaries$FileName))
        if(!all(consolidateFileExtensions(auxiliaries$FileName)=="fasta")){stop("All auxiliary files need to have a fasta extension (fa,fasta,fna)",call.=FALSE)}

        if(length(unique(auxiliaries$AuxName))!=nrow(auxiliaries)){stop("All auxiliary files need to have a unique AuxName",call.=FALSE)}

        # store the auxiliary files
        proj@aux <- auxiliaries
      }else{stop(auxiliaryFile," should contain column names FileName,AuxName",call.=FALSE)} # incorrect format
    }else{stop(auxiliaryFile," should be a tab delimited file with 2 columns",call.=FALSE)}

    # Fill data into auxAlignments
    auxAlignments <- data.frame(matrix(NA_character_,nrow=nrow(auxiliaries),ncol=nrow(proj@reads)),stringsAsFactors=FALSE)
    rownames(auxAlignments) <- auxiliaries$AuxName
    colnames(auxAlignments) <- proj@reads$SampleName
    proj@auxAlignments <- auxAlignments

  }



  # ----------------------------------- PARSE BISULFITE PARAMETER -----------------------------------

  if(bisulfite %in% c("no","dir","undir")){
    proj@bisulfite <- bisulfite
    if(bisulfite != "no" && !(proj@paired %in% c("no","fr"))){
      stop("Bisulfite paired-end mode only supports pair orientation 'fr'",call.=FALSE)
    }
  }else{stop("Bisulfite mode only supports 'no', 'dir' and 'undir'",call.=FALSE)}

  # ----------------------------------- PARSE MAXHITS PARAMETER --------------------------------------
  proj@maxHits <- maxHits

  #------------------------------------ PARSE THE PROJECT NAME ----------------------------------
  if(is.null(projectName)){
    proj@projectName <- NA_character_
  }else{
    if(length(projectName)==1){
      proj@projectName <- projectName
    }else{stop("The projectName should contain only a single entry",call.=FALSE)}
  }

  # ---------------------------------- PARSE THE BAMFILE DIRECTORY --------------------------
  if(is.null(alignmentsDir)){
    proj@alignmentsDir <- NA_character_
  }else{
     alignmentsDir <- sub("(\\\\|/)$","",alignmentsDir)  # for windows systems
    if(file.exists(alignmentsDir)){
      # it's a directory or a file
      if(file.info(alignmentsDir)$isdir){
        proj@alignmentsDir <- tools::file_path_as_absolute(alignmentsDir)
      }else{stop("alignmentsDir ",alignmentsDir, " is not a directory",call.=FALSE)}
    }else{stop("alignmentsDir ",alignmentsDir, " does not exist",call.=FALSE)}
  }

  # ---------------------------------- PARSE THE LIB.LOC DIRECTORY --------------------------
  if(is.null(lib.loc)){
    proj@lib.loc <- NA_character_
  }else{
     lib.loc <- sub("(\\\\|/)$","",lib.loc)  # for windows systems
    if(file.exists(lib.loc)){
      # it's a directory or a file
      if(file.info(lib.loc)$isdir){
        proj@lib.loc <- tools::file_path_as_absolute(lib.loc)
      }else{stop("lib.loc ",lib.loc, " is not a directory",call.=FALSE)}
    }else{stop("lib.loc ",lib.loc, " does not exist",call.=FALSE)}
  }

  # ---------------------------------- PARSE THE CACHE DIRECTORY --------------------------

  if(is.null(cacheDir)){
    proj@cacheDir <- NA_character_
  }else{
    cacheDir <- sub("(\\\\|/)$","",cacheDir)  # for windows systems
    if(file.exists(cacheDir)){
      # it's a directory or a file
      if(file.info(cacheDir)$isdir){
        # if not already present, create a subdirectory in the cacheDir, this prevent collisions
        # when the the same cacheDir is being used by multiple users
        proj@cacheDir <- file.path(tools::file_path_as_absolute(cacheDir),basename(tempdir()))

        if(!file.exists(proj@cacheDir)){
          if(!dir.create(path=proj@cacheDir, showWarnings=FALSE)){
            stop("No permissions to write in the cacheDir: ",proj@cacheDir,call.=FALSE)
          }
        }
      }else{stop("cacheDir ",cacheDir, " is not a directory",call.=FALSE)}
    }else{stop("cacheDir ",cacheDir, " does not exist",call.=FALSE)}
  }


  # --------------- SET THE ALIGNERMODE ID AND LOAD THE ALIGNER PACKAGE IF REQUIRED --------------------

  supportedAligners <- c("Rbowtie", "Rhisat2")
  if(aligner %in% supportedAligners){
    proj@aligner <- aligner
  }else{stop("The specified aligner is unknown, please select one of the following: ",paste(supportedAligners,collapse=","),call.=FALSE)}

  if((proj@samplesFormat %in% c("fasta","fastq")) & (proj@bisulfite=="no")){
    alnModeID <- proj@aligner
  }else if((proj@samplesFormat %in% c("fasta","fastq")) & (proj@bisulfite!="no")){
    if(aligner == "Rhisat2"){
      stop("Bisulfite alignment mode is not available for Rhisat2")
    }
    alnModeID <- paste(proj@aligner,"CtoT",sep="")
  }else if((proj@samplesFormat %in% c("csfasta","csfastq")) & proj@bisulfite=="no"){
    if(aligner == "Rhisat2"){
      stop("Color space alignment mode is not available for Rhisat2")
    }
    alnModeID <- paste(proj@aligner,"Cs",sep="")
  }else if((proj@samplesFormat %in% c("csfasta","csfastq")) & proj@bisulfite!="no"){
    stop("Bisulfite alignment mode is not available for color space reads")
  }else if(proj@samplesFormat == "bam"){
    alnModeID <- NA_character_
    proj@aligner <- NA_character_
  }else{stop("Fatal error 2340923")}

  proj@alnModeID <- alnModeID

  # load the aligner package in the case where there are missing alignments (genomic or aux)
  if(any(is.na(proj@alignments$FileName)) | any(is.na(proj@auxAlignments))){

    pkgname <- aligner

    # these test are not needed while Rbowtie is in "Depends"
    #if(!(pkgname %in% installed.packages()[,"Package"])){stop(pkgname, " package is required for the alignments but not installed on this system",call.=FALSE)}
    #if(!require(pkgname, character.only=TRUE, quietly=TRUE)){stop("Fatal error 340954")}
    #pkg_version <- as.character(packageVersion(pkgname))
    #QuasR_suggests <- unlist(strsplit(installed.packages()['QuasR','Suggests'], ","))
    #QuasR_suggests_pkg <- grep(pkgname, QuasR_suggests, value=T)
  }

  #------------------------------------ PARSE SPLICED ALIGNMENT PARAMETER ---------------------------
  proj@splicedAlignment <- splicedAlignment
  if(proj@splicedAlignment & (proj@bisulfite!="no")){stop("The spliced alignment mode is not supported for bisulfite samples")}
  if(proj@aligner == "Rbowtie" && proj@splicedAlignment && !(proj@paired %in% c("no","fr"))){stop("The spliced alignment mode with Rbowtie only supports the pair orientation 'fr'")}

  #------------------------------------ PARSE THE ALIGNMENT PARAMETERS ----------------------------------
  if (is.na(proj@aligner)) {
    proj@alignmentParameter <- ""
  } else {
    if(is.null(alignmentParameter)){
      if(!proj@splicedAlignment){
        if(proj@aligner == "Rbowtie"){  # bowtie
          # Test for the case where no merge reorder is going to be executed later on. In that case maxhits needs to
          # to be reinforced by bowtie.
          if((proj@bisulfite == "no") && (is.na(proj@snpFile))){
            proj@alignmentParameter <- paste("-m",proj@maxHits,"--best --strata")
          }else{
            proj@alignmentParameter <- paste("-k",proj@maxHits+1,"--best --strata")
          }
          # For the allelic case, ignore qualities. Anyways the assignment to the R or A genome is based on sequence.
          if((proj@samplesFormat == "fasta") || !is.null(snpFile)){
            proj@alignmentParameter <- paste(proj@alignmentParameter,"-v 2")
          }
          if(proj@paired != "no"){
            proj@alignmentParameter <- paste(proj@alignmentParameter,"--maxins 500")
          }
        }else{  # hisat2
          if (is.na(proj@snpFile)) {
            proj@alignmentParameter <- paste("-k",proj@maxHits+1)
          } else {
            # XV tag is not added to singly aligned reads and will make qCount fail
            proj@alignmentParameter <- paste("-k",proj@maxHits+1,"--no-mixed --no-discordant")
          }
        }
      }else{
        if(proj@aligner == "Rbowtie") {  # spliced, bowtie
          if(is.na(proj@snpFile)){
            proj@alignmentParameter <- "-max_intron 400000 -min_intron 20000 -max_multi_hit 10 -selectSingleHit TRUE -seed_mismatch 1 -read_mismatch 2 -try_hard yes"
          }else{
            proj@alignmentParameter <- "-max_intron 400000 -min_intron 20000 -max_multi_hit 10 -selectSingleHit FALSE -seed_mismatch 1 -read_mismatch 2 -try_hard yes"
          }
        } else{  # spliced, hisat2
          if (!is.na(proj@geneAnnotation)) {
            proj@alignmentParameter <- paste("-k",proj@maxHits+1,"--known-splicesite-infile",paste0(proj@geneAnnotation,".SpliceSites.txt"))
          } else {
            proj@alignmentParameter <- paste("-k",proj@maxHits+1)
          }
          if (!is.na(proj@snpFile)) {
            # XV tag is not added to singly aligned reads and will make qCount fail
            proj@alignmentParameter <- paste(proj@alignmentParameter,"--no-mixed --no-discordant")
          }
        }
      }
    }else{
      if(length(alignmentParameter)==1){
        proj@alignmentParameter <- alignmentParameter
      }else{stop("The alignmentParameter should contain only a single character string",call.=FALSE)}
    }
  }

  # ---------------------------------- CREATE .fai AND .md5 FILES --------------------------

  # create fasta indices (.fai) files for all the reference sequences (genome & aux)
  # create all the .md5 files necessary to identify preexisting bam files
  createReferenceSequenceIndices(proj)


  # --------------------- SEARCH FOR BAM FILES THAT HAVE BEEN CREATED PREVIOUSLY ----------------------

  # search for genomic alignments
  for(i in 1:nrow(proj@reads)){
    if(is.na(proj@alignments$FileName[i])){
       projBamInfo <- bamInfoOnlyBaseName(qProjectBamInfo(proj,i))

       if(is.na(proj@alignmentsDir)){bamDir <- dirname(proj@reads[i,1])}else{bamDir <- proj@alignmentsDir}
       samplePrefix <- basename(tools::file_path_sans_ext(proj@reads[i,1],compression=TRUE))
       filesInBamDir <- list.files(bamDir, pattern=".bam$")
       bamFilesToInspect <- filesInBamDir[sub("\\_[^\\_]+.bam$","",filesInBamDir)==samplePrefix]
       bamTxtFilesToInspect <- paste(file.path(bamDir,bamFilesToInspect),"txt",sep=".")
       bamTxtFilesToInspectExist <- bamTxtFilesToInspect[file.exists(bamTxtFilesToInspect)]
       bamFilesToInspectWithTxt <- file.path(bamDir,bamFilesToInspect[file.exists(bamTxtFilesToInspect)])

       compatibleBamFileInd=NULL
       if(length(bamTxtFilesToInspectExist)>0){
         for(m in 1:length(bamTxtFilesToInspectExist)){
           bamInfoT_DF <- read.delim(bamTxtFilesToInspectExist[m],header=FALSE,row.names=1,stringsAsFactors=FALSE)
           bamInfoT <- bamInfoT_DF[,1]
           names(bamInfoT) <- rownames(bamInfoT_DF)
           bamInfoT <- bamInfoOnlyBaseName(bamInfoT)

           # compare the actual parameters to the one from the bam file on disk
           ## Exclude slots related to the geneAnnotation if the aligner is Rbowtie
           if (proj@aligner == "Rbowtie") {
             projBamInfo <- projBamInfo[!(names(projBamInfo) %in%
                                            c("geneAnnotation", "geneAnnotationFormat",
                                              "geneAnnotation.md5"))]
             bamInfoT <- bamInfoT[!(names(bamInfoT) %in%
                                      c("geneAnnotation", "geneAnnotationFormat",
                                        "geneAnnotation.md5"))]
           }
           if(identical(projBamInfo,bamInfoT)){
             compatibleBamFileInd[length(compatibleBamFileInd)+1] <- m
           }
         }
         if(length(compatibleBamFileInd)>1){
           for(k in 1:length(compatibleBamFileInd)){message(bamFilesToInspectWithTxt[k])}
           stop("Multiple bam files exist with same alignment parameters (see above list). QuasR is unable to decide which one to use. Please delete manually the respective bam files",call.=FALSE)
         }
         if(length(compatibleBamFileInd)==1){
           proj@alignments$FileName[i] <- bamFilesToInspectWithTxt[compatibleBamFileInd]
         }
       }
    }
  }

  # search for aux alignments
  if(nrow(proj@auxAlignments)>0){
    for(i in 1:ncol(proj@auxAlignments)){
      for(j in 1:nrow(proj@auxAlignments)){
        projBamInfo <- bamInfoOnlyBaseName(qProjectBamInfo(proj,i,j))

        if(is.na(proj@auxAlignments[j,i])){
          if(is.na(proj@alignmentsDir)){bamDir <- dirname(proj@reads[i,1])}else{bamDir <- proj@alignmentsDir}
          samplePrefix <- basename(tools::file_path_sans_ext(proj@reads[i,1],compression=TRUE))
          filesInBamDir <- list.files(bamDir, pattern=".bam$")
          bamFilesToInspect <- filesInBamDir[sub("\\_[^\\_]+.bam$","",filesInBamDir)==samplePrefix]
          bamTxtFilesToInspect <- paste(file.path(bamDir,bamFilesToInspect),"txt",sep=".")
          bamTxtFilesToInspectExist <- bamTxtFilesToInspect[file.exists(bamTxtFilesToInspect)]
          bamFilesToInspectWithTxt <- file.path(bamDir,bamFilesToInspect[file.exists(bamTxtFilesToInspect)])

          compatibleBamFileInd=NULL
          if(length(bamTxtFilesToInspectExist)>0){
            for(m in 1:length(bamTxtFilesToInspectExist)){
              bamInfoT_DF <- read.delim(bamTxtFilesToInspectExist[m],header=FALSE,row.names=1,stringsAsFactors=FALSE)
              bamInfoT <- bamInfoT_DF[,1]
              names(bamInfoT) <- rownames(bamInfoT_DF)
              bamInfoT <- bamInfoOnlyBaseName(bamInfoT)

              # compare the actual parameters to the one from the bam file on disk
              if (proj@aligner == "Rbowtie") {
                projBamInfo <- projBamInfo[!(names(projBamInfo) %in%
                                               c("geneAnnotation", "geneAnnotationFormat",
                                                 "geneAnnotation.md5"))]
                bamInfoT <- bamInfoT[!(names(bamInfoT) %in%
                                         c("geneAnnotation", "geneAnnotationFormat",
                                           "geneAnnotation.md5"))]
              }
              if(identical(projBamInfo,bamInfoT)){
                compatibleBamFileInd[length(compatibleBamFileInd)+1] <- m
              }
            }
            if(length(compatibleBamFileInd)>1){
              for(k in 1:length(compatibleBamFileInd)){message(bamFilesToInspectWithTxt[k])}
              stop("Multiple bam files exist with same alignment parameters (see above list). QuasR is unable to decide which one to use. Please delete manually the respective bam files",call.=FALSE)
            }
            if(length(compatibleBamFileInd)==1){
              proj@auxAlignments[j,i] <- bamFilesToInspectWithTxt[compatibleBamFileInd]
            }
          }
        }
      }
    }
  }

  return(proj)
}



# create search index (.fai) files for all the reference sequences that are
# going to be used for the alignments (genome & aux)
# do nothing if the fai files exist already
# create an .md5 file that contains the md5sum of the referece sequences
# if snp file exists, calculate md5 and store it in a (.md5) file
createReferenceSequenceIndices <- function(proj){
  # create fasta index for the genome if it is not a BSgenome and if the .fai file is not present
  if(proj@genomeFormat=="file"){
    if(!file.exists(paste(proj@genome,"fai",sep="."))){
      # test if the sequence file is a valid fasta file using the fasta.seqlengths() function from biostrings
      # this is necessary because indexFa() from Rsamtools would crash R if it is passed an invalid fasta file
      message(paste("Creating .fai file for:",proj@genome))
      result <- try(fasta.seqlengths(proj@genome))
      if(class(result)=="try-error"){
        stop("The fasta file ",proj@genome," is not a valid fasta file",call.=FALSE)
      }
      faInfoNames <- sub("\\s+.+","",names(result),perl=TRUE) # remove everything after the white space after the id (not done by fasta.seqlengths)
      if(length(unique(faInfoNames)) != length(faInfoNames)){
        stop("Sequence names in the file: ",proj@genome," are not unique",call.=FALSE)
      }
      if(class(try(indexFa(proj@genome)))=="try-error"){
        stop("Cannot write into the directory where ",proj@genome," is located. Make sure you have the right permissions",call.=FALSE)
      }
    }else{
      faiSeqNames <- as.character(seqnames(scanFaIndex(proj@genome)))
      if(length(unique(faiSeqNames)) != length(faiSeqNames)){
        stop("Sequence names in the file: ",proj@genome," are not unique (information extracted from the .fai file",call.=FALSE)
      }
    }
    # create md5 sum file
    if(!file.exists(paste(proj@genome,"md5",sep="."))){
      write.table(tools::md5sum(proj@genome),paste(proj@genome,"md5",sep="."),sep="\t",quote=FALSE,col.names=FALSE,row.names=FALSE)
    }
  }
  # create fasta index for the auxilliaries if not present already
  if(nrow(proj@aux)>0){
    for(i in 1:nrow(proj@aux)){
      if(!file.exists(paste(proj@aux$FileName[i],"fai",sep="."))){
        # test if the sequence file is a valid fasta file using the fasta.seqlengths() function from biostrings
        # this is necessary because indexFa() from Rsamtools would crash R if it is passed an invalid fasta file (R version 2.14.1)
        result <- try(fasta.seqlengths(proj@aux$FileName[i]))
        if(class(result)=="try-error"){
          stop("The fasta file ",proj@aux$FileName[i]," is not a valid fasta file",call.=FALSE)
        }
        faInfoNames <- sub("\\s+.+","",names(result),perl=TRUE) # remove everything after the white space after the id (not done by fasta.seqlengths)
        if(length(unique(faInfoNames)) != length(faInfoNames)){
          stop("Sequence names in the file: ",proj@aux$FileName[i]," are not unique",call.=FALSE)
        }
        if(class(try(indexFa(proj@aux$FileName[i])))=="try-error"){
          stop("Cannot write into the directory where ",proj@aux$FileName[i]," is located. Make sure you have the right permissions",call.=FALSE)
        }
      }

      if(!file.exists(paste(proj@aux$FileName[i],"md5",sep="."))){
        # create md5 sum file
        write.table(tools::md5sum(proj@aux$FileName[i]),paste(proj@aux$FileName[i],"md5",sep="."),sep="\t",quote=FALSE,col.names=FALSE,row.names=FALSE)
      }
    }
  }

  #create md5 sum file for the snp file
  if(!is.na(proj@snpFile)){
    if(!file.exists(paste(proj@snpFile,"md5",sep="."))){
      write.table(tools::md5sum(proj@snpFile),paste(proj@snpFile,"md5",sep="."),sep="\t",quote=FALSE,col.names=FALSE,row.names=FALSE)
    }
  }

  # create md5 sum file for the geneAnnotation
  if (!is.na(proj@geneAnnotation)) {
    if (!file.exists(paste0(proj@geneAnnotation, ".md5"))) {
      write.table(tools::md5sum(proj@geneAnnotation), paste0(proj@geneAnnotation, ".md5"),
                  sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE)
    }
  }
}


# returns information for one bam file in qProject
# sampleNr specifies the sample for which the information is compiled
# if auxNr is not NULL, it returns information about aux bam file
qProjectBamInfo <- function(proj,sampleNr,auxNr=NULL){

  if(sampleNr > nrow(proj@reads)){stop("project has no sample ",sampleNr)}
  if(!is.null(auxNr)){
    if(auxNr > nrow(proj@aux)){stop("project has no aux ",auxNr)}
  }

  alnInfo <- NULL
  if(proj@paired=="no"){
    alnInfo["reads.FileName1"]=proj@reads$FileName[sampleNr]
    alnInfo["reads.FileName2"]=NA
    alnInfo["reads.md5subsum1"]=proj@reads_md5subsum$md5subsum[sampleNr]
    alnInfo["reads.md5subsum2"]=NA
  }else{
    alnInfo["reads.FileName1"]=proj@reads$FileName1[sampleNr]
    alnInfo["reads.FileName2"]=proj@reads$FileName2[sampleNr]
    alnInfo["reads.md5subsum1"]=proj@reads_md5subsum$md5subsum1[sampleNr]
    alnInfo["reads.md5subsum2"]=proj@reads_md5subsum$md5subsum2[sampleNr]
  }

  alnInfo["samplesFormat"]=proj@samplesFormat
  alnInfo["genome"]=proj@genome
  if(proj@genomeFormat=="file"){
    alnInfo["genome.md5"]=as.character(read.delim(paste(proj@genome,"md5",sep="."),header=FALSE,colClasses="character")[1,1])
  }else{
    alnInfo["genome.md5"]=NA
  }
  alnInfo["genomeFormat"]=proj@genomeFormat
  alnInfo["aux"]=NA
  alnInfo["aux.md5"]=NA
  alnInfo["aligner"]=proj@aligner
  if(!is.na(proj@aligner)){
    alnInfo["aligner.version"] = as.character(packageVersion(proj@aligner))
  }else{
    alnInfo["aligner.version"] = NA
  }
  alnInfo["maxHits"]=proj@maxHits
  alnInfo["paired"]=proj@paired
  alnInfo["splicedAlignment"]=proj@splicedAlignment
  alnInfo["snpFile"]=proj@snpFile
  alnInfo["snpFile.md5"]=NA
  alnInfo["bisulfite"]=proj@bisulfite
  alnInfo["alignmentParameter"]=proj@alignmentParameter
  alnInfo["geneAnnotation"] <- proj@geneAnnotation
  alnInfo["geneAnnotationFormat"] <- proj@geneAnnotationFormat
  alnInfo["geneAnnotation.md5"] <- NA
  alnInfo["QuasR.version"] = as.character(packageVersion('QuasR'))

  if(!is.null(auxNr)){
    alnInfo["aux"]=proj@aux$FileName[auxNr]
    alnInfo["aux.md5"]=as.character(read.delim(paste(proj@aux$FileName[auxNr],"md5",sep="."),header=FALSE,colClasses="character")[1,1])
  }

  if(!is.na(proj@snpFile)){
    alnInfo["snpFile.md5"]=as.character(read.delim(paste(proj@snpFile,"md5",sep="."),header=FALSE,colClasses="character")[1,1])
  }
  if (!is.na(proj@geneAnnotation)) {
    alnInfo["geneAnnotation.md5"] <- as.character(read.delim(paste0(proj@geneAnnotation, ".md5"),
                                                             header = FALSE,
                                                             colClasses = "character")[1, 1])
  }

  return(alnInfo)
}


# replace full file paths in bamInfo by base name
# remove the aligner and QuasR version. this allows using bam files generated with an older QuasR version
bamInfoOnlyBaseName <- function(bamInfo){
  bamInfo["reads.FileName1"] <- basename(bamInfo["reads.FileName1"])
  bamInfo["reads.FileName2"] <- basename(bamInfo["reads.FileName2"])
  bamInfo["genome"] <- basename(bamInfo["genome"])
  bamInfo["aux"] <- basename(bamInfo["aux"])
  bamInfo["snpFile"] <- basename(bamInfo["snpFile"])
  bamInfo["geneAnnotation"] <- basename(bamInfo["geneAnnotation"])

  if("aligner.version" %in% names(bamInfo)){
    bamInfo <- bamInfo[!(names(bamInfo) %in% "aligner.version")]
  }

  if("QuasR.version" %in% names(bamInfo)){
    bamInfo <- bamInfo[!(names(bamInfo) %in% "QuasR.version")]
  }

  return(bamInfo)
}

# helper function that converts filepaths to absolute filepaths in the case where the working
# directory needs to be temporarily redirected. This is needed if e.g. the samples.txt file
# is not located in the working directory.
# returns the absolute path if the file exists
# returns NULL if the file does not exist
pathAsAbsoluteRedirected <- function(fileName,redirectPath){
  curwd <- getwd() # store the original working directory required to jump back
  on.exit(setwd(curwd)) # make sure that it will jump back in a case of error or not

  setwd(redirectPath) # jump to the redirected position
  if(file.exists(fileName)){
    return(tools::file_path_as_absolute(fileName))
  }else{return(NULL);}
}

# helper function that returns the temporary directory given the cacheDir (from qProject)
# if the user specified cacheDir, then it returns it. but if the user did not specify a cacheDir
# it returns the R temporary directory (which can be different for different machines)
resolveCacheDir <- function(cacheDir){
  if(is.na(cacheDir))
    return(tempdir())
  else
    return(cacheDir)
}


# given a vector of filenames, the function determines the final format
# taking into account all the different types of extensions for
# sequence files. Throughs an error if one ore more samples do not contain
# a valid file extension
determineSamplesFormat <- function(filename){
  fileExtension <- consolidateFileExtensions(filename,compressed=TRUE)

  validExtsSel <- fileExtension %in% c("fasta","fastq","bam","csfasta","csfastq")

  if(all(validExtsSel)){
    if(length(unique(fileExtension))==1){
      return(unique(fileExtension))
    }else{stop("Mixed sample file formats are not allowed: ",paste(unique(fileExtension),collapse=","),call.=FALSE)}
  }else{
    stop(filename[!validExtsSel][1]," does not have a valid file extension (fa,fna,fasta,fq,fastq,bam,csfasta,csfastq)",call.=FALSE)
  }
}

Try the QuasR package in your browser

Any scripts or data that you put into this service are public.

QuasR documentation built on Nov. 8, 2020, 8:31 p.m.