R/createIndex-functions.R
In QuasR: Quantify and Annotate Short Reads in R

# build an index based on a given BSgenome, create a package with the files and install it on the system
buildIndexPackage <- function(genome,aligner,alnModeID,cacheDir,lib.loc){
  indexPackageName <- paste(genome,alnModeID,sep=".")

  lib.locTemp <- lib.loc
  if(is.na(lib.loc)){lib.locTemp<-NULL;} # this is a way to convert NA to NULL needed for install.packages
  
  # Create the index and install it if it is not yet installed on the system
  if(!(indexPackageName %in% installed.packages(lib.loc = lib.locTemp)[,'Package'])){
    genomeObj <- get(genome) # access the BSgenome
    fastaFilepath <- BSgenomeSeqToFasta(genomeObj, tempfile(tmpdir=cacheDir, fileext=".fa"))  # flush the BSgenome to disk
    on.exit(unlink(fastaFilepath))

    # create the package (without installing it yet). if it is already in the temp dir, keep it. It might contain an index
    # calculated in a previous round where the installation was not successful (due to permissions)
    if(!file.exists(file.path(cacheDir,indexPackageName))){
      seedList <- createSeedList(genomeObj, aligner, indexPackageName)
      templatePath <- system.file("AlignerIndexPkg-template", package="QuasR")
      Biobase::createPackage(indexPackageName, cacheDir, templatePath, seedList, quiet=TRUE)
    }
    # create the index files
    buildIndex(fastaFilepath,file.path(cacheDir,indexPackageName,"inst","alignmentIndex"),alnModeID,cacheDir)

    # install the package
    install.packages(file.path(cacheDir,indexPackageName), repos=NULL, dependencies=FALSE, type="source", lib=lib.locTemp)

    if(indexPackageName %in% installed.packages(lib.loc = lib.locTemp)[,'Package']){
      # package installation was successful, clean up
      unlink(file.path(cacheDir,indexPackageName),recursive=TRUE)
    }else{"Fatal error 2309420"}
  }
}


buildIndexSNP <- function(snpFile,indexPath,genome,genomeFormat,alnModeID,cacheDir,checkMD5=FALSE){

  fastaOutFileR <- paste(snpFile,basename(genome),"R","fa",sep=".")
  fastaOutFileA <- paste(snpFile,basename(genome),"A","fa",sep=".")
 
  if(!file.exists(fastaOutFileR) | !file.exists(fastaOutFileA)){
    # read in the SNPs
    message(paste("Reading and processing the SNP file:",snpFile))
    snps <- read.table(snpFile,colClasses=c("factor","numeric","character","character"))
    colnames(snps) <- c("chrom","pos","ref","alt")
 
    snps[,3] <- toupper(snps[,3]) # convert ref nucleotides to upper case if not already the case
    snps[,4] <- toupper(snps[,4]) # convert alt nucleotides to upper case if not already the case

    # check if there are only regular nucleotides (ACGT) in the snp file
    if(!all(unique(c(snps[,3],snps[,4])) %in% c("A","C","G","T"))){stop("There are non-regular nucleoides in snpFile. Only ACGT are allowed.",call.=FALSE)}

    snpsL <- split(snps,snps[,1]) # split SNPs accoring to chromosome (first column)

    # check for duplicate snp entries (not allowed)
    if(sum(sapply(snpsL,function(x){sum(duplicated(x[,2]))})) > 0){stop("There are duplicate SNP positions in snpFile.",call.=FALSE)}

    if(genomeFormat=="file"){
      idx <- scanFaIndex(genome)
      allChrs <- as.character(seqnames(idx))
    }else{
      genomeObj <- get(genome) # access the BSgenome
      allChrs <- seqnames(genomeObj)
    }
    if(!all(names(snpsL) %in% allChrs)){stop("The snpFile contains chromosomes that are not present in the genome",call.=FALSE)}
  }

  for(j in 1:2){
    fastaOutFile <- c(fastaOutFileR,fastaOutFileA)[j]

    if(!file.exists(fastaOutFile)){
      append <- FALSE
      message(paste("Creating the genome fasta file containing the SNPs:",fastaOutFile))
      for(i in 1:length(allChrs)){
        if(genomeFormat=="file"){
          seq <- scanFa(genome, idx[i])[[1]]
        }else{
          if(is.null(masks(genomeObj[[allChrs[i]]]))){
            seq <- genomeObj[[allChrs[i]]]
          }else{
            seq <- injectHardMask(genomeObj[[allChrs[i]]], letter="N")
          }
        }
        snpsLC <- snpsL[[allChrs[i]]]
        # check if the ref nucleotide in snpFile matches the genome
        if(!is.null(snpsLC)){
          # inject the SNPs
          seqSNP <- replaceLetterAt(seq,snpsLC[,2],snpsLC[,2+j])
          seqSNP_SS <- DNAStringSet(seqSNP)
        }else{
          seqSNP_SS <- DNAStringSet(seq)
        }
        names(seqSNP_SS) <- allChrs[i]

        writeXStringSet(seqSNP_SS, filepath=fastaOutFile, format="fasta", append=append)
        append <- TRUE
      }
    }
  }

  # create .fai file for the snp genome
  for(fastaOutFile in c(fastaOutFileR,fastaOutFileA)){
    if(!file.exists(paste(fastaOutFile,"fai",sep="."))){
      message(paste("Creating a .fai file for the snp genome:",fastaOutFile))
      if(class(try(indexFa(fastaOutFile)))=="try-error"){
        stop("Cannot write into the directory where ",fastaOutFile," is located. Make sure you have the right permissions",call.=FALSE)
      }
    }
  }

  # create The two indices
  buildIndex(fastaOutFileR,paste(snpFile,basename(genome),"R","fa",alnModeID,sep="."),alnModeID,cacheDir)
  buildIndex(fastaOutFileA,paste(snpFile,basename(genome),"A","fa",alnModeID,sep="."),alnModeID,cacheDir)

}



# build an index based on a given fasta file with one or more sequences.
buildIndex <- function(seqFile,indexPath,alnModeID,cacheDir,checkMD5=FALSE){

  # check if the directory exists but contains no ref_md5Sum.txt file. This means that a last index builder call did not
  # finish properly. Delete the directory containing the partial index
  if(file.exists(indexPath)){
    if(!("ref_md5Sum.txt" %in% dir(indexPath))){
       message(paste("Deleting an incomplete index at:",indexPath))
       file.remove(dir(indexPath,full.names=TRUE))
       unlink(indexPath,recursive=TRUE)
    }else{
      # if checkMD5==TRUE, check consistency between the sequences and the index
      # delete index in the case of inconsistency
      if(checkMD5){
        MD5_fromIndexTab <- read.delim(file.path(indexPath,"ref_md5Sum.txt"),header=FALSE,colClasses="character")
        if(!all(dim(MD5_fromIndexTab)==c(1,1))){stop("The md5sum file does not have the right format: ",file.path(indexPath,"ref_md5Sum.txt"),call.=FALSE)}
        MD5_fromIndex <- MD5_fromIndexTab[1,1]
        MD5_fromSeq <- tools::md5sum(seqFile)
        if(MD5_fromIndex != MD5_fromSeq){
          message(paste("The sequence file",seqFile,"was changed. Updating the index: ",indexPath)) 
          file.remove(dir(indexPath,full.names=TRUE))
          unlink(indexPath,recursive=TRUE)
        }
      }
    }
  }
  if(!file.exists(indexPath)){
    # create the directory for the index
    if(!dir.create(indexPath)){stop("Cannot create the directory: ",indexPath,call.=FALSE)}

    # Create all the various indices
    message(paste("Creating an",alnModeID,"index for",seqFile))

    if(alnModeID=="Rbowtie"){
      ret <- buildIndex_Rbowtie(seqFile,indexPath)
    }else if(alnModeID=="RbowtieCtoT"){
      ret <- buildIndex_RbowtieCtoT(seqFile,indexPath,cacheDir)
    }else if(alnModeID=="RbowtieCs"){
      ret <- buildIndex_RbowtieCs(seqFile,indexPath)
    }else if(alnModeID=="Rhisat2"){
      ret <- buildIndex_Rhisat2(seqFile,indexPath)
    }else{stop("Fatal error 2374027")}

    if(ret==0){
      indexMD5 <- tools::md5sum(seqFile)
      write.table(indexMD5,file=file.path(indexPath,"ref_md5Sum.txt"),row.names=FALSE,col.names=FALSE,sep="\t",quote=FALSE)
      message("Finished creating index")
    }else{
      # the execution of index-build failed, delete the directory.
      file.remove(dir(indexPath,full.names=TRUE))
      unlink(indexPath,recursive=TRUE)
      stop("The execution of the index builder failed. No index was created",call.=FALSE)
    }
  }
}

# build index for Rhisat2, base space, non-bisulfite converted
buildIndex_Rhisat2 <- function(seqFile,indexPath){
  indexFullPath <- file.path(indexPath,"hisat2Index")
  
  ret <- system2(file.path(system.file(package="Rhisat2"),"hisat2-build"),c(shQuote(seqFile),shQuote(indexFullPath)), stdout=TRUE, stderr=TRUE)
  if(!(grepl("^Total time for call to driver", ret[length(ret)]))){
    ret <- 1
  }else{
    ret <- 0
  }
  return(ret)
}

# generate splice site file
buildSpliceSiteFile <- function(geneAnnotation, geneAnnotationFormat) {
  if (file.exists(paste0(geneAnnotation, ".SpliceSites.txt"))) {
    # if the SpliceSites.txt file exists, but not the md5sum file, remove the former
    if (!file.exists(paste0(geneAnnotation, ".SpliceSites.txt.md5"))) {
      message("Deleting an incomplete SpliceSites.txt file at: ",
              paste0(geneAnnotation, ".SpliceSites.txt"))
      unlink(paste0(geneAnnotation, ".SpliceSites.txt"))
    } else {
      # both SpliceSites.txt and md5 file exist
      md5FromFile <- read.delim(paste0(geneAnnotation, ".SpliceSites.txt.md5"),
                                header = FALSE, colClasses = "character")
      if (!all(dim(md5FromFile) == c(1, 1))) {
        stop("The md5sum file does not have the right format: ",
             paste0(geneAnnotation, ".SpliceSites.txt.md5"), call. = FALSE)
      }
      md5FromFile <- md5FromFile[1, 1]
      md5FromObj <- tools::md5sum(geneAnnotation)
      if (md5FromFile != md5FromObj) {
        message("The annotation file ", geneAnnotation,
                " was changed. Recreating the SpliceSites file.")
        unlink(paste0(geneAnnotation, ".SpliceSites.txt"))
      }
    }
  }
  if (!file.exists(paste0(geneAnnotation, ".SpliceSites.txt"))) {
      if (geneAnnotationFormat == "TxDb") {
          txdb <- AnnotationDbi::loadDb(geneAnnotation)
      } else if (geneAnnotationFormat == "gtf") {
          txdb <- suppressWarnings(
              GenomicFeatures::makeTxDbFromGFF(geneAnnotation, format = "auto")
          )
      } else {
          stop("Fatal error 81956293")
      }
      Rhisat2::extract_splice_sites(txdb, paste0(geneAnnotation, ".SpliceSites.txt"), 
                                    min_length = 5)
      md5FromObj <- tools::md5sum(geneAnnotation)
      write.table(md5FromObj, file = paste0(geneAnnotation, ".SpliceSites.txt.md5"),
                  row.names = FALSE, col.names = FALSE, sep = "\t", quote = FALSE)
  }
}

# build index for Rbowtie, base space, non-bisulfite converted
buildIndex_Rbowtie <- function(seqFile,indexPath){
  indexFullPath <- file.path(indexPath,"bowtieIndex")

  ret <- system2(file.path(system.file(package="Rbowtie"),"bowtie-build"),c(shQuote(seqFile),shQuote(indexFullPath)), stdout=TRUE, stderr=TRUE)
  if(!(grepl("^Total time for backward call to driver", ret[length(ret)]))){
    ret <- 1
  }else{
    ret <- 0
  }
 
  return(ret)
}

# build index for Rbowtie, base space, bisulfite converted
buildIndex_RbowtieCtoT <- function(seqFile,indexPath,cacheDir){
  indexFullPath <- file.path(indexPath,"bowtieIndex")

  # read the reference sequences
  idx <- scanFaIndex(seqFile)

  # create two temporary sequences files, C->T the plus strand and G->A for the minus strand
  outFilePlus <- tempfile(tmpdir=cacheDir, fileext=".fa")
  outFileMinus <- tempfile(tmpdir=cacheDir, fileext=".fa")

  on.exit(unlink(c(outFilePlus,outFileMinus)))

  # read one chromosome after the other, convert and write to disk
  append <- FALSE
  for(i in 1:length(idx)){
    seq <- scanFa(seqFile, idx[i])
    plus_strand <- chartr("C", "T", seq)
    minus_strand <- chartr("G", "A", seq)
    writeXStringSet(plus_strand, filepath=outFilePlus, format="fasta", append=append)
    writeXStringSet(minus_strand, filepath=outFileMinus, format="fasta", append=append)
    append <- TRUE
  }

  # execute bowtie twice to create the two indices
  ret1 <- system2(file.path(system.file(package="Rbowtie"),"bowtie-build"),c(shQuote(outFilePlus),shQuote(file.path(indexPath,"bowtieIndexCtoT"))), stdout=TRUE, stderr=TRUE)
  if(!(grepl("^Total time for backward call to driver", ret1[length(ret1)]))){
    ret1 <- 1
  }else{
    ret1 <- 0
  }

  ret2 <- system2(file.path(system.file(package="Rbowtie"),"bowtie-build"),c(shQuote(outFileMinus),shQuote(file.path(indexPath,"bowtieIndexGtoA"))), stdout=TRUE, stderr=TRUE)
  if(!(grepl("^Total time for backward call to driver", ret2[length(ret2)]))){
    ret2 <- 1
  }else{
    ret2 <- 0
  }
 
  if((ret1==0) & (ret2==0)){
    return(0)
  }else{
    return(1)
  }
}

# build index for Rbowtie, color space, non-bisulfite converted
buildIndex_RbowtieCs <- function(seqFile,indexPath){
  indexFullPath <- file.path(indexPath,"bowtieIndexCs")

  ret <- system2(file.path(system.file(package="Rbowtie"),"bowtie-build"),c(shQuote(seqFile),shQuote(indexFullPath),"-C"), stdout=TRUE, stderr=TRUE)
  if(!(grepl("^Total time for backward call to driver", ret[length(ret)]))){
    ret <- 1
  }else{
    ret <- 0
  }

  return(ret)
}


# flush all chromosomes of a BSgenome into a file
BSgenomeSeqToFasta <- function(bsgenome, outFile)
{
    if(!is(bsgenome, "BSgenome")){stop("The variable 'bsgenome' is not a BSgenome")}
    append <- FALSE
    for(chrT in seqnames(bsgenome)){
        if(is.null(masks(bsgenome[[chrT]])))
            chrSeq <- DNAStringSet(bsgenome[[chrT]])
        else
            chrSeq <- DNAStringSet(injectHardMask(bsgenome[[chrT]], letter="N"))
        names(chrSeq) <- chrT
        writeXStringSet(chrSeq, filepath=outFile, format="fasta", append=append)
        append <- TRUE
    }
    return(outFile)
}



createSeedList <- function(genome, aligner, indexPackageName)
{
    pv <- metadata(genome)$genome
    seed <- list(##package seeds
                 PKGTITLE=paste(aligner, "Index of", bsgenomeName(genome)),
                 PKGDESCRIPTION=paste(aligner, "Index of", bsgenomeName(genome)),
                 PKGVERSION="1.0",
                 DATE=format(Sys.time(), "%Y-%M-%d"),
                 AUTHOR="QuasR",
                 MAINTAINER="This package was automatically created <yourfault@somewhere.net>",
                 LIC=paste("see",bsgenomeName(genome)),
                 PKGDETAILS="Storing genome index for BSgenome which is needed for alignments.",
                 PKGEXAMPLES="No examples",

                 ##genome seeds
                 GENOMENAME=bsgenomeName(genome),
                 PROVIDER=provider(genome),
                 PROVIDERVERSION=ifelse(!is.null(pv) && is.character(pv) && length(pv) == 1L, pv, "not_available"),
                 RELEASEDATE=releaseDate(genome),
                 RELEASENAME="not_available",
                 ORGANISM=organism(genome),
                 SPECIES=commonName(genome),
                 SRCDATAFILES=bsgenomeName(genome),
                 ORGANISMBIOCVIEW=gsub(" ", "_", organism(genome)),

                 #aligner seeds
                 ALIGNER=aligner,
                 ALIGNERVERSION = as.character(packageVersion(aligner))
                 )

    return(seed)
}