RNASeqPipelineR: Construct Standardized pipelines for processing RNASeq data

Documented in AlignmentSTAR annotateSRA annotateUCSC annotationsFromSRX assignConfig buildReference concatenateFastq configure_project convertSRAtoFastQ createProject detectAspera devel_truncateData downloadSRA getAndConnectSRAdb getConfig getConfigDir getConfigFile getDataFromSRX getDir getExpressionSet loadImmportTables loadProject mergeData MiTCR pear pipelineReport QualityControl qualityRNASeQC readConfig RSEMAssembleExpressionMatrix RSEMCalculateExpression RSEMSummarizeMapping runFastQC saveConfig sequenceAlignmentTopHat setGenomeReference SRAOverview sumDuplicates summarizeDuplication summarizeFastQC

#' Simplify pipelining RNASeq data
#' 
#' RNASeqPipeline allows you to standardize the processing of your RNASeq data.
#' 
#' RNASeqPipeline simplifies the process of pipelining the processing of RNASeq
#' data by helping you organize the raw, intermediate and processed data in a consistent
#' manner. 
#' 
#' @docType package
#' @name RNASeqPipeline
#' @author Raphael Gottardo and Greg Finak
#' @import data.table
#' @import RSQLite
#' @import GEOquery
#' @import SRAdb
#' @import stringr
#' @import utils
#' @import stats
#' @import Biobase
#' @import biomaRt
#' @import parallel
NULL

## silence complaints about variables not found in calls that use non-standard evaluation
if(getRversion() >= "2.15.1") globalVariables(c(
                  'transcript_ids', # RSEMAssembleExpressionMatrix, BioCAnnotate
                  'transcript_id',  #BioCAnnotate
                  'entrez_id',
                  'gene_symbol',
                  'gene_id',
                  'record', #summarizeDuplication
                  'value',
                  'duplication level',
                  'totaldup',
                  'qresult', #summarizeFastQC
                  'result',
                  'sum_qresult',
                  'test'))
                  


dir.exists<-function (x)
{
  res <- file.exists(x) & file.info(x)$isdir
  setNames(res, x)
}

#' Create a new RNASeqPipeline project
#' 
#' Create the skeleton for a new RNASeqPipeline project.
#' 
#' createProject will create a new RNASeqPipeline 
#' project under directory 'project_name' in the path specified by 'path'.
#' The function creates the directory structure libraryd by RNASeqPipeline
#' within the new project directory, including locations for fastQ files, 
#' fastQC output, RSEM quantification, and optionally GEO and SRA files if the data
#' must be downloaded or linked to a GEO accession. Standard locations will also be provided 
#' to annotate the data utilizing the Immport schema. If the project exists, it will load the configuration 
#' if it can find it, otherwise it will proceed to re-configure the project.
#' 
#' @param project_name \code{character} name (ie, subdirectory) for project
#' @param path \code{character} Root directory for project
#' @param verbose \code{logical} Should verbose output be given?
#' @param load_from_immport \code{logical} Creates a 'Tab' directory for Immport tables if the data are loaded from Immport.
#' @return NULL
#' @export
#' @examples
#' # construct a projects skeleton in a new folder titled "myproject".
#' createProject("myproject",path=".")
#' createProject("myproject",path=".",verbose=TRUE)
#' \dontrun{
#' createProject("myproject",path=".",load_from_immport=TRUE) #won't work unless immport is set up
#' }
createProject <- function(project_name, path=".", verbose=FALSE, 
                          load_from_immport=FALSE){
  project_dir <- file.path(path,project_name)
  success<-FALSE
  oldwarn<-options("warn")$warn
  options("warn"=-1)
  normpath<-try(normalizePath(project_dir),silent=TRUE)
  options("warn"=oldwarn)
  if(dir.exists(normpath)){
    #load the configuration and return
    message("Project exists, loading configuration.")
    success<-readConfig(normalizePath(project_dir))      
  }
  if(success)
    invisible()
  cmnd_prefix <- "mkdir -p "
  dirs <- c(SRA="SRA",FASTQ="FASTQ",RSEM="RSEM",FASTQC="FASTQC",GEO="GEO",
            CONFIG="CONFIG",OUTPUT="OUTPUT",RAWANNOTATIONS="RAW_ANNOTATIONS",
            RNASEQC="RNASEQC",TOPHAT="TOPHAT", BAM='BAM', KALLISTO="KALLISTO")
  if(load_from_immport)
    dirs<-c(dirs,Tab="Tab")
  
  ## add Kallisto log and error directories
  dirs <- c(dirs, "KALLISTO/log", "KALLISTO/error")
  
  subdirs <- file.path(project_dir,dirs)
  names(subdirs)<-names(dirs)
  build_skeleton_commands<-paste0(cmnd_prefix,c(project_dir,subdirs))
  results<-sapply(build_skeleton_commands,system)
  if(verbose){
    for(i in seq_along(results)){
      cat(names(results)[i], ": ",ifelse(results[i]==0,"success!","failed!"),"\n")
    }
  }else{
    if(any(results>0)){
      sapply(names(results[results>0]),function(x){
        cat("Failed at: ",x,"\n")
      })
    }else{
      cat("Project successfully created.\n")
    }
  }
  configure_project(subdirs)
  saveConfig()
} 

#' Load an RNASeqPipelineR project
#' 
#' Load an RNASeqPipelineR project
#' 
#' Load an RNASeqPipelineR project from disk.
#' 
#' @param project_dir \code{character} the path to the project
#' @param name \code{character} the name of the project
#' @export
loadProject <- function(project_dir=NULL,name=NULL){
    success <- FALSE
    pathy <- normalizePath(file.path(project_dir,name), mustWork=FALSE)
    if(dir.exists(pathy)){
        ## load the configuration and return
        message("Loading configuration for ",name, " from ", project_dir);
        success <- readConfig(pathy)      
    } else {
         stop(paste("ERROR:", pathy, "does not exist."))
    } ## end if else
    if(success) {
        invisible()
    }else {
        stop("Can't load project.")
    }
}


#' Set path to reference genome index
#' @param utils \code{character} the path to the directory containing reference genome.
#' @param refName \code{character)} name of reference index or genome
#' @export
setGenomeReference <- function(utils, refName) {
  subdirs <- getConfig()[["subdirs"]]
  subdirs[["Utils"]] <- utils
  assignConfig("subdirs", subdirs)
  assignConfig("reference_genome_name", refName)
}

#global config in package namespace
rnaseqpipeliner_configuration <- list()

#' Configure the project and store info in namespace
#' 
#' Configures the project based on the directories created in the project path
#' 
#' Stores the project configuration in the rnaseqpipeliner_configuration 
#' variable in the packge namespace.
#' Generally, this function shouldn't be called by the user.
#' 
#' @param subdirectories a \code{character} vector of subdirectories passed from \code{createProject}, or a path to an existing project.
#' @param fromDisk \code{logical} specifying whether to load the configuration of an existing proejct, in which case the path to the proejct is given by \code{subdirectories} argument
configure_project <- function(subdirectories=NULL,fromDisk=FALSE){
  ns <- getNamespace("RNASeqPipelineR")
  obj<-getConfig(create=TRUE)
  if(!fromDisk){
    obj$subdirs <- normalizePath(subdirectories)
    names(obj$subdirs)<-names(subdirectories)
    obj$immport_files<- c(file_info="file_info.txt", expsample="expsample.txt", biosample="biosample.txt",bio_to_exp="biosample_2_expsample.txt",arm_to_subject="arm_2_subject.txt", arm_to_cohort="arm_or_cohort.txt",subject="subject.txt")
  }else{
    message("Reading existing configuration from disk not yet implemented")
  }
  unlockBinding(sym = "rnaseqpipeliner_configuration",ns)
  assign("rnaseqpipeliner_configuration", obj, envir = ns)
  #detect aspera 
  detectAspera()
  invisible()
}

#' Get the configuration for the project from the namespace
#' 
#' Retrieve the configuration for the project from the namespace. If
#' configuration not found exit with error.
#'
#' @param create /code{logical} If TRUE then this will return empty list that can be used
#' to create new configuration. If FALSE then standard error checking will be run.
#' @return Returns configuration structure or stops with error if not found
#' @export
getConfig<-function(create=FALSE){
  ns <- getNamespace("RNASeqPipelineR")
  config <- get("rnaseqpipeliner_configuration", ns)
  if(!create & length(config) == 0) {
       stop("Configuration not found: Have you run loadProject or buildReference (for Utils directory)?")
  }
  return(config)
}


#' Retrieve full path to 'dir' in project configuration structure
#'
#' Retrieve full path to 'dir' in project configuration structure. If 'dir'
#' not found exit with error.
#'
#' @param dir \code{character} directory that path is required for
#' @return \code{character} full path to directory
#' @export
getConfigDir <- function(dir) {

    path <- getConfig()[["subdirs"]]

    if(is.null(path)) stop("subdirs list not found. Configuration corrupted!")
    if( is.na(match(dir, names(path))) ) {
        stop(paste("directory", dir, "not found in project configuration"))
    }

    return(getConfig()[["subdirs"]][[dir]])
} ## end getConfigDir


#' Retrieve full path to 'dir/file' in project configuration structure
#'
#' Retrieve full path to 'dir/file' in project configuration structure. If
#' file not found exit with error.
#'
#' @param dir \code{character} directory containing file
#' @param file \code{character} file sought after
#' @return \code{character} full path to directory
#' @export
getConfigFile <- function(dir, file) {

    ## construct path
    path <- getConfigDir(dir)
    filey <- paste(path, file, sep="/")

    ## error check
    if(!file.exists(filey)) {
        stop(paste(filey, "not found"))
    }
  
    ## return file path
    return(filey)
} ## end getConfigFile


#' Assign a new element to the RNASeqPipelineR configuration
#' 
#' Assign value to name in `rnaseqpipeliner_configuration` object
#' 
#' Convenience function to assign a new or reassign an element named \code{name} to the \code{rnaseqpipeliner_configuration} object with the value \code{value}
#' @param name \code{character} assign element with this name
#' @param value \code{value} an R object that assigned to the 'rnaseqpipeliner_configuration' \code{list} with name 'name'
#' @export
assignConfig<-function(name,value){
  ns<-getNamespace("RNASeqPipelineR")
  unlockBinding(sym = "rnaseqpipeliner_configuration",ns)
  obj<-getConfig()  
  obj[[name]]<-value
  assign("rnaseqpipeliner_configuration",obj,ns)
  invisible(NULL)
}

#' Load data from Immport tables
#' 
#' Loads the GEO accession numbers from Immport tables
#' 
#' Load data from Immport tables in the \code{Tab} directory.
#' Stores the tables in the configuration in the namespace.
#' 
#' @param warn \code{integer} specifies whether to print warnings. -1 suppresses warnings (default)
#' @param verbose \code{integer} how verbose should we be about what's happening. 0 (default) is quiet.
#' @export
loadImmportTables <- function(warn=-1,verbose=0){
  savewarn<-options("warn")[[1]]
  options("warn"=warn)
  immport_files <- getConfig()$immport_files
  subdirs <- getConfig()$subdirs
  #test whether this is an immport project by looking for a Tab directory.
  if(!any(grepl("^Tab$",basename(subdirs)))){
    stop("Can't load Immport data as this project is not configured for Immport")
  }
  tabdir<-subdirs[grep("^Tab$",basename(subdirs))]
  #check if files exist
  anyerror<-FALSE
  for(i in names(immport_files)){
    itexists<-file.exists(file.path(tabdir,immport_files[i]))
    if(verbose>0)
      message("File ",immport_files[i]," in ./Tab directory ",c("doesn't exist","exists")[itexists+1])
    if(!itexists)
      anyerror<-TRUE
  }
  
  if(anyerror){
    stop("Populate the Immport Tab directory before continuing.")
  }
  
  #No errors, go ahead and load the data
  #Populate a named list
  immport_tables<-vector('list')
  for(i in names(immport_files)){
    immport_tables[[i]] <- fread(file.path(tabdir,immport_files[i]),verbose=FALSE)
  }
  
  #names are file_info, expsample, biosample, bio_to_exp, arm_to_subject, arm_to_cohort, subject
  #the following are created
  immport_tables$arm <- with(immport_tables,merge(arm_to_subject, arm_to_cohort, by="ARM_ACCESSION"))
  immport_tables$GSM_table <- with(immport_tables,expsample[DESCRIPTION%like%"GSM",GSM:=gsub(".*:", "", DESCRIPTION)][DESCRIPTION%like%"GSM"])
  immport_tables$pData <- with(immport_tables,merge(GSM_table[,c("EXPSAMPLE_ACCESSION","GSM","EXPERIMENT_ACCESSION"),with=FALSE], bio_to_exp[,c("BIOSAMPLE_ACCESSION","EXPSAMPLE_ACCESSION"), with=FALSE], by="EXPSAMPLE_ACCESSION"))
  immport_tables$pData <- with(immport_tables,merge(pData, biosample[,c("BIOSAMPLE_ACCESSION","STUDY_TIME_COLLECTED","STUDY_TIME_COLLECTED_UNIT","SUBJECT_ACCESSION"), with=FALSE], by="BIOSAMPLE_ACCESSION"))
  immport_tables$pData <- with(immport_tables,merge(pData, arm[,c("SUBJECT_ACCESSION","ARM_ACCESSION","NAME"),with=FALSE], by="SUBJECT_ACCESSION"))
  immport_tables$pData <- with(immport_tables,merge(pData, subject[,c("SUBJECT_ACCESSION", "AGE_REPORTED", "GENDER", "ETHNICITY"), with=FALSE], by="SUBJECT_ACCESSION"))
  immport_tables$pData<-with(immport_tables,setnames(pData, names(pData), tolower(names(pData))))
  
  #assign immport_tables to the configuration object
  assignConfig("immport_tables",immport_tables)
  options("warn"=savewarn)
}

#' Checks if the SRAdb sqlite database is available
#' 
#' If the database files is not available, it is downloaded.
#' 
#' Downloads and stores the SRAdb sqlite database if necessary and stores it in 
#' the Utils folder.
#' @param path \code{character} path to SRAmetadb.sqlite file exists or will be placed
#' @export
getAndConnectSRAdb <- function(path=NULL){
  if(is.null(path)){
    stop("You must specify a path to the SRAmetadb.sqlite file, or a path where the file will be placed.")
  }
  
  #set the path to the Utils directory, doesn't actually have to be named Utils.
  path<-normalizePath(path)
  obj<-getConfig()[["subdirs"]]
  obj[["Utils"]]<-path
  
  #store the path
  assignConfig("subdirs",obj)
  
  #create if necessary
  system(paste0("mkdir -p ",file.path(path,"SRAdb")))  
  
  #download SRA db if necessary
  if(!file.exists(file.path(getConfig()[["subdirs"]]["Utils"],"SRAdb",'SRAmetadb.sqlite'))) {
    sqlfile <- getSRAdbFile(destdir = file.path(getConfig()[["subdirs"]]["Utils"],"SRAdb"))
  }else{
    message("SRAmetadb.sqlite found")
  }
  #connect and grab the data
  sra_con <- dbConnect(SQLite(),file.path(getConfig()[["subdirs"]]["Utils"],"SRAdb",'SRAmetadb.sqlite'))
  sra_tables <- dbListTables(sra_con)
  assignConfig("sra_tables",sra_tables)
  assignConfig("sra_con",sra_con)
}

#' Detect and configure the aspera client location
#' 
#' Detect the installation location of aspera client based on expected defaults
#' 
#' Sets the location of aspera client based on expected defaults. Or pass the installation path
#' @note Needs to be tested on linux and modified to locate the identity keys for aspera on linux
#' @param path \code{character} path to the aspera client. If NULL, will try to detect the location based on the OS
#' @export
detectAspera<-function(path=NULL){
  #get OS
  sysname=Sys.info()['sysname']
  #detect installation
  if(sysname=="Darwin"){
    homedir<-Sys.getenv("HOME")
    if(file.exists(file.path(homedir,"Applications/Aspera Connect.app"))){
      path <- file.path(homedir,"Applications/Aspera Connect.app","Contents","Resources")
    }
    else if(file.exists(file.path("/Applications/Aspera Connect.app/Contents/Resources"))){
      path <- file.path("/Applications/Aspera Connect.app/Contents/Resources")
    }else{
      stop("Can't detect aspera installation for Mac OS. Pass a path variable to configure.")
    }
    message("aspera detected at ", path)
    aspera_path=path
    assignConfig("aspera_path",aspera_path)
  }else{
    if(is.null(path)){
      message("Run detectAspera with the path argument to configure the aspera client for your OS.")
    }
    stopifnot(file.exists(file.path(path,"ascp")))
    aspera_path=path
    assignConfig("aspera_path",aspera_path)
  }
}

#' Download FastQ files based on Immport data and SRA db, unless they already exist.
#'
#' Based on the SRA db, download the FastQ files, unless they already exist.
#' 
#' Conditionally download the FastQ files in the SRA db based on Immport tables.
#' If the files are already present, they will not be downloaded.
#' @param GSE_accession (not implemented)
#' @export
downloadSRA <- function(GSE_accession=NULL){
  #Two branches.. if immport_tables exists and if they don't
  if(names(getConfig())%in%"immport_tables"){
    n = nrow(getConfig()[["immport_tables"]][["GSM_table"]])
  }else{
    sra_con<-getConfig()[["sra_con"]]
    if(is.null(GSE_accession)){
      stop("Must supply a GEO accession to download SRA files.")
    }
    stop("Support for SRA from non-immport data is pending.")
  }
  cond_eval <- length(list.files(getConfig()[["subdirs"]][["SRA"]], pattern=".sra")) < n
  if(cond_eval){
    if(names(getConfig())%in%"immport_tables"){
      GSM_table<-getConfig()[["immport_tables"]][["GSM_table"]]
    }else{
      sra_con<-getConfig()[["sra_con"]]
      stop("Support for SRA from non-immport data is pending.")
    }
    successes<-0
    for(file in GSM_table[,GSM]) {
      gd <- getGEO(file, destdir=getConfig()[["subdirs"]][["GEO"]])
      SRX_number <- gsub(".*=SRX", "SRX", gd@header$relation[1])
      
      # Convert to aspera address
      sra_con<-getConfig()[["sra_con"]]
      run_accession <- listSRAfile(SRX_number, sra_con, fileType = "sra" )$run
      aspera_url <- paste0("anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra", "/", substr(run_accession,1,3), "/", substr(run_accession,1,6), "/", run_accession, "/", run_accession, ".sra")
      
      out<-system(paste0('ascp -i ',gsub(" ","\\\\ ",getConfig()[["aspera_path"]]),'/asperaweb_id_dsa.openssh -k 1 -T -l200m ', aspera_url, " ",getConfig()[["subdirs"]][["SRA"]]))
      if(out==0)
        successes<-successes+1
    }
    message("Downloaded ", successes, " files")
  }else{
    message("SRA files already downloaded")
  }
}

#' Utility function to truncate the test data to `n` SRA files for testing
#' 
#' This will truncate the GSM_table to `n` files for testing purposes.
#' 
#' Since there is over 100Gb of data, we truncate to n (default 2) files for testing.
#' @param n \code{integer} the number of files to limit
#' @export
devel_truncateData <- function(n=2){
  obj <- getConfig()
  #Truncate to download only two files
  immport_tables <- obj[["immport_tables"]]
  immport_tables[["GSM_table"]] <- immport_tables[["GSM_table"]][1:n,]
  assignConfig("immport_tables",immport_tables)
  message("Truncating data to first ", n, " SRA files.")
  invisible(NULL)
}

#' Annotate the SRA file pData
#' 
#' Set the pData to reflect the downloaded SRA files
#' 
#' @export
annotateSRA <- function(){
  pData<-getConfig()[["immport_tables"]][["pData"]]
  GSM_table<-getConfig()[["immport_tables"]][["GSM_table"]]
  pData<-pData[gsm%in%GSM_table[,GSM]]
  SRX_number<-pData[gsm%in%GSM_table[,GSM]][,gsm]
  #SRX_number <- pData[,gsm]
  for(file in pData[,gsm]) {
    gd <- getGEO(file, destdir=getConfig()[["subdirs"]][["GEO"]])
    number <- gsub(".*=SRX", "SRX", gd@header$relation[1]) 
    run_accession <- listSRAfile(number, getConfig()[["sra_con"]], fileType = "sra" )$run
    SRX_number[SRX_number==file] <- run_accession
  }
  pData$srr <- SRX_number
  
  pData[,c("expsample_accession","experiment_accession","arm_accession"):=NULL]
  setnames(pData, c("name", "ethnicity"), c("arm_name", "race"))
  setcolorder(pData, neworder = c("subject_accession", "age_reported", "gender", "race", "study_time_collected", "study_time_collected_unit", "arm_name", "gsm", "srr","biosample_accession"))
  write.csv(pData, file=file.path(getConfig()[["subdirs"]][["RSEM"]],"rsem_pdata.csv"), row.names=FALSE) 
  
  #save the changes
  immport_tables<-getConfig()[["immport_tables"]]
  immport_tables[["pData"]] <- pData
  immport_tables[["GSM_table"]] <- GSM_table
  assignConfig("immport_tables",immport_tables)
  message("Wrote pData to RSEM directory")
}

#' Overview of downloaded files
#' 
#' Prints an overview of the SRA files
#' 
#' Generates a table of the downloaded files for an overview to be included
#' in reports.
#' @export
#' @importFrom knitr kable
SRAOverview <- function(){
  pData <- getConfig()[["immport_tables"]][["pData"]]
  kable(utils::head(pData), format = "markdown")
}

#' Convert SRA files to FastQ
#' 
#' Uses CLI SRA Toolkit to convert SRA files to FastQ if they don't already exist.
#' 
#' Converts SRA files to FastQ using the SRA Toolkit. The function checks whether the FastQ files exist, and if not, performs the conversion.
#' @export
#' @param ncores \code{integer} how many threads to use
convertSRAtoFastQ <- function(ncores=8){
  convertFastQ <- length(list.files(getConfig()[["subdirs"]][["FASTQ"]]))<length(list.files(getConfig()[["subdirs"]][["SRA"]]))
  convert_command <-  paste0("parallel -j ",ncores," fastq-dump {} -O ",getConfig()[["subdirs"]][["FASTQ"]]," ::: ", file.path(getConfig()[["subdirs"]][["SRA"]],"*.sra"))
  if(convertFastQ){
    out<-system(convert_command)
    if(out==0)
      message("Finished converting ",length(list.files(getConfig()[["subdirs"]][["SRA"]])), " files.")
    else
      warning("Something went wrong when converting FastQ files.")
  }else{
    message("FASTQ Files already present.")
  }
  
}

#' Concatenate FastQ files
#'
#' Combine several fastq.gz files into one fastq.gz file for each library
#' Decompress the fastq.gz 
#' Copy the fastQ files into FASTQ folder
#' @export
#' @param infile \code{character} specify the path to the individual fastq.gz directory.
#' @param outfile \code{character} specify the path to the FASTQ directory
#' @param pattern \code{character} the string want to remove in the fastq.gz files name
concatenateFastq = function(infile, outfile, pattern)
{
  samples <- system(paste0("ls ", infile), intern = T)
  sample.name <- unique(sub(pattern, "", samples))
  fastq.name <- paste0(sample.name, ".fastq.gz")
  system(paste0("cat ", infile,  "/", sample.name, "_*.fastq.gz > ", infile, "/", fastq.name))
  system(paste0("gunzip ", infile,  "/", fastq.name))
  system(paste0("mv ", infile, "/*.fastq ", outfile))
}

stripExtension <- function(finame, pattern='([.][A-Za-z]+$)'){
    sub(pattern, '', finame)
}

#' Perform FastQC quality control
#' 
#' Runs fastqc quality control on the FASTQ files. Parallel processing is run using fastqc
#' internal threading (-t option). Each thread requires 250MB of memory.
#' 
#' Produces FASTQC reports for each FASTQ file using fastqc. FASTQ input files must have 
#' extension \code{fastqc} or \code{fq} (capitalization sensitive). FASTQC output will be
#' written to FASTQC directory.
#' 
#' @param ncores \code{integer} how many threads to use. Must be less than or equal to number
#' of cores on machine.
#' @export
runFastQC <- function(ncores=4){
  
  ## make sure we have enough threads to run locally
  avail_cores <- parallel::detectCores()
  if(ncores > avail_cores){
    stop(paste(ncores, "cores required but only", avail_cores, " cores found on local machine"))
  }
  
  fastQCout <- list.files(getDir("FASTQC"))
  stripFQC <- unique(stripExtension(fastQCout, '\\.(fastq|fq)_fastqc.*$')) #expanded fastqc generates at least 3 files per input
  FQfile <- list.files(getConfig()[["subdirs"]][["FASTQ"]], pattern="*\\.(fastq|fq)$", ignore.case=TRUE, full.names=TRUE)
  FQfile <- data.frame(file=FQfile, done=stripExtension(basename(FQfile)) %in% stripFQC, stringsAsFactors=FALSE)
  notrun <- FQfile[!FQfile$done,]
  
  run_command <- paste0("fastqc -o ", getDir("FASTQC"), " --extract -t ", ncores, " ",
                 noquote(paste(notrun$file, sep="", collapse=" ")))
  
  if(nrow(notrun) > 0){
      message('Running fastqc process on ', nrow(notrun), ' files.')
    out<-system(run_command)
    if(out==0){
      message("Finished fastqc process for ", nrow(notrun), " files.")
      if(any(out!=0)){
        warning("encountered an error unzipping fastqc reports")
      }
    }
    else
      warning("Something went wrong when running fastqc")
  }else{
    message("FASTQC already run")  
  }
}

.collectFastqcFiles <- function(pattern){
  # Put the FASTQC results together
  # List sudirs
  fastqc_subdirs <- grep("_fastqc", list.dirs(getConfig()[["subdirs"]][["FASTQC"]], recursive = FALSE), value=TRUE)
  # Find reports
  sapply(fastqc_subdirs, list.files, pattern=pattern, full.names = TRUE)
  
}

#' Summarize the fastQC results
#' 
#' Summarize the fastqc results
#' 
#' Function to be run after `runFastQC` that will summarize the fastQC results
#' and generate a plot.
#'
#' @return prints a plot and invisibly returns the data.table used to generate the plot
#' @export
#' @import ggplot2
summarizeFastQC <- function(){
  
  fastqc_summary_files <- .collectFastqcFiles("summary.txt")
  # Read all files and create a list of data.tables
  fastqc_list <- lapply(fastqc_summary_files, function(x, ...){
    y <- fread(x, header=FALSE);
    setnames(y, colnames(y), c("result", "test", "file"))
    return(y);})
  
  fastqc_data <- rbindlist(fastqc_list)
  fastqc_data[,qresult:=ifelse(result=="FAIL",0, ifelse(result=="PASS", 1, 1/2))]
  fastqc_data[,sum_qresult:=sum(qresult), by="file"]
  fastqc_data <- fastqc_data[order(-sum_qresult)]
  
  # Let's reorder by overall fastqc score
  fastqc_data$file <- reorder(fastqc_data$file, -fastqc_data$sum_qresult)
  print(ggplot(fastqc_data, aes(x=test, y=file, fill=result))+geom_raster()+theme(axis.text.x=element_text(angle=90,hjust=1)))
  invisible(fastqc_data)
  #tidy up
  #sapply(fastqc_subdirs,function(x)system(paste0("rm -rf ",x)))
}

##' Summarize duplication statistics from fastqc
##'
##' Prints plots of the duplication distribution and % duplication across libraries
##' @return data.table of source data, invisbly
##' @export
summarizeDuplication <- function(){
  fastqc_complete <- .collectFastqcFiles("fastqc_data.txt")
  fastqc_list <- lapply(fastqc_complete, function(x, ...){
    header <- read.table(x, skip=3, nrows=7, sep='\t', colClasses='character')
    setDT(header)
    setnames(header, c('record', 'value'))
    setkey(header, record)
    totaldup <- fread(x, skip='#Total Deduplicated Percentage', nrows=1, sep='\t', header=F, colClasses=c('character', 'numeric'))
    y <- fread(x, skip='#Duplication Level', nrows=15, sep='\t', autostart=4, header=TRUE, colClasses=c('character', 'numeric', 'numeric'))        
    setnames(y, colnames(y), c("duplication level", "pct dedup", "pct total"))
    y[,file:=header['Filename', value]]
    y[,totaldup:=100-totaldup[1,2,with=FALSE]]
    return(y)})
  fastqc_data <- rbindlist(fastqc_list, fill=TRUE)
  fastqc_data[,`duplication level`:=factor(`duplication level`, levels=c(1:9, c('>10', '>50', '>100', '>500', '>1k', '>5k')))]
  M <- data.table::melt.data.table(fastqc_data, measure.vars=c('pct dedup', 'pct total'))
  print(ggplot(M, aes(x=`duplication level`, y=value))+geom_boxplot() + facet_wrap(~variable))
  U <- unique(fastqc_data[,list(file, totaldup)])
  print(ggplot(U, aes(x=totaldup)) + geom_density() + geom_text(aes(x=totaldup, y=0, label=file), size=2, angle=90, hjust=0))
  invisible(fastqc_data)
}


#' Build a reference genome 2
#' 
#' Builds a reference genome at `Utils/Reference_Genome`
#' 
#' You must specify the Utils path if it is not already defined, and have your genome in a folder titled
#' `Reference_Genome`. This function will construct the reference genome using RSEM tools.
#' The command line is the default shown in the documentation.
#' `rsem-prepare-reference --gtf gtf_file --transcript-to-gene-map knownIsoforms.txt --bowtie2 fasta_file name`
#' If the gtf_file is not give, then the transcript-to-gene-map option is not used either. A fasta_file and a name must be provided.
#' @param path \code{character} specifying an \emph{absolute path} path to the Utils directory.
#' @param gtf_file \code{character} the name of the gtf file. Empty by default. If specified the function will look for a file named `knownIsoforms.txt`
#' @param fasta_file \code{character} the name of the fasta file, must be specified
#' @param star_threads \code{integer} number of threads to use when building index
#' @param name \code{character} the name of the genome output.
#' @param gff3 \code{logical} when gtf_file is gff3 format set it to TRUE to add "--sjdbGTFtagExonParentTranscript gene".
#' @param additional_param \code{character} default "".  Additional parameters to pass.
#' @export

buildGenomeIndexSTAR2 = function (path = NULL, gtf_file = "", fasta_file = NULL, star_threads = 1, name=NULL, gff3 = FALSE, additional_param = ""){
  if (is.null(fasta_file)){
    stop("You must provide a fasta_file")
  }

  if (is.null(path) & inherits(try(getConfig()[["subdirs"]][["Utils"]], silent = TRUE), "try-error")) {
    stop("Please specify where to build the reference genome")
  }
  else if (is.null(path)) {
    path <- file.path(getConfig()[["subdirs"]][["Utils"]], "Reference_Genome/STARIndex")
  }
  else {
    subdirs <- getConfig()[["subdirs"]]
    subdirs[["Utils"]] <- path
    assignConfig("subdirs", subdirs)
    path <- file.path(getConfig()[["subdirs"]][["Utils"]], "Reference_Genome/STARIndex")
  }
  
  if (substr(path, 1, 1) != "/") {
    stop("'path' must be an absolute path")
  }
  dir.create(path, showWarnings=FALSE)
  if (gtf_file == "") {
    gtfopt <- ""
  }
  else {
    gtfopt <- "--sjdbGTFfile"
    gtf.file <- file.path(paste0(getConfig()[["subdirs"]][["Utils"]], "/Reference_Genome/", gtf_file))
  }
  gffopt=""
  if(gff3==TRUE){
    gffopt<- "--sjdbGTFtagExonParentTranscript gene"
  }
  fasta.file <- file.path(paste0(getConfig()[["subdirs"]][["Utils"]], "/Reference_Genome/", fasta_file))
  if (length(fasta.file) > 1) {
    fasta.file <- paste(fasta.file, collapse = ",")
  }
  if (length(list.files(pattern="SAindex", path=path)) == 0) {
    command = paste0("STAR --runThreadN ", star_threads, " --runMode genomeGenerate --genomeDir ", path, " --genomeFastaFiles ", fasta.file, " ", gtfopt, " ", gtf.file, " ", gffopt , " ", additional_param  )
    system(command)
  }
  else {
    message("Reference Genome Found")
  }
  assignConfig("reference_genome_name", name)
}

#' Use the RSEM tool to annotate and quantify the reads
#'
#' Use the RSEM tool to annotate and quantify the reads
#'
#' Uses RSEM to quantify reads in FASTQ files against the reference genome. Optionally you can specify paired end reads. The code assumes
#' paired reads have fastq files that differ by one character (i.e. sampleA_read1.fastq, sampleA_read2.fastq) and will perform
#' matching of paired fastq files based on that assumption using string edit distance. Read 1 is assumed to be upstream
#' and read 2 is assumed to be downstream.
#' The number of parallel_threads*bowtie_threads should not be more than the number of cores available on your system.
#' @param parallel_threads \code{integer} specify how many parallel processes to spawn
#' @param bowtie_threads \code{integer} specify how many threads bowtie should use.
#' @param paired \code{logical} specify whether you have paried reads or not.
#' @param frag_mean \code{numeric}
#' @param frag_sd \code{numeric} For single ended reads, specifying these might make calculations of effect length more effective. Optional.
#' @param nchunks \code{integer} number of chunks to split the files for a slurm job. Ignored if slurm = FALSE
#' @param days_requested \code{integer} number of days requested for the job (when submitting a slurm job). Ignored if slurm = FALSE
#' @param slurm \code{logical} if \code{TRUE} job is submitted as a slurm batch job, otherwise it's run on the local machine. Slurm jobs will honour the nchunks and days_requested arguments.
#' @param slurm_partition \code{character} the slurm partition to submit to. Ignored if slurm=FALSE
#' @param ram_per_node \code{numeric} The number of Mb per node. Ignored if slurm=FALSE. Default of \code{parallel_threads*bowtie_threads*1000}
#' @param fromBAM \code{logical} if \code{TRUE} then RSEM will attempt to use previously aligned BAM files, in the \code{BAM} directory, rather than fastq files. The file names expected to end with \code{.transcript.bam}. See RSEM documentation for the format these files must obey.
#' @param fromSTAR \code{logical} if \code{TRUE} then RSEM will use the
#' STAR notation for the BAM files
#' @note The amount of memory requested should be set to bowtie_threads*parallel_threads*1G 
#' as this is the default requested by samtools for sorting. If insufficient memory is 
#' requested, the bam files will not be created successfully.
#' @param mail email address to send failure message to, if desired.
#' @param force If TRUE then quantify all FASTQ/BAM files else only quantify those
#'  files that have not been quantified. 
#' @export
RSEMCalculateExpression <- function(parallel_threads=6,bowtie_threads=1,paired=FALSE, frag_mean=NULL, frag_sd=NULL,
                                    nchunks=10,days_requested=5,slurm=FALSE, 
                                    slurm_partition=NULL, #slurm_partition="gottardo_r",
                                    ram_per_node=bowtie_threads*parallel_threads*1200, 
                                    fromBAM=FALSE, fromSTAR=FALSE, mail=NULL, force=FALSE){
  ncores<-parallel_threads*bowtie_threads
  if(ncores>parallel::detectCores()&!slurm){
    stop("The number of parallel_threads*bowite_threads is more than the number of cores detected by detectCores() on the local machine for non-slurm jobs")
  }
  #Chunking for slurm
  .chunkDataFrame<-function(df=pairs,nchunks=nchunks){
    groupsize<-nrow(df)%/%nchunks
    split(as.data.frame(df),gl(nchunks,groupsize,length=nrow(df)))
  }
  rsem_dir <- getConfig()[["subdirs"]][["RSEM"]]
  ## parse RSEM output to get names of fastq files which have already been annotated.
  done <- stringr::str_replace(list.files(path=rsem_dir,pattern="\\.genes\\.results$"), 'Aligned\\.toTranscriptome\\.out\\.genes\\.results$', '')

  #browser()
  if(!fromBAM){
    fastq_dir <- getConfig()[["subdirs"]][["FASTQ"]]
    todo <- unique(stringr::str_replace(list.files(path=fastq_dir,pattern="\\.fastq$"), '(_[12])?\\.fastq$', ''))
    extension <- if(paired) c('_1.fastq', '_2.fastq') else '.fastq'
  } else if(!fromSTAR){
    fastq_dir <- getConfig()[["subdirs"]][["BAM"]]
    orig <- list.files(path=fastq_dir,pattern="\\.bam$")
    todo <- stringr::str_replace(orig, '(\\.transcript)?\\.bam$', '')
    extension <- stringr::str_match(orig[1], '(\\.transcript)?\\.bam$')[1,1]
  }else{
    fastq_dir <- getConfig()[["subdirs"]][["BAM"]]
    orig <- list.files(path=fastq_dir,pattern="Aligned.toTranscriptome.out.bam$")
    todo <- stringr::str_replace(orig, 'Aligned.toTranscriptome.out.bam$', '')
    extension <- stringr::str_match(orig[1], 'Aligned.toTranscriptome.out.bam$')[1,1]
  }
  
  keep <- todo
  if(!force) {
      keep <- setdiff(todo, done)
      if(length(keep)==0){
        warning("Expression already calculated. Set force=TRUE to force quantification")
        return()
      }
  }
  
  reference_genome_path <- file.path(getConfig()[["subdirs"]][["Utils"]],"Reference_Genome")
  reference_genome_name <- file.path(getConfig()[["reference_genome_name"]])
  keep <- outer(keep, extension, paste0)
  
  ## Write slurm preamble to a shell script
  if(slurm){
    
    ## set up email section if email address is given
    mail_string <- mail_address <- ""
    if(!is.null(mail)){
      mail_string <- "#SBATCH --mail-type=FAIL"
      mail_address <- paste0("#SBATCH --mail-user=", mail)
    }
    
    con<-file(file.path(getConfig()[["subdirs"]][["FASTQ"]],"batchSubmitJob.sh"),open = "w")
    ram_requested<-parallel_threads*ram_per_node
    writeLines(c("#!/bin/bash",
                 paste0("#SBATCH -n ",ncores," # Number of cores"),
                 "#SBATCH -N 1 # Ensure that all cores are on one machine",
                 paste0("#SBATCH -t ",days_requested,"-00:00 # Runtime in D-HH:MM"),
                 ifelse(is.null(slurm_partition),"", paste0("#SBATCH -p ",slurm_partition," # Partition to submit to")),
                 paste0("#SBATCH --mem=",ram_requested," # Memory pool for all cores (see also --mem-per-cpu)"),
                 paste0("#SBATCH -o ",file.path(getConfig()[["subdirs"]][["FASTQ"]],"rsem_%a.out")," # File to which STDOUT will be written"),
                 paste0("#SBATCH -e ",file.path(getConfig()[["subdirs"]][["FASTQ"]],"rsem_%a.err"), " # File to which STDERR will be written"),
                 mail_string,
                 mail_address,
                 'module add bowtie2'),
               con=con)
    argumentFile <- file.path(getConfig()[["subdirs"]][["FASTQ"]], "arguments_chunk_${SLURM_ARRAY_TASK_ID}.txt")
  } else{
    #If we aren't using slurm, set the number of chunks to one.
    nchunks<-1
    argumentFile <- file.path(getConfig()[["subdirs"]][["FASTQ"]], "arguments_chunk_1.txt")
  } ## end if slurm
  
  if(!paired){
    #get file names, commandline for single-end
    # keep<-paste0(keep,".fastq")
    myfiles<-file.path(fastq_dir,keep)
    fragLenArg <- ''
    bamArg <- ''
    if(!is.null(frag_mean) && !is.null(frag_sd)){
      fragLenArg <- paste0(" --fragment-length-mean ", frag_mean, " --fragment-length-sd ", frag_sd)
    }
    if(fromBAM){
      command<-paste0("cd ",rsem_dir," && parallel -j ",parallel_threads*bowtie_threads," rsem-calculate-expression --no-bam-output --bam {} ",file.path(reference_genome_path,reference_genome_name)," {/.} :::: ", argumentFile)
    } else{
      command<-paste0("cd ",rsem_dir," && parallel -j ",parallel_threads," rsem-calculate-expression --bowtie2 -p ", bowtie_threads," {} ",file.path(reference_genome_path,reference_genome_name)," {/.} :::: ", argumentFile)
    }
  } else {
    #get file names, commandline for Paired end
    fastq_files<-file.path(fastq_dir,sort(keep))
    if(!is.null(frag_mean) || !is.null(frag_sd)){
      warning('`frag_mean` and `frag_sd` ignored for paired-end reads.')
    }
    if(fromBAM){
      myfiles<-file.path(fastq_dir,keep)
      command<-paste0("cd ",rsem_dir," && parallel -j ",parallel_threads*bowtie_threads," rsem-calculate-expression --no-bam-output --paired-end --bam {} ",file.path(reference_genome_path,reference_genome_name)," {/.} :::: ", argumentFile)
    } else{
      pairs<-matrix(fastq_files,ncol=2,byrow=TRUE)
      myfiles<-cbind(pairs,gsub("\\.fastq","",gsub("_[12]\\.","\\.",basename(pairs[,1]))))
      command<-paste0("cd ",rsem_dir," && parallel -n 3 -j ",parallel_threads," rsem-calculate-expression --bowtie2 -p ", bowtie_threads," --paired-end {1} {2} ",file.path(reference_genome_path,reference_genome_name)," {3.} :::: ", argumentFile)
    }
  }
  ## for both, divide split the files into chunks (maybe just 1) and write the argument chunks
  chunked<-.chunkDataFrame(data.frame(files=myfiles),nchunks)
  for(i in seq_along(chunked)){
    con2=file(file.path(getConfig()[["subdirs"]][["FASTQ"]],paste0("arguments_chunk_",i,".txt")))
    writeLines(t(chunked[[i]]),con=con2)
    close(con2)
  }
  if(slurm){
    ## add the rsem command to the shell script
    message('Using ', nchunks , 'clusters ')
    message("Sending ", command)
    writeLines(paste0(command,"\n"),con=con)
    close(con)
    slurm_command<-paste0("sbatch --array=1-",nchunks," ",file.path(getConfig()[["subdirs"]][["FASTQ"]],"batchSubmitJob.sh"))
    system(slurm_command)
  } else{
   
    bamDir <- getDir("BAM")
    runRSEM <- function(b_name, b_dir=bamDir) {
      
      bamOut <- tools::file_path_sans_ext(b_name)
      
      rsemCommand <- paste0("rsem-calculate-expression --no-bam-output --paired-end --bam ",
                            file.path(b_dir, b_name), " ", 
                            file.path(reference_genome_path, reference_genome_name), " ",
                            bamOut)
      system(rsemCommand)
      
    } ## end run RSEM
    
    setwd(rsem_dir)
    nothing <- mclapply(keep, runRSEM, mc.cores=ncores)
    
  }
}


#' Assemble an expression matrix of all results
#' 
#' Put all the counts from the individual libraries into a single matrix result
#' 
#' Assemble an expression matrix from the individual libraries.
#' @param force \code{logical} rerun even if output exists
#'@export 
RSEMAssembleExpressionMatrix <- function(force=FALSE){
  
  cond_eval <- length(list.files(getConfig()[["subdirs"]][["RSEM"]], pattern="rsem_"))<4
  if(cond_eval | force){
    message("Assembling counts matrix")
    rsem_files <- list.files(getConfig()[["subdirs"]][["RSEM"]], pattern="genes.results", full.names = TRUE)
    # Read all files and create a list of data.tables
    rsem_list <- lapply(rsem_files, function(x, ...){
      y<-fread(x, drop=c("length", "FPKM")); 
      y$sample_name <- gsub(".genes.results", "", basename(x));
      return(y);})
    
    # Bind all the data.tables to create a long data.table
    rsem_data_long <- rbindlist(rsem_list)

    ## remove 'Aligned.toTranscriptome.out' from rsem file names
    rsem_data_long[,sample_name := gsub("Aligned.toTranscriptome.out", "", sample_name)]
    
    # Rename a column, so that it's a bit R friendly
    setnames(rsem_data_long, "transcript_id(s)", "transcript_ids")
    
    # Reshape the long data.table to create a matrix
    # Assume that missing entries would have a TPM value of 0
    rsem_tpm_matrix <- dcast.data.table(rsem_data_long, gene_id+transcript_ids~sample_name, value.var = "TPM", fill=0)
    rsem_count_matrix <- dcast.data.table(rsem_data_long, gene_id+transcript_ids~sample_name, value.var = "expected_count", fill=0)
    rsem_effective_length_matrix<-dcast.data.table(rsem_data_long,gene_id+transcript_ids~sample_name,value.var="effective_length",fill=0)
    # Keep track of the gene_id - transcript mapping 
    rsem_txs_table <- rsem_tpm_matrix[, c("gene_id","transcript_ids"), with=FALSE]
    
    # Remove the transcript column (we create an annotation table in the next chunk)
    rsem_tpm_matrix <- rsem_tpm_matrix[, transcript_ids:=NULL][order(gene_id)]
    rsem_count_matrix <- rsem_count_matrix[, transcript_ids:=NULL][order(gene_id)]
    rsem_effective_length_matrix <- rsem_effective_length_matrix[, transcript_ids:=NULL][order(gene_id)]
    
    # Write to disk
    write.csv(rsem_tpm_matrix, file=file.path(getConfig()[["subdirs"]][["RSEM"]],"rsem_tpm_matrix.csv"), row.names=FALSE)
    write.csv(rsem_effective_length_matrix, file=file.path(getConfig()[["subdirs"]][["RSEM"]],"rsem_effective_length_matrix.csv"), row.names=FALSE)
    write.csv(rsem_count_matrix, file=file.path(getConfig()[["subdirs"]][["RSEM"]],"rsem_count_matrix.csv"), row.names=FALSE)
    write.csv(rsem_txs_table,file=file.path(getConfig()[["subdirs"]][["RSEM"]],"rsem_txs_table.csv"))
  }
}

##' Summarize the number of mapped, unmapped and ambiguous reads from RSEM output
##'
##' @param dir Directory containing RSEM cnt files.  If missing, use the RSEM directory from project.
##' @param log return log of reads?
##' @param plot print a ggplot summary
##' @return returns a data table of # reads per library, invisibly
RSEMSummarizeMapping <- function(dir, log=TRUE, plot=TRUE){
    if(missing(dir)) dir <- getConfig()[["subdirs"]][["RSEM"]]
    cnt_files <- list.files(dir, pattern="*.cnt", full.names = TRUE, recursive=TRUE)
    fastqc_list <- lapply(cnt_files, function(x, ...){
        y <- fread(x, skip=3)
        setnames(y, c('nmap', 'reads'))
        y[,curead:=cumsum(reads)]
        file <- stringr::str_replace(basename(x), fixed('.cnt'), '')
        total <- y[nmap==Inf,curead]
        mapped <- total-y[nmap==0,reads]
        unique <- max(y[nmap==1,reads], 0)
        z <- data.table(file, total, mapped, unique)
        return(z)})
    fql <- rbindlist(fastqc_list)
    fqlM <- data.table::melt.data.table(fql, id.vars='file')
    if(log){
        fqlM[,value:=log10(value+1)]
    }
    fqlM[,outlier:={
        bs <- boxplot.stats(value)
        value<bs$stats[1] | value > bs$stats[5]
    },keyby='variable'
     ]

    ggp <- ggplot(fqlM, aes(x=variable, y=value))+geom_boxplot() + geom_text(aes(label=ifelse(outlier, file, "")), size=2) + ylab(if(log) 'log10(reads+1)' else 'reads')
    if(plot) print(ggp)
    invisible(fqlM)
}

#' Produce a report listing the tools and packages used by the pipeline.
#' 
#' Produce a report listing the tools and packages used by the pipeline.
#' 
#' Helps with reproducibility by producing a report listing the tools and packages used by the pipeline.
#' This needs to be run on the system that produced the results. Output to project directory OUTPUT/pipeline_report.md.
#' @export
pipelineReport<-function(){
  #save the configuration, since we're probably done.
  saveConfig()
  command <- c("ascp --version",
               "fastq-dump --version",
               "bowtie2 --version",
               "fastqc --version",
               "rsem-calculate-expression --version")
  versions<-sapply(command,function(x)system(x,intern=TRUE)) 
  session<-capture.output(sessionInfo())
  sapply(versions,function(x)cat(paste0(x,"\n"),"\n"))
  cat(paste0(session,"\n"))
  #output to OUTPUT/pipeline_report.md
  f<-file(open = "a",description = (file.path(getConfig()[["subdirs"]][["OUTPUT"]],"pipeline_report.md")))
  sapply(versions,function(x)cat(paste0(x,"\n"),"\n",file = f))
  cat(paste0(session,"\n"),file = f)
  close(f)
}

#' Save the configuration for a project
#' 
#' Save the configuration for a project
#' 
#' Save the configuration from a project, stored in the CONFIG directory
#' 
#' @export
saveConfig <- function(){
  #TODO blindly stores configuration info. 
  #Wanto to use YAML eventually.
  config<-getConfig(); 
  saveRDS(config,file=file.path(config[["subdirs"]][["CONFIG"]],"configuration.rds"))
  invisible(TRUE)
} 

#' Read the configuration for a project
#' 
#' Read the configuration for a project
#' 
#' Read the configuration from a project, stored in the CONFIG directory
#' 
#' @param project \code{character} the path to the project folder
#' @export
readConfig <- function(project=NULL){
  #TODO this just blindly reads the config info. Some error checking needs to be done, ensuring the configuration matches what's in the project directory.
  #Ideally should call configure_project as well. 
  #Want to also store human-readable config information ultimately, perhaps using YAML.
  confdir <- file.path(project,"CONFIG")
  error<-length(list.files(confdir,pattern="configuration.rds"))!=1
  if(error){
    message("No configuration information found")
    return(invisible(FALSE))
  }
  obj<-readRDS(list.files(confdir,pattern="configuration.rds",full.names=TRUE))
  
  #store in the namespace
  ns <- getNamespace("RNASeqPipelineR")
  unlockBinding(sym = "rnaseqpipeliner_configuration",ns)
  assign("rnaseqpipeliner_configuration", obj, envir = ns) 
  invisible(TRUE)
}

#' Return and expression set of the counts or TPM values
#' 
#' Construct and return an expression set of counts or TPM values
#' 
#' Constructs an expression set of counts or tpm values depending on the value of which parameter.
#' @param which \code{character} \code{c("counts","tpm")} specifies what to return.
#' @export
getExpressionSet <- function(which="counts"){
  which<-match.arg(arg = which, c("counts","tpm"))
  if(which%in%"counts")
    mat <- fread(list.files(getConfig()[["subdirs"]][["RSEM"]],pattern="rsem_count_matrix.csv",full.names=TRUE))
  else
    mat <- fread(list.files(getConfig()[["subdirs"]][["RSEM"]],pattern="rsem_tpm_matrix.csv",full.names=TRUE))
  featuredata <- fread(list.files(getConfig()[["subdirs"]][["RSEM"]],pattern="rsem_fdata.csv",full.names=TRUE))
  pdata <- fread(list.files(getConfig()[["subdirs"]][["RSEM"]],pattern="rsem_pdata.csv",full.names=TRUE))
  
  #Construct an Eset and return
  pdata<-data.frame(pdata)
  rownames(pdata) <- pdata$srr  
  pdata<-AnnotatedDataFrame(data.frame(pdata))
  featuredata<-AnnotatedDataFrame(data.frame(featuredata))
  matr<-as.matrix(mat[,-1,with=FALSE])
  rownames(matr)<-mat[,gene_id]
  row.names(pdata)
  matr<-matr[,rownames(pdata),drop=FALSE]
  eset<-ExpressionSet(assayData = matr,phenoData = pdata, featureData = featuredata, annotation = getConfig()[["annotation_library"]])
  eset
}

#' Run MiTCR on each fastQ file
#' 
#' Run the MiTCR tool on each fastQ file
#' 
#' Runs MiTCR on each fastQ file. 
#' @param gene \code{character} c("TRB","TRA")
#' @param species \code{character} c("hs","mm")
#' @param ec \code{integer}, c(0,1,2)
#' @param pset \code{character} c("flex")
#' @param ncores \code{integer} number of cores for running in parallel
#' @param output_format \code{character} either "txt" or "cls". cls files can be viewed in the MiTCR viewer. Txt files can be parsed and used to annotate libraries.
#' @param paired \code{logical} specify whether data is paired (in which case the fastq files in PEAR directory are used). Defaults to FALSE.
#' @export
MiTCR <- function(gene="TRB",species=NULL,ec=2,pset="flex",ncores=1,output_format="text",paired=FALSE){
  pset<-match.arg(pset,"flex")
  output_format<-match.arg(output_format,c("txt","cls"))
  gene<-match.arg(gene,c("TRB","TRA"))
  species<-match.arg(species,c("hs","mm",NULL))
  if(!ec%in%c(0,1,2)){
    stop("Invalid value of ec")
  }
  if(length(system("which mitcr",intern=TRUE))==0){
    stop("mitcr can't be found on the path")
  }
  if(output_format=="txt")
    output_format<-".txt"
  else
    output_format<-".cls"
  command <-  paste0("mitcr -pset ",pset)
  command <- paste0(command," -gene ",gene," ")
  if(!is.null(species)){
    command <- paste0(command,"-species ",species," ")
  }
  if(!is.null(ec)){
    command <- paste0(command,"-ec ",ec," ")
  }
  #Run on each fastq file and output to TCR directory
  if(paired){
    fastqdir <- getConfig()[["subdirs"]][["PEAR"]]
    if(!dir.exists(fastqdir))
      stop("paired is TRUE, but PEAR directory not found. Did you run pear?")
  }else{
    fastqdir <- getConfig()[["subdirs"]][["FASTQ"]]
  }
  tcrdir <- file.path(dirname(fastqdir),"TCR")
  system(paste0("mkdir -p ",tcrdir))
  if(!paired){
    fastqfiles<-list.files(fastqdir,pattern="fastq$",full.names=TRUE)
  }else{
    fastqfiles<-list.files(fastqdir,pattern="\\.assembled\\.fastq$",full.names=TRUE)
  }
  if(ncores>1){
    outpath<-NULL
    for(i in fastqfiles){
      outpath<-c(outpath,file.path(tcrdir,paste0(gsub("fastq",gene,basename(i)),output_format)))
    }
    inargs<-paste(as.vector(apply(cbind(fastqfiles,outpath),1,function(x)paste(x,collapse=" "))),collapse=" ")
    system(paste0("parallel -j ",ncores," -n 2 ",command," {} ::: ",inargs))
  }else{
    for(i in fastqfiles){
      outpath<-file.path(tcrdir,paste0(gsub("fastq",gene,basename(i)),output_format))
      commandi<-paste0(command, i, " ", outpath)
      system(commandi)
    }
  } 
}

#' Run PEAR to assemble paired-end fastq files into one files.
#' 
#' Run the pear tool to assemble paired end fastq files into a single output fastq file.
#' 
#' Use this as a preliminary preprocessing step to running MiTCR if you have paired-end data.
#' @param ncores \code{integer} number of cores to use
#' @export
pear<-function(ncores=4){
  fastq_dir <- getConfig()[["subdirs"]][["FASTQ"]]
  pear_directory<-try(getConfig()[["subdirs"]][["PEAR"]],silent=TRUE)
  if(inherits(pear_directory,"try-error")){
    pear_directory<-file.path(dirname(getConfig()[["subdirs"]][["FASTQ"]]),"PEAR")
    dir.create(pear_directory)
    subdirs<-getConfig()[["subdirs"]]
    subdirs[["PEAR"]]<-pear_directory
    assignConfig("subdirs",subdirs)
    saveConfig()
  }
  pear_directory<-getConfig()[["subdirs"]][["PEAR"]]
  #We just assume that paired fastq files differ by one character, that there are an even number of them, and that the operating system
  #will return them to us in lexicographical order. 
  files<-(list.files(path=fastq_dir,pattern="*.fastq",full.names=FALSE))
  f<-file.path(pear_directory,"pear_arguments.txt")
  connection<-file(f,open="w")
  writeLines(files,con = connection)
  close(connection)
  command<-paste0("cd ", fastq_dir, " && parallel -j ",ncores, " -n2 pear -f {1} -r {2} -o ",file.path(pear_directory,"{1}")," :::: < ",file.path(pear_directory,"pear_arguments.txt"))
  system(command)
}


#' Download SRA files from SRX accessions
#' 
#' Download SRA files from SRX accessions. Downloads asynchronously. Won't message you when complete, so can't be run in 
#' batch mode at the moment.
#' @param x \code{character} a vector of SRX accession numbers
#' @export
getDataFromSRX<-function(x=NULL){
  if(is.null(x)){
    stop("Please pass a vector of SRX numbers.")
  }
  sra_con<-getConfig()[["sra_con"]]
  run_accession <- listSRAfile(x, sra_con, fileType = "sra" )$run
  aspera_url <- paste0("anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra", "/", substr(run_accession,1,3), "/", substr(run_accession,1,6), "/", run_accession, "/", run_accession, ".sra")
  out<-paste0('ascp -i ',gsub(" ","\\\\ ",getConfig()[["aspera_path"]]),'/asperaweb_id_dsa.openssh -k 1 -T -l200m ', aspera_url, " ",getConfig()[["subdirs"]][["SRA"]])
  for(i in out){
    system(i,wait=FALSE)
  }
  message("Files are downloading. Wait and check your downloads before proceeding")
}

#'Quick and dirty annotations from SRAmetadb
#'
#'Gets annotations for SRR files based on SRAmetadb contents
#'@param x \code{character} vector of SRX accessions
#'@export
annotationsFromSRX<-function(x){
  samples<-data.table(listSRAfile(x,getConfig()[["sra_con"]]))
  setnames(samples,"sample","sample_accession")
  
  src<-src_sqlite(getConfig()[["sra_con"]]@dbname, create = F)
  sample_table<-tbl(src, sql("SELECT * FROM sample"))
  
  pdata<-merge(data.table(as.data.frame(filter(sample_table,sample_accession%in%samples$sample_accession))),samples,by="sample_accession")
  pdata<-select(pdata,sample_accession,experiment,run,sample_attribute)
  
  attributes<-strsplit(pdata$sample_attribute,"\\|\\|")
  names(attributes)<-pdata$run
  attributes<-lapply(attributes,function(x)t(cbind(x)))
  annotations<-plyr::ldply(lapply(attributes,function(x){
    column_names<-gsub("(^.+?):.*","\\1",x)
    matrix_entries<-gsub(" +$","",gsub("^ +","",gsub("^.+?:(.*)","\\1",x)))
    colnames(matrix_entries)<-column_names
    matrix_entries
  }))
  setnames(annotations,".id","run")
  annotations<-merge(pdata,annotations,by="run")
  annotations[,sample_attribute:=NULL]
  setnames(annotations,"run","srr")
  write.csv(annotations, file=file.path(getConfig()[["subdirs"]][["RSEM"]],"rsem_pdata.csv"), row.names=FALSE) 
  message("Wrote pData to RSEM/rsem_pdata.csv")
  annotations
}

#' Use the Tophat tool to align reads
#' 
#' Uses Tophat to align reads in FASTQ files against the reference genome hg38 from UCSC database. Optionally you can specify paired end reads. The code assumes 
#' paired reads have fastq files that differ by one character (i.e. sampleA_read1.fastq, sampleA_read2.fastq) and will perform
#' matching of paired fastq files based on that assumption using string edit distance. Read 1 is assumed to be upstream
#' and read 2 is assumed to be downstream. 
#' 
#' The number of parallel_threads*tophat_threads should not be more than the number of cores available on your system.
#' @param path \code{character} specifying an \emph{absolute path} path to the iGenome directory.
#' @param parallel_threads \code{integer} specify how many parallel processes to spawn
#' @param tophat_threads \code{integer} specify how many threads bowtie should use.
#' @param paired \code{logical} specify whether you have paried reads or not.
#' @param nchunks \code{integer} number of chunks to split the files for a slurm job. Ignored if slurm = FALSE
#' @param days_requested \code{integer} number of days requested for the job (when submitting a slurm job). Ignored if slurm = FALSE
#' @param slurm \code{logical} if \code{TRUE} job is submitted as a slurm batch job, otherwise it's run on the local machine. Slurm jobs will honour the nchunks and days_requested arguments. 
#' @param slurm_partition \code{character} the slurm partition to submit to. Ignored if slurm=FALSE
#' @param ram_per_node \code{numeric} The number of Mb per node. Ignored if slurm=FALSE. Default of \code{parallel_threads*bowtie_threads*1000}
#' @note The amount of memory requested should be set to bowtie_threads*parallel_threads*1G as this is the default requested by samtools for sorting. If insufficient memory is requested, the bam files will not be created successfully.
#' @export
sequenceAlignmentTopHat = function(path="/shared/silo_researcher/Gottardo_R/jingyuan_working/iGenome/Mus_musculus/UCSC/mm10", 
                                   parallel_threads=1,tophat_threads=6, paired=FALSE, nchunks=10,days_requested=5,
                                   slurm=FALSE,slurm_partition="gottardo_r",ram_per_node=tophat_threads*parallel_threads*1200)
{
  ncores<-parallel_threads*tophat_threads
  if(ncores>parallel::detectCores()&!slurm){
    stop("The number of parallel_threads*bowite_threads is more than the number of cores detected by detectCores() on the local machine for non-slurm jobs")
  }
  if(!slurm){
    #If we aren't using slurm, set the number of chunks to one.
    nchunks<-1
  }
  #Chunking for slurm
  .chunkDataFrame<-function(df=pairs,nchunks=nchunks){        
    groupsize<-nrow(df)%/%nchunks
    split(as.data.frame(df),gl(nchunks,groupsize,length=nrow(df)))
  }
  
  subdirs<-getConfig()[["subdirs"]]
  subdirs[["iGenome"]]<-path
  assignConfig("subdirs",subdirs)
  tophat_dir <- getConfig()[["subdirs"]][["TOPHAT"]]
  fastq_dir <- getConfig()[["subdirs"]][["FASTQ"]]
  genome.gtf <- file.path(getConfig()[["subdirs"]][["iGenome"]],"genes.gtf")
  genome.index <- file.path(getConfig()[["subdirs"]][["iGenome"]],"genome")
  fastq.files <- list.files(fastq_dir, ".fastq$", full.names = TRUE, recursive = TRUE)
  if(paired){
    samples <- unique(sub("_..fastq", "", basename(fastq.files)))
    f1 <- fastq.files[match(paste0(samples,"_1.fastq"), basename(fastq.files))]
    f2 <- fastq.files[match(paste0(samples,"_2.fastq"), basename(fastq.files))]
    pairs <- cbind(f1, f2, samples)
    chunked<-.chunkDataFrame(pairs,nchunks)
    for(i in seq_along(chunked)){
      con=file(file.path(getConfig()[["subdirs"]][["FASTQ"]],paste0("arguments_chunk_",i,".txt")))
      writeLines(t(chunked[[i]]),con=con)
      close(con)
    }
    if(slurm){
      #With chunking enabled this is meant to be submitted as a slurm batch job with the --array option set to 1-nchunks
      con<-file(file.path(getConfig()[["subdirs"]][["FASTQ"]],"batchSubmitJob.sh"),open = "w")
      command<-paste0("cd ",tophat_dir," && parallel -n 3 -j ",parallel_threads," tophat -p ", tophat_threads," -G ", genome.gtf, " --keep-fasta-order --no-novel-junc -o {3} ",
                      genome.index, " {1} {2} :::: ",file.path(getConfig()[["subdirs"]][["FASTQ"]],paste0("arguments_chunk_${SLURM_ARRAY_TASK_ID}.txt","\n")))
      #Before writing the command we need to  preamble for the slurm job
      # We'll request 1Gb of RAM per parallel thread per node
      ram_requested<-parallel_threads*ram_per_node
      writeLines(c("#!/bin/bash\n",
                   paste0("#SBATCH -n ",ncores,"                 # Number of cores"),
                   "#SBATCH -N 1                 # Ensure that all cores are on one machine",
                   paste0("#SBATCH -t ",days_requested,"-00:00           # Runtime in D-HH:MM"),
                   paste0("#SBATCH -p ",slurm_partition,"    # Partition to submit to"),
                   paste0("#SBATCH --mem=",ram_requested,"            # Memory pool for all cores (see also --mem-per-cpu)"),
                   paste0("#SBATCH -o ",file.path(getConfig()[["subdirs"]][["FASTQ"]],"tophat_%a.out"),"      # File to which STDOUT will be written"),
                   paste0("#SBATCH -e ",file.path(getConfig()[["subdirs"]][["FASTQ"]],"tophat_%a.err"),"      # File to which STDERR will be written")),con=con)
      
      writeLines(paste0(command,"\n"),con=con)      
      close(con)
      #Now the command to launch the job is sbatch batchSubmitJob.sh
      command<-paste0("sbatch --array=1-",nchunks," ",file.path(getConfig()[["subdirs"]][["FASTQ"]],"batchSubmitJob.sh"))
    }else{
      # Command for a non-slurm job
      command<-paste0("cd ",tophat_dir," && parallel -n 3 -j ",parallel_threads," tophat -p ", tophat_threads," -G ", genome.gtf, " --keep-fasta-order --no-novel-junc -o {3} ", 
                      genome.index, " {1} {2} :::: ",file.path(getConfig()[["subdirs"]][["FASTQ"]],paste0("arguments_chunk_1.txt","\n")))        
    }
  }else{
    myfiles <- cbind(fastq.files, gsub(".fastq", "", basename(fastq.files)))
    chunked<-.chunkDataFrame(myfiles,nchunks)
    for(i in seq_along(chunked)){
      con=file(file.path(getConfig()[["subdirs"]][["FASTQ"]],paste0("arguments_chunk_",i,".txt")))
      writeLines(t(chunked[[i]]),con=con)
      close(con)
    }
    if(slurm){
      #With chunking enabled this is meant to be submitted as a slurm batch job with the --array option set to 1-nchunks
      con<-file(file.path(getConfig()[["subdirs"]][["FASTQ"]],"batchSubmitJob.sh"),open = "w")
      command<-paste0("cd ",tophat_dir," && parallel -n 2 -j ",parallel_threads," tophat -p ", tophat_threads," -G ", genome.gtf, " --keep-fasta-order --no-novel-junc -o {2} ",
                      genome.index, " {1} :::: ",file.path(getConfig()[["subdirs"]][["FASTQ"]],paste0("arguments_chunk_${SLURM_ARRAY_TASK_ID}.txt","\n")))
      #Before writing the command we need to  preamble for the slurm job
      # We'll request 1Gb of RAM per parallel thread per node
      ram_requested<-parallel_threads*ram_per_node
      writeLines(c("#!/bin/bash\n",
                   paste0("#SBATCH -n ",ncores,"                 # Number of cores"),
                   "#SBATCH -N 1                 # Ensure that all cores are on one machine",
                   paste0("#SBATCH -t ",days_requested,"-00:00           # Runtime in D-HH:MM"),
                   paste0("#SBATCH -p ",slurm_partition,"    # Partition to submit to"),
                   paste0("#SBATCH --mem=",ram_requested,"            # Memory pool for all cores (see also --mem-per-cpu)"),
                   paste0("#SBATCH -o ",file.path(getConfig()[["subdirs"]][["FASTQ"]],"tophat_%a.out"),"      # File to which STDOUT will be written"),
                   paste0("#SBATCH -e ",file.path(getConfig()[["subdirs"]][["FASTQ"]],"tophat_%a.err"),"      # File to which STDERR will be written")),con=con)
      
      writeLines(paste0(command,"\n"),con=con)      
      close(con)
      #Now the command to launch the job is sbatch batchSubmitJob.sh
      command<-paste0("sbatch --array=1-",nchunks," ",file.path(getConfig()[["subdirs"]][["FASTQ"]],"batchSubmitJob.sh"))
    }else{
      # Command for a non-slurm job
      command<-paste0("cd ",tophat_dir," && parallel -n 2 -j ",parallel_threads," tophat -p ", tophat_threads," -G ", genome.gtf, " --keep-fasta-order --no-novel-junc -o {2} ", 
                      genome.index, " {1} :::: ",file.path(getConfig()[["subdirs"]][["FASTQ"]],paste0("arguments_chunk_1.txt","\n")))        
    }
  }
  cat(command)
  system(command)
}

#' Create Sample file required as the input of RNASeQC
#' 
#' The sample file is the tab-delimited description of samples and their bams. The header of the file should be: SampleID BamFile Notes
#' The BamFile should be the path to the input file
.createSampleFile = function(){
  bam.files <- list.files(getConfig()[["subdirs"]][["TOPHAT"]], "accepted_hits_added.bam$", full.names = TRUE, recursive = TRUE)
  sampleid <- sapply(strsplit(bam.files, "/"), function(x) x[length(x)-1])
  note <- sapply(strsplit(sampleid, "_"), function(x) x[1])  
  dat <- data.frame("Sample ID" = sampleid, "Bam File" = bam.files, "Notes" = note, stringsAsFactors = F)
  write.table(dat, file = paste0(getConfig()[["subdirs"]][["RAWANNOTATIONS"]],"/samplefile"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep = "\t")
  dat
}

#' Create a dict (dictionary) file for the refrence genome required as the input of RNASeQC
#' 
#' This dict file should be within the same folder as the refrence genome
#' @param fasta_file \code{character} the name of the fasta file, must be specified
#' @param picard.path \code{character} specifying an \emph{absolute path} path to the PICARD software directory.
.createGenomeDict=function(picard.path="/home/jdeng/bin/picard-tools-1.110/")
{
  genome.fa <- file.path(getConfig()[["subdirs"]][["iGenome"]], "genome.fa")
  cmd=paste("java -Xmx2048m -Xms2048m -jar ",picard.path,"/CreateSequenceDictionary.jar REFERENCE=",genome.fa," OUTPUT=",gsub(".fa.*",".dict",genome.fa),sep="")
  print(cmd)
  system(cmd)
}

#' Run RNASeQC quality control
#' 
#' Pre-run Checklist
#' 1. Are the contig names consistent betwen BAM, Refrence and GTF file? 
#' 2. Is the BAM indexed? we index the bam files in the prepareBamFiles
#' 3. Is you reference Indexed? We index the refrence genome before runing the tophat
#' 4. Dose the reference genome have a dict file? we generated the dict file in the createGenomeDict
#' 
#' @param rna_seqc.path \code{character} specifying an \emph{absolute path} path to the RNASeQC directory.
#' @param picard.path \code{character} specifying an \emph{absolute path} path to the PICARD software directory.
#' @param paired \code{logical} specify whether data is paired. Defaults to FALSE.
#' @param ncores \code{integer} number of cores for running in parallel
#' @export
qualityRNASeQC = function(picard.path="/home/jdeng/bin/picard-tools-1.110/", ncores=6, rna_seqc.path="/home/jdeng/bin/RNA-SeQC_v1.1.7.jar", paired=FALSE){
  print("Step 1: Prepare bam files")
  genome.fa <- file.path(getConfig()[["subdirs"]][["iGenome"]], "genome.fa")
  bam.files <- list.files(getConfig()[["subdirs"]][["TOPHAT"]], "accepted_hits.bam$", full.names = TRUE, recursive = TRUE)
  mclapply(bam.files, function(s){
    # add read group to bam
    cmd <- paste0("nohup java -Xmx2048m -Xms2048m -jar ", picard.path, "AddOrReplaceReadGroups.jar I=", s, " O=", sub(".bam", "_added.bam", s), " SORT_ORDER=coordinate RGID=1 RGLB=1 RGPL=illumina RGPU=barcode RGSM=1")
    system(cmd)
    # index bam
    cmd <- paste0("samtools index ", sub(".bam", "_added.bam", s))
    system(cmd)
  }, mc.preschedule = F, mc.cores = ncores)
  
  print("Step2: Create Sample file\n")
  .createSampleFile()
  
  print("step3: Greate Genome dict")
  .createGenomeDict(picard.path)
  
  print("step4: Run RNASeQC")
  rnaseqc.dir <- getConfig()[["subdirs"]][["RNASEQC"]]
  genome.gtf <- file.path(getConfig()[["subdirs"]][["iGenome"]],"genes.gtf")
  samplefile <- file.path(getConfig()[["subdirs"]][["RAWANNOTATIONS"]],"samplefile")
  if(paired){
    cmd <- paste0("nohup java -Xmx2048m -Xms2048m -jar ", rna_seqc.path, " -o ", rnaseqc.dir, " -r ", genome.fa, " -s ", samplefile, " -t ", genome.gtf)                        
  }else{
    cmd <- paste0("nohup java -Xmx2048m -Xms2048m -jar ", rna_seqc.path, " -o ", rnaseqc.dir, " -r ", genome.fa, " -s ", samplefile, " -singleEnd -t ", genome.gtf)
  }
  system(cmd)
}


#' Generate QualityControl Matric.
#'
#' calculate qc statistics: nGenesOn, librarySize, alignment percentage to reference genome, exon mapping rate,
#' and per base sequence quality output by FASTQC software
#'
#' @param paired \code{logical} specify whether you have paried reads or not.
#' @export
QualityControl <- function(paired=FALSE){
  countMatrix <- fread(list.files(getConfig()[["subdirs"]][["RSEM"]],pattern="rsem_count_matrix.csv",full.names=TRUE))[ ,-1,with=FALSE]
  sample <- gsub("Aligned.toTranscriptome.out", "", colnames(countMatrix))
  result <- data.table(Sample=sample, nGeneOn = colSums(countMatrix>5))
  ### alignment rate
  bam.dir <- getConfig()[["subdirs"]][["BAM"]]
  rsem.dir <- getConfig()[["subdirs"]][["RSEM"]]
  # starAlignSummary <- list.files(bam.dir, "Log.final.out$", full.names=TRUE, recursive=FALSE)
  rsemAlignSummary <- list.files(rsem.dir, ".cnt$", full.names=TRUE, recursive=TRUE)
  rsemFileNames <- gsub("Aligned.toTranscriptome.out.cnt", "", basename(rsemAlignSummary))
  starFileNames <- gsub("Log.final.out", "", basename(starAlignSummary))
  rsemAlignSummary <- rsemAlignSummary[match(starFileNames, rsemFileNames)]
  res <- t(sapply(seq(starAlignSummary), function(i) {
    print(i)
    starIn <- read.delim(starAlignSummary[i], stringsAsFactors=FALSE)
    libSize <- as.numeric(starIn[4,2])
    alignRate <- (as.numeric(sub("%","",starIn[8,2]))+as.numeric(sub("%","",starIn[23,2])))/100
    rsemIn <- scan(rsemAlignSummary[i], what = "", nlines=1)
    exonRate <- as.numeric(rsemIn[2])/libSize
    cbind(libSize, alignRate, exonRate)}))
  
  res <- data.table(Sample=starFileNames, libSize=res[,1], alignRate=res[,2], exonRate=res[,3])
  result <- merge(result, res, by="Sample")
  ###### Fastqc
  fastqc <- list.files(getConfig()[["subdirs"]][["FASTQC"]], "summary.txt", full.names=TRUE, recursive=TRUE)
  
  ## remove R[12]...fastqc suffix from file name. Initial (.*) causes right hand semantics so non-greedy matches and
  ## captures the fle name without the suffix.
  ## Extra complicated regexp just in case someone has inserted an R1 or R2 in file name before the read pair identifier
  ## added ? after R as it is possible for fastq files to not have the R. (? matches 0 or 1)
  fastqcFileNames <- sub("(.*)_R?[12].*_fastqc$", "\\1", sapply(strsplit(fastqc, "/"), function(x) x[length(x)-1]), perl=TRUE)

   if(!paired){
    res_fastqc <- sapply(fastqc, function(i) read.delim(i, header=FALSE, sep="\t", stringsAsFactors=FALSE)[2,1])
    res_fastqc[res_fastqc == "WARN"] <- "PASS"
    res_fastqc <- data.table(Sample=fastqcFileNames, fastqc=res_fastqc)
    res_fastqc$Sample <- gsub("_fastqc", "", res_fastqc$Sample)
    result <- merge(result, res_fastqc, by="Sample")
  }else{
    res_fastqc <- sapply(fastqc, function(i) read.delim(i, header=FALSE, sep="\t", stringsAsFactors=FALSE)[2,1])
    res_fastqc[res_fastqc == "WARN"] <- "PASS"
    res_fastqc <- data.table(Sample=fastqcFileNames, fastqc=res_fastqc)
    res_fastqc <- res_fastqc[,.SD[,paste(fastqc, collapse="_"), by=Sample]]
    res_fastqc$Sample <- gsub("_fastqc", "", res_fastqc$Sample)
    result <- merge(result, res_fastqc, by="Sample")
  }
  write.csv(result, file=file.path(getConfig()[["subdirs"]][["OUTPUT"]],"quality_control_matrix.csv"), row.names=FALSE)
  result
}



#' Build a genome index for STAR
#'
#' Builds a genome Index at `Utils/Reference_Genome/STARIndex`
#'
#' You must specify the Utils path if it is not already defined, and have your genome and annotation file in a folder titled
#' `Reference_Genome`. This function will construct the genome index using STAR tools.
#' The command line is the default shown in the documentation.
#' A fasta_file must be provided.
#' gtf_file is highly recommended, and if not provided, STAR will run without annotations.
#' @param path \code{character} specifying an \emph{absolute path} path to the Utils directory.
#' @param gtf_file \code{character} the name of the gtf file.
#' @param fasta_file \code{character} the name of the fasta file, must be specified
#' @param star_threads \code{integer} specify how many threads star should use.
#' @param name \code{character} the name of the genome output
#' @export

buildGenomeIndexSTAR = function (path = NULL, gtf_file = "", fasta_file = NULL, star_threads = 1, name=NULL){
  if (is.null(fasta_file)){
    stop("You must provide a fasta_file")
  }
  if (is.null(path) & inherits(try(getConfig()[["subdirs"]][["Utils"]], silent = TRUE), "try-error")) {
    stop("Please specify where to build the reference genome")
  }
  else if (is.null(path)) {
    path <- file.path(getConfig()[["subdirs"]][["Utils"]], "Reference_Genome/STARIndex")
  }
  else {
    subdirs <- getConfig()[["subdirs"]]
    subdirs[["Utils"]] <- path
    assignConfig("subdirs", subdirs)
    path <- file.path(getConfig()[["subdirs"]][["Utils"]], "Reference_Genome/STARIndex")
  }
  if (substr(path, 1, 1) != "/") {
    stop("'path' must be an absolute path")
  }
  dir.create(path, showWarnings=FALSE)
  if (gtf_file == "") {
    gtfopt <- ""
  }
  else {
    gtfopt <- "--sjdbGTFfile"
    gtf.file <- file.path(paste0(getConfig()[["subdirs"]][["Utils"]], "/Reference_Genome/", gtf_file))
  }
  fasta.file <- file.path(paste0(getConfig()[["subdirs"]][["Utils"]], "/Reference_Genome/", fasta_file))
  if (length(fasta.file) > 1) {
    fasta.file <- paste(fasta.file, collapse = ",")
  }
  if (length(list.files(pattern="SAindex", path=path)) == 0) {
    command = paste0("STAR --runThreadN ", star_threads, " --runMode genomeGenerate --genomeDir ", path, " --genomeFastaFiles ", fasta.file, " ", gtfopt, " ", gtf.file)
    system(command)
  }
  else {
    message("Reference Genome Found")
  }
  assignConfig("reference_genome_name", name)
}



#' Use the STAR tool to align the reads 
#'
#' Use the STAR tool to align reads and generate transcriptome bam files
#'
#' Uses STAR to align reads in FASTQ files against the reference genome. Optionally you can specify paired end reads. The code assumes
#' paired reads have fastq files that differ by one character (i.e. sampleA_read1.fastq, sampleA_read2.fastq) and will perform
#' matching of paired fastq files based on that assumption using string edit distance. Read 1 is assumed to be upstream
#' and read 2 is assumed to be downstream.
#' The number of parallel_threads*star_threads should not be more than the number of cores available on your system.
#' @param parallel_threads  specify how many parallel processes to spawn
#' @param star_threads  specify how many threads star should use.
#' @param SJ_num argument for STAR parameter 'limitOutSJcollapsed' which is number of
#' collapsed splice junctions allowed. Must be integer > 0
#' @param paired  specify whether you have paried reads or not.
#' @param force force STAR to align all FASTQ files
#' @param paired_pattern  specify the suffix of the paired-end fastq file names. If not 
#' paired then use only a single suffix.
#' @param fastqPath specify path to FASTQ files if different than default
#' @export
AlignmentSTAR <- function(parallel_threads=1, star_threads=1, SJ_num=5000000, paired=TRUE, 
                          force=FALSE,
                          paired_pattern=c("_1.fastq", "_2.fastq"), fastqPath=""){
  
  ncores<-parallel_threads*star_threads
  if(ncores>parallel::detectCores()){
    stop("The number of parallel_threads*bowite_threads is more than the number of cores detected by detectCores() on the local machine for non-slurm jobs")
  }

  star_dir <- getConfig()[["subdirs"]][["BAM"]]
  done <- stringr::str_replace(list.files(path = star_dir, pattern = "\\Aligned.toTranscriptome.out.bam$"), "Aligned.toTranscriptome.out.bam", "")

  fastq_dir <- ifelse(fastqPath=="",  getConfig()[["subdirs"]][["FASTQ"]], fastqPath)
  if(!file.exists(fastq_dir)) {
    stop(paste(fastq_dir, "doesn't exist\n"))
  }
  
  ## get names of fastq files
  orig <- list.files(fastq_dir, pattern="fastq|fq")
  
  ## check that there are fastq files in directory
  if(length(orig) == 0) {
    stop(paste("No fastq files found in", fastq_dir))
  }
  
  ## extract sample names without suffix
  sufLength <- nchar(paired_pattern[1])
  sampleNames <- unique(substr(orig, start=1, stop=nchar(orig) - (sufLength)))
 
  ## obtain names of fastq files that have not been aligned.
  keep <- sampleNames
  if(!force) {
    keep <- setdiff(sampleNames, done)
    if(length(keep)==0){
      message("All FASTQ files already aligned. Use 'force=TRUE' to force realignment")
      return()
    }
  } ## end if force
  
  index <-  file.path(getConfig()[["subdirs"]][["Utils"]], "Reference_Genome/")
 
  if(!paired){
    
    ## run STAR for all fastq files on workstation
    runSTAR <- function(fastqName, f_dir=fastq_dir, r_path=index,
                        s_threads=star_threads, s_dir=star_dir,
                        p_pattern=paired_pattern, paired=TRUE) {
      
      starCommand<-paste0("cd ", s_dir," && STAR --runThreadN ", star_threads, 
                          " --outFileNamePrefix", fastqName, " --genomeDir ", r_path, 
                          " --readFilesIn", f_dir, "/", fastqName, p_pattern[1],
          " --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM",
          " --limitOutSJcollapsed ", format(SJ_num, scientific=FALSE))
       system(starCommand)
    } ## end runSTAR
    
  } else {
    
    ## run STAR for all fastq files on workstation
    runSTAR <- function(fastqName, f_dir=fastq_dir, r_path=index,
                            s_threads=star_threads, s_dir=star_dir,
                            p_pattern=paired_pattern) {
      
      starCommand <- paste0("cd ", s_dir, " && STAR --runThreadN ", star_threads, 
                             " --outFileNamePrefix ", fastqName, " --genomeDir ", r_path, 
                             " --readFilesIn ", f_dir, "/", fastqName, p_pattern[1], 
                              " ", f_dir, "/", fastqName, p_pattern[2], 
 " --outSAMtype BAM SortedByCoordinate --outReadsUnmapped Fastx --quantMode TranscriptomeSAM",
 " --limitOutSJcollapsed ", format(SJ_num, scientific=FALSE))
      
       system(starCommand)
     } ## end runSTAR
   } ## end if !paired
 
    nothing <- mclapply(keep, runSTAR, mc.cores=parallel_threads)
    
}





#' Sum all read counts for multiple instances of a single gene symbol.
#' 
#' A gene symbol can map to multiple UCSC gene cluster ids so in those
#' instances all read counts mapped to a single gene symbol are summed.
#' The summed read counts are returned in an Expression Set along with
#' the feature data table and optionally an experimental design table.
#' @param counts \code{matrix} table of read counts or tpms. Rows are genes and columns are
#' samples
#' @param fDat \code{data.frame} feature data - rows map to counts rows
#' @param cDat \code{data.frame}. experimental design or phenotype data - rows map to counts columns
#' @return ExpressionSet containing the summed counts data (counts), the feature data which will map to the summed counts data (fDat), and optionally, the experimental data (cDat)
#' @export
sumDuplicates <- function(counts, fDat, cDat=NULL) {

    ## apply basic QC
    if (!is.numeric(as.matrix(counts))) 
        stop("`counts` must be numeric")
    if(!is.null(cDat)) {
        if (ncol(counts) != nrow(cDat)) 
            stop("`cDat` must contain as many rows as `counts` columns")
    }
    if (nrow(counts) != nrow(fDat)) 
        stop("`fDat` must contain as many rows as `counts` rows")
 
    ## sum read counts across identical gene symbols by sample
    sumGenes <- function(x, indy=fInd) {
         sums <- tapply(x, indy, sum, na.rm=TRUE)
    }

    ## extract index of gene symbols
    fInd <- as.character(fDat$gene_symbol)
    
    ## sum read counts of identical gene symbols for each sample. Orders
    ## by gene symbol
    newDat <- apply(counts, 2, sumGenes)

    ## remove duplicate gene symbols
    newfDat <- fDat[!duplicated(fDat$gene_symbol),]
    
    ## order feature data by gene symbol
    newfDat <- newfDat[order(newfDat$gene_symbol),]
    rownames(newfDat) <- newfDat$gene_symbol
   
    ## check that order is correct for both data objects
    if(sum(rownames(newDat) != newfDat$gene_symbol) != 0) {
        stop("sumDuplicates: feature and assay data not in correct order")
    }

    ## construct ExpressionSet for return object
    if(!is.null(cDat)) {
        newEset <- ExpressionSet(assayData=newDat,
                                 phenoData=AnnotatedDataFrame(cDat),
                                 featureData=AnnotatedDataFrame(newfDat))
    } else {
        newEset <- ExpressionSet(assayData=newDat,
                                 featureData=AnnotatedDataFrame(newfDat))
    }
   
    return(newEset)
    
} ## end sumDuplicates



#' Build a reference genome
#' 
#' Builds a reference genome at `path/Reference_Genome/`
#' 
#' You must specify the Utils path if it is not already defined, and have your genome in a folder titled
#' `Reference_Genome`. This function will construct the reference genome using RSEM tools.
#' The command line is the default shown in the documentation.
#' `rsem-prepare-reference --gtf gtf_file --transcript-to-gene-map knownIsoforms.txt --bowtie2 fasta_file name`
#' or if doSTAR=TRUE
#'  `rsem-prepare-reference --gtf gtf_file --transcript-to-gene-map knownIsoforms.txt --star --star-path starPath fasta_file name`
#' If the gtf_file is not give, then the transcript-to-gene-map option is not used either. A fasta_file and a name must be provided.
#' @param path \code{character} specifying an \emph{absolute path} path to the Reference_Genome directory.
#' @param gtf_file \code{character} the name of the gtf file. Empty by default. If specified the function will look for the named file in the 'path' directory
#' @param fasta_file \code{character} the name of the fasta file, must be specified and located in the 'path' directory
#' @param isoformsFile \code{character} the name if the known isoforms file (optional). If specified it must be located in the 'path' file.
#' @param name \code{character} the prefix of the genome output.
#' @param doSTAR \code{logical} logical showing whether to build genome with STAR index.
#' @param starPath \code{character}  full path to the 'star' executable.
#' @param threads \code{integer} number of threads to use
#' @export
buildReference <- function(path=NULL, gtf_file=NULL, fasta_file=NULL, isoformsFile=NULL, name=NULL, doSTAR=TRUE, starPath=NULL, threads=1){
  if(is.null(fasta_file)|is.null(name))
    stop("You must provide a fasta_file  and a genome name")
  if(is.null(path)&inherits(try(getConfig()[["subdirs"]][["Utils"]],silent=TRUE),"try-error")){
    stop("Please specify where to build the reference genome")
  }else if(is.null(path)){
    path = file.path(getConfig()[["subdirs"]][["Utils"]],"Reference_Genome")
  } else{
    subdirs<-getConfig()[["subdirs"]]
    subdirs[["Utils"]]<-path
    assignConfig("subdirs",subdirs) #save the user-provided path
    path = file.path(path,"Reference_Genome")
  }
  if(substr(path, 1, 1) != '/') stop("'path' must be an absolute path")

  ## check that gtf file is provided with known isoforms file
  if(is.null(gtf_file) & !is.null(isoformsFile)) {
     stop("Must provide a .gtf file if knownIsoforms file is provided")
  }

  ## set up gtf and known isoforms arguments if they exist
  gtfopt <- ""
  isoformsOpt <- ""

  if(!is.null(gtf_file)) {
      gtfopt <- "--gtf"
  }

  if(!is.null(isoformsFile)) {
      isoformsOpt<-"--transcript-to-gene-map"
  }

  if(length(fasta_file)>1){
      fasta_file <- paste(fasta_file,collapse=",")
  }

  ## build genome with STAR
  if(doSTAR) {

      ## get path to STAR if not provided
      if(is.null(starPath)) {
          starPath <- dirname(Sys.which("STAR"))
      }
      
      ## check if the reference genome has already been built
      if(length(list.files(pattern="SAindex", path=file.path(getConfig()[["subdirs"]]["Utils"],"Reference_Genome")))==0){
          command = paste("cd ",path," && rsem-prepare-reference ", gtfopt , gtf_file, isoformsOpt, isoformsFile, "--p", threads, "--star --star-path", starPath, fasta_file, name)
          print(command)
          system(command)
      }else{
          message("Reference Genome Found")
      }

  ## bulid genome with RSEM
  } else { 
      
      if(length(list.files(pattern=paste0(name,".chrlist"),path=file.path(getConfig()[["subdirs"]]["Utils"],"Reference_Genome")))==0){
          command = paste0("cd ",path," && rsem-prepare-reference ",gtfopt," ",gtf_file," ",isoformsOpt, " ",isoformsFile, " --bowtie2 ",fasta_file," ",name," ")
          system(command)
      }else{
          message("Reference Genome Found")
      }
  } ## end if doSTAR
      
  #set the reference genome name
  assignConfig("reference_genome_name",name)

} ## end BuildReference



#' Annotate features: map UCSC gene cluster id to gene symbols
#'
#' Annotate features by mapping UCSC gene cluster ids to gene symbols.
#' 
#' @param genome specify the genome and version of
#' annotation
#' @param annotateTable Each row is a transcript and the
#' columns are: transcript ID, Ensemble gene ID, gene symbol, UCSC
#' clusterId. The clusterId column must have 'clusterId' in column name and
#' not be present in any other column.
#' @param force if TRUE force the annoation even if the
#' feature file already exists
#' @export
annotateUCSC <- function(genome="hg38", annotateTable=NULL, force=TRUE) {

    ## get annotation table - passed in or read internal data
    if(is.null(annotateTable)) {
    
       ## read in UCSC annotation file mapping cluster id to gene symbol
        if(genome == "hg38") {
                 annoFile <- ucschg38Table
        } else if(genome == "mm10") {
                 annoFile <- ucscmm10Table
         } else {
            stop(cat("Genome", genome, "is not supported."))
        }
    } else {
        annoFile <- annotateTable
    }
    
   featuredata_outfile <- "rsem_fdata.csv"
    if (!force & file.exists(file.path(getConfig()[["subdirs"]][["RSEM"]], 
        featuredata_outfile))) {
        message("Annotation already done, skipping. Use force=TRUE to rerun.")
        invisible(return(0))
    }
    
    message("Annotating transcripts.")

    ## get experimental annotation file which maps cluster ids to count rows
    rsem_txs_table <- fread(file.path(getConfig()[["subdirs"]][["RSEM"]], 
        "rsem_txs_table.csv"))

    ## rename clusterID column to match clusterId column in rsem_txs_table
    colnames(annoFile)[grep("clusterId", colnames(annoFile))] <- "gene_id"

    ## map UCSC cluster ids from rsem to gene symbols in anno file
    joiny <- plyr::join(rsem_txs_table, annoFile, by="gene_id", type="left",
                        match="first")
    
    ## remove irrelevant columns
    removeCols <- grep("knownGene", colnames(joiny))
    joiny[, removeCols] <- NULL
    joiny$V1 <- NULL
  
    ## make human friendly column labels
    colnames(joiny) <- c("gene_clusterID", "transcript_ids","gene_symbol")
    
    message("Writing feature data to rsem_fdata.csv")
    write.csv(joiny, file = file.path(getConfig()[["subdirs"]][["RSEM"]], featuredata_outfile), row.names = FALSE)
    
} ## end annotateUCSC


#' Map the annotation file to the count and tpm data
#'
#' Match the count, tpm, and feature data to the annotation file. Remove
#' samples in count and tpm that are not found in the annotation file and
#' ensure that the count and tpm columns match the rows of the annoation
#' file. Also attach the results of the quality_control table to the
#' annotation file.
#'
#' @param annotationMatch Column name of annotation file
#' that maps to the column names of the count and tpm data sets (fastq
#' file names with the suffixes removed including paired end identifiers
#' if they exist). Default column name used is 'Sample'. Each row is a
#' sample library and the columns are the  annotation information
#' @param mergeAnnotation If TRUE add annotation information
#' contained in a .csv file in RAW_ANNOTATIONS directory.
#' @return \code{list} list containing the count, tpm, feature, and
#' annotation data. list names are counts, tpms, featureData, annoData.
#'
#' The annotation file, which contains the experimental design information,
#' must be constructed by the user and placed in the RAW_ANNOTATIONS
#' directory. It must be a .csv file and be the only .csv file in the
#' RAW_ANNOTATION directory. By default 'mergeData' will look for a column
#' called 'Sample' that maps each row of the annotation file to the columns
#' of the counts and tpm columns.
#' The quality control matrix is generated by running 'runFASTQ' and then
#' 'QualityControl'
#' @export
mergeData <- function(annotationMatch=NULL, mergeAnnotation=TRUE) {

    ## read feature, counts, and tpm data files - ensure all in same order
    fd <- fread(getConfigFile("RSEM", "rsem_fdata.csv"), data.table=FALSE)
    indy <- order(fd$gene_clusterID)
    count <- fread(getConfigFile("RSEM", "rsem_count_matrix.csv"),
                   data.table=FALSE)[indy, ]
    tpm <- fread(getConfigFile("RSEM", "rsem_tpm_matrix.csv"),
                 data.table=FALSE)[indy, ]
    
    ## double check everyting is ordered correctly
    if(sum(fd$gene_clusterID != count[,1]) != 0 | sum(fd$gene_clusterID != tpm[,1]) != 0) {
        stop("count or tpm matrices are not ordered the same as feature data matrix")
    }

    ## remove gene_id columns
    count <- count[,-1]
    tpm <- tpm[,-1]

    fd <- data.frame(fd)
    rownames(fd) <- rownames(count) <- rownames(tpm) <- fd$gene_ClusterID

    ## add expiremental design information to return object
    anno <- fread(list.files(getConfigDir("RAWANNOTATIONS"), pattern="*.csv$", full.names=TRUE), data.table=TRUE)
    cd <- data.frame(anno)
    
    if(mergeAnnotation) {
         qc <- fread(getConfigFile("OUTPUT", "quality_control_matrix.csv"))

         ## rename column that maps to fastq file prefixes
         if(!is.null(annotationMatch)) {
             setnames(anno, "Sample", annotationMatch)
         }
   
        ## remove samples with descrepancies between fastq and anno
        inter <- intersect(anno$Sample, qc$Sample)
        anno <- anno[Sample %in% inter,]
        qc <- qc[Sample %in% inter,]
        count <- count[,colnames(count) %in% inter]
        tpm <- tpm[,colnames(tpm) %in% inter]
        samples <- colnames(count)

        ## merge
        anno <- merge(anno, qc, by="Sample")
        anno <- anno[match(samples, Sample)]
        cd <- data.frame(wellKey=samples, anno)
        rownames(cd) <- cd$wellKey
        
        ## match sample order with annotation file
        count <- count[,cd$wellKey]
        tpm <- tpm[,cd$wellKey]
        if(sum(colnames(count) == cd$wellKey) != dim(count)[[2]]) {stop("ERROR: colnames don't match")}
        if(sum(colnames(tpm) == cd$wellKey) != dim(tpm)[[2]]) {stop("ERROR: colnames don't match")}
    } ## end merge Annotation
    
    ## return the updated data
    return(list(counts=count, tpms=tpm, featureData=fd, annoData=cd))

} ## end mergeData


#' Obtain directory path for configuration variable.
#' 
#' @param directory configuration label
#'
getDir <- function(directory) {

  ns <- getNamespace("RNASeqPipelineR")
  config <- get("rnaseqpipeliner_configuration", ns)
  if(length(config) == 0) {
    stop("Configuration not found: Have you run loadProject or buildReference (for Utils directory)?")
  }

  if(!(directory %in% names(config$subdirs))) {
     stop(paste(directory, "not found in configuration"))
  }

  return(config$subdirs[directory])
} ## end getDir
RGLab/RNASeqPipelineR documentation built on Jan. 19, 2020, 12:31 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
RGLab/RNASeqPipelineR
Construct Standardized pipelines for processing RNASeq data

R/RNASeqPipelineR.R
In RGLab/RNASeqPipelineR: Construct Standardized pipelines for processing RNASeq data

R Package Documentation

Browse R Packages

We want your feedback!

RGLab/RNASeqPipelineR Construct Standardized pipelines for processing RNASeq data

R/RNASeqPipelineR.R In RGLab/RNASeqPipelineR: Construct Standardized pipelines for processing RNASeq data

R Package Documentation

Browse R Packages

We want your feedback!

RGLab/RNASeqPipelineR
Construct Standardized pipelines for processing RNASeq data

R/RNASeqPipelineR.R
In RGLab/RNASeqPipelineR: Construct Standardized pipelines for processing RNASeq data