# R/step1_preprocess.R

################################################ CALCULATE pij: PROBABILITY A READ IS GENERATED BY A CERTAIN SPECIES ################################################
### Use Poisson probabilities to incorporate the quality of each match.
#' @name generative.prob
#' @aliases generative.prob
#' @aliases generative.prob.nucl
#' @title Compute generative probabilities from BLAST output
NULL
#' @rdname generative.prob
#' @title generative.prob
#' @description generative.prob() computes the probability for a read to be generated by a certain species, given that it originates from this species. The input for this function is the tabular BLAST output format, either default or custom. The function uses the recorded mismatches to produce a Poisson probability.
#' @param blast.output.file This is the tabular BLASTx output format for generative.prob(), while it is the tabular BLASTn output format for generative.prob.nucl(). It can either be the default output format or a specific custom output format incorporating read length and taxon identifier. Please see the vignette for the column order and the exact BLAST command to use. You can also use DIAMOND instead of BLASTx, which is much faster and produces output following the default BLAST tabular format.
#' @param read.length.file This argument can either be a file mapping each read to its length, or a numerical value representing the average read length.
#' @param contig.weight.file This argument can either be a file assigning a weight to each read and contig, or a numerical value (use 1 when no assembly has been performed). For unassembled reads the weight is equal to 1, while for contigs the weight should reflect the number of reads that were assembled into them.
#' @param gi.taxon.file For generative.prob() this would be the 'gi_taxid_prot.dmp' taxonomy file, mapping each protein gi identifier to the corresponding taxon identifier. It can be downloaded from \url{ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.dmp.gz}. For generative.prob.nucl() this would be the 'gi_taxid_nucl.dmp' taxonomy file, mapping each nucleotide gi identifier to the corresponding taxon identifier. It can be downloaded from \url{ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.dmp.gz}.
#' @param protaccession.taxon.file This parameter has been added as NCBI is phasing out the usage of GI identifiers. For generative.prob() this would be the prot.accession2taxid taxonomy file, mapping each protein accession identifier to the corresponding taxon identifier. It can be downloaded from \url{ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz}. I have found it useful to concatenate it with \url{ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/dead_prot.accession2taxid.gz}, so that both files can be searched for the protein identifier (obsolete sequences are sometimes still present in the latest RefSeq releases but not in the taxonomy files, and vice versa; such mismatches can cause loss of information). TODO: add support for nucleotides as well.
#' @param gi.or.prot This parameter specifies whether the user is using the GI identifiers or protein accession identifiers to map to taxon identifiers. Values are 'gi' or 'prot'. The default value is 'prot'.
#' @param gen.prob.unknown User-defined generative probability for the unknown category. The default value for generative.prob() is 1e-06, while for generative.prob.nucl() it is 1e-20.
#' @param outDir Output directory.
#' @param blast.default logical. Is the input in the default BLAST tabular output format? The default value is TRUE, which means the BLAST output file needs to have the following fields: Query id, Subject id, percent identity, alignment length, mismatches, gap openings, query start, query end, subject start, subject end, e-value, bit score. Alternatively, set 'blast.default=FALSE' and provide a custom blast output that has been produced using the option -outfmt '6 qacc qlen sacc slen stitle bitscore length pident evalue staxids'.
#' @return step1: A list with five elements. The first one (pij.sparse.mat) is a sparse matrix with the generative probability between each read and each species. The second (ordered.species) is a data.frame containing all the potential species as recorded by BLAST, ordered by the number of reads. The third one (read.weights) is a data.frame mapping each read/contig to a weight; for contigs this is essentially the number of reads that were used to assemble them, while for unassembled reads the weight is equal to one. The fourth one is the generative probability for the unknown category (gen.prob.unknown), to be used in all subsequent steps. Finally, we also record the output directory (outDir) where the results will be stored.
#' @keywords generative.prob
#' @export generative.prob
#' @import Matrix data.table
#' @importFrom stats ppois
#' @importFrom stats quantile
#' @examples
#' # See vignette for more details
#' 
#' \dontrun{
#' # When using custom BLAST output file
#' step1 <- generative.prob(blast.output.file = "pathtoFile/blastOut.custom", blast.default=FALSE)
#'
#' # When using default BLAST output file
#' step1 <- generative.prob(blast.output.file = "pathtoFile/blastOut.default",
#'                          read.length.file="pathtoFile/read.lengths",
#'                          contig.weight.file="pathtoFile/read.weights",
#'                          gi.taxon.file = "pathtoFile/taxon.file")
#' }
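#'
#' # A hypothetical call using the protein accession to taxon mapping (gi.or.prot="prot");
#' # the file paths below are placeholders, not files shipped with the package
#' \dontrun{
#' step1 <- generative.prob(blast.output.file = "pathtoFile/blastOut.default",
#'                          read.length.file = "pathtoFile/read.lengths",
#'                          contig.weight.file = "pathtoFile/read.weights",
#'                          protaccession.taxon.file = "pathtoFile/prot.accession2taxid",
#'                          gi.or.prot = "prot")
#' }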
##############################################################################################################################################################

generative.prob = function(blast.output.file=NULL, read.length.file=80, contig.weight.file=1, gi.taxon.file=NULL, protaccession.taxon.file=NULL, gi.or.prot="prot", gen.prob.unknown=1e-06, outDir=NULL, blast.default=TRUE){

  if (blast.default){

    if (gi.or.prot=="gi") {
      if (is.null(gi.taxon.file)) {
        stop("Please provide the 'gi_taxid_prot.dmp' file.  This can be downloaded from ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.dmp.gz")
      } else {  
        taxon.prot<-fread(input=gi.taxon.file, sep="\t", header=F)  
        setnames(x=taxon.prot, old=c("proteinID","taxonID" ))
      }
    }

    if (gi.or.prot=="prot") {
      if (is.null(protaccession.taxon.file)) {
        stop("Please provide the 'prot.accession2taxid' file.  This can be downloaded from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz . I have found that it is useful to concatenate it with ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/dead_prot.accession2taxid.gz so you can search in both for the protein identifier, since obsolete sequences can still be present in RefSeq releases. However due to its size you need to increase the memory requirements quite a bit, more than 20G.")
      } else {  
        taxon.prot<-fread(input=protaccession.taxon.file, sep="\t", header=T, select=c(2,3))  
        setnames(x=taxon.prot, old=c("proteinID", "taxonID"))
      }
    }

    
    if (!is.null(blast.output.file)) {

      check.output<-fread(input=blast.output.file, sep="\t", header=F)

      if (ncol(check.output)!=12) {  ###quick check for format
        
        stop("Please check your BLAST output file. You have used the 'blast.default=TRUE' option, therefore your file needs to have the following fields:'Query id, Subject id, % identity, alignment length, mismatches, gap openings, query start, query end, subject start, subject end, e-value, bit score'. \n Alternatively you can use the 'blast.default=FALSE' option, providing a custom blast output that has been produced using the option  -outfmt '6 qacc qlen sacc slen stitle bitscore length pident evalue staxids' ")
      }  else if (ncol(check.output)==12){      
        blast.output<-fread(input=blast.output.file, sep="\t", header=F, select=c(1,3, 4, 12))
        setnames(x=blast.output, old=c("read", "ident", "aln", "bit.score"))           

        if (gi.or.prot=="prot"){
          protID<-fread(input=blast.output.file, sep="|", header=F, select=c(4))
        } else {
          protID<-fread(input=blast.output.file, sep="|", header=F, select=c(2))
        }
        setnames(x=protID, old="proteinID")
        blast.out.gi<-cbind.data.frame(blast.output, protID)
        
      
      } else {
        stop("Please provide the output file from BLASTx. The default tabular format is accepted, using the '-outfmt 6' option in the BLASTx command.")
      }
    }

    if (is.character(read.length.file)) {
      read.length<-fread(input=read.length.file, sep="\t", header=F)
      setnames(x=read.length, old=c("read", "length"))
    } else if (is.numeric(read.length.file)) {
      read.length<-cbind.data.frame("read"=unique(blast.output[["read"]]), "length"=read.length.file)
    } else {
      stop("Please provide either a file containing the sequence lengths for reads and contigs or, if you have not assembled contigs, a numeric average read length value.")
    }


    if (is.character(contig.weight.file)) {
      contig.weights<-fread(input=contig.weight.file, sep="\t", header=F)
      setnames(x=contig.weights, old=c("read", "weight"))
      rownames(contig.weights)<-contig.weights[["read"]]
    } else if (is.numeric(contig.weight.file)) {    
      contig.weights<-cbind.data.frame("read"=unique(blast.output[["read"]]), "weight"=contig.weight.file)
      rownames(contig.weights)<-contig.weights[,"read"]
    } else {
      stop("Please provide either a file containing the weights for reads (=1) and contigs (>1) or, if you have not assembled contigs, enter contig.weight.file=1.")
    }
  
    rm(blast.output)
    gc()

    blast.length<-merge(blast.out.gi, read.length, by="read")

    rm(blast.out.gi)
    gc()
    
    blast.length$mismatch <- (blast.length[["length"]]/3)-(blast.length[["ident"]]* blast.length[["aln"]])/100   ## mismatches over the whole read length, not just the HSP length


    blast.length.weight<-merge(blast.length, contig.weights, by ="read", all.x=T)
    rm(blast.length)
    gc()
    
    indx<-which(is.na(blast.length.weight[["weight"]]))
    blast.length.weight$weight[indx]<-1
    read.weights<- unique(blast.length.weight[,c("read","weight"), with=FALSE])
    rownames(read.weights)<-read.weights[["read"]]

   
    allInfo.temp<-merge(blast.length.weight, taxon.prot, by="proteinID", all.x=T)

    if (length(unique(allInfo.temp[which(is.na(allInfo.temp[["taxonID"]])), get('proteinID')]))!=0){
      message("Some of the proteins in your reference database are not in the 'gi_taxid_prot.dmp' or the 'prot.accession2taxid' file (depending on which you used; if you used 'prot.accession2taxid', it may be worth concatenating it with dead_prot.accession2taxid.gz, see the help page for download details). Therefore these cannot be assigned a taxon identifier. \n We will remove these hits from subsequent analyses. \n For reference the respective identifiers are:  ")
      print(unique(allInfo.temp[which(is.na(allInfo.temp[["taxonID"]])),get('proteinID')]))
      allInfo<-merge(blast.length.weight, taxon.prot, by="proteinID")
    } else {
      allInfo<-allInfo.temp
    }

  
    probPois<-ppois(allInfo[["mismatch"]] - 1, lambda=0.09*((allInfo[["length"]])/3), lower.tail=FALSE)  ### probability of at least that many mismatches arising by chance; length is divided by 3 for the translated read length
    data_0.03<-cbind.data.frame(allInfo, "pij"=probPois)                                 ### combine Poisson prob with other read info.
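    ## Worked illustration with hypothetical numbers: a 150 nt read translates to ~50
    ## amino acids, so lambda = 0.09 * 50 = 4.5; with 2 observed mismatches the pij is
    ## ppois(1, lambda = 4.5, lower.tail = FALSE), i.e. P(>= 2 mismatches by chance).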

    data.dt<- data.table(data_0.03)
    data.grouped<-data.dt[,list(pij=max(pij)),by=c("read",  "weight", "taxonID")]  ### one hit per taxonid (best one)
    pij<-as.data.frame(data.grouped)

    taxonID<-NULL

    
############ use a sparse matrix to "reshape" from long to wide format
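    ## e.g. long-format rows (read1, taxonA, pij=0.9) and (read1, taxonB, pij=0.2) become a
    ## single row of a reads x taxa matrix, with zeroes for taxa a read has no hit against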
    pij.sparse<-with(pij, sparseMatrix(i = as.numeric(factor(get('read'))), j=as.numeric((factor(taxonID))), x=pij,  dimnames=list(levels(factor(get('read'))), levels(factor(taxonID)))))

    pij.dt<-data.table(pij)
    allspecies.dt<-pij.dt[,list(count.reads=sum(get('weight'))), by="taxonID"]
  
    allspecies<-as.data.frame(allspecies.dt)
    ordered.species<-allspecies[order(-allspecies[,2]) , ]                      #### order them by read count 

### add unknown bin and assign a pij
    pij.sparse.mat<-cbind(pij.sparse, gen.prob.unknown)
    colnames(pij.sparse.mat)<-c(colnames(pij.sparse), "unknown")
    
    if (length(which(!(rownames(read.weights)%in%rownames(pij.sparse.mat))))!=0) {
      message("The following reads match only proteins that are no longer supported, i.e. their GIs are not in 'gi_taxid_prot.dmp' or their protein accessions are not in 'prot.accession2taxid' (depending on which you used; if you used 'prot.accession2taxid', it may be worth concatenating it with dead_prot.accession2taxid.gz, see the help page for download details). \n Therefore these reads were removed from subsequent analyses as they could be misclassified.")
      print(rownames(read.weights[which(!(rownames(read.weights)%in%rownames(pij.sparse.mat))),]))
      read.weights<-read.weights[which(rownames(read.weights)%in%rownames(pij.sparse.mat)),]
      read.weights<-data.frame(read.weights,  row.names=read.weights[["read"]] )
    } else {
      read.weights<-data.frame(read.weights,  row.names=read.weights[["read"]] )
    }
  
  
### sampling weight to use when choosing which species to add during MCMC
    uniqReads<-dim(pij.sparse.mat)[[1]]
    ordered.species$sampling.weight<-ordered.species[,2]/uniqReads

###Flattening the sampling probabilities
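    ## i.e. cap sampling weights at the 80th percentile and lift them to at least the 20th,
    ## so neither very abundant nor very rare species dominate the species-addition proposals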
    percentiles<-quantile(ordered.species$sampling.weight,  probs=c(0.2, 0.8))
    ordered.species$sampling.weight[which(ordered.species$sampling.weight  >= percentiles["80%"])] <- percentiles["80%"]
    ordered.species$sampling.weight[which(ordered.species$sampling.weight  <= percentiles["20%"])] <- percentiles["20%"]
    colnames(ordered.species)<-c("taxonID", "count.reads", "sampling.weight")

 
### remove objects
    
    step1<-list("ordered.species"=ordered.species,"pij.sparse.mat"=pij.sparse.mat, "read.weights"=read.weights, "outDir"=outDir, "gen.prob.unknown"=gen.prob.unknown)

    if (!is.null(outDir)) {
      step1.name <- paste(outDir, "/step1.RData", sep = "")
      save(step1, file=step1.name)
      rm(list= ls()[!ls() %in% c("step1")])
      gc()
    } else {
      rm(list= ls()[!ls() %in% c("step1")])
      gc()
    }

  
  
    return(step1)

    
  } else {  ### not the default format: custom blast output that already incorporates the query length and the taxon id

    if (!is.null(blast.output.file)) {

      check.output<-fread(input=blast.output.file, sep="\t", header=F)
      if (ncol(check.output)!=10) {  ###quick check for format
        stop("Please check your blast output file. The custom blast tabular format we accept as input has the following 10 columns: 'qacc qlen sacc slen stitle bitscore length pident evalue staxids'. Alternatively you can use the default blast tabular output along with the 'blast.default=TRUE' option. You will also need to provide the 'gi_taxid_prot.dmp' or the 'prot.accession2taxid' file. The first can be downloaded from ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.dmp.gz and the latter from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz")

      } else if (ncol(check.output)==10) {
        blast.output.length.temp<-fread(input=blast.output.file, sep="\t", header=F, select=c(1,2, 7, 8, 10))
        setnames(x=blast.output.length.temp, old=c("read", "length",  "aln", "ident", "taxonID"))

      } else {
        stop("Please provide the output file from BLASTx. The default tabular format is accepted, using the '-outfmt 6' flag in the BLASTx command.")
      }
    }

    taxonID<-NULL
    read<-NULL
    length<-NULL
    aln<-NULL
    ident<-NULL
    
    if ( is.character(blast.output.length.temp[,taxonID])){
    
      blast.output.length.temp2 <- blast.output.length.temp[, list(taxonID = as.integer(unlist(strsplit(taxonID, ";")))), by=list(read, length, aln, ident)]  ## handle semicolon-separated taxon IDs and NAs
      blast.output.length<- blast.output.length.temp2[which(!is.na(blast.output.length.temp2[,taxonID])),]

    }  else {
      blast.output.length<-blast.output.length.temp
    
    }

    if (!is.integer(blast.output.length[["taxonID"]])) {
      stop("Your blast output does not have correctly formatted taxon identifiers. Please use the 'blast.default=TRUE' option, providing the default blast tabular format (using the blast option '-outfmt 6') and provide separately the 'gi_taxid_prot.dmp' file.")
    }
  

    if (is.character(contig.weight.file)) {
      contig.weights<-fread(input=contig.weight.file, sep="\t", header=F)
      setnames(x=contig.weights, old=c("read", "weight"))
      rownames(contig.weights)<-contig.weights[["read"]]
    } else if (is.numeric(contig.weight.file)) {    
      contig.weights<-cbind.data.frame("read"=unique(blast.output.length[["read"]]), "weight"=contig.weight.file)
      rownames(contig.weights)<-contig.weights[,"read"]
    } else {
      stop("Please provide either a file containing the weights for reads (=1) and contigs (>1) or, if you have not assembled contigs, enter contig.weight.file=1.")
    }


    
    blast.output.length$mismatch <- (blast.output.length[["length"]]/3)-(blast.output.length[["ident"]]* blast.output.length[["aln"]])/100    ## mismatches over the whole read length, not just the HSP length

    blast.length.weight<-merge(blast.output.length, contig.weights, by ="read", all.x=T)

    rm(blast.output.length)
    gc()
    
    indx<-which(is.na(blast.length.weight[["weight"]]))
    blast.length.weight$weight[indx]<-1
    
    read.weights<- unique(blast.length.weight[,c("read","weight"), with=FALSE])
    rownames(read.weights)<-read.weights[["read"]]
    
    allInfo<-blast.length.weight
    probPois<-ppois(allInfo[["mismatch"]] - 1, lambda=0.09*((allInfo[["length"]])/3), lower.tail=FALSE)  ### probability of at least that many mismatches arising by chance; length is divided by 3 for the translated read length
    data_0.03<-cbind.data.frame(allInfo, "pij"=probPois)                                 ### combine Poisson prob with other read info.

    data.dt<- data.table(data_0.03)
    data.grouped<-data.dt[,list(pij=max(pij)),by=c("read",  "weight", "taxonID")]  ### one hit per taxonid (best one)
    pij<-as.data.frame(data.grouped)
    
############ use a sparse matrix to "reshape" from long to wide format
    pij.sparse<-with(pij, sparseMatrix(i = as.numeric(factor(get('read'))), j=as.numeric((factor(taxonID))), x=pij,  dimnames=list(levels(factor(get('read'))), levels(factor(taxonID)))))

    pij.dt<-data.table(pij)
    allspecies.dt<-pij.dt[,list(count.reads=sum(get('weight'))), by="taxonID"]
  
    allspecies<-as.data.frame(allspecies.dt)
    ordered.species<-allspecies[order(-allspecies[,2]) , ]                      #### order them by read count 

### add unknown bin and assign a pij
    pij.sparse.mat<-cbind(pij.sparse, gen.prob.unknown)
    colnames(pij.sparse.mat)<-c(colnames(pij.sparse), "unknown")
    
    if (length(which(!(rownames(read.weights)%in%rownames(pij.sparse.mat))))!=0) {
#      message("The following reads match only proteins that are no longer supported, i.e their gis are not in 'gi_taxon_prot.dmp'. \n Therefore the reads were removed from subsequent analyses.")
      print(rownames(read.weights[which(!(rownames(read.weights)%in%rownames(pij.sparse.mat))),]))
      read.weights<-read.weights[which(rownames(read.weights)%in%rownames(pij.sparse.mat)),]
      read.weights<-data.frame(read.weights,  row.names=read.weights[["read"]] )
    } else {
      read.weights<-data.frame(read.weights,  row.names=read.weights[["read"]] )

    }
  
  
### sampling weight to use when choosing which species to add during MCMC
    uniqReads<-dim(pij.sparse.mat)[[1]]
    ordered.species$sampling.weight<-ordered.species[,2]/uniqReads

###Flattening the sampling probabilities
    percentiles<-quantile(ordered.species$sampling.weight,  probs=c(0.2, 0.8))
    ordered.species$sampling.weight[which(ordered.species$sampling.weight  >= percentiles["80%"])] <- percentiles["80%"]
    ordered.species$sampling.weight[which(ordered.species$sampling.weight  <= percentiles["20%"])] <- percentiles["20%"]
    colnames(ordered.species)<-c("taxonID", "count.reads", "sampling.weight")

 
### remove objects
    
    step1<-list("ordered.species"=ordered.species,"pij.sparse.mat"=pij.sparse.mat, "read.weights"=read.weights, "outDir"=outDir, "gen.prob.unknown"=gen.prob.unknown)

    if (!is.null(outDir)) {
      step1.name <- paste(outDir, "/step1.RData", sep = "")
      save(step1, file=step1.name)
      rm(list= ls()[!ls() %in% c("step1")])
      gc()
    } else {
      rm(list= ls()[!ls() %in% c("step1")])
      gc()
    }

  
  

    return(step1)

    
    
  }
  
}


#' @rdname generative.prob
#' @title generative.prob.nucl
#' @description generative.prob.nucl() is used when we have nucleotide similarity, i.e. when we have performed BLASTn.
#' @param genomeLength This is applicable only for generative.prob.nucl(). It is a file mapping each genome/nucleotide to its respective length. The file must be tab separated, with the first column the nucleotide gi identifier (integer) and the second column the corresponding sequence length (integer). It will be used to correct the Poisson probabilities between each read and genome.
#' @keywords generative.prob.nucl
#' @export generative.prob.nucl
#' @import Matrix data.table
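#' @examples
#' # See vignette for more details
#'
#' # A minimal sketch for nucleotide (BLASTn) input; the file paths are hypothetical placeholders
#' \dontrun{
#' step1 <- generative.prob.nucl(blast.output.file = "pathtoFile/blastnOut.default",
#'                               read.length.file = "pathtoFile/read.lengths",
#'                               contig.weight.file = "pathtoFile/read.weights",
#'                               gi.taxon.file = "pathtoFile/gi_taxid_nucl.dmp",
#'                               genomeLength = "pathtoFile/genome.lengths")
#' }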
##############################################################################################################################################################
generative.prob.nucl = function(blast.output.file=NULL, read.length.file=80, contig.weight.file=1, gi.taxon.file,  gen.prob.unknown=1e-20, outDir=NULL, genomeLength=NULL, blast.default=TRUE){


  if (blast.default){
    
    if (!is.null(blast.output.file)) {

      check.output<-fread(input=blast.output.file, sep="\t", header=F)

      if (ncol(check.output)!=12) {  ###quick check for format
        
        stop("Please check your BLAST output file. You have used the 'blast.default=TRUE' option, therefore your file needs to have the following fields:'Query id, Subject id, % identity, alignment length, mismatches, gap openings, query start, query end, subject start, subject end, e-value, bit score'. \n Alternatively you can use the 'blast.default=FALSE' option, providing a custom blast output that has been produced using the option  -outfmt '6 qacc qlen sacc slen stitle bitscore length pident evalue staxids' ")
      }  else if (ncol(check.output)==12){      
        blast.output<-fread(input=blast.output.file, sep="\t", header=F, select=c(1,3, 4, 5))
        setnames(x=blast.output, old=c("read", "ident", "aln", "mismatch"))           

        nuclID<-fread(input=blast.output.file, sep="|", header=F, select=c(2))
        setnames(x=nuclID, old="nucleotideID")
        blast.out.gi<-cbind.data.frame(blast.output, nuclID)
        
      
      } else {
        stop("Please provide the output file from BLASTn. The default tabular format is accepted, using the '-outfmt 6' option in the BLASTn command.")
      }
    }

   
    if (is.character(read.length.file)) {
      read.length<-fread(input=read.length.file, sep="\t", header=F)
      setnames(x=read.length, old=c("read", "length"))
    } else if (is.numeric(read.length.file)) {    
      read.length<-cbind.data.frame("read"=unique(blast.output[["read"]]), "length"=read.length.file)
    } else {
      stop("Please provide either a file containing the sequence lengths for reads and contigs or, if you have not assembled contigs, a numeric average read length value.")
    }


    if (is.character(contig.weight.file)) {
      contig.weights<-fread(input=contig.weight.file, sep="\t", header=F)
      setnames(x=contig.weights, old=c("read", "weight"))
      rownames(contig.weights)<-contig.weights[["read"]]
    } else if (is.numeric(contig.weight.file)) {    
      contig.weights<-cbind.data.frame("read"=unique(blast.output[["read"]]), "weight"=contig.weight.file)
      rownames(contig.weights)<-contig.weights[,"read"]
    } else {
      stop("Please provide either a file containing the weights for reads (=1) and contigs (>1) or, if you have not assembled contigs, enter contig.weight.file=1.")
    }


    message("Mapping nucleotide GI identifiers to taxon identifiers. This could take a couple of minutes.")
    if (is.null(gi.taxon.file)) {
      stop("Please provide the 'gi_taxid_nucl.dmp' file. It can be downloaded from ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.dmp.gz")
    } else {  
      taxon.nucl<-fread(input=gi.taxon.file, sep="\t", header=F)  
      setnames(x=taxon.nucl, old=c("nucleotideID","taxonID" ))
    }

    if (is.null(genomeLength)) {
      stop("Please provide the file of genome lengths. The first column should be the nucleotide GI identifier and the second column the sequence length.")
    } else {  
      genome.length<-fread(input=genomeLength, sep="\t", header=F)  
      setnames(x=genome.length, old=c("nucleotideID","genome.length" ))  
      genome.length$genome.length<-as.integer(genome.length$genome.length)
      genome.length$nucleotideID<-as.integer(genome.length$nucleotideID)
      genome.length<-unique(genome.length)
    }
  
 
################### poisson prob with  lambda=3 mismatches per 100 nucleotides

    blast.length<-merge(blast.out.gi, read.length, by="read")


    mismatchNew<-ifelse(blast.length[["length"]]>=200, blast.length[["mismatch"]], blast.length[["length"]]-(blast.length[["ident"]]* blast.length[["aln"]])/100)
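    ## for sequences of >= 200 nt keep BLAST's recorded (HSP-based) mismatch count;
    ## for shorter reads count mismatches over the whole read length instead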

    blast.length$mismatch<-mismatchNew

    
    
    blast.length.weight<-merge(blast.length, contig.weights, by ="read", all.x=T)
    indx<-which(is.na(blast.length.weight[["weight"]]))
    blast.length.weight$weight[indx]<-1
    read.weights<- unique(blast.length.weight[,c("read","weight"), with=FALSE])
    rownames(read.weights)<-read.weights[["read"]]
  
  
    allInfo.temp<-merge(blast.length.weight, taxon.nucl, by="nucleotideID", all.x=T)

    if (length(unique(allInfo.temp[which(is.na(allInfo.temp[["taxonID"]])),get('nucleotideID')]))!=0){
      message("Some of the nucleotides in your reference database are not in the 'gi_taxid_nucl.dmp' file. Therefore these cannot be assigned a taxon identifier. \n We will remove these hits from subsequent analyses. \n For reference the respective GIs are:  ")
      print(unique(allInfo.temp[which(is.na(allInfo.temp[["taxonID"]])),get('nucleotideID')]))
      allInfo<-merge(blast.length.weight, taxon.nucl, by="nucleotideID")
    } else {
      allInfo<-allInfo.temp
    }
    

  
    probPois<-ppois(allInfo[["mismatch"]] - 1, lambda=0.03*(allInfo[["length"]]), lower.tail=FALSE)  ### probability of at least that many mismatches arising by chance (lambda = 3 mismatches per 100 nucleotides)
    data_0.03<-cbind.data.frame(allInfo, "pois"=probPois)                                 ### combine Poisson prob with other read info.
    data_0.03.dt<- data.table(data_0.03)
    data.dt<-merge(data_0.03.dt, genome.length, by="nucleotideID")
    data.dt$pij<-data.dt[["pois"]]/data.dt[["genome.length"]]
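    ## divide the Poisson probability by genome length (supplied via 'genomeLength'),
    ## correcting for the extra chance hits that longer genomes attract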
  
    data.grouped<-data.dt[,list(pij=max(pij)),by=c("read",  "weight", "taxonID")]  ### one hit per taxonid (best one)
    pij<-as.data.frame(data.grouped)


    taxonID<-NULL

############ use a sparse matrix to "reshape" from long to wide format
    pij.sparse<-with(pij, sparseMatrix(i = as.numeric(factor(get('read'))), j=as.numeric((factor(taxonID))), x=pij,  dimnames=list(levels(factor(get('read'))), levels(factor(taxonID)))))
    
    pij.dt<-data.table(pij)
    allspecies.dt<-pij.dt[,list(count.reads=length(get('read'))), by="taxonID"]
    allspecies<-as.data.frame(allspecies.dt)
    ordered.species<-allspecies[order(-allspecies[,2]) , ]                      #### order them by read count 

### add unknown bin and assign a pij
    pij.sparse.mat<-cbind(pij.sparse, gen.prob.unknown)
    colnames(pij.sparse.mat)<-c(colnames(pij.sparse), "unknown")

    if (length(which(!(rownames(read.weights)%in%rownames(pij.sparse.mat))))!=0) {
#    message("The following reads match only nucleotides that are no longer supported, i.e their gis are not in", gi.taxon.file, "\n Therefore the reads were removed from subsequent analyses.")
      print(rownames(read.weights[which(!(rownames(read.weights)%in%rownames(pij.sparse.mat))),]))
      read.weights<-read.weights[which(rownames(read.weights)%in%rownames(pij.sparse.mat)),]
      read.weights<-data.frame(read.weights, row.names=read.weights[["read"]] )
    } else {
      read.weights<-data.frame(read.weights,  row.names=read.weights[["read"]] )

    }
  
  
### sampling weight to use when choosing which species to add during MCMC
    uniqReads<-dim(pij.sparse.mat)[[1]]
    ordered.species$sampling.weight<-ordered.species[,2]/uniqReads

###Flattening the sampling probabilities
    percentiles<-quantile(ordered.species$sampling.weight,  probs=c(0.2, 0.8))
    ordered.species$sampling.weight[which(ordered.species$sampling.weight  >= percentiles["80%"])] <- percentiles["80%"]
    ordered.species$sampling.weight[which(ordered.species$sampling.weight  <= percentiles["20%"])] <- percentiles["20%"]
    colnames(ordered.species)<-c("taxonID", "count.reads", "sampling.weight")

 
### remove objects

    step1<-list("ordered.species"=ordered.species,"pij.sparse.mat"=pij.sparse.mat, "read.weights"=read.weights, "outDir"=outDir, "gen.prob.unknown"=gen.prob.unknown)

    if (!is.null(outDir)) {
      step1.name <- paste(outDir, "/step1.RData", sep = "")
      save(step1, file=step1.name)
      rm(list= ls()[!ls() %in% c("step1")])
      gc()
    } else {
      rm(list= ls()[!ls() %in% c("step1")])
      gc()
    }
  
    return(step1)
    
        
  } else {  ################ not the default format: custom blast output that already incorporates the query length and the taxon id

    if (!is.null(blast.output.file)) {

      check.output<-fread(input=blast.output.file, sep="\t", header=F)

      if (ncol(check.output)!=10) {  ###quick check for format
        stop("Please check your blast output file. The custom blast tabular format we accept as input has the following 10 columns: 'qacc qlen sacc slen stitle bitscore length pident evalue staxids'. Alternatively you can use the default blast tabular output along with the 'blast.default=TRUE' option. You will also need to provide the 'gi_taxid_nucl.dmp' file, which can be downloaded from ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.dmp.gz")
        
      }  else if (ncol(check.output)==10){

        
        blast.output.length.temp<-fread(input=blast.output.file, sep="\t", header=F, select=c(1,2, 4, 7, 8, 10))
        setnames(x=blast.output.length.temp, old=c("read", "length", "genome.length",  "aln",  "ident", "taxonID"))
        
      } else {
        stop("Please provide the output file from BLASTn. The default tabular format is accepted, using the '-outfmt 6' option in the BLASTn command.")
      }
    }

    taxonID<-NULL
    read<-NULL
    length<-NULL
    aln<-NULL
    ident<-NULL
    genome.length<-NULL


    if ( is.character(blast.output.length.temp[,taxonID])){
    
      blast.output.length.temp2 <- blast.output.length.temp[, list(taxonID = as.integer(unlist(strsplit(taxonID, ";")))), by=list(read, length, genome.length, aln, ident)]  ## handle semicolon-separated taxon IDs and NAs
      blast.output.length<- blast.output.length.temp2[which(!is.na(blast.output.length.temp2[,taxonID])),]

    }  else {
      blast.output.length<-blast.output.length.temp
    
    }

    
    
    if (!is.integer(blast.output.length[["taxonID"]])) {
      stop("Your blast output does not have correctly formatted taxon identifiers. Please use the 'blast.default=TRUE' option, providing the default blast tabular format (using the blast option '-outfmt 6') and provide separately the 'gi_taxid_nucl.dmp' file.")
    }
      
    if (is.character(contig.weight.file)) {
      contig.weights<-fread(input=contig.weight.file, sep="\t", header=F)
      setnames(x=contig.weights, old=c("read", "weight"))
      rownames(contig.weights)<-contig.weights[["read"]]
    } else if (is.numeric(contig.weight.file)) {    
      contig.weights<-cbind.data.frame("read"=unique(blast.output.length[["read"]]), "weight"=contig.weight.file)
      rownames(contig.weights)<-contig.weights[,"read"]
    } else {
      stop("Please provide either a file containing the weights for reads (=1) and contigs (>1) or, if you have not assembled contigs, enter contig.weight.file=1.")
    }
        

    blast.output.length$mismatch<-ifelse(blast.output.length[["length"]]>=200, (1-blast.output.length[["ident"]]/100)*blast.output.length[["aln"]], blast.output.length[["length"]]-(blast.output.length[["ident"]]* blast.output.length[["aln"]])/100)
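    ## the custom format has no mismatch column: for sequences of >= 200 nt approximate the
    ## mismatches from percent identity within the alignment, otherwise count them over the
    ## whole read length (mirroring the default-format branch above)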

#    blast.output.length$mismatch<-mismatchNew
    
    blast.length.weight<-merge(blast.output.length, contig.weights, by ="read", all.x=T)
    indx<-which(is.na(blast.length.weight[["weight"]]))
    blast.length.weight$weight[indx]<-1
    read.weights<- unique(blast.length.weight[,c("read","weight"), with=FALSE])
    rownames(read.weights)<-read.weights[["read"]]

################### poisson prob with  lambda=3 mismatches per 100 nucleotides

    allInfo<-blast.length.weight
    probPois<-ppois(allInfo[["mismatch"]] - 1, lambda=0.03*(allInfo[["length"]]), lower.tail=FALSE)  ### probability of at least that many mismatches arising by chance (lambda = 3 mismatches per 100 nucleotides)
      data_0.03<-cbind.data.frame(allInfo, "pois"=probPois)                                 ### combine Poisson prob with other read info.
      data.dt<- data.table(data_0.03)
     # data.dt<-merge(data_0.03.dt, genome.length, by="nucleotideID")
      data.dt$pij<-data.dt[["pois"]]/data.dt[["genome.length"]]
      
      data.grouped<-data.dt[,list(pij=max(pij)),by=c("read",  "weight", "taxonID")]  ### one hit per taxonid (best one)
      pij<-as.data.frame(data.grouped)

############ use a sparse matrix to "reshape" from long to wide format
      pij.sparse<-with(pij, sparseMatrix(i = as.numeric(factor(get('read'))), j=as.numeric((factor(taxonID))), x=pij,  dimnames=list(levels(factor(get('read'))), levels(factor(taxonID)))))
      
      pij.dt<-data.table(pij)
      allspecies.dt<-pij.dt[,list(count.reads=length(get('read'))), by="taxonID"]
      allspecies<-as.data.frame(allspecies.dt)
      ordered.species<-allspecies[order(-allspecies[,2]) , ]                      #### order them by read count 

### add unknown bin and assign a pij
      pij.sparse.mat<-cbind(pij.sparse, gen.prob.unknown)
      colnames(pij.sparse.mat)<-c(colnames(pij.sparse), "unknown")

      if (length(which(!(rownames(read.weights)%in%rownames(pij.sparse.mat))))!=0) {
#    message("The following reads match only nucleotides that are no longer supported, i.e their gis are not in", gi.taxon.file, "\n Therefore the reads were removed from subsequent analyses.")
        print(rownames(read.weights[which(!(rownames(read.weights)%in%rownames(pij.sparse.mat))),]))
        read.weights<-read.weights[which(rownames(read.weights)%in%rownames(pij.sparse.mat)),]
        read.weights<-data.frame(read.weights, row.names=read.weights[["read"]] )
      } else {
        read.weights<-data.frame(read.weights,  row.names=read.weights[["read"]] )
      }
  
  
### sampling weight to use when choosing which species to add during MCMC
      uniqReads<-dim(pij.sparse.mat)[[1]]
      ordered.species$sampling.weight<-ordered.species[,2]/uniqReads

###Flattening the sampling probabilities
      percentiles<-quantile(ordered.species$sampling.weight,  probs=c(0.2, 0.8))
      ordered.species$sampling.weight[which(ordered.species$sampling.weight  >= percentiles["80%"])] <- percentiles["80%"]
      ordered.species$sampling.weight[which(ordered.species$sampling.weight  <= percentiles["20%"])] <- percentiles["20%"]
      colnames(ordered.species)<-c("taxonID", "count.reads", "sampling.weight")

 
### remove objects

      step1<-list("ordered.species"=ordered.species,"pij.sparse.mat"=pij.sparse.mat, "read.weights"=read.weights, "outDir"=outDir, "gen.prob.unknown"=gen.prob.unknown)

      if (!is.null(outDir)) {
        step1.name <- paste(outDir, "/step1.RData", sep = "")
        save(step1, file=step1.name)
        rm(list= ls()[!ls() %in% c("step1")])
        gc()
      } else {
        rm(list= ls()[!ls() %in% c("step1")])
        gc()
      }

  

      return(step1)

  }

}
    
 
