R/hello.R

Defines functions seqCompile SeqAnnotate Consensuspredict MiRNASpeciesAnnot MiRNAname MISeed MIQuerySeq MIRNATargetpredict IUPAC_Boolean TFpredict VariantSort ChromLabel ChromSeqConvert IUPAC_ScoreR SpeciesTFCons TFRankR

Documented in ChromLabel ChromSeqConvert Consensuspredict IUPAC_Boolean IUPAC_ScoreR MIQuerySeq MiRNAname MiRNASpeciesAnnot MIRNATargetpredict MISeed SeqAnnotate seqCompile SpeciesTFCons TFpredict TFRankR VariantSort

#' seqCompile: sequence compile function
#'
#' Compiles the sequences stored in FASTA files into a single data frame.
#' For type = "mRNA" or type = "protein", every file selected by \code{files}
#' from the listing of \code{direct} is read and the records are stacked.
#' For type = "miRNA", \code{files} is the FASTA file to read.
#'
#' Record identifiers are taken from the FASTA record names.  For mRNA and
#' protein input everything from the first "." onwards and everything from
#' the first space onwards is removed; for miRNA input only the part from the
#' first space onwards is removed.  The scientific name is the file name up
#' to its first ".".  Scientific names that are not in the built-in lookup
#' table receive the literal string "NA" as common name.
#'
#' The working directory is switched to \code{direct} while the files are read
#' and is always restored when the function exits, including on error.
#'
#' @param files if type = "mRNA" or "protein", a vector of numbers indicating
#' which entries of the listing of \code{direct} to compile.  If
#' type = "miRNA", the FASTA file to convert (relative paths are resolved
#' against \code{direct}).
#' @param type a single character string, either "mRNA", "protein", or "miRNA".
#' @param direct a character string giving the path to the directory that
#' contains the FASTA files.
#' @param miRNA_type used only when type = "miRNA".  A single character string
#' indicating the type of miRNA: either "MATURE" or "IMMATURE_HAIR_PIN".
#' @return If type = "mRNA", a data frame with the columns "Sequence",
#' "ensembl_transcript_id", "Species_File", "Scientific_Name", "Common_Name".
#' If type = "protein", a data frame with the columns "Sequence",
#' "ensembl_peptide_id", "Species_File", "Scientific_Name", "Common_Name".
#' If type = "miRNA", a data frame with the columns "Sequence", "miRNA_Name",
#' "miRNA_type".  Any other \code{type} gives a warning and an invisible
#' \code{NULL}.
#' @author Brendan Gongol
#' @importFrom Biostrings readAAStringSet
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @export
#' @examples
#' \dontrun{
#' fasta_dir <- "path/to/cDNA_transcriptomes"
#' seqDT1 <- seqCompile(files = 1:2, direct = fasta_dir, type = "mRNA")
#' seqDT1 <- seqCompile(files = seq_along(dir(fasta_dir)), direct = fasta_dir,
#'                      type = "mRNA")
#' head(seqDT1)
#' tail(seqDT1)
#' dim(seqDT1)
#' }

seqCompile <- function(files = 1:5, direct = getwd(), type,
                       miRNA_type = "MATURE") {
  if (!is.character(type) || length(type) != 1L || is.na(type)) {
    stop("'type' must be a single character string: \"mRNA\", \"protein\" ",
         "or \"miRNA\".", call. = FALSE)
  }

  # Relative paths (the directory listing, and a relative miRNA `files`) are
  # resolved against `direct`.  on.exit() restores the caller's working
  # directory on every exit path, including errors raised while reading.
  old_dir <- setwd(direct)
  on.exit(setwd(old_dir), add = TRUE)

  if (type == "miRNA") {
    return(.seqCompile_mirna(files, miRNA_type))
  }

  # switch() yields NULL when `type` matches none of the alternatives.
  id_column <- switch(type,
    mRNA = "ensembl_transcript_id",
    protein = "ensembl_peptide_id"
  )
  if (is.null(id_column)) {
    warning("Unsupported 'type': \"", type, "\"; returning NULL.",
            call. = FALSE)
    return(invisible(NULL))
  }

  .seqCompile_species(files, id_column)
}

# Reads the FASTA files at positions `files` of the current directory listing
# and stacks them into one data frame whose identifier column is `id_column`.
.seqCompile_species <- function(files, id_column) {
  file_names <- dir()
  n_files <- length(files)
  per_file <- vector("list", n_files)

  # txtProgressBar() rejects max == min, so only create it when needed.
  if (n_files > 0L) {
    pb <- txtProgressBar(min = 0, max = n_files, style = 3)
    on.exit(close(pb), add = TRUE)
  }

  for (k in seq_along(files)) {
    fasta_name <- file_names[files[k]]
    records <- readAAStringSet(fasta_name, "fasta")

    ids <- sub("\\..*", "", names(records)) # drop everything after first "."
    ids <- sub(" .*", "", ids)              # drop everything after first space

    chunk <- data.frame(
      unname(as.character(records)), # converts the whole set at once
      ids,
      rep(fasta_name, length(ids)),
      stringsAsFactors = FALSE
    )
    names(chunk) <- c("Sequence", id_column, "Species_File")
    per_file[[k]] <- chunk

    # Advance by loop position, not by file index, so the bar stays in range.
    setTxtProgressBar(pb, k)
  }

  compiled <- do.call(rbind, per_file)
  if (is.null(compiled)) {
    # No files requested: return an empty table with the documented columns.
    compiled <- data.frame(character(0), character(0), character(0),
                           stringsAsFactors = FALSE)
    names(compiled) <- c("Sequence", id_column, "Species_File")
  }

  compiled$Scientific_Name <- sub("\\..*", "", compiled$Species_File)
  compiled$Common_Name <- .seqCompile_common_name(compiled$Scientific_Name)
  compiled
}

# Reads one miRNA FASTA file into a data frame and tags every record with
# `miRNA_type`.
.seqCompile_mirna <- function(fasta_file, miRNA_type) {
  records <- readAAStringSet(fasta_file)
  mirna_names <- sub(" .*", "", names(records)) # drop text after first space

  compiled <- data.frame(
    Sequence = unname(as.character(records)),
    miRNA_Name = mirna_names,
    stringsAsFactors = FALSE
  )
  if (nrow(compiled) > 0L) {
    compiled$miRNA_type <- miRNA_type
  } else {
    compiled$miRNA_type <- character(0)
  }
  compiled
}

# Maps scientific names (as derived from the FASTA file names) to common
# names.  Names without an entry map to the literal string "NA".
.seqCompile_common_name <- function(scientific_name) {
  lookup <- c(
    Ailuropoda_melanoleuca = "Giant panda",
    Anas_platyrhynchos = "Duck",
    Anolis_carolinensis = "Lizard",
    Astyanax_mexicanus = "Cave fish",
    Bos_taurus = "Cattle",
    Caenorhabditis_elegans = "roundworm",
    Callithrix_jacchus = "Common marmoset monkey",
    Canis_familiaris = "Dog",
    Cavia_porcellus = "Guinea pig",
    Chlorocebus_sabaeus = "Green monkey",
    Choloepus_hoffmanni = "Hoffmann's two-toed sloth",
    Ciona_intestinalis = "sea squirt",
    Ciona_savignyi = "Pacific transparent sea squirt",
    Danio_rerio = "Zebrafish",
    Dasypus_novemcinctus = "Nine-banded armadillo",
    Dipodomys_ordii = "Ord's kangaroo rat",
    Drosophila_melanogaster = "Fruit fly",
    Echinops_telfairi = "Lesser hedgehog tenrec",
    Equus_caballus = "Horse",
    Erinaceus_europaeus = "European hedgehog",
    Felis_catus = "Cat",
    Ficedula_albicollis = "Collared flycatcher",
    Gadus_morhua = "Atlantic cod",
    Gallus_gallus = "Chicken",
    Gasterosteus_aculeatus = "Three-spined stickleback",
    Gorilla_gorilla = "Gorilla",
    Homo_sapiens = "Human",
    Ictidomys_tridecemlineatus = "Thirteen-lined ground squirrel",
    Latimeria_chalumnae = "West indian ocean coelacanth",
    Lepisosteus_oculatus = "Spotted gar",
    Loxodonta_africana = "African bush elephant",
    Macaca_mulatta = "Rhesus macaque",
    Macropus_eugenii = "Tammar wallaby",
    Meleagris_gallopavo = "Wild turkey",
    Microcebus_murinus = "Gray mouse lemur",
    Monodelphis_domestica = "Gray short-tailed opossum",
    Mus_musculus = "Mouse",
    Mustela_putorius_furo = "Ferret",
    Myotis_lucifugus = "Little brown bat",
    Nomascus_leucogenys = "Northern white-cheeked gibbon",
    Ochotona_princeps = "American pika",
    Oreochromis_niloticus = "Nile tilapia",
    Ornithorhynchus_anatinus = "Platypus",
    Oryctolagus_cuniculus = "European rabbit",
    Oryzias_latipes = "Japanese rice fish",
    Otolemur_garnettii = "Northern greater galago",
    Ovis_aries = "Sheep",
    Pan_troglodytes = "Chimpanzee",
    Papio_anubis = "Olive baboon",
    Pelodiscus_sinensis = "Chinese softshell turtle",
    Petromyzon_marinus = "Sea lamprey",
    Poecilia_formosa = "Amazon molly",
    Pongo_abelii = "Sumatran orangutan",
    Procavia_capensis = "Rock badger",
    Pteropus_vampyrus = "Large flying fox",
    Rattus_norvegicus = "Rat",
    Saccharomyces_cerevisiae = "Yeast",
    Sarcophilus_harrisii = "Tasmanian devil",
    Sorex_araneus = "Common shrew",
    Sus_scrofa = "Wild boar",
    Taeniopygia_guttata = "Zebra finch",
    Takifugu_rubripes = "Japanese puffer",
    Tarsius_syrichta = "Philippine tarsier",
    Tetraodon_nigroviridis = "Green spotted puffer",
    Tupaia_belangeri = "Northern treeshrew",
    Tursiops_truncatus = "Bottlenose dolphin",
    Vicugna_pacos = "Alpaca",
    Xenopus_tropicalis = "Western clawed frog",
    Xiphophorus_maculatus = "Southern platyfish"
  )
  # `[` with a character index matches names exactly; misses come back NA.
  common <- unname(lookup[scientific_name])
  common[is.na(common)] <- "NA"
  common
}



#' SeqAnnotate
#'
#' Queries the Ensembl BioMart gene dataset of every species in a built-in
#' list and stacks the returned annotation tables.
#' Requires a data table containing a column labeled "ensembl_peptide_id"
#' (type = "protein") or "ensembl_transcript_id" (type = "mRNA").
#'
#' @param DT a data table containing a column labeled "ensembl_peptide_id" or
#' "ensembl_transcript_id".
#' @param type single character string, either "protein" or "mRNA".  If
#' "protein", requires the "ensembl_peptide_id" column and returns a table
#' keyed by Ensembl peptide id.  If "mRNA", requires the
#' "ensembl_transcript_id" column and returns a table keyed by Ensembl
#' transcript id.
#' @return A data frame containing
#' "ensembl_peptide_id"/"ensembl_transcript_id", "external_gene_name",
#' "transcription_start_site", "transcript_start", "transcript_end",
#' "chromosome_name", with the results of all queried datasets row-bound in
#' query order.  Any other \code{type} gives a warning and an invisible
#' \code{NULL}.
#' @author Brendan Gongol
#' @importFrom biomaRt useMart
#' @importFrom biomaRt getBM
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @export
#' @examples
#' \dontrun{
#' library(data.table)
#' Proteome <- fread("species protein compilation.xls")
#' head(SeqAnnotate(DT = Proteome, type = "protein"))
#'
#' Transcriptome <- fread("species mRNA compilation.xls")
#' head(SeqAnnotate(Transcriptome, type = "mRNA"))
#' }

SeqAnnotate <- function(DT, type) {
  if (!is.character(type) || length(type) != 1L || is.na(type)) {
    stop("'type' must be a single character string: \"protein\" or \"mRNA\".",
         call. = FALSE)
  }

  # The two supported types differ only in the identifier attribute.
  # switch() yields NULL when `type` matches neither alternative.
  id_attribute <- switch(type,
    protein = "ensembl_peptide_id",
    mRNA = "ensembl_transcript_id"
  )
  if (is.null(id_attribute)) {
    warning("Unsupported 'type': \"", type, "\"; returning NULL.",
            call. = FALSE)
    return(invisible(NULL))
  }

  # Fail before any remote query is issued if the documented column is absent.
  if (!id_attribute %in% names(DT)) {
    stop("'DT' must contain a column named \"", id_attribute, "\".",
         call. = FALSE)
  }
  id_values <- DT[[id_attribute]]

  datasets <- c(
    "oanatinus_gene_ensembl", "cporcellus_gene_ensembl",
    "gaculeatus_gene_ensembl", "lafricana_gene_ensembl",
    "itridecemlineatus_gene_ensembl", "choffmanni_gene_ensembl",
    "csavignyi_gene_ensembl", "fcatus_gene_ensembl",
    "rnorvegicus_gene_ensembl", "psinensis_gene_ensembl",
    "cjacchus_gene_ensembl", "ttruncatus_gene_ensembl",
    "scerevisiae_gene_ensembl", "celegans_gene_ensembl",
    "csabaeus_gene_ensembl", "oniloticus_gene_ensembl",
    "trubripes_gene_ensembl", "amexicanus_gene_ensembl",
    "pmarinus_gene_ensembl", "eeuropaeus_gene_ensembl",
    "falbicollis_gene_ensembl", "ptroglodytes_gene_ensembl",
    "etelfairi_gene_ensembl", "cintestinalis_gene_ensembl",
    "nleucogenys_gene_ensembl", "sscrofa_gene_ensembl",
    "ocuniculus_gene_ensembl", "dnovemcinctus_gene_ensembl",
    "pcapensis_gene_ensembl", "tguttata_gene_ensembl",
    "mlucifugus_gene_ensembl", "hsapiens_gene_ensembl",
    "pformosa_gene_ensembl", "mfuro_gene_ensembl",
    "tbelangeri_gene_ensembl", "ggallus_gene_ensembl",
    "xtropicalis_gene_ensembl", "ecaballus_gene_ensembl",
    "pabelii_gene_ensembl", "xmaculatus_gene_ensembl",
    "drerio_gene_ensembl", "lchalumnae_gene_ensembl",
    "tnigroviridis_gene_ensembl", "amelanoleuca_gene_ensembl",
    "mmulatta_gene_ensembl", "pvampyrus_gene_ensembl",
    "panubis_gene_ensembl", "mdomestica_gene_ensembl",
    "acarolinensis_gene_ensembl", "vpacos_gene_ensembl",
    "tsyrichta_gene_ensembl", "ogarnettii_gene_ensembl",
    "dmelanogaster_gene_ensembl", "mmurinus_gene_ensembl",
    "loculatus_gene_ensembl", "olatipes_gene_ensembl",
    "ggorilla_gene_ensembl", "oprinceps_gene_ensembl",
    "dordii_gene_ensembl", "oaries_gene_ensembl",
    "mmusculus_gene_ensembl", "mgallopavo_gene_ensembl",
    "gmorhua_gene_ensembl", "aplatyrhynchos_gene_ensembl",
    "saraneus_gene_ensembl", "sharrisii_gene_ensembl",
    "meugenii_gene_ensembl", "btaurus_gene_ensembl",
    "cfamiliaris_gene_ensembl"
  )
  bm_attributes <- c(id_attribute, "external_gene_name",
                     "transcription_start_site", "transcript_start",
                     "transcript_end", "chromosome_name")

  pb <- txtProgressBar(min = 0, max = length(datasets), style = 3)
  on.exit(close(pb), add = TRUE) # also closes the bar if a query fails

  # Collect one table per dataset and bind once at the end instead of growing
  # the result with rbind() on every iteration.
  per_dataset <- vector("list", length(datasets))
  for (k in seq_along(datasets)) {
    # Alternative hosts tried previously: "www.ensembl.org",
    # "uswest.ensembl.org", "jul2015.archive.ensembl.org".
    mart <- useMart("ensembl", dataset = datasets[k])
    # NOTE(review): `values` is supplied without a `filters` argument.  The
    # biomaRt documentation describes `values` as the values of the filter(s),
    # so this call presumably retrieves every record of the dataset rather
    # than only the ids found in `DT` -- confirm the intended behaviour before
    # adding `filters = id_attribute`.
    per_dataset[[k]] <- getBM(attributes = bm_attributes, values = id_values,
                              mart = mart)
    setTxtProgressBar(pb, k)
  }

  do.call(rbind, per_dataset)
}



#' Consensuspredict
#'
#' Takes a data.table of mRNA, miRNA, or protein sequences and returns the mRNAs, miRNAs, or proteins containing a specified consensus sequence.
#' Requires a three column data.table with columns labeled according to the type of specified search.
#' If type = "mRNA", one column should be labeled "Sequence" containing the mRNA sequences to query, a column should be
#' labeled "ensembl_transcript_id" containing the mRNA transcript id, and a column should be labeled "Species" designating
#' the species of the mRNA.
#' If type = "protein", one column should be labeled "Sequence" containing the protein sequences to query, a column should be
#' labeled "ensembl_peptide_id" containing the protein peptide id, and a column should be labeled "Species" designating
#' the species of the protein.
#' If type = "miRNA", one column should be labeled "Sequence" containing the miRNA sequences to query, a column should be
#' labeled "miRNA_Name" containing the miRNA name (ex: hsa-miRXXX), and a column should be labeled "miRNA_type" designating
#' if the miRNA is an IMMATURE_HAIR_PIN or a MATURE miRNA.
#'
#'
#' @param DT a data table with three columns.  If type = "mRNA", the columns should be labeled "Sequence", "ensembl_transcript_id", "Species".
#' If type = "miRNA", the columns should be labeled "Sequence", "miRNA_Name", "miRNA_type".
#' If type = "protein", the columns should be labeled "Sequence", "ensembl_peptide_id", "Species".
#' @param conse a character string containing a single consensus sequence to query.
#' @param type a single character either "miRNA", "protein", or "mRNA" designating what type of sequences are being queried.
#' @return A data table containing the following columns:
#' @author Brendan Gongol
#' @importFrom data.table setnames
#' @importFrom data.table data.table
#' @importFrom data.table setkey
#' @importFrom stringr str_locate_all
#' @export
#' @examples
#' ensembl_transcript_id <- c("ENSACAT00000000002","ENSACAT00000000003","ENSACAT00000000004","ENSACAT00000000006","ENSACAT00000000007","ENSACAT00000000008")
#' Sequence <- c("CTGTTTCGAGCCTGAATCTCGATCGCTCGCGCTAGACAGCTCGACGCACTTTTCAGCAGGAGCCTG",
#'               "AATTAATTTCAGCAGATAGCGCTCGATACAGCTCGACAGCTCTTGCTGTATTGTGTG",
#'               "TTGCTGTATTGTGTGATCCTCGATACAGGTATTTTCTGAGCCTGATAGCTAGCTTTGCTGTATTGTGTGAATTAATT",
#'               "AAAAAATTTTTTAATTAATTCCCCCCGGGGGG", "AATACAGCTCGCGCGCGGAACCAAT",
#'               "AATTAATTATCGCTACAGCTCGACACAATTAATTAGCTCGTGGGTTCCGGCCTTAACAATTAATT")
#' external_gene_name <- c("AKT", "PI3K", "SREBP", "FOXO", "PKA", "NRF")
#' Species <- c("Rat", "Mouse", "Human", "Pig", "Goat", "Fox")
#' mRNADT <- data.frame(cbind(ensembl_transcript_id, Sequence, external_gene_name, Species))
#' mRNADT$Sequence <- as.character(mRNADT$Sequence)
#' Consensuspredict(DT=mRNADT, conse= "AATTAATT", type = "mRNA")
#'
#' ensembl_peptide_id <- c("ENSAMEP00000003151","ENSAMEP00000003176","ENSAMEP00000003150","ENSAMEP00000003213","ENSAMEP00000003164","ENSXMAP00000020464")
#' Sequence <- c("RKQHFIHQAVRNSDLVPKAKGRKSLQRLENTQYLLSLLETDGGTAGLDDGDLAPPAAPGIFAEACSNETYMEVWNDFMNRSGEEQERVLRYLEDEGKSK",
#'               "GADKSNRFPLPFPFPSKLYIMCMANLEELQSTDSLDCLERLIDLNNGEGQIFTIDGPLCLKNVQSMFGKLIDLAYTPFH",
#'               "IIALALEANNQLTWRDVQHLLVKTSRPAHLKANDWKVNGAGHKVSHLYGFGLVDAEALVMEAKKWTAVPAAEH",
#'               "VGSAAVSAPVLALHRLSPGPRTYCSEVFPSRALERAFALYNLLALYLLPLAATCA", "KFVNYMQQVSVQATCATLTAMSVDRWY",
#'               "VHEHVILDPLTKELNYPFIILALWGVIMTGSICGLERLRQTDLKALIAYSSVSHMGLVAAAILIQTPWALTGALILMIVHDK")
#' external_gene_name <- c("AKT", "PI3K", "SREBP", "FOXO", "PKA", "NRF")
#' Species <- c("Rat", "Mouse", "Human", "Pig", "Goat", "Fox")
#' PRODT <- data.frame(cbind(ensembl_peptide_id, Sequence, external_gene_name, Species))
#' PRODT$Sequence <- as.character(PRODT$Sequence)
#' conseq= "(G|A|V)(L|A|H)(D|E)(K|R|H)"
#' Consensuspredict(DT = PRODT, conse = conseq, type = "protein")
#'
#' Sequence <- c("UAGCGAUUGGAUCCGGUGAGGUAGUAGGUUGUAUAGUUUGGAAUAUUACCACCGGUGAACUAUGCAAUUUUCUACCUUACCGGAGACAGAACUCUUCGA",
#'               "AUGCUUCCGGCCUGUUCCCUGAGACCUCAAGUGUGAGUGUACUAGCGAUGCUUCACACCUGGGCUCUCCGGGUACCAGGACGGUUUGAGCAGAU",
#'               "AAAGUGACCGUACCGAGCUGCAUACUUCCUUACAUGCCCAUACUAUAUCAUAAAUGGAUAUGGAAUGUAAAGAAGUAUGUAGAACGGGGUGGUAGU ",
#'               "UAAACAGUAUACAGAAAGCCAUCAAAGCGGUGGUUGAUGUGUUGCAAAUUAUGACUUUCAUAUCACAGCCAGCUUUGAUGUGCUGCCUGUUGCACUGU",
#'               "CGGACAAUGCUCGAGAGGCAGUGUGGUUAGCUGGUUGCAUAUUUCCUUGACAACGGCUACCUUCACUGCCACCCCGAACAUGUCGUCCAUCUUUGAA",
#'               "UAGCGAUUCAGAUCGAGCCAUUGCUGGUUUCUUCCACAGUAGCGAUUUCCAUUAGAACUAUCACCGGGUGGAAACUAGCAGUGGCUCGAUUAGCGAU",
#'               "UGAGGUAGUAGGUUGUAUAGUU", "CUAUGCAUAGCGAUACCUUACC", "UCCCUGAGACCUCAAGUGUGA", "ACACCUGGGCUCUCCGGGUACC",
#'               "UAGCGAUCCUUACAUGCCCAUA", "UCGUGUCCGUUUCUCGUUUCGA", "ACAAUAAUCGGACACUAGCGAU", "CACACCGGACGAGAUUUCAU",
#'               "UACCGGGCGUGGGGAGGGCAGG", "UAGCGAUUCCUUCUUAGCGAU")
#' miRNA_Name <- c("hsa-let-7","cel-lin-4","aae-mir-1","bol-mir-2","rno-mir-34","cel-mir-35",
#'                 "cel-let-7-5p", "cel-let-7-3p", "cel-lin-4-5p", "cel-lin-4-3p", "cel-miR-1-5p", "ame-miR-9895",
#'                 "hsa-miR-9896", "rno-miR-3478", "mmu-miR9897-5p", "cre-miR9897-3p")
#' miRNA_type <- c("IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN",
#'                 "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE")
#' MiRNADT<- data.frame(cbind(Sequence, miRNA_Name, miRNA_type))
#' MiRNADT$Sequence <- as.character(MiRNADT$Sequence)
#'
#' Consensuspredict(DT = MiRNADT, conse = "UAGCGAU", type = "miRNA")

Consensuspredict <- function(DT, conse, type){
  # Scan every sequence in `DT` for the pattern(s) in `conse` and return one
  # row per hit.
  #
  # Arguments
  #   DT    table with a character column "Sequence" plus the identifier
  #         columns for the chosen `type` (see `id_cols` below).
  #   conse character vector of patterns handed to str_locate_all().
  #   type  one of "mRNA", "protein" or "miRNA".
  #
  # Value
  #   A data.table with the columns "Consensus_Sequence", "start", "end",
  #   "number of hits", "length", "sequence" (the input sequence with the hit
  #   in lower case), followed by the two identifier columns; for
  #   type = "protein" a final "consensus" column holds the lower-cased matched
  #   text. An empty data.table is returned when there is nothing to report.
  #   An unrecognised `type` yields NULL (as before) and now also a warning.

  # The three sequence types share one algorithm; they differ only in which
  # identifier columns are copied from the input row onto each hit.
  id_cols <- NULL
  if(is.character(type) && length(type) == 1L && !is.na(type)){
    id_cols <- switch(type,
                      mRNA    = c("ensembl_transcript_id", "Species"),
                      protein = c("ensembl_peptide_id", "Species"),
                      miRNA   = c("miRNA_Name", "miRNA_type"),
                      NULL)
  }
  if(is.null(id_cols)){
    warning("Consensuspredict: 'type' must be \"mRNA\", \"protein\" or \"miRNA\"; returning NULL.",
            call. = FALSE)
    return(invisible(NULL))
  }

  # Fail early with a readable message instead of a setnames() error later on.
  absent <- setdiff(c("Sequence", id_cols), names(DT))
  if(length(absent) > 0){
    stop("DT is missing required column(s): ", paste(absent, collapse = ", "),
         call. = FALSE)
  }

  # Locate and annotate every hit within a single row of DT.
  hits_for_row <- function(row){
    sequence <- as.character(row$Sequence)

    #### query the input sequence for consensus sequence elements.
    hits <- data.table(NULL)
    for(pattern in conse){
      located <- as.data.frame(str_locate_all(sequence, pattern))
      # A missing sequence is reported as an NA/NA position: not a real hit.
      located <- located[!is.na(located$start), , drop = FALSE]
      if(nrow(located) > 0){
        hits <- rbind(hits, cbind(located, nameu = pattern))
      }
    }
    if(nrow(hits) == 0){
      return(hits)
    }

    #### Count the number of times each consensus sequence appears.
    setkey(hits, nameu)
    per_pattern <- hits[, length(end), by = nameu]
    hits <- merge(hits, per_pattern, by = "nameu", allow.cartesian = TRUE)

    #### Add the length of the searched sequence.
    seq_length <- nchar(sequence)
    hits$length <- seq_length

    # Rebuild the sequence with the hit in lower case. substring() returns ""
    # for an empty range, so hits touching the first and/or last character
    # (including a hit covering the whole sequence) need no special casing.
    matched <- tolower(substring(sequence, hits$start, hits$end))
    highlighted <- paste0(substring(sequence, 1L, hits$start - 1L),
                          matched,
                          substring(sequence, hits$end + 1L, seq_length))
    # Spaces have always been stripped from the reported sequence; keep that.
    hits$sequence <- gsub(" ", "", highlighted, fixed = TRUE)

    # Copy the identifiers of the input row onto every hit (temporary names,
    # renamed below, because `$<-` cannot take a computed column name).
    hits$id_primary <- row[[id_cols[1]]]
    hits$id_secondary <- row[[id_cols[2]]]

    # Proteins are searched with regular expressions, so also report the text
    # that actually matched.
    if(type == "protein"){
      hits$consensus <- matched
    }

    # Rename the working columns to their public names.
    setnames(hits,
             c("nameu", "V1", "id_primary", "id_secondary"),
             c("Consensus_Sequence", "number of hits", id_cols))
    hits
  }

  n_rows <- nrow(DT)
  if(n_rows == 0){
    # Nothing to scan (txtProgressBar() also rejects max == min).
    return(data.table(NULL))
  }

  pb <- txtProgressBar(min = 0, max = n_rows, style = 3)
  on.exit(close(pb), add = TRUE)

  # Collect per-row results in a list and bind once, rather than growing the
  # result with rbind() on every iteration.
  per_row <- vector("list", n_rows)
  for(i in seq_len(n_rows)){
    per_row[[i]] <- hits_for_row(DT[i, ])
    setTxtProgressBar(pb, i)
  }
  do.call(rbind, per_row)
}



#' MiRNASpeciesAnnot
#'
#' Adds two extra columns to a data.table giving the species each miRNA originates from, as scientific (Scientific_Name) and common designations.
#' Requires a data.table in which the column housing the miRNA name is labelled "miRNA_Name".
#'
#' @param MiRNADT a data table with one column labelled miRNA_Name that harbors the miRNA name.
#' @return A data table containing the following columns:
#' @author Brendan Gongol
#' @export
#' @examples
#' miRNA_Name <- c("hsa-let-7","cel-lin-4","aae-mir-1","bol-mir-2","rno-mir-34","cel-mir-35",
#'                 "cel-let-7-5p", "cel-let-7-3p", "cel-lin-4-5p", "cel-lin-4-3p", "cel-miR-1-5p", "ame-miR-9895",
#'                 "hsa-miR-9896", "rno-miR-3478", "mmu-miR9897-5p", "cre-miR9897-3p")
#' MiRNADT<- data.frame(cbind(miRNA_Name))
#' MiRNASpeciesAnnot(MiRNADT)
#'
#'
#' Sequence <- c("UAGCGAUUGGAUCCGGUGAGGUAGUAGGUUGUAUAGUUUGGAAUAUUACCACCGGUGAACUAUGCAAUUUUCUACCUUACCGGAGACAGAACUCUUCGA",
#'               "AUGCUUCCGGCCUGUUCCCUGAGACCUCAAGUGUGAGUGUACUAGCGAUGCUUCACACCUGGGCUCUCCGGGUACCAGGACGGUUUGAGCAGAU",
#'               "AAAGUGACCGUACCGAGCUGCAUACUUCCUUACAUGCCCAUACUAUAUCAUAAAUGGAUAUGGAAUGUAAAGAAGUAUGUAGAACGGGGUGGUAGU ",
#'               "UAAACAGUAUACAGAAAGCCAUCAAAGCGGUGGUUGAUGUGUUGCAAAUUAUGACUUUCAUAUCACAGCCAGCUUUGAUGUGCUGCCUGUUGCACUGU",
#'               "CGGACAAUGCUCGAGAGGCAGUGUGGUUAGCUGGUUGCAUAUUUCCUUGACAACGGCUACCUUCACUGCCACCCCGAACAUGUCGUCCAUCUUUGAA",
#'               "UAGCGAUUCAGAUCGAGCCAUUGCUGGUUUCUUCCACAGUAGCGAUUUCCAUUAGAACUAUCACCGGGUGGAAACUAGCAGUGGCUCGAUUAGCGAU",
#'               "UGAGGUAGUAGGUUGUAUAGUU", "CUAUGCAUAGCGAUACCUUACC", "UCCCUGAGACCUCAAGUGUGA", "ACACCUGGGCUCUCCGGGUACC",
#'               "UAGCGAUCCUUACAUGCCCAUA", "UCGUGUCCGUUUCUCGUUUCGA", "ACAAUAAUCGGACACUAGCGAU", "CACACCGGACGAGAUUUCAU",
#'               "UACCGGGCGUGGGGAGGGCAGG", "UAGCGAUUCCUUCUUAGCGAU")
#' miRNA_Name <- c("hsa-let-7","cel-lin-4","aae-mir-1","bol-mir-2","rno-mir-34","cel-mir-35",
#'                 "cel-let-7-5p", "cel-let-7-3p", "cel-lin-4-5p", "cel-lin-4-3p", "cel-miR-1-5p", "ame-miR-9895",
#'                 "hsa-miR-9896", "rno-miR-3478", "mmu-miR9897-5p", "cre-miR9897-3p")
#' miRNA_type <- c("IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN",
#'                 "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE")
#' MiRNADT<- data.frame(cbind(Sequence, miRNA_Name, miRNA_type))
#' MiRNASpeciesAnnot(MiRNADT)

MiRNASpeciesAnnot<- function(MiRNADT){
  MiRNADT$miRNA_Name <- toupper(MiRNADT$miRNA_Name)
  pb <- txtProgressBar(min = 0, max = length(MiRNADT$miRNA_Name), style = 3)
  SP <- NULL
  for(i in 1:length(MiRNADT$miRNA_Name)){
    spl <- strsplit(MiRNADT$miRNA_Name[i], split = "-")[[1]]
    SP[i] <- spl[1]
  }
  SP <- toupper(SP)

  Common_Name <- NULL
  Scientific_Name <- NULL
  for(i in 1:nrow(MiRNADT)){
    if(SP[i]== "AAU"){
      Scientific_Name[i] <- "Acacia_Auriculiformis"
      Common_Name[i] <- "Acacia_Auriculiformis"
    }
    else if(SP[i]== "AAE"){
      Scientific_Name[i] <- "Aedes_Aegypti"
      Common_Name[i] <- "mosquito"
    }
    else if(SP[i]== "ABU"){
      Scientific_Name[i] <- "Astatotilapia burtoni"
      Common_Name[i] <- "Astatotilapia burtoni"
    }
    else if(SP[i]== "ACA"){
      Scientific_Name[i] <- "Anolis_carolinensis"
      Common_Name[i] <- "Lizard"
    }
    else if(SP[i]== "AGA"){
      Scientific_Name[i] <- "Anopheles gambiae"
      Common_Name[i] <- "mosquito"
    }
    else if(SP[i]== "AHY"){
      Scientific_Name[i] <- "Arachis hypogaea"
      Common_Name[i] <- "Peanut"
    }
    else if(SP[i]== "AJA"){
      Scientific_Name[i] <- "Artibeus jamaicensis"
      Common_Name[i] <- "Bat"
    }
    else if(SP[i]== "AMA"){
      Scientific_Name[i] <- "Avicennia marina"
      Common_Name[i] <- "mangrove"
    }
    else if(SP[i]== "AMG"){
      Scientific_Name[i] <- "Acacia mangium"
      Common_Name[i] <- "Forest Mangrove"
    }
    else if(SP[i]== "AOF"){
      Scientific_Name[i] <- "Asparagus officinalis"
      Common_Name[i] <- "Asparagus"
    }
    else if(SP[i]== "API"){
      Scientific_Name[i] <- "Acyrthosiphon pisum"
      Common_Name[i] <- "pea aphid"
    }
    else if(SP[i]== "APL"){
      Scientific_Name[i] <- "Anas platyrhynchos"
      Common_Name[i] <- "mallard"
    }
    else if(SP[i]== "AQC"){
      Scientific_Name[i] <- "Aquilegia caerulea"
      Common_Name[i] <- "Colorado Blue Columbine"
    }
    else if(SP[i]== "AQU"){
      Scientific_Name[i] <- "Amphimedon queenslandica"
      Common_Name[i] <- "sponge"
    }
    else if(SP[i]== "ASU"){
      Scientific_Name[i] <- "Ascaris suum"
      Common_Name[i] <- "roundworm of pigs"
    }
    else if(SP[i]== "ATA"){
      Scientific_Name[i] <- "Aegilops Tauschii"
      Common_Name[i] <- "Goatgrass"
    }
    else if(SP[i]== "ATR"){
      Scientific_Name[i] <- "Amborella trichopoda"
      Common_Name[i] <- "Amborella"
    }
    else if(SP[i]== "AME"){
      Scientific_Name[i] <- "Apis mellifera"
      Common_Name[i] <- "Honey bee"
    }
    else if(SP[i]== "AMI"){
      Scientific_Name[i] <- "Alligator mississippiensis"
      Common_Name[i] <- "Alligator"
    }
    else if(SP[i]== "ALY"){
      Scientific_Name[i] <- "Arabidopsis lyrata"
      Common_Name[i] <- "Arabidopsis lyrata"
    }
    else if(SP[i]== "ATH"){
      Scientific_Name[i] <- "Arabidopsis thaliana"
      Common_Name[i] <- "Arabidopsis thaliana"
    }
    else if(SP[i]== "AGE"){
      Scientific_Name[i] <- "Ateles geoffroyi"
      Common_Name[i] <- "Geoffroy spider monkey"
    }
    else if(SP[i]== "BCY"){
      Scientific_Name[i] <- "Bruguiera cylindrica"
      Common_Name[i] <- "mangrove"
    }
    else if(SP[i]== "BFV"){
      Scientific_Name[i] <- "Bovine foamy virus"
      Common_Name[i] <- "Bovine foamy virus"
    }
    else if(SP[i]== "BGY"){
      Scientific_Name[i] <- "Bruguiera gymnorhiza"
      Common_Name[i] <- "black mangrove"
    }
    else if(SP[i]== "BHV1"){
      Scientific_Name[i] <- "Bovine herpesvirus 1"
      Common_Name[i] <- "Bovine herpesvirus 1"
    }
    else if(SP[i]== "BHV5"){
      Scientific_Name[i] <- "Bovine herpesvirus 5"
      Common_Name[i] <- "Bovine herpesvirus 5"
    }
    else if(SP[i]== "BIB"){
      Scientific_Name[i] <- "Biston betularia"
      Common_Name[i] <- "peppered moth"
    }
    else if(SP[i]== "BKV"){
      Scientific_Name[i] <- "BK polyomavirus"
      Common_Name[i] <- "BK polyomavirus"
    }
    else if(SP[i]== "BMA"){
      Scientific_Name[i] <- "Brugia malayi"
      Common_Name[i] <- "nematode"
    }
    else if(SP[i]=="BMO"){
      Scientific_Name[i] <- "Bombyx mori"
      Common_Name[i] <- "silkworm"
    }
    else if(SP[i]=="BOL"){
      Scientific_Name[i] <- "Brassica oleracea"
      Common_Name[i] <- "wild cabbage"
    }
    else if(SP[i]=="BPCV1"){
      Scientific_Name[i] <- "Bandicoot papillomatosis carcinomatosis virus type 1"
      Common_Name[i] <- "Bandicoot papillomatosis carcinomatosis virus type 1"
    }
    else if(SP[i]=="BPCV2"){
      Scientific_Name[i] <- "Bandicoot papillomatosis carcinomatosis virus type 2"
      Common_Name[i] <- "Bandicoot papillomatosis carcinomatosis virus type 2"
    }
    else if(SP[i]=="BRA"){
      Scientific_Name[i] <- "Brassica rapa"
      Common_Name[i] <- "field mustard"
    }
    else if(SP[i]=="BTA"){
      Scientific_Name[i] <- "Bos_taurus"
      Common_Name[i] <- "Cattle"
    }
    else if(SP[i]== "BDI"){
      Scientific_Name[i] <- "Brachypodium distachyon"
      Common_Name[i] <- "Brachypodium distachyon"
    }
    else if(SP[i]== "BBE"){
      Scientific_Name[i] <- "Branchiostoma belcheri"
      Common_Name[i] <- "Branchiostoma belcheri"
    }
    else if(SP[i]== "BDO"){
      Scientific_Name[i] <- "Bactrocera dorsalis"
      Common_Name[i] <- "oriental fruit fly"
    }
    else if(SP[i]=="BFL"){
      Scientific_Name[i] <- "Branchiostoma floridae"
      Common_Name[i] <- "Branchiostoma floridae"
    }
    else if(SP[i]=="BLV"){
      Scientific_Name[i] <- "Bovine leukemia virus"
      Common_Name[i] <- "Bovine leukemia virus"
    }
    else if(SP[i]=="BNA"){
      Scientific_Name[i] <- "Brassica napus"
      Common_Name[i] <- "Rapeseed"
    }
    else if(SP[i]=="CAS"){
      Scientific_Name[i] <- "Camelina sativa"
      Common_Name[i] <- "Camelina sativa"
    }
    else if(SP[i]=="CBN"){
      Scientific_Name[i] <- "Caenorhabditis brenneri"
      Common_Name[i] <- "nematode"
    }
    else if(SP[i]=="CBR"){
      Scientific_Name[i] <- "Caenorhabditis briggsae"
      Common_Name[i] <- "nematode"
    }
    else if(SP[i]=="CCA"){
      Scientific_Name[i] <- "Cynara cardunculus"
      Common_Name[i] <- "artichoke thistle"
    }
    else if(SP[i]=="CCL"){
      Scientific_Name[i] <- "Citrus clementina"
      Common_Name[i] <- "Clementine"
    }
    else if(SP[i]=="CCR"){
      Scientific_Name[i] <- "Cyprinus carpio"
      Common_Name[i] <- "Carp"
    }
    else if(SP[i]=="CEL"){
      Scientific_Name[i] <- "Caenorhabditis_elegans"
      Common_Name[i] <- "roundworm"
    }
    else if(SP[i]=="CFA"){
      Scientific_Name[i] <- "Canis_familiaris"
      Common_Name[i] <- "Dog"
    }
    else if(SP[i]=="CGR"){
      Scientific_Name[i] <- "Cricetulus griseus"
      Common_Name[i] <- "Chinese hamster"
    }
    else if(SP[i]=="CHI"){
      Scientific_Name[i] <- "Capra hircus"
      Common_Name[i] <- "Goat"
    }
    else if(SP[i]=="CIN"){
      Scientific_Name[i] <- "Ciona_intestinalis"
      Common_Name[i] <- "Ciona intestinalis"
    }
    else if(SP[i]=="CJA"){
      Scientific_Name[i] <- "Callithrix jacchus"
      Common_Name[i] <- "Marmoset"
    }
    else if(SP[i]=="CLA"){
      Scientific_Name[i] <- "Cerebratulus lacteus"
      Common_Name[i] <- "Atlantic jackknife clam"
    }
    else if(SP[i]=="CLI"){
      Scientific_Name[i] <- "Columba livia"
      Common_Name[i] <- "rock dove"
    }
    else if(SP[i]=="CLN"){
      Scientific_Name[i] <- "Cunninghamia lanceolata"
      Common_Name[i] <- "evergreen trees"
    }
    else if(SP[i]=="CME"){
      Scientific_Name[i] <-  "Cucumis melo"
      Common_Name[i] <- "Muskmelon"
    }
    else if(SP[i]=="CPA"){
      Scientific_Name[i] <- "Carica papaya"
      Common_Name[i] <- "Papaya"
    }
    else if(SP[i]=="CPI"){
      Scientific_Name[i] <- "Chrysemys picta"
      Common_Name[i] <- "painted turtle"
    }
    else if(SP[i]=="CPO"){
      Scientific_Name[i] <- "Cavia porcellus"
      Common_Name[i] <- "guinea pig"
    }
    else if(SP[i]=="CQU"){
      Scientific_Name[i] <- "Culex quinquefasciatus"
      Common_Name[i] <- "Southern house mosquito"
    }
    else if(SP[i]=="CRE"){
      Scientific_Name[i] <- "Chlamydomonas reinhardtii"
      Common_Name[i] <- "green alga"
    }
    else if(SP[i]=="CRM"){
      Scientific_Name[i] <- "Caenorhabditis remanei"
      Common_Name[i] <- "nematode"
    }
    else if(SP[i]=="CRT"){
      Scientific_Name[i] <- "Citrus reticulata"
      Common_Name[i] <- "mandarin orange"
    }
    else if(SP[i]=="CSA"){
      Scientific_Name[i] <- "Ciona_savignyi"
      Common_Name[i] <- "sea squirt"
    }
    else if(SP[i]=="CSI"){
      Scientific_Name[i] <- "Citrus sinensis"
      Common_Name[i] <- "Sweet Orange Group"
    }
    else if(SP[i]=="CST"){
      Scientific_Name[i] <- "Cucumis sativus"
      Common_Name[i] <- "Cucumber"
    }
    else if(SP[i]=="CTE"){
      Scientific_Name[i] <- "Capitella teleta"
      Common_Name[i] <- "polychaete worm"
    }
    else if(SP[i]=="CTR"){
      Scientific_Name[i] <- "Citrus trifoliata"
      Common_Name[i] <- "Trifoliate orange"
    }
    else if(SP[i]=="DAN"){
      Scientific_Name[i] <- "Drosophila ananassae"
      Common_Name[i] <- "fruit fly"
    }
    else if(SP[i]=="DDI"){
      Scientific_Name[i] <- "Dictyostelium discoideum"
      Common_Name[i] <- "slime mold"
    }
    else if(SP[i]=="DER"){
      Scientific_Name[i] <- "Drosophila erecta"
      Common_Name[i] <- "fruit fly"
    }
    else if(SP[i]=="DEV"){
      Scientific_Name[i] <- "Duck enteritis virus"
      Common_Name[i] <- "Duck enteritis virus"
    }
    else if(SP[i]=="DGR"){
      Scientific_Name[i] <- "Drosophila grimshawi"
      Common_Name[i] <- "fruit fly"
    }
    else if(SP[i]=="DMA"){
      Scientific_Name[i] <- "Daubentonia madagascariensis"
      Common_Name[i] <- "aye-aye lemur"
    }
    else if(SP[i]=="DME"){
      Scientific_Name[i] <- "Drosophila_melanogaster"
      Common_Name[i] <- "Fruit fly"
    }
    else if(SP[i]=="DMO"){
      Scientific_Name[i] <- "Drosophila mojavensis"
      Common_Name[i] <- "Fruit fly"
    }
    else if(SP[i]=="DNO"){
      Scientific_Name[i] <- "Dasypus novemcinctus"
      Common_Name[i] <- "armadillo"
    }
    else if(SP[i]=="DPE"){
      Scientific_Name[i] <- "Drosophila persimilis"
      Common_Name[i] <- "Fruit fly"
    }
    else if(SP[i]=="DPR"){
      Scientific_Name[i] <- "Digitalis purpurea"
      Common_Name[i] <- "foxglove"
    }
    else if(SP[i]=="DPS"){
      Scientific_Name[i] <- "Drosophila pseudoobscura"
      Common_Name[i] <- "Fruit fly"
    }
    else if(SP[i]=="DPU"){
      Scientific_Name[i] <- "Daphnia pulex"
      Common_Name[i] <- "water flea"
    }
    else if(SP[i]=="DRE"){
      Scientific_Name[i] <- "Danio_rerio"
      Common_Name[i] <- "Zebrafish"
    }
    else if(SP[i]=="DSE"){
      Scientific_Name[i] <- "Drosophila sechellia"
      Common_Name[i] <- "Fruit fly"
    }
    else if(SP[i]=="DQU"){
      Scientific_Name[i] <- "Dinoponera quadriceps"
      Common_Name[i] <- "Dinoponera quadriceps"
    }
    else if(SP[i]== "DSI"){
      Scientific_Name[i] <- "Drosophila simulans"
      Common_Name[i] <- "Fruit fly"
    }
    else if(SP[i]== "DVI"){
      Scientific_Name[i] <- "Drosophila virilis"
      Common_Name[i] <- "Fruit fly"
    }
    else if(SP[i]== "DWI"){
      Scientific_Name[i] <- "Drosophila willistoni"
      Common_Name[i] <- "Fruit fly"
    }
    else if(SP[i]== "DYA"){
      Scientific_Name[i] <- "Drosophila yakuba"
      Common_Name[i] <- "Fruit fly"
    }
    else if(SP[i]== "EBV"){
      Scientific_Name[i] <- "Epstein Barr virus"
      Common_Name[i] <- "Epstein Barr virus"
    }
    else if(SP[i]== "EGR"){
      Scientific_Name[i] <- "Echinococcus granulosus"
      Common_Name[i] <- "Hyper Tape-worm"
    }
    else if(SP[i]== "EGU"){
      Scientific_Name[i] <- "Elaeis guineensis"
      Common_Name[i] <- "African oil palm"
    }
    else if(SP[i]== "EMU"){
      Scientific_Name[i] <- "Echinococcus multilocularis"
      Common_Name[i] <- "tapeworm"
    }
    else if(SP[i]=="ESI"){
      Scientific_Name[i] <- "Ectocarpus siliculosus"
      Common_Name[i] <- "Brown alga"
    }
    else if(SP[i]=="EUN"){
      Scientific_Name[i] <- "Eugenia uniflora"
      Common_Name[i] <- "pitanga"
    }
    else if(SP[i]=="EEL"){
      Scientific_Name[i] <- "Electrophorus electricus"
      Common_Name[i] <- "electric eel"
    }
    else if(SP[i]=="EFU"){
      Scientific_Name[i] <- "Eptesicus fuscus"
      Common_Name[i] <- "Big brown bat"
    }
    else if(SP[i]=="ECA"){
      Scientific_Name[i] <-"Equus_caballus"
      Common_Name[i] <- "Horse"
    }
    else if(SP[i]=="FAR"){
      Scientific_Name[i] <-"Festuca arundinacea"
      Common_Name[i] <- "Grass"
    }
    else if(SP[i]=="FHE"){
      Scientific_Name[i] <-"Fasciola hepatica"
      Common_Name[i] <- "liver fluke"
    }
    else if(SP[i]=="FRU"){
      Scientific_Name[i] <-"Festuca arundinacea"
      Common_Name[i] <- "pufferfish"
    }
    else if(SP[i]=="FVE"){
      Scientific_Name[i] <-"Fragaria vesca"
      Common_Name[i] <- "strawberry"
    }
    else if(SP[i]=="GAR"){
      Scientific_Name[i] <- "Gossypium_arboreum"
      Common_Name[i] <- "tree cotton"
    }
    else if(SP[i]=="GGA"){
      Scientific_Name[i] <- "Gallus_gallus"
      Common_Name[i] <- "Chicken"
    }
    else if(SP[i]=="GGO"){
      Scientific_Name[i] <- "Gorilla_gorilla"
      Common_Name[i] <- "Gorilla"
    }
    else if(SP[i]=="GHR"){
      Scientific_Name[i] <- "Gossypium hirsutum"
      Common_Name[i] <- "cotton"
    }
    else if(SP[i]=="GMA"){
      Scientific_Name[i] <- "Glycine max"
      Common_Name[i] <- "Soybean"
    }
    else if(SP[i]=="GMO"){
      Scientific_Name[i] <- "Gadus morhua"
      Common_Name[i] <- "Atlantic cod"
    }
    else if(SP[i]=="GPY"){
      Scientific_Name[i] <- "Glottidia pyramidata"
      Common_Name[i] <- "Glottidia pyramidata"
    }
    else if(SP[i]=="GRA"){
      Scientific_Name[i] <- "Gossypium raimondii"
      Common_Name[i] <- "Cotton plant"
    }
    else if(SP[i]=="GSA"){
      Scientific_Name[i] <- "Gyrodactylus salaris"
      Common_Name[i] <- "monogenean ectoparasite"
    }
    else if(SP[i]=="GSO"){
      Scientific_Name[i] <- "Glycine soja"
      Common_Name[i] <- "soybean"
    }
    else if(SP[i]=="HAN"){
      Scientific_Name[i] <- "Helianthus annuus"
      Common_Name[i] <- "sunflower"
    }
    else if(SP[i]=="HAR"){
      Scientific_Name[i] <- "Helianthus annuus"
      Common_Name[i] <- "sunflower"
    }
    else if(SP[i]=="HBR"){
      Scientific_Name[i] <- "Hevea brasiliensis"
      Common_Name[i] <- "rubber tree"
    }
    else if(SP[i]=="HBV"){
      Scientific_Name[i] <- "Herpes B virus"
      Common_Name[i] <- "Herpes B virus"
    }
    else if(SP[i]=="HCI"){
      Scientific_Name[i] <- "Helianthus ciliaris"
      Common_Name[i] <- "sunflower"
    }
    else if(SP[i]=="HCMV"){
      Scientific_Name[i] <- "Human cytomegalovirus"
      Common_Name[i] <- "Human cytomegalovirus"
    }
    else if(SP[i]=="HCO"){
      Scientific_Name[i] <- "Haemonchus contortus"
      Common_Name[i] <- "Barber's pole worm"
    }
    else if(SP[i]=="HEX"){
      Scientific_Name[i] <- "Helianthus exilis"
      Common_Name[i] <- "sunflower"
    }
    else if(SP[i]=="HHI"){
      Scientific_Name[i] <- "Hippoglossus hippoglossus"
      Common_Name[i] <- "Atlantic halibut"
    }
    else if(SP[i]=="HHV6B"){
      Scientific_Name[i] <- "Human herpesvirus 6B"
      Common_Name[i] <- "Human herpesvirus 6B"
    }
    else if(SP[i]=="HIV1"){
      Scientific_Name[i] <- "Human immunodeficiency virus 1"
      Common_Name[i] <- "Human immunodeficiency virus 1"
    }
    else if(SP[i]=="HMA"){
      Scientific_Name[i] <- "Hydra magnipapillata"
      Common_Name[i] <- "fresh-water polyp"
    }
    else if(SP[i]=="HME"){
      Scientific_Name[i] <- "Heliconius melpomene"
      Common_Name[i] <- "postman butterfly"
    }
    else if(SP[i]=="HPA"){
      Scientific_Name[i] <- "Helianthus paradoxus"
      Common_Name[i] <- "sunflower"
    }
    else if(SP[i]=="HPE"){
      Scientific_Name[i] <- "Helianthus petiolaris"
      Common_Name[i] <- "sunflower"
    }
    else if(SP[i]=="HPO"){
      Scientific_Name[i] <- "Heligmosomoides polygyrus"
      Common_Name[i] <- "Heligmosomoides polygyrus"
    }
    else if(SP[i]=="HRU"){
      Scientific_Name[i] <- "Haliotis rufescens"
      Common_Name[i] <- "sea snail"
    }
    else if(SP[i]=="HSA"){
      Scientific_Name[i] <- "Homo_sapiens"
      Common_Name[i] <- "Human"
    }
    else if(SP[i]=="HSV1"){
      Scientific_Name[i] <- "Herpes simplex sirus 1"
      Common_Name[i] <- "Herpes simplex virus 1"
    }
    else if(SP[i]=="HSV2"){
      Scientific_Name[i] <- "Herpes simplex virus 2"
      Common_Name[i] <- "Herpes simplex virus 2"
    }
    else if(SP[i]=="HTU"){
      Scientific_Name[i] <- "Helianthus tuberosus"
      Common_Name[i] <- "Jerusalem artichoke"
    }
    else if(SP[i]=="HVSA"){
      Scientific_Name[i] <- "Herpes virus saimiri strain A11"
      Common_Name[i] <- "Herpes virus saimiri strain A11"
    }
    else if(SP[i]=="HVT"){
      Scientific_Name[i] <- "Herpesvirus of Turkeys"
      Common_Name[i] <- "Herpesvirus of Turkeys"
    }
    else if(SP[i]=="HVU"){
      Scientific_Name[i] <- "Hordeum vulgare"
      Common_Name[i] <- "Barley"
    }
    else if(SP[i]=="ILTV"){
      Scientific_Name[i] <- "Infectious laryngotracheitis"
      Common_Name[i] <- "Gallid herpesvirus 1"
    }
    else if(SP[i]=="IPU"){
      Scientific_Name[i] <- "Ictalurus punctatus"
      Common_Name[i] <- "Channel catfish"
    }
    else if(SP[i]=="ISC"){
      Scientific_Name[i] <- "Ixodes scapularis"
      Common_Name[i] <- "deer tick"
    }
    else if(SP[i]=="JCV"){
      Scientific_Name[i] <- "JC polyomavirus"
      Common_Name[i] <- "JC virus"
    }
    else if(SP[i]=="KSHV"){
      Scientific_Name[i] <- "Kaposi sarcoma-associated herpesvirus"
      Common_Name[i] <- "herpes virus"
    }
    else if(SP[i]=="LCA"){
      Scientific_Name[i] <- "Lemur catta"
      Common_Name[i] <- "ring-tailed lemur"
    }
    else if(SP[i]=="LCO"){
      Scientific_Name[i] <- "Leucosolenia complicata"
      Common_Name[i] <- "Leucosolenia complicata"
    }
    else if(SP[i]=="LGI"){
      Scientific_Name[i] <- "Lottia gigantea"
      Common_Name[i] <- "owl limpet"
    }
    else if(SP[i]=="LJA"){
      Scientific_Name[i] <- "Lotus japonicus"
      Common_Name[i] <- "legume"
    }
    else if(SP[i]=="LLA"){
      Scientific_Name[i] <- "Lagothrix lagotricha"
      Common_Name[i] <- "Brown woolly monkey"
    }
    else if(SP[i]=="LMI"){
      Scientific_Name[i] <- "Locusta migratoria"
      Common_Name[i] <- "Migratory locust"
    }
    else if(SP[i]=="LUS"){
      Scientific_Name[i] <- "Linum usitatissimum"
      Common_Name[i] <- "Flax"
    }
    else if(SP[i]=="LVA"){
      Scientific_Name[i] <- "Lytechinus variegatus"
      Common_Name[i] <- "green sea urchin"
    }
    else if(SP[i]=="MCMV"){
      Scientific_Name[i] <- "Mouse cytomegalovirus"
      Common_Name[i] <- "Mouse cytomegalovirus"
    }
    else if(SP[i]=="MCO"){
      Scientific_Name[i] <- "Mesocestoides corti"
      Common_Name[i] <- "Mesocestoides corti"
    }
    else if(SP[i]=="MCV"){
      Scientific_Name[i] <- "Merkel cell polyomavirus"
      Common_Name[i] <- "Merkel cell polyomavirus"
    }
    else if(SP[i]=="MDM"){
      Scientific_Name[i] <- "Malus domestica"
      Common_Name[i] <- "Apple"
    }
    else if(SP[i]=="MDO"){
      Scientific_Name[i] <- "Monodelphis_domestica"
      Common_Name[i] <- "Gray short-tailed opossum"
    }
    else if(SP[i]=="MDV1"){
      Scientific_Name[i] <- "Mareks disease virus"
      Common_Name[i] <- "Mareks disease virus"
    }
    else if(SP[i]=="MDV2"){
      Scientific_Name[i] <- "Mareks disease virus type 2"
      Common_Name[i] <- "Mareks disease virus type 2"
    }
    else if(SP[i]=="MES"){
      Scientific_Name[i] <- "Manihot esculenta"
      Common_Name[i] <- "yuca"
    }
    else if(SP[i]=="MEu"){
      Scientific_Name[i] <- "	Macropus eugenii"
      Common_Name[i] <- "dama wallaby"
    }
    else if(SP[i]=="MGHV"){
      Scientific_Name[i] <- "Mouse gammaherpesvirus 68"
      Common_Name[i] <- "Mouse gammaherpesvirus 68"
    }
    else if(SP[i]=="MJA"){
      Scientific_Name[i] <- "Marsupenaeus japonicu"
      Common_Name[i] <- "Marsupenaeus japonicu"
    }
    else if(SP[i]=="MLE"){
      Scientific_Name[i] <- "Melibe leonina"
      Common_Name[i] <- "hooded nudibranch"
    }
    else if(SP[i]=="MML"){
      Scientific_Name[i] <- "Macaca_mulatta"
      Common_Name[i] <- "Rhesus macaque"
    }
    else if(SP[i]=="MMR"){
      Scientific_Name[i] <- "Microcebus murinus"
      Common_Name[i] <- "gray mouse lemur"
    }
    else if(SP[i]=="MMU"){
      Scientific_Name[i] <- "Mus_musculus"
      Common_Name[i] <- "Mouse"
    }
    else if(SP[i]=="MNE"){
      Scientific_Name[i] <- "Macaca nemestrina"
      Common_Name[i] <- "Southern pig-tailed macaque"
    }
    else if(SP[i]=="MSE"){
      Scientific_Name[i] <- "Manduca sexta"
      Common_Name[i] <- "Carolina sphinx moth"
    }
    else if(SP[i]=="MTR"){
      Scientific_Name[i] <- "Medicago truncatula"
      Common_Name[i] <- "Barrelclover"
    }
    else if(SP[i]=="MZE"){
      Scientific_Name[i] <- "Metriaclima zebra"
      Common_Name[i] <- "Tilapia zebra"
    }
    else if(SP[i]=="NBR"){
      Scientific_Name[i] <- "Neolamprologus brichardi"
      Common_Name[i] <- "cichlid"
    }
    else if(SP[i]=="NGI"){
      Scientific_Name[i] <- "Nasonia giraulti"
      Common_Name[i] <- "Nasonia"
    }
    else if(SP[i]=="NLE"){
      Scientific_Name[i] <- "Nomascus leucogenys"
      Common_Name[i] <- "white-cheeked gibbon"
    }
    else if(SP[i]=="NLO"){
      Scientific_Name[i] <- "Nasonia longicornis"
      Common_Name[i] <- "Nasonia"
    }
    else if(SP[i]=="NTA"){
      Scientific_Name[i] <- "Nicotiana tabacum"
      Common_Name[i] <- "Tobacco"
    }
    else if(SP[i]=="NVE"){
      Scientific_Name[i] <- "Nematostella vectensis"
      Common_Name[i] <- "Starlet sea anemone"
    }
    else if(SP[i]=="NVI"){
      Scientific_Name[i] <- "Nasonia vitripennis"
      Common_Name[i] <- "parasitoid wasps"
    }
    else if(SP[i]=="OAN"){
      Scientific_Name[i] <- "Ornithorhynchus_anatinus"
      Common_Name[i] <- "Platypus"
    }
    else if(SP[i]=="OAR"){
      Scientific_Name[i] <- "Ovis_aries"
      Common_Name[i] <- "Sheep"
    }
    else if(SP[i]=="OCU"){
      Scientific_Name[i] <- "Oryctolagus_cuniculus"
      Common_Name[i] <- "rabbit"
    }
    else if(SP[i]=="ODI"){
      Scientific_Name[i] <- "Oikopleura dioica"
      Common_Name[i] <- "Oikopleura dioica"
    }
    else if(SP[i]=="OGA"){
      Scientific_Name[i] <- "Otolemur garnettii"
      Common_Name[i] <- "northern greater galago"
    }
    else if(SP[i]=="OHA"){
      Scientific_Name[i] <- "Ophiophagus hannah"
      Common_Name[i] <- "King cobra"
    }
    else if(SP[i]=="OLA"){
      Scientific_Name[i] <- "Oryzias_latipes"
      Common_Name[i] <- "Japanese rice fish"
    }
    else if(SP[i]=="ONI"){
      Scientific_Name[i] <- "Oreochromis niloticus"
      Common_Name[i] <- "Nile tilapia"
    }
    else if(SP[i]=="OSA"){
      Scientific_Name[i] <- "Oryza sativa"
      Common_Name[i] <- "Asian rice"
    }
    else if(SP[i]=="PAB"){
      Scientific_Name[i] <- "Picea abies"
      Common_Name[i] <- "Norway spruce"
    }
    else if(SP[i]=="PAL"){
      Scientific_Name[i] <- "Pteropus alecto"
      Common_Name[i] <- "black fruit bat"
    }
    else if(SP[i]=="PBI"){
      Scientific_Name[i] <- "Pygathrix bieti"
      Common_Name[i] <- "Pygathrix bieti"
    }
    else if(SP[i]=="PBV"){
      Scientific_Name[i] <- "Python bivittatus"
      Common_Name[i] <- "Python"
    }
    else if(SP[i]=="PCA"){
      Scientific_Name[i] <- "Polistes canadensis"
      Common_Name[i] <- "red paper wasp"
    }
    else if(SP[i]=="PDE"){
      Scientific_Name[i] <- "Pinus densata"
      Common_Name[i] <- "Sikang pine"
    }
    else if(SP[i]=="PEU"){
      Scientific_Name[i] <- "Populus euphratica"
      Common_Name[i] <- "Euphrates poplar"
    }
    else if(SP[i]=="PGI"){
      Scientific_Name[i] <- "Panax ginseng"
      Common_Name[i] <- "ginseng"
    }
    else if(SP[i]=="PHA"){
      Scientific_Name[i] <- "Papio hamadryas"
      Common_Name[i] <- "hamadryas baboon"
    }
    else if(SP[i]=="PIN"){
      Scientific_Name[i] <- "Phytophthora infestans"
      Common_Name[i] <- "potato blight"
    }
    else if(SP[i]=="PLA"){
      Scientific_Name[i] <- "Paeonia lactiflora"
      Common_Name[i] <- "Paeonia lactiflora"
    }
    else if(SP[i]=="POL"){
      Scientific_Name[i] <- "Paralichthys olivaceus"
      Common_Name[i] <- "halibut"
    }
    else if(SP[i]=="PMA"){
      Scientific_Name[i] <- "Petromyzon_marinus"
      Common_Name[i] <- "Sea lamprey"
    }
    else if(SP[i]=="PMI"){
      Scientific_Name[i] <- "Patiria miniata"
      Common_Name[i] <- "Bat star"
    }
    else if(SP[i]=="PNY"){
      Scientific_Name[i] <- "Pundamilia nyererei"
      Common_Name[i] <- "Pundamilia nyererei"
    }
    else if(SP[i]=="PPA"){
      Scientific_Name[i] <- "Pan paniscus"
      Common_Name[i] <- "bonobo chimpanzee"
    }
    else if(SP[i]=="PPC"){
      Scientific_Name[i] <- "Pristionchus pacificus"
      Common_Name[i] <- "Nematode"
    }
    else if(SP[i]=="PPE"){
      Scientific_Name[i] <- "Prunus persica"
      Common_Name[i] <- "Peach"
    }
    else if(SP[i]=="PPT"){
      Scientific_Name[i] <- "Physcomitrella patens"
      Common_Name[i] <- "Spreading earthmoss"
    }
    else if(SP[i]=="PPY"){
      Scientific_Name[i] <- "Pongo pygmaeus"
      Common_Name[i] <- "Bornean orangutan"
    }
    else if(SP[i]=="PRA"){
      Scientific_Name[i] <- "Phytophthora ramorum"
      Common_Name[i] <- "Phytophthora ramorum"
    }
    else if(SP[i]=="PRD"){
      Scientific_Name[i] <- "Panagrellus redivivus"
      Common_Name[i] <- "sour paste nematode"
    }
    else if(SP[i]=="PRV"){
      Scientific_Name[i] <- "Pseudorabies virus"
      Common_Name[i] <- "Pseudorabies virus"
    }
    else if(SP[i]=="PSJ"){
      Scientific_Name[i] <- "Phytophthora sojae"
      Common_Name[i] <- "Phytophthora sojae"
    }
    else if(SP[i]=="PTA"){
      Scientific_Name[i] <- "Pinus taeda"
      Common_Name[i] <- "loblolly pine"
    }
    else if(SP[i]=="PTC"){
      Scientific_Name[i] <- "Populus trichocarpa"
      Common_Name[i] <- "Black cottonwood"
    }
    else if(SP[i]=="PTE"){
      Scientific_Name[i] <- "Parasteatoda tepidariorum"
      Common_Name[i] <- "house spider"
    }
    else if(SP[i]=="PTI"){
      Scientific_Name[i] <- "Phaeodactylum tricornutum"
      Common_Name[i] <- "diatom"
    }
    else if(SP[i]=="PTR"){
      Scientific_Name[i] <- "Pan_troglodytes"
      Common_Name[i] <- "Chimpanzee"
    }
    else if(SP[i]=="PVU"){
      Scientific_Name[i] <- "Phaseolus vulgaris"
      Common_Name[i] <- "Bean"
    }
    else if(SP[i]=="PXY"){
      Scientific_Name[i] <- "Plutella xylostella"
      Common_Name[i] <- "Diamondback mot"
    }
    else if(SP[i]=="RCO"){
      Scientific_Name[i] <- "Ricinus communis"
      Common_Name[i] <- "castor-oil-plant"
    }
    else if(SP[i]=="RGL"){
      Scientific_Name[i] <- "Rehmannia glutinosa"
      Common_Name[i] <- "Rehmannia glutinosa"
    }
    else if(SP[i]=="RLCV"){
      Scientific_Name[i] <- "Rhesus lymphocryptovirus"
      Common_Name[i] <- "gamma-1 herpesvirus"
    }
    else if(SP[i]=="RMI"){
      Scientific_Name[i] <- "Rhipicephalus microplus"
      Common_Name[i] <- "cattle tick"
    }
    else if(SP[i]=="RNO"){
      Scientific_Name[i] <- "Rattus_norvegicus"
      Common_Name[i] <- "Rat"
    }
    else if(SP[i]=="RRV"){
      Scientific_Name[i] <- "Rhesus monkey rhadinovirus"
      Common_Name[i] <- "Rhesus monkey rhadinovirus"
    }
    else if(SP[i]=="SBI"){
      Scientific_Name[i] <- "Sorghum bicolor"
      Common_Name[i] <- "Sorghum grass"
    }
    else if(SP[i]=="SBO"){
      Scientific_Name[i] <- "Saimiri boliviensis"
      Common_Name[i] <- "black-capped squirrel monkey"
    }
    else if(SP[i]=="SCI"){
      Scientific_Name[i] <- "Sycon ciliatum"
      Common_Name[i] <- "calcareous sponge"
    }
    else if(SP[i]=="SEU"){
      Scientific_Name[i] <- "Salicornia europaea"
      Common_Name[i] <- "glasswort"
    }
    else if(SP[i]=="SFR"){
      Scientific_Name[i] <- "Spodoptera frugiperda"
      Common_Name[i] <- "fall armyworm"
    }
    else if(SP[i]=="SHA"){
      Scientific_Name[i] <- "Sarcophilus_harrisii"
      Common_Name[i] <- "Tasmanian devil"
    }
    else if(SP[i]=="SJA"){
      Scientific_Name[i] <- "Schistosoma japonicum"
      Common_Name[i] <- "Schistosoma japonicum"
    }
    else if(SP[i]=="SKO"){
      Scientific_Name[i] <- "Saccoglossus kowalevskii"
      Common_Name[i] <- "Acorn worm"
    }
    else if(SP[i]=="SLA"){
      Scientific_Name[i] <- "Saguinus labiatus"
      Common_Name[i] <- "White-lipped tamarin"
    }
    else if(SP[i]=="SLY"){
      Scientific_Name[i] <- "Solanum lycopersicum"
      Common_Name[i] <- "Tomato"
    }
    else if(SP[i]=="SMA"){
      Scientific_Name[i] <- "Schistosoma mansoni"
      Common_Name[i] <- "Trematode"
    }
    else if(SP[i]=="SMC"){
      Scientific_Name[i] <- "Symbiodinium microadriaticum"
      Common_Name[i] <- "zooxanthellae"
    }
    else if(SP[i]=="SME"){
      Scientific_Name[i] <- "Schmidtea mediterranea"
      Common_Name[i] <- "Schmidtea mediterranea"
    }
    else if(SP[i]=="SMO"){
      Scientific_Name[i] <- "Selaginella moellendorffii"
      Common_Name[i] <- "Selaginella moellendorffii"
    }
    else if(SP[i]=="SMR"){
      Scientific_Name[i] <- "Strigamia maritima"
      Common_Name[i] <- "Strigamia maritima"
    }
    else if(SP[i]=="SOF"){
      Scientific_Name[i] <- "Saccharum officinarum"
      Common_Name[i] <- "Grass"
    }
    else if(SP[i]=="SPU"){
      Scientific_Name[i] <- "Strongylocentrotus purpuratus"
      Common_Name[i] <- "Purple sea urchin"
    }
    else if(SP[i]=="SSA"){
      Scientific_Name[i] <- "Salmo salar"
      Common_Name[i] <- "Atlantic salmon"
    }
    else if(SP[i]=="SSC"){
      Scientific_Name[i] <- "Sus_scrofa"
      Common_Name[i] <- "Wild boar"
    }
    else if(SP[i]=="SSL"){
      Scientific_Name[i] <- "Salvia sclarea"
      Common_Name[i] <- "clary sage"
    }
    else if(SP[i]=="SSP"){
      Scientific_Name[i] <- "Saccharum spontaneum"
      Common_Name[i] <- "wild sugarcane"
    }
    else if(SP[i]=="SSY"){
      Scientific_Name[i] <- "Symphalangus syndactylus"
      Common_Name[i] <- "black-furred gibbon"
    }
    else if(SP[i]=="STR"){
      Scientific_Name[i] <- "Strongyloides ratti"
      Common_Name[i] <- "Strongyloides ratti"
    }
    else if(SP[i]=="STU"){
      Scientific_Name[i] <- "Solanum tuberosum"
      Common_Name[i] <- "Potato"
    }
    else if(SP[i]=="SV40"){
      Scientific_Name[i] <- "Simian virus 40"
      Common_Name[i] <- "Simian virus 40"
    }
    else if(SP[i]=="TAE"){
      Scientific_Name[i] <- "Triticum aestivum"
      Common_Name[i] <- "Common wheat"
    }
    else if(SP[i]=="TCA"){
      Scientific_Name[i] <- "Tribolium castaneum"
      Common_Name[i] <- "red flour beetle"
    }
    else if(SP[i]=="TCC"){
      Scientific_Name[i] <- "Theobroma cacao"
      Common_Name[i] <- "cocoa tree"
    }
    else if(SP[i]=="TCF"){
      Scientific_Name[i] <- "Triops cancriformis"
      Common_Name[i] <- "tadpole shrimp"
    }
    else if(SP[i]=="TCH"){
      Scientific_Name[i] <- "Tupaia chinensis"
      Common_Name[i] <- "Tupaia chinensis"
    }
    else if(SP[i]=="TGU"){
      Scientific_Name[i] <- "Taeniopygia_guttata"
      Common_Name[i] <- "Zebra finch"
    }
    else if(SP[i]=="TNI"){
      Scientific_Name[i] <- "Tetraodon_nigroviridis"
      Common_Name[i] <- "green spotted puffer"
    }
    else if(SP[i]=="TRE"){
      Scientific_Name[i] <- "Terebratulina retusa"
      Common_Name[i] <- "Terebratulina retusa"
    }
    else if(SP[i]=="TTU"){
      Scientific_Name[i] <- "Triticum turgidum"
      Common_Name[i] <- "pasta wheat"
    }
    else if(SP[i]=="TUR"){
      Scientific_Name[i] <- "Tetranychus urticae"
      Common_Name[i] <- "red spider mite"
    }
    else if(SP[i]=="VCA"){
      Scientific_Name[i] <- "Vriesea carinata"
      Common_Name[i] <- "Vriesea carinata"
    }
    else if(SP[i]=="VUN"){
      Scientific_Name[i] <- "Vigna unguiculata"
      Common_Name[i] <- "Cowpea"
    }
    else if(SP[i]=="VVI"){
      Scientific_Name[i] <- "Vitis vinifera"
      Common_Name[i] <- "Grape vine"
    }
    else if(SP[i]=="XBO"){
      Scientific_Name[i] <- "Xenoturbella bocki"
      Common_Name[i] <- "Xenoturbella bocki"
    }
    else if(SP[i]=="XLA"){
      Scientific_Name[i] <- "Xenopus laevis"
      Common_Name[i] <- "African clawed frog"
    }
    else if(SP[i]=="XTR"){
      Scientific_Name[i] <- "Xenopus_tropicalis"
      Common_Name[i] <- "Western clawed frog"
    }
    else if(SP[i]=="ZMA"){
      Scientific_Name[i] <- "Zea Mays"
      Common_Name[i] <- "Corn"
    }
    else{
      Scientific_Name[i] <- "NA"
      Common_Name[i] <- "NA"
    }
    setTxtProgressBar(pb, i)
  }
  close(pb)
  MiRNADT$Common_Name <- Common_Name
  MiRNADT$Scientific_Name <- Scientific_Name
  return(MiRNADT)
}




#' MiRNAname
#'
#' Removes the species designation from the miRNA name and adds it to a new column on a data.frame
#' Requires a data.table with the column housing the miRNA labelled "miRNA_Name"
#'
#' @param  miDT a data table with one column labeled miRNA_Name
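#' @return The input table with the "miRNA_Name" column converted to upper case and an added column "miRNA" holding the miRNA name without its species prefix.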
#' @author Brendan Gongol
#' @export
#' @examples
#'miRNA_Name <- c("hsa-let-7","cel-lin-4","aae-mir-1","bol-mir-2","rno-mir-34","cel-mir-35",
#'                "cel-let-7-5p", "cel-let-7-3p", "cel-lin-4-5p", "cel-lin-4-3p", "cel-miR-1-5p", "ame-miR-9895",
#'                "hsa-miR-9896", "rno-miR-3478", "mmu-miR9897-5p", "cre-miR9897-3p")
#'MiRNADT<- data.frame(cbind(miRNA_Name))
#'MiRNAname(MiRNADT)
#'
#'Sequence <- c("UAGCGAUUGGAUCCGGUGAGGUAGUAGGUUGUAUAGUUUGGAAUAUUACCACCGGUGAACUAUGCAAUUUUCUACCUUACCGGAGACAGAACUCUUCGA",
#'              "AUGCUUCCGGCCUGUUCCCUGAGACCUCAAGUGUGAGUGUACUAGCGAUGCUUCACACCUGGGCUCUCCGGGUACCAGGACGGUUUGAGCAGAU",
#'              "AAAGUGACCGUACCGAGCUGCAUACUUCCUUACAUGCCCAUACUAUAUCAUAAAUGGAUAUGGAAUGUAAAGAAGUAUGUAGAACGGGGUGGUAGU ",
#'              "UAAACAGUAUACAGAAAGCCAUCAAAGCGGUGGUUGAUGUGUUGCAAAUUAUGACUUUCAUAUCACAGCCAGCUUUGAUGUGCUGCCUGUUGCACUGU",
#'              "CGGACAAUGCUCGAGAGGCAGUGUGGUUAGCUGGUUGCAUAUUUCCUUGACAACGGCUACCUUCACUGCCACCCCGAACAUGUCGUCCAUCUUUGAA",
#'              "UAGCGAUUCAGAUCGAGCCAUUGCUGGUUUCUUCCACAGUAGCGAUUUCCAUUAGAACUAUCACCGGGUGGAAACUAGCAGUGGCUCGAUUAGCGAU",
#'              "UGAGGUAGUAGGUUGUAUAGUU", "CUAUGCAUAGCGAUACCUUACC", "UCCCUGAGACCUCAAGUGUGA", "ACACCUGGGCUCUCCGGGUACC",
#'              "UAGCGAUCCUUACAUGCCCAUA", "UCGUGUCCGUUUCUCGUUUCGA", "ACAAUAAUCGGACACUAGCGAU", "CACACCGGACGAGAUUUCAU",
#'              "UACCGGGCGUGGGGAGGGCAGG", "UAGCGAUUCCUUCUUAGCGAU")
#'miRNA_Name <- c("hsa-let-7","cel-lin-4","aae-mir-1","bol-mir-2","rno-mir-34","cel-mir-35",
#'                "cel-let-7-5p", "cel-let-7-3p", "cel-lin-4-5p", "cel-lin-4-3p", "cel-miR-1-5p", "ame-miR-9895",
#'                "hsa-miR-9896", "rno-miR-3478", "mmu-miR9897-5p", "cre-miR9897-3p")
#'miRNA_type <- c("IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN",
#'                "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE")
#'MiRNADT<- data.frame(cbind(Sequence, miRNA_Name, miRNA_type))
#'MiRNAname(MiRNADT)

MiRNAname <- function(miDT){
  # Strips the species prefix from every entry of miDT$miRNA_Name and stores the
  # remainder in a new column miDT$miRNA.
  #
  # miDT: data.frame / data.table with a column "miRNA_Name" whose entries look
  #       like "<species>-<family>-<id>[-<arm>...]" (e.g. "hsa-miR-34-5p") or
  #       use the fused form "<species>-miR<id>[-<arm>]" (e.g. "mmu-miR9897-5p").
  #
  # Returns miDT with miRNA_Name upper-cased and the added column miRNA:
  #   "hsa-let-7"      -> "LET-7"
  #   "cel-lin-4"      -> "LIN-4"
  #   "mmu-miR9897-5p" -> "MIR-9897-5P"
  #   "cre-miR9897-3p" -> "MIR-9897-3P"
  # Entries that are NA or contain no "-" (no species prefix to remove) give NA.
  miDT$miRNA_Name <- toupper(miDT$miRNA_Name)
  n <- nrow(miDT)

  # Preallocate with NA so unparseable rows never shorten the result vector.
  NAM <- rep(NA_character_, n)

  if(n > 0){
    # txtProgressBar() requires max > min, so it is only created for n > 0.
    pb <- txtProgressBar(min = 0, max = n, style = 3)
    on.exit(close(pb), add = TRUE)

    tokens <- strsplit(miDT$miRNA_Name, split = "-", fixed = TRUE)
    for(i in seq_len(n)){
      spl <- tokens[[i]]

      if(length(spl) >= 2 && !anyNA(spl)){
        # Everything after the first token is the miRNA name proper.
        rest <- spl[-1]
        # Normalise the fused spelling "MIR9897" to "MIR-9897"; a bare "MIR"
        # token (already followed by a separate id token) is left untouched.
        rest[1] <- sub("^MIR(?=.)", "MIR-", rest[1], perl = TRUE)
        NAM[i] <- paste(rest, collapse = "-")
      }
      setTxtProgressBar(pb, i)
    }
  }

  miDT$miRNA <- NAM
  return(miDT)
}



#' MISeed
#'
#' Adds a column named "seed_Sequence" to a data.table containing the seed sequence (nucleotides 2 through 8) of each miRNA.
#' Requires a column named "Sequence" housing the miRNA sequence and a column named "miRNA_Name"
#' housing the name of the miRNA.
#'
#' @param  DT a data table with one column labeled Sequence
#' @return The input table with an added column "seed_Sequence" holding nucleotides 2 through 8 of each entry in "Sequence".
#' @author Brendan Gongol
#' @export
#' @examples
#' Sequence <- c("UGAGGUAGUAGGUUGUAUAGUU", "CUAUGCAUAGCGAUACCUUACC", "UCCCUGAGACCUCAAGUGUGA", "ACACCUGGGCUCUCCGGGUACC",
#'               "UAGCGAUCCUUACAUGCCCAUA", "UCGUGUCCGUUUCUCGUUUCGA", "ACAAUAAUCGGACACUAGCGAU", "CACACCGGACGAGAUUUCAU",
#'               "UACCGGGCGUGGGGAGGGCAGG", "UAGCGAUUCCUUCUUAGCGAU")
#' miRNA_Name <- c("hsa-let-7","cel-lin-4","aae-mir-1","bol-mir-2","rno-mir-34",
#'                 "cel-lin-4-5p", "ame-miR-9895", "rno-miR-3478", "mmu-miR9897-5p", "cre-miR9897-3p")
#' miRNA_type <- c("MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE")
#' MiRNADT<- data.frame(cbind(Sequence, miRNA_Name, miRNA_type))
#' MiRNADT$Sequence <- as.character(MiRNADT$Sequence)
#' MISeed(MiRNADT)

MISeed <- function(DT){
  # Extracts the miRNA seed (nucleotides 2 through 8) of every entry in
  # DT$Sequence and stores it in a new column DT$seed_Sequence.
  #
  # DT: data.frame / data.table with a column "Sequence" (character or factor).
  #
  # Returns DT with the added column seed_Sequence. Sequences shorter than
  # 8 nucleotides yield whatever part of positions 2..8 exists (possibly "")
  # rather than being padded with the text "NA"; an NA sequence yields NA.
  if(is.null(DT$Sequence)){
    stop("'DT' must contain a column named \"Sequence\"", call. = FALSE)
  }
  n <- nrow(DT)
  seedSeq <- rep(NA_character_, n)

  if(n > 0){
    # txtProgressBar() requires max > min, so it is only created for n > 0.
    pb <- txtProgressBar(min = 0, max = n, style = 3)
    on.exit(close(pb), add = TRUE)

    for(i in seq_len(n)){
      # substr() coerces factors to character and truncates at the string end.
      seedSeq[i] <- substr(DT$Sequence[i], 2, 8)
      setTxtProgressBar(pb, i)
    }
  }

  DT$seed_Sequence <- seedSeq
  return(DT)
}



#' MIQuerySeq
#'
#' Adds a column named "Query_Sequence" to the original data.table containing the query sequence of each miRNA (the reverse complement of its seed, written as DNA).
#' Requires a column named "seed_Sequence" housing the miRNA seed sequence and a column named "miRNA_Name" housing the name of the miRNA.
#'
#' @param  DT a data table with one column labeled seed_Sequence
#' @param wobble a logical statement either TRUE or FALSE.  If wobble = TRUE, will return a query sequence allowing for G-U basepairing.  If wobble = FALSE, will return a query sequence that considers only Watson-Crick base pairing.
#' @return The input table with an added column "Query_Sequence" holding the query pattern derived from each entry in "seed_Sequence".
#' @author Brendan Gongol
#' @export
#' @examples
#' Sequence <- c("UGAGGUAGUAGGUUGUAUAGUU", "CUAUGCAUAGCGAUACCUUACC", "UCCCUGAGACCUCAAGUGUGA", "ACACCUGGGCUCUCCGGGUACC",
#'               "UAGCGAUCCUUACAUGCCCAUA", "UCGUGUCCGUUUCUCGUUUCGA", "ACAAUAAUCGGACACUAGCGAU", "CACACCGGACGAGAUUUCAU",
#'               "UACCGGGCGUGGGGAGGGCAGG", "UAGCGAUUCCUUCUUAGCGAU")
#' miRNA_Name <- c("hsa-let-7","cel-lin-4","aae-mir-1","bol-mir-2","rno-mir-34",
#'                 "cel-lin-4-5p", "ame-miR-9895", "rno-miR-3478", "mmu-miR9897-5p", "cre-miR9897-3p")
#' miRNA_type <- c("MATURE", "MATURE", "MATURE", "MATURE", "MATURE",
#'                 "MATURE", "MATURE", "MATURE", "MATURE", "MATURE")
#' seed_Sequence <- c("GAGGUAG", "UAUGCAU", "CCCUGAG", "CACCUGG", "AGCGAUC",
#'                    "CGUGUCC", "CAAUAAU", "ACACCGG", "ACCGGGC", "AGCGAUU")
#' MiRNADT<- data.frame(cbind(Sequence, miRNA_Name, miRNA_type, seed_Sequence))
#' MiRNADT$Sequence <- as.character(MiRNADT$Sequence)
#' MiRNADT$seed_Sequence <- as.character(MiRNADT$seed_Sequence)
#' MIQuerySeq(MiRNADT)

MIQuerySeq <- function(DT, wobble = FALSE){
  # Builds, for every seed in DT$seed_Sequence, the pattern to look for in a
  # target written as DNA: the seed is reversed and complemented
  # (A->T, U->A, G->C, C->G) and stored in a new column DT$Query_Sequence.
  #
  # DT:     data.frame / data.table with a column "seed_Sequence".
  # wobble: single TRUE/FALSE. If TRUE, every C of the complement becomes
  #         "(C|T)" and every A becomes "(A|G)", so the resulting regular
  #         expression also accepts G-U wobble pairs.
  #
  # Returns DT with the added column Query_Sequence; an NA seed yields NA.
  wobble <- as.logical(wobble)
  if(length(wobble) != 1 || is.na(wobble)){
    stop("'wobble' must be a single TRUE or FALSE", call. = FALSE)
  }
  if(is.null(DT$seed_Sequence)){
    stop("'DT' must contain a column named \"seed_Sequence\"", call. = FALSE)
  }

  x <- as.character(DT$seed_Sequence)
  n <- nrow(DT)
  queSeq <- rep(NA_character_, n)

  if(n > 0){
    # txtProgressBar() requires max > min, so it is only created for n > 0.
    pb <- txtProgressBar(min = 0, max = n, style = 3)
    on.exit(close(pb), add = TRUE)

    for(i in seq_len(n)){
      if(!is.na(x[i])){
        # Reverse the seed base by base, then complement it into DNA letters.
        y <- rev(strsplit(x[i], split = "")[[1]])
        y <- chartr("AUGC", "TACG", y)
        if(wobble){
          # Order matters only in that "(C|T)" contains no "A", so the second
          # substitution cannot touch the output of the first.
          y <- gsub("C", "(C|T)", y)
          y <- gsub("A", "(A|G)", y)
        }
        queSeq[i] <- paste(y, collapse = "")
      }
      setTxtProgressBar(pb, i)
    }
  }

  DT$Query_Sequence <- queSeq
  return(DT)
}



#' MIRNATargetpredict
#'
#' uses a data.table with a column named Query_Sequence and a column named miRNA_Name to predict what miRNAs
#' target one or more mRNAs housed in a data.table
#'
#' Requires miRNA DT with a column named "Query_Sequence" housing the miRNA sequence, a column named "Scientific_Name", and a column named "miRNA_Name"
#' Requires mRNA DT with a columns named "Sequence", "external_gene_name", "Species", and "Scientific_Name".
#'
#' @param  MiRNADT a data table with one column labeled Query_Sequence and a column labeled miRNA_Name
#' @param  mRNADT a data table with three columns labeled: Sequence, external_gene_name, Species
#' @param  type a character indicating "Single" or "Multiple".  If type = "Multiple", will match miRNAs to mRNAs by the Scientific_Name in the miRNA DT
#' to the Scientific_Name in the mRNA DT.  Both of these species profiles must match across both data tables.  Requires the Spe argument indicating what species to use.
#' @param  Spe a character string indicating the species to use when type = "Multiple" is used.
#' The following species are available if used in conjunction with other functions in the EntroSolve package: "Anolis_carolinensis", "Bos_taurus", "Caenorhabditis_elegans",
#' "Canis_familiaris", "Ciona_intestinalis", "Ciona_savignyi", "Danio_rerio", "Drosophila_melanogaster", "Equus_caballus", "Gallus_gallus", "Gorilla_gorilla", "Homo_sapiens",
#'  "Macaca_mulatta", "Monodelphis_domestica", "Mus_musculus", "Ornithorhynchus_anatinus", "Oryctolagus_cuniculus", "Oryzias_latipes", "Ovis_aries", "Pan_troglodytes",
#'  "Petromyzon_marinus", "Rattus_norvegicus", "Sus_scrofa", "Taeniopygia_guttata", "Tetraodon_nigroviridis", "Xenopus_tropicalis".
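#' @return A data table with one row per match, containing the following columns: "Query_Sequence", "start", "end", "miRNA_Name", "mRNA_Name", "Species", "Number_Hits", "length", and "Sequence" (the mRNA sequence with the matched site converted to lower case).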
#' @author Brendan Gongol
#' @importFrom data.table setnames
#' @importFrom data.table data.table
#' @importFrom data.table setkey
#' @importFrom stringr str_locate_all
#' @export
#' @examples
#' Sequence <- c("CTGTTTCGAGCCTGAATCTCGATCGCTCGCGCTAGACAGCTCGACGCACTTTTCAGCAGGAGCCTG",
#'              "TCAGCAGATAGCGCTCGATACAGCTCGACAGCTCTTGCTGTATTGTGTG",
#'              "TTGCTGTATTGTGTGATCCTCGATACAGGTATTTTCTGAGCCTGATAGCTAGCTTTGCTGTATTGTGTG",
#'              "AAAAAATTTTTTCCCCCCGGGGGG")
#' external_gene_name <- c("AKT", "PI3K", "SREBP", "FOXO")
#' Species <- c("Rat", "Mouse", "Human", "Pig")
#' mRNADTsamp <- data.frame(cbind(Sequence, external_gene_name, Species))
#' mRNADTsamp$Sequence <- as.character(mRNADTsamp$Sequence)
#'
#' Query_Sequence <- c("(C|T)T(A|G)(C|T)(C|T)T(C|T)", "TTG(C|T)G(C|T)(A|G)", "(A|G)(C|T)(A|G)TT(C|T)(C|T)")
#' miRNA_Name <- c("MMU-LET-7G-5P", "MMU-LET-7I-3P", "MMU-MIR-1A-3P")
#' miRNADT1 <-  data.frame(cbind(Query_Sequence, miRNA_Name))
#' miRNADT1$Query_Sequence <- as.character(miRNADT1$Query_Sequence)
#'
#' MIRNATargetpredict(miRNADT1, mRNADTsamp, type = "Single")


MIRNATargetpredict <- function(MiRNADT, mRNADT, type, Spe){
  # Searches every mRNA sequence for matches of every miRNA query pattern and
  # returns one row per match.
  #
  # MiRNADT: table with columns Query_Sequence and miRNA_Name; the "Multiple"
  #          branch additionally filters on Scientific_Name.
  # mRNADT:  table with columns Sequence and external_gene_name; the "Single"
  #          branch also reads Species, the "Multiple" branch filters on
  #          Scientific_Name.
  # type:    "Single" or "Multiple". It has no default, and any other value
  #          skips both `if` blocks so that nothing useful is returned.
  # Spe:     vector of Scientific_Name values to process; only read when
  #          type == "Multiple".

  if(type == "Single"){
    # Helper: match all query patterns of MiRNADT2 against ONE mRNA row.
    # The caller below always passes a single row (mRNADT[i,]); the code relies
    # on that when it reads spl[[1]] only.
    MIRNATargetpredict2 <- function(MiRNADT2, mRNADT2){
      conse2 <- MiRNADT2$Query_Sequence
      Name <- MiRNADT2$miRNA_Name
      sequence <- mRNADT2$Sequence
      #### query the input DNA sequence for consensus sequence elements.
      # Start from an empty data.table; presumably so that the rbind() results
      # support the data.table `[, j, by]` syntax used further down.
      hold <- data.table(NULL)
      for(i in 1:length(conse2)){
        # start/end positions of every match of pattern conse2[i] in the sequence.
        lis2 <- as.data.frame(str_locate_all(sequence, conse2[i]))
        if(nrow(lis2) > 0){
          dt <- cbind(lis2, Query_Sequence=MiRNADT2$Query_Sequence[i], miRNA_Name = Name[i], mRNA_Name = mRNADT2$external_gene_name, Species = mRNADT2$Species)
          hold <- rbind(hold, dt)

        }
      }

      if(nrow(hold) > 0){
        #### Count the number of times each consensus sequence appears.
        # NOTE(review): grouped by Query_Sequence only, whereas the "Multiple"
        # branch groups by Query_Sequence and miRNA_Name. If two miRNAs share a
        # query sequence their hits appear to be pooled here - confirm intent.
        numhits <- hold[,length(start), by=Query_Sequence]
        hold <- merge(hold, numhits, by="Query_Sequence", allow.cartesian=TRUE)

        #### Add the length of each mRNA sequence
        spl <- strsplit(sequence, split="")
        hold$length <- length(spl[[1]])


        # Add the sequences with hits reformatted into lower case.
        # For every hit the full sequence is rebuilt as
        # <before hit><hit in lower case><after hit>.
        # NOTE(review): when a hit covers the whole sequence (start == 1 and
        # end == length) both the first and the third branch run, and the ranges
        # 1:(start-1) and (end+1):length count backwards - the result for that
        # case looks wrong.
        spl <- strsplit(sequence, split="")
        chromtot2 <- NULL
        for(i in 1:length(hold$start)){

          # Hit ends at the last character: nothing follows the hit.
          if((hold$end[i] == hold$length[i])){
            upper <- spl[[1]][hold$start[i]:hold$end[i]]
            lower <- tolower(upper)

            sequencelow <- paste(lower, collapse="")
            first <- paste(spl[[1]][1:(hold$start[i]-1)], collapse="")

            chromtot <- paste(first, sequencelow, collapse="")
            # paste() joined the pieces with a space; remove it again.
            chromtot2[i] <- gsub(" ", "", chromtot)

          }
          # Hit lies strictly inside the sequence.
          if((hold$start[i]-1) > 0 & (!(hold$end[i] == hold$length[i]))){
            upper <- spl[[1]][hold$start[i]:hold$end[i]]
            lower <- tolower(upper)

            first <- paste(spl[[1]][1:(hold$start[i]-1)], collapse="")
            sequencelow <- paste(lower, collapse="")
            last <- paste(spl[[1]][(hold$end[i]+1):length(spl[[1]])], collapse="")

            chromtot <- paste(first, sequencelow, last, collapse="")
            chromtot2[i] <- gsub(" ", "", chromtot)

          }
          # Hit starts at the first character: nothing precedes the hit.
          if((hold$start[i]-1) == 0){
            upper <- spl[[1]][hold$start[i]:hold$end[i]]
            lower <- tolower(upper)

            sequencelow <- paste(lower, collapse="")
            last <- paste(spl[[1]][(hold$end[i]+1):length(spl[[1]])], collapse="")

            chromtot <- paste(sequencelow, last, collapse="")
            chromtot2[i] <- gsub(" ", "", chromtot)
          }
          # NOTE(review): has no effect - `for` reassigns i on every iteration.
          i <- i + 1
        }

        hold$Sequence <- chromtot2

        # Only V1 (the per-group count) actually changes name: V1 -> Number_Hits.
        setnames(hold, c("Query_Sequence", "start", "end", "miRNA_Name", "mRNA_Name", "Species", "V1", "length", "Sequence"),
                 c("Query_Sequence", "start", "end", "miRNA_Name", "mRNA_Name", "Species", "Number_Hits", "length", "Sequence"))
      }

      return(hold)
    }

    # Run the helper once per mRNA row and stack the per-row hit tables.
    pb <- txtProgressBar(min = 0, max = nrow(mRNADT), style = 3)
    MiRNAhits <- NULL
    for(i in 1:nrow(mRNADT)){
      fir <- MIRNATargetpredict2(MiRNADT, mRNADT[i,])
      MiRNAhits <- rbind(MiRNAhits, fir)
      setTxtProgressBar(pb, i)
    }
    close(pb)
    return(MiRNAhits)

  }


  if(type == "Multiple"){

    # Helper: all miRNAs of one species against all mRNAs of the same species.
    MIRNATargetpredict3 <- function(MiRNADT3, mRNADT3){
      # Same per-row matcher as in the "Single" branch, except that no Species
      # column is attached here (it is added per species by the caller) and the
      # hit count is grouped by Query_Sequence AND miRNA_Name.
      MIRNATargetpredict2 <- function(MiRNADT2, mRNADT2){
        conse2 <- MiRNADT2$Query_Sequence
        Name <- MiRNADT2$miRNA_Name
        sequence <- mRNADT2$Sequence
        #### query the input DNA sequence for consensus sequence elements.
        hold <- data.table(NULL)
        for(i in 1:length(conse2)){
          # start/end positions of every match of pattern conse2[i] in the sequence.
          lis2 <- as.data.frame(str_locate_all(sequence, conse2[i]))
          if(nrow(lis2) > 0){
            dt <- cbind(lis2, Query_Sequence=MiRNADT2$Query_Sequence[i], miRNA_Name = Name[i], mRNA_Name = mRNADT2$external_gene_name)
            hold <- rbind(hold, dt)

          }
        }

        if(nrow(hold) > 0){
          #### Count the number of times each consensus sequence appears.
          numhits <- hold[,length(start), by=c("Query_Sequence", "miRNA_Name")]
          hold <- merge(hold, numhits, by=c("Query_Sequence", "miRNA_Name"), allow.cartesian=TRUE)

          #### Add the length of each mRNA sequence
          spl <- strsplit(sequence, split="")
          hold$length <- length(spl[[1]])


          # Add the sequences with hits reformatted into lower case.
          # NOTE(review): same whole-sequence-hit caveat as in the "Single"
          # branch (first and third branch both run, ranges count backwards).
          spl <- strsplit(sequence, split="")
          chromtot2 <- NULL
          for(i in 1:length(hold$start)){

            # Hit ends at the last character: nothing follows the hit.
            if((hold$end[i] == hold$length[i])){
              upper <- spl[[1]][hold$start[i]:hold$end[i]]
              lower <- tolower(upper)

              sequencelow <- paste(lower, collapse="")
              first <- paste(spl[[1]][1:(hold$start[i]-1)], collapse="")

              chromtot <- paste(first, sequencelow, collapse="")
              chromtot2[i] <- gsub(" ", "", chromtot)

            }
            # Hit lies strictly inside the sequence.
            if((hold$start[i]-1) > 0 & (!(hold$end[i] == hold$length[i]))){
              upper <- spl[[1]][hold$start[i]:hold$end[i]]
              lower <- tolower(upper)

              first <- paste(spl[[1]][1:(hold$start[i]-1)], collapse="")
              sequencelow <- paste(lower, collapse="")
              last <- paste(spl[[1]][(hold$end[i]+1):length(spl[[1]])], collapse="")

              chromtot <- paste(first, sequencelow, last, collapse="")
              chromtot2[i] <- gsub(" ", "", chromtot)

            }
            # Hit starts at the first character: nothing precedes the hit.
            if((hold$start[i]-1) == 0){
              upper <- spl[[1]][hold$start[i]:hold$end[i]]
              lower <- tolower(upper)

              sequencelow <- paste(lower, collapse="")
              last <- paste(spl[[1]][(hold$end[i]+1):length(spl[[1]])], collapse="")

              chromtot <- paste(sequencelow, last, collapse="")
              chromtot2[i] <- gsub(" ", "", chromtot)
            }
            # NOTE(review): has no effect - `for` reassigns i on every iteration.
            i <- i + 1
          }

          hold$Sequence <- chromtot2

          # Only V1 (the per-group count) actually changes name: V1 -> Number_Hits.
          setnames(hold, c("Query_Sequence", "start", "end", "miRNA_Name", "mRNA_Name", "V1", "length", "Sequence"),
                   c("Query_Sequence", "start", "end", "miRNA_Name", "mRNA_Name", "Number_Hits", "length", "Sequence"))
        }

        return(hold)
      }

      # Run the matcher once per mRNA row and stack the per-row hit tables.
      MiRNAhits <- NULL
      for(i in 1:nrow(mRNADT3)){
        fir <- MIRNATargetpredict2(MiRNADT3, mRNADT3[i,])
        MiRNAhits <- rbind(MiRNAhits, fir)
      }
      return(MiRNAhits)

    }


    # One pass per requested species: restrict both tables to that species,
    # collect the hits and tag them with the species name.
    pb <- txtProgressBar(min = 0, max = length(Spe), style = 3)
    Hits_TOT <- NULL
    for(i in 1:length(Spe)){

      MiRNA <- MiRNADT[MiRNADT$Scientific_Name == Spe[i],]
      # NOTE(review): no comma after the condition, unlike the line above. This
      # selects rows only if mRNADT is a data.table; on a plain data.frame a
      # single logical index selects columns - confirm the expected input class.
      mRNA <- mRNADT[mRNADT$Scientific_Name == Spe[i]]


      hits <- MIRNATargetpredict3(MiRNA, mRNA)
      hits$Species <- Spe[i]

      Hits_TOT <- rbind(Hits_TOT, hits)
      setTxtProgressBar(pb, i)
    }
    close(pb)
    return(Hits_TOT)

  }
}


#' IUPAC_boolean
#'
#' Converts IUPAC coded DNA sequences to a boolean syntax that can be utilized in query searches.
#'
#' Requires a vector of DNA characters in IUPAC code.
#'
#' There are three different stringency levels: high, medium, and low. The high stringency substitutes all IUPAC symbols
#' for their nucleotide designations.  Medium stringency substitutes IUPAC symbols with one and two nucleotide designations
#' while treating those with three nucleotide designations as any nucleotide.  Low stringency retains only IUPAC symbols with
#' one nucleotide designation while treating all others as any nucleotide.
#'
#' @param  DNAseq a vector of DNA characters in IUPAC code.
#' @param stringency a character designating "high", "medium", or "low" stringency levels.
#' @return A vector of DNA characters converted to boolean syntax.
#' @author Brendan Gongol
#' @export
#' @examples
#' sequences <- c("RTSWKMBDHVNATAATCGCTCCATACCTACATCN", "ATAGNNNCTCGACATWKMBACATCGCTACANNTACATAC")
#' IUPAC_Boolean(sequences, stringency = "high")
#' IUPAC_Boolean(sequences, stringency = "medium")
#' IUPAC_Boolean(sequences, stringency = "low")

IUPAC_Boolean <- function(DNAseq, stringency){
  # Convert IUPAC ambiguity codes in `DNAseq` into regular-expression syntax.
  #
  # DNAseq:     character vector of IUPAC-coded DNA sequences.
  # stringency: "high"   - every ambiguity code is expanded to its alternatives;
  #             "medium" - two-base codes are expanded, three-base codes become ".";
  #             "low"    - every ambiguity code becomes ".".
  # Returns a character vector the same length as `DNAseq`.
  # Stops with an explicit message when `stringency` is not one of the three
  # supported levels (previously this surfaced as "object 'P10' not found").
  if(!isTRUE(stringency %in% c("high", "medium", "low"))){
    stop("'stringency' must be one of: \"high\", \"medium\", \"low\"", call. = FALSE)
  }

  # Replacement tables, in the order the substitutions are applied.
  two_base <- c(R = "(A|G)", Y = "(C|T)", S = "(G|C)",
                W = "(A|T)", K = "(G|T)", M = "(A|C)")
  three_base <- c(B = "(C|G|T)", D = "(A|G|T)", H = "(A|C|T)", V = "(A|C|G)")

  # Relax the tables according to the requested stringency; `x[] <- "."`
  # keeps the names (the IUPAC letters) while overwriting every value.
  if(stringency != "high"){
    three_base[] <- "."
  }
  if(stringency == "low"){
    two_base[] <- "."
  }
  substitutions <- c(two_base, three_base, N = ".")

  # None of the replacement strings contains an IUPAC ambiguity letter, so
  # applying the substitutions one after another cannot re-expand earlier output.
  converted <- DNAseq
  for(code in names(substitutions)){
    converted <- gsub(code, substitutions[[code]], converted, fixed = TRUE)
  }
  return(converted)
}



#' TFpredict
#'
#' If type = "single"
#' uses a data.table with a column named Targeting_Factor and a column named Consensus_Sequence to predict what Targeting Factor consensus sequences
#' are located within a Protein, DNA, or RNA sequence
#' Requires a character vector containing the Protein, DNA, or RNA sequence of interest.
#' Requires transcription factor data table with a column labeled "Targeting_Factor" housing the targeting protein of interest and a column labeled
#' "Consensus_Sequence" housing the targeting protein consensus sequence.
#'
#' If type = "multiple"
#' Uses two data.tables.  The first one (Targeting_Factor_DT) houses a column named "Targeting_Factor" and a column named "Consensus_Sequence". The
#' second one (Target) houses a column named "Sequence" and a column named "gene_symbol".  It returns what targeting protein consensus sequences
#' housed in Targeting_Factor_DT are located within a data table of protein, DNA, RNA sequence housed in Target.
#' Requires data.table (Targeting_Factor_DT) with a column named "Targeting_Factor" and a column named "Consensus_Sequence" housing the consensus sequence.
#' Requires data.table (Target) with a columns named "Sequence" and "gene_symbol" housing the protein, DNA, or RNA sequences and designated names to query.
#'
#' If type = "multiple_species"
#' Uses two data.tables.  The first one (Targeting_Factor_DT) houses a column named "Targeting_Factor" and a column named "Consensus_Sequence". The
#' second one (Target) houses a column named "Sequence", a column named "gene_symbol",  column labeled "Common_Name", and a column labeled "Scientific_Name".  It returns what consensus sequences
#' housed in Targeting_Factor_DT are located within a data table of protein, DNA, or RNA sequence housed in Target across multiple species.
#' Requires data.table (Targeting_Factor_DT) with a column named "Targeting_Factor" and a column named "Consensus_Sequence" housing the consensus sequence.
#' Requires data.table (Target) with a columns named "Sequence", "gene_symbol", "Common_Name", and "Scientific_Name", housing the protein, DNA, or RNA sequences and designated names to query.
#'
#' If type = "multiple_species_unknown"
#' Used if the Target data.table does not contain a column labeled "Common_Name".
#' Uses two data.tables.  The first one (Targeting_Factor_DT) houses a column named "Targeting_Factor" and a column named "Consensus_Sequence". The
#' second one (Target) houses a column named "Sequence", a column named "gene_symbol", and a column labeled "Scientific_Name".  It returns what consensus sequences
#' housed in Targeting_Factor_DT are located within a data.table of protein, DNA, or RNA sequence housed in Target across multiple species.
#' Requires data.table (Targeting_Factor_DT) with a column named "Targeting_Factor" and a column named "Consensus_Sequence" housing the consensus sequence.
#' Requires data.table (Target) with a columns named "Sequence", "gene_symbol", and "Scientific_Name", housing the protein, DNA, or RNA sequences and designated names to query.
#'
#'
#' @param  type a single character indicating "single", "multiple", "multiple_species", or "multiple_species_unknown"
#' @param  Target If type = "single", a vector containing one protein, DNA, or RNA character string. If type = "multiple", a data.table containing two columns, one labeled "Sequence" and one labeled "gene_symbol".
#' If type = "multiple_species", a data.table containing four columns, one labeled "Sequence" one labeled "gene_symbol", one labeled "Common_Name", and one labeled "Scientific_Name". If type = "multiple_species_unknown",
#' a data.table containing three columns, one labeled "Sequence" one labeled "gene_symbol", and one labeled "Scientific_Name".
#' @param  Targeting_Factor_DT a data table with two columns labeled: Targeting_Factor and Consensus_Sequence
#' @return A data table
#' @author Brendan Gongol
#' @importFrom data.table setnames
#' @importFrom data.table data.table
#' @importFrom data.table setkey
#' @importFrom stringr str_locate_all
#' @export
#' @examples
#' Targeting_Factor <- c("AMPK", "PKA", "PKC", "MAPK", "CAMKKB", "CAMKI", "CAMKIV", "CKII", "CDK", "SRC", "AKT")
#' Consensus_Sequence <- c("AGCNVTQ", "PPKLYS", "AAT","AAAAAAAAAAATTGCNVMDEDE", "AA", "ATA", "GTTT", "AAAAAAAAAAAAAAAAAAAAA", "(A|T)..(H|F)", "GCTAAGCTGCGCAATTTTTGTATTTTGT|AGTTCTTTTTGTGTATTAGCTCAGATTTTCCAGCTG","(A|T)T(A|G)(C|T)(C|T)T(C|T)")
#'
#' data_table <- data.table(cbind(Targeting_Factor, Consensus_Sequence))
#' data_table
#'
#' Prot <- "ACQVAPKLHGEAGCNVTQDWTYMMGVCSTASYAATWEQDEPLWYMAATNGHCATWWAAASSCATAFQTSKLPIIIGHATSDF"
#'
#' Sequence <- c("ACDFEQAGCNVTQPCTSTSGANDEPHYYASTGFWYKAGCNVTQETCCKLLHAQSWW",
#'               "ACQVAPKLHGEDWTYMMGVCSTASYWEQDEPLWYMNGHCATWWAAASSCTAQTSKLPIIIGHATSDF",
#'               "TGHATSHCTANMKLPYWQEDTGSCANMHGTYYYDEDEDDASQWWWMNNNCGYTEWSDFGCPKKK",
#'               "AAAATSTSTSGGGGGAAACCCCNNNNMMMMPPPPWWWWQHGGTTYYNNCCAA", "AAAAAATTTTTTCCCCCCGGGGGG")
#' gene_symbol <- c("PABP", "EIF4E", "SREBP", "FOXO", "ABCA1")
#' Common_Name <- c("Rat", "Mouse", "Human", "Cattle", "Dog")
#' Scientific_Name <- c("Rattus_norvegicus", "Mus_musculus", "Homo_sapiens", "Bos_taurus", "Canis_familiaris")
#' proteinTarg <- data.frame(cbind(gene_symbol,Sequence, Common_Name, Scientific_Name))
#' proteinTarg$Sequence <- as.character(proteinTarg$Sequence)
#' proteinTarg$gene_symbol <- as.character(proteinTarg$gene_symbol)
#' proteinTarg$Common_Name <- as.character(proteinTarg$Common_Name)
#' proteinTarg$Scientific_Name <- as.character(proteinTarg$Scientific_Name)
#' proteinTarg
#'
#'
#' TFpredict(Prot, data_table , type = "single")
#' TFpredict(proteinTarg, data_table, type = "multiple")
#' TFpredict(proteinTarg, data_table, type = "multiple_species")
#' TFpredict(proteinTarg, data_table, type = "multiple_species_unknown")
#'
#'
#' Targeting_Factor <- c("KLF2", "PGC1A", "FOXO1", "NCL", "SREBP", "MYC", "HIF", "NF-KB", "TXNIP", "PPAR")
#' Consensus_Sequence <- c("AAGCT", "GCGC", "AAT","AAAAAAAAAAAAAAAAAAAAAAAA", "AA", "ATA", "GTTT", "AAAAAAAAAAAAAAAAAAAAA", "(A|T)..(C|G)", "GCTAAGCTGCGCAATTTTTGTATTTTGT|AGTTCTTTTTGTGTATTAGCTCAGATTTTCCAGCTG")
#' TX_data_table <- data.table(cbind(Targeting_Factor, Consensus_Sequence))
#' TX_data_table
#'
#' chromo_seq <- "AAGCTAAGCTAAGCTGCGCAATTTTTGTATTTTGTTTAAACAGAATCCTCAAGGGAACATCATCCTCAGTTCTTTTTGTGTATTAGCTCAGATTTTCCAGCTGTTTTTAAAGCT"
#'
#'
#' Sequence <- c("AAGCTAAGCTAAGCTGCGCAATTTTTGTATTTTGTTTAAACAGAATCCTCAAGGGAACATCATCCTCAGTTCTTTTTGTGTATTAGCTCAGATTTTCCAGCTGTTTTTAAAGCT",
#'               "CTGTTTCGAGCCTGAATCTCGATCGCTCGCGCTAGACAGCTCGACGCACTTTTCAGCAGGAGCCTG",
#'               "TCAGCAGATAGCGCTCGATACAGCTCGACAGCTCTTGCTGTATTGTGTG",
#'               "TTGCTGTATTGTGTGATCCTCGATACAGGTATTTTCTGAGCCTGATAGCTAGCTTTGCTGTATTGTGTG",
#'               "AAAAAATTTTTTCCCCCCGGGGGG")
#' gene_symbol <- c("AKT", "PI3K", "SREBP", "FOXO", "ABCA1")
#' Common_Name <- c("Rat", "Mouse", "Human", "Cattle", "Dog")
#' Scientific_Name <- c("Rattus_norvegicus", "Mus_musculus", "Homo_sapiens", "Bos_taurus", "Canis_familiaris")
#' chromo <- data.frame(cbind(Sequence, gene_symbol, Common_Name, Scientific_Name))
#' chromo$Sequence <- as.character(chromo$Sequence)
#' chromo$gene_symbol <- as.character(chromo$gene_symbol)
#' chromo$Common_Name <- as.character(chromo$Common_Name)
#' chromo$Scientific_Name <- as.character(chromo$Scientific_Name)
#' chromo
#'
#'
#'
#' TFpredict(chromo_seq, TX_data_table , type = "single")
#' TFpredict(chromo, TX_data_table, type = "multiple")
#' TFpredict(chromo, TX_data_table, type = "multiple_species")
#' TFpredict(chromo, TX_data_table, type = "multiple_species_unknown")


TFpredict <- function(Target, Targeting_Factor_DT, type){
  # Locate consensus sequences from `Targeting_Factor_DT` inside one sequence
  # (type = "single") or inside every row of a table of sequences
  # ("multiple", "multiple_species", "multiple_species_unknown").
  #
  # Returns a data.table with the columns Consensus_Sequence, start, end,
  # Number_Hits, the remaining columns of `Targeting_Factor_DT`, length,
  # Sequence (the target with the hit in lower case) and Identified_sequence
  # (the hit plus one flanking character on each side, where one exists).
  # The table modes add gene_symbol, and the species modes add Species
  # (plus Scientific_Name for "multiple_species").  When nothing is found the
  # result is an empty data.table for "single" and NULL for the table modes.

  valid_types <- c("single", "multiple", "multiple_species", "multiple_species_unknown")
  if(!isTRUE(type %in% valid_types)){
    stop("'type' must be one of: ", paste(valid_types, collapse = ", "), call. = FALSE)
  }

  # Scan a single sequence string for every consensus sequence.
  scan_sequence <- function(seq_string){
    hold <- data.table(NULL)
    # A missing sequence cannot contain a hit.
    if(anyNA(seq_string)){
      return(hold)
    }

    # Each distinct pattern is searched once; searching duplicates repeatedly
    # would inflate Number_Hits and duplicate the output rows.
    consensus <- unique(as.character(Targeting_Factor_DT$Consensus_Sequence))
    consensus <- consensus[!is.na(consensus)]
    for(ce in consensus){
      lis2 <- as.data.frame(str_locate_all(seq_string, ce))
      if(nrow(lis2) > 0){
        hold <- rbind(hold, cbind(lis2, Consensus_Sequence = ce))
      }
    }
    if(nrow(hold) == 0){
      return(hold)
    }

    # Count the hits per consensus sequence (column V1), then attach the
    # targeting factor annotation.
    setkey(hold, Consensus_Sequence)
    numhits <- hold[, length(end), by = Consensus_Sequence]
    hold <- merge(hold, numhits, by = "Consensus_Sequence", allow.cartesian = TRUE)
    hold <- merge(hold, Targeting_Factor_DT, by = "Consensus_Sequence", allow.cartesian = TRUE)
    if(nrow(hold) == 0){
      return(hold)
    }

    # Length of the scanned sequence in characters.
    seq_length <- length(strsplit(seq_string, split = "")[[1]])
    hold$length <- seq_length

    # One copy of the sequence per hit, with that hit switched to lower case.
    # `substr<-` replaces in place, so hits touching the first or last
    # character (or spanning the whole sequence) need no special casing and
    # the sequence keeps its length, keeping start/end valid for the next step.
    marked <- rep(seq_string, nrow(hold))
    substr(marked, hold$start, hold$end) <- tolower(substr(marked, hold$start, hold$end))
    hold$sequence <- marked

    # The hit plus one flanking character per side, clamped to the sequence
    # bounds so a hit at either end does not index outside the string.
    hold$Identified_sequence <- substr(marked,
                                       pmax(hold$start - 1L, 1L),
                                       pmin(hold$end + 1L, seq_length))

    setnames(hold, c("V1", "sequence"), c("Number_Hits", "Sequence"))
    return(hold)
  }

  # Row-bind the non-NULL tables of a list; NULL when nothing was collected.
  bind_hits <- function(tables){
    tables <- Filter(Negate(is.null), tables)
    if(length(tables) == 0){
      return(NULL)
    }
    do.call(rbind, tables)
  }

  # Scan every row of a table with "Sequence" and "gene_symbol" columns and
  # label each hit with the gene symbol of the row it came from.
  scan_table <- function(target_tbl, show_progress = FALSE){
    n_rows <- nrow(target_tbl)
    if(n_rows == 0){
      return(NULL)
    }
    if(show_progress){
      pb <- txtProgressBar(min = 0, max = n_rows, style = 3)
      on.exit(close(pb), add = TRUE)
    }
    sequences <- target_tbl$Sequence
    per_row <- vector("list", n_rows)
    for(row in seq_len(n_rows)){
      pred <- scan_sequence(sequences[row])
      if(nrow(pred) > 0){
        pred$gene_symbol <- target_tbl$gene_symbol[row]
        per_row[[row]] <- pred
      }
      if(show_progress){
        setTxtProgressBar(pb, row)
      }
    }
    bind_hits(per_row)
  }

  # Scan `Target` one species at a time.  `species` holds the distinct
  # Scientific_Name values and `label` the value written to the Species column.
  scan_species <- function(species, label, keep_scientific_name){
    if(length(species) == 0){
      return(NULL)
    }
    pb <- txtProgressBar(min = 0, max = length(species), style = 3)
    on.exit(close(pb), add = TRUE)
    per_species <- vector("list", length(species))
    for(k in seq_along(species)){
      SurfDT <- Target[Target$Scientific_Name == species[k],]
      hits <- scan_table(SurfDT)
      if(length(hits) > 0){
        hits$Species <- label[k]
        if(keep_scientific_name){
          hits$Scientific_Name <- species[k]
        }
        per_species[[k]] <- hits
      }
      setTxtProgressBar(pb, k)
    }
    bind_hits(per_species)
  }

  if(type == "single"){
    return(scan_sequence(Target))
  }

  if(type == "multiple"){
    return(scan_table(Target, show_progress = TRUE))
  }

  if(type == "multiple_species"){
    # First row of each species (sorted by scientific name) supplies the
    # common name used as the Species label.
    sor <- Target[order(Target$Scientific_Name),]
    first_rows <- !duplicated(sor$Scientific_Name)
    SciSP <- sor[first_rows,]$Scientific_Name
    Common <- sor[first_rows,]$Common_Name
    return(scan_species(SciSP, Common, keep_scientific_name = TRUE))
  }

  # type == "multiple_species_unknown": no common names are available, so the
  # scientific name doubles as the Species label.
  SciSP <- sort(Target[!duplicated(Target$Scientific_Name),]$Scientific_Name)
  return(scan_species(SciSP, SciSP, keep_scientific_name = FALSE))
}







#' VariantSort
#'
#' Takes a Data.table containing a compilation of mRNA or protein sequences from a variety of species and either
#' returns a data table containing the longest transcript identified for each protein and species or each variant labeled according to the length of the sequence.
#' The data.table requires a column labeled "Scientific_Name", a column labeled "Sequence", and a column labeled "external_gene_name".
#'
#' @param  DT a data table with three columns labeled Sequence, Scientific_Name, external_gene_name
#' @param variant a character statement either "MAX", "ALL", or "MIN".  If "MAX", returns a data table containing the longest sequence for each sequence and species. If "ALL", returns a data table containing each variant labeled according to the length of the sequence. If "MIN", returns a data table containing the shortest sequence for each sequence and species.
#' @return A data table containing the longest transcripts for each protein and species or each variant labeled according to the length of the sequence.
#' @author Brendan Gongol
#' @importFrom data.table data.table
#' @export
#' @examples
#'  setwd("C:/Users/Brendan/Desktop/oxidative stress surfactant bioinformatics")
#' Surfactant_Transcripts <- fread("1 Surfactant Transcripts.xls")
#' SURF_HUM <- Surfactant_Transcripts[Surfactant_Transcripts$Common_Name == "Human",]
#' sor <- VariantSort(SURF_HUM, variant = "MAX")
#' sor[,c(1, 5:11), with = FALSE]
#' sor <- VariantSort(SURF_HUM, variant = "ALL")
#' sor[,c(1, 5:12), with = FALSE]
#' sor <- VariantSort(SURF_HUM, variant = "MIN")
#' sor[,c(1, 5:11), with = FALSE]

VariantSort <- function(DT, variant){
  # Annotate each row of `DT` with the length of its "Sequence" and either
  # keep one row per (Scientific_Name, external_gene_name) group or label
  # every variant.
  #
  # DT:      table with columns "Sequence", "Scientific_Name" and
  #          "external_gene_name".
  # variant: "MAX" - keep the longest sequence of each group;
  #          "MIN" - keep the shortest sequence of each group;
  #          "ALL" - keep every row and add a "Variant" column of the form
  #                  <external_gene_name>-<Length>.
  # Returns the table with an added "Length" column (and "Variant" for "ALL").
  # Stops when `variant` is not one of the three supported values.
  if(!isTRUE(variant %in% c("MAX", "ALL", "MIN"))){
    stop("'variant' must be one of: \"MAX\", \"ALL\", \"MIN\"", call. = FALSE)
  }

  seqs <- DT$Sequence
  # Number of characters per sequence, computed for the whole column at once.
  DT$Length <- lengths(strsplit(seqs, split = ""))

  # The progress display is kept for interactive callers; it is skipped for an
  # empty table because txtProgressBar() requires max > min.
  if(length(seqs) > 0){
    pb <- txtProgressBar(min = 0, max = length(seqs), style = 3)
    on.exit(close(pb), add = TRUE)
    setTxtProgressBar(pb, length(seqs))
  }

  if(variant == "ALL"){
    DT$Variant <- paste(DT$external_gene_name, DT$Length, sep = "-")
    return(DT)
  }

  # "MAX"/"MIN": `.I` yields the row number of the selected sequence within
  # each species/gene group; those rows are then pulled from DT.
  if(variant == "MAX"){
    DT2 <- DT[DT[, .I[which.max(Length)], by=c("Scientific_Name", "external_gene_name")]$V1]
  } else {
    DT2 <- DT[DT[, .I[which.min(Length)], by=c("Scientific_Name", "external_gene_name")]$V1]
  }
  return(DT2)
}



#' ChromLabel
#'
#' Requires a data table with a column named "chromosome_name" housing the chromosome names in a numerical format only.
#' Substitutes the numeric chromosome values with those in the following format: chr1, chr2, chr3, etc., for use in the getSeq() function.
#'
#' @param  DT a data table with a column named "chromosome_name" housing the chromosome names in a numerical format only.
#' @return A data table in which the numeric chromosome values are substituted with those in the following format: chr1, chr2, chr3, etc., for use in the getSeq() function.
#' @author Brendan Gongol
#' @importFrom data.table data.table
#' @export
#' @examples
#' Sequence <- c("AAGCTAAGCTAAGCTGCGCAATTTTTGTATTTTGTTTAAACAGAATCCTCAAGGGAACATCATCCTCAGTTCTTTTTGTGTATTAGCTCAGATTTTCCAGCTGTTTTTAAAGCT",
#'               "CTGTTTCGAGCCTGAATCTCGATCGCTCGCGCTAGACAGCTCGACGCACTTTTCAGCAGGAGCCTG",
#'               "TCAGCAGATAGCGCTCGATACAGCTCGACAGCTCTTGCTGTATTGTGTG",
#'               "TTGCTGTATTGTGTGATCCTCGATACAGGTATTTTCTGAGCCTGATAGCTAGCTTTGCTGTATTGTGTG",
#'               "AAAAAATTTTTTCCCCCCGGGGGG")
#' gene_symbol <- c("AKT", "PI3K", "SREBP", "FOXO", "ABCA1")
#' chromosome_name <- c("1", "5", "10", "X", "Y")
#' chromo <- data.frame(cbind(Sequence, gene_symbol, chromosome_name))
#' chromo$Sequence <- as.character(chromo$Sequence)
#' chromo$chromosome_name <- as.character(chromo$chromosome_name)
#' ChromLabel(chromo)

ChromLabel <- function(DT){
  # Prefix chromosome names with "chr" for use with getSeq().
  #
  # Any value of DT$chromosome_name whose first character is a digit 1-9 or
  # one of X, Y, M gets "chr" prepended; every other value is left untouched.
  # The pattern is anchored at the start of the string, so a single
  # substitution per value is all that is needed.
  labelled <- sub("^([1-9XYM])", "chr\\1", DT$chromosome_name)
  DT$chromosome_name <- labelled
  DT
}



#' ChromSeqConvert
#'
#' Requires sequences returned from the getSeq function and
#' returns the sequences in a string format that can be added to a data.table.
#'
#' @param seqs the sequences returned from the getSeq function.  Each element is converted separately with toString().
#' @return A character vector holding one string per element of seqs (an empty character vector when seqs is empty).
#' @author Brendan Gongol
#' @importFrom Biostrings toString
#' @export
#' @examples
#' \dontrun{
#' seqs <- Biostrings::DNAStringSet(c("ACGT", "TTGCA"))
#' ChromSeqConvert(seqs)
#' }
ChromSeqConvert <- function(seqs){

  # seq_along() yields no iterations for empty input, whereas 1:length(seqs)
  # would count 1, 0 and index past the end of an empty object.
  # vapply() preallocates the result and guarantees one string per element.
  sequences <- vapply(
    seq_along(seqs),
    function(i) toString(seqs[i]),
    character(1)
  )
  return(sequences)
}



#' IUPAC_ScoreR
#'
#' Requires a character vector of IUPAC DNA codes and scores each sequence according to the likelihood of observing it by random chance.
#' Returns one numeric score per sequence that can be added to a data.table.
#'
#' The IUPAC consensus score is calculated as follows: first any positions that can contain any nucleotide (N) are removed.  Then:
#' if stringency = "high", -log2(.25 for every single nucleotide position X .5 for every double nucleotide position X .75 for every triple nucleotide position).
#' if stringency = "medium", -log2(.25 for every single nucleotide position X .5 for every double nucleotide position).  Triple nucleotide positions are removed prior to the calculation.
#' if stringency = "low", -log2(.25 for every single nucleotide position).  Triple and double nucleotide positions are removed prior to the calculation.
#' A sequence with no positions left after the removal scores 0.  A sequence that is NA or contains a character other than
#' the upper case IUPAC DNA codes scores NA and a warning is issued.
#'
#' @param  Conse a character vector containing the DNA coded IUPAC sequences
#' @param stringency a character either "high", "medium", or "low"
#' @return a numeric vector holding the IUPAC consensus score for each IUPAC sequence (length zero when Conse is empty)
#' @author Brendan Gongol
#' @export
#' @examples
#' Conseq <- c("AAAAAAAAAAAAAAAAA", "SYYCNRNSTNGCGTGNSW", "GVTTATTAAKTGGTTATATTGGKTD", "RYSWKMBDHVNATCG")
#' IUPAC_ScoreR(Conseq, stringency = "high")
#' IUPAC_ScoreR(Conseq, stringency = "medium")
#' IUPAC_ScoreR(Conseq, stringency = "low")
IUPAC_ScoreR <- function(Conse, stringency){

  # Fail early with a clear message instead of an unrelated log2() error.
  if(length(stringency) != 1L || !stringency %in% c("high", "medium", "low")){
    stop('stringency must be one of "high", "medium", or "low"', call. = FALSE)
  }

  # Nothing to score; txtProgressBar() also rejects max == min.
  n_seq <- length(Conse)
  if(n_seq == 0L){
    return(numeric(0))
  }

  # Chance of a random base matching each code: one of four bases for a single
  # nucleotide code, two of four for a double code, three of four for a triple.
  code_prob <- c(A = 0.25, T = 0.25, C = 0.25, G = 0.25,
                 R = 0.5, Y = 0.5, S = 0.5, W = 0.5, K = 0.5, M = 0.5,
                 B = 0.75, D = 0.75, H = 0.75, V = 0.75)

  # Codes dropped before scoring; N is dropped at every stringency.
  ignored <- switch(stringency,
                    high = "N",
                    medium = "NBDHV",
                    low = "NBDHVRYSWKM")
  kept <- strsplit(gsub(paste0("[", ignored, "]"), "", Conse), split = "")

  pb <- txtProgressBar(min = 0, max = n_seq, style = 3)
  # Close the progress bar even if scoring stops with an error.
  on.exit(close(pb), add = TRUE)

  chance <- numeric(n_seq)
  for(a in seq_len(n_seq)){
    # prod() of an empty selection is 1, so a fully removed sequence scores 0.
    # An unrecognised character looks up as NA and makes the whole score NA
    # rather than silently reusing the previous position's probability.
    chance[a] <- prod(code_prob[kept[[a]]])
    setTxtProgressBar(pb, a)
  }

  if(anyNA(chance)){
    warning("sequences containing NA or characters other than upper case IUPAC DNA codes were scored as NA",
            call. = FALSE)
  }
  return(-log2(chance))
}



#' SpeciesTFCons
#'
#' Takes a data.table containing the following columns: "Species", "gene_symbol", "Targeting_Factor".
#' Returns the rows of the data.table whose Target names are conserved between the specified species.  The result may contain additional species if
#' they were in the original data.table, but it will only contain those Targets that are conserved between the specified species.
#' It can also do the same thing for Targeting_Factor-Target associations across the specified species.
#'
#' @param DT the data table to query
#' @param Spec a character vector containing the species to query.
#' @param provide either "Target" or "TF_Target".  If "Target", returns the rows whose Targets are conserved.  If "TF_Target", returns only the rows whose Targeting_Factor and Target pairing is conserved between the specified Species.  It also returns an additional column named "mergecol" holding the pasted Targeting_Factor-Target pair.
#' @return The subset of the data.table whose Targets (or Targeting_Factor-Target pairs) are found in every specified species.
#' @author Brendan Gongol
#' @importFrom data.table data.table
#' @export
#' @examples
#' setwd("C:/Users/Brendan/Desktop/oxidative stress surfactant bioinformatics")
#' library(data.table)
#' DT25 <- fread("Raw Transcription factor hits.xls")
#' DT25 <- DT25[,c(1:6, 9:10), with = FALSE]
#' setnames(DT25, c("Consensus_Sequence", "start", "end", "Number_Hits", "TX_Factor", "MotifMap Degenerate consensus sequence", "promoter_name", "Species"),
#'          c("Consensus_Sequence", "start", "end", "Number_Hits", "Targeting_Factor", "MotifMap Degenerate consensus sequence", "gene_symbol", "Species"))
#' SP_MIN <- c("Human", "Mouse", "Rat")
#' SpeciesTFCons(DT = DT25,Spec = SP_MIN, provide = "Target" )
#' SpeciesTFCons(DT = DT25,Spec = SP_MIN, provide = "TF_Target" )

SpeciesTFCons <- function(DT, Spec, provide){

  # Keeps the rows of DT whose Target (provide = "Target") or
  # Targeting_Factor-Target pair (provide = "TF_Target") occurs in every
  # species listed in Spec.  Rows belonging to species outside Spec are kept
  # as long as their Target / pair is conserved across the Spec species.
  #
  # DT      table with columns "Species", "gene_symbol" and "Targeting_Factor".
  # Spec    vector of species names to compare; must not be empty.
  # provide "Target" or "TF_Target"; any other value is an error.
  #
  # Returns the matching subset of DT.  For "TF_Target" the result carries an
  # extra "mergecol" column holding the pasted "Targeting_Factor-gene_symbol".
  if(length(provide) != 1L || !provide %in% c("Target", "TF_Target")){
    stop('provide must be either "Target" or "TF_Target"', call. = FALSE)
  }
  if(length(Spec) == 0L){
    stop("Spec must contain at least one species", call. = FALSE)
  }

  #### Choose the value whose conservation is tested.
  if(provide == "TF_Target"){
    #### Paste the Targeting_Factor name and the Target together and add it to a new column.
    DT$mergecol <- paste(DT$Targeting_Factor, DT$gene_symbol, sep = "-")
    conserved_key <- DT$mergecol
  } else {
    conserved_key <- DT$gene_symbol
  }

  #### Collect the distinct keys seen in each requested species.
  # %in% treats a missing Species as "no match" instead of propagating NA.
  per_species <- lapply(seq_along(Spec), function(i){
    unique(conserved_key[DT$Species %in% Spec[i]])
  })

  #### Keys present in every requested species.
  conserved <- Reduce(intersect, per_species)

  #### Use the conserved keys to subset the original table and return it.
  # The row selector is passed as a single variable so that it is not
  # mistaken for a column expression when DT is a data.table.
  keep_rows <- conserved_key %in% conserved
  return(DT[keep_rows, ])
}




#' TFRankR
#'
#' Requires a data table containing columns labeled "gene_symbol", "Species", and "Targeting_Factor", plus "Number_Hits" for the abundance based rankings and "Score" for the score based rankings.
#' Returns a ranked data table according to the specified options.
#'
#' @param DT the data table to query
#' @param sortBy "species", "Target", "abundance", "score", "species & Target", "species & score & Target", "abundance & Target", "species & abundance & Target", and "species & abundance".  When sorting by "species & Target", ranks greatest number of species first, least number of Targets second.  When sorting by "species & score & Target", ranks greatest number of species first, greatest IUPAC consensus score second, and least number of Targets third.
#' When sorting by "abundance & Target", ranks the greatest abundance of consensus sequences for each promoter first and ranks the greatest number of targets for each Targeting_Factor second.
#' When sorting by "species & abundance & Target", ranks the greatest to least number of Species first, the greatest to least number of consensus sequences second, and the greatest to least number of targets third.
#' @param dec either TRUE or FALSE, indicating whether to sort in decreasing or increasing order respectively.  It sets the direction of the ranking when sorting by "species", "Target", "abundance", or "score".  It must also be supplied for "species & score & Target", "species & abundance", and "abundance & Target", where it only orders intermediate tables.
#' @param SPselect a single character vector used when sorting by "species & Target", "species & score & Target", "species & abundance", "abundance & Target", or "species & abundance & Target" to designate the dominant species to use.
#' @param IUPACgreat A logical statement either TRUE or FALSE, used when sorting by "species & score & Target", indicating whether to use the greatest IUPAC consensus score or the lowest IUPAC score respectively when two or more IUPAC sequences are observed for a particular transcription factor.
#' @return A data table of Targeting_Factor rankings ordered according to sortBy.
#' @author Brendan Gongol
#' @importFrom data.table data.table
#' @export
#' @examples
#' setwd("C:/Users/Brendan/Desktop/oxidative stress surfactant bioinformatics")
#' library(data.table)
#' DT25 <- fread("Raw Transcription factor hits.xls")
#' DT25 <- DT25[,c(1:6, 9:10), with = FALSE]
#' setnames(DT25, c("Consensus_Sequence", "start", "end", "Number_Hits", "TX_Factor", "MotifMap Degenerate consensus sequence", "promoter_name", "Species"),
#'          c("Consensus_Sequence", "start", "end", "Number_Hits", "Targeting_Factor", "MotifMap Degenerate consensus sequence", "gene_symbol", "Species"))
#' IUPAC <- DT25$`MotifMap Degenerate consensus sequence`
#' DT25$Score <- IUPAC_ScoreR(IUPAC, stringency = "medium")
#'
#' TFRankR(DT = DT25, sortBy = "species", dec = TRUE)
#' TFRankR(DT = DT25, sortBy = "species", dec = TRUE)
#' TFRankR(DT = DT25, sortBy = "species", dec = FALSE)
#' TFRankR(DT = DT25, sortBy = "Target", dec = TRUE)
#' TFRankR(DT = DT25, sortBy = "Target", dec = FALSE)
#' TFRankR(DT = DT25, sortBy = "abundance", dec = TRUE)
#' TFRankR(DT = DT25, sortBy = "abundance", dec = FALSE)
#' TFRankR(DT = DT25, sortBy = "score", dec = TRUE)
#' TFRankR(DT = DT25, sortBy = "score", dec = FALSE)
#' TFRankR(DT = DT25, sortBy = "species & Target", dec = FALSE, SPselect = "Human")  # Ranks greatest number of species first, least number of Targets second.
#' TFRankR(DT = DT25, sortBy = "species & score & Target", dec = FALSE, SPselect = "Human", IUPACgreat = TRUE) # Ranks greatest number of species first, the greatest IUPAC consensus score second, and least number of Targets third.
#' TFRankR(DT = DT25, sortBy = "species & score & Target", dec = FALSE, SPselect = "Human", IUPACgreat = FALSE) # Ranks greatest number of species first, the greatest IUPAC consensus score second, and least number of Targets third.
#' TFRankR(DT = DT25, sortBy = "species & abundance", dec = TRUE, SPselect = "Human")
#' TFRankR(DT = DT25, sortBy = "species & abundance", dec = FALSE, SPselect = "Human")

TFRankR <- function(DT, sortBy, dec, SPselect, IUPACgreat){

  # Ranks Targeting_Factor entries of DT.  The function is a chain of
  # independent `if(sortBy == ...)` branches; each branch builds its own
  # intermediate tables and returns its result.  When sortBy matches none of
  # the branches, no return() is reached and the function yields NULL.
  #
  # DT         table to query; the branches read the columns "gene_symbol",
  #            "Species", "Targeting_Factor", and (depending on the branch)
  #            "Number_Hits" or "Score".
  # sortBy     single string selecting the branch.
  # dec        TRUE/FALSE passed as `decreasing` to order() in several branches.
  # SPselect   species used to filter the per-species tables in the combined
  #            ("... & ...") branches.
  # IUPACgreat TRUE/FALSE, read only by "species & score & Target".
  #
  # NOTE(review): the `.()` and `by =` calls inside `[` are data.table syntax,
  # so DT is presumably expected to be a data.table - confirm with callers.

  if(sortBy == "species"){
    #### rank each transcription factor by the species preservation at each Target
    Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)

    # One pass per distinct Target: count the distinct Species each
    # Targeting_Factor is seen in for that Target, then stack the results.
    SP_RANK4 <- NULL
    for(i in 1:length(Prom)){
      PROM <- DT[DT$gene_symbol == Prom[i],]
      PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,

      SP_RANK2 <- PROM2[,.(Number_Species = length(Species)), by= Targeting_Factor]
      # SP_RANK2 <- SP_RANK[order(SP_RANK$Number_Species, decreasing = dec),]
      SP_RANK2$gene_symbol <- Prom[i]
      SP_RANK4 <- rbind(SP_RANK4, SP_RANK2)
      # SP_RANK4 <- SP_RANK4[order(SP_RANK4$Number_Species, decreasing = dec),]
    }
    # Single ordering across all Targets, direction set by dec.
    SP_RANK4 <- SP_RANK4[order(SP_RANK4$Number_Species, decreasing = dec),]
    return(SP_RANK4)
  }


  if(sortBy == "Target"){
    #### Rank transcription factors by the number of Targets they are present for each species.
    SPEC <- sort(DT[!duplicated(DT$Species),]$Species)

    # One pass per distinct Species: count the distinct Targets each
    # Targeting_Factor is seen at within that Species.
    SPEC_RANK3 <- NULL
    for(i in 1:length(SPEC)){
      PROM <- DT[DT$Species == SPEC[i],]
      PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,

      SPEC_RANK2 <- PROM2[,.(Number_targets = length(gene_symbol)), by= Targeting_Factor]
      # SPEC_RANK2 <- SPEC_RANK[order(SPEC_RANK$Number_targets, decreasing = dec),]
      SPEC_RANK2$Species <- SPEC[i]
      SPEC_RANK3 <- rbind(SPEC_RANK3, SPEC_RANK2)
    }
    # Single ordering across all Species, direction set by dec.
    SPEC_RANK3 <- SPEC_RANK3[order(SPEC_RANK3$Number_targets, decreasing = dec),]
    return(SPEC_RANK3)
  }


  if(sortBy == "abundance"){
    #### Rank transcription factors for each Target by the number of consensus sequences present at each Target for each Species.
    SPEC <- sort(DT[!duplicated(DT$Species),]$Species)
    Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)

    # Outer loop over Species, inner loop over Targets; each Species/Target
    # slice is reduced to its distinct (Number_Hits, Targeting_Factor,
    # gene_symbol) rows and ordered by Number_Hits.
    HIT_RANK_TOT <- NULL
    for(a in 1:length(SPEC)){
      TX_TOT_Select <- DT[DT$Species == SPEC[a],]

      HIT_RANK3 <- NULL
      for(i in 1:length(Prom)){
        PROM <- TX_TOT_Select[TX_TOT_Select$gene_symbol == Prom[i],]
        PROM2 <- unique(PROM[,.(Number_Hits,Targeting_Factor, gene_symbol)]) #, Species

        HIT_RANK2 <- PROM2[order(PROM2$Number_Hits, decreasing = dec),]
        HIT_RANK3 <- rbind(HIT_RANK3, HIT_RANK2)

      }
      HIT_RANK3$Species <- SPEC[a]

      HIT_RANK_TOT <- rbind(HIT_RANK_TOT, HIT_RANK3)
    }
    # Final ordering across every Species and Target at once.
    HIT_RANK_TOT <- HIT_RANK_TOT[order(HIT_RANK_TOT$Number_Hits, decreasing = dec),]
    return(HIT_RANK_TOT)
  }



  if(sortBy == "score"){
    #### Rank transcription factors for each Target by the IUPAC score at each Target for each Species.
    SPEC <- sort(DT[!duplicated(DT$Species),]$Species)
    Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)

    # Same Species/Target double loop as the "abundance" branch, but on the
    # Score column.
    SCO_RANK_TOT <- NULL
    for(a in 1:length(SPEC)){
      TX_TOT_Select <- DT[DT$Species == SPEC[a],]

      SCO_RANK3 <- NULL
      for(i in 1:length(Prom)){
        PROM <- TX_TOT_Select[TX_TOT_Select$gene_symbol == Prom[i],]
        PROM2 <- unique(PROM[,.(Score, Targeting_Factor, gene_symbol)]) #, Species

        SCO_RANK2 <- PROM2[order(PROM2$Score, decreasing = dec),]
        SCO_RANK3 <- rbind(SCO_RANK3, SCO_RANK2)

      }
      SCO_RANK3$Species <- SPEC[a]

      SCO_RANK_TOT <- rbind(SCO_RANK_TOT, SCO_RANK3)
    }
    # Unlike the "abundance" branch there is no overall re-ordering here: the
    # rows stay grouped by Species and Target, ordered by Score within each
    # group only.
    return(SCO_RANK_TOT)
  }


  if(sortBy == "species & Target"){
    #### Rank transcription factors by the number of species they are present in for each Target and the number of Targets they are present in for a selected species (in my case Human)
    #### Rank transcription factors by the number of Targets they are present for each species.
    SPEC <- sort(DT[!duplicated(DT$Species),]$Species)

    # The intermediate orderings in this branch use decreasing = TRUE; dec is
    # not read here.
    SPEC_RANK3 <- NULL
    for(i in 1:length(SPEC)){
      PROM <- DT[DT$Species == SPEC[i],]
      PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,

      SPEC_RANK <- PROM2[,.(Number_targets = length(gene_symbol)), by= Targeting_Factor]
      SPEC_RANK2 <- SPEC_RANK[order(SPEC_RANK$Number_targets, decreasing = TRUE),]
      SPEC_RANK2$Species <- SPEC[i]
      SPEC_RANK3 <- rbind(SPEC_RANK3, SPEC_RANK2)
    }
    # Bare expression: the value is discarded, nothing is printed or returned.
    SPEC_RANK3

    #### rank each transcription factor by the species preservation at each Target
    Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)

    SP_RANK4 <- NULL
    for(i in 1:length(Prom)){
      PROM <- DT[DT$gene_symbol == Prom[i],]
      PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,

      SP_RANK <- PROM2[,.(Number_Species = length(Species)), by= Targeting_Factor]
      SP_RANK2 <- SP_RANK[order(SP_RANK$Number_Species, decreasing = TRUE),]
      SP_RANK2$gene_symbol <- Prom[i]
      SP_RANK4 <- rbind(SP_RANK4, SP_RANK2)
    }
    # Bare expression: the value is discarded.
    SP_RANK4


    # Select species to merge by
    SPEC_HUM <- SPEC_RANK3[SPEC_RANK3$Species == SPselect,]

    # Merge the Species ranking and the number of Targets together.
    # Joined on Targeting_Factor only, so the Number_targets value of the
    # selected species is attached to every Target row of that factor.
    SPECIES_PROMNUM <- merge(SP_RANK4, SPEC_HUM, by = "Targeting_Factor")

    # Perform the ranking for each Target
    # Targets are processed in order of first appearance (not sorted here).
    TX <- SPECIES_PROMNUM[!duplicated(SPECIES_PROMNUM$gene_symbol),]$gene_symbol

    SP_SPEC_RANK <- NULL
    for(i in 1:length(TX)){
      DT1 <-  SPECIES_PROMNUM[SPECIES_PROMNUM$gene_symbol == TX[i],]
      DT2 <- DT1[order(-DT1[,Number_Species], DT1[,Number_targets]),] # Ranking from the greatest number of Species and the least number of Targets
      SP_SPEC_RANK <- rbind(SP_SPEC_RANK, DT2)
    }
    return(SP_SPEC_RANK)

  }


  if(sortBy == "species & score & Target"){
    #### Rank transcription factors by the number of species they are present in for each Target and the number of Targets they are present in for a selected species, and the IUPAC score (in my case Human)
    #### Rank transcription factors by the number of Targets they are present for each species.
    # Species are taken in order of first appearance (no sort() in this branch).
    SPEC <- DT[!duplicated(DT$Species),]$Species

    SPEC_RANK3 <- NULL
    for(i in 1:length(SPEC)){
      PROM <- DT[DT$Species == SPEC[i],]
      PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,

      SPEC_RANK <- PROM2[,.(Number_targets = length(gene_symbol)), by= Targeting_Factor]
      SPEC_RANK2 <- SPEC_RANK[order(SPEC_RANK$Number_targets, decreasing = dec),]
      SPEC_RANK2$Species <- SPEC[i]
      SPEC_RANK3 <- rbind(SPEC_RANK3, SPEC_RANK2)
    }
    # Bare expression: the value is discarded.
    SPEC_RANK3

    #### rank each transcription factor by the species preservation at each Target
    Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)

    SP_RANK4 <- NULL
    for(i in 1:length(Prom)){
      PROM <- DT[DT$gene_symbol == Prom[i],]
      PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,

      SP_RANK <- PROM2[,.(Number_Species = length(Species)), by= Targeting_Factor]
      SP_RANK2 <- SP_RANK[order(SP_RANK$Number_Species, decreasing = dec),]
      SP_RANK2$gene_symbol <- Prom[i]
      SP_RANK4 <- rbind(SP_RANK4, SP_RANK2)
    }
    # Bare expression: the value is discarded.
    SP_RANK4


    #### Rank transcription factors for each Target by the IUPAC score at each Target for each Species.
    # SPEC and Prom are recomputed here, both in order of first appearance.
    SPEC <- DT[!duplicated(DT$Species),]$Species
    Prom <- DT[!duplicated(DT$gene_symbol),]$gene_symbol

    SCO_RANK_TOT <- NULL
    for(a in 1:length(SPEC)){
      TX_TOT_Select <- DT[DT$Species == SPEC[a],]

      SCO_RANK3 <- NULL
      for(i in 1:length(Prom)){
        PROM <- TX_TOT_Select[TX_TOT_Select$gene_symbol == Prom[i],]
        PROM2 <- unique(PROM[,.(Score,Targeting_Factor, gene_symbol)]) #, Species

        SCO_RANK2 <- PROM2[order(PROM2$Score, decreasing = dec),]
        SCO_RANK3 <- rbind(SCO_RANK3, SCO_RANK2)

      }
      SCO_RANK3$Species <- SPEC[a]

      SCO_RANK_TOT <- rbind(SCO_RANK_TOT, SCO_RANK3)
    }
    # Bare expression: the value is discarded.
    SCO_RANK_TOT


    # Select species to merge by
    SPEC_HUM <- SPEC_RANK3[SPEC_RANK3$Species == SPselect,]

    SCO_HUM <- SCO_RANK_TOT[SCO_RANK_TOT$Species == SPselect,]

    # Remove duplicated entries for the transcription factors and select either the consensus sequence with the maximum or minimum score
    # SCO_RANK_TOT2 is assigned only inside the two `if` blocks below; if
    # IUPACgreat is neither TRUE nor FALSE it is never created in this branch.
    if(IUPACgreat == TRUE){
      SCO_RANK_TOT2 <- as.data.table(SCO_HUM)
      SCO_RANK_TOT2 <- SCO_RANK_TOT2[SCO_RANK_TOT2[, .I[which.max(Score)], by= Targeting_Factor]$V1] #### Only returns one transcription factor consensus sequence for each transcription factor.
      # gene_symbol and Species are dropped so the later merge contributes only Score.
      SCO_RANK_TOT2$gene_symbol <- NULL
      SCO_RANK_TOT2$Species <- NULL
    }
    if(IUPACgreat == FALSE){
      SCO_RANK_TOT2 <- as.data.table(SCO_HUM)
      SCO_RANK_TOT2 <- SCO_RANK_TOT2[SCO_RANK_TOT2[, .I[which.min(Score)], by= Targeting_Factor]$V1] #### Only returns one transcription factor consensus sequence for each transcription factor.
      SCO_RANK_TOT2$gene_symbol <- NULL
      SCO_RANK_TOT2$Species <- NULL
    }

    # Merge the Species ranking and the number of Targets together.
    SPECIES_PROMNUM <- merge(SP_RANK4, SPEC_HUM, by = "Targeting_Factor")

    # Merge the score ranking Species and Target ranking.
    SCO_SPECIES_PRONUM <- merge(SPECIES_PROMNUM, SCO_RANK_TOT2, by = "Targeting_Factor")


    #### Perform the final ranking
    TX <- sort(SCO_SPECIES_PRONUM[!duplicated(SCO_SPECIES_PRONUM$gene_symbol),]$gene_symbol)

    SP_SPEC_IU_RANK <- NULL
    for(i in 1:length(TX)){
      DT1 <-  SCO_SPECIES_PRONUM[SCO_SPECIES_PRONUM$gene_symbol == TX[i],]
      DT2 <- DT1[order(-DT1[,Number_Species], -DT1[,Score], DT1[,Number_targets]),] # Ranking from the greatest number of Species, the consensus sequence score, and the least number of Targets.
      SP_SPEC_IU_RANK <- rbind(SP_SPEC_IU_RANK, DT2)
    }
    return(SP_SPEC_IU_RANK)
  }



  if(sortBy == "species & abundance"){

    #### rank each transcription factor by the species preservation at each Target
    Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)

    SP_RANK4 <- NULL
    for(i in 1:length(Prom)){
      PROM <- DT[DT$gene_symbol == Prom[i],]
      PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,

      SP_RANK <- PROM2[,.(Number_Species = length(Species)), by= Targeting_Factor]
      SP_RANK2 <- SP_RANK[order(SP_RANK$Number_Species, decreasing = dec),]
      SP_RANK2$gene_symbol <- Prom[i]
      SP_RANK4 <- rbind(SP_RANK4, SP_RANK2)
    }


    #### Rank transcription factors for each Target by the number of consensus sequences present at each Target for each Species.
    SPEC <- sort(DT[!duplicated(DT$Species),]$Species)
    Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)

    HIT_RANK_TOT <- NULL
    for(a in 1:length(SPEC)){
      TX_TOT_Select <- DT[DT$Species == SPEC[a],]

      HIT_RANK3 <- NULL
      for(i in 1:length(Prom)){
        PROM <- TX_TOT_Select[TX_TOT_Select$gene_symbol == Prom[i],]
        PROM2 <- unique(PROM[,.(Number_Hits,Targeting_Factor, gene_symbol)]) #, Species

        HIT_RANK2 <- PROM2[order(PROM2$Number_Hits, decreasing = dec),]
        HIT_RANK3 <- rbind(HIT_RANK3, HIT_RANK2)

      }
      HIT_RANK3$Species <- SPEC[a]

      HIT_RANK_TOT <- rbind(HIT_RANK_TOT, HIT_RANK3)
    }


    # Select species to merge by
    SPEC_HUM <- HIT_RANK_TOT[HIT_RANK_TOT$Species == SPselect,]

    # Merge the Species ranking and the number of hits together.
    # Joined on both Targeting_Factor and gene_symbol, so each row pairs the
    # species count with the hit count of the same factor at the same Target.
    SPECIES_PROMNUM <- merge(SP_RANK4, SPEC_HUM, by = c("Targeting_Factor", "gene_symbol"))
    # SPECIES_PROMNUM <- unique(SPECIES_PROMNUM[,.(Targeting_Factor, gene_symbol, Number_Species, Number_Hits)])


    #### Perform the final ranking
    TX <- sort(SPECIES_PROMNUM[!duplicated(SPECIES_PROMNUM$gene_symbol),]$gene_symbol)

    SP_SPEC_IU_RANK <- NULL
    for(i in 1:length(TX)){
      DT1 <-  SPECIES_PROMNUM[SPECIES_PROMNUM$gene_symbol == TX[i],]
      DT2 <- DT1[order(-DT1[,Number_Species], -DT1[,Number_Hits]),] # Ranking from the greatest to least number of Species, then the greatest to least number of consensus sequences.
      SP_SPEC_IU_RANK <- rbind(SP_SPEC_IU_RANK, DT2)
    }
    # Bare expression: the value is discarded; the return() below does the work.
    SP_SPEC_IU_RANK
    return(SP_SPEC_IU_RANK)

  }

  if(sortBy == "abundance & Target"){
    #### Rank transcription factors for each Target by the number of consensus sequences present at each Target for each Species.
    SPEC <- sort(DT[!duplicated(DT$Species),]$Species)
    Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)

    HIT_RANK_TOT <- NULL
    for(a in 1:length(SPEC)){
      TX_TOT_Select <- DT[DT$Species == SPEC[a],]
      HIT_RANK3 <- NULL
      for(i in 1:length(Prom)){
        PROM <- TX_TOT_Select[TX_TOT_Select$gene_symbol == Prom[i],]
        PROM2 <- unique(PROM[,.(Number_Hits,Targeting_Factor, gene_symbol)]) #, Species
        HIT_RANK2 <- PROM2[order(PROM2$Number_Hits, decreasing = dec),]
        HIT_RANK3 <- rbind(HIT_RANK3, HIT_RANK2)
      }
      HIT_RANK3$Species <- SPEC[a]
      HIT_RANK_TOT <- rbind(HIT_RANK_TOT, HIT_RANK3)
    }
    # Bare expression: the value is discarded.
    HIT_RANK_TOT


    #### Rank transcription factors by the number of Targets they are present for each species.
    SPEC <- sort(DT[!duplicated(DT$Species),]$Species)

    SPEC_RANK3 <- NULL
    for(i in 1:length(SPEC)){
      PROM <- DT[DT$Species == SPEC[i],]
      PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,
      SPEC_RANK <- PROM2[,.(Number_targets = length(gene_symbol)), by= Targeting_Factor]
      SPEC_RANK2 <- SPEC_RANK[order(SPEC_RANK$Number_targets, decreasing = dec),]
      SPEC_RANK2$Species <- SPEC[i]
      SPEC_RANK3 <- rbind(SPEC_RANK3, SPEC_RANK2)
    }
    # Bare expression: the value is discarded.
    SPEC_RANK3

    # Select species to merge by
    HIT_HUM <- HIT_RANK_TOT[HIT_RANK_TOT$Species == SPselect,]
    SPEC_HUM <- SPEC_RANK3[SPEC_RANK3$Species == SPselect,]
    # Species is dropped from one side so the merged table has a single Species column.
    SPEC_HUM$Species <- NULL

    # Merge the hit ranking and the number of Targets together.
    SPECIES_PROMNUM <- merge(SPEC_HUM, HIT_HUM, by = c("Targeting_Factor"))
    # SPECIES_PROMNUM <- unique(SPECIES_PROMNUM[,.(Targeting_Factor, gene_symbol, Number_Hits, Number_targets, Species)])

    #### Perform the final ranking
    TX <- sort(SPECIES_PROMNUM[!duplicated(SPECIES_PROMNUM$gene_symbol),]$gene_symbol)
    SP_SPEC_IU_RANK <- NULL
    for(i in 1:length(TX)){
      DT1 <-  SPECIES_PROMNUM[SPECIES_PROMNUM$gene_symbol == TX[i],]
      DT2 <- DT1[order(-DT1[,Number_Hits], -DT1[,Number_targets]),] # Ranking from the greatest to least number of consensus sequences and the greatest to least number of targets.
      SP_SPEC_IU_RANK <- rbind(SP_SPEC_IU_RANK, DT2)
    }
    return(SP_SPEC_IU_RANK)
  }

  if(sortBy == "species & abundance & Target"){
    #### Rank targeting factors for each Target by the number of consensus sequences present at each Target for each Species.
    SPEC <- sort(DT[!duplicated(DT$Species),]$Species)
    Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)

    # The intermediate orderings in this branch use decreasing = TRUE; dec is
    # not read here.
    HIT_RANK_TOT <- NULL
    for(a in 1:length(SPEC)){
      TX_TOT_Select <- DT[DT$Species == SPEC[a],]
      HIT_RANK3 <- NULL
      for(i in 1:length(Prom)){
        PROM <- TX_TOT_Select[TX_TOT_Select$gene_symbol == Prom[i],]
        PROM2 <- unique(PROM[,.(Number_Hits,Targeting_Factor, gene_symbol)]) #, Species
        HIT_RANK2 <- PROM2[order(PROM2$Number_Hits, decreasing = TRUE),]
        HIT_RANK3 <- rbind(HIT_RANK3, HIT_RANK2)
      }
      HIT_RANK3$Species <- SPEC[a]
      HIT_RANK_TOT <- rbind(HIT_RANK_TOT, HIT_RANK3)
    }

    #### Rank Targeting factors by the number of Targets they are present for each species.
    SPEC <- sort(DT[!duplicated(DT$Species),]$Species)

    SPEC_RANK3 <- NULL
    for(i in 1:length(SPEC)){
      PROM <- DT[DT$Species == SPEC[i],]
      PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,
      SPEC_RANK <- PROM2[,.(Number_targets = length(gene_symbol)), by= Targeting_Factor]
      SPEC_RANK2 <- SPEC_RANK[order(SPEC_RANK$Number_targets, decreasing = TRUE),]
      SPEC_RANK2$Species <- SPEC[i]
      SPEC_RANK3 <- rbind(SPEC_RANK3, SPEC_RANK2)
    }

    #### rank each Targeting factor by the species preservation at each Target
    Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)

    SP_RANK4 <- NULL
    for(i in 1:length(Prom)){
      PROM <- DT[DT$gene_symbol == Prom[i],]
      PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,

      SP_RANK <- PROM2[,.(Number_Species = length(Species)), by= Targeting_Factor]
      SP_RANK2 <- SP_RANK[order(SP_RANK$Number_Species, decreasing = TRUE),]
      SP_RANK2$gene_symbol <- Prom[i]
      SP_RANK4 <- rbind(SP_RANK4, SP_RANK2)
    }

    # Select species to merge by
    HIT_HUM <- HIT_RANK_TOT[HIT_RANK_TOT$Species == SPselect,]
    SPEC_HUM <- SPEC_RANK3[SPEC_RANK3$Species == SPselect,]
    # Species is dropped from one side so the merged table has a single Species column.
    SPEC_HUM$Species <- NULL

    # Merge the hit ranking and the number of Targets together.
    SPECIES_PROMNUM <- merge(SPEC_HUM, HIT_HUM, by = c("Targeting_Factor"))
    # SPECIES_PROMNUM <- unique(SPECIES_PROMNUM[,.(Targeting_Factor, gene_symbol, Number_Hits, Number_targets, Species)])

    # Merge the species ranking with the merged data.table
    SPECIES_PROMNUM <- merge(SPECIES_PROMNUM, SP_RANK4, by = c("gene_symbol", "Targeting_Factor"))
    # Fix the column order of the result and drop exact duplicate rows.
    SPECIES_PROMNUM <- unique(SPECIES_PROMNUM[,.(Targeting_Factor, gene_symbol, Number_Species, Number_Hits, Number_targets, Species)])


    #### Perform the final ranking
    TX <- sort(SPECIES_PROMNUM[!duplicated(SPECIES_PROMNUM$gene_symbol),]$gene_symbol)
    SP_SPEC_IU_RANK <- NULL
    for(i in 1:length(TX)){
      DT1 <-  SPECIES_PROMNUM[SPECIES_PROMNUM$gene_symbol == TX[i],]
      DT2 <- DT1[order(-DT1[,Number_Species], -DT1[,Number_Hits], -DT1[,Number_targets]),] # Ranking from the greatest to least number of Species, the greatest to least number of consensus sequences, and the greatest to least number of targets.
      SP_SPEC_IU_RANK <- rbind(SP_SPEC_IU_RANK, DT2)
    }
    return(SP_SPEC_IU_RANK)
  }


}



#' GenomeInstaller
#'
#' Requires a character vector of specified genomes to install and installs them.
#' Packages are installed with \code{BiocManager::install()} when the BiocManager
#' package is available; otherwise the legacy \code{biocLite()} installer is
#' sourced over HTTPS (this fallback only works on old R / Bioconductor releases).
#' No installer is contacted when none of the requested genomes is supported.
#' Installs the following genomes when specified
#' "BSgenome.Hsapiens.UCSC.hg38"
#' "BSgenome.Mmusculus.UCSC.mm9"
#' "BSgenome.Mmusculus.UCSC.mm10"
#' "BSgenome.Rnorvegicus.UCSC.rn6"
#' "BSgenome.Alyrata.JGI.v1"
#' "BSgenome.Amellifera.BeeBase.assembly4"
#' "BSgenome.Athaliana.TAIR.TAIR9"
#' "BSgenome.Btaurus.UCSC.bosTau8"
#' "BSgenome.Celegans.UCSC.ce2"
#' "BSgenome.Celegans.UCSC.ce11"
#' "BSgenome.Cfamiliaris.UCSC.canFam3"
#' "BSgenome.Dmelanogaster.UCSC.dm3"
#' "BSgenome.Dmelanogaster.UCSC.dm6"
#' "BSgenome.Drerio.UCSC.danRer10"
#' "BSgenome.Gaculeatus.UCSC.gasAcu1"
#' "BSgenome.Ggallus.UCSC.galGal4"
#' "BSgenome.Mfascicularis.NCBI.5.0"
#' "BSgenome.Mfuro.UCSC.musFur1"
#' "BSgenome.Mmulatta.UCSC.rheMac3"
#' "BSgenome.Osativa.MSU.MSU7"
#' "BSgenome.Ptroglodytes.UCSC.panTro3"
#' "BSgenome.Scerevisiae.UCSC.sacCer3"
#' "BSgenome.Sscrofa.UCSC.susScr3"
#' "BSgenome.Tguttata.UCSC.taeGut2"
#' @param genomes A character vector indicating which genomes to install of the following options:
#' "Hsapiens.UCSC.hg38"
#' "Mmusculus.UCSC.mm9"
#' "Mmusculus.UCSC.mm10"
#' "Rnorvegicus.UCSC.rn6"
#' "Alyrata.JGI.v1"
#' "Amellifera.BeeBase.assembly4"
#' "Athaliana.TAIR.TAIR9"
#' "Btaurus.UCSC.bosTau8"
#' "Celegans.UCSC.ce2"
#' "Celegans.UCSC.ce11"
#' "Cfamiliaris.UCSC.canFam3"
#' "Dmelanogaster.UCSC.dm3"
#' "Dmelanogaster.UCSC.dm6"
#' "Drerio.UCSC.danRer10"
#' "Gaculeatus.UCSC.gasAcu1"
#' "Ggallus.UCSC.galGal4"
#' "Mfascicularis.NCBI.5.0"
#' "Mfuro.UCSC.musFur1"
#' "Mmulatta.UCSC.rheMac3"
#' "Osativa.MSU.MSU7"
#' "Ptroglodytes.UCSC.panTro3"
#' "Scerevisiae.UCSC.sacCer3"
#' "Sscrofa.UCSC.susScr3"
#' "Tguttata.UCSC.taeGut2"
#' Entries that are not in this list (including NA) are skipped and the message
#' "Genome not available with this function" is printed for each of them.
#' @return NULL, invisibly.  The function is called for its side effect of
#' installing the specified genome packages.
#' @author Brendan Gongol
#' @export
#' @examples
#' \dontrun{
#' GenomeInstaller(c("Hsapiens.UCSC.hg38","Mmusculus.UCSC.mm9"))
#' GenomeInstaller("Apple")
#'
#' # The following code installs all available genomes with this function and
#' # attempts to install one genome not available ("APPLE"):
#' ToInstall <- c("Hsapiens.UCSC.hg38","Mmusculus.UCSC.mm9"
#' ,"Mmusculus.UCSC.mm10","Rnorvegicus.UCSC.rn6"
#' ,"Alyrata.JGI.v1","Amellifera.BeeBase.assembly4"
#' ,"Athaliana.TAIR.TAIR9","Btaurus.UCSC.bosTau8"
#' ,"Celegans.UCSC.ce2","Celegans.UCSC.ce11"
#' ,"Cfamiliaris.UCSC.canFam3","Dmelanogaster.UCSC.dm3"
#' ,"Dmelanogaster.UCSC.dm6"
#' ,"Drerio.UCSC.danRer10","Gaculeatus.UCSC.gasAcu1"
#' ,"Ggallus.UCSC.galGal4","Mfascicularis.NCBI.5.0"
#' ,"Mfuro.UCSC.musFur1","Mmulatta.UCSC.rheMac3"
#' ,"Osativa.MSU.MSU7","Ptroglodytes.UCSC.panTro3"
#' ,"Scerevisiae.UCSC.sacCer3","Sscrofa.UCSC.susScr3"
#' ,"Tguttata.UCSC.taeGut2", "APPLE")
#' GenomeInstaller(ToInstall)
#' }

GenomeInstaller <- function(genomes){
  # Supported genomes, named without the "BSgenome." package prefix.
  # "Dmelanogaster.UCSC.dm6" is included so that every genome handled by
  # GenomeLoader() can also be installed from here.
  available <- c(
    "Hsapiens.UCSC.hg38", "Mmusculus.UCSC.mm9", "Mmusculus.UCSC.mm10",
    "Rnorvegicus.UCSC.rn6", "Alyrata.JGI.v1", "Amellifera.BeeBase.assembly4",
    "Athaliana.TAIR.TAIR9", "Btaurus.UCSC.bosTau8", "Celegans.UCSC.ce2",
    "Celegans.UCSC.ce11", "Cfamiliaris.UCSC.canFam3", "Dmelanogaster.UCSC.dm3",
    "Dmelanogaster.UCSC.dm6", "Drerio.UCSC.danRer10", "Gaculeatus.UCSC.gasAcu1",
    "Ggallus.UCSC.galGal4", "Mfascicularis.NCBI.5.0", "Mfuro.UCSC.musFur1",
    "Mmulatta.UCSC.rheMac3", "Osativa.MSU.MSU7", "Ptroglodytes.UCSC.panTro3",
    "Scerevisiae.UCSC.sacCer3", "Sscrofa.UCSC.susScr3", "Tguttata.UCSC.taeGut2"
  )

  # Picks the installer to use.  BiocManager is preferred because biocLite()
  # has been retired by Bioconductor; the legacy script is kept only as a
  # fallback for old installations and is fetched over HTTPS, not plain HTTP.
  resolve_installer <- function(){
    if(requireNamespace("BiocManager", quietly = TRUE)){
      return(function(pkg) BiocManager::install(pkg))
    }
    source("https://bioconductor.org/biocLite.R")
    legacy <- get("biocLite", mode = "function")
    function(pkg) legacy(pkg)
  }

  # Resolved on first use so nothing is downloaded for unsupported requests.
  install_pkg <- NULL

  genomes <- as.character(genomes)
  # seq_along() keeps the loop from running on empty input (1:0 would not).
  for(i in seq_along(genomes)){
    if(!is.na(genomes[i]) && genomes[i] %in% available){
      if(is.null(install_pkg)){
        install_pkg <- resolve_installer()
      }
      install_pkg(paste0("BSgenome.", genomes[i]))
    }
    else{
      print("Genome not available with this function")
    }
  }
  invisible(NULL)
}



#' GenomeLoader
#'
#' Requires a character vector of specified genomes to load and loads them.
#'
#' Loads the following genomes when specified
#' BSgenome.Hsapiens.UCSC.hg38
#' BSgenome.Mmusculus.UCSC.mm9
#' BSgenome.Mmusculus.UCSC.mm10
#' BSgenome.Rnorvegicus.UCSC.rn6
#' BSgenome.Alyrata.JGI.v1
#' BSgenome.Amellifera.BeeBase.assembly4
#' BSgenome.Athaliana.TAIR.TAIR9
#' BSgenome.Btaurus.UCSC.bosTau8
#' BSgenome.Celegans.UCSC.ce2
#' BSgenome.Celegans.UCSC.ce11
#' BSgenome.Cfamiliaris.UCSC.canFam3
#' BSgenome.Dmelanogaster.UCSC.dm3
#' BSgenome.Dmelanogaster.UCSC.dm6
#' BSgenome.Drerio.UCSC.danRer10
#' BSgenome.Gaculeatus.UCSC.gasAcu1
#' BSgenome.Ggallus.UCSC.galGal4
#' BSgenome.Mfascicularis.NCBI.5.0
#' BSgenome.Mfuro.UCSC.musFur1
#' BSgenome.Mmulatta.UCSC.rheMac3
#' BSgenome.Osativa.MSU.MSU7
#' BSgenome.Ptroglodytes.UCSC.panTro3
#' BSgenome.Scerevisiae.UCSC.sacCer3
#' BSgenome.Sscrofa.UCSC.susScr3
#' BSgenome.Tguttata.UCSC.taeGut2
#' @param genomes A character vector indicating which genomes to load of the following options:
#' "Hsapiens.UCSC.hg38"
#' "Mmusculus.UCSC.mm9"
#' "Mmusculus.UCSC.mm10"
#' "Rnorvegicus.UCSC.rn6"
#' "Alyrata.JGI.v1"
#' "Amellifera.BeeBase.assembly4"
#' "Athaliana.TAIR.TAIR9"
#' "Btaurus.UCSC.bosTau8"
#' "Celegans.UCSC.ce2"
#' "Celegans.UCSC.ce11"
#' "Cfamiliaris.UCSC.canFam3"
#' "Dmelanogaster.UCSC.dm3"
#' "Dmelanogaster.UCSC.dm6"
#' "Drerio.UCSC.danRer10"
#' "Gaculeatus.UCSC.gasAcu1"
#' "Ggallus.UCSC.galGal4"
#' "Mfascicularis.NCBI.5.0"
#' "Mfuro.UCSC.musFur1"
#' "Mmulatta.UCSC.rheMac3"
#' "Osativa.MSU.MSU7"
#' "Ptroglodytes.UCSC.panTro3"
#' "Scerevisiae.UCSC.sacCer3"
#' "Sscrofa.UCSC.susScr3"
#' "Tguttata.UCSC.taeGut2"
#' @return The specified loaded genomes
#' @author Brendan Gongol
#' @export
#' @examples
#' GenomeLoader(c("Hsapiens.UCSC.hg38","Mmusculus.UCSC.mm9"))
#' GenomeLoader("Apple")
#'
#' # The following code loads the genomes listed below and attempts to load one genome not available ("APPLE"):
#' ToLoad <- c("Hsapiens.UCSC.hg38","Mmusculus.UCSC.mm9"
#' ,"Mmusculus.UCSC.mm10","Rnorvegicus.UCSC.rn6"
#' ,"Alyrata.JGI.v1","Amellifera.BeeBase.assembly4"
#' ,"Athaliana.TAIR.TAIR9","Btaurus.UCSC.bosTau8"
#' ,"Celegans.UCSC.ce2","Celegans.UCSC.ce11"
#' ,"Cfamiliaris.UCSC.canFam3","Dmelanogaster.UCSC.dm3"
#' ,"Drerio.UCSC.danRer10","Gaculeatus.UCSC.gasAcu1"
#' ,"Ggallus.UCSC.galGal4","Mfascicularis.NCBI.5.0"
#' ,"Mfuro.UCSC.musFur1","Mmulatta.UCSC.rheMac3"
#' ,"Osativa.MSU.MSU7","Ptroglodytes.UCSC.panTro3"
#' ,"Scerevisiae.UCSC.sacCer3","Sscrofa.UCSC.susScr3"
#' ,"Tguttata.UCSC.taeGut2", "APPLE")
#' GenomeLoader(ToLoad)

GenomeLoader <- function(genomes){
  # Attaches the BSgenome data package for every supported entry of `genomes`.
  # Entries that are not supported (including NA) are skipped and the message
  # "Genome not available with this function" is printed for each of them.
  # Returns NULL invisibly; the function is called for its side effect.

  # Supported genomes, named without the "BSgenome." package prefix.
  available <- c(
    "Hsapiens.UCSC.hg38", "Mmusculus.UCSC.mm9", "Mmusculus.UCSC.mm10",
    "Rnorvegicus.UCSC.rn6", "Alyrata.JGI.v1", "Amellifera.BeeBase.assembly4",
    "Athaliana.TAIR.TAIR9", "Btaurus.UCSC.bosTau8", "Celegans.UCSC.ce2",
    "Celegans.UCSC.ce11", "Cfamiliaris.UCSC.canFam3", "Dmelanogaster.UCSC.dm3",
    "Dmelanogaster.UCSC.dm6", "Drerio.UCSC.danRer10", "Gaculeatus.UCSC.gasAcu1",
    "Ggallus.UCSC.galGal4", "Mfascicularis.NCBI.5.0", "Mfuro.UCSC.musFur1",
    "Mmulatta.UCSC.rheMac3", "Osativa.MSU.MSU7", "Ptroglodytes.UCSC.panTro3",
    "Scerevisiae.UCSC.sacCer3", "Sscrofa.UCSC.susScr3", "Tguttata.UCSC.taeGut2"
  )

  genomes <- as.character(genomes)
  # BSgenome is attached just before the first supported genome, so a request
  # that contains no supported genome does not touch the search path.
  bsgenome_attached <- FALSE

  # seq_along() keeps the loop from running on empty input (1:0 would not).
  for(i in seq_along(genomes)){
    if(!is.na(genomes[i]) && genomes[i] %in% available){
      if(!bsgenome_attached){
        library("BSgenome", character.only = TRUE)
        bsgenome_attached <- TRUE
      }
      # character.only = TRUE lets the package name be built from the input
      # instead of spelling out one library() call per genome.
      library(paste0("BSgenome.", genomes[i]), character.only = TRUE)
    }
    else{
      print("Genome not available with this function")
    }
  }
  invisible(NULL)
}



#' ChromosomeSeqCompileR
#'
#' Requires a data table with three columns labeled: "transcription_start_site", "chromosome_name", "Scientific_Name".
#' Requires the specification of which genomes to obtain the sequences from.
#' Requires the specification of how far upstream and downstream of the transcription start site to return sequences for.
#'
#' @param distance specifies the number of nucleotides upstream and downstream to return from the transcript start site.
#' @param DT A data table containing columns labeled: "transcription_start_site", "chromosome_name", "Scientific_Name".
#' Species labels in "Scientific_Name" that are used when querying available genomes.  The Scientific_Name must be labeled accordingly:
#' "Homo_sapiens"
#' "Mus_musculus"
#' "Rattus_norvegicus"
#' "Arabidopsis_lyrata"
#' "Apis_mellifera"
#' "Arabidopsis_thaliana"
#' "Bos_taurus"
#' "Caenorhabditis_elegans"
#' "Canis_familiaris"
#' "Drosophila_melanogaster"
#' "Danio_rerio"
#' "Gasterosteus_aculeatus"
#' "Gallus_gallus"
#' "Macaca_fascicularis"
#' "Mustela_putorius_furo"
#' "Macaca_mulatta"
#' "Oryza_sativa"
#' "Pan_troglodytes"
#' "Saccharomyces_cerevisiae"
#' "Sus_scrofa"
#' "Taeniopygia_guttata"
#' @param Spec a character vector used to designate what genomes to return sequences from to annotate DT.
#' available genomes:
#' "Hsapiens.UCSC.hg38"
#' "Mmusculus.UCSC.mm10"
#' "Rnorvegicus.UCSC.rn6"
#' "Alyrata.JGI.v1"
#' "Amellifera.BeeBase.assembly4"
#' "Athaliana.TAIR.TAIR9"
#' "Btaurus.UCSC.bosTau8"
#' "Celegans.UCSC.ce11"
#' "Cfamiliaris.UCSC.canFam3"
#' "Dmelanogaster.UCSC.dm6"
#' "Drerio.UCSC.danRer10"
#' "Gaculeatus.UCSC.gasAcu1"
#' "Ggallus.UCSC.galGal4"
#' "Mfascicularis.NCBI.5.0"
#' "Mfuro.UCSC.musFur1"
#' "Mmulatta.UCSC.rheMac3"
#' "Osativa.MSU.MSU7"
#' "Ptroglodytes.UCSC.panTro3"
#' "Scerevisiae.UCSC.sacCer3"
#' "Sscrofa.UCSC.susScr3"
#' "Tguttata.UCSC.taeGut2"
#' @importFrom BSgenome getSeq
#' @importFrom data.table data.table
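#' @return A data table holding the rows of DT for the species selected in Spec, with an added "Genome_Sequence" column containing the retrieved sequence for each row (NULL if no sequences were compiled).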
#' @author Brendan Gongol
#' @export
#' @examples
#' setwd("C:/Users/Brendan/Desktop/oxidative stress surfactant bioinformatics")
#' library(BSgenome)
#' library(data.table)
#' library(EntroSolve)
#' Sorted_Surfactant <- fread("2 Longest Variant Surfactant Transcripts.xls")
#' #### Relabel chromosome designations
#' Sorted_Surfactant <- ChromLabel(Sorted_Surfactant)
#' #### Remove erroneous chromosome labels
#' Sorted_Surfactant <- Sorted_Surfactant[!(Sorted_Surfactant$chromosome_name == "CHR_MGchr184_PATCH"),]
#' Sorted_Surfactant <- Sorted_Surfactant[!(Sorted_Surfactant$chromosome_name == "Z"),]
#' Sorted_Surfactant <- Sorted_Surfactant[!(Sorted_Surfactant$chromosome_name == "AADN030chr10820.1"),]
#' Sorted_Surfactant <- Sorted_Surfactant[!(Sorted_Surfactant$chromosome_name == "chr1_random"),]
#' #### Load available genomes ####
#' ToLoad <- c("Hsapiens.UCSC.hg38","Mmusculus.UCSC.mm10","Rnorvegicus.UCSC.rn6","Btaurus.UCSC.bosTau8","Cfamiliaris.UCSC.canFam3",
#'             "Dmelanogaster.UCSC.dm6","Drerio.UCSC.danRer10","Ggallus.UCSC.galGal4","Mfuro.UCSC.musFur1","Mmulatta.UCSC.rheMac3"
#'             ,"Ptroglodytes.UCSC.panTro3","Sscrofa.UCSC.susScr3","Tguttata.UCSC.taeGut2")
#' GenomeLoader(ToLoad)
#'
#' genome <- c("Hsapiens.UCSC.hg38", "Mmusculus.UCSC.mm10", "Rnorvegicus.UCSC.rn6", "Btaurus.UCSC.bosTau8",
#'             "Cfamiliaris.UCSC.canFam3", "Dmelanogaster.UCSC.dm6", "Drerio.UCSC.danRer10",
#'             "Ggallus.UCSC.galGal4", "Ptroglodytes.UCSC.panTro3", "Tguttata.UCSC.taeGut2")
#' ChromosomeSeqCompileR(DT = Sorted_Surfactant, Spec = genome, distance = 2000)
#'
#' genome2 <- c("Hsapiens.UCSC.hg38", "Mmusculus.UCSC.mm10", "Rnorvegicus.UCSC.rn6", "Btaurus.UCSC.bosTau8",
#'              "Cfamiliaris.UCSC.canFam3", "Dmelanogaster.UCSC.dm6", "Drerio.UCSC.danRer10",
#'              "Ggallus.UCSC.galGal4", "Ptroglodytes.UCSC.panTro3", "Tguttata.UCSC.taeGut2", "Hsapiens.UCSC.hg38")
#' one <- ChromosomeSeqCompileR(DT = Sorted_Surfactant, Spec = genome2, distance = 2000)
#'
#' genome3 <- c("Hsapiens.UCSC.hg38", "Mmusculus.UCSC.mm10", "Rnorvegicus.UCSC.rn6", "Btaurus.UCSC.bosTau8",
#'              "Cfamiliaris.UCSC.canFam3", "Dmelanogaster.UCSC.dm6", "Drerio.UCSC.danRer10",
#'              "Ggallus.UCSC.galGal4", "Ptroglodytes.UCSC.panTro3", "Tguttata.UCSC.taeGut2", "HUMAN")
#' one <- ChromosomeSeqCompileR(DT = Sorted_Surfactant, Spec = genome3, distance = 2000)

ChromosomeSeqCompileR <- function(DT, Spec, distance){
  # For every genome in `Spec`, takes the rows of `DT` belonging to that
  # species, fetches the sequence from (transcription_start_site - distance)
  # to (transcription_start_site + distance) on `chromosome_name`, and stores
  # it in a new "Genome_Sequence" column.  The per-species results are
  # row-bound and returned; NULL is returned when nothing was compiled.
  # The genome packages must already be attached (e.g. with GenomeLoader()).

  # Supported genomes (package name without the "BSgenome." prefix) mapped to
  # the "Scientific_Name" label that identifies the species in DT.
  genome_species <- c(
    "Hsapiens.UCSC.hg38"           = "Homo_sapiens",
    "Mmusculus.UCSC.mm10"          = "Mus_musculus",
    "Rnorvegicus.UCSC.rn6"         = "Rattus_norvegicus",
    "Alyrata.JGI.v1"               = "Arabidopsis_lyrata",
    "Amellifera.BeeBase.assembly4" = "Apis_mellifera",
    "Athaliana.TAIR.TAIR9"         = "Arabidopsis_thaliana",
    "Btaurus.UCSC.bosTau8"         = "Bos_taurus",
    "Celegans.UCSC.ce11"           = "Caenorhabditis_elegans",
    "Cfamiliaris.UCSC.canFam3"     = "Canis_familiaris",
    "Dmelanogaster.UCSC.dm6"       = "Drosophila_melanogaster",
    "Drerio.UCSC.danRer10"         = "Danio_rerio",
    "Gaculeatus.UCSC.gasAcu1"      = "Gasterosteus_aculeatus",
    "Ggallus.UCSC.galGal4"         = "Gallus_gallus",
    "Mfascicularis.NCBI.5.0"       = "Macaca_fascicularis",
    "Mfuro.UCSC.musFur1"           = "Mustela_putorius_furo",
    "Mmulatta.UCSC.rheMac3"        = "Macaca_mulatta",
    "Osativa.MSU.MSU7"             = "Oryza_sativa",
    "Ptroglodytes.UCSC.panTro3"    = "Pan_troglodytes",
    "Scerevisiae.UCSC.sacCer3"     = "Saccharomyces_cerevisiae",
    "Sscrofa.UCSC.susScr3"         = "Sus_scrofa",
    "Tguttata.UCSC.taeGut2"        = "Taeniopygia_guttata"
  )

  Spec <- as.character(Spec)

  # Nothing requested: return early, txtProgressBar() rejects max == min.
  if(length(Spec) == 0){
    return(NULL)
  }

  # Only one genome per species is allowed; the species is the part of the
  # genome name before the first dot.
  if(sum(duplicated(sub("\\..*", "", Spec))) > 0){
    msg <- "You have entered a duplicated genome selection. Please remove duplicated species genome."
    print(msg)
    return(invisible(msg))
  }

  # Returns the DT rows of one species with their "Genome_Sequence" column,
  # or NULL when the species has no rows or the sequences cannot be fetched.
  fetch_species <- function(genome_name){
    pkg <- paste0("BSgenome.", genome_name)
    if(!exists(pkg)){
      stop("Genome object '", pkg, "' not found; attach the package first, ",
           "e.g. with GenomeLoader(\"", genome_name, "\")", call. = FALSE)
    }
    genome <- get(pkg)

    rows <- DT[DT$Scientific_Name == genome_species[[genome_name]]]
    if(nrow(rows) == 0){
      return(NULL)
    }

    # A failed lookup yields NULL so that the species is skipped; sequences
    # fetched for a previously processed species are never reused.
    seqs <- tryCatch(
      getSeq(genome, rows$chromosome_name,
             start = rows$transcription_start_site - distance,
             end = rows$transcription_start_site + distance),
      error = function(e){
        cat("ERROR :",conditionMessage(e), "\n")
        NULL
      }
    )
    if(is.null(seqs)){
      return(NULL)
    }

    rows$Genome_Sequence <- vapply(seq_along(seqs),
                                   function(k) toString(seqs[k]),
                                   character(1))
    rows
  }

  pb <- txtProgressBar(min = 0, max = length(Spec), style = 3)
  # Close the progress bar even if a genome lookup stops with an error.
  on.exit(close(pb), add = TRUE)

  pieces <- vector("list", length(Spec))
  for(i in seq_along(Spec)){
    if(!is.na(Spec[i]) && Spec[i] %in% names(genome_species)){
      piece <- fetch_species(Spec[i])
      if(!is.null(piece)){
        pieces[[i]] <- piece
      }
    }
    else{
      print("At least one genome you selected is not available with this function")
    }
    setTxtProgressBar(pb, i)
  }

  # Drop skipped species, then bind once; rbind() of nothing gives NULL.
  CompiledSeqs <- do.call(rbind, Filter(Negate(is.null), pieces))
  return(CompiledSeqs)
}







#' SequenceSiftR
#'
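#' Computes the fraction of "N" characters in each DNA sequence and, optionally, filters rows on that fraction.
#' Requires a data.table with a column labeled "Sequence".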
#'
#' @param DT a data table containing a column labeled "Sequence" housing the DNA sequences to analyze.
#' @param Percent a numerical value between 0 and 1 giving the cutoff for the fraction of "N"'s in each DNA sequence.  Only used when
#' output is "remove" or "return_remove".
#' @param output a single character string, either "return", "remove", or "return_remove".  If "return", adds a column named "Percent_N"
#' to the data table giving the fraction of "N"'s in each DNA string.  If "remove", returns the data table keeping only the rows whose
#' fraction of "N"'s is below the specified "Percent" cutoff.  If "return_remove", adds the "Percent_N" column and keeps only the rows
#' whose fraction of "N"'s is below the specified "Percent" cutoff.
#' @return The data table as specified by the output argument.
#' @author Brendan Gongol
#' @importFrom stringr str_count
#' @export
#' @examples
#' Sequence <- c("AAGCTAAGCTAAGCTGCGCAATTTTTGTATTTTGTTTAAACAGAATCCTCAAGGGAACATCATCCTCAGTTCTTTTTGTGTATTAGCTCAGATTTTCCAGCTGTTTTTAAAGCT",
#'               "CTGTTTCGAGCCTGAATCTCGATCGCTCGCGCTAGACAGCTCGACGCACTTTTCAGCAGGAGCCTG",
#'               "TCAGCAGATAGCGCTCGATACAGCTCGACAGCTCTTGCTGTATTGTGTG",
#'               "TTGCTGTATTGTGTGATCCTCGATACAGGTATTTTCTGAGCCTGATAGCTAGCTTTGCTGTATTGTGTG",
#'               "AAAAAATTTTTTCCCCCCGGGGGG", "NNTGCTAGCNNNACATCGCTACNNCTAGATCGAT", "NNNNNNAGCTNNNNNNAGCTGNNNNNACNNNNN", "NNNNNNNNNANNNNNNNNNTNNNNNNNNNC")
#' gene_symbol <- c("AKT", "PI3K", "SREBP", "FOXO", "ABCA1", "Caspase-1", "PIGPEN", "SNAIL")
#' chromosome_name <- c("1", "5", "10", "X", "Y", "2", "3", "4")
#' chromo <- data.frame(cbind(Sequence, gene_symbol, chromosome_name))
#' chromo$Sequence <- as.character(chromo$Sequence)
#' chromo$chromosome_name <- as.character(chromo$chromosome_name)
#' chromo
#'
#' SequenceSiftR(chromo, output = "return")
#' SequenceSiftR(chromo, Percent = 0.49, output = "remove")
#' SequenceSiftR(chromo, Percent = 0.49, output = "return_remove")

SequenceSiftR <- function(DT, Percent, output){
  # Computes, for every row of DT, the fraction of characters in DT$Sequence
  # that are "N", then reports it, filters on it, or both.
  #
  # Args:
  #   DT:      data.table / data.frame with a column labeled "Sequence".
  #   Percent: cutoff in [0, 1]; rows are kept when their N fraction is
  #            strictly below it. Only needed for "remove"/"return_remove".
  #   output:  "return"        -> DT plus a "Percent_N" column;
  #            "remove"        -> DT filtered by the cutoff;
  #            "return_remove" -> both of the above.
  #
  # Returns:
  #   The table described by `output`. An unrecognised `output` value returns
  #   NULL invisibly (as before) but now also raises a warning.
  seqs <- DT$Sequence
  if (is.null(seqs)) {
    stop("DT must contain a column labeled \"Sequence\"", call. = FALSE)
  }
  if (length(output) != 1L || is.na(output)) {
    stop("output must be a single, non-missing character string",
         call. = FALSE)
  }
  mode <- as.character(output)

  # Factor columns (common when a data.frame is built from cbind()) are
  # converted so the string functions below see the actual sequences.
  seqs <- as.character(seqs)

  # Vectorised counts: this also handles zero-row input, which the previous
  # 1:length() loop could not.
  total_chars <- nchar(seqs)
  n_count <- total_chars - nchar(gsub("N", "", seqs, fixed = TRUE))
  # Empty sequences give 0/0 = NaN and NA sequences give NA: the fraction is
  # undefined for them and is reported as such.
  frac_n <- n_count / total_chars

  if (!mode %in% c("return", "remove", "return_remove")) {
    warning("output must be one of \"return\", \"remove\", or ",
            "\"return_remove\"; returning NULL", call. = FALSE)
    return(invisible(NULL))
  }

  result <- DT
  if (mode != "remove") {
    result$Percent_N <- frac_n
  }
  if (mode == "return") {
    return(result)
  }

  if (missing(Percent)) {
    stop("Percent must be supplied when output is \"", mode, "\"",
         call. = FALSE)
  }
  keep <- frac_n < Percent
  # Rows whose fraction is undefined are dropped; an NA in the logical index
  # would otherwise inject all-NA rows into a data.frame result.
  keep[is.na(keep)] <- FALSE
  result[keep, ]
}
brengong/ConservationtextmineR documentation built on July 29, 2019, 10:05 a.m.