#' seqCompile: sequence Compile function
#'
#' Sequence compilation to a data table from a fasta file: Loops through Fasta Files in a directory and returns
#' a data table compilation of the sequences and their ID's. The returned data.table contains the following
#' column names: "Sequence", "ensembl_transcript_id", "Species_File", "Scientific_Name", "Common_Name"
#'
#' @param files a vector of numbers indicating what fasta files to compile in a directory. if type = "miRNA", the FASTA file to convert.
#' @param type a single character string, either "mRNA", "protein", or "miRNA"
#' @param direct a character describing the path to the file containing the FASTA files.
#' @param miRNA_type used only when type = "miRNA". A single character vector indicating the type of miRNA: either "MATURE" or "IMMATURE_HAIR_PIN".
#' @return if type = "mRNA", A data table containing the following columns: "Sequence", "ensembl_transcript_id", "Species_File",
#' Scientific_Name, Common_Name. if type = "protein", A data table containing the following columns: "Sequence", "ensembl_peptide_id", "Species_File",
#' Scientific_Name, Common_Name. If type = "miRNA", a data table containing the following columns: "Sequence", "miRNA_Name", "miRNA_type".
#' @author Brendan Gongol
#' @importFrom Biostrings readAAStringSet
#' @importFrom data.table setnames
#' @export
#' @examples
#' setwd("C:/Users/Brendan/Dropbox/Brendan Documents/R programming/bioinformatics/cDNA_transcriptome/2016-3-6 cDNA transcriptomes")
#' seqDT1 <- seqCompile(files = 1:2, type = "mRNA")
#' seqDT1 <- seqCompile(files = seq_along(dir()), type = "mRNA")
#' head(seqDT1)
#' tail(seqDT1)
#' dim(seqDT1)
seqCompile <- function(files = 1:5, direct = getwd(), type, miRNA_type = "MATURE") {
  # Compile FASTA records into one data frame.
  #
  # Arguments:
  #   files      for type "mRNA"/"protein": indices into dir() of `direct`
  #              selecting the FASTA files to read; for type "miRNA": the path
  #              of a single FASTA file (resolved relative to `direct`).
  #   direct     directory that is made the working directory while reading.
  #   type       one of "mRNA", "protein" or "miRNA".
  #   miRNA_type value written to the "miRNA_type" column (type "miRNA" only).
  #
  # Returns:
  #   type "mRNA"    -> columns Sequence, ensembl_transcript_id, Species_File,
  #                     Scientific_Name, Common_Name
  #   type "protein" -> columns Sequence, ensembl_peptide_id, Species_File,
  #                     Scientific_Name, Common_Name
  #   type "miRNA"   -> columns Sequence, miRNA_Name, miRNA_type
  #
  # Raises an error when `type` is not one of the supported values or when
  # `files` selects entries that do not exist in `direct`.

  valid_types <- c("mRNA", "protein", "miRNA")
  if (length(type) != 1L || is.na(type) ||
      !(as.character(type) %in% valid_types)) {
    stop("'type' must be one of \"mRNA\", \"protein\" or \"miRNA\"",
         call. = FALSE)
  }
  type <- as.character(type)

  # setwd() returns the previous directory; on.exit() guarantees that it is
  # restored on every exit path, including errors raised while reading.
  olddir <- setwd(direct)
  on.exit(setwd(olddir), add = TRUE)

  #### miRNA: a single FASTA file, no species annotation ####
  if (type == "miRNA") {
    hairpins <- readAAStringSet(files)
    mirna_seq <- unname(as.character(hairpins))
    # Keep only the text before the first space of each FASTA header.
    mirna_id <- sub(" .*", "", names(hairpins))
    mirna_table <- data.frame(mirna_seq, mirna_id)
    names(mirna_table) <- c("Sequence", "miRNA_Name")
    mirna_table$miRNA_type <- rep_len(miRNA_type, nrow(mirna_table))
    return(mirna_table)
  }

  #### mRNA / protein: one or more FASTA files selected by index ####
  id_column <- if (type == "mRNA") {
    "ensembl_transcript_id"
  } else {
    "ensembl_peptide_id"
  }

  # Scientific name (file name up to the first ".") -> common name.
  common_names <- c(
    Ailuropoda_melanoleuca = "Giant panda",
    Anas_platyrhynchos = "Duck",
    Anolis_carolinensis = "Lizard",
    Astyanax_mexicanus = "Cave fish",
    Bos_taurus = "Cattle",
    Caenorhabditis_elegans = "roundworm",
    Callithrix_jacchus = "Common marmoset monkey",
    Canis_familiaris = "Dog",
    Cavia_porcellus = "Guinea pig",
    Chlorocebus_sabaeus = "Green monkey",
    Choloepus_hoffmanni = "Hoffmann's two-toed sloth",
    Ciona_intestinalis = "sea squirt",
    Ciona_savignyi = "Pacific transparent sea squirt",
    Danio_rerio = "Zebrafish",
    Dasypus_novemcinctus = "Nine-banded armadillo",
    Dipodomys_ordii = "Ord's kangaroo rat",
    Drosophila_melanogaster = "Fruit fly",
    Echinops_telfairi = "Lesser hedgehog tenrec",
    Equus_caballus = "Horse",
    Erinaceus_europaeus = "European hedgehog",
    Felis_catus = "Cat",
    Ficedula_albicollis = "Collared flycatcher",
    Gadus_morhua = "Atlantic cod",
    Gallus_gallus = "Chicken",
    Gasterosteus_aculeatus = "Three-spined stickleback",
    Gorilla_gorilla = "Gorilla",
    Homo_sapiens = "Human",
    Ictidomys_tridecemlineatus = "Thirteen-lined ground squirrel",
    Latimeria_chalumnae = "West indian ocean coelacanth",
    Lepisosteus_oculatus = "Spotted gar",
    Loxodonta_africana = "African bush elephant",
    Macaca_mulatta = "Rhesus macaque",
    Macropus_eugenii = "Tammar wallaby",
    Meleagris_gallopavo = "Wild turkey",
    Microcebus_murinus = "Gray mouse lemur",
    Monodelphis_domestica = "Gray short-tailed opossum",
    Mus_musculus = "Mouse",
    Mustela_putorius_furo = "Ferret",
    Myotis_lucifugus = "Little brown bat",
    Nomascus_leucogenys = "Northern white-cheeked gibbon",
    Ochotona_princeps = "American pika",
    Oreochromis_niloticus = "Nile tilapia",
    Ornithorhynchus_anatinus = "Platypus",
    Oryctolagus_cuniculus = "European rabbit",
    Oryzias_latipes = "Japanese rice fish",
    Otolemur_garnettii = "Northern greater galago",
    Ovis_aries = "Sheep",
    Pan_troglodytes = "Chimpanzee",
    Papio_anubis = "Olive baboon",
    Pelodiscus_sinensis = "Chinese softshell turtle",
    Petromyzon_marinus = "Sea lamprey",
    Poecilia_formosa = "Amazon molly",
    Pongo_abelii = "Sumatran orangutan",
    Procavia_capensis = "Rock badger",
    Pteropus_vampyrus = "Large flying fox",
    Rattus_norvegicus = "Rat",
    Saccharomyces_cerevisiae = "Yeast",
    Sarcophilus_harrisii = "Tasmanian devil",
    Sorex_araneus = "Common shrew",
    Sus_scrofa = "Wild boar",
    Taeniopygia_guttata = "Zebra finch",
    Takifugu_rubripes = "Japanese puffer",
    Tarsius_syrichta = "Philippine tarsier",
    Tetraodon_nigroviridis = "Green spotted puffer",
    Tupaia_belangeri = "Northern treeshrew",
    Tursiops_truncatus = "Bottlenose dolphin",
    Vicugna_pacos = "Alpaca",
    Xenopus_tropicalis = "Western clawed frog",
    Xiphophorus_maculatus = "Southern platyfish"
  )

  # List the directory once and resolve the requested indices up front so a
  # bad index fails with a clear message instead of deep inside the reader.
  available <- dir()
  fasta_files <- available[files]
  if (length(fasta_files) != length(files) || anyNA(fasta_files)) {
    stop("'files' must index existing entries of 'direct' (",
         length(available), " found)", call. = FALSE)
  }

  #### Read in the sequences and the transcript/peptide ID's ####
  pieces <- vector("list", length(fasta_files))
  if (length(fasta_files) > 0L) {
    pb <- txtProgressBar(min = 0, max = length(fasta_files), style = 3)
    on.exit(close(pb), add = TRUE)
    for (k in seq_along(fasta_files)) {
      fasta <- readAAStringSet(fasta_files[k], "fasta")
      record_id <- sub("\\..*", "", names(fasta)) # drop everything after the first "."
      record_id <- sub(" .*", "", record_id)      # drop everything after the first space
      record_seq <- unname(as.character(fasta))
      piece <- data.frame(record_seq, record_id)
      names(piece) <- c("Sequence", id_column)
      piece$Species_File <- rep(fasta_files[k], nrow(piece))
      pieces[[k]] <- piece
      # Advance by position in `files`, not by the file index itself, so the
      # bar is correct for selections such as files = c(3, 7).
      setTxtProgressBar(pb, k)
    }
  }

  seqDT <- do.call(rbind, pieces)
  if (is.null(seqDT)) {
    # No files requested: return an empty table with the documented columns.
    seqDT <- data.frame(character(0), character(0), character(0))
    names(seqDT) <- c("Sequence", id_column, "Species_File")
  }

  #### Add the scientific name ####
  seqDT$Scientific_Name <- sub("\\..*", "", seqDT$Species_File)

  #### Add the common name ####
  # Vectorised lookup; species missing from the table get the string "NA".
  matched <- unname(common_names[seqDT$Scientific_Name])
  matched[is.na(matched)] <- "NA"
  seqDT$Common_Name <- matched

  seqDT
}
#' SeqAnnotate
#'
#' Requires a data table containing a column labeled "ensembl_peptide_id" or "ensembl_transcript_id"
#' Returns a data.table containing "ensembl_peptide_id", "external_gene_name", "transcription_start_site", "transcript_start", "transcript_end", "chromosome_name" for all species compiled
#' @param DT a data table containing a column labeled "ensembl_peptide_id" or "ensembl_transcript_id"
#' @param type single character string either "protein" or "mRNA". If "protein", requires the "ensembl_peptide_id" column and returns a data table containing protein ensembl peptide id's.
#' If "mRNA", requires the "ensembl_transcript_id" column and returns a data table containing protein ensembl transcript id's.
#' @return A data.table containing "ensembl_peptide_id"/"ensembl_transcript_id", "external_gene_name", "transcription_start_site", "transcript_start", "transcript_end", "chromosome_name" for all species compiled
#' @author Brendan Gongol
#' @importFrom biomaRt useMart
#' @importFrom biomaRt getBM
#' @export
#' @examples
#' library(data.table)
#' setwd("C:/Users/Brendan/Dropbox/Brendan Documents/R programming/bioinformatics/proteomes/2016-4-29 Proteomes")
#' Proteome <- fread("species protein compilation.xls")
#' head(SeqAnnotate(DT = Proteome, type = "protein"))
#'
#' library(data.table)
#' setwd("C:/Users/Brendan/Dropbox/Brendan Documents/R programming/bioinformatics/cDNA_transcriptome/2016-3-6 cDNA transcriptomes")
#' Transcriptome <- fread("species mRNA compilation.xls")
#' head(SeqAnnotate(Transcriptome, type = "mRNA"))
SeqAnnotate <- function(DT, type) {
  # Query every Ensembl dataset listed below through biomaRt and stack the
  # returned annotation tables.
  #
  # Arguments:
  #   DT    table with an "ensembl_peptide_id" column (type "protein") or an
  #         "ensembl_transcript_id" column (type "mRNA").
  #   type  "protein" or "mRNA"; selects the id attribute that is requested.
  #
  # Returns the row-bound getBM() results with the attributes
  # <id attribute>, external_gene_name, transcription_start_site,
  # transcript_start, transcript_end, chromosome_name.
  #
  # Raises an error when `type` is neither "protein" nor "mRNA".

  id_columns <- c(protein = "ensembl_peptide_id", mRNA = "ensembl_transcript_id")
  if (length(type) != 1L || is.na(type) ||
      !(as.character(type) %in% names(id_columns))) {
    stop("'type' must be either \"protein\" or \"mRNA\"", call. = FALSE)
  }
  id_column <- id_columns[[as.character(type)]]
  Values <- DT[[id_column]]

  # Single shared dataset list (previously duplicated in both branches).
  data.set <- c(
    "oanatinus_gene_ensembl", "cporcellus_gene_ensembl", "gaculeatus_gene_ensembl",
    "lafricana_gene_ensembl", "itridecemlineatus_gene_ensembl",
    "choffmanni_gene_ensembl", "csavignyi_gene_ensembl", "fcatus_gene_ensembl",
    "rnorvegicus_gene_ensembl", "psinensis_gene_ensembl",
    "cjacchus_gene_ensembl", "ttruncatus_gene_ensembl", "scerevisiae_gene_ensembl",
    "celegans_gene_ensembl", "csabaeus_gene_ensembl",
    "oniloticus_gene_ensembl", "trubripes_gene_ensembl", "amexicanus_gene_ensembl",
    "pmarinus_gene_ensembl", "eeuropaeus_gene_ensembl",
    "falbicollis_gene_ensembl", "ptroglodytes_gene_ensembl", "etelfairi_gene_ensembl",
    "cintestinalis_gene_ensembl", "nleucogenys_gene_ensembl",
    "sscrofa_gene_ensembl", "ocuniculus_gene_ensembl", "dnovemcinctus_gene_ensembl",
    "pcapensis_gene_ensembl", "tguttata_gene_ensembl",
    "mlucifugus_gene_ensembl", "hsapiens_gene_ensembl", "pformosa_gene_ensembl",
    "mfuro_gene_ensembl", "tbelangeri_gene_ensembl",
    "ggallus_gene_ensembl", "xtropicalis_gene_ensembl", "ecaballus_gene_ensembl",
    "pabelii_gene_ensembl", "xmaculatus_gene_ensembl",
    "drerio_gene_ensembl", "lchalumnae_gene_ensembl", "tnigroviridis_gene_ensembl",
    "amelanoleuca_gene_ensembl", "mmulatta_gene_ensembl",
    "pvampyrus_gene_ensembl", "panubis_gene_ensembl", "mdomestica_gene_ensembl",
    "acarolinensis_gene_ensembl", "vpacos_gene_ensembl",
    "tsyrichta_gene_ensembl", "ogarnettii_gene_ensembl", "dmelanogaster_gene_ensembl",
    "mmurinus_gene_ensembl", "loculatus_gene_ensembl",
    "olatipes_gene_ensembl", "ggorilla_gene_ensembl", "oprinceps_gene_ensembl",
    "dordii_gene_ensembl", "oaries_gene_ensembl",
    "mmusculus_gene_ensembl", "mgallopavo_gene_ensembl", "gmorhua_gene_ensembl",
    "aplatyrhynchos_gene_ensembl", "saraneus_gene_ensembl",
    "sharrisii_gene_ensembl", "meugenii_gene_ensembl", "btaurus_gene_ensembl",
    "cfamiliaris_gene_ensembl"
  )
  attribute_set <- c(id_column, "external_gene_name", "transcription_start_site",
                     "transcript_start", "transcript_end", "chromosome_name")

  pb <- txtProgressBar(min = 0, max = length(data.set), style = 3)
  # Close the bar even when a mart query fails part-way through the loop.
  on.exit(close(pb), add = TRUE)

  # Collect per-dataset results in a preallocated list and bind once at the end.
  per_dataset <- vector("list", length(data.set))
  for (i in seq_along(data.set)) {
    # Alternative hosts tried previously: host="www.ensembl.org",
    # host="uswest.ensembl.org", host="jul2015.archive.ensembl.org"
    mymart <- useMart("ensembl", dataset = data.set[i])
    # NOTE(review): no `filters` argument is passed, so per the biomaRt
    # documentation `values` presumably has no effect and every record of the
    # dataset is retrieved -- confirm whether filtering on the id column was
    # intended before changing the query.
    per_dataset[[i]] <- getBM(attributes = attribute_set, values = Values,
                              mart = mymart)
    setTxtProgressBar(pb, i)
  }

  do.call(rbind, per_dataset)
}
#' Consensuspredict
#'
#' Takes a data.table of mRNA, miRNA, or protein sequences and returns the mRNAs, miRNAs, or proteins containing a specified consensus sequence.
#' Requires a three column data.table with columns labeled according to the type of specified search.
#' If type = "mRNA", one column should be labeled "Sequence" containing the mRNA sequences to query, a column should be
#' labeled "ensembl_transcript_id" containing the mRNA transcript id, and a column should be labeled "Species" designating
#' the species of the mRNA.
#' If type = "protein", one column should be labeled "Sequence" containing the protein sequences to query, a column should be
#' labeled "ensembl_peptide_id" containing the protein peptide id, and a column should be labeled "Species" designating
#' the species of the protein.
#' If type = "miRNA", one column should be labeled "Sequence" containing the miRNA sequences to query, a column should be
#' labeled "miRNA_Name" containing the miRNA name (ex: hsa-miRXXX), and a column should be labeled "miRNA_type" designating
#' if the miRNA is an IMMATURE_HAIR_PIN or a MATURE miRNA.
#'
#'
#' @param DT a data table with three columns. If type = "mRNA", the columns should be labeled "Sequence", "ensembl_transcript_id", "Species".
#' If type = "miRNA", the columns should be labeled "Sequence", "miRNA_Name", "miRNA_type".
#' If type = "protein", the columns should be labeled "Sequence", "ensembl_peptide_id", "Species".
#' @param conse a character string containing a single consensus sequence to query.
#' @param type a single character either "miRNA", "protein", or "mRNA" designating what type of sequences are being queried.
#' @return A data table containing the following columns:
#' @author Brendan Gongol
#' @importFrom data.table setnames
#' @importFrom data.table data.table
#' @importFrom data.table setkey
#' @importFrom stringr str_locate_all
#' @export
#' @examples
#' ensembl_transcript_id <- c("ENSACAT00000000002","ENSACAT00000000003","ENSACAT00000000004","ENSACAT00000000006","ENSACAT00000000007","ENSACAT00000000008")
#' Sequence <- c("CTGTTTCGAGCCTGAATCTCGATCGCTCGCGCTAGACAGCTCGACGCACTTTTCAGCAGGAGCCTG",
#' "AATTAATTTCAGCAGATAGCGCTCGATACAGCTCGACAGCTCTTGCTGTATTGTGTG",
#' "TTGCTGTATTGTGTGATCCTCGATACAGGTATTTTCTGAGCCTGATAGCTAGCTTTGCTGTATTGTGTGAATTAATT",
#' "AAAAAATTTTTTAATTAATTCCCCCCGGGGGG", "AATACAGCTCGCGCGCGGAACCAAT",
#' "AATTAATTATCGCTACAGCTCGACACAATTAATTAGCTCGTGGGTTCCGGCCTTAACAATTAATT")
#' external_gene_name <- c("AKT", "PI3K", "SREBP", "FOXO", "PKA", "NRF")
#' Species <- c("Rat", "Mouse", "Human", "Pig", "Goat", "Fox")
#' mRNADT <- data.frame(cbind(ensembl_transcript_id, Sequence, external_gene_name, Species))
#' mRNADT$Sequence <- as.character(mRNADT$Sequence)
#' Consensuspredict(DT=mRNADT, conse= "AATTAATT", type = "mRNA")
#'
#' ensembl_peptide_id <- c("ENSAMEP00000003151","ENSAMEP00000003176","ENSAMEP00000003150","ENSAMEP00000003213","ENSAMEP00000003164","ENSXMAP00000020464")
#' Sequence <- c("RKQHFIHQAVRNSDLVPKAKGRKSLQRLENTQYLLSLLETDGGTAGLDDGDLAPPAAPGIFAEACSNETYMEVWNDFMNRSGEEQERVLRYLEDEGKSK",
#' "GADKSNRFPLPFPFPSKLYIMCMANLEELQSTDSLDCLERLIDLNNGEGQIFTIDGPLCLKNVQSMFGKLIDLAYTPFH",
#' "IIALALEANNQLTWRDVQHLLVKTSRPAHLKANDWKVNGAGHKVSHLYGFGLVDAEALVMEAKKWTAVPAAEH",
#' "VGSAAVSAPVLALHRLSPGPRTYCSEVFPSRALERAFALYNLLALYLLPLAATCA", "KFVNYMQQVSVQATCATLTAMSVDRWY",
#' "VHEHVILDPLTKELNYPFIILALWGVIMTGSICGLERLRQTDLKALIAYSSVSHMGLVAAAILIQTPWALTGALILMIVHDK")
#' external_gene_name <- c("AKT", "PI3K", "SREBP", "FOXO", "PKA", "NRF")
#' Species <- c("Rat", "Mouse", "Human", "Pig", "Goat", "Fox")
#' PRODT <- data.frame(cbind(ensembl_peptide_id, Sequence, external_gene_name, Species))
#' PRODT$Sequence <- as.character(PRODT$Sequence)
#' conseq= "(G|A|V)(L|A|H)(D|E)(K|R|H)"
#' Consensuspredict(DT = PRODT, conse = conseq, type = "protein")
#'
#' Sequence <- c("UAGCGAUUGGAUCCGGUGAGGUAGUAGGUUGUAUAGUUUGGAAUAUUACCACCGGUGAACUAUGCAAUUUUCUACCUUACCGGAGACAGAACUCUUCGA",
#' "AUGCUUCCGGCCUGUUCCCUGAGACCUCAAGUGUGAGUGUACUAGCGAUGCUUCACACCUGGGCUCUCCGGGUACCAGGACGGUUUGAGCAGAU",
#' "AAAGUGACCGUACCGAGCUGCAUACUUCCUUACAUGCCCAUACUAUAUCAUAAAUGGAUAUGGAAUGUAAAGAAGUAUGUAGAACGGGGUGGUAGU ",
#' "UAAACAGUAUACAGAAAGCCAUCAAAGCGGUGGUUGAUGUGUUGCAAAUUAUGACUUUCAUAUCACAGCCAGCUUUGAUGUGCUGCCUGUUGCACUGU",
#' "CGGACAAUGCUCGAGAGGCAGUGUGGUUAGCUGGUUGCAUAUUUCCUUGACAACGGCUACCUUCACUGCCACCCCGAACAUGUCGUCCAUCUUUGAA",
#' "UAGCGAUUCAGAUCGAGCCAUUGCUGGUUUCUUCCACAGUAGCGAUUUCCAUUAGAACUAUCACCGGGUGGAAACUAGCAGUGGCUCGAUUAGCGAU",
#' "UGAGGUAGUAGGUUGUAUAGUU", "CUAUGCAUAGCGAUACCUUACC", "UCCCUGAGACCUCAAGUGUGA", "ACACCUGGGCUCUCCGGGUACC",
#' "UAGCGAUCCUUACAUGCCCAUA", "UCGUGUCCGUUUCUCGUUUCGA", "ACAAUAAUCGGACACUAGCGAU", "CACACCGGACGAGAUUUCAU",
#' "UACCGGGCGUGGGGAGGGCAGG", "UAGCGAUUCCUUCUUAGCGAU")
#' miRNA_Name <- c("hsa-let-7","cel-lin-4","aae-mir-1","bol-mir-2","rno-mir-34","cel-mir-35",
#' "cel-let-7-5p", "cel-let-7-3p", "cel-lin-4-5p", "cel-lin-4-3p", "cel-miR-1-5p", "ame-miR-9895",
#' "hsa-miR-9896", "rno-miR-3478", "mmu-miR9897-5p", "cre-miR9897-3p")
#' miRNA_type <- c("IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN",
#' "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE")
#' MiRNADT<- data.frame(cbind(Sequence, miRNA_Name, miRNA_type))
#' MiRNADT$Sequence <- as.character(MiRNADT$Sequence)
#'
#' Consensuspredict(DT = MiRNADT, conse = "UAGCGAU", type = "miRNA")
Consensuspredict <- function(DT, conse, type){
if(type == "mRNA"){
RNApredict2 <- function(DT2, conse3){
DNAsequence <- DT2$Sequence
conse2 <- conse3
#### query the input DNA sequence for consensus sequence elements.
hold <- data.table(NULL)
for(ce in conse2){
lis2 <- as.data.frame(str_locate_all(DNAsequence, ce))
if(nrow(lis2) > 0){
dt <- cbind(lis2, nameu=ce)
hold <- rbind(hold, dt)
}
}
#### Count the number of times each consensus sequence appears.
if(nrow(hold) > 0){
setkey(hold, nameu)
numhits <- hold[,length(end), by=nameu]
hold <- merge(hold, numhits, by="nameu", allow.cartesian=TRUE)
}
#### Add the length of each miRNA sequence
if(nrow(hold) > 0){
spl <- strsplit(DNAsequence, split="")
hold$length <- length(spl[[1]])
}
# Add the sequences with hits reformatted into lower case.
if(nrow(hold) > 0){
spl <- strsplit(DNAsequence, split="")
chromtot2 <- NULL
for(i in 1:length(hold$start)){
if((hold$end[i] == hold$length[i])){
upper <- spl[[1]][hold$start[i]:hold$end[i]]
lower <- tolower(upper)
sequencelow <- paste(lower, collapse="")
first <- paste(spl[[1]][1:(hold$start[i]-1)], collapse="")
chromtot <- paste(first, sequencelow, collapse="")
chromtot2[i] <- gsub(" ", "", chromtot)
}
if((hold$start[i]-1) > 0 & (!(hold$end[i] == hold$length[i]))){
upper <- spl[[1]][hold$start[i]:hold$end[i]]
lower <- tolower(upper)
first <- paste(spl[[1]][1:(hold$start[i]-1)], collapse="")
sequencelow <- paste(lower, collapse="")
last <- paste(spl[[1]][(hold$end[i]+1):length(spl[[1]])], collapse="")
chromtot <- paste(first, sequencelow, last, collapse="")
chromtot2[i] <- gsub(" ", "", chromtot)
}
if((hold$start[i]-1) == 0){
upper <- spl[[1]][hold$start[i]:hold$end[i]]
lower <- tolower(upper)
sequencelow <- paste(lower, collapse="")
last <- paste(spl[[1]][(hold$end[i]+1):length(spl[[1]])], collapse="")
chromtot <- paste(sequencelow, last, collapse="")
chromtot2[i] <- gsub(" ", "", chromtot)
}
i <- i + 1
}
hold$sequence <- chromtot2
}
# Add the transcript ID and Species
if(nrow(hold) > 0){
hold$ensembl_transcript_id <- DT2$ensembl_transcript_id
hold$Species <- DT2$Species
}
# Rename the columns
if(nrow(hold) > 0){
setnames(hold, c("nameu", "start", "end", "V1", "sequence", "ensembl_transcript_id", "Species"),
c("Consensus_Sequence", "start", "end", "number of hits", "sequence", "ensembl_transcript_id", "Species"))
}
return(hold)
}
pb <- txtProgressBar(min = 0, max = nrow(DT), style = 3)
NCLmRNAhits <- NULL
for(i in 1:nrow(DT)){
fir <- RNApredict2(DT[i,], conse3 = conse)
NCLmRNAhits <- rbind(NCLmRNAhits, fir)
setTxtProgressBar(pb, i)
}
close(pb)
return(NCLmRNAhits)
}
if(type == "protein"){
# Proteinpredict2: locate every consensus-sequence match in one protein sequence.
#
# DT2    - one row of the protein table (the caller passes DT[i,]); the columns
#          Sequence, ensembl_peptide_id and Species are read from it.
# conse3 - vector of consensus patterns; each element is passed to
#          str_locate_all() as the pattern.
#
# Returns a data.table with one row per match and the columns
# "Consensus_Sequence", "start", "end", "number of hits", "length", "sequence",
# "ensembl_peptide_id", "Species" and "consensus". When nothing matches, the
# empty data.table is returned as is.
Proteinpredict2 <- function(DT2, conse3){
  # as.character() keeps the string helpers below working if Sequence is a factor.
  PROTEINsequence <- as.character(DT2$Sequence)

  #### Query the input protein sequence for consensus sequence elements.
  hold <- data.table(NULL)
  for(ce in conse3){
    lis2 <- as.data.frame(str_locate_all(PROTEINsequence, ce))
    if(nrow(lis2) > 0){
      hold <- rbind(hold, cbind(lis2, nameu = ce))
    }
  }
  # Nothing matched: none of the annotation steps apply, so stop here.
  if(nrow(hold) == 0){
    return(hold)
  }

  #### Count the number of times each consensus sequence appears (column V1).
  setkey(hold, nameu)
  numhits <- hold[, length(end), by = nameu]
  hold <- merge(hold, numhits, by = "nameu", allow.cartesian = TRUE)

  #### Add the length of the protein sequence.
  hold$length <- nchar(PROTEINsequence)

  #### Add the sequence with the hit reformatted into lower case.
  # One copy of the sequence per hit; `substr<-` lower-cases start..end of each
  # copy. This treats a hit at the start, in the middle, at the end, or spanning
  # the whole sequence the same way (the previous branch-per-position code
  # appended "NA" plus a stray character for a whole-sequence hit).
  masked <- rep(PROTEINsequence, nrow(hold))
  substr(masked, hold$start, hold$end) <- tolower(substr(masked, hold$start, hold$end))
  # Spaces are stripped from the result, as the earlier implementation did.
  hold$sequence <- gsub(" ", "", masked, fixed = TRUE)

  #### Add the peptide ID and Species.
  hold$ensembl_peptide_id <- DT2$ensembl_peptide_id
  hold$Species <- DT2$Species

  #### Add the consensus sequence hit (the lower-cased stretch of each row).
  hold$consensus <- substr(hold$sequence, hold$start, hold$end)

  #### Rename the columns.
  setnames(hold, c("nameu", "start", "end", "V1", "sequence", "ensembl_peptide_id", "Species", "consensus"),
           c("Consensus_Sequence", "start", "end", "number of hits", "sequence", "ensembl_peptide_id", "Species", "consensus"))
  return(hold)
}
# Run Proteinpredict2 on every row of DT and stack the per-row hit tables.
# An empty DT yields NULL (the initial value of the accumulator) instead of
# failing in txtProgressBar(), which needs max > min, or looping over 1:0.
Proteinhits <- NULL
n_rows <- nrow(DT)
if(n_rows > 0){
  pb <- txtProgressBar(min = 0, max = n_rows, style = 3)
  # Collect the per-row results first and bind once, rather than growing the
  # result with rbind() on every iteration.
  per_row <- vector("list", n_rows)
  for(i in seq_len(n_rows)){
    per_row[[i]] <- Proteinpredict2(DT[i,], conse3 = conse)
    setTxtProgressBar(pb, i)
  }
  close(pb)
  Proteinhits <- do.call(rbind, per_row)
}
return(Proteinhits)
}
if(type == "miRNA"){
# MIRNApredict2: locate every consensus-sequence match in one miRNA sequence.
#
# DT2    - one row of the miRNA table (the caller passes DT[i,]); the columns
#          Sequence, miRNA_Name and miRNA_type are read from it.
# conse3 - vector of consensus patterns; each element is passed to
#          str_locate_all() as the pattern.
#
# Returns a data.table with one row per match and the columns
# "Consensus_Sequence", "start", "end", "number of hits", "length", "sequence",
# "miRNA_Name" and "miRNA_type". When nothing matches, the empty data.table is
# returned as is.
MIRNApredict2 <- function(DT2, conse3 ){
  # as.character() keeps the string helpers below working if Sequence is a factor.
  DNAsequence <- as.character(DT2$Sequence)

  #### Query the input sequence for consensus sequence elements.
  hold <- data.table(NULL)
  for(ce in conse3){
    lis2 <- as.data.frame(str_locate_all(DNAsequence, ce))
    if(nrow(lis2) > 0){
      hold <- rbind(hold, cbind(lis2, nameu = ce))
    }
  }
  # Nothing matched: none of the annotation steps apply, so stop here.
  if(nrow(hold) == 0){
    return(hold)
  }

  #### Count the number of times each consensus sequence appears (column V1).
  setkey(hold, nameu)
  numhits <- hold[, length(end), by = nameu]
  hold <- merge(hold, numhits, by = "nameu", allow.cartesian = TRUE)

  #### Add the length of the miRNA sequence.
  hold$length <- nchar(DNAsequence)

  #### Add the sequence with the hit reformatted into lower case.
  # One copy of the sequence per hit; `substr<-` lower-cases start..end of each
  # copy. This treats a hit at the start, in the middle, at the end, or spanning
  # the whole sequence the same way (the previous branch-per-position code
  # appended "NA" plus a stray character for a whole-sequence hit).
  masked <- rep(DNAsequence, nrow(hold))
  substr(masked, hold$start, hold$end) <- tolower(substr(masked, hold$start, hold$end))
  # Spaces are stripped from the result, as the earlier implementation did.
  hold$sequence <- gsub(" ", "", masked, fixed = TRUE)

  #### Add the miRNA name and type.
  hold$miRNA_Name <- DT2$miRNA_Name
  hold$miRNA_type <- DT2$miRNA_type

  #### Rename the columns.
  setnames(hold, c("nameu", "start", "end", "V1", "sequence", "miRNA_Name", "miRNA_type"),
           c("Consensus_Sequence", "start", "end", "number of hits", "sequence", "miRNA_Name", "miRNA_type"))
  return(hold)
}
# Run MIRNApredict2 on every row of DT and stack the per-row hit tables.
# An empty DT yields NULL (the initial value of the accumulator) instead of
# failing in txtProgressBar(), which needs max > min, or looping over 1:0.
MiRNAhits <- NULL
n_rows <- nrow(DT)
if(n_rows > 0){
  pb <- txtProgressBar(min = 0, max = n_rows, style = 3)
  # Collect the per-row results first and bind once, rather than growing the
  # result with rbind() on every iteration.
  per_row <- vector("list", n_rows)
  for(i in seq_len(n_rows)){
    per_row[[i]] <- MIRNApredict2(DT[i,], conse3 = conse)
    setTxtProgressBar(pb, i)
  }
  close(pb)
  MiRNAhits <- do.call(rbind, per_row)
}
return(MiRNAhits)
}
}
#' MiRNASpeciesAnnot
#'
#' Adds two extra columns to a data.table giving the species each miRNA originates from: its scientific name (Scientific_Name) and its common name.
#' Requires a data.table in which the column holding the miRNA names is labelled "miRNA_Name".
#'
#' @param MiRNADT a data table with one column labeled miRNA_Name that holds the miRNA names.
#' @return A data table containing the columns of the input plus the two added species columns (scientific name and common name).
#' @author Brendan Gongol
#' @export
#' @examples
#' miRNA_Name <- c("hsa-let-7","cel-lin-4","aae-mir-1","bol-mir-2","rno-mir-34","cel-mir-35",
#' "cel-let-7-5p", "cel-let-7-3p", "cel-lin-4-5p", "cel-lin-4-3p", "cel-miR-1-5p", "ame-miR-9895",
#' "hsa-miR-9896", "rno-miR-3478", "mmu-miR9897-5p", "cre-miR9897-3p")
#' MiRNADT<- data.frame(cbind(miRNA_Name))
#' MiRNASpeciesAnnot(MiRNADT)
#'
#'
#' Sequence <- c("UAGCGAUUGGAUCCGGUGAGGUAGUAGGUUGUAUAGUUUGGAAUAUUACCACCGGUGAACUAUGCAAUUUUCUACCUUACCGGAGACAGAACUCUUCGA",
#' "AUGCUUCCGGCCUGUUCCCUGAGACCUCAAGUGUGAGUGUACUAGCGAUGCUUCACACCUGGGCUCUCCGGGUACCAGGACGGUUUGAGCAGAU",
#' "AAAGUGACCGUACCGAGCUGCAUACUUCCUUACAUGCCCAUACUAUAUCAUAAAUGGAUAUGGAAUGUAAAGAAGUAUGUAGAACGGGGUGGUAGU ",
#' "UAAACAGUAUACAGAAAGCCAUCAAAGCGGUGGUUGAUGUGUUGCAAAUUAUGACUUUCAUAUCACAGCCAGCUUUGAUGUGCUGCCUGUUGCACUGU",
#' "CGGACAAUGCUCGAGAGGCAGUGUGGUUAGCUGGUUGCAUAUUUCCUUGACAACGGCUACCUUCACUGCCACCCCGAACAUGUCGUCCAUCUUUGAA",
#' "UAGCGAUUCAGAUCGAGCCAUUGCUGGUUUCUUCCACAGUAGCGAUUUCCAUUAGAACUAUCACCGGGUGGAAACUAGCAGUGGCUCGAUUAGCGAU",
#' "UGAGGUAGUAGGUUGUAUAGUU", "CUAUGCAUAGCGAUACCUUACC", "UCCCUGAGACCUCAAGUGUGA", "ACACCUGGGCUCUCCGGGUACC",
#' "UAGCGAUCCUUACAUGCCCAUA", "UCGUGUCCGUUUCUCGUUUCGA", "ACAAUAAUCGGACACUAGCGAU", "CACACCGGACGAGAUUUCAU",
#' "UACCGGGCGUGGGGAGGGCAGG", "UAGCGAUUCCUUCUUAGCGAU")
#' miRNA_Name <- c("hsa-let-7","cel-lin-4","aae-mir-1","bol-mir-2","rno-mir-34","cel-mir-35",
#' "cel-let-7-5p", "cel-let-7-3p", "cel-lin-4-5p", "cel-lin-4-3p", "cel-miR-1-5p", "ame-miR-9895",
#' "hsa-miR-9896", "rno-miR-3478", "mmu-miR9897-5p", "cre-miR9897-3p")
#' miRNA_type <- c("IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN",
#' "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE")
#' MiRNADT<- data.frame(cbind(Sequence, miRNA_Name, miRNA_type))
#' MiRNASpeciesAnnot(MiRNADT)
MiRNASpeciesAnnot<- function(MiRNADT){
MiRNADT$miRNA_Name <- toupper(MiRNADT$miRNA_Name)
pb <- txtProgressBar(min = 0, max = length(MiRNADT$miRNA_Name), style = 3)
SP <- NULL
for(i in 1:length(MiRNADT$miRNA_Name)){
spl <- strsplit(MiRNADT$miRNA_Name[i], split = "-")[[1]]
SP[i] <- spl[1]
}
SP <- toupper(SP)
Common_Name <- NULL
Scientific_Name <- NULL
for(i in 1:nrow(MiRNADT)){
if(SP[i]== "AAU"){
Scientific_Name[i] <- "Acacia_Auriculiformis"
Common_Name[i] <- "Acacia_Auriculiformis"
}
else if(SP[i]== "AAE"){
Scientific_Name[i] <- "Aedes_Aegypti"
Common_Name[i] <- "mosquito"
}
else if(SP[i]== "ABU"){
Scientific_Name[i] <- "Astatotilapia burtoni"
Common_Name[i] <- "Astatotilapia burtoni"
}
else if(SP[i]== "ACA"){
Scientific_Name[i] <- "Anolis_carolinensis"
Common_Name[i] <- "Lizard"
}
else if(SP[i]== "AGA"){
Scientific_Name[i] <- "Anopheles gambiae"
Common_Name[i] <- "mosquito"
}
else if(SP[i]== "AHY"){
Scientific_Name[i] <- "Arachis hypogaea"
Common_Name[i] <- "Peanut"
}
else if(SP[i]== "AJA"){
Scientific_Name[i] <- "Artibeus jamaicensis"
Common_Name[i] <- "Bat"
}
else if(SP[i]== "AMA"){
Scientific_Name[i] <- "Avicennia marina"
Common_Name[i] <- "mangrove"
}
else if(SP[i]== "AMG"){
Scientific_Name[i] <- "Acacia mangium"
Common_Name[i] <- "Forest Mangrove"
}
else if(SP[i]== "AOF"){
Scientific_Name[i] <- "Asparagus officinalis"
Common_Name[i] <- "Asparagus"
}
else if(SP[i]== "API"){
Scientific_Name[i] <- "Acyrthosiphon pisum"
Common_Name[i] <- "pea aphid"
}
else if(SP[i]== "APL"){
Scientific_Name[i] <- "Anas platyrhynchos"
Common_Name[i] <- "mallard"
}
else if(SP[i]== "AQC"){
Scientific_Name[i] <- "Aquilegia caerulea"
Common_Name[i] <- "Colorado Blue Columbine"
}
else if(SP[i]== "AQU"){
Scientific_Name[i] <- "Amphimedon queenslandica"
Common_Name[i] <- "sponge"
}
else if(SP[i]== "ASU"){
Scientific_Name[i] <- "Ascaris suum"
Common_Name[i] <- "roundworm of pigs"
}
else if(SP[i]== "ATA"){
Scientific_Name[i] <- "Aegilops Tauschii"
Common_Name[i] <- "Goatgrass"
}
else if(SP[i]== "ATR"){
Scientific_Name[i] <- "Amborella trichopoda"
Common_Name[i] <- "Amborella"
}
else if(SP[i]== "AME"){
Scientific_Name[i] <- "Apis mellifera"
Common_Name[i] <- "Honey bee"
}
else if(SP[i]== "AMI"){
Scientific_Name[i] <- "Alligator mississippiensis"
Common_Name[i] <- "Alligator"
}
else if(SP[i]== "ALY"){
Scientific_Name[i] <- "Arabidopsis lyrata"
Common_Name[i] <- "Arabidopsis lyrata"
}
else if(SP[i]== "ATH"){
Scientific_Name[i] <- "Arabidopsis thaliana"
Common_Name[i] <- "Arabidopsis thaliana"
}
else if(SP[i]== "AGE"){
Scientific_Name[i] <- "Ateles geoffroyi"
Common_Name[i] <- "Geoffroy spider monkey"
}
else if(SP[i]== "BCY"){
Scientific_Name[i] <- "Bruguiera cylindrica"
Common_Name[i] <- "mangrove"
}
else if(SP[i]== "BFV"){
Scientific_Name[i] <- "Bovine foamy virus"
Common_Name[i] <- "Bovine foamy virus"
}
else if(SP[i]== "BGY"){
Scientific_Name[i] <- "Bruguiera gymnorhiza"
Common_Name[i] <- "black mangrove"
}
else if(SP[i]== "BHV1"){
Scientific_Name[i] <- "Bovine herpesvirus 1"
Common_Name[i] <- "Bovine herpesvirus 1"
}
else if(SP[i]== "BHV5"){
Scientific_Name[i] <- "Bovine herpesvirus 5"
Common_Name[i] <- "Bovine herpesvirus 5"
}
else if(SP[i]== "BIB"){
Scientific_Name[i] <- "Biston betularia"
Common_Name[i] <- "peppered moth"
}
else if(SP[i]== "BKV"){
Scientific_Name[i] <- "BK polyomavirus"
Common_Name[i] <- "BK polyomavirus"
}
else if(SP[i]== "BMA"){
Scientific_Name[i] <- "Brugia malayi"
Common_Name[i] <- "nematode"
}
else if(SP[i]=="BMO"){
Scientific_Name[i] <- "Bombyx mori"
Common_Name[i] <- "silkworm"
}
else if(SP[i]=="BOL"){
Scientific_Name[i] <- "Brassica oleracea"
Common_Name[i] <- "wild cabbage"
}
else if(SP[i]=="BPCV1"){
Scientific_Name[i] <- "Bandicoot papillomatosis carcinomatosis virus type 1"
Common_Name[i] <- "Bandicoot papillomatosis carcinomatosis virus type 1"
}
else if(SP[i]=="BPCV2"){
Scientific_Name[i] <- "Bandicoot papillomatosis carcinomatosis virus type 2"
Common_Name[i] <- "Bandicoot papillomatosis carcinomatosis virus type 2"
}
else if(SP[i]=="BRA"){
Scientific_Name[i] <- "Brassica rapa"
Common_Name[i] <- "field mustard"
}
else if(SP[i]=="BTA"){
Scientific_Name[i] <- "Bos_taurus"
Common_Name[i] <- "Cattle"
}
else if(SP[i]== "BDI"){
Scientific_Name[i] <- "Brachypodium distachyon"
Common_Name[i] <- "Brachypodium distachyon"
}
else if(SP[i]== "BBE"){
Scientific_Name[i] <- "Branchiostoma belcheri"
Common_Name[i] <- "Branchiostoma belcheri"
}
else if(SP[i]== "BDO"){
Scientific_Name[i] <- "Bactrocera dorsalis"
Common_Name[i] <- "oriental fruit fly"
}
else if(SP[i]=="BFL"){
Scientific_Name[i] <- "Branchiostoma floridae"
Common_Name[i] <- "Branchiostoma floridae"
}
else if(SP[i]=="BLV"){
Scientific_Name[i] <- "Bovine leukemia virus"
Common_Name[i] <- "Bovine leukemia virus"
}
else if(SP[i]=="BNA"){
Scientific_Name[i] <- "Brassica napus"
Common_Name[i] <- "Rapeseed"
}
else if(SP[i]=="CAS"){
Scientific_Name[i] <- "Camelina sativa"
Common_Name[i] <- "Camelina sativa"
}
else if(SP[i]=="CBN"){
Scientific_Name[i] <- "Caenorhabditis brenneri"
Common_Name[i] <- "nematode"
}
else if(SP[i]=="CBR"){
Scientific_Name[i] <- "Caenorhabditis briggsae"
Common_Name[i] <- "nematode"
}
else if(SP[i]=="CCA"){
Scientific_Name[i] <- "Cynara cardunculus"
Common_Name[i] <- "artichoke thistle"
}
else if(SP[i]=="CCL"){
Scientific_Name[i] <- "Citrus clementina"
Common_Name[i] <- "Clementine"
}
else if(SP[i]=="CCR"){
Scientific_Name[i] <- "Cyprinus carpio"
Common_Name[i] <- "Carp"
}
else if(SP[i]=="CEL"){
Scientific_Name[i] <- "Caenorhabditis_elegans"
Common_Name[i] <- "roundworm"
}
else if(SP[i]=="CFA"){
Scientific_Name[i] <- "Canis_familiaris"
Common_Name[i] <- "Dog"
}
else if(SP[i]=="CGR"){
Scientific_Name[i] <- "Cricetulus griseus"
Common_Name[i] <- "Chinese hamster"
}
else if(SP[i]=="CHI"){
Scientific_Name[i] <- "Capra hircus"
Common_Name[i] <- "Goat"
}
else if(SP[i]=="CIN"){
Scientific_Name[i] <- "Ciona_intestinalis"
Common_Name[i] <- "Ciona intestinalis"
}
else if(SP[i]=="CJA"){
Scientific_Name[i] <- "Callithrix jacchus"
Common_Name[i] <- "Marmoset"
}
else if(SP[i]=="CLA"){
Scientific_Name[i] <- "Cerebratulus lacteus"
Common_Name[i] <- "Atlantic jackknife clam"
}
else if(SP[i]=="CLI"){
Scientific_Name[i] <- "Columba livia"
Common_Name[i] <- "rock dove"
}
else if(SP[i]=="CLN"){
Scientific_Name[i] <- "Cunninghamia lanceolata"
Common_Name[i] <- "evergreen trees"
}
else if(SP[i]=="CME"){
Scientific_Name[i] <- "Cucumis melo"
Common_Name[i] <- "Muskmelon"
}
else if(SP[i]=="CPA"){
Scientific_Name[i] <- "Carica papaya"
Common_Name[i] <- "Papaya"
}
else if(SP[i]=="CPI"){
Scientific_Name[i] <- "Chrysemys picta"
Common_Name[i] <- "painted turtle"
}
else if(SP[i]=="CPO"){
Scientific_Name[i] <- "Cavia porcellus"
Common_Name[i] <- "guinea pig"
}
else if(SP[i]=="CQU"){
Scientific_Name[i] <- "Culex quinquefasciatus"
Common_Name[i] <- "Southern house mosquito"
}
else if(SP[i]=="CRE"){
Scientific_Name[i] <- "Chlamydomonas reinhardtii"
Common_Name[i] <- "green alga"
}
else if(SP[i]=="CRM"){
Scientific_Name[i] <- "Caenorhabditis remanei"
Common_Name[i] <- "nematode"
}
else if(SP[i]=="CRT"){
Scientific_Name[i] <- "Citrus reticulata"
Common_Name[i] <- "mandarin orange"
}
else if(SP[i]=="CSA"){
Scientific_Name[i] <- "Ciona_savignyi"
Common_Name[i] <- "sea squirt"
}
else if(SP[i]=="CSI"){
Scientific_Name[i] <- "Citrus sinensis"
Common_Name[i] <- "Sweet Orange Group"
}
else if(SP[i]=="CST"){
Scientific_Name[i] <- "Cucumis sativus"
Common_Name[i] <- "Cucumber"
}
else if(SP[i]=="CTE"){
Scientific_Name[i] <- "Capitella teleta"
Common_Name[i] <- "polychaete worm"
}
else if(SP[i]=="CTR"){
Scientific_Name[i] <- "Citrus trifoliata"
Common_Name[i] <- "Trifoliate orange"
}
else if(SP[i]=="DAN"){
Scientific_Name[i] <- "Drosophila ananassae"
Common_Name[i] <- "fruit fly"
}
else if(SP[i]=="DDI"){
Scientific_Name[i] <- "Dictyostelium discoideum"
Common_Name[i] <- "slime mold"
}
else if(SP[i]=="DER"){
Scientific_Name[i] <- "Drosophila erecta"
Common_Name[i] <- "fruit fly"
}
else if(SP[i]=="DEV"){
Scientific_Name[i] <- "Duck enteritis virus"
Common_Name[i] <- "Duck enteritis virus"
}
else if(SP[i]=="DGR"){
Scientific_Name[i] <- "Drosophila grimshawi"
Common_Name[i] <- "fruit fly"
}
else if(SP[i]=="DMA"){
Scientific_Name[i] <- "Daubentonia madagascariensis"
Common_Name[i] <- "aye-aye lemur"
}
else if(SP[i]=="DME"){
Scientific_Name[i] <- "Drosophila_melanogaster"
Common_Name[i] <- "Fruit fly"
}
else if(SP[i]=="DMO"){
Scientific_Name[i] <- "Drosophila mojavensis"
Common_Name[i] <- "Fruit fly"
}
else if(SP[i]=="DNO"){
Scientific_Name[i] <- "Dasypus novemcinctus"
Common_Name[i] <- "armadillo"
}
else if(SP[i]=="DPE"){
Scientific_Name[i] <- "Drosophila persimilis"
Common_Name[i] <- "Fruit fly"
}
else if(SP[i]=="DPR"){
Scientific_Name[i] <- "Digitalis purpurea"
Common_Name[i] <- "foxglove"
}
else if(SP[i]=="DPS"){
Scientific_Name[i] <- "Drosophila pseudoobscura"
Common_Name[i] <- "Fruit fly"
}
else if(SP[i]=="DPU"){
Scientific_Name[i] <- "Daphnia pulex"
Common_Name[i] <- "water flea"
}
else if(SP[i]=="DRE"){
Scientific_Name[i] <- "Danio_rerio"
Common_Name[i] <- "Zebrafish"
}
else if(SP[i]=="DSE"){
Scientific_Name[i] <- "Drosophila sechellia"
Common_Name[i] <- "Fruit fly"
}
else if(SP[i]=="DQU"){
Scientific_Name[i] <- "Dinoponera quadriceps"
Common_Name[i] <- "Dinoponera quadriceps"
}
else if(SP[i]== "DSI"){
Scientific_Name[i] <- "Drosophila simulans"
Common_Name[i] <- "Fruit fly"
}
else if(SP[i]== "DVI"){
Scientific_Name[i] <- "Drosophila virilis"
Common_Name[i] <- "Fruit fly"
}
else if(SP[i]== "DWI"){
Scientific_Name[i] <- "Drosophila willistoni"
Common_Name[i] <- "Fruit fly"
}
else if(SP[i]== "DYA"){
Scientific_Name[i] <- "Drosophila yakuba"
Common_Name[i] <- "Fruit fly"
}
else if(SP[i]== "EBV"){
Scientific_Name[i] <- "Epstein Barr virus"
Common_Name[i] <- "Epstein Barr virus"
}
else if(SP[i]== "EGR"){
Scientific_Name[i] <- "Echinococcus granulosus"
Common_Name[i] <- "Hyper Tape-worm"
}
else if(SP[i]== "EGU"){
Scientific_Name[i] <- "Elaeis guineensis"
Common_Name[i] <- "African oil palm"
}
else if(SP[i]== "EMU"){
Scientific_Name[i] <- "Echinococcus multilocularis"
Common_Name[i] <- "tapeworm"
}
else if(SP[i]=="ESI"){
Scientific_Name[i] <- "Ectocarpus siliculosus"
Common_Name[i] <- "Brown alga"
}
else if(SP[i]=="EUN"){
Scientific_Name[i] <- "Eugenia uniflora"
Common_Name[i] <- "pitanga"
}
else if(SP[i]=="EEL"){
Scientific_Name[i] <- "Electrophorus electricus"
Common_Name[i] <- "electric eel"
}
else if(SP[i]=="EFU"){
Scientific_Name[i] <- "Eptesicus fuscus"
Common_Name[i] <- "Big brown bat"
}
else if(SP[i]=="ECA"){
Scientific_Name[i] <-"Equus_caballus"
Common_Name[i] <- "Horse"
}
else if(SP[i]=="FAR"){
Scientific_Name[i] <-"Festuca arundinacea"
Common_Name[i] <- "Grass"
}
else if(SP[i]=="FHE"){
Scientific_Name[i] <-"Fasciola hepatica"
Common_Name[i] <- "liver fluke"
}
else if(SP[i]=="FRU"){
Scientific_Name[i] <-"Festuca arundinacea"
Common_Name[i] <- "pufferfish"
}
else if(SP[i]=="FVE"){
Scientific_Name[i] <-"Fragaria vesca"
Common_Name[i] <- "strawberry"
}
else if(SP[i]=="GAR"){
Scientific_Name[i] <- "Gossypium_arboreum"
Common_Name[i] <- "tree cotton"
}
else if(SP[i]=="GGA"){
Scientific_Name[i] <- "Gallus_gallus"
Common_Name[i] <- "Chicken"
}
else if(SP[i]=="GGO"){
Scientific_Name[i] <- "Gorilla_gorilla"
Common_Name[i] <- "Gorilla"
}
else if(SP[i]=="GHR"){
Scientific_Name[i] <- "Gossypium hirsutum"
Common_Name[i] <- "cotton"
}
else if(SP[i]=="GMA"){
Scientific_Name[i] <- "Glycine max"
Common_Name[i] <- "Soybean"
}
else if(SP[i]=="GMO"){
Scientific_Name[i] <- "Gadus morhua"
Common_Name[i] <- "Atlantic cod"
}
else if(SP[i]=="GPY"){
Scientific_Name[i] <- "Glottidia pyramidata"
Common_Name[i] <- "Glottidia pyramidata"
}
else if(SP[i]=="GRA"){
Scientific_Name[i] <- "Gossypium raimondii"
Common_Name[i] <- "Cotton plant"
}
else if(SP[i]=="GSA"){
Scientific_Name[i] <- "Gyrodactylus salaris"
Common_Name[i] <- "monogenean ectoparasite"
}
else if(SP[i]=="GSO"){
Scientific_Name[i] <- "Glycine soja"
Common_Name[i] <- "soybean"
}
else if(SP[i]=="HAN"){
Scientific_Name[i] <- "Helianthus annuus"
Common_Name[i] <- "sunflower"
}
else if(SP[i]=="HAR"){
Scientific_Name[i] <- "Helianthus annuus"
Common_Name[i] <- "sunflower"
}
else if(SP[i]=="HBR"){
Scientific_Name[i] <- "Hevea brasiliensis"
Common_Name[i] <- "rubber tree"
}
else if(SP[i]=="HBV"){
Scientific_Name[i] <- "Herpes B virus"
Common_Name[i] <- "Herpes B virus"
}
else if(SP[i]=="HCI"){
Scientific_Name[i] <- "Helianthus ciliaris"
Common_Name[i] <- "sunflower"
}
else if(SP[i]=="HCMV"){
Scientific_Name[i] <- "Human cytomegalovirus"
Common_Name[i] <- "Human cytomegalovirus"
}
else if(SP[i]=="HCO"){
Scientific_Name[i] <- "Haemonchus contortus"
Common_Name[i] <- "Barber's pole worm"
}
else if(SP[i]=="HEX"){
Scientific_Name[i] <- "Helianthus exilis"
Common_Name[i] <- "sunflower"
}
else if(SP[i]=="HHI"){
Scientific_Name[i] <- "Hippoglossus hippoglossus"
Common_Name[i] <- "Atlantic halibut"
}
else if(SP[i]=="HHV6B"){
Scientific_Name[i] <- "Human herpesvirus 6B"
Common_Name[i] <- "Human herpesvirus 6B"
}
else if(SP[i]=="HIV1"){
Scientific_Name[i] <- "Human immunodeficiency virus 1"
Common_Name[i] <- "Human immunodeficiency virus 1"
}
else if(SP[i]=="HMA"){
Scientific_Name[i] <- "Hydra magnipapillata"
Common_Name[i] <- "fresh-water polyp"
}
else if(SP[i]=="HME"){
Scientific_Name[i] <- "Heliconius melpomene"
Common_Name[i] <- "postman butterfly"
}
else if(SP[i]=="HPA"){
Scientific_Name[i] <- "Helianthus paradoxus"
Common_Name[i] <- "sunflower"
}
else if(SP[i]=="HPE"){
Scientific_Name[i] <- "Helianthus petiolaris"
Common_Name[i] <- "sunflower"
}
else if(SP[i]=="HPO"){
Scientific_Name[i] <- "Heligmosomoides polygyrus"
Common_Name[i] <- "Heligmosomoides polygyrus"
}
else if(SP[i]=="HRU"){
Scientific_Name[i] <- "Haliotis rufescens"
Common_Name[i] <- "sea snail"
}
else if(SP[i]=="HSA"){
Scientific_Name[i] <- "Homo_sapiens"
Common_Name[i] <- "Human"
}
else if(SP[i]=="HSV1"){
Scientific_Name[i] <- "Herpes simplex sirus 1"
Common_Name[i] <- "Herpes simplex virus 1"
}
else if(SP[i]=="HSV2"){
Scientific_Name[i] <- "Herpes simplex virus 2"
Common_Name[i] <- "Herpes simplex virus 2"
}
else if(SP[i]=="HTU"){
Scientific_Name[i] <- "Helianthus tuberosus"
Common_Name[i] <- "Jerusalem artichoke"
}
else if(SP[i]=="HVSA"){
Scientific_Name[i] <- "Herpes virus saimiri strain A11"
Common_Name[i] <- "Herpes virus saimiri strain A11"
}
else if(SP[i]=="HVT"){
Scientific_Name[i] <- "Herpesvirus of Turkeys"
Common_Name[i] <- "Herpesvirus of Turkeys"
}
else if(SP[i]=="HVU"){
Scientific_Name[i] <- "Hordeum vulgare"
Common_Name[i] <- "Barley"
}
else if(SP[i]=="ILTV"){
Scientific_Name[i] <- "Infectious laryngotracheitis"
Common_Name[i] <- "Gallid herpesvirus 1"
}
else if(SP[i]=="IPU"){
Scientific_Name[i] <- "Ictalurus punctatus"
Common_Name[i] <- "Channel catfish"
}
else if(SP[i]=="ISC"){
Scientific_Name[i] <- "Ixodes scapularis"
Common_Name[i] <- "deer tick"
}
else if(SP[i]=="JCV"){
Scientific_Name[i] <- "JC polyomavirus"
Common_Name[i] <- "JC virus"
}
else if(SP[i]=="KSHV"){
Scientific_Name[i] <- "Kaposi sarcoma-associated herpesvirus"
Common_Name[i] <- "herpes virus"
}
else if(SP[i]=="LCA"){
Scientific_Name[i] <- "Lemur catta"
Common_Name[i] <- "ring-tailed lemur"
}
else if(SP[i]=="LCO"){
Scientific_Name[i] <- "Leucosolenia complicata"
Common_Name[i] <- "Leucosolenia complicata"
}
else if(SP[i]=="LGI"){
Scientific_Name[i] <- "Lottia gigantea"
Common_Name[i] <- "owl limpet"
}
else if(SP[i]=="LJA"){
Scientific_Name[i] <- "Lotus japonicus"
Common_Name[i] <- "legume"
}
else if(SP[i]=="LLA"){
Scientific_Name[i] <- "Lagothrix lagotricha"
Common_Name[i] <- "Brown woolly monkey"
}
else if(SP[i]=="LMI"){
Scientific_Name[i] <- "Locusta migratoria"
Common_Name[i] <- "Migratory locust"
}
else if(SP[i]=="LUS"){
Scientific_Name[i] <- "Linum usitatissimum"
Common_Name[i] <- "Flax"
}
else if(SP[i]=="LVA"){
Scientific_Name[i] <- "Lytechinus variegatus"
Common_Name[i] <- "green sea urchin"
}
else if(SP[i]=="MCMV"){
Scientific_Name[i] <- "Mouse cytomegalovirus"
Common_Name[i] <- "Mouse cytomegalovirus"
}
else if(SP[i]=="MCO"){
Scientific_Name[i] <- "Mesocestoides corti"
Common_Name[i] <- "Mesocestoides corti"
}
else if(SP[i]=="MCV"){
Scientific_Name[i] <- "Merkel cell polyomavirus"
Common_Name[i] <- "Merkel cell polyomavirus"
}
else if(SP[i]=="MDM"){
Scientific_Name[i] <- "Malus domestica"
Common_Name[i] <- "Apple"
}
else if(SP[i]=="MDO"){
Scientific_Name[i] <- "Monodelphis_domestica"
Common_Name[i] <- "Gray short-tailed opossum"
}
else if(SP[i]=="MDV1"){
Scientific_Name[i] <- "Mareks disease virus"
Common_Name[i] <- "Mareks disease virus"
}
else if(SP[i]=="MDV2"){
Scientific_Name[i] <- "Mareks disease virus type 2"
Common_Name[i] <- "Mareks disease virus type 2"
}
else if(SP[i]=="MES"){
Scientific_Name[i] <- "Manihot esculenta"
Common_Name[i] <- "yuca"
}
else if(SP[i]=="MEu"){
Scientific_Name[i] <- " Macropus eugenii"
Common_Name[i] <- "dama wallaby"
}
else if(SP[i]=="MGHV"){
Scientific_Name[i] <- "Mouse gammaherpesvirus 68"
Common_Name[i] <- "Mouse gammaherpesvirus 68"
}
else if(SP[i]=="MJA"){
Scientific_Name[i] <- "Marsupenaeus japonicu"
Common_Name[i] <- "Marsupenaeus japonicu"
}
else if(SP[i]=="MLE"){
Scientific_Name[i] <- "Melibe leonina"
Common_Name[i] <- "hooded nudibranch"
}
else if(SP[i]=="MML"){
Scientific_Name[i] <- "Macaca_mulatta"
Common_Name[i] <- "Rhesus macaque"
}
else if(SP[i]=="MMR"){
Scientific_Name[i] <- "Microcebus murinus"
Common_Name[i] <- "gray mouse lemur"
}
else if(SP[i]=="MMU"){
Scientific_Name[i] <- "Mus_musculus"
Common_Name[i] <- "Mouse"
}
else if(SP[i]=="MNE"){
Scientific_Name[i] <- "Macaca nemestrina"
Common_Name[i] <- "Southern pig-tailed macaque"
}
else if(SP[i]=="MSE"){
Scientific_Name[i] <- "Manduca sexta"
Common_Name[i] <- "Carolina sphinx moth"
}
else if(SP[i]=="MTR"){
Scientific_Name[i] <- "Medicago truncatula"
Common_Name[i] <- "Barrelclover"
}
else if(SP[i]=="MZE"){
Scientific_Name[i] <- "Metriaclima zebra"
Common_Name[i] <- "Tilapia zebra"
}
else if(SP[i]=="NBR"){
Scientific_Name[i] <- "Neolamprologus brichardi"
Common_Name[i] <- "cichlid"
}
else if(SP[i]=="NGI"){
Scientific_Name[i] <- "Nasonia giraulti"
Common_Name[i] <- "Nasonia"
}
else if(SP[i]=="NLE"){
Scientific_Name[i] <- "Nomascus leucogenys"
Common_Name[i] <- "white-cheeked gibbon"
}
else if(SP[i]=="NLO"){
Scientific_Name[i] <- "Nasonia longicornis"
Common_Name[i] <- "Nasonia"
}
else if(SP[i]=="NTA"){
Scientific_Name[i] <- "Nicotiana tabacum"
Common_Name[i] <- "Tobacco"
}
else if(SP[i]=="NVE"){
Scientific_Name[i] <- "Nematostella vectensis"
Common_Name[i] <- "Starlet sea anemone"
}
else if(SP[i]=="NVI"){
Scientific_Name[i] <- "Nasonia vitripennis"
Common_Name[i] <- "parasitoid wasps"
}
else if(SP[i]=="OAN"){
Scientific_Name[i] <- "Ornithorhynchus_anatinus"
Common_Name[i] <- "Platypus"
}
else if(SP[i]=="OAR"){
Scientific_Name[i] <- "Ovis_aries"
Common_Name[i] <- "Sheep"
}
else if(SP[i]=="OCU"){
Scientific_Name[i] <- "Oryctolagus_cuniculus"
Common_Name[i] <- "rabbit"
}
else if(SP[i]=="ODI"){
Scientific_Name[i] <- "Oikopleura dioica"
Common_Name[i] <- "Oikopleura dioica"
}
else if(SP[i]=="OGA"){
Scientific_Name[i] <- "Otolemur garnettii"
Common_Name[i] <- "northern greater galago"
}
else if(SP[i]=="OHA"){
Scientific_Name[i] <- "Ophiophagus hannah"
Common_Name[i] <- "King cobra"
}
else if(SP[i]=="OLA"){
Scientific_Name[i] <- "Oryzias_latipes"
Common_Name[i] <- "Japanese rice fish"
}
else if(SP[i]=="ONI"){
Scientific_Name[i] <- "Oreochromis niloticus"
Common_Name[i] <- "Nile tilapia"
}
else if(SP[i]=="OSA"){
Scientific_Name[i] <- "Oryza sativa"
Common_Name[i] <- "Asian rice"
}
else if(SP[i]=="PAB"){
Scientific_Name[i] <- "Picea abies"
Common_Name[i] <- "Norway spruce"
}
else if(SP[i]=="PAL"){
Scientific_Name[i] <- "Pteropus alecto"
Common_Name[i] <- "black fruit bat"
}
else if(SP[i]=="PBI"){
Scientific_Name[i] <- "Pygathrix bieti"
Common_Name[i] <- "Pygathrix bieti"
}
else if(SP[i]=="PBV"){
Scientific_Name[i] <- "Python bivittatus"
Common_Name[i] <- "Python"
}
else if(SP[i]=="PCA"){
Scientific_Name[i] <- "Polistes canadensis"
Common_Name[i] <- "red paper wasp"
}
else if(SP[i]=="PDE"){
Scientific_Name[i] <- "Pinus densata"
Common_Name[i] <- "Sikang pine"
}
else if(SP[i]=="PEU"){
Scientific_Name[i] <- "Populus euphratica"
Common_Name[i] <- "Euphrates poplar"
}
else if(SP[i]=="PGI"){
Scientific_Name[i] <- "Panax ginseng"
Common_Name[i] <- "ginseng"
}
else if(SP[i]=="PHA"){
Scientific_Name[i] <- "Papio hamadryas"
Common_Name[i] <- "hamadryas baboon"
}
else if(SP[i]=="PIN"){
Scientific_Name[i] <- "Phytophthora infestans"
Common_Name[i] <- "potato blight"
}
else if(SP[i]=="PLA"){
Scientific_Name[i] <- "Paeonia lactiflora"
Common_Name[i] <- "Paeonia lactiflora"
}
else if(SP[i]=="POL"){
Scientific_Name[i] <- "Paralichthys olivaceus"
Common_Name[i] <- "halibut"
}
else if(SP[i]=="PMA"){
Scientific_Name[i] <- "Petromyzon_marinus"
Common_Name[i] <- "Sea lamprey"
}
else if(SP[i]=="PMI"){
Scientific_Name[i] <- "Patiria miniata"
Common_Name[i] <- "Bat star"
}
else if(SP[i]=="PNY"){
Scientific_Name[i] <- "Pundamilia nyererei"
Common_Name[i] <- "Pundamilia nyererei"
}
else if(SP[i]=="PPA"){
Scientific_Name[i] <- "Pan paniscus"
Common_Name[i] <- "bonobo chimpanzee"
}
else if(SP[i]=="PPC"){
Scientific_Name[i] <- "Pristionchus pacificus"
Common_Name[i] <- "Nematode"
}
else if(SP[i]=="PPE"){
Scientific_Name[i] <- "Prunus persica"
Common_Name[i] <- "Peach"
}
else if(SP[i]=="PPT"){
Scientific_Name[i] <- "Physcomitrella patens"
Common_Name[i] <- "Spreading earthmoss"
}
else if(SP[i]=="PPY"){
Scientific_Name[i] <- "Pongo pygmaeus"
Common_Name[i] <- "Bornean orangutan"
}
else if(SP[i]=="PRA"){
Scientific_Name[i] <- "Phytophthora ramorum"
Common_Name[i] <- "Phytophthora ramorum"
}
else if(SP[i]=="PRD"){
Scientific_Name[i] <- "Panagrellus redivivus"
Common_Name[i] <- "sour paste nematode"
}
else if(SP[i]=="PRV"){
Scientific_Name[i] <- "Pseudorabies virus"
Common_Name[i] <- "Pseudorabies virus"
}
else if(SP[i]=="PSJ"){
Scientific_Name[i] <- "Phytophthora sojae"
Common_Name[i] <- "Phytophthora sojae"
}
else if(SP[i]=="PTA"){
Scientific_Name[i] <- "Pinus taeda"
Common_Name[i] <- "loblolly pine"
}
else if(SP[i]=="PTC"){
Scientific_Name[i] <- "Populus trichocarpa"
Common_Name[i] <- "Black cottonwood"
}
else if(SP[i]=="PTE"){
Scientific_Name[i] <- "Parasteatoda tepidariorum"
Common_Name[i] <- "house spider"
}
else if(SP[i]=="PTI"){
Scientific_Name[i] <- "Phaeodactylum tricornutum"
Common_Name[i] <- "diatom"
}
else if(SP[i]=="PTR"){
Scientific_Name[i] <- "Pan_troglodytes"
Common_Name[i] <- "Chimpanzee"
}
else if(SP[i]=="PVU"){
Scientific_Name[i] <- "Phaseolus vulgaris"
Common_Name[i] <- "Bean"
}
else if(SP[i]=="PXY"){
Scientific_Name[i] <- "Plutella xylostella"
Common_Name[i] <- "Diamondback mot"
}
else if(SP[i]=="RCO"){
Scientific_Name[i] <- "Ricinus communis"
Common_Name[i] <- "castor-oil-plant"
}
else if(SP[i]=="RGL"){
Scientific_Name[i] <- "Rehmannia glutinosa"
Common_Name[i] <- "Rehmannia glutinosa"
}
else if(SP[i]=="RLCV"){
Scientific_Name[i] <- "Rhesus lymphocryptovirus"
Common_Name[i] <- "gamma-1 herpesvirus"
}
else if(SP[i]=="RMI"){
Scientific_Name[i] <- "Rhipicephalus microplus"
Common_Name[i] <- "cattle tick"
}
else if(SP[i]=="RNO"){
Scientific_Name[i] <- "Rattus_norvegicus"
Common_Name[i] <- "Rat"
}
else if(SP[i]=="RRV"){
Scientific_Name[i] <- "Rhesus monkey rhadinovirus"
Common_Name[i] <- "Rhesus monkey rhadinovirus"
}
else if(SP[i]=="SBI"){
Scientific_Name[i] <- "Sorghum bicolor"
Common_Name[i] <- "Sorghum grass"
}
else if(SP[i]=="SBO"){
Scientific_Name[i] <- "Saimiri boliviensis"
Common_Name[i] <- "black-capped squirrel monkey"
}
else if(SP[i]=="SCI"){
Scientific_Name[i] <- "Sycon ciliatum"
Common_Name[i] <- "calcareous sponge"
}
else if(SP[i]=="SEU"){
Scientific_Name[i] <- "Salicornia europaea"
Common_Name[i] <- "glasswort"
}
else if(SP[i]=="SFR"){
Scientific_Name[i] <- "Spodoptera frugiperda"
Common_Name[i] <- "fall armyworm"
}
else if(SP[i]=="SHA"){
Scientific_Name[i] <- "Sarcophilus_harrisii"
Common_Name[i] <- "Tasmanian devil"
}
else if(SP[i]=="SJA"){
Scientific_Name[i] <- "Schistosoma japonicum"
Common_Name[i] <- "Schistosoma japonicum"
}
else if(SP[i]=="SKO"){
Scientific_Name[i] <- "Saccoglossus kowalevskii"
Common_Name[i] <- "Acorn worm"
}
else if(SP[i]=="SLA"){
Scientific_Name[i] <- "Saguinus labiatus"
Common_Name[i] <- "White-lipped tamarin"
}
else if(SP[i]=="SLY"){
Scientific_Name[i] <- "Solanum lycopersicum"
Common_Name[i] <- "Tomato"
}
else if(SP[i]=="SMA"){
Scientific_Name[i] <- "Schistosoma mansoni"
Common_Name[i] <- "Trematode"
}
else if(SP[i]=="SMC"){
Scientific_Name[i] <- "Symbiodinium microadriaticum"
Common_Name[i] <- "zooxanthellae"
}
else if(SP[i]=="SME"){
Scientific_Name[i] <- "Schmidtea mediterranea"
Common_Name[i] <- "Schmidtea mediterranea"
}
else if(SP[i]=="SMO"){
Scientific_Name[i] <- "Selaginella moellendorffii"
Common_Name[i] <- "Selaginella moellendorffii"
}
else if(SP[i]=="SMR"){
Scientific_Name[i] <- "Strigamia maritima"
Common_Name[i] <- "Strigamia maritima"
}
else if(SP[i]=="SOF"){
Scientific_Name[i] <- "Saccharum officinarum"
Common_Name[i] <- "Grass"
}
else if(SP[i]=="SPU"){
Scientific_Name[i] <- "Strongylocentrotus purpuratus"
Common_Name[i] <- "Purple sea urchin"
}
else if(SP[i]=="SSA"){
Scientific_Name[i] <- "Salmo salar"
Common_Name[i] <- "Atlantic salmon"
}
else if(SP[i]=="SSC"){
Scientific_Name[i] <- "Sus_scrofa"
Common_Name[i] <- "Wild boar"
}
else if(SP[i]=="SSL"){
Scientific_Name[i] <- "Salvia sclarea"
Common_Name[i] <- "clary sage"
}
else if(SP[i]=="SSP"){
Scientific_Name[i] <- "Saccharum spontaneum"
Common_Name[i] <- "wild sugarcane"
}
else if(SP[i]=="SSY"){
Scientific_Name[i] <- "Symphalangus syndactylus"
Common_Name[i] <- "black-furred gibbon"
}
else if(SP[i]=="STR"){
Scientific_Name[i] <- "Strongyloides ratti"
Common_Name[i] <- "Strongyloides ratti"
}
else if(SP[i]=="STU"){
Scientific_Name[i] <- "Solanum tuberosum"
Common_Name[i] <- "Potato"
}
else if(SP[i]=="SV40"){
Scientific_Name[i] <- "Simian virus 40"
Common_Name[i] <- "Simian virus 40"
}
else if(SP[i]=="TAE"){
Scientific_Name[i] <- "Triticum aestivum"
Common_Name[i] <- "Common wheat"
}
else if(SP[i]=="TCA"){
Scientific_Name[i] <- "Tribolium castaneum"
Common_Name[i] <- "red flour beetle"
}
else if(SP[i]=="TCC"){
Scientific_Name[i] <- "Theobroma cacao"
Common_Name[i] <- "cocoa tree"
}
else if(SP[i]=="TCF"){
Scientific_Name[i] <- "Triops cancriformis"
Common_Name[i] <- "tadpole shrimp"
}
else if(SP[i]=="TCH"){
Scientific_Name[i] <- "Tupaia chinensis"
Common_Name[i] <- "Tupaia chinensis"
}
else if(SP[i]=="TGU"){
Scientific_Name[i] <- "Taeniopygia_guttata"
Common_Name[i] <- "Zebra finch"
}
else if(SP[i]=="TNI"){
Scientific_Name[i] <- "Tetraodon_nigroviridis"
Common_Name[i] <- "green spotted puffer"
}
else if(SP[i]=="TRE"){
Scientific_Name[i] <- "Terebratulina retusa"
Common_Name[i] <- "Terebratulina retusa"
}
else if(SP[i]=="TTU"){
Scientific_Name[i] <- "Triticum turgidum"
Common_Name[i] <- "pasta wheat"
}
else if(SP[i]=="TUR"){
Scientific_Name[i] <- "Tetranychus urticae"
Common_Name[i] <- "red spider mite"
}
else if(SP[i]=="VCA"){
Scientific_Name[i] <- "Vriesea carinata"
Common_Name[i] <- "Vriesea carinata"
}
else if(SP[i]=="VUN"){
Scientific_Name[i] <- "Vigna unguiculata"
Common_Name[i] <- "Cowpea"
}
else if(SP[i]=="VVI"){
Scientific_Name[i] <- "Vitis vinifera"
Common_Name[i] <- "Grape vine"
}
else if(SP[i]=="XBO"){
Scientific_Name[i] <- "Xenoturbella bocki"
Common_Name[i] <- "Xenoturbella bocki"
}
else if(SP[i]=="XLA"){
Scientific_Name[i] <- "Xenopus laevis"
Common_Name[i] <- "African clawed frog"
}
else if(SP[i]=="XTR"){
Scientific_Name[i] <- "Xenopus_tropicalis"
Common_Name[i] <- "Western clawed frog"
}
else if(SP[i]=="ZMA"){
Scientific_Name[i] <- "Zea Mays"
Common_Name[i] <- "Corn"
}
else{
Scientific_Name[i] <- "NA"
Common_Name[i] <- "NA"
}
setTxtProgressBar(pb, i)
}
close(pb)
MiRNADT$Common_Name <- Common_Name
MiRNADT$Scientific_Name <- Scientific_Name
return(MiRNADT)
}
#' MiRNAname
#'
#' Removes the species designation from the miRNA name and adds the result as a new column.
#' Requires a data.frame or data.table with the column housing the miRNA labelled "miRNA_Name".
#'
#' Names are converted to upper case and split on "-". The leading (species) token is dropped and
#' the remaining tokens are joined again with "-". For names made of two or three tokens, a fused
#' family token such as "MIR9897" is written as "MIR-9897". Names that are NA, or that contain no
#' "-" separated species token, yield NA.
#'
#' @param miDT a data.frame or data.table with one column labeled miRNA_Name
#' @return \code{miDT} with the "miRNA_Name" column converted to upper case and an additional character
#' column "miRNA" holding the miRNA name without its species prefix.
#' @author Brendan Gongol
#' @export
#' @examples
#'miRNA_Name <- c("hsa-let-7","cel-lin-4","aae-mir-1","bol-mir-2","rno-mir-34","cel-mir-35",
#' "cel-let-7-5p", "cel-let-7-3p", "cel-lin-4-5p", "cel-lin-4-3p", "cel-miR-1-5p", "ame-miR-9895",
#' "hsa-miR-9896", "rno-miR-3478", "mmu-miR9897-5p", "cre-miR9897-3p")
#'MiRNADT<- data.frame(cbind(miRNA_Name))
#'MiRNAname(MiRNADT)
#'
#'Sequence <- c("UAGCGAUUGGAUCCGGUGAGGUAGUAGGUUGUAUAGUUUGGAAUAUUACCACCGGUGAACUAUGCAAUUUUCUACCUUACCGGAGACAGAACUCUUCGA",
#' "AUGCUUCCGGCCUGUUCCCUGAGACCUCAAGUGUGAGUGUACUAGCGAUGCUUCACACCUGGGCUCUCCGGGUACCAGGACGGUUUGAGCAGAU",
#' "AAAGUGACCGUACCGAGCUGCAUACUUCCUUACAUGCCCAUACUAUAUCAUAAAUGGAUAUGGAAUGUAAAGAAGUAUGUAGAACGGGGUGGUAGU ",
#' "UAAACAGUAUACAGAAAGCCAUCAAAGCGGUGGUUGAUGUGUUGCAAAUUAUGACUUUCAUAUCACAGCCAGCUUUGAUGUGCUGCCUGUUGCACUGU",
#' "CGGACAAUGCUCGAGAGGCAGUGUGGUUAGCUGGUUGCAUAUUUCCUUGACAACGGCUACCUUCACUGCCACCCCGAACAUGUCGUCCAUCUUUGAA",
#' "UAGCGAUUCAGAUCGAGCCAUUGCUGGUUUCUUCCACAGUAGCGAUUUCCAUUAGAACUAUCACCGGGUGGAAACUAGCAGUGGCUCGAUUAGCGAU",
#' "UGAGGUAGUAGGUUGUAUAGUU", "CUAUGCAUAGCGAUACCUUACC", "UCCCUGAGACCUCAAGUGUGA", "ACACCUGGGCUCUCCGGGUACC",
#' "UAGCGAUCCUUACAUGCCCAUA", "UCGUGUCCGUUUCUCGUUUCGA", "ACAAUAAUCGGACACUAGCGAU", "CACACCGGACGAGAUUUCAU",
#' "UACCGGGCGUGGGGAGGGCAGG", "UAGCGAUUCCUUCUUAGCGAU")
#'miRNA_Name <- c("hsa-let-7","cel-lin-4","aae-mir-1","bol-mir-2","rno-mir-34","cel-mir-35",
#' "cel-let-7-5p", "cel-let-7-3p", "cel-lin-4-5p", "cel-lin-4-3p", "cel-miR-1-5p", "ame-miR-9895",
#' "hsa-miR-9896", "rno-miR-3478", "mmu-miR9897-5p", "cre-miR9897-3p")
#'miRNA_type <- c("IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN","IMMATURE_HAIR_PIN",
#' "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE")
#'MiRNADT<- data.frame(cbind(Sequence, miRNA_Name, miRNA_type))
#'MiRNAname(MiRNADT)
MiRNAname <- function(miDT){
  # Upper-case once so every comparison below is case-insensitive;
  # toupper() also turns a factor column into plain character.
  miDT$miRNA_Name <- toupper(miDT$miRNA_Name)
  n_rows <- nrow(miDT)
  # Pre-filled with NA so names that cannot be parsed stay NA instead of garbage.
  NAM <- rep(NA_character_, n_rows)
  # txtProgressBar() refuses max == min, so handle the empty table up front.
  if(n_rows == 0){
    miDT$miRNA <- NAM
    return(miDT)
  }
  pb <- txtProgressBar(min = 0, max = n_rows, style = 3)
  for(i in seq_len(n_rows)){
    spl <- strsplit(miDT$miRNA_Name[i], split = "-")[[1]]
    # At least a species token plus one more token is needed.
    if(length(spl) >= 2){
      rest <- spl[-1]
      # Short names may carry a fused family token ("MIR9897"); write it as
      # "MIR-9897" so it lines up with the hyphenated spelling. Longer names
      # (four or more tokens) are kept as written.
      if(length(spl) <= 3 && grepl("^MIR.", rest[1])){
        rest <- c("MIR", sub("^MIR", "", rest[1]), rest[-1])
      }
      NAM[i] <- paste(rest, collapse = "-")
    }
    setTxtProgressBar(pb, i)
  }
  close(pb)
  miDT$miRNA <- NAM
  return(miDT)
}
#' MISeed
#'
#' Adds a column to a data.table containing the seed sequence of the miRNA (nucleotides 2 through 8).
#' Requires a column named "Sequence" housing the miRNA sequence.
#'
#' Sequences shorter than eight nucleotides yield whatever nucleotides exist from position 2 onward
#' (possibly an empty string); an NA sequence yields an NA seed.
#'
#' @param DT a data.frame or data.table with one column labeled Sequence
#' @return \code{DT} with an additional character column "seed_Sequence".
#' @author Brendan Gongol
#' @export
#' @examples
#' Sequence <- c("UGAGGUAGUAGGUUGUAUAGUU", "CUAUGCAUAGCGAUACCUUACC", "UCCCUGAGACCUCAAGUGUGA", "ACACCUGGGCUCUCCGGGUACC",
#' "UAGCGAUCCUUACAUGCCCAUA", "UCGUGUCCGUUUCUCGUUUCGA", "ACAAUAAUCGGACACUAGCGAU", "CACACCGGACGAGAUUUCAU",
#' "UACCGGGCGUGGGGAGGGCAGG", "UAGCGAUUCCUUCUUAGCGAU")
#' miRNA_Name <- c("hsa-let-7","cel-lin-4","aae-mir-1","bol-mir-2","rno-mir-34",
#' "cel-lin-4-5p", "ame-miR-9895", "rno-miR-3478", "mmu-miR9897-5p", "cre-miR9897-3p")
#' miRNA_type <- c("MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE", "MATURE")
#' MiRNADT<- data.frame(cbind(Sequence, miRNA_Name, miRNA_type))
#' MiRNADT$Sequence <- as.character(MiRNADT$Sequence)
#' MISeed(MiRNADT)
MISeed <- function(DT){
  if(is.null(DT$Sequence)){
    stop("DT must contain a column named \"Sequence\"", call. = FALSE)
  }
  n_rows <- nrow(DT)
  seedSeq <- rep(NA_character_, n_rows)
  # txtProgressBar() refuses max == min, so handle the empty table up front.
  if(n_rows == 0){
    DT$seed_Sequence <- seedSeq
    return(DT)
  }
  # Accept factor columns as well as character ones.
  sequences <- as.character(DT$Sequence)
  pb <- txtProgressBar(min = 0, max = n_rows, style = 3)
  for(i in seq_len(n_rows)){
    # substr() stops at the end of a short sequence and propagates NA, so the
    # literal text "NA" can never end up inside a seed.
    seedSeq[i] <- substr(sequences[i], 2, 8)
    setTxtProgressBar(pb, i)
  }
  close(pb)
  DT$seed_Sequence <- seedSeq
  return(DT)
}
#' MIQuerySeq
#'
#' Adds a column to the original data.table containing the query sequence of the miRNA, i.e. the
#' reverse complement of the seed written in the DNA alphabet (A, C, G, T).
#' Requires a column named "seed_Sequence" housing the miRNA seed sequence.
#'
#' @param DT a data.frame or data.table with one column labeled seed_Sequence
#' @param wobble a logical statement either TRUE or FALSE. If wobble = TRUE, will return a query sequence allowing for G-U basepairing
#' (each C becomes "(C|T)" and each A becomes "(A|G)"). If wobble = FALSE, will return a query sequence that considers only Watson-Crick base pairing.
#' @return \code{DT} with an additional character column "Query_Sequence". Rows whose seed is NA get an NA query.
#' @author Brendan Gongol
#' @export
#' @examples
#' Sequence <- c("UGAGGUAGUAGGUUGUAUAGUU", "CUAUGCAUAGCGAUACCUUACC", "UCCCUGAGACCUCAAGUGUGA", "ACACCUGGGCUCUCCGGGUACC",
#' "UAGCGAUCCUUACAUGCCCAUA", "UCGUGUCCGUUUCUCGUUUCGA", "ACAAUAAUCGGACACUAGCGAU", "CACACCGGACGAGAUUUCAU",
#' "UACCGGGCGUGGGGAGGGCAGG", "UAGCGAUUCCUUCUUAGCGAU")
#' miRNA_Name <- c("hsa-let-7","cel-lin-4","aae-mir-1","bol-mir-2","rno-mir-34",
#' "cel-lin-4-5p", "ame-miR-9895", "rno-miR-3478", "mmu-miR9897-5p", "cre-miR9897-3p")
#' miRNA_type <- c("MATURE", "MATURE", "MATURE", "MATURE", "MATURE",
#' "MATURE", "MATURE", "MATURE", "MATURE", "MATURE")
#' seed_Sequence <- c("GAGGUAG", "UAUGCAU", "CCCUGAG", "CACCUGG", "AGCGAUC",
#' "CGUGUCC", "CAAUAAU", "ACACCGG", "ACCGGGC", "AGCGAUU")
#' MiRNADT<- data.frame(cbind(Sequence, miRNA_Name, miRNA_type, seed_Sequence))
#' MiRNADT$Sequence <- as.character(MiRNADT$Sequence)
#' MiRNADT$seed_Sequence <- as.character(MiRNADT$seed_Sequence)
#' MIQuerySeq(MiRNADT)
MIQuerySeq <- function(DT, wobble = FALSE){
  if(length(wobble) != 1 || is.na(wobble)){
    stop("'wobble' must be a single TRUE or FALSE", call. = FALSE)
  }
  if(is.null(DT$seed_Sequence)){
    stop("DT must contain a column named \"seed_Sequence\"", call. = FALSE)
  }
  # Decide once instead of re-testing the flag on every row.
  allow_wobble <- wobble == TRUE
  n_rows <- nrow(DT)
  queSeq <- rep(NA_character_, n_rows)
  # txtProgressBar() refuses max == min, so handle the empty table up front.
  if(n_rows == 0){
    DT$Query_Sequence <- queSeq
    return(DT)
  }
  # Accept factor columns as well as character ones.
  x <- as.character(DT$seed_Sequence)
  pb <- txtProgressBar(min = 0, max = n_rows, style = 3)
  for(i in seq_len(n_rows)){
    # An NA seed stays NA rather than aborting the whole run.
    if(!is.na(x[i])){
      # Reverse, then complement RNA -> DNA alphabet (A>T, U>A, G>C, C>G).
      y <- rev(strsplit(x[i], split = "")[[1]])
      y <- chartr("AUGC", "TACG", y)
      if(allow_wobble){
        # G-U wobble: a miRNA G (now C) also pairs with T, a miRNA U (now A) also pairs with G.
        y <- gsub("C", "(C|T)", y)
        y <- gsub("A", "(A|G)", y)
      }
      queSeq[i] <- paste(y, collapse = "")
    }
    setTxtProgressBar(pb, i)
  }
  close(pb)
  DT$Query_Sequence <- queSeq
  return(DT)
}
#' MIRNATargetpredict
#'
#' uses a data.table with a column named Query_Sequence and a column named miRNA_Name to predict what miRNAs
#' target one or more mRNAs housed in a data.table
#'
#' Requires miRNA DT with a column named "Query_Sequence" housing the miRNA query pattern and a column named "miRNA_Name"
#' (plus a column named "Scientific_Name" when type = "Multiple").
#' Requires mRNA DT with columns named "Sequence" and "external_gene_name", plus "Species" when type = "Single"
#' or "Scientific_Name" when type = "Multiple".
#'
#' Every query pattern is located in every mRNA sequence with \code{stringr::str_locate_all}. Each hit is reported
#' as one row; the "Sequence" column repeats the mRNA sequence with the matched positions in lower case.
#' "Number_Hits" is the number of hits of that miRNA (same Query_Sequence and miRNA_Name) within that mRNA.
#'
#' @param MiRNADT a data table with one column labeled Query_Sequence and a column labeled miRNA_Name
#' @param mRNADT a data table with three columns labeled: Sequence, external_gene_name, Species
#' @param type a character indicating "Single" (the default) or "Multiple". If type = "Multiple", will match miRNAs to mRNAs by the Scientific_Name in the miRNA DT
#' to the Scientific_Name in the mRNA DT. Both of these species profiles must match across both data tables. Requires the Spe argument indicating what species to use.
#' @param Spe a character vector indicating the species to use when type = "Multiple" is used.
#' The following species are available if used in conjunction with other functions in the EntroSolve package: "Anolis_carolinensis", "Bos_taurus", "Caenorhabditis_elegans",
#' "Canis_familiaris", "Ciona_intestinalis", "Ciona_savignyi", "Danio_rerio", "Drosophila_melanogaster", "Equus_caballus", "Gallus_gallus", "Gorilla_gorilla", "Homo_sapiens",
#' "Macaca_mulatta", "Monodelphis_domestica", "Mus_musculus", "Ornithorhynchus_anatinus", "Oryctolagus_cuniculus", "Oryzias_latipes", "Ovis_aries", "Pan_troglodytes",
#' "Petromyzon_marinus", "Rattus_norvegicus", "Sus_scrofa", "Taeniopygia_guttata", "Tetraodon_nigroviridis", "Xenopus_tropicalis".
#' @return A data table with one row per hit. If type = "Single" the columns are: "Query_Sequence", "start", "end", "miRNA_Name", "mRNA_Name",
#' "Species", "Number_Hits", "length", "Sequence". If type = "Multiple" the columns are: "Query_Sequence", "miRNA_Name", "start", "end", "mRNA_Name",
#' "Number_Hits", "length", "Sequence", "Species". An empty data table is returned when nothing matches.
#' @author Brendan Gongol
#' @importFrom data.table setnames
#' @importFrom data.table data.table
#' @importFrom data.table setkey
#' @importFrom stringr str_locate_all
#' @export
#' @examples
#' Sequence <- c("CTGTTTCGAGCCTGAATCTCGATCGCTCGCGCTAGACAGCTCGACGCACTTTTCAGCAGGAGCCTG",
#' "TCAGCAGATAGCGCTCGATACAGCTCGACAGCTCTTGCTGTATTGTGTG",
#' "TTGCTGTATTGTGTGATCCTCGATACAGGTATTTTCTGAGCCTGATAGCTAGCTTTGCTGTATTGTGTG",
#' "AAAAAATTTTTTCCCCCCGGGGGG")
#' external_gene_name <- c("AKT", "PI3K", "SREBP", "FOXO")
#' Species <- c("Rat", "Mouse", "Human", "Pig")
#' mRNADTsamp <- data.frame(cbind(Sequence, external_gene_name, Species))
#' mRNADTsamp$Sequence <- as.character(mRNADTsamp$Sequence)
#'
#' Query_Sequence <- c("(C|T)T(A|G)(C|T)(C|T)T(C|T)", "TTG(C|T)G(C|T)(A|G)", "(A|G)(C|T)(A|G)TT(C|T)(C|T)")
#' miRNA_Name <- c("MMU-LET-7G-5P", "MMU-LET-7I-3P", "MMU-MIR-1A-3P")
#' miRNADT1 <- data.frame(cbind(Query_Sequence, miRNA_Name))
#' miRNADT1$Query_Sequence <- as.character(miRNADT1$Query_Sequence)
#'
#' MIRNATargetpredict(miRNADT1, mRNADTsamp, type = "Single")
MIRNATargetpredict <- function(MiRNADT, mRNADT, type = c("Single", "Multiple"), Spe){
  # Defaulting to "Single" keeps the two-argument call from the examples working;
  # an unknown type now fails loudly instead of silently returning NULL.
  type <- match.arg(type)

  # Return `sequence` once per hit with positions start..end in lower case.
  # substring() is vectorised over start/end and yields "" for empty ranges, so hits
  # touching either end of the sequence (or covering all of it) need no special case.
  mark_hits <- function(sequence, start, end){
    marked <- paste0(substring(sequence, 1, start - 1),
                     tolower(substring(sequence, start, end)),
                     substring(sequence, end + 1, nchar(sequence)))
    # Blanks inside a sequence were always stripped from the reported copy; keep doing so.
    gsub(" ", "", marked, fixed = TRUE)
  }

  # Locate every miRNA query pattern inside one mRNA (a one-row table).
  # with_species = TRUE reproduces the "Single" layout, which carries the mRNA's Species column.
  scan_one <- function(miRNAs, target, with_species){
    queries <- miRNAs$Query_Sequence
    mi_names <- miRNAs$miRNA_Name
    sequence <- target$Sequence
    pieces <- list()
    for(q in seq_along(queries)){
      lis2 <- as.data.frame(str_locate_all(sequence, queries[q]))
      # Drop placeholder rows that carry no real match position.
      lis2 <- lis2[!is.na(lis2$start), , drop = FALSE]
      if(nrow(lis2) > 0){
        dt <- cbind(lis2, Query_Sequence = queries[q], miRNA_Name = mi_names[q],
                    mRNA_Name = target$external_gene_name)
        if(with_species){
          dt <- cbind(dt, Species = target$Species)
        }
        pieces[[length(pieces) + 1]] <- dt
      }
    }
    # Leading with a data.table makes rbind() return a data.table, as before.
    hold <- do.call(rbind, c(list(data.table(NULL)), pieces))
    if(nrow(hold) > 0){
      #### Count the number of hits of each miRNA in this mRNA. Grouping by the miRNA name as
      #### well as the pattern keeps miRNAs that share a seed from inflating each other's counts.
      numhits <- hold[, length(start), by=c("Query_Sequence", "miRNA_Name")]
      hold <- merge(hold, numhits, by=c("Query_Sequence", "miRNA_Name"), allow.cartesian=TRUE)
      if(with_species){
        # merge() moves both key columns to the front; restore the established "Single" layout.
        hold <- hold[, c("Query_Sequence", "start", "end", "miRNA_Name", "mRNA_Name", "Species", "V1"), with = FALSE]
      }
      #### Add the length of the mRNA sequence
      hold$length <- nchar(sequence)
      # Add the sequences with hits reformatted into lower case.
      hold$Sequence <- mark_hits(sequence, hold$start, hold$end)
      setnames(hold, "V1", "Number_Hits")
    }
    return(hold)
  }

  # Run scan_one() over every mRNA row and stack the non-empty results.
  scan_all <- function(miRNAs, targets, with_species, pb = NULL){
    n_targets <- nrow(targets)
    found <- vector("list", n_targets)
    for(r in seq_len(n_targets)){
      found[[r]] <- scan_one(miRNAs, targets[r, ], with_species)
      if(!is.null(pb)){
        setTxtProgressBar(pb, r)
      }
    }
    found <- Filter(function(tab) nrow(tab) > 0, found)
    if(length(found) == 0){
      return(data.table(NULL))
    }
    do.call(rbind, found)
  }

  if(type == "Single"){
    # txtProgressBar() refuses max == min, so an empty mRNA table is answered directly.
    if(nrow(mRNADT) == 0){
      return(data.table(NULL))
    }
    pb <- txtProgressBar(min = 0, max = nrow(mRNADT), style = 3)
    on.exit(close(pb), add = TRUE)
    return(scan_all(MiRNADT, mRNADT, with_species = TRUE, pb = pb))
  }

  # type == "Multiple": pair miRNAs and mRNAs species by species.
  if(missing(Spe)){
    stop("'Spe' must be supplied when type = \"Multiple\"", call. = FALSE)
  }
  if(length(Spe) == 0){
    return(data.table(NULL))
  }
  pb <- txtProgressBar(min = 0, max = length(Spe), style = 3)
  on.exit(close(pb), add = TRUE)
  per_species <- vector("list", length(Spe))
  for(s in seq_along(Spe)){
    # which() drops NA comparisons; the trailing comma selects rows for both
    # data.frame and data.table inputs.
    MiRNA <- MiRNADT[which(MiRNADT$Scientific_Name == Spe[s]), ]
    mRNA <- mRNADT[which(mRNADT$Scientific_Name == Spe[s]), ]
    hits <- scan_all(MiRNA, mRNA, with_species = FALSE)
    # Only label and keep species that actually produced hits.
    if(nrow(hits) > 0){
      hits$Species <- Spe[s]
      per_species[[s]] <- hits
    }
    setTxtProgressBar(pb, s)
  }
  per_species <- Filter(Negate(is.null), per_species)
  if(length(per_species) == 0){
    return(data.table(NULL))
  }
  do.call(rbind, per_species)
}
#' IUPAC_Boolean
#'
#' Converts IUPAC coded DNA sequences to a boolean syntax that can be utilized in query searches.
#'
#' Requires a vector of DNA characters in IUPAC code (upper case).
#'
#' There are three different stringency levels: high, medium, and low. The high stringency substitutes all IUPAC symbols
#' for their nucleotide designations. Medium stringency substitutes IUPAC symbols with one and two nucleotide designations
#' while treating those with three nucleotide designations as any nucleotide. Low stringency retains only IUPAC symbols with
#' one nucleotide designation while treating all others as any nucleotide. "N" is always treated as any nucleotide (".").
#'
#' @param DNAseq a vector of DNA characters in IUPAC code.
#' @param stringency a character designating "high", "medium", or "low" stringency levels.
#' @return A vector of DNA characters converted to boolean syntax.
#' @author Brendan Gongol
#' @export
#' @examples
#' sequences <- c("RTSWKMBDHVNATAATCGCTCCATACCTACATCN", "ATAGNNNCTCGACATWKMBACATCGCTACANNTACATAC")
#' IUPAC_Boolean(sequences, stringency = "high")
#' IUPAC_Boolean(sequences, stringency = "medium")
#' IUPAC_Boolean(sequences, stringency = "low")
IUPAC_Boolean <- function(DNAseq, stringency){
  levels_known <- c("high", "medium", "low")
  # Reject anything but the three documented levels with a readable message.
  if(!is.character(stringency) || length(stringency) != 1 || !(stringency %in% levels_known)){
    stop("'stringency' must be one of \"high\", \"medium\", or \"low\"", call. = FALSE)
  }
  # Ambiguity codes grouped by how many nucleotides they stand for.
  two_base <- c(R = "(A|G)", Y = "(C|T)", S = "(G|C)", W = "(A|T)", K = "(G|T)", M = "(A|C)")
  three_base <- c(B = "(C|G|T)", D = "(A|G|T)", H = "(A|C|T)", V = "(A|C|G)")
  # Relax a group of codes to "any nucleotide".
  wildcard <- function(codes){
    relaxed <- rep(".", length(codes))
    names(relaxed) <- names(codes)
    relaxed
  }
  substitutions <- switch(stringency,
                          high = c(two_base, three_base, N = "."),
                          medium = c(two_base, wildcard(three_base), N = "."),
                          low = c(wildcard(two_base), wildcard(three_base), N = "."))
  # The replacements only ever introduce A, C, G, T, "|", "(", ")" and ".", none of
  # which is an ambiguity code, so applying them one after another cannot cascade.
  converted <- DNAseq
  for(code in names(substitutions)){
    converted <- gsub(code, substitutions[[code]], converted, fixed = TRUE)
  }
  return(converted)
}
#' TFpredict
#'
#' If type = "single"
#' uses a data.table with a column named Targeting_Factor and a column named Consensus_Sequence to predict what Targeting Factor consensus sequences
#' are located within a Protein, DNA, or RNA sequence
#' Requires a character vector containing the Protein, DNA, or RNA sequence of interest.
#' Requires transcription factor data table with a column labeled "Targeting_Factor" housing the targeting protein of interest and a column labeled
#' "Consensus_Sequence" housing the targeting protein consensus sequence.
#'
#' If type = "multiple"
#' Uses two data.tables. The first one (Targeting_Factor_DT) houses a column named "Targeting_Factor" and a column named "Consensus_Sequence". The
#' second one (Target) houses a column named "Sequence" and a column named "gene_symbol". It returns what targeting protein consensus sequences
#' housed in Targeting_Factor_DT are located within a data table of protein, DNA, RNA sequence housed in Target.
#' Requires data.table (Targeting_Factor_DT) with a column named "Targeting_Factor" and a column named "Consensus_Sequence" housing the consensus sequence.
#' Requires data.table (Target) with columns named "Sequence" and "gene_symbol" housing the protein, DNA, or RNA sequences and designated names to query.
#'
#' If type = "multiple_species"
#' Uses two data.tables. The first one (Targeting_Factor_DT) houses a column named "Targeting_Factor" and a column named "Consensus_Sequence". The
#' second one (Target) houses a column named "Sequence", a column named "gene_symbol", column labeled "Common_Name", and a column labeled "Scientific_Name". It returns what consensus sequences
#' housed in Targeting_Factor_DT are located within a data table of protein, DNA, or RNA sequence housed in Target across multiple species.
#' Requires data.table (Targeting_Factor_DT) with a column named "Targeting_Factor" and a column named "Consensus_Sequence" housing the consensus sequence.
#' Requires data.table (Target) with a columns named "Sequence", "gene_symbol", "Common_Name", and "Scientific_Name", housing the protein, DNA, or RNA sequences and designated names to query.
#'
#' If type = "multiple_species_unknown"
#' Used if the Target data.table does not contain a column labeled "Common_Name".
#' Uses two data.tables. The first one (Targeting_Factor_DT) houses a column named "Targeting_Factor" and a column named "Consensus_Sequence". The
#' second one (Target) houses a column named "Sequence", a column named "gene_symbol", and a column labeled "Scientific_Name". It returns what consensus sequences
#' housed in Targeting_Factor_DT are located within a data.table of protein, DNA, or RNA sequence housed in Target across multiple species.
#' Requires data.table (Targeting_Factor_DT) with a column named "Targeting_Factor" and a column named "Consensus_Sequence" housing the consensus sequence.
#' Requires data.table (Target) with a columns named "Sequence", "gene_symbol", and "Scientific_Name", housing the protein, DNA, or RNA sequences and designated names to query.
#'
#'
#' @param type a single character indicating "single", "multiple", "multiple_species", or "multiple_species_unknown"
#' @param Target If type = "single", a vector containing one protein, DNA, or RNA character string. If type = "multiple", a data.table containing two columns, one labeled "Sequence" and one labeled "gene_symbol".
#' If type = "multiple_species", a data.table containing four columns, one labeled "Sequence" one labeled "gene_symbol", one labeled "Common_Name", and one labeled "Scientific_Name". If type = "multiple_species_unknown",
#' a data.table containing three columns, one labeled "Sequence" one labeled "gene_symbol", and one labeled "Scientific_Name".
#' @param Targeting_Factor_DT a data table with two columns labeled: Targeting_Factor and Consensus_Sequence
#' @return A data table
#' @author Brendan Gongol
#' @importFrom data.table setnames
#' @importFrom data.table data.table
#' @importFrom data.table setkey
#' @importFrom stringr str_locate_all
#' @export
#' @examples
#' Targeting_Factor <- c("AMPK", "PKA", "PKC", "MAPK", "CAMKKB", "CAMKI", "CAMKIV", "CKII", "CDK", "SRC", "AKT")
#' Consensus_Sequence <- c("AGCNVTQ", "PPKLYS", "AAT","AAAAAAAAAAATTGCNVMDEDE", "AA", "ATA", "GTTT", "AAAAAAAAAAAAAAAAAAAAA", "(A|T)..(H|F)", "GCTAAGCTGCGCAATTTTTGTATTTTGT|AGTTCTTTTTGTGTATTAGCTCAGATTTTCCAGCTG","(A|T)T(A|G)(C|T)(C|T)T(C|T)")
#'
#' data_table <- data.table(cbind(Targeting_Factor, Consensus_Sequence))
#' data_table
#'
#' Prot <- "ACQVAPKLHGEAGCNVTQDWTYMMGVCSTASYAATWEQDEPLWYMAATNGHCATWWAAASSCATAFQTSKLPIIIGHATSDF"
#'
#' Sequence <- c("ACDFEQAGCNVTQPCTSTSGANDEPHYYASTGFWYKAGCNVTQETCCKLLHAQSWW",
#' "ACQVAPKLHGEDWTYMMGVCSTASYWEQDEPLWYMNGHCATWWAAASSCTAQTSKLPIIIGHATSDF",
#' "TGHATSHCTANMKLPYWQEDTGSCANMHGTYYYDEDEDDASQWWWMNNNCGYTEWSDFGCPKKK",
#' "AAAATSTSTSGGGGGAAACCCCNNNNMMMMPPPPWWWWQHGGTTYYNNCCAA", "AAAAAATTTTTTCCCCCCGGGGGG")
#' gene_symbol <- c("PABP", "EIF4E", "SREBP", "FOXO", "ABCA1")
#' Common_Name <- c("Rat", "Mouse", "Human", "Cattle", "Dog")
#' Scientific_Name <- c("Rattus_norvegicus", "Mus_musculus", "Homo_sapiens", "Bos_taurus", "Canis_familiaris")
#' proteinTarg <- data.frame(cbind(gene_symbol,Sequence, Common_Name, Scientific_Name))
#' proteinTarg$Sequence <- as.character(proteinTarg$Sequence)
#' proteinTarg$gene_symbol <- as.character(proteinTarg$gene_symbol)
#' proteinTarg$Common_Name <- as.character(proteinTarg$Common_Name)
#' proteinTarg$Scientific_Name <- as.character(proteinTarg$Scientific_Name)
#' proteinTarg
#'
#'
#' TFpredict(Prot, data_table , type = "single")
#' TFpredict(proteinTarg, data_table, type = "multiple")
#' TFpredict(proteinTarg, data_table, type = "multiple_species")
#' TFpredict(proteinTarg, data_table, type = "multiple_species_unknown")
#'
#'
#' Targeting_Factor <- c("KLF2", "PGC1A", "FOXO1", "NCL", "SREBP", "MYC", "HIF", "NF-KB", "TXNIP", "PPAR")
#' Consensus_Sequence <- c("AAGCT", "GCGC", "AAT","AAAAAAAAAAAAAAAAAAAAAAAA", "AA", "ATA", "GTTT", "AAAAAAAAAAAAAAAAAAAAA", "(A|T)..(C|G)", "GCTAAGCTGCGCAATTTTTGTATTTTGT|AGTTCTTTTTGTGTATTAGCTCAGATTTTCCAGCTG")
#' TX_data_table <- data.table(cbind(Targeting_Factor, Consensus_Sequence))
#' TX_data_table
#'
#' chromo_seq <- "AAGCTAAGCTAAGCTGCGCAATTTTTGTATTTTGTTTAAACAGAATCCTCAAGGGAACATCATCCTCAGTTCTTTTTGTGTATTAGCTCAGATTTTCCAGCTGTTTTTAAAGCT"
#'
#'
#' Sequence <- c("AAGCTAAGCTAAGCTGCGCAATTTTTGTATTTTGTTTAAACAGAATCCTCAAGGGAACATCATCCTCAGTTCTTTTTGTGTATTAGCTCAGATTTTCCAGCTGTTTTTAAAGCT",
#' "CTGTTTCGAGCCTGAATCTCGATCGCTCGCGCTAGACAGCTCGACGCACTTTTCAGCAGGAGCCTG",
#' "TCAGCAGATAGCGCTCGATACAGCTCGACAGCTCTTGCTGTATTGTGTG",
#' "TTGCTGTATTGTGTGATCCTCGATACAGGTATTTTCTGAGCCTGATAGCTAGCTTTGCTGTATTGTGTG",
#' "AAAAAATTTTTTCCCCCCGGGGGG")
#' gene_symbol <- c("AKT", "PI3K", "SREBP", "FOXO", "ABCA1")
#' Common_Name <- c("Rat", "Mouse", "Human", "Cattle", "Dog")
#' Scientific_Name <- c("Rattus_norvegicus", "Mus_musculus", "Homo_sapiens", "Bos_taurus", "Canis_familiaris")
#' chromo <- data.frame(cbind(Sequence, gene_symbol, Common_Name, Scientific_Name))
#' chromo$Sequence <- as.character(chromo$Sequence)
#' chromo$gene_symbol <- as.character(chromo$gene_symbol)
#' chromo$Common_Name <- as.character(chromo$Common_Name)
#' chromo$Scientific_Name <- as.character(chromo$Scientific_Name)
#' chromo
#'
#'
#'
#' TFpredict(chromo_seq, TX_data_table , type = "single")
#' TFpredict(chromo, TX_data_table, type = "multiple")
#' TFpredict(chromo, TX_data_table, type = "multiple_species")
#' TFpredict(chromo, TX_data_table, type = "multiple_species_unknown")
TFpredict <- function(Target, Targeting_Factor_DT, type){
if(type == "single"){
conse2 <- Targeting_Factor_DT$Consensus_Sequence
#### query the input DNA sequence for consensus sequence elements.
hold <- data.table(NULL)
for(ce in conse2){
lis2 <- as.data.frame(str_locate_all(Target, ce))
if(nrow(lis2) > 0){
dt <- cbind(lis2, Consensus_Sequence=ce)
hold <- rbind(hold, dt)
}
}
#### Count the number of times each consensus sequence appears.
if(nrow(hold) > 0){
setkey(hold, Consensus_Sequence)
numhits <- hold[,length(end), by=Consensus_Sequence]
hold <- merge(hold, numhits, by="Consensus_Sequence", allow.cartesian=TRUE)
}
## Add the Transcription factor names to the data.table
if(nrow(hold) > 0){
hold <- merge(hold, Targeting_Factor_DT, by="Consensus_Sequence", allow.cartesian=TRUE)
}
#### Add the length of each DNA sequence
if(nrow(hold) > 0){
spl <- strsplit(Target, split="")
hold$length <- length(spl[[1]])
}
# Add the sequences with hits reformatted into lower case.
if(nrow(hold) > 0){
spl <- strsplit(Target, split="")
chromtot2 <- NULL
for(i in 1:length(hold$start)){
if((hold$end[i] == hold$length[i])){
upper <- spl[[1]][hold$start[i]:hold$end[i]]
lower <- tolower(upper)
sequencelow <- paste(lower, collapse="")
first <- paste(spl[[1]][1:(hold$start[i]-1)], collapse="")
chromtot <- paste(first, sequencelow, collapse="")
chromtot2[i] <- gsub(" ", "", chromtot)
}
if((hold$start[i]-1) > 0 & (!(hold$end[i] == hold$length[i]))){
upper <- spl[[1]][hold$start[i]:hold$end[i]]
lower <- tolower(upper)
first <- paste(spl[[1]][1:(hold$start[i]-1)], collapse="")
sequencelow <- paste(lower, collapse="")
last <- paste(spl[[1]][(hold$end[i]+1):length(spl[[1]])], collapse="")
chromtot <- paste(first, sequencelow, last, collapse="")
chromtot2[i] <- gsub(" ", "", chromtot)
}
if((hold$start[i]-1) == 0){
upper <- spl[[1]][hold$start[i]:hold$end[i]]
lower <- tolower(upper)
sequencelow <- paste(lower, collapse="")
last <- paste(spl[[1]][(hold$end[i]+1):length(spl[[1]])], collapse="")
chromtot <- paste(sequencelow, last, collapse="")
chromtot2[i] <- gsub(" ", "", chromtot)
}
i <- i + 1
}
hold$sequence <- chromtot2
}
# Add the identified sequence
if(nrow(hold) > 0){
target_seq <- NULL
for(i in 1:nrow(hold)){
sp1 <- strsplit(hold$sequence[i], split="")
sp2 <- sp1[[1]][(hold$start[i]-1):(hold$end[i] +1)]
target_seq[i] <- paste(sp2, collapse="")
}
hold$Identified_sequence <- target_seq
}
# Rename the columns
if(nrow(hold) > 0){
setnames(hold, c("Consensus_Sequence", "start", "end", "V1", "Targeting_Factor", "length", "sequence", "Identified_sequence"),
c("Consensus_Sequence", "start", "end", "Number_Hits", "Targeting_Factor", "length", "Sequence", "Identified_sequence"))
}
return(hold)
}
if(type == "multiple"){
TXpredictDT <- function(TargetDT, Targeting_Factor_DT2){
conse2 <- Targeting_Factor_DT2$Consensus_Sequence
#### query the input DNA sequence for consensus sequence elements.
hold <- data.table(NULL)
for(ce in conse2){
lis2 <- as.data.frame(str_locate_all(TargetDT, ce))
if(nrow(lis2) > 0){
dt <- cbind(lis2, Consensus_Sequence=ce)
hold <- rbind(hold, dt)
}
}
#### Count the number of times each consensus sequence appears.
if(nrow(hold) > 0){
setkey(hold, Consensus_Sequence)
numhits <- hold[,length(end), by=Consensus_Sequence]
hold <- merge(hold, numhits, by="Consensus_Sequence", allow.cartesian=TRUE)
}
## Add the Transcription factor names to the data.table
if(nrow(hold) > 0){
hold <- merge(hold, Targeting_Factor_DT2, by="Consensus_Sequence", allow.cartesian=TRUE)
}
#### Add the length of each DNA sequence
if(nrow(hold) > 0){
spl <- strsplit(TargetDT, split="")
hold$length <- length(spl[[1]])
}
# Add the sequences with hits reformatted into lower case.
if(nrow(hold) > 0){
spl <- strsplit(TargetDT, split="")
chromtot2 <- NULL
for(i in 1:length(hold$start)){
if((hold$end[i] == hold$length[i])){
upper <- spl[[1]][hold$start[i]:hold$end[i]]
lower <- tolower(upper)
sequencelow <- paste(lower, collapse="")
first <- paste(spl[[1]][1:(hold$start[i]-1)], collapse="")
chromtot <- paste(first, sequencelow, collapse="")
chromtot2[i] <- gsub(" ", "", chromtot)
}
if((hold$start[i]-1) > 0 & (!(hold$end[i] == hold$length[i]))){
upper <- spl[[1]][hold$start[i]:hold$end[i]]
lower <- tolower(upper)
first <- paste(spl[[1]][1:(hold$start[i]-1)], collapse="")
sequencelow <- paste(lower, collapse="")
last <- paste(spl[[1]][(hold$end[i]+1):length(spl[[1]])], collapse="")
chromtot <- paste(first, sequencelow, last, collapse="")
chromtot2[i] <- gsub(" ", "", chromtot)
}
if((hold$start[i]-1) == 0){
upper <- spl[[1]][hold$start[i]:hold$end[i]]
lower <- tolower(upper)
sequencelow <- paste(lower, collapse="")
last <- paste(spl[[1]][(hold$end[i]+1):length(spl[[1]])], collapse="")
chromtot <- paste(sequencelow, last, collapse="")
chromtot2[i] <- gsub(" ", "", chromtot)
}
i <- i + 1
}
hold$sequence <- chromtot2
}
# Add the identified sequence
if(nrow(hold) > 0){
target_seq <- NULL
for(i in 1:nrow(hold)){
sp1 <- strsplit(hold$sequence[i], split="")
sp2 <- sp1[[1]][(hold$start[i]-1):(hold$end[i] +1)]
target_seq[i] <- paste(sp2, collapse="")
}
hold$Identified_sequence <- target_seq
}
# Rename the columns
if(nrow(hold) > 0){
setnames(hold, c("Consensus_Sequence", "start", "end", "V1", "Targeting_Factor", "length", "sequence", "Identified_sequence"),
c("Consensus_Sequence", "start", "end", "Number_Hits", "Targeting_Factor", "length", "Sequence", "Identified_sequence"))
}
return(hold)
}
seqs2 <- Target$Sequence
pb <- txtProgressBar(min = 0, max = nrow(Target), style = 3)
predictedTSS <- NULL
for(i in 1:nrow(Target)){
pred <- TXpredictDT(seqs2[i], Targeting_Factor_DT)
if(length(pred) > 0){
pred$gene_symbol <- Target$gene_symbol[i]
predictedTSS <- rbind(predictedTSS, pred)
}
setTxtProgressBar(pb, i)
}
close(pb)
return(predictedTSS)
}
if(type == "multiple_species"){
TXpredictDT <- function(Target2, Targeting_Factor_DT3){
TXpredictDT2 <- function(TargetDT, Targeting_Factor_DT2){
conse2 <- Targeting_Factor_DT2$Consensus_Sequence
#### query the input DNA sequence for consensus sequence elements.
hold <- data.table(NULL)
for(ce in conse2){
lis2 <- as.data.frame(str_locate_all(TargetDT, ce))
if(nrow(lis2) > 0){
dt <- cbind(lis2, Consensus_Sequence=ce)
hold <- rbind(hold, dt)
}
}
#### Count the number of times each consensus sequence appears.
if(nrow(hold) > 0){
setkey(hold, Consensus_Sequence)
numhits <- hold[,length(end), by=Consensus_Sequence]
hold <- merge(hold, numhits, by="Consensus_Sequence", allow.cartesian=TRUE)
}
## Add the Transcription factor names to the data.table
if(nrow(hold) > 0){
hold <- merge(hold, Targeting_Factor_DT2, by="Consensus_Sequence", allow.cartesian=TRUE)
}
#### Add the length of each DNA sequence
if(nrow(hold) > 0){
spl <- strsplit(TargetDT, split="")
hold$length <- length(spl[[1]])
}
# Add the sequences with hits reformatted into lower case.
if(nrow(hold) > 0){
spl <- strsplit(TargetDT, split="")
chromtot2 <- NULL
for(i in 1:length(hold$start)){
if((hold$end[i] == hold$length[i])){
upper <- spl[[1]][hold$start[i]:hold$end[i]]
lower <- tolower(upper)
sequencelow <- paste(lower, collapse="")
first <- paste(spl[[1]][1:(hold$start[i]-1)], collapse="")
chromtot <- paste(first, sequencelow, collapse="")
chromtot2[i] <- gsub(" ", "", chromtot)
}
if((hold$start[i]-1) > 0 & (!(hold$end[i] == hold$length[i]))){
upper <- spl[[1]][hold$start[i]:hold$end[i]]
lower <- tolower(upper)
first <- paste(spl[[1]][1:(hold$start[i]-1)], collapse="")
sequencelow <- paste(lower, collapse="")
last <- paste(spl[[1]][(hold$end[i]+1):length(spl[[1]])], collapse="")
chromtot <- paste(first, sequencelow, last, collapse="")
chromtot2[i] <- gsub(" ", "", chromtot)
}
if((hold$start[i]-1) == 0){
upper <- spl[[1]][hold$start[i]:hold$end[i]]
lower <- tolower(upper)
sequencelow <- paste(lower, collapse="")
last <- paste(spl[[1]][(hold$end[i]+1):length(spl[[1]])], collapse="")
chromtot <- paste(sequencelow, last, collapse="")
chromtot2[i] <- gsub(" ", "", chromtot)
}
i <- i + 1
}
hold$sequence <- chromtot2
}
# Add the identified sequence
if(nrow(hold) > 0){
target_seq <- NULL
for(i in 1:nrow(hold)){
sp1 <- strsplit(hold$sequence[i], split="")
sp2 <- sp1[[1]][(hold$start[i]-1):(hold$end[i] +1)]
target_seq[i] <- paste(sp2, collapse="")
}
hold$Identified_sequence <- target_seq
}
# Rename the columns
if(nrow(hold) > 0){
setnames(hold, c("Consensus_Sequence", "start", "end", "V1", "Targeting_Factor", "length", "sequence", "Identified_sequence"),
c("Consensus_Sequence", "start", "end", "Number_Hits", "Targeting_Factor", "length", "Sequence", "Identified_sequence"))
}
return(hold)
}
seqs2 <- Target2$Sequence
predictedTSS <- NULL
for(i in 1:nrow(Target2)){
pred <- TXpredictDT2(seqs2[i], Targeting_Factor_DT3)
if(nrow(pred) >0){
pred$gene_symbol <- Target2$gene_symbol[i]
predictedTSS <- rbind(predictedTSS, pred)
}
}
return(predictedTSS)
}
sor <- Target[order(Target$Scientific_Name),]
SciSP <- sor[!duplicated(sor$Scientific_Name),]$Scientific_Name
Common <- sor[!duplicated(sor$Scientific_Name),]$Common_Name
pb <- txtProgressBar(min = 0, max = length(SciSP), style = 3)
Predicted <- NULL
for(i in 1:length(SciSP)){
SurfDT <- Target[Target$Scientific_Name == SciSP[i],]
TX_SP <- TXpredictDT(SurfDT, Targeting_Factor_DT)
if(length(TX_SP) > 0){
TX_SP$Species <- Common[i]
TX_SP$Scientific_Name <- SciSP[i]
Predicted <- rbind(Predicted, TX_SP)
}
setTxtProgressBar(pb, i)
}
close(pb)
return(Predicted)
# New <- NULL
# for(i in 1:length(Predicted)){
# New <- rbind(New, Predicted[[i]])
# }
# return(New)
#
}
if(type == "multiple_species_unknown"){
TXpredictDT <- function(Target2, Targeting_Factor_DT3){
TXpredictDT2 <- function(TargetDT, Targeting_Factor_DT2){
conse2 <- Targeting_Factor_DT2$Consensus_Sequence
#### query the input DNA sequence for consensus sequence elements.
hold <- data.table(NULL)
for(ce in conse2){
lis2 <- as.data.frame(str_locate_all(TargetDT, ce))
if(nrow(lis2) > 0){
dt <- cbind(lis2, Consensus_Sequence=ce)
hold <- rbind(hold, dt)
}
}
#### Count the number of times each consensus sequence appears.
if(nrow(hold) > 0){
setkey(hold, Consensus_Sequence)
numhits <- hold[,length(end), by=Consensus_Sequence]
hold <- merge(hold, numhits, by="Consensus_Sequence", allow.cartesian=TRUE)
}
## Add the Transcription factor names to the data.table
if(nrow(hold) > 0){
hold <- merge(hold, Targeting_Factor_DT2, by="Consensus_Sequence", allow.cartesian=TRUE)
}
#### Add the length of each DNA sequence
if(nrow(hold) > 0){
spl <- strsplit(TargetDT, split="")
hold$length <- length(spl[[1]])
}
# Add the sequences with hits reformatted into lower case.
if(nrow(hold) > 0){
spl <- strsplit(TargetDT, split="")
chromtot2 <- NULL
for(i in 1:length(hold$start)){
if((hold$end[i] == hold$length[i])){
upper <- spl[[1]][hold$start[i]:hold$end[i]]
lower <- tolower(upper)
sequencelow <- paste(lower, collapse="")
first <- paste(spl[[1]][1:(hold$start[i]-1)], collapse="")
chromtot <- paste(first, sequencelow, collapse="")
chromtot2[i] <- gsub(" ", "", chromtot)
}
if((hold$start[i]-1) > 0 & (!(hold$end[i] == hold$length[i]))){
upper <- spl[[1]][hold$start[i]:hold$end[i]]
lower <- tolower(upper)
first <- paste(spl[[1]][1:(hold$start[i]-1)], collapse="")
sequencelow <- paste(lower, collapse="")
last <- paste(spl[[1]][(hold$end[i]+1):length(spl[[1]])], collapse="")
chromtot <- paste(first, sequencelow, last, collapse="")
chromtot2[i] <- gsub(" ", "", chromtot)
}
if((hold$start[i]-1) == 0){
upper <- spl[[1]][hold$start[i]:hold$end[i]]
lower <- tolower(upper)
sequencelow <- paste(lower, collapse="")
last <- paste(spl[[1]][(hold$end[i]+1):length(spl[[1]])], collapse="")
chromtot <- paste(sequencelow, last, collapse="")
chromtot2[i] <- gsub(" ", "", chromtot)
}
i <- i + 1
}
hold$sequence <- chromtot2
}
# Add the identified sequence
if(nrow(hold) > 0){
target_seq <- NULL
for(i in 1:nrow(hold)){
sp1 <- strsplit(hold$sequence[i], split="")
sp2 <- sp1[[1]][(hold$start[i]-1):(hold$end[i] +1)]
target_seq[i] <- paste(sp2, collapse="")
}
hold$Identified_sequence <- target_seq
}
# Rename the columns
if(nrow(hold) > 0){
setnames(hold, c("Consensus_Sequence", "start", "end", "V1", "Targeting_Factor", "length", "sequence", "Identified_sequence"),
c("Consensus_Sequence", "start", "end", "Number_Hits", "Targeting_Factor", "length", "Sequence", "Identified_sequence"))
}
return(hold)
}
seqs2 <- Target2$Sequence
predictedTSS <- NULL
for(i in 1:nrow(Target2)){
pred <- TXpredictDT2(seqs2[i], Targeting_Factor_DT3)
if(nrow(pred) >0){
pred$gene_symbol <- Target2$gene_symbol[i]
predictedTSS <- rbind(predictedTSS, pred)
}
}
return(predictedTSS)
}
SciSP <- sort(Target[!duplicated(Target$Scientific_Name),]$Scientific_Name)
pb <- txtProgressBar(min = 0, max = length(SciSP), style = 3)
Predicted <- NULL
for(i in 1:length(SciSP)){
SurfDT <- Target[Target$Scientific_Name == SciSP[i],]
TX_SP <- TXpredictDT(SurfDT, Targeting_Factor_DT)
if(length(TX_SP) > 0){
TX_SP$Species <- SciSP[i]
Predicted <- rbind(Predicted, TX_SP)
}
setTxtProgressBar(pb, i)
}
close(pb)
return(Predicted)
# New <- NULL
# for(i in 1:length(Predicted)){
# New <- rbind(New, Predicted[[i]])
# }
# return(New)
}
}
#' VariantSort
#'
#' Takes a Data.table containing a compilation of mRNA or protein sequences from a variety of species and either
#' returns a data table containing the longest transcript identified for each protein and species or each variant labeled according to the length of the sequence.
#' The data.table requires a column labeled "Scientific_Name", a column labeled "Sequence", and a column labeled "external_gene_name".
#'
#' @param DT a data table with three columns labeled Sequence, Scientific_Name, external_gene_name
#' @param variant a character statement either "MAX", "ALL", or "MIN". If "MAX", returns a data table containing the longest sequence for each sequence and species. If "ALL", returns a data table containing each variant labeled according to the length of the sequence. If "MIN", returns a data table containing the shortest sequence for each sequence and species.
#' @return A data table containing the longest transcripts for each protein and species or each variant labeled according to the length of the sequence.
#' @author Brendan Gongol
#' @importFrom data.table data.table
#' @export
#' @examples
#' setwd("C:/Users/Brendan/Desktop/oxidative stress surfactant bioinformatics")
#' Surfactant_Transcripts <- fread("1 Surfactant Transcripts.xls")
#' SURF_HUM <- Surfactant_Transcripts[Surfactant_Transcripts$Common_Name == "Human",]
#' sor <- VariantSort(SURF_HUM, variant = "MAX")
#' sor[,c(1, 5:11), with = FALSE]
#' sor <- VariantSort(SURF_HUM, variant = "ALL")
#' sor[,c(1, 5:12), with = FALSE]
#' sor <- VariantSort(SURF_HUM, variant = "MIN")
#' sor[,c(1, 5:11), with = FALSE]
VariantSort <- function(DT, variant){
  # Purpose: annotate every row of DT with the length of its Sequence and,
  # depending on `variant`, keep only the longest ("MAX") or shortest ("MIN")
  # sequence per Scientific_Name / external_gene_name pair, or label every
  # row with "<external_gene_name>-<Length>" ("ALL").
  #
  # DT      : table with columns Sequence, Scientific_Name, external_gene_name.
  #           "MAX" and "MIN" use data.table syntax and need a data.table.
  # variant : one of "MAX", "ALL", "MIN".
  # Returns : DT with an added Length column (plus Variant for "ALL"),
  #           subset to one row per group for "MAX" / "MIN".

  # Reject anything other than the three documented modes up front; an
  # unknown value used to fall through every branch and return NULL silently.
  if(!is.character(variant) || length(variant) != 1L ||
     !(variant %in% c("MAX", "ALL", "MIN"))){
    stop('variant must be one of "MAX", "ALL", or "MIN"', call. = FALSE)
  }

  # nchar() is vectorised, so no per-row loop (and no progress bar) is needed.
  # It also copes with zero rows, where the old 1:length(seqs) loop and
  # txtProgressBar(max = 0) both failed. A missing sequence gets NA length.
  DT$Length <- nchar(as.character(DT$Sequence))

  if(variant == "ALL"){
    DT$Variant <- paste(DT$external_gene_name, DT$Length, sep = "-")
    return(DT)
  }

  # Nothing to group when there are no rows.
  if(nrow(DT) == 0L){
    return(DT)
  }

  # .I gives the row numbers in DT; pick one row number per group.
  if(variant == "MAX"){
    DT2 <- DT[DT[, .I[which.max(Length)], by=c("Scientific_Name", "external_gene_name")]$V1]
  } else {
    DT2 <- DT[DT[, .I[which.min(Length)], by=c("Scientific_Name", "external_gene_name")]$V1]
  }
  DT2
}
#' ChromLabel
#'
#' Requires a data table with a column named "chromosome_name" housing the chromosome names in a numerical format only.
#' Substitutes the numeric chromosome values for those in the following format: chr1, chr2, chr3, etc., for use in the getSeq() function.
#'
#' @param DT a data table with a column named "chromosome_name" housing the chromosome names in a numerical format only.
#' @return The data table with its numeric chromosome values replaced by names in the following format: chr1, chr2, chr3, etc., for use in the getSeq() function.
#' @author Brendan Gongol
#' @importFrom data.table data.table
#' @export
#' @examples
#' Sequence <- c("AAGCTAAGCTAAGCTGCGCAATTTTTGTATTTTGTTTAAACAGAATCCTCAAGGGAACATCATCCTCAGTTCTTTTTGTGTATTAGCTCAGATTTTCCAGCTGTTTTTAAAGCT",
#' "CTGTTTCGAGCCTGAATCTCGATCGCTCGCGCTAGACAGCTCGACGCACTTTTCAGCAGGAGCCTG",
#' "TCAGCAGATAGCGCTCGATACAGCTCGACAGCTCTTGCTGTATTGTGTG",
#' "TTGCTGTATTGTGTGATCCTCGATACAGGTATTTTCTGAGCCTGATAGCTAGCTTTGCTGTATTGTGTG",
#' "AAAAAATTTTTTCCCCCCGGGGGG")
#' gene_symbol <- c("AKT", "PI3K", "SREBP", "FOXO", "ABCA1")
#' chromosome_name <- c("1", "5", "10", "X", "Y")
#' chromo <- data.frame(cbind(Sequence, gene_symbol, chromosome_name))
#' chromo$Sequence <- as.character(chromo$Sequence)
#' chromo$chromosome_name <- as.character(chromo$chromosome_name)
#' ChromLabel(chromo)
ChromLabel <- function(DT){
  # Prefix the entries of DT$chromosome_name with "chr".
  #
  # DT      : table with a chromosome_name column.
  # Returns : DT with the rewritten chromosome_name column.
  #
  # The two-digit names (10-19) are handled before the single digits, and
  # every pattern is anchored at the start of the string, so a name that
  # has already been prefixed is not matched again by a later pattern.
  chrom_ids <- c(10:19, 1:9, "X", "Y", "M")
  labels <- DT$chromosome_name
  for(id in chrom_ids){
    labels <- gsub(paste0("^", id), paste0("chr", id), labels)
  }
  DT$chromosome_name <- labels
  return(DT)
}
#' ChromSeqConvert
#'
#' Requires sequences returned from the getSeq function
#' returns the sequences in a string format that can be added to a data.table
#'
#' @param seqs the sequences returned from the getSeq function
#' @return The sequences in a string format that can be added to a data.table
#' @author Brendan Gongol
#' @importFrom Biostrings toString
#' @export
#' @examples
ChromSeqConvert <- function(seqs){
  # Convert each element of `seqs` to a single character string.
  #
  # seqs    : a collection of sequences; element i is taken as seqs[i] and
  #           passed to toString().
  # Returns : a character vector with one string per element of `seqs`
  #           (character(0) when `seqs` is empty).
  #
  # seq_along() rather than 1:length(seqs): with empty input the latter
  # iterates over c(1, 0) and indexes past the end of `seqs`.
  # vapply() also allocates the result once instead of growing it.
  vapply(seq_along(seqs),
         function(i) toString(seqs[i]),
         character(1))
}
#' IUPAC_ScoreR
#'
#' Requires a string of IUPAC DNA codes and scores them according to the likelihood of observing the sequence by random chance.
#' Returns a numeric score for each sequence that can be added to a data.table.
#'
#' The IUPAC consensus score is calculated as follows: first, any positions that can contain any nucleotide (N) are removed. Then:
#' if stringency = "high", -log2(.25 for every single nucleotide position X .5 for every double nucleotide position X .75 for every triple nucleotide position).
#' if stringency = "medium", -log2(.25 for every single nucleotide position X .5 for every double nucleotide position). Triple nucleotide positions are removed prior to the calculation.
#' if stringency = "low", -log2(.25 for every single nucleotide position). Triple and double nucleotide positions are removed prior to the calculation.
#'
#' @param Conse a character vector containing the DNA coded IUPAC sequences
#' @param stringency a character either "high", "medium", or "low"
#' @return the IUPAC consensus score for each IUPAC sequence
#' @author Brendan Gongol
#' @export
#' @examples
#' Conseq <- c("AAAAAAAAAAAAAAAAA", "SYYCNRNSTNGCGTGNSW", "GVTTATTAAKTGGTTATATTGGKTD", "RYSWKMBDHVNATCG")
#' IUPAC_ScoreR(Conseq, stringency = "high")
#' IUPAC_ScoreR(Conseq, stringency = "medium")
#' IUPAC_ScoreR(Conseq, stringency = "low")
IUPAC_ScoreR <- function(Conse, stringency){
  # Score IUPAC DNA consensus strings by -log2 of the product of the
  # per-position random-match probabilities.
  #
  # Conse      : character vector of IUPAC DNA consensus sequences
  #              (upper case).
  # stringency : "high"   - score single (.25), double (.5) and triple (.75)
  #                         nucleotide codes; N is dropped.
  #              "medium" - score single and double codes; triple codes and
  #                         N are dropped.
  #              "low"    - score single codes only; everything else is
  #                         dropped.
  # Returns    : numeric vector, one score per element of Conse. A sequence
  #              with no scorable position scores 0. NA input, or a sequence
  #              containing a symbol outside the IUPAC DNA alphabet, scores
  #              NA (with a warning for the latter).
  stringency <- match.arg(stringency, c("high", "medium", "low"))

  # Probability of a random match for each code, grouped by degeneracy.
  single <- c(A = 0.25, T = 0.25, C = 0.25, G = 0.25)
  double <- c(R = 0.5, Y = 0.5, S = 0.5, W = 0.5, K = 0.5, M = 0.5)
  triple <- c(B = 0.75, D = 0.75, H = 0.75, V = 0.75)
  scored <- switch(stringency,
                   high   = c(single, double, triple),
                   medium = c(single, double),
                   low    = single)
  # -log2(prod(p)) == sum(-log2(p)); summing the per-symbol bits avoids the
  # underflow of the running product on long sequences.
  bits <- -log2(scored)
  # Every symbol the scorer recognises, whether scored or dropped.
  alphabet <- c(names(single), names(double), names(triple), "N")

  score_one <- function(consensus){
    if(is.na(consensus)){
      return(NA_real_)
    }
    codes <- strsplit(consensus, split = "")[[1]]
    # An unrecognised symbol previously reused the weight of the preceding
    # position (or failed if it came first); flag it as NA instead.
    if(!all(codes %in% alphabet)){
      return(NA_real_)
    }
    # Codes not scored at this stringency contribute nothing. When no codes
    # remain the sum over an empty set is 0, where the old 1:0 loop crashed.
    sum(bits[codes[codes %in% names(bits)]])
  }

  sequences <- as.character(Conse)
  scores <- vapply(sequences, score_one, numeric(1), USE.NAMES = FALSE)

  unsupported <- is.na(scores) & !is.na(sequences)
  if(any(unsupported)){
    warning(sum(unsupported),
            " sequence(s) contain symbols outside the IUPAC DNA alphabet; their score is NA",
            call. = FALSE)
  }
  scores
}
#' SpeciesTFCons
#'
#' Takes a data.table containing the following columns: "Species", "gene_symbol", "Targeting_Factor"
#' returns the data.table containing only Target names that are conserved between specified species for the original data.table (may contain additional species if
#' they were in the original data.table but will only contain those promoters that are conserved between the specified species.
#' It also may do the same thing for Targeting_Factor Target associations across species specified.
#'
#' @param DT the data table to query
#' @param Spec a character vector containing the species to query.
#' @param provide Target, TF_Target. If "Target", returns conserved Targets, If "TF_Target", returns only columns with Targeting_Factors that are paired with a Target for specified Species. It also returns an additional column with the paired Targeting_Factor and Target pair.
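#' @return The rows of DT whose Target (provide = "Target") or Targeting_Factor-Target pair (provide = "TF_Target") is found in every species listed in Spec. With provide = "TF_Target" an additional column, mergecol, holds the pasted pair.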
#' @author Brendan Gongol
#' @importFrom data.table data.table
#' @export
#' @examples
#' setwd("C:/Users/Brendan/Desktop/oxidative stress surfactant bioinformatics")
#' library(data.table)
#' DT25 <- fread("Raw Transcription factor hits.xls")
#' DT25 <- DT25[,c(1:6, 9:10), with = FALSE]
#' setnames(DT25, c("Consensus_Sequence", "start", "end", "Number_Hits", "TX_Factor", "MotifMap Degenerate consensus sequence", "promoter_name", "Species"),
#' c("Consensus_Sequence", "start", "end", "Number_Hits", "Targeting_Factor", "MotifMap Degenerate consensus sequence", "gene_symbol", "Species"))
#' SP_MIN <- c("Human", "Mouse", "Rat")
#' SpeciesTFCons(DT = DT25,Spec = SP_MIN, provide = "Target" )
#' SpeciesTFCons(DT = DT25,Spec = SP_MIN, provide = "TF_Target" )
SpeciesTFCons <- function(DT, Spec, provide){
  # Keep only the rows of DT whose key is present in every species of Spec.
  #
  # DT      : table with columns Species, gene_symbol and (for "TF_Target")
  #           Targeting_Factor.
  # Spec    : character vector of species that must all contain the key.
  # provide : "Target"    - the key is gene_symbol.
  #           "TF_Target" - the key is "<Targeting_Factor>-<gene_symbol>",
  #                         which is also added to DT as column mergecol.
  # Returns : the matching rows of DT (all species, not only those in Spec).
  #           No rows are returned when Spec is empty.

  # An unknown mode used to fall through both branches and return NULL.
  if(length(provide) != 1L || !(provide %in% c("Target", "TF_Target"))){
    stop('provide must be either "Target" or "TF_Target"', call. = FALSE)
  }

  if(provide == "TF_Target"){
    #### Paste the Targeting_Factor name and the Target together and add it to a new column.
    DT$mergecol <- paste(DT$Targeting_Factor, DT$gene_symbol, sep = "-")
    keys <- DT$mergecol
  } else {
    keys <- DT$gene_symbol
  }
  species <- DT$Species

  #### Intersect the keys observed in each requested species.
  # seq_along() keeps the loop from running at all for an empty Spec, where
  # 1:length(Spec) would iterate over c(1, 0).
  conserved <- if(length(Spec) > 0L) unique(keys) else keys[0L]
  for(i in seq_along(Spec)){
    # which() drops NA comparisons, so a missing Species never matches.
    conserved <- intersect(conserved, keys[which(species == Spec[i])])
  }

  #### Use the conserved keys to subset the original table and return it.
  keep_rows <- which(keys %in% conserved)
  DT[keep_rows, ]
}
#' TFRankR
#'
#' Requires a data table containing a column labeled "gene_symbol", "Species", "Targeting_Factor", "Score"
#' Returns a ranked list according to the specified options.
#'
#' @param DT the data table to query
#' @param sortBy "species", "Target", "abundance", "score", "species & Target", "species & score & Target", "abundance & Target", "Species & abundance & Target", and "species & abundance". When sorting by "species $ Target", ranks greatest number of species first, least number of Targets second. When sorting by "species & score & Target", ranks greatest number of species first, greatest IUPAC consensus score second, and least number of Targets third.
#' When sorting by "abundance & Target", ranks the greatest abundance of consensus sequences for each promoter first and ranks the greatest number of targets for each Targeting_Factor second.
#' When sorting by "species & abundance & Target", ranks the greatest to least number of Species first, the greatest to least number of consensus sequences second, and the greatest to least number of targets third.
#' @param dec used only if sortBy "species", "Target", or "abundance" are used. It is either TRUE or FALSE and indicates whether to sort in decreasing or increasing order respectively.
#' @param SPselect a single character vector used when sorting by "species & Target", "species & abundance" or by "species & score & Target" to designate the dominant species to use.
#' @param IUPACgreat A logical statement either TRUE or FALSE indicating whether to use the greatest IUPAC consensus score or the lowest IUPAC score respectively when two or more IUPAC sequences are observed for a particular transcription factor.
#' @return A data table of Targeting_Factors ranked according to the specified options.
#' @author Brendan Gongol
#' @importFrom data.table data.table
#' @export
#' @examples
#' setwd("C:/Users/Brendan/Desktop/oxidative stress surfactant bioinformatics")
#' library(data.table)
#' DT25 <- fread("Raw Transcription factor hits.xls")
#' DT25 <- DT25[,c(1:6, 9:10), with = FALSE]
#' setnames(DT25, c("Consensus_Sequence", "start", "end", "Number_Hits", "TX_Factor", "MotifMap Degenerate consensus sequence", "promoter_name", "Species"),
#' c("Consensus_Sequence", "start", "end", "Number_Hits", "Targeting_Factor", "MotifMap Degenerate consensus sequence", "gene_symbol", "Species"))
#' IUPAC <- DT25$`MotifMap Degenerate consensus sequence`
#' DT25$Score <- IUPAC_ScoreR(IUPAC, stringency = "medium")
#'
#' TFRankR(DT = DT25, sortBy = "species", dec = TRUE)
#' TFRankR(DT = DT25, sortBy = "species", dec = TRUE)
#' TFRankR(DT = DT25, sortBy = "species", dec = FALSE)
#' TFRankR(DT = DT25, sortBy = "Target", dec = TRUE)
#' TFRankR(DT = DT25, sortBy = "Target", dec = FALSE)
#' TFRankR(DT = DT25, sortBy = "abundance", dec = TRUE)
#' TFRankR(DT = DT25, sortBy = "abundance", dec = FALSE)
#' TFRankR(DT = DT25, sortBy = "score", dec = TRUE)
#' TFRankR(DT = DT25, sortBy = "score", dec = FALSE)
#' TFRankR(DT = DT25, sortBy = "species & Target", dec = FALSE, SPselect = "Human") # Ranks greatest number of species first, least number of Targets second.
#' TFRankR(DT = DT25, sortBy = "species & score & Target", dec = FALSE, SPselect = "Human", IUPACgreat = TRUE) # Ranks greatest number of species first, the greatest IUPAC consensus score second, and least number of Targets third.
#' TFRankR(DT = DT25, sortBy = "species & score & Target", dec = FALSE, SPselect = "Human", IUPACgreat = FALSE) # Ranks greatest number of species first, the greatest IUPAC consensus score second, and least number of Targets third.
#' TFRankR(DT = DT25, sortBy = "species & abundance", dec = TRUE, SPselect = "Human")
#' TFRankR(DT = DT25, sortBy = "species & abundance", dec = FALSE, SPselect = "Human")
TFRankR <- function(DT, sortBy, dec, SPselect, IUPACgreat){
if(sortBy == "species"){
#### rank each transcription factor by the species preservation at each Target
Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)
SP_RANK4 <- NULL
for(i in 1:length(Prom)){
PROM <- DT[DT$gene_symbol == Prom[i],]
PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,
SP_RANK2 <- PROM2[,.(Number_Species = length(Species)), by= Targeting_Factor]
# SP_RANK2 <- SP_RANK[order(SP_RANK$Number_Species, decreasing = dec),]
SP_RANK2$gene_symbol <- Prom[i]
SP_RANK4 <- rbind(SP_RANK4, SP_RANK2)
# SP_RANK4 <- SP_RANK4[order(SP_RANK4$Number_Species, decreasing = dec),]
}
SP_RANK4 <- SP_RANK4[order(SP_RANK4$Number_Species, decreasing = dec),]
return(SP_RANK4)
}
if(sortBy == "Target"){
#### Rank transcription factors by the number of Targets they are present for each species.
SPEC <- sort(DT[!duplicated(DT$Species),]$Species)
SPEC_RANK3 <- NULL
for(i in 1:length(SPEC)){
PROM <- DT[DT$Species == SPEC[i],]
PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,
SPEC_RANK2 <- PROM2[,.(Number_targets = length(gene_symbol)), by= Targeting_Factor]
# SPEC_RANK2 <- SPEC_RANK[order(SPEC_RANK$Number_targets, decreasing = dec),]
SPEC_RANK2$Species <- SPEC[i]
SPEC_RANK3 <- rbind(SPEC_RANK3, SPEC_RANK2)
}
SPEC_RANK3 <- SPEC_RANK3[order(SPEC_RANK3$Number_targets, decreasing = dec),]
return(SPEC_RANK3)
}
if(sortBy == "abundance"){
#### Rank transcription factors for each Target by the number of consensus sequences present at each Target for each Species.
SPEC <- sort(DT[!duplicated(DT$Species),]$Species)
Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)
HIT_RANK_TOT <- NULL
for(a in 1:length(SPEC)){
TX_TOT_Select <- DT[DT$Species == SPEC[a],]
HIT_RANK3 <- NULL
for(i in 1:length(Prom)){
PROM <- TX_TOT_Select[TX_TOT_Select$gene_symbol == Prom[i],]
PROM2 <- unique(PROM[,.(Number_Hits,Targeting_Factor, gene_symbol)]) #, Species
HIT_RANK2 <- PROM2[order(PROM2$Number_Hits, decreasing = dec),]
HIT_RANK3 <- rbind(HIT_RANK3, HIT_RANK2)
}
HIT_RANK3$Species <- SPEC[a]
HIT_RANK_TOT <- rbind(HIT_RANK_TOT, HIT_RANK3)
}
HIT_RANK_TOT <- HIT_RANK_TOT[order(HIT_RANK_TOT$Number_Hits, decreasing = dec),]
return(HIT_RANK_TOT)
}
if(sortBy == "score"){
#### Rank transcription factors for each Target by the IUPAC score at each Target for each Species.
SPEC <- sort(DT[!duplicated(DT$Species),]$Species)
Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)
SCO_RANK_TOT <- NULL
for(a in 1:length(SPEC)){
TX_TOT_Select <- DT[DT$Species == SPEC[a],]
SCO_RANK3 <- NULL
for(i in 1:length(Prom)){
PROM <- TX_TOT_Select[TX_TOT_Select$gene_symbol == Prom[i],]
PROM2 <- unique(PROM[,.(Score, Targeting_Factor, gene_symbol)]) #, Species
SCO_RANK2 <- PROM2[order(PROM2$Score, decreasing = dec),]
SCO_RANK3 <- rbind(SCO_RANK3, SCO_RANK2)
}
SCO_RANK3$Species <- SPEC[a]
SCO_RANK_TOT <- rbind(SCO_RANK_TOT, SCO_RANK3)
}
return(SCO_RANK_TOT)
}
if(sortBy == "species & Target"){
#### Rank transcription factors by the number of species they are present in for each Target and the number of Targets they are present in for a selected species (in my case Human)
#### Rank transcription factors by the number of Targets they are present for each species.
SPEC <- sort(DT[!duplicated(DT$Species),]$Species)
SPEC_RANK3 <- NULL
for(i in 1:length(SPEC)){
PROM <- DT[DT$Species == SPEC[i],]
PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,
SPEC_RANK <- PROM2[,.(Number_targets = length(gene_symbol)), by= Targeting_Factor]
SPEC_RANK2 <- SPEC_RANK[order(SPEC_RANK$Number_targets, decreasing = TRUE),]
SPEC_RANK2$Species <- SPEC[i]
SPEC_RANK3 <- rbind(SPEC_RANK3, SPEC_RANK2)
}
SPEC_RANK3
#### rank each transcription factor by the species preservation at each Target
Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)
SP_RANK4 <- NULL
for(i in 1:length(Prom)){
PROM <- DT[DT$gene_symbol == Prom[i],]
PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,
SP_RANK <- PROM2[,.(Number_Species = length(Species)), by= Targeting_Factor]
SP_RANK2 <- SP_RANK[order(SP_RANK$Number_Species, decreasing = TRUE),]
SP_RANK2$gene_symbol <- Prom[i]
SP_RANK4 <- rbind(SP_RANK4, SP_RANK2)
}
SP_RANK4
# Select species to merge by
SPEC_HUM <- SPEC_RANK3[SPEC_RANK3$Species == SPselect,]
# Merge the Species ranking and the number of Targets together.
SPECIES_PROMNUM <- merge(SP_RANK4, SPEC_HUM, by = "Targeting_Factor")
# Perform the ranking for each Target
TX <- SPECIES_PROMNUM[!duplicated(SPECIES_PROMNUM$gene_symbol),]$gene_symbol
SP_SPEC_RANK <- NULL
for(i in 1:length(TX)){
DT1 <- SPECIES_PROMNUM[SPECIES_PROMNUM$gene_symbol == TX[i],]
DT2 <- DT1[order(-DT1[,Number_Species], DT1[,Number_targets]),] # Ranking from the greatest number of Species and the least number of Targets
SP_SPEC_RANK <- rbind(SP_SPEC_RANK, DT2)
}
return(SP_SPEC_RANK)
}
if(sortBy == "species & score & Target"){
#### Rank transcription factors by the number of species they are present in for each Target and the number of Targets they are present in for a selected species, and the IUPAC score (in my case Human)
#### Rank transcription factors by the number of Targets they are present for each species.
SPEC <- DT[!duplicated(DT$Species),]$Species
SPEC_RANK3 <- NULL
for(i in 1:length(SPEC)){
PROM <- DT[DT$Species == SPEC[i],]
PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,
SPEC_RANK <- PROM2[,.(Number_targets = length(gene_symbol)), by= Targeting_Factor]
SPEC_RANK2 <- SPEC_RANK[order(SPEC_RANK$Number_targets, decreasing = dec),]
SPEC_RANK2$Species <- SPEC[i]
SPEC_RANK3 <- rbind(SPEC_RANK3, SPEC_RANK2)
}
SPEC_RANK3
#### rank each transcription factor by the species preservation at each Target
Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)
SP_RANK4 <- NULL
for(i in 1:length(Prom)){
PROM <- DT[DT$gene_symbol == Prom[i],]
PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,
SP_RANK <- PROM2[,.(Number_Species = length(Species)), by= Targeting_Factor]
SP_RANK2 <- SP_RANK[order(SP_RANK$Number_Species, decreasing = dec),]
SP_RANK2$gene_symbol <- Prom[i]
SP_RANK4 <- rbind(SP_RANK4, SP_RANK2)
}
SP_RANK4
#### Rank transcription factors for each Target by the IUPAC score at each Target for each Species.
SPEC <- DT[!duplicated(DT$Species),]$Species
Prom <- DT[!duplicated(DT$gene_symbol),]$gene_symbol
SCO_RANK_TOT <- NULL
for(a in 1:length(SPEC)){
TX_TOT_Select <- DT[DT$Species == SPEC[a],]
SCO_RANK3 <- NULL
for(i in 1:length(Prom)){
PROM <- TX_TOT_Select[TX_TOT_Select$gene_symbol == Prom[i],]
PROM2 <- unique(PROM[,.(Score,Targeting_Factor, gene_symbol)]) #, Species
SCO_RANK2 <- PROM2[order(PROM2$Score, decreasing = dec),]
SCO_RANK3 <- rbind(SCO_RANK3, SCO_RANK2)
}
SCO_RANK3$Species <- SPEC[a]
SCO_RANK_TOT <- rbind(SCO_RANK_TOT, SCO_RANK3)
}
SCO_RANK_TOT
# Select species to merge by
SPEC_HUM <- SPEC_RANK3[SPEC_RANK3$Species == SPselect,]
SCO_HUM <- SCO_RANK_TOT[SCO_RANK_TOT$Species == SPselect,]
# Remve duplicated entries for the transcription factors and Select either the consensus sequence with the maximum or minimum score
if(IUPACgreat == TRUE){
SCO_RANK_TOT2 <- as.data.table(SCO_HUM)
SCO_RANK_TOT2 <- SCO_RANK_TOT2[SCO_RANK_TOT2[, .I[which.max(Score)], by= Targeting_Factor]$V1] #### Only returns one transcription factor consensus sequence for each transcription factor.
SCO_RANK_TOT2$gene_symbol <- NULL
SCO_RANK_TOT2$Species <- NULL
}
if(IUPACgreat == FALSE){
SCO_RANK_TOT2 <- as.data.table(SCO_HUM)
SCO_RANK_TOT2 <- SCO_RANK_TOT2[SCO_RANK_TOT2[, .I[which.min(Score)], by= Targeting_Factor]$V1] #### Only returns one transcription factor consensus sequence for each transcription factor.
SCO_RANK_TOT2$gene_symbol <- NULL
SCO_RANK_TOT2$Species <- NULL
}
# Merge the Species ranking and the number of Targets together.
SPECIES_PROMNUM <- merge(SP_RANK4, SPEC_HUM, by = "Targeting_Factor")
# Merge the score ranking Species and Target ranking.
SCO_SPECIES_PRONUM <- merge(SPECIES_PROMNUM, SCO_RANK_TOT2, by = "Targeting_Factor")
#### Perform the final ranking
TX <- sort(SCO_SPECIES_PRONUM[!duplicated(SCO_SPECIES_PRONUM$gene_symbol),]$gene_symbol)
SP_SPEC_IU_RANK <- NULL
for(i in 1:length(TX)){
DT1 <- SCO_SPECIES_PRONUM[SCO_SPECIES_PRONUM$gene_symbol == TX[i],]
DT2 <- DT1[order(-DT1[,Number_Species], -DT1[,Score], DT1[,Number_targets]),] # Ranking from the greatest number of Species, the consensus sequence score, and the least number of Targets.
SP_SPEC_IU_RANK <- rbind(SP_SPEC_IU_RANK, DT2)
}
return(SP_SPEC_IU_RANK)
}
if(sortBy == "species & abundance"){
#### rank each transcription factor by the species preservation at each Target
Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)
SP_RANK4 <- NULL
for(i in 1:length(Prom)){
PROM <- DT[DT$gene_symbol == Prom[i],]
PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,
SP_RANK <- PROM2[,.(Number_Species = length(Species)), by= Targeting_Factor]
SP_RANK2 <- SP_RANK[order(SP_RANK$Number_Species, decreasing = dec),]
SP_RANK2$gene_symbol <- Prom[i]
SP_RANK4 <- rbind(SP_RANK4, SP_RANK2)
}
#### Rank transcription factors for each Target by the number of consensus sequences present at each Target for each Species.
SPEC <- sort(DT[!duplicated(DT$Species),]$Species)
Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)
HIT_RANK_TOT <- NULL
for(a in 1:length(SPEC)){
TX_TOT_Select <- DT[DT$Species == SPEC[a],]
HIT_RANK3 <- NULL
for(i in 1:length(Prom)){
PROM <- TX_TOT_Select[TX_TOT_Select$gene_symbol == Prom[i],]
PROM2 <- unique(PROM[,.(Number_Hits,Targeting_Factor, gene_symbol)]) #, Species
HIT_RANK2 <- PROM2[order(PROM2$Number_Hits, decreasing = dec),]
HIT_RANK3 <- rbind(HIT_RANK3, HIT_RANK2)
}
HIT_RANK3$Species <- SPEC[a]
HIT_RANK_TOT <- rbind(HIT_RANK_TOT, HIT_RANK3)
}
# Select species to merge by
SPEC_HUM <- HIT_RANK_TOT[HIT_RANK_TOT$Species == SPselect,]
# Merge the Species ranking and the number of Targets together.
SPECIES_PROMNUM <- merge(SP_RANK4, SPEC_HUM, by = c("Targeting_Factor", "gene_symbol"))
# SPECIES_PROMNUM <- unique(SPECIES_PROMNUM[,.(Targeting_Factor, gene_symbol, Number_Species, Number_Hits)])
#### Perform the final ranking
TX <- sort(SPECIES_PROMNUM[!duplicated(SPECIES_PROMNUM$gene_symbol),]$gene_symbol)
SP_SPEC_IU_RANK <- NULL
for(i in 1:length(TX)){
DT1 <- SPECIES_PROMNUM[SPECIES_PROMNUM$gene_symbol == TX[i],]
DT2 <- DT1[order(-DT1[,Number_Species], -DT1[,Number_Hits]),] # Ranking from the greatest number of Species, the consensus sequence score, and the least number of Targets.
SP_SPEC_IU_RANK <- rbind(SP_SPEC_IU_RANK, DT2)
}
SP_SPEC_IU_RANK
return(SP_SPEC_IU_RANK)
}
if(sortBy == "abundance & Target"){
#### Rank transcription factors for each Target by the number of consensus sequences present at each Target for each Species.
SPEC <- sort(DT[!duplicated(DT$Species),]$Species)
Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)
HIT_RANK_TOT <- NULL
for(a in 1:length(SPEC)){
TX_TOT_Select <- DT[DT$Species == SPEC[a],]
HIT_RANK3 <- NULL
for(i in 1:length(Prom)){
PROM <- TX_TOT_Select[TX_TOT_Select$gene_symbol == Prom[i],]
PROM2 <- unique(PROM[,.(Number_Hits,Targeting_Factor, gene_symbol)]) #, Species
HIT_RANK2 <- PROM2[order(PROM2$Number_Hits, decreasing = dec),]
HIT_RANK3 <- rbind(HIT_RANK3, HIT_RANK2)
}
HIT_RANK3$Species <- SPEC[a]
HIT_RANK_TOT <- rbind(HIT_RANK_TOT, HIT_RANK3)
}
HIT_RANK_TOT
#### Rank transcription factors by the number of Targets they are present for each species.
SPEC <- sort(DT[!duplicated(DT$Species),]$Species)
SPEC_RANK3 <- NULL
for(i in 1:length(SPEC)){
PROM <- DT[DT$Species == SPEC[i],]
PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,
SPEC_RANK <- PROM2[,.(Number_targets = length(gene_symbol)), by= Targeting_Factor]
SPEC_RANK2 <- SPEC_RANK[order(SPEC_RANK$Number_targets, decreasing = dec),]
SPEC_RANK2$Species <- SPEC[i]
SPEC_RANK3 <- rbind(SPEC_RANK3, SPEC_RANK2)
}
SPEC_RANK3
# Select species to merge by
HIT_HUM <- HIT_RANK_TOT[HIT_RANK_TOT$Species == SPselect,]
SPEC_HUM <- SPEC_RANK3[SPEC_RANK3$Species == SPselect,]
SPEC_HUM$Species <- NULL
# Merge the hit ranking and the number of Targets together.
SPECIES_PROMNUM <- merge(SPEC_HUM, HIT_HUM, by = c("Targeting_Factor"))
# SPECIES_PROMNUM <- unique(SPECIES_PROMNUM[,.(Targeting_Factor, gene_symbol, Number_Hits, Number_targets, Species)])
#### Perform the final ranking
TX <- sort(SPECIES_PROMNUM[!duplicated(SPECIES_PROMNUM$gene_symbol),]$gene_symbol)
SP_SPEC_IU_RANK <- NULL
for(i in 1:length(TX)){
DT1 <- SPECIES_PROMNUM[SPECIES_PROMNUM$gene_symbol == TX[i],]
DT2 <- DT1[order(-DT1[,Number_Hits], -DT1[,Number_targets]),] # Ranking from the greatest to least number of consensus sequences and the greatest to least number of targets.
SP_SPEC_IU_RANK <- rbind(SP_SPEC_IU_RANK, DT2)
}
return(SP_SPEC_IU_RANK)
}
if(sortBy == "species & abundance & Target"){
#### Rank targeting factors for each Target by the number of consensus sequences present at each Target for each Species.
SPEC <- sort(DT[!duplicated(DT$Species),]$Species)
Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)
HIT_RANK_TOT <- NULL
for(a in 1:length(SPEC)){
TX_TOT_Select <- DT[DT$Species == SPEC[a],]
HIT_RANK3 <- NULL
for(i in 1:length(Prom)){
PROM <- TX_TOT_Select[TX_TOT_Select$gene_symbol == Prom[i],]
PROM2 <- unique(PROM[,.(Number_Hits,Targeting_Factor, gene_symbol)]) #, Species
HIT_RANK2 <- PROM2[order(PROM2$Number_Hits, decreasing = TRUE),]
HIT_RANK3 <- rbind(HIT_RANK3, HIT_RANK2)
}
HIT_RANK3$Species <- SPEC[a]
HIT_RANK_TOT <- rbind(HIT_RANK_TOT, HIT_RANK3)
}
#### Rank Targeting factors by the number of Targets they are present for each species.
SPEC <- sort(DT[!duplicated(DT$Species),]$Species)
SPEC_RANK3 <- NULL
for(i in 1:length(SPEC)){
PROM <- DT[DT$Species == SPEC[i],]
PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,
SPEC_RANK <- PROM2[,.(Number_targets = length(gene_symbol)), by= Targeting_Factor]
SPEC_RANK2 <- SPEC_RANK[order(SPEC_RANK$Number_targets, decreasing = TRUE),]
SPEC_RANK2$Species <- SPEC[i]
SPEC_RANK3 <- rbind(SPEC_RANK3, SPEC_RANK2)
}
#### rank each Targeting factor by the species preservation at each Target
Prom <- sort(DT[!duplicated(DT$gene_symbol),]$gene_symbol)
SP_RANK4 <- NULL
for(i in 1:length(Prom)){
PROM <- DT[DT$gene_symbol == Prom[i],]
PROM2 <- unique(PROM[,.(Targeting_Factor, gene_symbol, Species)]) # Number_Hits,
SP_RANK <- PROM2[,.(Number_Species = length(Species)), by= Targeting_Factor]
SP_RANK2 <- SP_RANK[order(SP_RANK$Number_Species, decreasing = TRUE),]
SP_RANK2$gene_symbol <- Prom[i]
SP_RANK4 <- rbind(SP_RANK4, SP_RANK2)
}
# Select species to merge by
HIT_HUM <- HIT_RANK_TOT[HIT_RANK_TOT$Species == SPselect,]
SPEC_HUM <- SPEC_RANK3[SPEC_RANK3$Species == SPselect,]
SPEC_HUM$Species <- NULL
# Merge the hit ranking and the number of Targets together.
SPECIES_PROMNUM <- merge(SPEC_HUM, HIT_HUM, by = c("Targeting_Factor"))
# SPECIES_PROMNUM <- unique(SPECIES_PROMNUM[,.(Targeting_Factor, gene_symbol, Number_Hits, Number_targets, Species)])
# Merge the species ranking with the merges data.table
SPECIES_PROMNUM <- merge(SPECIES_PROMNUM, SP_RANK4, by = c("gene_symbol", "Targeting_Factor"))
SPECIES_PROMNUM <- unique(SPECIES_PROMNUM[,.(Targeting_Factor, gene_symbol, Number_Species, Number_Hits, Number_targets, Species)])
#### Perform the final ranking
TX <- sort(SPECIES_PROMNUM[!duplicated(SPECIES_PROMNUM$gene_symbol),]$gene_symbol)
SP_SPEC_IU_RANK <- NULL
for(i in 1:length(TX)){
DT1 <- SPECIES_PROMNUM[SPECIES_PROMNUM$gene_symbol == TX[i],]
DT2 <- DT1[order(-DT1[,Number_Species], -DT1[,Number_Hits], -DT1[,Number_targets]),] # Ranking from the greatest to least number of Species, the greatest to least number of consensus sequences, and the greatest to least number of targets.
SP_SPEC_IU_RANK <- rbind(SP_SPEC_IU_RANK, DT2)
}
return(SP_SPEC_IU_RANK)
}
}
#' GenomeInstaller
#'
#' Requires a character vector of specified genomes to install and installs them.
#' Each supported identifier is installed as the Bioconductor package named
#' "BSgenome." followed by the identifier, e.g. "Hsapiens.UCSC.hg38" installs
#' "BSgenome.Hsapiens.UCSC.hg38".
#'
#' Identifiers that are not in the supported list (including \code{NA}) are
#' reported with the message "Genome not available with this function" and
#' skipped; the remaining identifiers are still processed. The installer is
#' only resolved (and the network only touched) once the first supported
#' identifier is encountered. \code{BiocManager::install} is used when the
#' BiocManager package is installed; otherwise the legacy \code{biocLite}
#' script is sourced as a fallback.
#'
#' @param genomes A character vector indicating which genomes to install of the following options:
#' "Hsapiens.UCSC.hg38"
#' "Mmusculus.UCSC.mm9"
#' "Mmusculus.UCSC.mm10"
#' "Rnorvegicus.UCSC.rn6"
#' "Alyrata.JGI.v1"
#' "Amellifera.BeeBase.assembly4"
#' "Athaliana.TAIR.TAIR9"
#' "Btaurus.UCSC.bosTau8"
#' "Celegans.UCSC.ce2"
#' "Celegans.UCSC.ce11"
#' "Cfamiliaris.UCSC.canFam3"
#' "Dmelanogaster.UCSC.dm3"
#' "Dmelanogaster.UCSC.dm6"
#' "Drerio.UCSC.danRer10"
#' "Gaculeatus.UCSC.gasAcu1"
#' "Ggallus.UCSC.galGal4"
#' "Mfascicularis.NCBI.5.0"
#' "Mfuro.UCSC.musFur1"
#' "Mmulatta.UCSC.rheMac3"
#' "Osativa.MSU.MSU7"
#' "Ptroglodytes.UCSC.panTro3"
#' "Scerevisiae.UCSC.sacCer3"
#' "Sscrofa.UCSC.susScr3"
#' "Tguttata.UCSC.taeGut2"
#' @return Called for its side effect of installing the specified genomes; returns \code{NULL} invisibly.
#' @author Brendan Gongol
#' @export
#' @examples
#' \dontrun{
#' GenomeInstaller(c("Hsapiens.UCSC.hg38","Mmusculus.UCSC.mm9"))
#' GenomeInstaller("Apple")
#'
#' # The following code installs all available genomes with this function and
#' # attempts to install one genome not available ("APPLE"):
#' ToInstall <- c("Hsapiens.UCSC.hg38","Mmusculus.UCSC.mm9"
#' ,"Mmusculus.UCSC.mm10","Rnorvegicus.UCSC.rn6"
#' ,"Alyrata.JGI.v1","Amellifera.BeeBase.assembly4"
#' ,"Athaliana.TAIR.TAIR9","Btaurus.UCSC.bosTau8"
#' ,"Celegans.UCSC.ce2","Celegans.UCSC.ce11"
#' ,"Cfamiliaris.UCSC.canFam3","Dmelanogaster.UCSC.dm3"
#' ,"Dmelanogaster.UCSC.dm6"
#' ,"Drerio.UCSC.danRer10","Gaculeatus.UCSC.gasAcu1"
#' ,"Ggallus.UCSC.galGal4","Mfascicularis.NCBI.5.0"
#' ,"Mfuro.UCSC.musFur1","Mmulatta.UCSC.rheMac3"
#' ,"Osativa.MSU.MSU7","Ptroglodytes.UCSC.panTro3"
#' ,"Scerevisiae.UCSC.sacCer3","Sscrofa.UCSC.susScr3"
#' ,"Tguttata.UCSC.taeGut2", "APPLE")
#' GenomeInstaller(ToInstall)
#' }
GenomeInstaller <- function(genomes){
  # Identifiers this function can install. The Bioconductor package name is
  # always the identifier prefixed with "BSgenome.".
  supported <- c(
    "Hsapiens.UCSC.hg38", "Mmusculus.UCSC.mm9", "Mmusculus.UCSC.mm10",
    "Rnorvegicus.UCSC.rn6", "Alyrata.JGI.v1", "Amellifera.BeeBase.assembly4",
    "Athaliana.TAIR.TAIR9", "Btaurus.UCSC.bosTau8", "Celegans.UCSC.ce2",
    "Celegans.UCSC.ce11", "Cfamiliaris.UCSC.canFam3", "Dmelanogaster.UCSC.dm3",
    "Dmelanogaster.UCSC.dm6", "Drerio.UCSC.danRer10", "Gaculeatus.UCSC.gasAcu1",
    "Ggallus.UCSC.galGal4", "Mfascicularis.NCBI.5.0", "Mfuro.UCSC.musFur1",
    "Mmulatta.UCSC.rheMac3", "Osativa.MSU.MSU7", "Ptroglodytes.UCSC.panTro3",
    "Scerevisiae.UCSC.sacCer3", "Sscrofa.UCSC.susScr3", "Tguttata.UCSC.taeGut2"
  )
  # Resolved on first use so that a call with nothing installable never
  # touches the network.
  installer <- NULL
  # seq_along() keeps the loop from running on empty input (1:length() would
  # iterate over c(1, 0)).
  for(i in seq_along(genomes)){
    genome <- as.character(genomes[i])
    # The is.na() guard keeps a missing entry from aborting the whole run.
    if(!is.na(genome) && genome %in% supported){
      if(is.null(installer)){
        if(requireNamespace("BiocManager", quietly = TRUE)){
          installer <- BiocManager::install
        } else {
          # Legacy fallback. source() evaluates in the global environment,
          # which is where biocLite() ends up defined.
          source("https://bioconductor.org/biocLite.R")
          installer <- biocLite
        }
      }
      installer(paste0("BSgenome.", genome))
    } else {
      print("Genome not available with this function")
    }
  }
  invisible(NULL)
}
#' GenomeLoader
#'
#' Requires a character vector of specified genomes to load and loads them.
#' Each supported identifier is attached with \code{library()} as the package
#' named "BSgenome." followed by the identifier, e.g. "Hsapiens.UCSC.hg38"
#' attaches "BSgenome.Hsapiens.UCSC.hg38". The BSgenome package itself is
#' attached first whenever at least one identifier is supplied.
#'
#' Identifiers that are not in the supported list (including \code{NA}) are
#' reported with the message "Genome not available with this function" and
#' skipped; the remaining identifiers are still processed. An empty
#' \code{genomes} vector attaches nothing.
#'
#' @param genomes A character vector indicating which genomes to load of the following options:
#' "Hsapiens.UCSC.hg38"
#' "Mmusculus.UCSC.mm9"
#' "Mmusculus.UCSC.mm10"
#' "Rnorvegicus.UCSC.rn6"
#' "Alyrata.JGI.v1"
#' "Amellifera.BeeBase.assembly4"
#' "Athaliana.TAIR.TAIR9"
#' "Btaurus.UCSC.bosTau8"
#' "Celegans.UCSC.ce2"
#' "Celegans.UCSC.ce11"
#' "Cfamiliaris.UCSC.canFam3"
#' "Dmelanogaster.UCSC.dm3"
#' "Dmelanogaster.UCSC.dm6"
#' "Drerio.UCSC.danRer10"
#' "Gaculeatus.UCSC.gasAcu1"
#' "Ggallus.UCSC.galGal4"
#' "Mfascicularis.NCBI.5.0"
#' "Mfuro.UCSC.musFur1"
#' "Mmulatta.UCSC.rheMac3"
#' "Osativa.MSU.MSU7"
#' "Ptroglodytes.UCSC.panTro3"
#' "Scerevisiae.UCSC.sacCer3"
#' "Sscrofa.UCSC.susScr3"
#' "Tguttata.UCSC.taeGut2"
#' @return Called for its side effect of attaching the specified genomes; returns \code{NULL} invisibly.
#' @author Brendan Gongol
#' @export
#' @examples
#' \dontrun{
#' GenomeLoader(c("Hsapiens.UCSC.hg38","Mmusculus.UCSC.mm9"))
#' GenomeLoader("Apple")
#'
#' # The following code loads all available genomes with this function and
#' # attempts to load one genome not available ("APPLE"):
#' ToLoad <- c("Hsapiens.UCSC.hg38","Mmusculus.UCSC.mm9"
#' ,"Mmusculus.UCSC.mm10","Rnorvegicus.UCSC.rn6"
#' ,"Alyrata.JGI.v1","Amellifera.BeeBase.assembly4"
#' ,"Athaliana.TAIR.TAIR9","Btaurus.UCSC.bosTau8"
#' ,"Celegans.UCSC.ce2","Celegans.UCSC.ce11"
#' ,"Cfamiliaris.UCSC.canFam3","Dmelanogaster.UCSC.dm3"
#' ,"Dmelanogaster.UCSC.dm6"
#' ,"Drerio.UCSC.danRer10","Gaculeatus.UCSC.gasAcu1"
#' ,"Ggallus.UCSC.galGal4","Mfascicularis.NCBI.5.0"
#' ,"Mfuro.UCSC.musFur1","Mmulatta.UCSC.rheMac3"
#' ,"Osativa.MSU.MSU7","Ptroglodytes.UCSC.panTro3"
#' ,"Scerevisiae.UCSC.sacCer3","Sscrofa.UCSC.susScr3"
#' ,"Tguttata.UCSC.taeGut2", "APPLE")
#' GenomeLoader(ToLoad)
#' }
GenomeLoader <- function(genomes){
  # Nothing requested: attach nothing. (1:length() on empty input would have
  # iterated over c(1, 0) and failed on the NA comparison.)
  if(length(genomes) == 0L){
    return(invisible(NULL))
  }
  library(BSgenome)
  # Identifiers this function can load. The package name is always the
  # identifier prefixed with "BSgenome.".
  supported <- c(
    "Hsapiens.UCSC.hg38", "Mmusculus.UCSC.mm9", "Mmusculus.UCSC.mm10",
    "Rnorvegicus.UCSC.rn6", "Alyrata.JGI.v1", "Amellifera.BeeBase.assembly4",
    "Athaliana.TAIR.TAIR9", "Btaurus.UCSC.bosTau8", "Celegans.UCSC.ce2",
    "Celegans.UCSC.ce11", "Cfamiliaris.UCSC.canFam3", "Dmelanogaster.UCSC.dm3",
    "Dmelanogaster.UCSC.dm6", "Drerio.UCSC.danRer10", "Gaculeatus.UCSC.gasAcu1",
    "Ggallus.UCSC.galGal4", "Mfascicularis.NCBI.5.0", "Mfuro.UCSC.musFur1",
    "Mmulatta.UCSC.rheMac3", "Osativa.MSU.MSU7", "Ptroglodytes.UCSC.panTro3",
    "Scerevisiae.UCSC.sacCer3", "Sscrofa.UCSC.susScr3", "Tguttata.UCSC.taeGut2"
  )
  for(i in seq_along(genomes)){
    genome <- as.character(genomes[i])
    # The is.na() guard keeps a missing entry from aborting the whole run.
    if(!is.na(genome) && genome %in% supported){
      # character.only = TRUE lets library() take the computed package name.
      library(paste0("BSgenome.", genome), character.only = TRUE)
    } else {
      print("Genome not available with this function")
    }
  }
  invisible(NULL)
}
#' ChromosomeSeqCompileR
#'
#' Requires a data table with three columns labeled: "transcription_start_site", "chromosome_name", "Scientific_Name".
#' Requires the specification of which genomes to obtain the sequences from.
#' Requires the specification of how far upstream and downstream of the transcription start site to return sequences for.
#'
#' @param distance specifies the number of nucleotides upstream and downstream to return from the transcript start site.
#' @param DT A data table containing columns labeled: "transcription_start_site", "chromosome_name", "Scientific_Name".
#' The species labels in "Scientific_Name" are used to select the rows that belong to each queried genome, so they must be spelled exactly as one of the following:
#' "Homo_sapiens"
#' "Mus_musculus"
#' "Rattus_norvegicus"
#' "Arabidopsis_lyrata"
#' "Apis_mellifera"
#' "Arabidopsis_thaliana"
#' "Bos_taurus"
#' "Caenorhabditis_elegans"
#' "Canis_familiaris"
#' "Drosophila_melanogaster"
#' "Danio_rerio"
#' "Gasterosteus_aculeatus"
#' "Gallus_gallus"
#' "Macaca_fascicularis"
#' "Mustela_putorius_furo"
#' "Macaca_mulatta"
#' "Oryza_sativa"
#' "Pan_troglodytes"
#' "Saccharomyces_cerevisiae"
#' "Sus_scrofa"
#' "Taeniopygia_guttata"
#' @param Spec a character vector used to designate what genomes to return sequences from to annotate DT.
#' available genomes:
#' "Hsapiens.UCSC.hg38"
#' "Mmusculus.UCSC.mm10"
#' "Rnorvegicus.UCSC.rn6"
#' "Alyrata.JGI.v1"
#' "Amellifera.BeeBase.assembly4"
#' "Athaliana.TAIR.TAIR9"
#' "Btaurus.UCSC.bosTau8"
#' "Celegans.UCSC.ce11"
#' "Cfamiliaris.UCSC.canFam3"
#' "Dmelanogaster.UCSC.dm6"
#' "Drerio.UCSC.danRer10"
#' "Gaculeatus.UCSC.gasAcu1"
#' "Ggallus.UCSC.galGal4"
#' "Mfascicularis.NCBI.5.0"
#' "Mfuro.UCSC.musFur1"
#' "Mmulatta.UCSC.rheMac3"
#' "Osativa.MSU.MSU7"
#' "Ptroglodytes.UCSC.panTro3"
#' "Scerevisiae.UCSC.sacCer3"
#' "Sscrofa.UCSC.susScr3"
#' "Tguttata.UCSC.taeGut2"
#' @importFrom BSgenome getSeq
#' @importFrom data.table data.table
#' @return The specified genomic sequences
#' @author Brendan Gongol
#' @export
#' @examples
#' setwd("C:/Users/Brendan/Desktop/oxidative stress surfactant bioinformatics")
#' library(BSgenome)
#' library(data.table)
#' library(EntroSolve)
#' Sorted_Surfactant <- fread("2 Longest Variant Surfactant Transcripts.xls")
#' #### Relabel chromosome designations
#' Sorted_Surfactant <- ChromLabel(Sorted_Surfactant)
#' #### Remove erroneous chromosome labels
#' Sorted_Surfactant <- Sorted_Surfactant[!(Sorted_Surfactant$chromosome_name == "CHR_MGchr184_PATCH"),]
#' Sorted_Surfactant <- Sorted_Surfactant[!(Sorted_Surfactant$chromosome_name == "Z"),]
#' Sorted_Surfactant <- Sorted_Surfactant[!(Sorted_Surfactant$chromosome_name == "AADN030chr10820.1"),]
#' Sorted_Surfactant <- Sorted_Surfactant[!(Sorted_Surfactant$chromosome_name == "chr1_random"),]
#' #### Load available genomes ####
#' ToLoad <- c("Hsapiens.UCSC.hg38","Mmusculus.UCSC.mm10","Rnorvegicus.UCSC.rn6","Btaurus.UCSC.bosTau8","Cfamiliaris.UCSC.canFam3",
#' "Dmelanogaster.UCSC.dm6","Drerio.UCSC.danRer10","Ggallus.UCSC.galGal4","Mfuro.UCSC.musFur1","Mmulatta.UCSC.rheMac3"
#' ,"Ptroglodytes.UCSC.panTro3","Sscrofa.UCSC.susScr3","Tguttata.UCSC.taeGut2")
#' GenomeLoader(ToLoad)
#'
#' genome <- c("Hsapiens.UCSC.hg38", "Mmusculus.UCSC.mm10", "Rnorvegicus.UCSC.rn6", "Btaurus.UCSC.bosTau8",
#' "Cfamiliaris.UCSC.canFam3", "Dmelanogaster.UCSC.dm6", "Drerio.UCSC.danRer10",
#' "Ggallus.UCSC.galGal4", "Ptroglodytes.UCSC.panTro3", "Tguttata.UCSC.taeGut2")
#' ChromosomeSeqCompileR(DT = Sorted_Surfactant, Spec = genome, distance = 2000)
#'
#' genome2 <- c("Hsapiens.UCSC.hg38", "Mmusculus.UCSC.mm10", "Rnorvegicus.UCSC.rn6", "Btaurus.UCSC.bosTau8",
#' "Cfamiliaris.UCSC.canFam3", "Dmelanogaster.UCSC.dm6", "Drerio.UCSC.danRer10",
#' "Ggallus.UCSC.galGal4", "Ptroglodytes.UCSC.panTro3", "Tguttata.UCSC.taeGut2", "Hsapiens.UCSC.hg38")
#' one <- ChromosomeSeqCompileR(DT = Sorted_Surfactant, Spec = genome2, distance = 2000)
#'
#' genome3 <- c("Hsapiens.UCSC.hg38", "Mmusculus.UCSC.mm10", "Rnorvegicus.UCSC.rn6", "Btaurus.UCSC.bosTau8",
#' "Cfamiliaris.UCSC.canFam3", "Dmelanogaster.UCSC.dm6", "Drerio.UCSC.danRer10",
#' "Ggallus.UCSC.galGal4", "Ptroglodytes.UCSC.panTro3", "Tguttata.UCSC.taeGut2", "HUMAN")
#' one <- ChromosomeSeqCompileR(DT = Sorted_Surfactant, Spec = genome3, distance = 2000)
ChromosomeSeqCompileR <- function(DT, Spec, distance){
if(sum(duplicated(sub("\\..*", "", Spec))) > 0){
print("You have entered a duplicated genome selection. Please remove duplicated species genome.")
}
else{
ChromSeqConvert <- function(seqs){
sequences <- NULL
for(i in 1:length(seqs)){
sequences[i] <- toString(seqs[i])
}
return(sequences)
}
ChromSeqConvert2 <- function(seqs){
sequences <- toString(seqs)
return(sequences)
}
pb <- txtProgressBar(min = 0, max = length(Spec), style = 3)
CompiledSeqs <- NULL
for(i in 1:length(Spec)){
if(Spec[i] == "Hsapiens.UCSC.hg38"){
genomehum <- BSgenome.Hsapiens.UCSC.hg38
SurfTrans <- DT[DT$Scientific_Name == "Homo_sapiens"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomehum, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Mmusculus.UCSC.mm10"){
genomemou <- BSgenome.Mmusculus.UCSC.mm10
SurfTrans <- DT[DT$Scientific_Name == "Mus_musculus"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomemou, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Rnorvegicus.UCSC.rn6"){
genomerat <- BSgenome.Rnorvegicus.UCSC.rn6
SurfTrans <- DT[DT$Scientific_Name == "Rattus_norvegicus"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomerat, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Alyrata.JGI.v1"){
genomeAly <- BSgenome.Alyrata.JGI.v1
SurfTrans <- DT[DT$Scientific_Name == "Arabidopsis_lyrata"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeAly, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Amellifera.BeeBase.assembly4"){
genomeAme <- BSgenome.Amellifera.BeeBase.assembly4
SurfTrans <- DT[DT$Scientific_Name == "Apis_mellifera"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeAme, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Athaliana.TAIR.TAIR9"){
genomeAth <- BSgenome.Athaliana.TAIR.TAIR9
SurfTrans <- DT[DT$Scientific_Name == "Arabidopsis_thaliana"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeAth, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Btaurus.UCSC.bosTau8"){
genomeBta <- BSgenome.Btaurus.UCSC.bosTau8
SurfTrans <- DT[DT$Scientific_Name == "Bos_taurus"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeBta, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Celegans.UCSC.ce11"){
genomeCel <- BSgenome.Celegans.UCSC.ce11
SurfTrans <- DT[DT$Scientific_Name == "Caenorhabditis_elegans"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeCel, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Cfamiliaris.UCSC.canFam3"){
genomeCfa <- BSgenome.Cfamiliaris.UCSC.canFam3
SurfTrans <- DT[DT$Scientific_Name == "Canis_familiaris"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeCfa, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Dmelanogaster.UCSC.dm6"){
genomeDme <- BSgenome.Dmelanogaster.UCSC.dm6
SurfTrans <- DT[DT$Scientific_Name == "Drosophila_melanogaster"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeDme, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Drerio.UCSC.danRer10"){
genomeDre <- BSgenome.Drerio.UCSC.danRer10
SurfTrans <- DT[DT$Scientific_Name == "Danio_rerio"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeDre, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Gaculeatus.UCSC.gasAcu1"){
genomeGac <- BSgenome.Gaculeatus.UCSC.gasAcu1
SurfTrans <- DT[DT$Scientific_Name == "Gasterosteus_aculeatus"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeGac, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Ggallus.UCSC.galGal4"){
genomeGga <- BSgenome.Ggallus.UCSC.galGal4
SurfTrans <- DT[DT$Scientific_Name == "Gallus_gallus"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeGga, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Mfascicularis.NCBI.5.0"){
genomeMfa <- BSgenome.Mfascicularis.NCBI.5.0
SurfTrans <- DT[DT$Scientific_Name == "Macaca_fascicularis"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeMfa, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Mfuro.UCSC.musFur1"){
genomeMfu <- BSgenome.Mfuro.UCSC.musFur1
SurfTrans <- DT[DT$Scientific_Name == "Mustela_putorius_furo"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeMfu, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Mmulatta.UCSC.rheMac3"){
genomeMmu <- BSgenome.Mmulatta.UCSC.rheMac3
SurfTrans <- DT[DT$Scientific_Name == "Macaca_mulatta"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeMmu, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Osativa.MSU.MSU7"){
genomeOsa <- BSgenome.Osativa.MSU.MSU7
SurfTrans <- DT[DT$Scientific_Name == "Oryza_sativa"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeOsa, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Ptroglodytes.UCSC.panTro3"){
genomePtr <- BSgenome.Ptroglodytes.UCSC.panTro3
SurfTrans <- DT[DT$Scientific_Name == "Pan_troglodytes"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomePtr, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Scerevisiae.UCSC.sacCer3"){
genomeSce <- BSgenome.Scerevisiae.UCSC.sacCer3
SurfTrans <- DT[DT$Scientific_Name == "Saccharomyces_cerevisiae"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeSce, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Sscrofa.UCSC.susScr3"){
genomeSsc <- BSgenome.Sscrofa.UCSC.susScr3
SurfTrans <- DT[DT$Scientific_Name == "Sus_scrofa"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeSsc, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else if(Spec[i] == "Tguttata.UCSC.taeGut2"){
genomeTgu <- BSgenome.Tguttata.UCSC.taeGut2
SurfTrans <- DT[DT$Scientific_Name == "Taeniopygia_guttata"]
STA <- SurfTrans$transcription_start_site - distance
END <- SurfTrans$transcription_start_site + distance
CHR <- SurfTrans$chromosome_name
tryCatch({
seqs <- getSeq(genomeTgu, CHR, start=STA, end=END)
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
if(nrow(SurfTrans) > 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
else if(nrow(SurfTrans) == 1){
SurfTrans$Genome_Sequence <- ChromSeqConvert2(seqs)
CompiledSeqs <- rbind(CompiledSeqs, SurfTrans)
}
}
else{
print("At least one genome you selected is not available with this function")
}
setTxtProgressBar(pb, i)
}
close(pb)
return(CompiledSeqs)
}
}
#' SequenceSiftR
#'
#' Computes, for every row of a table, the fraction of characters in the "Sequence" column that are "N",
#' and either reports that fraction, filters rows by it, or both.
#'
#' @param DT a data table (or data frame) containing a column labeled "Sequence" housing the DNA sequences to analyze.
#' @param Percent a numerical value between 0 and 1 giving the cutoff for the fraction of "N"'s in the DNA sequence.
#' Rows are kept only when their fraction of "N"'s is strictly less than \code{Percent}. Only needed when
#' \code{output} is "remove" or "return_remove".
#' @param output a single character string, either "return", "remove", or "return_remove". If "return", will add an additional
#' column ("Percent_N") to the data table holding the fraction (0 to 1) of "N"'s in the DNA string. If "remove", will return the
#' data table after removing the rows whose sequences have a fraction of "N"'s at or above the specified "Percent" cutoff.
#' If "return_remove", will add the "Percent_N" column and also remove the rows whose sequences have a fraction of "N"'s
#' at or above the specified "Percent" cutoff. Any other value is an error.
#' @return The data table as specified by the output argument. Rows whose fraction of "N"'s cannot be computed
#' (missing or empty sequences) are dropped by "remove" and "return_remove".
#' @author Brendan Gongol
#' @importFrom stringr str_count
#' @export
#' @examples
#' Sequence <- c("AAGCTAAGCTAAGCTGCGCAATTTTTGTATTTTGTTTAAACAGAATCCTCAAGGGAACATCATCCTCAGTTCTTTTTGTGTATTAGCTCAGATTTTCCAGCTGTTTTTAAAGCT",
#' "CTGTTTCGAGCCTGAATCTCGATCGCTCGCGCTAGACAGCTCGACGCACTTTTCAGCAGGAGCCTG",
#' "TCAGCAGATAGCGCTCGATACAGCTCGACAGCTCTTGCTGTATTGTGTG",
#' "TTGCTGTATTGTGTGATCCTCGATACAGGTATTTTCTGAGCCTGATAGCTAGCTTTGCTGTATTGTGTG",
#' "AAAAAATTTTTTCCCCCCGGGGGG", "NNTGCTAGCNNNACATCGCTACNNCTAGATCGAT", "NNNNNNAGCTNNNNNNAGCTGNNNNNACNNNNN", "NNNNNNNNNANNNNNNNNNTNNNNNNNNNC")
#' gene_symbol <- c("AKT", "PI3K", "SREBP", "FOXO", "ABCA1", "Caspase-1", "PIGPEN", "SNAIL")
#' chromosome_name <- c("1", "5", "10", "X", "Y", "2", "3", "4")
#' chromo <- data.frame(cbind(Sequence, gene_symbol, chromosome_name))
#' chromo$Sequence <- as.character(chromo$Sequence)
#' chromo$chromosome_name <- as.character(chromo$chromosome_name)
#' chromo
#'
#' SequenceSiftR(chromo, output = "return")
#' SequenceSiftR(chromo, Percent = 0.49, output = "remove")
#' SequenceSiftR(chromo, Percent = 0.49, output = "return_remove")
SequenceSiftR <- function(DT, Percent, output){
  # Reject unknown modes up front instead of silently falling through and returning NULL.
  valid_outputs <- c("return", "remove", "return_remove")
  if(!is.character(output) || length(output) != 1L || !(output %in% valid_outputs)){
    stop("'output' must be one of \"return\", \"remove\", or \"return_remove\"", call. = FALSE)
  }

  seqs <- DT$Sequence
  if(is.null(seqs)){
    stop("'DT' must contain a column labeled \"Sequence\"", call. = FALSE)
  }
  # Factor columns (e.g. from data.frame(cbind(...)) on older R) are read by their labels.
  seqs <- as.character(seqs)

  # Vectorised counts: total characters, and how many of them are "N".
  # Both are zero-length for zero-row input, so no index loop is needed.
  total_chars <- nchar(seqs)
  n_chars <- total_chars - nchar(gsub("N", "", seqs, fixed = TRUE))
  frac_n <- n_chars / total_chars

  result <- DT
  if(output != "remove"){
    result$Percent_N <- frac_n
  }
  if(output == "return"){
    return(result)
  }

  # which() drops NA/NaN comparisons (missing or empty sequences) rather than
  # letting them index an all-NA row into a data.frame; drop = FALSE keeps a
  # one-column data.frame from collapsing to a bare vector.
  keep <- which(frac_n < Percent)
  result[keep, , drop = FALSE]
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.