toast: Transcriptome Ortholog Alignment Sequence Tools

Documented in ToastGoSlim

#' Assign Go_Slim IDs to BUSCO IDs
#'
#' Reads a table of Gene Ontology identifiers and generalizes each entry into
#'         as few GO slim terms as possible for one of three perspectives:
#'         Molecular Function, Biological Process or Cellular Component.
#'         Each input row is mapped onto the GO slim on its own and the
#'         per-term counts are returned as one column of a matrix.
#' @author Dustin J Wcisel, \email{djwcisel@@ncsu.edu}
#' @author James Thomas Howard, \email{jthowar3@@ncsu.edu}
#' @author Jeffrey A Yoder, \email{jayoder@@ncsu.edu}
#' @author Alex Dornburg, \email{dornburgalex@@gmail.com}
#' @keywords toast missing transcript sequence DNA phylogeny fasta busco ortholog gene ontology go slim
#' @param orthogroup_info path to a comma-separated file with a header row and a
#'            \code{term_id} column holding the GO identifiers to be slimmed
#' @param obo path to the desired ontology information (.obo file), downloaded from
#'            http://geneontology.org/docs/download-ontology/#go_obo_and_owl
#'            these are consistently updated so make sure to grab the newest
#'            we recommend the GO slim AGR subset (goslim_agr.obo) which can be obtained using the command
#'            wget http://current.geneontology.org/ontology/subsets/goslim_agr.obo
#' @param perspective options are BP (biological process), MF (molecular function), or CC (cellular component)
#' @return a matrix with one row per GO slim term (row names are the slim term
#'            descriptions) and one column per input row (column names are the
#'            \code{term_id} values). Rows with a GO identifier come first and hold the
#'            slim counts; rows whose \code{term_id} is blank or missing are appended
#'            at the end as all-zero columns.
#' @import GOstats GSEABase BiocManager
#' @export
#' @examples
#' \dontrun{
#' BP <- ToastGoSlim(orthogroup_info = "path/to/go_terms.csv", obo = "path/to/goslim_agr.obo", perspective = "BP")
#' }

ToastGoSlim <- function(orthogroup_info, obo, perspective = "BP"){
    perspective <- match.arg(perspective, c("BP", "MF", "CC"))

    # Read the table the caller asked for (previously a fixed file in
    # ~/Downloads was read and the argument was ignored).
    # NOTE(review): earlier documentation described this input as the orthoDB
    # orthogroup_info file; the code reads a CSV with a `term_id` column --
    # confirm which input format is intended.
    orthogroup_info <- read.csv(orthogroup_info, header = TRUE,
                                stringsAsFactors = FALSE)
    if (!"term_id" %in% colnames(orthogroup_info)) {
        stop("`orthogroup_info` must contain a `term_id` column", call. = FALSE)
    }
    n_rows <- nrow(orthogroup_info)
    if (n_rows == 0) {
        stop("`orthogroup_info` contains no rows", call. = FALSE)
    }

    # The slim ontology does not change between rows, so load it only once,
    # and from the path supplied in `obo`.
    slim <- getOBOCollection(obo)

    pb7 <- txtProgressBar(min = 0, max = n_rows, style = 3)
    # Close the bar on every exit path, including errors.
    on.exit(close(pb7), add = TRUE)

    ids <- trimws(as.character(orthogroup_info[["term_id"]]))
    has_go <- !is.na(ids) & nzchar(ids) # rows that carry a GO identifier

    # Function-local, preallocated storage: no exists() look-ups that could
    # pick up same-named objects from the calling or global environment.
    counts <- vector("list", n_rows)
    slim_terms <- NULL
    for (i in seq_len(n_rows)) {
        if (has_go[i]) {
            # Slim only this row's identifier so the column labelled with it
            # really describes it.
            slimmed <- goSlim(GOCollection(ids[i]), slim, perspective)
            counts[[i]] <- slimmed[, 1]
            if (is.null(slim_terms)) {
                slim_terms <- as.character(slimmed$Term)
            }
        }
        setTxtProgressBar(pb7, i)
    }

    if (is.null(slim_terms)) {
        stop("no GO identifiers found in the `term_id` column", call. = FALSE)
    }

    # One column of slim counts per row that had a GO identifier; do.call(cbind)
    # yields a matrix even when there is a single such row.
    appended <- do.call(cbind, counts[has_go])
    colnames(appended) <- ids[has_go]

    # Rows without a GO identifier are merged back in as all-zero columns.
    empty <- ids[!has_go]
    empty_matrix <- matrix(0, nrow = length(slim_terms), ncol = length(empty))
    colnames(empty_matrix) <- empty

    merger <- cbind(appended, empty_matrix) #this is the desired matrix
    row.names(merger) <- slim_terms
    merger
}