orthologr: Comparative Genomics with R

Documented in orthofinder2

#' @title Interface function to Orthofinder2
#' @description Run Orthofinder2 from R.
#' @param proteome_folder file path to a folder storing the proteome sequences of the species for which orthology inference shall be performed.
#' @param use_existing_output a logical value indicating whether or not an existing \code{Orthofinder2}
#' output folder shall be used fo further import and processing. If \code{use_existing_output = TRUE}
#' is selected then please specify the file path to the in \code{proteome_folder} where the \code{Orthofinder2} output folder can be found.
#' @param import_type type of \code{Orthofinder2} output that shall be imported after running \code{Orthofinder2}.
#' Options are:
#' \itemize{
#' \item \code{import_type = "orthogroups_core"}
#' \item \code{import_type = "orthogroups_pairwise"}
#' \item \code{import_type = ""}
#' \item \code{import_type = ""}
#' }
#' @param comp_cores number of cores that shall be used for parallel processing. Default is \code{cores = 1}.
#' @param of_path a character string specifying the path to the locally installed \code{orthofinder} executable.
#' A possible specification could be \code{of_path = "/opt/miniconda3/bin/"} which internally will translate to
#' \code{/opt/miniconda3/bin/orthofinder}. The default is \code{of_path = NULL} which means that orthofinder assumes
#' users have their \code{orthofinder} executable stored at \code{/opt/miniconda3/bin/orthofinder}.
#' @author Hajk-Georg Drost
#' @note This function assumes that users have \code{OrthoFinder} installed via \code{miniconda} and stored at \code{~/opt/miniconda3/bin/}.
#' In addition, DIAMOND needs to be installed as executable tool (/usr/local/bin).
#' @examples \dontrun{
#' # specify species names
#' orgs <- c("Arabidopsis lyrata",
#'           "Capsella rubella", "Solanum lycopersicum")
#' # download proteome files for all species
#' biomartr::getProteomeSet(db = "refseq", organisms = orgs, path = "of_proteomes")
#' # download annotation files for all species
#' biomartr::getGFFSet(db = "refseq", organisms = orgs, path = "of_gff")
#' # select longest splice variant per gene locus
#' retrieve_longest_isoforms_all(proteome_folder = "of_proteomes",
#'                               annotation_folder = "of_gff",
#'                               annotation_format = "gff",
#'                               output_folder = "of_proteomes_longest_sv")
#' # run orthofinder2 to infer ortho groups for the specified species
#' orthofinder2(proteome_folder = "of_proteomes_longest_sv", comp_cores = 4)
#' }
#' @export

orthofinder2 <-
        function(proteome_folder,
                 use_existing_output = FALSE,
                 import_type = NULL,
                 comp_cores = 1,
                 of_path = NULL) {
                is_installed_orthofinder(path = of_path)
                
                if (!is.null(import_type)) {
                        if (!is.element(import_type,
                                        c("orthogroups_core", "orthogroups_pairwise")))
                                stop(
                                        "The specified 'import_type' is not supported by this function. Please consult the documentation to select a valid Orthofinder2 'import_type'.",
                                        call. = FALSE
                                )
                }
                
                ### add here a test that input sequences are amino acid sequences
                
                if (!file.exists(proteome_folder))
                        stop(
                                "Please provide a valid path to the proteome folder. Your specified path '",
                                proteome_folder,
                                "' seems not to exist.",
                                call. = FALSE
                        )
                
                files <- list.files(proteome_folder)
                
                if (any(stringr::str_detect(files, "documentation"))) {
                        message(
                                "Your species folder '",
                                proteome_folder,
                                " still contains the 'documentation' folder which will be moved from ",
                                proteome_folder,
                                " to ",
                                getwd(),
                                " to enable the Orthofinder2 search."
                        )
                        file.rename(
                                file.path(proteome_folder, "documentation"),
                                file.path(getwd(), "documentation")
                        )
                        unlink(
                                file.path(proteome_folder, "documentation"),
                                recursive = TRUE,
                                force = TRUE
                        )
                }
                
                if (!use_existing_output) {
                        # determine the number of cores on a multicore machineYes
                        cores <- parallel::detectCores()
                        
                        # in case one tries to use more cores than are available
                        if (comp_cores > cores)
                                stop("You chose more cores than are available on your machine.",
                                     call. = FALSE)
                        
                        message("Running OrthoFinder2 with ",
                                comp_cores,
                                " cores ...")
                        message(
                                "Do your input fasta files store unique sequences per gene locus, e.g. the longest splice variant? If not, then have a look at ?orthologr::retrieve_longest_isoforms()."
                        )
                        
                        # output is stored in OrthoFinder/Results_DATE , where DATE is in format: Nov08 for 08 th November
                        # or in this case OrthoFinder/Results_DATE_basename(proteome_folder)
                        
                        if (dirname(proteome_folder) == ".") {
                                if (is.null(of_path)) {
                                        system(
                                                paste0(
                                                        "/opt/miniconda3/bin/orthofinder -f ",
                                                        ws_wrap(
                                                                file.path(getwd(), proteome_folder)
                                                        ),
                                                        " -t ",
                                                        cores,
                                                        " -a ",
                                                        cores,
                                                        " -S diamond -n ",
                                                        basename(proteome_folder)
                                                )
                                        )
                                } else {
                                        message("\n")
                                        message(
                                                "A new path to the orthofinder executable was specified: ",
                                                ws_wrap(
                                                        file.path(of_path, "orthofinder")
                                                ),
                                                " ..."
                                        )
                                        system(
                                                paste0(
                                                        ws_wrap(
                                                                file.path(of_path, "orthofinder")
                                                        ),
                                                        " -f ",
                                                        ws_wrap(
                                                                file.path(getwd(), proteome_folder)
                                                        ),
                                                        " -t ",
                                                        cores,
                                                        " -a ",
                                                        cores,
                                                        " -S diamond -n ",
                                                        basename(proteome_folder)
                                                )
                                        )
                                }
                                
                        } else {
                                if (is.null(of_path)) {
                                        system(
                                                paste0(
                                                        "/opt/miniconda3/bin/orthofinder -f ",
                                                        ws_wrap(proteome_folder),
                                                        " -t ",
                                                        cores,
                                                        " -a ",
                                                        cores,
                                                        " -S diamond -n ",
                                                        basename(proteome_folder)
                                                )
                                        )
                                } else {
                                        message("\n")
                                        message(
                                                "A new path to the orthofinder executable was specified: ",
                                                ws_wrap(
                                                        file.path(of_path, "orthofinder")
                                                ),
                                                " ..."
                                        )
                                        system(
                                                paste0(
                                                        ws_wrap(
                                                                file.path(of_path, "orthofinder")
                                                        ),
                                                        " -f ",
                                                        ws_wrap(proteome_folder),
                                                        " -t ",
                                                        cores,
                                                        " -a ",
                                                        cores,
                                                        " -S diamond -n ",
                                                        basename(proteome_folder)
                                                )
                                        )
                                }
                                
                        }
                        message("\n")
                        message(
                                "Orthofinder2 finished successfully and stored all results in ",
                                ifelse(
                                        dirname(proteome_folder) == ".",
                                        ws_wrap(file.path(
                                                getwd(), proteome_folder
                                        )),
                                        ws_wrap(proteome_folder)
                                )
                        )
                        
                }
                orthofinder2_output_folder <-
                        file.path(proteome_folder,
                                  "OrthoFinder",
                                  paste0("Results_", basename(proteome_folder)))
                
                if (!is.null(import_type) &&
                    (import_type == "orthogroups_core")) {
                        res <-
                                orthofinder2_retrieve_core_orthologs(
                                        orthogroups_file = file.path(orthofinder2_output_folder, "") ,
                                        single_copy_file = file.path(orthofinder2_output_folder, "")
                                )
                        return(res)
                }
                
        }