R/utils.R

Defines functions valid_blast valid_annot valid_seq species_colors custom_palette iqtree_is_installed iqtree_version last_is_installed diamond_is_installed is_valid create_species_id_table check_gene_names check_ngenes check_list_names

Documented in create_species_id_table diamond_is_installed iqtree_is_installed iqtree_version last_is_installed

#' Check if the names of list of sequences and annotations match.
#' 
#' @param seq A list of AAStringSet objects.
#' @param annotation A GRangesList, CompressedGRangesList, or list of
#' GRanges with the annotation for the sequences in \strong{seq}.
#' 
#' @return TRUE if the objects pass the check.
#' @noRd
#' @examples 
#' data(annotation)
#' data(proteomes)
#' seq <- proteomes
#' check_list_names(seq, annotation)
check_list_names <- function(seq = NULL, annotation = NULL) {
    
    annot_names <- names(annotation)
    seq_names <- names(seq)
    
    # Check for differences in both sets
    diff_seq <- setdiff(seq_names, annot_names)
    diff_annot <- setdiff(annot_names, seq_names)
    
    check <- TRUE
    if(is.null(annot_names) | is.null(seq_names)) {
        stop("List-like arguments 'seq' and 'annotation' must have names.")
    } else if(length(diff_seq) != 0 & length(diff_annot) == 0) {
        stop(
            "The following elements in `seq` were not found in `annotation`:\n",
            paste0(diff_seq, collapse = "\n")
        )
    } else if(length(diff_seq) == 0 & length(diff_annot) != 0) {
        stop(
            "The following elements in `annotation` were not found in `seq:`\n",
            paste0(diff_annot, collapse = "\n")
        )
    } else if(length(diff_seq) != 0 & length(diff_annot) != 0) {
        stop(
            "Element in `seq` but not in `annotation`: \n",
            paste0(diff_seq, collapse = "\n"),
            "\n\nElements in `annotation` but not in `seq`: \n",
            paste0(diff_annot, collapse = "\n")
        )
    }
    
    return(check)
}


#' Check if the number of sequences is less than the number of genes
#'
#' @param seq A list of AAStringSet objects.
#' @param annotation A GRangesList or CompressedGRangesList object 
#' with the annotation for the sequences in \strong{seq}.
#'
#' @return TRUE if the objects pass the check.
#' @noRd 
#' @importFrom utils capture.output
#' @examples 
#' data(proteomes)
#' data(annotation)
#' seq <- proteomes
#' check_ngenes(seq, annotation)
check_ngenes <- function(seq = NULL, annotation = NULL) {
    
    # Data frame of species and gene count based on annotation
    gene_count <- Reduce(rbind, lapply(seq_along(annotation), function(x) {
        count <- length(annotation[[x]][annotation[[x]]$type == "gene"])
        count_df <- data.frame(species = names(annotation)[x], ngenes = count)
        return(count_df)
    }))
    
    # Data frame of species and gene count based on sequences
    seq_count <- Reduce(rbind, lapply(seq_along(seq), function(x) {
        count <- length(seq[[x]])
        count_df <- data.frame(species = names(annotation)[x], nseqs = count)
        return(count_df)
    }))
    
    # Check if number of sequences is <= gene count (accounting for ncRNAs)
    counts <- merge(gene_count, seq_count, by = "species")
    check_count <- counts[counts$nseqs > counts$ngenes, ]
    if(nrow(check_count) > 0) {
        msg <- paste0(
            "One or more species have more sequences in `seq` than ", 
            "there are genes in `annotation`.\n",
            "Did you remember to keep only one protein isoform per gene?\n",
            "Problematic species:\n"
        )
        out <- capture.output(print(check_count, row.names = FALSE))
        stop(paste(c(msg, out), collapse = "\n"))
    }
    
    return(TRUE)
}


#' Check if sequence names match gene names from annotation
#'
#' @param seq A list of AAStringSet objects.
#' @param annotation A GRangesList or CompressedGRangesList object 
#' with the annotation for the sequences in \strong{seq}.
#' @param gene_field Character, name of the column in the GRanges objects
#' that contains gene IDs. Default: "gene_id".
#'
#' @return TRUE if the objects pass the check.
#' @noRd 
#' @importFrom GenomicRanges mcols
#' @importFrom utils capture.output head
#' @examples
#' data(annotation)
#' data(proteomes)
#' seq <- proteomes
#' check_gene_names(seq, annotation)
check_gene_names <- function(
        seq = NULL, annotation = NULL, gene_field = "gene_id"
) {
    
    seq_names <- lapply(seq, names)
    gene_names <- lapply(annotation, function(x) {
        ranges_cols <- GenomicRanges::mcols(x[x$type == "gene"])
        if(!gene_field %in% names(ranges_cols)) {
            stop("Could not find column '", gene_field, "' in GRanges.")
        }
        nam <- unique(ranges_cols[[gene_field]])
        
        return(nam)
    })
    
    # Check if names in `seq` match gene names in `annotation`
    check_names <- lapply(seq_along(seq_names), function(x) {
        sp <- names(seq_names)[x]
        diff <- seq_names[[x]][!seq_names[[x]] %in% gene_names[[sp]]]
        return(diff)
    })
    names(check_names) <- names(seq_names)
    
    # If there at least one species with a mismatch, show species name + info
    n_mismatch <- lengths(check_names)
    if(any(n_mismatch > 0)) {
        m <- names(n_mismatch[n_mismatch > 0])
        
        firstn <- function(x, n = 2) {
            return(lapply(x, function(y) paste(head(y, n), collapse = ",")))
        }
        m_df <- data.frame(
            species = m,
            sample_seqs = unlist(firstn(check_names[m])),
            sample_genes = unlist(firstn(gene_names[m]))
        )
        out <- capture.output(print(m_df, row.names = FALSE))
        msg <- paste0(
            "Sequence names in `seq` do not match gene names ", 
            "in `annotation` for the following ", length(m), " species:\n"
        )
        stop(paste(c(msg, out), collapse = "\n"))
    }
    
    return(TRUE)
}


#' Create a data frame of species IDs (3-5-character abbreviations)
#' 
#' @param species_names A character vector of names extracted from 
#' the \strong{seq} or \strong{annotation} lists, which can be extracted with
#' \code{names(seq)} or \code{names(annotation)}.
#' 
#' @return A 2-column data frame with the following variables:
#' \describe{
#'   \item{species_id}{Character, species ID consisting of 3-5 characters.}
#'   \item{species_name}{Character, original names passed as input.}
#' }
#' @export
#' @importFrom stats ave
#' @rdname create_species_id_table
#' @examples
#' # Load 'seq' list (list of AAStringSet objects)
#' data(proteomes)
#' 
#' # Create ID table
#' create_species_id_table(names(proteomes))
create_species_id_table <- function(species_names) {
    
    # Replace space with 'X'
    list_names <- gsub(" ", "X", species_names)
    
    # Get the minimum number of characters possible
    for(n in seq(3, 6)) { 
        abbrev <- substr(list_names, 1, n)
        count <- table(abbrev)
        if (!any(count > 1)) break
    }
    
    # Create a data frame of species IDs and names
    abbrev_df <- data.frame(
        species_id = abbrev,
        species_name = species_names
    )
    
    # If there are duplicated names with 5 characters, append LETTER to the end
    if(n > 5) {
        abbrev <- substr(list_names, 1, 5)
        abbrev_df <- data.frame(
            species_name = species_names,
            species_id = abbrev,
            count = as.numeric(ave(
                as.character(abbrev), abbrev, FUN = seq_along
            ))
        )
        abbrev_df$species_id <- ifelse(
            abbrev_df$count > 1, 
            paste0(substr(abbrev_df$species_id, 1, 4), LETTERS[abbrev_df$count]),
            abbrev_df$species_id
        )
        abbrev_df <- abbrev_df[, c("species_id", "species_name")]
    }
    
    return(abbrev_df)
}

#' Wrapper to check if command is found in PATH
#'
#' @param cmd Command to test.
#' @param args Arguments for command.
#'
#' @author Fabricio Almeida-Silva
#' @return Logical indicating whether the command is in PATH or not.
#' @noRd
is_valid <- function(cmd = NULL, args = NULL) {
    found <- tryCatch(
        system2(cmd, args = args, stdout = FALSE, stderr = FALSE),
        error = function(e) return(FALSE),
        warning = function(w) return(FALSE)
    )
    if(!isFALSE(found)) {
        found <- TRUE
    }
    return(found)
}


#' Check if DIAMOND is installed
#'
#' @return Logical indicating whether DIAMOND is installed or not.
#' @export
#' @rdname diamond_is_installed
#' @examples
#' diamond_is_installed()
diamond_is_installed <- function() {
    valid <- is_valid(cmd = "diamond", args = "help")
    return(valid)
}


#' Check if last is installed
#'
#' @return Logical indicating whether last is installed or not.
#' @export
#' @rdname last_is_installed
#' @examples
#' last_is_installed()
last_is_installed <- function() {
    valid <- is_valid(cmd = "lastal", args = "-h")
    return(valid)
}


#' Get IQ-TREE version
#'
#' @return Numeric indicating IQ-TREE version, with either 1 or 2.
#' @export
#' @rdname iqtree_version
#' @examples
#' iqtree_version()
iqtree_version <- function() {
    
    is_v1 <- is_valid(cmd = "iqtree", args = "-h")
    is_v2 <- is_valid(cmd = "iqtree2", args = "-h")
    
    v <- NA
    if(is_v1) { 
        v <- 1 
    } else if(is_v2) {
        v <- 2
    }
    return(v)
}


#' Check if IQTREE is installed
#'
#' @return Logical indicating whether IQTREE is installed or not.
#' @export
#' @rdname iqtree_is_installed
#' @examples
#' iqtree_is_installed()
iqtree_is_installed <- function() {
    
    v <- iqtree_version()
    
    valid <- !is.na(v)
    return(valid)
}


#' Generate custom color palette
#'
#' @param pal Numeric specifying palette number, from 1 to 3.
#'
#' @return Character vector of custom color palette with 20 colors
#' @noRd
custom_palette <- function(pal = 1) {
    pal1 <- c("#1F77B4FF", "#FF7F0EFF", "#2CA02CFF", "#D62728FF",
              "#9467BDFF", "#8C564BFF", "#E377C2FF", "#7F7F7FFF",
              "#BCBD22FF", "#17BECFFF", "#AEC7E8FF", "#FFBB78FF",
              "#98DF8AFF", "#FF9896FF", "#C5B0D5FF", "#C49C94FF",
              "#F7B6D2FF", "#C7C7C7FF", "#DBDB8DFF", "#9EDAE5FF")
    
    pal2 <- c("#3182BDFF", "#E6550DFF", "#31A354FF", "#756BB1FF",
              "#636363FF", "#6BAED6FF", "#FD8D3CFF", "#74C476FF",
              "#9E9AC8FF", "#969696FF", "#9ECAE1FF", "#FDAE6BFF",
              "#A1D99BFF", "#BCBDDCFF", "#BDBDBDFF", "#C6DBEFFF",
              "#FDD0A2FF", "#C7E9C0FF", "#DADAEBFF", "#D9D9D9FF")
    
    pal3 <- c("#393B79FF", "#637939FF", "#8C6D31FF", "#843C39FF",
              "#7B4173FF", "#5254A3FF", "#8CA252FF", "#BD9E39FF",
              "#AD494AFF", "#A55194FF", "#6B6ECFFF", "#B5CF6BFF",
              "#E7BA52FF", "#D6616BFF", "#CE6DBDFF", "#9C9EDEFF",
              "#CEDB9CFF", "#E7CB94FF", "#E7969CFF", "#DE9ED6FF")
    
    l <- list(pal1, pal2, pal3)
    l_final <- l[[pal]]
    return(l_final)
}


#' Wrapper to handle gene color annotation in heatmap
#'
#' @param species_annotation A 2-column data frame with species IDs in 
#' the first column (same as column names of profile matrix), and species
#' annotation (e.g., higher-level taxonomic information) in the second column.
#'
#' @return A list containing a data frame of species annotation and a list with
#' a named character vector of colors for each annotation category.
#' @noRd
#' @examples 
#' species_order <- c(
#'     "vra", "van", "pvu", "gma", "cca", "tpr", "mtr", "adu", "lja",
#'     "Lang", "car", "pmu", "ppe", "pbr", "mdo", "roc", "fve",
#'     "Mnot", "Zjuj", "jcu", "mes", "rco", "lus", "ptr"
#' ) 
#' species_annotation <- data.frame(
#'    Species = species_order,
#'    Family = c(rep("Fabaceae", 11), rep("Rosaceae", 6),
#'               "Moraceae", "Ramnaceae", rep("Euphorbiaceae", 3), 
#'               "Linaceae", "Salicaceae")
#')
#' species_colors(species_annotation)
species_colors <- function(species_annotation) {
    
    # First column to rownames
    rownames(species_annotation) <- species_annotation[,1]
    species_annotation[,1] <- NULL
    
    a_col <- colnames(species_annotation)[1] # Name of annotation column
    n_categories <- length(unique(species_annotation[[a_col]]))
    
    # List of named character vector with annotation colors for each category
    annotation_colors <- list(
        custom_palette(2)[seq_len(n_categories)]
    )
    names(annotation_colors) <- a_col
    names(annotation_colors[[a_col]]) <- unique(species_annotation[[a_col]])
    
    # Save results in a list
    results <- list(
        species_annotation = species_annotation,
        annotation_colors = annotation_colors
    )
    return(results)
}


#' Wrapper to check if input sequences have the expected class
#'
#' @param seq A list of AAStringSet objects, each list element containing
#' protein sequences for a given species. This list must have names 
#' (not NULL), and names of each list element must match the names of
#' list elements in \strong{annotation}.
#' 
#' @return TRUE if the input is valid.
#' @noRd
#' @importFrom methods is
valid_seq <- function(seq) {
    is_valid <- FALSE
    
    if(is(seq, "list") & is(seq[[1]], "AAStringSet")) {
        is_valid <- TRUE
    }
    
    if(!is_valid) { 
        stop("Input sequences must be in a list of AAStringSet objects.") 
    }
    
    return(is_valid)
}


#' Wrapper to check if input annotation has the expected class
#'
#' @param annotation A GRangesList, CompressedGRangesList, or list of
#' GRanges with the annotation for the sequences in \strong{seq}. This list must
#' have names (not NULL), and names of each list element must match the names
#' of list elements in \strong{seq}.
#' 
#' @return TRUE if the input is valid.
#' @noRd
#' @importFrom methods is
valid_annot <- function(annot) {

    if(is(annot, "list") | is(annot, "GRangesList") | 
       is(annot, "CompressedGRangesList")) {
        is_valid <- TRUE
    } else {
        stop("Input annotation must be in a GRangesList, CompressedGRangesList,
             or list of GRanges.") 
    }
    
    return(is_valid)
}


#' Wrapper to check if input BLAST list has the expected class
#'
#' @param blast_list A list of data frames, each data frame having 
#' the tabular output of BLASTp or similar programs, such as DIAMOND.
#' 
#' @return TRUE if the input is valid.
#' @noRd
#' @importFrom methods is
valid_blast <- function(blast_list) {

    if(is(blast_list, "list") & is(blast_list[[1]], "data.frame")) {
        is_valid <- TRUE
    } else {
        stop("BLAST list must be a list of data frames.")
    }
    
    return(is_valid)
}
almeidasilvaf/syntenet documentation built on March 23, 2024, 8:27 a.m.