#' Test graph for sparseness
#'
#' @param G igraph object representing the graph to test.
#' @param threshold numeric, between 0 and 1 indicating the proportion of which
#' to consider much greater than. For example, if 10 is much greater than 1,
#' then the threshold is considered to be 0.1. Default threshold is 0.1.
#'
#' @author Christopher Nobles, Ph.D.
#' @export
sparseness <- function(G, threshold = 0.1){
stopifnot(require("igraph"))
V <- V(G)$name
el <- as.data.frame(get.edgelist(G))
names(el) <- c("vi", "vj")
el$ki <- igraph::degree(G, v = el$vi)
el$kj <- igraph::degree(G, v = el$vj)
el$kikj <- el$ki * el$kj
# Quantify sparseness
M <- ecount(G)
maxM <- ifelse(
igraph::is_directed(G),
length(V) * (length(V) - 1 ),
length(V) * (length(V) - 1 ) / 2)
denG <- igraph::graph.density(G)
kimax <- max(c(el$ki, el$kj))
kikjmax <- max(el$kikj)
pct_above_ki <- 100 * (
sum(as.integer(igraph::degree(G, V) >= threshold * length(V))) ) / length(V)
pct_above_kikj <- 100 * (
sum(as.integer(el$kikj >= threshold * M)) ) / M
# Ouput Results
return(list(
"N" = length(V),
"M" = M,
"maxM" = maxM,
"Density" = denG,
"kiMax" = kimax,
"pct_above_ki_threshold" = pct_above_ki,
"kikjMax" = kikjmax,
"pct_above_kikj_threshold" = pct_above_kikj))
}
#' Match gene names within data to vertex identifiers with a reference data set.
#'
#' @param data data.frame with column specified by the `names` parameter.
#' @param names character specifying the column name in data containing the
#' `names` of to be mapped from.
#' @param refdata data.frame with reference data. Needs to contain the columns
#' referred to by `refcol` and `refout`.
#' @param refcol character the name of the column with matching information for
#' the `names` parameter.
#' @param refout character the name of the column for in the reference with the
#' information to be mapped to data.
#' @param outname character the name given to the new information mapped to the
#' `data` data.frame.
#' @param removeUnmapped logical indicating if rows should be removed if
#' unmapped.
#' @param quiet logical indicating if messages should be printed or not.
#'
#' @author Christopher Nobles, Ph.D.
#' @export
map_names <- function(data, names = "gene_name",
refdata, refcol = "preferred_name",
refout = "protein_external_id",
outname = "stringID", removeUnmapped = TRUE,
quiet = FALSE){
# Check input data sets for data.frame structure
stopifnot(all(sapply(list(data, refdata), class) == "data.frame"))
# Identify which columns to use
dataColNum <- match(names, names(data))
refColNum <- match(refcol, names(refdata))
refOutNum <- match(refout, names(refdata))
# Match names from data to reference
matched_order <- match(data[,dataColNum], refdata[,refColNum])
# Annotate data with output identifier
data[,outname] <- refdata[,refout][matched_order]
# Message unmapped identifiers
if(removeUnmapped & !quiet){
unmapped <- sum(as.integer(is.na(matched_order)))
unmappedPct <- round(100 * unmapped / nrow(data), digits = 2)
message(paste0(
"Unmapped: ", unmappedPct, "% (", unmapped, " out of ", nrow(data), ")"))
}
# Remove unmapped rows if requested
if(removeUnmapped){
data <- data[!is.na(data[,outname]),]
}
return(data)
}
#' Assign consistant ID to variable given various alias IDs from a reference.
#'
#' @usage
#' alias_arbiter(IDs, RefIDs, aliasIDs)
#'
#' alias_arbiter(
#' IDs, RefIDs, aliasIDs, outputIDs, sep = NULL,
#' remove_absent_IDs = FALSE, quiet = FALSE)
#'
#' @description Assign a reference ID to a vector of IDs that may have alias
#' values. An example, if ID values of "A", "B", "C" were given to three
#' bacterial species, but "B" was later reclassified as "C", then "B" becomes
#' a depreciated alias of "C". This function takes this information and
#' changes the original vector of "A", "B", and "C", to "A", "C", and "C".
#' Aliases can be supplied as a list of character or numeric vectors or as a
#' character vector to split on the `sep` value. IDs need to match RefIDs, but
#' outputIDs can be another ID type, as long as they are indexed in the same
#' order as RefIDs. Any ambiguous aliases will be removed, but will be messaged
#' to the output. Any IDs not matching RefIDs or aliasIDs will be returned as
#' "NA", unless "NULL" is provided to the option `remove_absent_IDs`. For this
#' option, the default value is `FALSE` which will return `NA` for unmatched IDs
#' and `TRUE` will remove all `NA` values. Caution, this produces an output that
#' is not the same length as the input if there are any unmatched or ambiguous
#' IDs. The other option is `NULL` to be supplied, which fills the `NA` values
#' with their original input, even if the alias was ambiguous.
#'
#' This function creates a directed graph where all reference IDs are source
#' nodes and all aliases are children from that source node. Alias IDs are
#' screened for reference IDs and ambiguous IDs first, therefore ensuring the
#' graph is a collection of small subgraphs with single sources and multiple
#' sinks.
#'
#' @param IDs character vector of IDs which need to be consistantly assigned.
#' Must be same type as RefIDs.
#' @param RefIDs character vector of IDs for reference, must be the same length
#' and order as aliasIDs and, if included, outputIDs.
#' @param aliasIDs list of character vectors giving aliases for given RefIDs, or
#' character vector to be split for aliases (must specify the `sep` argument
#' with delimiter).
#' @param outputIDs character or numeric vector for output. Indexed in the same
#' order as the RefIDs and aliasIDs. If not given, RefIDs will be used as
#' the output IDs
#' @param sep character delimiter for aliasIDs if supplied in a character
#' vector. Default is `NULL`.
#' @param remove_absent_IDs logical. Default behavior will change IDs not found
#' in RefIDs or aliases to `NA`. If set to TRUE, IDs not matching RefIDs
#' or aliasIDs will be removed. Alternatively, if set to NULL, IDs not matching
#' RefIDs or aliasIDs will remain as input ID values.
#' @param quiet logical. If TRUE, message output will be silenced. Messages
#' include notification of ambiguous aliases or aliases which belong to two or
#' more reference IDs.
#'
#' @example
#' # Using letters as IDs
#' IDs <- LETTERS[1:10]
#' RefIDs <- c("A", "C", "E", "J")
#' aliasIDs <- c("A,B", "C,D", "E,F,G,H,I", "J")
#' alias_arbiter(IDs, RefIDs, aliasIDs, sep = ",")
#'
#' @author Christopher Nobles, Ph.D.
#' @export
alias_arbiter <- function(IDs, RefIDs, aliasIDs, outputIDs = NULL,
sep = NULL, remove_absent_IDs = FALSE, quiet = FALSE){
# Require packages
packs <- c("igraph")
stopifnot(suppressMessages(sapply(packs, require, character.only = TRUE)))
# Check inputs and assign defaults
stopifnot(any(IDs %in% RefIDs))
stopifnot(length(RefIDs) == length(aliasIDs))
if(!is.null(outputIDs)){
stopifnot(length(outputIDs) == length(RefIDs))
}else{
outputIDs <- RefIDs
}
if(class(aliasIDs) != "list"){
stopifnot(any(sapply(aliasIDs, function(x) grepl(sep, x))))
aliasIDs <- strsplit(aliasIDs, sep, fixed = TRUE)
}
# Curate aliases to remove source
aliasID_df <- data.frame(
ref = rep(RefIDs, sapply(aliasIDs, length)),
alias = unlist(aliasIDs),
stringsAsFactors = FALSE)
aliasID_df <- aliasID_df[aliasID_df$ref != aliasID_df$alias,]
aliasID_df <- aliasID_df[!aliasID_df$alias %in% RefIDs,]
aliasID_df <- aliasID_df[aliasID_df$alias != "",]
aliasID_df <- aggregate(alias ~ ref, data = aliasID_df, function(x){
paste(x, collapse = sep)})
aliasIDs <- strsplit(aliasID_df$alias, sep, fixed = TRUE)
names(aliasIDs) <- aliasID_df$ref
# Check for ambiguous aliases
ambi_alias <- table(unlist(aliasIDs))
ambi_alias <- names(ambi_alias[ambi_alias > 1])
if(length(ambi_alias) > 0 & !quiet){
message(
"Ambiguous aliases removed : ", paste(ambi_alias, collapse = ", "), ".")
}
# Remove ambiguous aliases from aliasIDs
aliasID_df2 <- data.frame(
ref = rep(names(aliasIDs), sapply(aliasIDs, length)),
alias = unlist(aliasIDs, use.names = FALSE),
stringsAsFactors = FALSE)
aliasID_df2 <- aliasID_df2[!aliasID_df2$alias %in% ambi_alias,]
aliasIDs <- split(aliasID_df2$alias, aliasID_df2$ref)
# Create graph of aliases and reference IDs
E <- data.frame(
vi = rep(names(aliasIDs), sapply(aliasIDs, length)),
vj = unlist(aliasIDs, use.names = FALSE),
stringsAsFactors = FALSE)
V <- data.frame(v = unique(c(RefIDs, unlist(aliasIDs))))
G <- construct_graph(E, V, mode = "directed")
# Assign IDs to clusters (maybe this should be sources instead of clusters)
mem <- igraph::clusters(G)$membership
IDmem <- mem[as.character(IDs)]
Refmem <- mem[as.character(RefIDs)]
# Assign output IDs to input IDs and return
outIDs <- outputIDs[match(IDmem, Refmem)]
if(is.null(remove_absent_IDs)){
outIDs <- ifelse(is.na(outIDs), IDs, outIDs)
}else if(remove_absent_IDs == TRUE){
outIDs <- outIDs[!is.na(outIDs)]
}
return(outIDs)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.