#' Prior to importing a KINC network, this function checks its Size.
#'
#' For large networks it may be useful to preserve system memory
#' by reading in the network in subsets. Prior to calling the
#' loadKINCNetwork function this function can be used to
#' determine the size of the network in terms of number of
#' edges. Once the size is know the nrows and skip arguments
#' of the loadKINCNetwork function can be used to load
#' subsets of the network at a time.
#'
#' @param network_file
#' The path to the network file on the file system.
#'
#' @return
#' An integer indicating the number of edges in the
#' network.
#'
#' @export
getKINCNetworkSize = function (network_file) {
num_lines = countLines(network_file)
# Subtract 1 from the total to account for the header.
return (num_lines[1] - 1)
}
#' Imports a network file produced by KINC into a data frame.
#'
#' @param network_file
#' The path to the network file on the file system.
#' @param nrows
#' For large network files it may be useful to preserve system
#' memory be reading in subsets of large network files. This
#' argument specifies the number of rows of the network that
#' should be read. Use this in combination with the skip
#' argument to cycle through the entire network file
#' @param skip
#' For large network files it may be useful to preserve system
#' memory be reading in subsets of large network files. This
#' argument specifies the number of rows to skip before reading.
#' Use this in combination with the nrows argument to cycle
#' through the entire network file
#'
#' @return
#' A dataframe containing the network file contents
#'
#' @export
loadKINCNetwork = function(network_file, nrows=-1, skip=0) {
# First get the headers so we can tell if this is full or tidy network file.
type = 'text'
headers = read.table(file=network_file, header = TRUE, sep="\t", nrows=1, skip=0, stringsAsFactors = FALSE)
if ('Test_Name' %in% colnames(headers)) {
type = 'tidy'
}
# If we are skipping rows then we need to tell the loader not to try to
# retrieve a header.
header = TRUE
if (skip > 0) {
header = FALSE
}
if (type == 'text') {
# Get the number of columns in the file and set the column classes.
# The first 7 columns are set by KINC.
ncols = max(count.fields(network_file, sep = "\t"))
colClasses = c(c(
"character", "character", "numeric", "character", "numeric",
"numeric", "character"
), rep("numeric", ncols-7))
net = read.table(file=network_file, header = header, sep="\t", colClasses=colClasses, nrows=nrows, skip=skip)
}
if (type == 'tidy') {
colClasses = recode(colnames(headers), "Source" = "character", "Target" = "character",
"Similarity_Score" = "numeric", "Interaction" = "character",
"Cluster_Index" = "numeric", "Cluster_Size" = "numeric",
"Samples" = "character", "Test_Name" = "character" ,
"p_value" = "numeric", "r_squared" = "numeric", "rank" = "numeric",
"WAnova_Max" = "numeric", "WAnova_Min" = "numeric", "MissingTtest" = "numeric")
net = read.table(file=network_file, header = header, sep="\t", colClasses=colClasses, na.strings="nan", nrows=nrows, skip=skip)
}
colnames(net) = colnames(headers)
return (net)
}
#' Imports a network file produced by KINC into a data frame.
#'
#' @param GEM_file
#' The path to the GEM file on the file system. This file should
#' contain gene expression (or abundance data) where the rows
#' are genes (or compounds) and the columns are samples. The first
#' column should countain the gene name. The first row contain a
#' a header with only the sample names. Thus the header row has
#' one less values then every other row.
#'
#' @return
#' A dataframe containing the GEM.
#'
#' @export
loadGEM = function(GEM_file) {
emx = read.table(GEM_file, header=TRUE, sep="\t")
return(emx)
}
#' Exports a network data frame as a KINC compatible file.
#'
#' @param net
#' A network data frame int tidy format containing the KINC-produced network.
#' The loadKINCNetwork function can be used to import the network.
#' @param network_file
#' The path to which the network file will be saved.
#' @param append
#' For large network files it may be useful to preserve system
#' memory be reading in subsets of large network files, processing
#' and then saving the processed subset before moving on to the
#' next. Set this to TRUE to append to an existing network file.
#' For the header to appear correctly, be sure to set append
#' to FALSE when saving the first subset.
#'
#' @export
saveKINCNetwork = function(net, network_file, append=FALSE) {
col.names = TRUE
if (append == TRUE) {
col.names = FALSE
}
num_cols = dim(net)[2]
if (num_cols > 12) {
for (i in 13:num_cols) {
if (is.numeric(net[,i])) {
net[,i] = format(net[,i], digits=4, nsmall=1)
}
}
}
write.table(format(net, digits=4), file = network_file,
sep = "\t", na="", quote=FALSE,
row.names=FALSE, col.names=col.names, append=append)
}
#' Exports a condition-specific network as a Tidy KINC compatible file.
#'
#' This function will save as many condition-specific network files as directed.
#'
#' @param net
#' A network data frame int tidy format containing the KINC-produced network.
#' The loadKINCNetwork function can be used to import the network.
#' @param network_file
#' The path to which the network file will be saved. The token %condition%
#' should be present in the filename. The condition name will be substituted
#' in place of this token resulting in unique files for each condition. The
#' token %filter% can also be provided. If present the value provided to
#' the filter argument will be substituted.
#' @param conditions
#' An array of condition names. These must be present in the Test_Name
#' column of the network data frame. If no conditions are provided then
#' all of the conditions in the Test_Name column will be used.
#' @param filter
#' The type of condition-specific network to create: 'label', 'class' or 'full'.
#' If 'label' is provided only edges that have no other signficant tests other
#' than the one specifid are included. If 'class' is provided only edges in one
#' type of class will be included. For example, if the Test_Name value is
#' 'Subspecies__Japonica' the class is 'Subspecies'. If no value is provided
#' then the filter defaults to 'full'.
#'
#' @export
saveConditionKINCNetwork = function(net, network_file, conditions=c(), filter='label') {
if (!'Test_Name' %in% colnames(net)) {
message('ERROR: Please provide a network dataframe in Tidy format to save condition-specific networks.')
return(NA)
}
if (length(conditions) == 0) {
conditions = unique(net$Test_Name)
}
if (filter == 'class') {
message(paste0('Filtering for edges with unique conditional classes...'))
net$Class = str_replace(net$Test_Name, '__.*$','')
edge_grp = net %>% group_by(Source,Target)
net = edge_grp %>% filter(length(unique(Class)) == 1) %>% as.data.frame()
for (class in unique(net$Class)) {
message(paste0('Writing the condition-specific network for the class: ', class, '...'))
csGCN = net[which(net$Class == class),]
if (dim(csGCN)[1] == 0) {
message(paste0('The class ', class, ' is not present in the network dataframe. Skipping.'))
next
}
csGCN_file = str_replace(network_file, '%condition%', class)
csGCN_file = str_replace(csGCN_file, '%filter%', 'unique_class')
saveKINCNetwork(csGCN, csGCN_file)
}
} else if (filter == 'label') {
message(paste0('Filtering for edges with unique conditional labels...'))
edge_grp = net %>% group_by(Source,Target)
net = edge_grp %>% filter(n() == 1) %>% as.data.frame()
for (condition in conditions) {
message(paste0('Writing the condition-specific network for the condition: ', condition, '...'))
csGCN = net[which(net$Test_Name == condition),]
if (dim(csGCN)[1] == 0) {
message(paste0('The condition ', condition, ' is not present in the network dataframe. Skipping.'))
next
}
csGCN_file = str_replace(network_file, '%condition%', condition)
csGCN_file = str_replace(csGCN_file, '%filter%', 'unique_label')
saveKINCNetwork(csGCN, csGCN_file)
}
} else {
for (condition in conditions) {
message(paste0('Writing the condition-specific network for the condition: ', condition, '...'))
csGCN = net[which(net$Test_Name == condition),]
if (dim(csGCN)[1] == 0) {
message(paste0('The condition ', condition, ' is not present in the network dataframe. Skipping.'))
next
}
csGCN_file = str_replace(network_file, '%condition%', condition)
csGCN_file = str_replace(csGCN_file, '%filter%', 'all')
saveKINCNetwork(csGCN, csGCN_file)
}
}
}
#' Converts the KINC network dataframe to an iGraph object.
#'
#' @param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#' @param add.attrs
#' Set to TRUE to include all the attributes
#' @return
#' An iGraph object.
#'
#' @export
KINCtoIgraph = function(net, add.attrs = FALSE) {
g = graph.edgelist(as.matrix(net[, c('Source', 'Target')]), directed = FALSE)
if (add.attrs == TRUE) {
attrs = colnames(net)
for (attr in attrs) {
if (attr == 'Source' | attr == 'Target') {
next
}
edge_attr(g, attr) = net[, attr]
}
}
return(g)
}
#' Creates all PNG figures describing p-value and r-squared distributions of the network.
#'
#' The network must be in tidy format and must include condition-specfic p-values.
#'
#' @param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#' @param out_prefix
#' The prefix of the output images files.
#'
#' @return dataframe
#' A network dataframe with non-significant edges removed.
#'
#' @export
saveKINCplots = function(net, out_prefix = "KINC_network") {
if (!'Test_Name' %in% colnames(net)) {
message('ERROR: Please provide a network dataframe in Tidy format to generate plots.')
return(NA)
}
has_r2 = TRUE
if(sum(is.na(net$r_squared)) == dim(net)[1]) {
has_r2 = FALSE
}
plotPvalPerCondition(net, out_prefix)
plotPvalPerScore(net, out_prefix)
plotPvalPerScoreAndCondition(net, out_prefix)
plotPvalHistPerCondition(net, out_prefix)
if (has_r2) {
plotRsqrPerScoreAndCondition(net, out_prefix)
plotRsqrPerScore(net, out_prefix)
plotRsqrHistPerCondition(net, out_prefix)
plotRsqrPerCondition(net, out_prefix)
}
plotEdgeCountPerCondition(net, out_prefix)
}
#'Plots the distribution of p-values per condition.
#'
#'@param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#'@param out_prefix
#' The prefix of the output images files.
#'
#'@export
plotPvalPerCondition = function(net, out_prefix=NA) {
if (!'Test_Name' %in% colnames(net)) {
message('ERROR: Please provide a network dataframe in Tidy format to generate plots.')
return(NA)
}
# Explore the distribution of p-values & rSqr for each trait
if (!is.na(out_prefix)) {
print(paste0("Saving: ", out_prefix, '.PvalPerCondition.png'))
png(paste0(out_prefix, '.PvalPerCondition.png'), width=6 ,height=6, units="in", res=300)
}
plot = ggplot(net, aes(x=Test_Name, y=p_value, fill=Test_Name)) +
geom_boxplot() +
scale_y_log10(breaks=c(1e-3, 1e-5, 1e-10, 1e-15, 1e-20)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "none") +
xlab("") + ylab(bquote(-log^10~(p)))
print(plot)
if (!is.na(out_prefix)) {
dev.off()
}
}
#'Plots the distribution of p-values per condition.
#'
#'@param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#'@param out_prefix
#' The prefix of the output images files.
#'
#'@export
plotRsqrPerCondition = function(net, out_prefix=NA) {
if (!'Test_Name' %in% colnames(net)) {
message('ERROR: Please provide a network dataframe in Tidy format to generate plots.')
return(NA)
}
net$r_squared = as.numeric(net$r_squared)
if (!is.na(out_prefix)) {
print(paste0("Saving: ", out_prefix, '.RsqrPerCondition.png'))
png(paste0(out_prefix, '.RsqrPerCondition.png'), width=6 ,height=6, units="in", res=300)
}
plot = ggplot(net, aes(x=Test_Name, y=r_squared, fill=Test_Name)) +
geom_boxplot() +
scale_y_continuous(breaks=seq(-100,100, by=10)/100) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "none") +
xlab("") + ylab(bquote(R^2))
print(plot)
if (!is.na(out_prefix)) {
dev.off()
}
}
#'Plots the distribution of p-values per similarity score
#'
#'@param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#'@param out_prefix
#' The prefix of the output images files.
#'
#'@export
plotPvalPerScore = function(net, out_prefix=NA) {
if (!'Test_Name' %in% colnames(net)) {
message('ERROR: Please provide a network dataframe in Tidy format to generate plots.')
return(NA)
}
# Round off the similarity sccores
mround = function(x, base) {
base * floor(x/base)
}
net$Similarity_Score = as.factor(mround(net$Similarity_Score, 0.05))
# Explore the distribution of p-values per correlation range.
if (!is.na(out_prefix)) {
print(paste0("Saving: ", out_prefix, '.PvalPerScore.png'))
png(paste0(out_prefix, '.PvalPerScore.png'), width=6 ,height=6, units="in", res=300)
}
plot = ggplot(net, aes(x=Similarity_Score, y=p_value, fill=Similarity_Score)) +
geom_boxplot() +
scale_y_log10(breaks=c(1e-3, 1e-5, 1e-10, 1e-15, 1e-20)) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "none") +
xlab("Similarity Score") + ylab(bquote(-log^10~(p)))
print(plot)
if (!is.na(out_prefix)) {
dev.off()
}
}
#'Plots the distribution of R-squared values per similarity score
#'
#'@param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#'@param out_prefix
#' The prefix of the output images files.
#'
#'@export
plotRsqrPerScore = function(net, out_prefix=NA) {
if (!'Test_Name' %in% colnames(net)) {
message('ERROR: Please provide a network dataframe in Tidy format to generate plots.')
return(NA)
}
# Round off the similarity sccores
mround = function(x, base) {
base * floor(x/base)
}
net$Similarity_Score = as.factor(mround(net$Similarity_Score, 0.05))
net$r_squared = as.numeric(net$r_squared)
if (!is.na(out_prefix)) {
print(paste0("Saving: ", out_prefix, '.RsqrPerScore.png'))
png(paste0(out_prefix, '.RsqrPerScore.png'), width=6 ,height=6, units="in", res=300)
}
plot = ggplot(net, aes(x=Similarity_Score, y=r_squared, fill=Similarity_Score)) +
geom_boxplot() +
scale_y_continuous(breaks=seq(-100,100, by=10)/100) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "none") +
xlab("Similarity Score") + ylab(bquote(R^2))
print(plot)
if (!is.na(out_prefix)) {
dev.off()
}
}
#'Plots boxplots of the p-values per similarity score and condition.
#'
#'@param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#'@param out_prefix
#' The prefix of the output images files.
#'
#'@export
plotPvalPerScoreAndCondition = function(net, out_prefix=NA) {
if (!'Test_Name' %in% colnames(net)) {
message('ERROR: Please provide a network dataframe in Tidy format to generate plots.')
return(NA)
}
# Round off the similarity sccores
mround = function(x, base) {
base * floor(x/base)
}
net$Similarity_Score = as.factor(mround(net$Similarity_Score, 0.05))
# Explore the distribution of p-values per correlation range and trait.
if (!is.na(out_prefix)) {
print(paste0("Saving: ", out_prefix, '.PvalPerScoreAndCondition.png'))
png(paste0(out_prefix, '.PvalPerScoreAndCondition.png'), width=6 ,height=6, units="in", res=300)
}
plot = ggplot(net, aes(x=Similarity_Score, y=p_value, fill=Test_Name)) +
geom_boxplot() +
facet_wrap(~Test_Name) +
scale_y_log10(breaks=c(1e-3, 1e-5, 1e-10, 1e-15, 1e-20)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme(legend.position = "none") +
xlab("Similarity Score") + ylab(bquote(-log^10~(p)))
print(plot)
if (!is.na(out_prefix)) {
dev.off()
}
}
#'Plots boxplots of the R-squared values per similarity score and condition.
#'
#'@param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#'@param out_prefix
#' The prefix of the output images files.
#'
#'@export
plotRsqrPerScoreAndCondition = function(net, out_prefix=NA) {
if (!'Test_Name' %in% colnames(net)) {
message('ERROR: Please provide a network dataframe in Tidy format to generate plots.')
return(NA)
}
# Round off the similarity sccores
mround = function(x, base) {
base * floor(x/base)
}
net$Similarity_Score = as.factor(mround(net$Similarity_Score, 0.05))
net$r_squared = as.numeric(net$r_squared)
if (!is.na(out_prefix)) {
print(paste0("Saving: ", out_prefix, '.RsqrPerScoreAndCondition.png'))
png(paste0(out_prefix, '.RsqrPerScoreAndCondition.png'), width=6 ,height=6, units="in", res=300)
}
plot = ggplot(net, aes(x=Similarity_Score, y=r_squared, fill=Test_Name)) +
geom_boxplot() +
facet_wrap(~Test_Name) +
scale_y_continuous(breaks=seq(-100,100, by=10)/100) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
theme(legend.position = "none") +
xlab("Similarity Score") + ylab(bquote(R^2))
print(plot)
if (!is.na(out_prefix)) {
dev.off()
}
}
#'Plots a histogram of the p-values per condition
#'
#'@param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#'@param out_prefix
#' The prefix of the output images files.
#'
#'@export
plotPvalHistPerCondition = function(net, out_prefix=NA) {
if (!'Test_Name' %in% colnames(net)) {
message('ERROR: Please provide a network dataframe in Tidy format to generate plots.')
return(NA)
}
# How many rows does each trait contribute
if (!is.na(out_prefix)) {
print(paste0("Saving: ", out_prefix, '.PvalHistPerCondition.png'))
png(paste0(out_prefix, '.PvalHistPerCondition.png'), width=6 ,height=6, units="in", res=300)
}
plot = ggplot(net, aes(x=p_value, color=Test_Name)) +
geom_histogram(binwidth=0.5) +
facet_wrap(~Test_Name) +
scale_x_log10(breaks=c(1e-3, 1e-5, 1e-10, 1e-15, 1e-20)) +
scale_y_log10() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
theme(legend.position = "none") +
ylab("Count") + xlab(bquote(-log^10~(p)))
print(plot)
if (!is.na(out_prefix)) {
dev.off()
}
}
#'Plots a histogram of the R-squared values per condition
#'
#'@param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#'@param out_prefix
#' The prefix of the output images files.
#'
#'@export
plotRsqrHistPerCondition = function(net, out_prefix=NA) {
if (!'Test_Name' %in% colnames(net)) {
message('ERROR: Please provide a network dataframe in Tidy format to generate plots.')
return(NA)
}
# Round off the similarity sccores
mround = function(x, base) {
base * floor(x/base)
}
net$r_squared = as.numeric(net$r_squared)
if (!is.na(out_prefix)) {
print(paste0("Saving: ", out_prefix, '.RsqrHistPerCondition.png'))
png(paste0(out_prefix, '.RsqrHistPerCondition.png'), width=6 ,height=6, units="in", res=300)
}
plot = ggplot(net, aes(x=r_squared, color=Test_Name)) +
geom_histogram(binwidth=0.01) +
facet_wrap(~Test_Name) +
scale_y_continuous(breaks=seq(-100,100, by=10)/100) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
theme(legend.position = "none") +
ylab("Count") + xlab(bquote(R^2))
print(plot)
if (!is.na(out_prefix)) {
dev.off()
}
}
#'Plots a bar graph of the number of edges per condition.
#'
#'@param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#'@param out_prefix
#' The prefix of the output images files.
#'
#'@export
plotEdgeCountPerCondition = function(net, out_prefix=NA) {
if (!'Test_Name' %in% colnames(net)) {
message('ERROR: Please provide a network dataframe in Tidy format to generate plots.')
return(NA)
}
# Plot the edge count per category
if (!is.na(out_prefix)) {
print(paste0("Saving: ", out_prefix, '.EdgeCountPerCondition.png'))
png(paste0(out_prefix, '.EdgeCountPerCondition.png'), width=6 ,height=6, units="in", res=300)
}
plot = ggplot(net, aes(x=Test_Name, fill=Test_Name)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
theme(legend.position = "none") +
ylab("Number of Edges") + xlab("")
print(plot)
if (!is.na(out_prefix)) {
dev.off()
}
}
#' Ranks the edges of a network and returns those ranks.
#'
#' Final ranks are determined by ordering p-values, R-squared and
#' similarity scores for each edge and assigning a rank to each edge
#' corresponding to the order of each value. For example, edges with the
#' lowest p-value will receive a rank of 1. The final rank is a weighted
#' sum of each of the three categories.
#'
#' @param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#' @param by_condition
#' If TRUE then ranks are calculated independently for each condition
#' (i.e. value in the Test_Name column). If FALSE then ranks are
#' calculated for the entire network.
#' @param sscore_weight
#' The weight to apply to the Similarity Score in rankings.
#' @param pval_weight
#' The weight to apply to the p-value in rankings.
#' @param rsqr_weight
#' The weight to apply to the r-squared value in rankings.
#' @return
#' An array of ranks. The order of the items in the ranks array corresponds
#' to the order of edges in the provided network dataframe.
#'
#' @export
getEdgeRanks = function(net, by_condition = TRUE, sscore_weight = 1, pval_weight = 1,
rsqr_weight = 1, anova_weight=1) {
if (!'Test_Name' %in% colnames(net)) {
message('ERROR: Please provide a network dataframe in Tidy format to generate plots.')
return(NA)
}
if (by_condition == FALSE) {
net = getRanks(net)
valuations = (net$score_rank * sscore_weight) +
(net$pval_rank * pval_weight) +
(net$rsqr_rank * rsqr_weight) +
(net$anova_rank * anova_weight)
net$valuation = valuations
# Now order the valuations
unique_val = unique(valuations)
ordered_val = unique_val[order(unique_val)]
final_ranks = data.frame(valuation = ordered_val, rank = seq(1:length(ordered_val)))
net = left_join(net, final_ranks, by=c('valuation' = 'valuation'))
} else {
for (condition in unique(net$Test_Name)) {
csGCN = net[which(net$Test_Name == condition),]
if ('rank' %in% colnames(csGCN)) {
csGCN = csGCN[, -which(colnames(csGCN) %in% c("rank"))]
}
csGCN = getRanks(csGCN)
# calculate the valuation for each edge.
valuations = (csGCN$score_rank * sscore_weight) +
(csGCN$pval_rank * pval_weight) +
(csGCN$rsqr_rank * rsqr_weight) +
(csGCN$anova_rank * anova_weight)
csGCN$valuation = valuations
# Now order the valuations
unique_val = unique(valuations)
ordered_val = unique_val[order(unique_val)]
final_ranks = data.frame(valuation = ordered_val, rank = seq(1:length(ordered_val)))
csGCN = left_join(csGCN, final_ranks, by=c('valuation' = 'valuation'))
net[which(net$Test_Name == condition), 'rank'] = csGCN$rank
}
}
return(as.numeric(net$rank))
}
#' A helper function for the getEdgeRanks
#'
#' @param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#' @return
#' A new network containing the p-value, r-squared and similarity score
#' rankings.
getRanks = function(net) {
net$ABS_Similarity_Score = abs(as.numeric(net$Similarity_Score))
unique_scores = unique(net$ABS_Similarity_Score)
ordered_scores = unique_scores[order(unique_scores, decreasing=TRUE)]
score_ranks = data.frame(score = ordered_scores, score_rank = seq(1:length(ordered_scores)))
unique_pvals = unique(net$p_value)
ordered_pvals = unique_pvals[order(unique_pvals)]
pval_ranks = data.frame(p_value = ordered_pvals, pval_rank = seq(1:length(ordered_pvals)))
unique_rsqr = unique(net$r_squared)
ordered_rsqr = unique_rsqr[order(unique_rsqr, decreasing=TRUE)]
rsqr_ranks = data.frame(r_squared = ordered_rsqr, rsqr_rank = seq(1:length(ordered_rsqr)))
unique_anova = unique(net$WAnova_Max)
ordered_anova = unique_anova[order(unique_anova)]
anova_ranks = data.frame(anova = ordered_anova, anova_rank = seq(1:length(ordered_anova)))
net = left_join(net, score_ranks, by=c('ABS_Similarity_Score' = 'score'))
net = left_join(net, pval_ranks, by=c('p_value' = 'p_value'))
net = left_join(net, rsqr_ranks, by=c('r_squared' = 'r_squared'))
net = left_join(net, anova_ranks, by=c('WAnova_Max' = 'anova'))
return (net)
}
#' Imports sample annotations.
#'
#' The format of the sample annotation file is tab delimited where the
#' first column is the sample name and every other column contains
#' the annotation. The file must contain a header with the annotation
#' types and the first column of the header should be the word 'Sample'
#'
#' @param annotation_file
#' The path to the file containing the annotations.
#' @param sample_header
#' The name of the column containing the sample names.
#'
#' @return
#' A data frame containing the annotations in the order of the samples.
#'
#' @export
loadSampleAnnotations = function (annotation_file, sample_header="Sample") {
# Read in the annotation file
osa = read.table(annotation_file, sep="\t", header=TRUE, row.names=NULL, quote="", fill=TRUE)
#sample_order = read.table(sample_order_file, colClasses=c('character'),
# col.names=c('Sample'))
#osa = merge(sample_order, sample_annots, by = "Sample", sort=FALSE)
row.names(osa) = osa[,sample_header]
return(osa)
}
#' Draws the graph of the entire network
#'
#' @param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#' @param osa
#' The sample annotation matrix. One column must contain the header 'Sample'
#' and the remaining colums correspond to an annotation type. The rows
#' of the anntation columns should contain the annotations.
#' @export
graphNet = function(net, osa = data.frame()) {
edge_indexes = c(1:length(net$Source));
g = graphEdgeList(edge_indexes, net, osa)
return(g)
}
#' Draws the graph of a set of edges in the network
#'
#' @param edge_indexes
#' The index of the edges in the network that comprise the module.
#' @param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#' @param osa
#' The sample annotation matrix. One column must contain the header 'Sample'
#' and the remaining colums correspond to an annotation type. The rows
#' of the anntation columns should contain the annotations.
#'
#' @export
graphEdgeList = function(edge_indexes, net, osa = data.frame()) {
g = graph.edgelist(as.matrix(net[edge_indexes, c('Source', 'Target')]), directed = F)
E(g)$weight = abs(net[edge_indexes, 'Similarity_Score'])
# Don't show node labels if we have too many nodes.
vlc = 0.5
vs = 10
if (length(edge_indexes) > 100) {
vlc = 0.25
vs = 5
}
if (length(edge_indexes) > 1000) {
vlc = 0.001
vs = 2
}
# Plot the graph.
l = layout_nicely(g)
plot(g, vertex.label.color="black", vertex.label.cex = vlc, vertex.size = vs,
edge.color='black', vertex.color='cyan',
layout = l)
return(list(
graph = g,
layout = l
))
}
#' Colors the edges of a given edge list on the currently plotted graph.
#'
#'
#' @param edge_indexes
#' The index of the edges in the network that comprise the module.
#' @param graph
#' The igraph object as returned from graphNet or graphEdgeList.
#' @param layout
#' The layout object as returned from graphNet or graphEdgeList
#' @param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#' @param color
#' The color that should be used to highlith the edges
#'
#' @export
highlightEdgeList = function(edge_indexes, graph, layout, net, color) {
edges = net[edge_indexes, c('Source', 'Target')]
verts = names(V(graph))
layout = norm_coords(layout)
for (i in 1:nrow(edges)) {
source = edges[i, c('Source')]
target = edges[i, c('Target')]
sindex = which(verts == source)
tindex = which(verts == target)
x0 = layout[sindex, 1]
y0 = layout[sindex, 2]
x1 = layout[tindex, 1]
y1 = layout[tindex, 2]
segments(x0, y0, x1, y1, col = color)
}
}
#' Plots a 3D scatterplot for a given pair of nodes.
#'
#' @param gene1
#' The name of the first gene in the pair.
#' @param gene2
#' The name of the second gene in the pair.
#' @param osa
#' The sample annotation matrix. One column must contain the header 'Sample'
#' and the remaining colums correspond to an annotation type. The rows
#' of the anntation columns should contain the annotations.
#' @param ematrix
#' The expression matrix.
#' @param field
#' The field in the sample annotation matrix by which to split the points
#' into separate layers.
#' @param colors
#' An array of colors per sample.
#' @export
plot3DPair = function(gene1, gene2, osa, ematrix, field,
colors = NA, samples = NA, highlight = c()) {
rgl.open()
rgl.bg(color = "white")
if (!is.na(samples)) {
x = as.factor(osa[samples, field])
y = t(ematrix[gene1, ])
z = t(ematrix[gene2, ])
}
else {
x = as.factor(osa[, field])
y = t(ematrix[gene1, ])
z = t(ematrix[gene2, ])
}
size = rep(0.25, length(x))
if (length(highlight) > 0) {
size[highlight] = 0.5
}
if (is.na(colors) == TRUE) {
plot3d(x, y, z, type = 's', size = size)
} else {
plot3d(x, y, z, type = 's', size = size, col = colors)
}
}
#' Plots a 3D scatterplot for a given list of edges.
#'
#' @param edge_indexes
#' The index of the edge in the network data frame.
#' @param osa
#' The sample annotation matrix. One column must contain the header 'Sample'
#' and the remaining colums correspond to an annotation type. The rows
#' of the anntation columns should contain the annotations.
#' @param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#' @param ematrix
#' The expression matrix.
#' @param field
#' The field in the sample annotation matrix by which to split the points
#' into separate layers.
#' @param samples
#' Limit the plot to only the samples indexes provided.
#' @param highlight
#' If set to TRUE then the samples in the edge cluster are drawn larger than the
#' other samples in the plot.
#' @export
plot3DEdgeList = function(edge_indexes, osa, net, ematrix, field, label_field = NA,
samples = NA, highlight = TRUE) {
rgl.open()
rgl.bg(color = "white")
# Set the row names to be the sample names.
row.names(osa) = osa$Sample
# Loop until the user decides to leave.
j = 1
done = FALSE;
while (!done) {
i = edge_indexes[j]
source = net[i, 'Source']
target = net[i, 'Target']
sample_indexes = c()
if ('Samples' %in% names(net)) {
sample_indexes = getEdgeSamples(i, net)
}
if (!is.na(samples)) {
x = t(ematrix[source, samples])
y = t(ematrix[target, samples])
z = osa[[field]][samples]
}
else {
x = t(ematrix[source, ])
y = t(ematrix[target, ])
z = osa[, field]
}
size = rep(0.3, length(x))
if (highlight & length(sample_indexes) > 0) {
size[sample_indexes] = 0.75
}
# Build a color vector for the data.
zcolors = rep('blue', length(z))
if (class(z) != 'factor' & class(z) != 'numeric') {
z = as.factor(z)
}
if (class(z) == 'factor') {
if (nlevels(z) > 2 & nlevels(z) <= 8) {
colors = data.frame(colors = brewer.pal(nlevels(z), "Dark2"), zval = unique(z), stringsAsFactors=FALSE)
for (i in 1:length(z)) {
zcolors[i] = colors$colors[which(colors$zval == z[i])]
}
}
if (nlevels(z) > 8) {
colfunc <- colorRampPalette(c("darkblue", "cyan"))
colors = data.frame(colors = colfunc(length(sort(unique(z)))),
zval = sort(unique(z)),
stringsAsFactors=FALSE)
for (i in 1:length(z)) {
if (is.na(z[i])) {
zcolors[i] = '#000000'
} else {
zcolors[i] = colors$colors[which(colors$zval == z[i])]
}
}
}
if (nlevels(z) == 2) {
colors = data.frame(colors = brewer.pal(3, "Dark2")[1:2],
zval = unique(z),
stringsAsFactors=FALSE)
for (i in 1:length(z)) {
if (is.na(z[i])) {
zcolors[i] = '#000000'
} else {
zcolors[i] = colors$colors[which(colors$zval == z[i])]
}
}
}
}
if (class(z) == 'numeric') {
# Use a blue gradient
colfunc <- colorRampPalette(c("darkblue", "cyan"))
colors = data.frame(colors = colfunc(length(sort(unique(z)))),
zval = sort(unique(z)),
stringsAsFactors=FALSE)
for (i in 1:length(z)) {
if (is.na(z[i])) {
zcolors[i] = '#000000'
}
else {
zcolors[i] = colors$colors[which(colors$zval == z[i])]
}
}
}
main = paste(source, 'vs', target, 'Edge:', i)
plot3d(x, y, z, type = 's', main = main, size = size, col = zcolors, xlab=source, ylab=target, zlab=field, axes=FALSE)
z_labels = unique(sort(as.factor(osa[[field]])))
box3d()
axis3d('x')
axis3d('y')
axis3d('z', nticks = length(unique(osa[, field])) - 1, labels = z_labels)
if (!is.na(samples)) {
text3d(x, y, z, osa[names(ematrix[samples]), ][[label_field]], adj=c(1, 1))
}
else {
text3d(x, y, z, osa[names(ematrix), ][[label_field]], adj=c(1, 1))
}
# add legend
#legend3d("topright", legend = paste('Type', c('A', 'B', 'C')), pch = 16, col = rainbow(3), cex=1, inset=c(0.02))
# Use the key press to navigate through the images.
val = readline(prompt="Type control keys the [enter]. Keys: n > forward, b > backwards, q > quit.")
if (val == 'n') {
if (j < length(edge_indexes)) {
j = j + 1;
}
}
if (val == 'b') {
if (j > 1) {
j = j - 1
}
}
if(val == 'q') {
done = TRUE
}
}
}
#' Plots a scatterplot of gene expression across a set of samples.
#'
#' Samples will be ordered by the values of field.
#'
#' @param gene
#' The name of the gene to plot
#' @param osa
#' The sample annotation matrix. One column must contain the header 'Sample'
#' and the remaining colums correspond to an annotation type. The rows
#' of the anntation columns should contain the annotations.
#' @param ematrix
#' The expression matrix.
#' @param ingroup
#' A list of sample names that are considered the "ingroup". This could be,
#' for exapmle, a GMM cluster. These samples will be drawn in a larager
#' size..
#' @param field
#' The name of the field by which to order the points along the
#' x-axis.
#' @param colfield
#' The field in the OSA that should be used to color the points. Each
#' category in the field recieves a unique color.
#' @param show.legend
#' Set to TRUE to have a legend appear on the figure. Defaults to TRUE.
#' @param fig_title
#' The text to use for the title of the figure.
#' @param highlight
#' A vector of sample indexes to highlight in the plot.
#' @export
plotGene = function(gene, osa, ematrix, field, ingroup=c(), colfield = field,
show.legend=TRUE, fig_title = NA) {
condition = data.frame(c = osa[[field]], c2 = osa[[colfield]])
row.names(condition) = row.names(osa)
expdata = merge(t(ematrix[gene,]), condition, by="row.names")
colnames(expdata) = c('Sample', gene, field, colfield)
row.names(expdata) = expdata$Sample
expdata['size'] = 0.25
if (length(ingroup) > 0) {
expdata[ingroup, 'size'] = 1
}
# Remove missing values
expdata = expdata[which(!is.na(expdata[gene])),]
expplot = ggplot(expdata, aes_string(gene, field, color=colfield)) +
geom_point(size=expdata$size, show.legend = show.legend)
theme(legend.title = element_blank())
if (!is.numeric(expdata[field])) {
expplot = expplot + scale_color_brewer(palette="Dark2")
}
if (!is.null(fig_title)) {
expplot = expplot + ggtitle(fig_title)
}
return(expplot)
}
#' Plots a 2D scatterplot for a given list of edges.
#'
#' @param edge_indexes
#' The index of the edge in the network data frame.
#' @param osa
#' The sample annotation matrix. One column must contain the header 'Sample'
#' and the remaining colums correspond to an annotation type. The rows
#' of the anntation columns should contain the annotations.
#' @param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#' @param ematrix
#' The expression matrix.
#' @param field
#' The field in the sample annotation matrix by which to split the points
#' into separate layers.
#' @param samples
#' An array of sample names for which the scatterplot should be included in the plot.
#' If no value is provided then all samples with values are included.
#' @param highlight
#' If set to TRUE, makes the samples belonging to the cluster that
#' underlies the edge larger in the plot. Default = TRUE.
#' @param fig_title
#' A title to give the plot. Default = NULL
#' @export
plot2DEdgeList = function(edge_indexes, osa, net, ematrix,
field = NA, samples = NA, highlight = TRUE,
fig_title = NULL, show.legend=TRUE, legend.title = field) {
j = 1
done = FALSE;
while (!done) {
i = edge_indexes[j]
source = net$Source[i]
target = net$Target[i]
sample_indexes = c()
if ('Samples' %in% names(net)) {
sample_indexes = getEdgeSamples(i, net)
}
if (!is.na(samples)) {
x = t(ematrix[source, samples])
y = t(ematrix[target, samples])
colors = colors[samples]
condition = osa[[field]][samples]
}
else {
x = t(ematrix[source, ])
y = t(ematrix[target, ])
condition = osa[[field]]
}
size = rep(0.25, length(x))
if (!highlight) {
size = rep(1, length(x))
}
if (highlight & length(sample_indexes) > 0) {
size[sample_indexes] = 1.5
}
coexpdata = data.frame(source = x, target = y, category = condition, size = size)
colnames(coexpdata) = c('x', 'y', 'category', 'size')
coexpdata = coexpdata[complete.cases(coexpdata),]
coexpplot = ggplot(coexpdata, aes(x, y, color=category)) +
geom_point(size=coexpdata$size, show.legend = show.legend) +
xlab(source) + ylab(target) + labs(colour = legend.title)
if (!is.null(fig_title)) {
coexpplot = coexpplot + ggtitle(fig_title)
}
if (length(edge_indexes) == 1) {
return(coexpplot)
}
# Use the key press to navigate through the images.
val = readline(prompt=paste("Edge: ", i, '. ', j, " of ", length(edge_indexes), ". Type control keys the [enter]. Keys: n > forward, b > backwards, q > quit.", sep=""))
if (val == 'n') {
if (j < length(edge_indexes)) {
j = j + 1;
}
}
if (val == 'b') {
if (j > 1) {
j = j - 1
}
}
if(val == 'q') {
done = TRUE
}
}
}
#' Plots a 2D scatterplot for a given pair of genes
#'
#' @param gene1
#' The name of the first gene in the pair.
#' @param gene2
#' The name of the second gene in the pair.
#' @param osa
#' The sample annotation matrix. One column must contain the header 'Sample'
#' and the remaining colums correspond to an annotation type. The rows
#' of the anntation columns should contain the annotations.
#' @param ematrix
#' The expression matrix.
#' @param field
#' The field in the sample annotation matrix by which to split the points
#' into separate layers.
#' @param ingroup
#' A list of sample names that are considered the "ingroup". This could be,
#' for exapmle, a GMM cluster. These samples will be drawn in a larager
#' size..
#' @param fig_title
#' A title to give the plot. Default = NULL
#' @export
plot2DPair = function(gene1, gene2, osa, ematrix, field, ingroup = c(),
fig_title = NA, show.legend=TRUE) {
# Get the gene expression and order it by the osa sample names.
x = t(ematrix[gene1, ])
y = t(ematrix[gene2, ])
coexpdata = data.frame(source = x, target = y)
# Get the conditions of the field specified by the user.
coexpdata['Category'] = osa[colnames(ematrix), field]
coexpdata['Group'] = 'Out'
coexpdata[ingroup, 'Group'] = 'In'
coexpdata['Size'] = 0.25
coexpdata[ingroup, 'Size'] = 1
row.names(coexpdata) = make.names(row.names(coexpdata))
coexpplot = ggplot(coexpdata, aes_string(gene1, gene2, color='Category')) +
geom_point(size=coexpdata$Size, show.legend = show.legend) +
xlab(gene1) + ylab(gene2) + labs(colour=field)
if (!is.numeric(coexpdata$Category)) {
coexpplot = coexpplot + scale_color_brewer(palette="Dark2")
}
if (!is.null(fig_title)) {
coexpplot = coexpplot + ggtitle(fig_title)
}
return(coexpplot)
}
#' Plots four images for each edge to explore their relationship.
#'
#' @param gene1
#' The name of the first gene in the pair.
#' @param gene2
#' The name of the second gene in the pair.
#' @param osa
#' The sample annotation matrix. One column must contain the header 'Sample'
#' and the remaining colums correspond to an annotation type. The rows
#' of the anntation columns should contain the annotations.
#' @param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#' @param ematrix
#' The expression matrix.
#' @param field
#' The field in the sample annotation matrix by which to split the points
#' into separate layers in the pairwise scatterplot
#' @param field2
#' The field in the sample annotation matrix by which to order points on
#' the y-axis of the bottom two gene expression plots. The x-axis is
#' ordered by expression level.
#' @export
plot2DEdgeReport <- function(edge_indexes, osa, net, ematrix,
field = NA, field2 = field, show.legend=TRUE,
control=c()) {
j = 1
done = FALSE;
while (!done) {
i = edge_indexes[j]
source = net[i, 'Source']
target = net[i, 'Target']
plot2DPairReport(source, target, osa, net, ematrix, field, field2, show.legend, control=control)
if (length(edge_indexes) == 1) {
return
}
# Use the key press to navigate through the images.
val = readline(prompt=paste("Edge: ", i, '. ', j, " of ", length(edge_indexes), ". Type control keys the [enter]. Keys: n > forward, b > backwards, q > quit.", sep=""))
if (val == 'n') {
if (j < length(edge_indexes)) {
j = j + 1;
}
}
if (val == 'b') {
if (j > 1) {
j = j - 1
}
}
if(val == 'q') {
done = TRUE
}
}
}
#' Plots four images for a given gene pair to explore their relationship.
#'
#' @param gene1
#' The name of the first gene in the pair.
#' @param gene2
#' The name of the second gene in the pair.
#' @param osa
#' The sample annotation matrix. One column must contain the header 'Sample'
#' and the remaining colums correspond to an annotation type. The rows
#' of the anntation columns should contain the annotations.
#' @param ematrix
#' The expression matrix.
#' @param field
#' The field in the sample annotation matrix by which to split the points
#' into separate layers in the pairwise scatterplot
#' @param ingroup
#' A list of sample names that are considered the "ingroup". This could be,
#' for exapmle, a GMM cluster. These samples will be drawn in a larager
#' size..
#' @param field2
#' The field in the sample annotation matrix by which to order points on
#' the y-axis of the bottom two gene expression plots. The x-axis is
#' ordered by expression level.
#' @param show.lengend
#' Set to TRUE to display the plot legened
#' @export
plot2DPairReport <-function(gene1, gene2, osa, ematrix, field, ingroup = c(),
field2 = field, show.legend=TRUE) {
groups = osa[c(field, field2)]
groups['Group'] = 'Out'
groups[ingroup, 'Group'] = 'In'
FigYa = plot2DPair(gene1, gene2, osa, ematrix, ingroup=ingroup, field = field, fig_title='(a)', show.legend=TRUE)
FigYc = plotGene(gene1, osa, ematrix, field=field2, ingroup=ingroup, colfield=field, show.legend=TRUE, fig_title='(c)')
FigYd = plotGene(gene2, osa, ematrix, field=field2, ingroup=ingroup, colfield=field, show.legend=TRUE, fig_title='(d)')
expdata = merge(t(ematrix[c(gene1,gene2),]), groups, by="row.names")
cols = c('Sample', gene1, gene2, field, field2)
colnames(expdata) = cols
expdata = expdata[, c('Sample', gene1, gene2, field)]
mexpdata = melt(expdata, id.vars=c('Sample', field), variable.name='Gene', value.name='Expression' )
colnames(expdata) = c('Sample', 'Edge', 'Gene', 'Expression')
FigYb <- ggplot(mexpdata, aes_string(x=field, y="Expression", fill=field)) +
geom_violin(trim=FALSE, show.legend=FALSE) + ggtitle('(b)') +
geom_boxplot(width=0.1, show.legend=FALSE) +
scale_fill_brewer(palette="Dark2") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
facet_grid(. ~ Gene)
layout <- matrix(seq(1,100), ncol = 10, nrow = 10)
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
print(FigYa, vp = viewport(layout.pos.row = c(1:6), layout.pos.col = c(1:5)))
print(FigYb, vp = viewport(layout.pos.row = c(1:6), layout.pos.col = c(6:10)))
print(FigYc, vp = viewport(layout.pos.row = c(7:10), layout.pos.col = c(1:5)))
print(FigYd, vp = viewport(layout.pos.row = c(7:10), layout.pos.col = c(6:10)))
}
#' Finds Linked communities in the network.
#'
#' This function uses the linked communities (linkcomm) method for module discover. This
#' approaches finds modules of edges rather than modules of nodes. This allows
#' nodes to be in more than one module better supporting the concept of multifunctional
#' genes. This function generates three output files that it writes to the current
#' working directory.
#'
#' @param net
#' A network data frame containing the KINC-produced network. The loadNetwork
#' function imports a dataframe in the correct format for this function.
#' @param file_prefix
#' A prefix to add to the beginning of each file name.
#' @param module_prefix
#' A prefix to add to the beginning of the module names. By deafult this is simply
#' the letter 'M'.
#' @param hcmethod
#' A character string naming the hierarchical clustering method to use. Can be one of
#' "ward.D", "single", "complete", "average", "mcquitty", "median", or "centroid".
#' Defaults to "single".
#' @param meta
#' Indicates if modules should be collapsed into meta-modules. If set to TRUE then
#' the linked communities returned are meta modules. Defaults to FALSE.
#' @param th
#' Specifies the Jaccard similarity score between two modules gene content in order
#' for those modules to be merged. Only applies if meta=TRUE. Lower threshold results in merging
#' being more common.
#' @param min.verticies
#' If a network is disconnected then communities will be found in each subgraph
#' independnet of the others. This argument specifies the number of elements that
#' must exist in a subgraph for communities to be identified.
#' @param ignore_inverse
#' If TRUE inverese edges are removed from the analysis. Defaults to TRUE
#' @param sim_col
#' The name of the column in the network data frame that contains the similarity
#' score.
#' @return
#' The linked communities object.
#' @export
#'
findLinkedCommunities = function(net, file_prefix="net", module_prefix = 'M',
hcmethod = 'ward.D', meta = TRUE,
ignore_inverse = TRUE, th=0.5, min.vertices=10,
sim_col = 'Similarity_Score') {
new_net = net
new_net$Module = NA
# If the user requested to ignore inverse correlation edges we'll take those out.
if (ignore_inverse) {
g = graph.edgelist(as.matrix(net[which(net[[sim_col]] > 0), c('Source', 'Target')]), directed = F)
E(g)$weight = abs(net[which(net[[sim_col]] > 0), sim_col])
}
else {
g = graph.edgelist(as.matrix(net[, c('Source', 'Target')]), directed = F)
E(g)$weight = abs(net[, sim_col])
}
# Remove duplicated edges or it drammatically slows down cluster merging.
#g = unique(g)
# Linked community method doesn't do well with disconnected graphs. So, we want
# to decompose the graph into its disjointed subgraphs. If the callee doesn't want
# modules to span inverse edges then remove those.
subg = decompose(g, min.vertices = min.vertices);
lcs = list()
# Iterate through the subgraphs and find the communities.
for (gi in 1:length(subg)) {
print(paste('Working on subgraph', gi, 'of', length(subg), '...', sep=" "))
# We need a large enough subnetwork to find clusters.
subnet = as_edgelist(subg[[gi]])
# Create the Link Community clusters.
lc = getLinkCommunities(subnet, hcmethod = hcmethod, removetrivial = FALSE, plot = FALSE)
# If the meta analysis was selected. Rather than use the cluster merging
# function that comes with LCM we perform our own. This is because the
# heirarchical clustering method of LCM improperly merges some clusters that
# are not connected.
if (meta) {
r = mergeCommunities(lc, th)
}
else {
r = list(
cedges = lc$clusters,
cnodes = lc$nodeclusters
)
}
lcs[[gi]] = r
# Iterate through all of the edges in the graph and add the module to the
# network
for (i in 1:length(r$cedges)) {
for (j in 1:length(r$cedges[[i]])) {
edge = subnet[r$cedges[[i]][j],]
node1 = edge[1]
node2 = edge[2]
cluster = i
ei = which(new_net$Source == node1 & new_net$Target == node2)
if (length(ei) > 0) {
module = sprintf("SG%02dM%04d", gi, cluster)
new_net[ei,]$Module = module
}
ei = which(new_net$Target == node1 & new_net$Source == node2)
if (length(ei) > 0) {
module = sprintf("SG%02dM%04d", gi, cluster)
new_net[ei,]$Module = module
}
}
}
}
# Convert all of the arrays above into a data frame for printing the modules
# list.
write.table(new_net, file=paste(file_prefix, "gcn.lcm.txt", sep=".") ,sep="\t", row.names=FALSE, append=FALSE, quote=FALSE)
# Write out the node list
node_list = data.frame(Node=c(as.character(new_net$Source), as.character(new_net$Target)), Cluster=c(new_net$Module, new_net$Module))
write.table(unique(node_list), file=paste(file_prefix, "coexpnet.edges.lcmByNodes.txt", sep=".") ,sep="\t", row.names=FALSE, append=FALSE, quote=FALSE, col.names=FALSE)
# convert the edge and cluster into a report of modules by edge which can
# be used for Cytoscape.
new_net_edges = cbind(paste(new_net$Source, "(co)", new_net$Target, sep=" "), new_net$Module)
colnames(new_net_edges) = c('Edge', 'Module')
new_net_edges = new_net_edges[which(!is.na(new_net_edges[,2])),]
write.table(new_net_edges, file=paste(file_prefix, "coexpnet.edges.lcm.cys.txt", sep=".") ,sep="\t", row.names=FALSE, append=FALSE, quote=FALSE)
return(lcs)
}
#' Merges linked community clusters that have the most similar sets of nodes.
#'
#' This merging function recursively iterates through all of the clusters
#' and performs a pair-wse Jaccard comparision between all clusters.
#' Those with the highest Jaccard score that are above the given
#' threshold (i.e. deafult of 0.5) are candidates for merging.
#'
#' This is a helper function for the findLinkedCommunities() function and is
#' not meant to be called on its own.
#'
#' @param lc
#' A linkcomm object.
#' @param th
#' Threshold to be used by `mergeClusters`. Lower threshold results in merging
#' being more common.
#'
#' @return
#' A list were each element of the list is the
#' set of nodes and edges of the merged clusters.
mergeCommunities = function(lc, th = 0.5){
cedges = lc$clusters
cnodes = lc$nodeclusters
cnodes$cluster = as.integer(cnodes$cluster)
r = mergeClusters(cedges, cnodes, th)
return(r)
}
#' The recursive merging function called by mergeCommunities().
#' This is a helper function for the findLinkedCommunities() and mergeCommunities
#' functions and is not meant to be called on its own.
mergeClusters = function(cedges, cnodes, th) {
nclusters = length(cedges)
# Create a dataframe for storing the best pairwise Jaccard similarity scores.
best = data.frame(i = 0, j = 0, ji = 0)
for (i in 1:nclusters) {
if (best$ji[1] == 1) next
for (j in 1:nclusters) {
if (best$ji[1] == 1) next
if (j >= i) next
A = as.character(cnodes$node[which(cnodes$cluster == i)])
B = as.character(cnodes$node[which(cnodes$cluster == j)])
overlap_i = length(intersect(A, B))/length(A)
overlap_j = length(intersect(A, B))/length(B)
if (overlap_i > best$ji[1]) {
best$i[1] = i
best$j[1] = j
best$ji[1] = overlap_i
}
if (overlap_j > best$ji[1]) {
best$i[1] = i
best$j[1] = j
best$ji[1] = overlap_j
}
}
}
# Now merge the best two clusters as long as the jaccard score is above
# our given threshold.
if (best$ji[1] >= th) {
i = best$i[1]
j = best$j[1]
cat(paste("Merging", i, 'and', j, 'from', nclusters, 'similarity:', sprintf("%.02f", best$ji[1]), "\r", sep=" "))
cedges[[i]] = union(cedges[[i]], cedges[[j]])
cedges[[j]] = NULL
cnodes$cluster[which(cnodes$cluster == j)] = i
cnodes$cluster[which(cnodes$cluster > j)] = cnodes$cluster[which(cnodes$cluster > j)] - 1
cnodes = cnodes[which(!duplicated(cnodes)),]
r = mergeClusters(cedges, cnodes, th)
return(r)
}
return(list(
cedges = cedges,
cnodes = cnodes
))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.