#' @title Converting a given network into a co-occurrence network
#'
#' @param alleles A vector of alleles for which pairwise co-occurrence counts
#' will be calculated.
#' @param net A Graph object defining a network whose edge weights will be replaced
#' with co-occurrence counts. The argument "alleles" overrides this argument. When
#' alleles = NULL and net != NULL, the net will be converted into a co-occurrence
#' network while no edges will be added to it. To define the object "net", the
#' first two columns of the edge attribute must be node names. Edge attributes
#' must include "d_min" and "d_max" for the ingroup distances when the parameter
#' ds is specified and use.ingroup.d = TRUE.
#' @param pam An allelic presence-absence matrix produced by the function
#' importAllelicPAM.
#' @param sam A data frame of sample information. It must have two columns named
#' "Year_low" and "Year_up" for the lower bound and upper bound of years that an
#' isolate was obtained. For isolates whose isolation years are known explicitly,
#' Year_low = Year_up. In addition, the first column must be isolate names.
#' @param name.col The index for the column of sam for strain names. By default,
#' name.col = 1. Namely, the first column in sam stores the strain names.
#' @param years An integer vector for every year the co-occurrence network is
#' generated. Leave this argument as 0 (default) to disable year-based sampling.
#' @param ds A data frame of allelic physical distances, which is generated by the
#' physDist pipeline. This is an optional argument.
#' @param use.ingroup.d A logical argument specifying the function to filter the
#' physical allelic distances (in ds) using the d_min and d_max in edge attributes.
#' By default, use.ingroup.d = FALSE, so All distances are used to calculate the
#' distance measurability.
#'
#' @return A list of two elements. The first element "G" is a list of Graph objects,
#' each named by a year; the second element w is a numeric vector for weighted
#' occurrence count per isolate. However, when years <= 0, the G element is a
#' single Graph object.
#'
#' @author Yu Wan (\email{wanyuac@@126.com})
#' @export
#
# Copyright 2018 Yu Wan <wanyuac@126.com>
# Licensed under the Apache License, Version 2.0
# First edition: 11 Aug 2018, the latest edition: 25 Dec 2018
mkCoocurNetwork <- function(alleles = NULL, net = NULL, pam = NULL, sam = NULL,
name.col = 1, years = 0, ds = NULL, use.ingroup.d = FALSE) {
# Validity check
all_to_all <- !is.null(alleles) # to compute all-to-all co-occurrence or simply convert a network into a co-occurrence network?
if ((!all_to_all) && is.null(net)) {
stop("Argument error: alleles and net cannot be NULL at the same time.")
}
# Unify column name
if (names(sam)[name.col] != "Strain") {
names(sam)[name.col] <- "Strain"
}
with_ds <- !is.null(ds)
# Calculate weighted occurrence counts. For instance, if an isolate was obtained
# between 2017 and 2018, then its weighted occurence count of an allele is
# 0.5 for each year.
year_range <- sam$Year_up - sam$Year_low + 1
ws <- round(1 / year_range, digits = 4) # weight for allelic presence per year: the wider the range, the smaller the weight
names(ws) <- sam$Strain # the weights
ws <- ws[rownames(pam)] # match the weights with strain names in the PAM
#W <- diag(as.numeric(w_counts)) # construct a square diagonal matrix
#rownames(W) <- rownames(pam) # Otherwise, PAM loses row names in the next step.
#pam <- W %*% pam # Convert the allelic PAM into a weighted PAM by multiplying the weights to rows of PAM, which is essentially weight the allelic presence-absence for each strain
if (all_to_all) {
# Create a new network from all the alleles
E <- data.frame(a1 = character(0), a2 = character(0), stringsAsFactors = FALSE)
n <- length(alleles)
for (i in 1 : (n - 1)) {
for (j in (i + 1) : n) {
E <- rbind.data.frame(E, data.frame(a1 = alleles[i], a2 = alleles[j], stringsAsFactors = FALSE))
}
}
} else {
# Convert the network into an undirected network
no_pair <- !any(names(net@E) == "pair")
if (no_pair) {
net@E <- assignPairID(lmms = net@E, paired.rows = FALSE) # a "pair" column will be appended to this data frame
}
E <- net@E[!duplicated(net@E$pair), ] # only use the data frame of edge attributes for the rest of process
if (no_pair) {
E <- E[, -ncol(E)] # drop the last column - "pair", which was appended just now
}
names(E)[c(1, 2)] <- c("a1", "a2") # allele 1 and 2. Actually, only the first two columns will be used for the rest of process.
}
# Get co-occurrence count for each pair of alleles in each year
if (years[1] > 0) { # temporal mode
G <- vector(mode = "list", length = 0)
for (yr in years) {
strains_yr <- sam$Strain[(sam$Year_low <= yr) & (sam$Year_up >= yr)] # strains of that year
n <- length(strains_yr) # sample size of that year
print(paste0(n, " strains was isolated in ", yr, "."))
if (n > 1) { # multiple strains
pam_yr <- pam[strains_yr, ]
ws_yr <- ws[strains_yr]
if (with_ds) {
ds_yr <- subset(ds, sample %in% strains_yr)
G[[as.character(yr)]] <- .countCoocurrencePerYear(E, yr, pam_yr, ds_yr, ws_yr, use.ingroup.d)
} else {
G[[as.character(yr)]] <- .countCoocurrencePerYear(E, yr, pam_yr, NULL, ws_yr, use.ingroup.d)
}
} else if (n == 1) { # Only a single strain was obtained in that year.
print(paste0("One strain was isolated in ", yr, "."))
pam_yr <- pam[strains_yr, ] # a named vector
ws_yr <- ws[[strains_yr]] # a numeric value
if (with_ds) {
ds_yr <- subset(ds, sample == strains_yr)
G[[as.character(yr)]] <- .countCoocurrencePerYear(E, yr, pam_yr, ds_yr, ws_yr, use.ingroup.d)
} else {
G[[as.character(yr)]] <- .countCoocurrencePerYear(E, yr, pam_yr, NULL, ws_yr, use.ingroup.d)
}
} else {
print(paste0("No strain was isolated in the year ", yr, "."))
G[[as.character(yr)]] <- NULL
}
}
} else if (with_ds) { # overview mode
G <- .countCoocurrencePerYear(E = E, yr = "all", pam_yr = pam, ds_yr = ds, ws_yr = ws, filter_d = use.ingroup.d)
} else {
G <- .countCoocurrencePerYear(E = E, yr = "all", pam_yr = pam, ds_yr = NULL, ws_yr = ws, filter_d = use.ingroup.d)
}
out <- list(G = G, w = ws)
return(out)
}
.countCoocurrencePerYear <- function(E, yr, pam_yr, ds_yr = NULL, ws_yr, filter_d = FALSE) {
# a subordinate function of mkCoocurNetwork
v_yr <- data.frame(allele = character(0), count = integer(0),
count_w = numeric(0), stringsAsFactors = FALSE)
e_yr <- data.frame(a1 = character(0), a2 = character(0), n_co = integer(0),
n_co_w = numeric(0), n_d = integer(0),
m = numeric(0), stringsAsFactors = FALSE)
pam_isMat <- is.matrix(pam_yr)
if (!is.null(ds_yr)) {
with_ds <- nrow(ds_yr) > 0
} else {
with_ds <- FALSE
}
# Edge attributes ===============
for (i in 1 : nrow(E)) { # go through every edge
e <- E[i, ] # take an edge from the undirected network
a1 <- e$a1
a2 <- e$a2
if (pam_isMat) {
v_co <- pam_yr[, a1] * pam_yr[, a2] # a binary vector of co-occurrence status
} else {
v_co <- pam_yr[[a1]] * pam_yr[[a2]] # a binary variable
}
n_co <- sum(v_co)
n_co_w <- sum(v_co * ws_yr) # weighted co-occurrence count
if (n_co > 0) {
if (with_ds) {
ds_yr_a12 <- getRowsXY(df = ds_yr, k1 = a1, k2 = a2, c1 = "query1", c2 = "query2")
if (nrow(ds_yr_a12) > 0) {
d <- ds_yr_a12$distance
if (filter_d) {
n_d_a12 <- sum(d >= e$d_min & d <= e$d_max) # in-group distances
} else {
n_d_a12 <- length(d)
}
e_yr <- rbind.data.frame(e_yr,
data.frame(a1 = a1, a2 = a2, n_co = n_co,
n_co_w = n_co_w, n_d = n_d_a12,
m = round(n_d_a12 / n_co, digits = 4),
stringsAsFactors = FALSE),
stringsAsFactors = FALSE)
} else { # No distance available for the current pair of alleles this year
e_yr <- rbind.data.frame(e_yr,
data.frame(a1 = a1, a2 = a2, n_co = n_co,
n_co_w = n_co_w, n_d = 0, m = 0,
stringsAsFactors = FALSE),
stringsAsFactors = FALSE)
}
} else { # No distance is available for that year.
e_yr <- rbind.data.frame(e_yr,
data.frame(a1 = a1, a2 = a2, n_co = n_co,
n_co_w = n_co_w, n_d = 0, m = 0,
stringsAsFactors = FALSE),
stringsAsFactors = FALSE)
}
} else {
e_yr <- rbind.data.frame(e_yr,
data.frame(a1 = a1, a2 = a2, n_co = 0,
n_co_w = 0, n_d = 0, m = 0,
stringsAsFactors = FALSE),
stringsAsFactors = FALSE)
}
}
# Attach other columns
e_yr <- merge(x = e_yr, y = E, by = c("a1", "a2"), all.x = TRUE, all.y = FALSE, sort = FALSE)
# Node attributes ===============
vs <- sort(union(e_yr$a1, e_yr$a2), decreasing = FALSE)
if (pam_isMat) {
for (a in vs) {
pa <- pam_yr[, a]
n <- sum(pa)
if (n > 0) {
n_w <- sum(pa * ws_yr) # weighted counts
} else {
n_w <- 0
}
v_yr <- rbind.data.frame(v_yr, data.frame(allele = a,
count = n,
count_w = n_w,
stringsAsFactors = FALSE),
stringsAsFactors = FALSE)
}
} else {
for (a in vs) {
pa <- pam_yr[[a]] # a single, binary value
v_yr <- rbind.data.frame(v_yr, data.frame(allele = a,
count = pa,
count_w = pa * ws_yr,
stringsAsFactors = FALSE),
stringsAsFactors = FALSE)
}
}
return(new("Graph", id = as.character(yr), V = v_yr, E = e_yr))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.