R/func__geoTempAnalyser__mkCoocurNetwork.R

Defines functions .countCoocurrencePerYear mkCoocurNetwork

Documented in mkCoocurNetwork

#' @title Converting a given network into a co-occurrence network
#'
#' @param alleles A vector of alleles for which pairwise co-occurrence counts
#' will be calculated.
#' @param net A Graph object defining a network whose edge weights will be replaced
#' with co-occurrence counts. The argument "alleles" overrides this argument. When
#' alleles = NULL and net != NULL, the net will be converted into a co-occurrence
#' network while no edges will be added to it. To define the object "net", the
#' first two columns of the edge attribute must be node names. Edge attributes
#' must include "d_min" and "d_max" for the ingroup distances when the parameter
#' ds is specified and use.ingroup.d = TRUE.
#' @param pam An allelic presence-absence matrix produced by the function
#' importAllelicPAM.
#' @param sam A data frame of sample information. It must have two columns named
#' "Year_low" and "Year_up" for the lower bound and upper bound of years that an
#' isolate was obtained. For isolates whose isolation years are known explicitly,
#' Year_low = Year_up. In addition, the first column must be isolate names.
#' @param name.col The index for the column of sam for strain names. By default,
#' name.col = 1. Namely, the first column in sam stores the strain names.
#' @param years An integer vector for every year the co-occurrence network is
#' generated. Leave this argument as 0 (default) to disable year-based sampling.
#' @param ds A data frame of allelic physical distances, which is generated by the
#' physDist pipeline. This is an optional argument.
#' @param use.ingroup.d A logical argument specifying the function to filter the
#' physical allelic distances (in ds) using the d_min and d_max in edge attributes.
#' By default, use.ingroup.d = FALSE, so All distances are used to calculate the
#' distance measurability.
#'
#' @return A list of two elements. The first element "G" is a list of Graph objects,
#' each named by a year; the second element w is a numeric vector for weighted
#' occurrence count per isolate. However, when years <= 0, the G element is a
#' single Graph object.
#'
#' @author Yu Wan (\email{wanyuac@@126.com})
#' @export
#
#  Copyright 2018 Yu Wan <wanyuac@126.com>
#  Licensed under the Apache License, Version 2.0
#  First edition: 11 Aug 2018, the latest edition: 25 Dec 2018

mkCoocurNetwork <- function(alleles = NULL, net = NULL, pam = NULL, sam = NULL,
                            name.col = 1, years = 0, ds = NULL, use.ingroup.d = FALSE) {
    # Validity check
    all_to_all <- !is.null(alleles)  # to compute all-to-all co-occurrence or simply convert a network into a co-occurrence network?
    if ((!all_to_all) && is.null(net)) {
        stop("Argument error: alleles and net cannot be NULL at the same time.")
    }

    # Unify column name
    if (names(sam)[name.col] != "Strain") {
        names(sam)[name.col] <- "Strain"
    }
    with_ds <- !is.null(ds)

    # Calculate weighted occurrence counts. For instance, if an isolate was obtained
    # between 2017 and 2018, then its weighted occurence count of an allele is
    # 0.5 for each year.
    year_range <- sam$Year_up - sam$Year_low + 1
    ws <- round(1 / year_range, digits = 4)  # weight for allelic presence per year: the wider the range, the smaller the weight
    names(ws) <- sam$Strain  # the weights
    ws <- ws[rownames(pam)]  # match the weights with strain names in the PAM
    #W <- diag(as.numeric(w_counts))  # construct a square diagonal matrix
    #rownames(W) <- rownames(pam)  # Otherwise, PAM loses row names in the next step.
    #pam <- W %*% pam  # Convert the allelic PAM into a weighted PAM by multiplying the weights to rows of PAM, which is essentially weight the allelic presence-absence for each strain

    if (all_to_all) {
        # Create a new network from all the alleles
        E <- data.frame(a1 = character(0), a2 = character(0), stringsAsFactors = FALSE)
        n <- length(alleles)
        for (i in 1 : (n - 1)) {
            for (j in (i + 1) : n) {
                E <- rbind.data.frame(E, data.frame(a1 = alleles[i], a2 = alleles[j], stringsAsFactors = FALSE))
            }
        }
    } else {
        # Convert the network into an undirected network
        no_pair <- !any(names(net@E) == "pair")
        if (no_pair) {
            net@E <- assignPairID(lmms = net@E, paired.rows = FALSE)  # a "pair" column will be appended to this data frame
        }
        E <- net@E[!duplicated(net@E$pair), ]  # only use the data frame of edge attributes for the rest of process
        if (no_pair) {
            E <- E[, -ncol(E)]  # drop the last column - "pair", which was appended just now
        }
        names(E)[c(1, 2)] <- c("a1", "a2")  # allele 1 and 2. Actually, only the first two columns will be used for the rest of process.
    }

    # Get co-occurrence count for each pair of alleles in each year
    if (years[1] > 0) {  # temporal mode
        G <- vector(mode = "list", length = 0)
        for (yr in years) {
            strains_yr <- sam$Strain[(sam$Year_low <= yr) & (sam$Year_up >= yr)]  # strains of that year
            n <- length(strains_yr)  # sample size of that year
            print(paste0(n, " strains was isolated in ", yr, "."))
            if (n > 1) {  # multiple strains
                pam_yr <- pam[strains_yr, ]
                ws_yr <- ws[strains_yr]
                if (with_ds) {
                    ds_yr <- subset(ds, sample %in% strains_yr)
                    G[[as.character(yr)]] <- .countCoocurrencePerYear(E, yr, pam_yr, ds_yr, ws_yr, use.ingroup.d)
                } else {
                    G[[as.character(yr)]] <- .countCoocurrencePerYear(E, yr, pam_yr, NULL, ws_yr, use.ingroup.d)
                }
            } else if (n == 1) {  # Only a single strain was obtained in that year.
                print(paste0("One strain was isolated in ", yr, "."))
                pam_yr <- pam[strains_yr, ]  # a named vector
                ws_yr <- ws[[strains_yr]]  # a numeric value
                if (with_ds) {
                    ds_yr <- subset(ds, sample == strains_yr)
                    G[[as.character(yr)]] <- .countCoocurrencePerYear(E, yr, pam_yr, ds_yr, ws_yr, use.ingroup.d)
                } else {
                    G[[as.character(yr)]] <- .countCoocurrencePerYear(E, yr, pam_yr, NULL, ws_yr, use.ingroup.d)
                }
            } else {
                print(paste0("No strain was isolated in the year ", yr, "."))
                G[[as.character(yr)]] <- NULL
            }
        }
    } else if (with_ds) {  # overview mode
        G <- .countCoocurrencePerYear(E = E, yr = "all", pam_yr = pam, ds_yr = ds, ws_yr = ws, filter_d = use.ingroup.d)
    } else {
        G <- .countCoocurrencePerYear(E = E, yr = "all", pam_yr = pam, ds_yr = NULL, ws_yr = ws, filter_d = use.ingroup.d)
    }

    out <- list(G = G, w = ws)

    return(out)
}

.countCoocurrencePerYear <- function(E, yr, pam_yr, ds_yr = NULL, ws_yr, filter_d = FALSE) {
    # a subordinate function of mkCoocurNetwork
    v_yr <- data.frame(allele = character(0), count = integer(0),
                       count_w = numeric(0), stringsAsFactors = FALSE)
    e_yr <- data.frame(a1 = character(0), a2 = character(0), n_co = integer(0),
                       n_co_w = numeric(0), n_d = integer(0),
                       m = numeric(0), stringsAsFactors = FALSE)
    pam_isMat <- is.matrix(pam_yr)
    if (!is.null(ds_yr)) {
        with_ds <- nrow(ds_yr) > 0
    } else {
        with_ds <- FALSE
    }

    # Edge attributes ===============
    for (i in 1 : nrow(E)) {  # go through every edge
        e <- E[i, ]  # take an edge from the undirected network
        a1 <- e$a1
        a2 <- e$a2

        if (pam_isMat) {
            v_co <- pam_yr[, a1] * pam_yr[, a2]  # a binary vector of co-occurrence status
        } else {
            v_co <- pam_yr[[a1]] * pam_yr[[a2]]  # a binary variable
        }

        n_co <- sum(v_co)
        n_co_w <- sum(v_co * ws_yr)  # weighted co-occurrence count
        if (n_co > 0) {
            if (with_ds) {
                ds_yr_a12 <- getRowsXY(df = ds_yr, k1 = a1, k2 = a2, c1 = "query1", c2 = "query2")
                if (nrow(ds_yr_a12) > 0) {
                    d <- ds_yr_a12$distance
                    if (filter_d) {
                        n_d_a12 <- sum(d >= e$d_min & d <= e$d_max)  # in-group distances
                    } else {
                        n_d_a12 <- length(d)
                    }
                    e_yr <- rbind.data.frame(e_yr,
                                             data.frame(a1 = a1, a2 = a2, n_co = n_co,
                                                        n_co_w = n_co_w, n_d = n_d_a12,
                                                        m = round(n_d_a12 / n_co, digits = 4),
                                                        stringsAsFactors = FALSE),
                                             stringsAsFactors = FALSE)
                } else {  # No distance available for the current pair of alleles this year
                    e_yr <- rbind.data.frame(e_yr,
                                             data.frame(a1 = a1, a2 = a2, n_co = n_co,
                                                        n_co_w = n_co_w, n_d = 0, m = 0,
                                                        stringsAsFactors = FALSE),
                                             stringsAsFactors = FALSE)
                }
            } else {  # No distance is available for that year.
                e_yr <- rbind.data.frame(e_yr,
                                         data.frame(a1 = a1, a2 = a2, n_co = n_co,
                                                    n_co_w = n_co_w, n_d = 0, m = 0,
                                                    stringsAsFactors = FALSE),
                                         stringsAsFactors = FALSE)
            }
        } else {
            e_yr <- rbind.data.frame(e_yr,
                                     data.frame(a1 = a1, a2 = a2, n_co = 0,
                                                n_co_w = 0, n_d = 0, m = 0,
                                                stringsAsFactors = FALSE),
                                     stringsAsFactors = FALSE)
        }
    }

    # Attach other columns
    e_yr <- merge(x = e_yr, y = E, by = c("a1", "a2"), all.x = TRUE, all.y = FALSE, sort = FALSE)

    # Node attributes ===============
    vs <- sort(union(e_yr$a1, e_yr$a2), decreasing = FALSE)
    if (pam_isMat) {
        for (a in vs) {
            pa <- pam_yr[, a]
            n <- sum(pa)
            if (n > 0) {
                n_w <- sum(pa * ws_yr)  # weighted counts
            } else {
                n_w <- 0
            }
            v_yr <- rbind.data.frame(v_yr, data.frame(allele = a,
                                                      count = n,
                                                      count_w = n_w,
                                                      stringsAsFactors = FALSE),
                                     stringsAsFactors = FALSE)
        }
    } else {
        for (a in vs) {
            pa <- pam_yr[[a]]  # a single, binary value
            v_yr <- rbind.data.frame(v_yr, data.frame(allele = a,
                                                      count = pa,
                                                      count_w = pa * ws_yr,
                                                      stringsAsFactors = FALSE),
                                     stringsAsFactors = FALSE)
        }
    }


    return(new("Graph", id = as.character(yr), V = v_yr, E = e_yr))
}
wanyuac/GeneMates documentation built on Aug. 12, 2022, 7:37 a.m.