R/account_stats.R

Defines functions account_stats

Documented in account_stats

#' account_stats
#'
#' @description
#' Calculate account statistics: total posts shared, average time delta,
#' average edge symmetry score.
#'
#' @details
#' With this helper function, you can obtain summary statistics for the accounts in
#' the network. When applied to a network for which a narrower `time_window` has
#' been calculated using the \link{flag_speed_share} function, the summary statistics
#' are computed separately for the full and faster networks depending on the
#' 'weight_threshold' option. When this option is set to "full", metrics are
#' computed on the set of nodes and edges surpassing the user-defined
#' edge_weight threshold in the \link{generate_coordinated_network} function. Also,
#' metrics for nodes and edges in the fastest network are returned, but they are
#' calculated on the specified subgraph. The same applies when the 'weight_threshold'
#' option is set to "fast". In this case, metrics are calculated on the fast subgraph.
#' When the option is set to "null", the entire inputted graph without further
#' subsetting is considered.
#'
#' The node share count is performed thanks to the table resulting from the
#' \link{detect_groups} function. If the user has used the optional
#' \link{flag_speed_share} function and decides to calculate statistics on
#' the fastest graph (by setting weight_threshold = "fast"), the share count
#' is calculated considering only shares made in the fastest time window.
#' Alternatively, shares in the largest time window are considered (option
#' weight_threshold = "full" or weight_threshold = "none"). When calculating the
#' share count, all shares made by accounts are considered, regardless of
#' whether they are shares of posts shared in a coordinated fashion or not,
#' according to the edge weight threshold. In other words, this is a measure of
#' an account's activity in the time window under consideration.
#'
#' @param coord_graph an `igraph` object generated by \link{generate_coordinated_network}
#' @param result a table generated by \link{detect_groups}
#' @param weight_threshold The threshold to be used for filtering the graph
#' (options: "full", "fast", or "none").

#' @return a data.table with summary statistics for each account
#'
#' @import data.table
#' @import igraph
#' @export
#'
#'

account_stats <- function(coord_graph, result, weight_threshold = c("full", "fast", "none")) {
    weight_threshold_full <- weight_threshold_fast <- from <- n_content_id_full <-
        n_content_id_fast <- avg_time_delta_full <- avg_time_delta_fast <- edge_symmetry_score_full <-
        edge_symmetry_score_fast <- to <- n_content_id_y_full <- n_content_id_y_fast <- n_content_full <-
        n_content_fast <- account_id <- n_content_id <- avg_time_delta <- edge_symmetry_score <- n_content_id_y <-
        n_content <- shares <- NULL


    # Check if "_fast" and "_full" are in column names
    if (any(grepl("_fast", igraph::edge_attr_names(coord_graph))) &&
        any(grepl("_full", igraph::edge_attr_names(coord_graph)))) {

        # Check if "object_ids" are included and remove the relative attributes
        if (any(grepl("object_ids", igraph::edge_attr_names(coord_graph)))) {
            # List of edge attributes to be removed
            edge_attrs_to_remove <-
                c("object_ids_full", "object_ids_fast")

            # Removing edge attributes using a loop
            for (attr in edge_attrs_to_remove) {
                coord_graph <- igraph::delete_edge_attr(coord_graph, attr)
            }
        }

        x <-
            data.table::as.data.table(igraph::as_data_frame(coord_graph))

        # Filter by edge weight threshold
        if (weight_threshold == "full") {
            x <- x[weight_threshold_full == 1]
        }

        if (weight_threshold == "fast") {
            x <- x[weight_threshold_fast == 1]
        }

        # Reshape and summarize
        reshaped <- rbind(x[, .(
            account_id = from,
            avg_time_delta_full,
            avg_time_delta_fast,
            edge_symmetry_score_full,
            edge_symmetry_score_fast
        )],
        x[, .(
            account_id = to,
            avg_time_delta_full,
            avg_time_delta_fast,
            edge_symmetry_score_full,
            edge_symmetry_score_fast
        )])

        reshaped <- unique(reshaped)

        account_summary <- reshaped[, .(
            avg_time_delta_full = mean(avg_time_delta_full, na.rm = TRUE),
            avg_time_delta_fast = mean(avg_time_delta_fast, na.rm = TRUE),
            avg_edge_symmetry_score_full = mean(edge_symmetry_score_full, na.rm = TRUE),
            avg_edge_symmetry_score_fast = mean(edge_symmetry_score_fast, na.rm = TRUE)
        ), by = .(account_id)]

        account_summary <-
            account_summary[, lapply(.SD, function(x) {
                x[is.nan(x)] <- NA
                return(x)
            })]

        # get unique shares count from the result table ------------------------

        # Filter by edge weight threshold
        if (weight_threshold == "fast") {
            # Find the first column whose name contains "time_window_"
            fast_flag_column_index <- grep("time_window_", names(result))[1]

            # Get the actual column name
            fast_flag_column <- names(result)[fast_flag_column_index]

            # Filter rows where this column is equal to 1
            result <- result[result[[fast_flag_column]] == 1, ]
        }

        shares_count <- as.data.table(data.frame(
            account_id = c(result$account_id,
                      result$account_id_y),
            shares = c(result$content_id,
                       result$content_id_y)
        ))

        shares_count <-
            shares_count[, .(unique_shares_count = length(unique(shares))),
                         by = .(account_id)]

        # merge shares' count to account summary
        account_summary <-
            account_summary[shares_count, on = "account_id", nomatch = 0]

        data.table::setorder(account_summary, "account_id")

        return(account_summary)
    }

    # --------------------------------------------------------------------------
    # if the data has not been updated with the `flag_speed_share` function ----

    if (!any(grepl("_fast", igraph::edge_attr_names(coord_graph))) &&
        !any(grepl("_full", igraph::edge_attr_names(coord_graph)))) {
        {

            # If data includes object_ids, remove unused edge attributes
            if (any(grepl("object_ids", igraph::edge_attr_names(coord_graph)))) {
                coord_graph <- igraph::delete_edge_attr(coord_graph, "object_ids")
            }

            x <-
                data.table::as.data.table(igraph::as_data_frame(coord_graph))

            # Filter by edge weight threshold
            if (weight_threshold == "full") {
                x <- x[weight_threshold == 1]
            }

            if (weight_threshold == "fast") {
                warning(
                    "The data does not include the fastest shares. Metrics are calculated based on all the data.\nIf you wish to calculate metrics for a narrower time window, be sure to use the `flag_speed_share` function before creating the network."
                )
            }

            reshaped <- rbind(x[, .(
                account_id = from,
                avg_time_delta,
                edge_symmetry_score
            )],
            x[, .(
                account_id = to,
                avg_time_delta,
                edge_symmetry_score
            )])

            # Remove duplicates
            reshaped <- unique(reshaped)

            # Aggregate to find the average values for each account and content pair
            account_summary <- reshaped[, .(
                avg_time_delta = mean(avg_time_delta, na.rm = TRUE),
                avg_edge_symmetry_score = mean(edge_symmetry_score, na.rm = TRUE)
            ), by = .(account_id)]

            account_summary <-
                account_summary[, lapply(.SD, function(x) {
                    # Replace NaN with NA
                    x[is.nan(x)] <- NA
                    return(x)
                })]

            # get unique shares count from the result table --------------------
            shares_count <- data.table::as.data.table(data.frame(
                account_id = c(result$account_id,
                               result$account_id_y),
                shares = c(result$content_id,
                           result$content_id_y)
            ))

            shares_count <-
                shares_count[, .(unique_shares_count = length(unique(shares))),
                             by = .(account_id)]

            # merge shares' count to account summary
            account_summary <-
                account_summary[shares_count, on = "account_id", nomatch = 0]

            data.table::setorder(account_summary, "account_id")

            return(account_summary)
        }
    }
}

Try the CooRTweet package in your browser

Any scripts or data that you put into this service are public.

CooRTweet documentation built on April 4, 2025, 2:25 a.m.