R/estimate_coord_interval.R

#' estimate_coord_interval
#'
#' This function estimates a threshold in seconds that defines a coordinated link share. While it is common for multiple entities (pages, groups, or accounts) to share the same link, some tend to perform these actions in an unusually short period of time. Unusual is thus defined here as a function of the median co-share time difference. More specifically, the function ranks the shares of each URL by their time difference from the first share and focuses on the behaviour of the q\% (default 0.1) of URLs with the quickest second share. The value returned is the median time in seconds these URLs took to accumulate p\% (default 0.5) of their total shares.
#'
#' @param ct_shares.df the data.frame of link posts resulting from the function get_ctshares
#' @param q parameter that controls the quantile of quickest URLs to be kept. Defaults to 0.1 [0-1]
#' @param p parameter that controls the percentage of total shares to be reached. Defaults to 0.5 [0-1]
#' @param clean_urls clean up unnecessary URL parameters and malformed URLs, and keep just the URLs included in the original data set (default FALSE)
#' @param keep_ourl_only restrict the analysis to shares of links matching the original URLs (default FALSE)
#'
#' @return a list containing two objects: summary statistics for the q\% of URLs with the quickest second share, and the coordination interval, i.e. the median time in seconds these URLs took to accumulate p\% of their total shares
#' @examples
#' cord_int <- estimate_coord_interval(ct_shares.df = ct_shares.df, q=0.1, p=0.5, clean_urls=TRUE, keep_ourl_only=FALSE)
#' cord_int[[1]]
#' cord_int[[2]]
#'
#' @importFrom dplyr group_by mutate select arrange filter n %>%
#' @importFrom stats quantile
#'
#' @export

estimate_coord_interval <- function(ct_shares.df, q=0.1, p=0.5, clean_urls=FALSE, keep_ourl_only=FALSE) {

  if(p < 0 | p > 1){
    stop("The p value must be between 0 and 1")
  }

  if(q < 0 | q > 1){
    stop("The q value must be between 0 and 1")
  }

  # initialize logfile
  if (!file.exists("log.txt")) {
    write(paste("#################### CooRnet #####################",
                "\nestimate_coord_interval script executed on:", format(Sys.time(), format = "%F %R %Z")),
          file="log.txt")
  } else {
    write(paste("\nestimate_coord_interval script executed on:", format(Sys.time(), format = "%F %R %Z")),
          file="log.txt", append = TRUE)
  }

  # keep original URLs only?
  if(keep_ourl_only==TRUE){
    ct_shares.df <- subset(ct_shares.df, ct_shares.df$is_orig==TRUE)
    if (nrow(ct_shares.df) < 2) stop("Can't execute with keep_ourl_only=TRUE. Not enough posts matching original URLs")
    write("Coordination interval estimated on shares matching original URLs", file = "log.txt", append = TRUE)
  }

  # clean urls?
  if(clean_urls==TRUE){
    ct_shares.df <- clean_urls(ct_shares.df, "expanded")
    write("Coordination interval estimated on cleaned URLs", file = "log.txt", append = TRUE)
  }

  ct_shares.df <- ct_shares.df[, c("id", "date", "expanded")]

  # get a list of all shared URLs
  URLs <- as.data.frame(table(ct_shares.df$expanded))
  names(URLs) <- c("URL", "ct_shares")
  URLs <- subset(URLs, URLs$ct_shares>1) # remove URLs shared only 1 time (can't be coordinated)
  URLs$URL <- as.character(URLs$URL)

  ct_shares.df <- subset(ct_shares.df, ct_shares.df$expanded %in% URLs$URL)

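  # for each URL, rank its shares chronologically and compute, for every
  # share, the seconds elapsed since the first share and the cumulative
  # fraction of the URL's total shares reached at that point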
  ranked_shares <- ct_shares.df %>%
    dplyr::group_by(expanded) %>%
    dplyr::mutate(ct_shares_count = dplyr::n(),
                  first_share_date = min(date),
                  rank = rank(date, ties.method = "first"),
                  sec_from_first_share = difftime(date, first_share_date, units = "secs"),
                  perc_of_shares = rank/ct_shares_count) %>%
    dplyr::select(expanded, ct_shares_count, first_share_date, rank, date, sec_from_first_share, perc_of_shares) %>%
    dplyr::arrange(expanded)

  rm(ct_shares.df)

  # find URLs with an unusually fast second share and keep the quickest ones
  rank_2 <- ranked_shares %>%
    dplyr::group_by(expanded) %>%
    dplyr::filter(rank==2) %>%
    dplyr::mutate(sec_from_first_share = min(sec_from_first_share)) %>%
    dplyr::select(expanded, sec_from_first_share) %>%
    unique()

  rank_2 <- subset(rank_2, rank_2$sec_from_first_share <= as.numeric(quantile(rank_2$sec_from_first_share, q)))

  # keep only the shares of the quickest URLs
  ranked_shares <- subset(ranked_shares, ranked_shares$expanded %in% rank_2$expanded)

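  # for each of these URLs, take the time of the first share that pushed the
  # cumulative count above p% of its total shares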
  ranked_shares_sub <- ranked_shares %>%
    dplyr::filter(perc_of_shares > p) %>%
    dplyr::mutate(sec_from_first_share = min(sec_from_first_share)) %>%
    dplyr::select(expanded, sec_from_first_share) %>%
    unique()

  summary_secs <- summary(as.numeric(ranked_shares_sub$sec_from_first_share))
  coordination_interval <- paste0(quantile(ranked_shares_sub$sec_from_first_share, 0.5), " secs")

  # get results in case the median equals 0 secs
  if(coordination_interval == "0 secs") {
    coordination_interval <- "1 secs"

    coord_interval <- list(summary_secs, coordination_interval)

    write(paste0("\n",
                  "\nq (quantile of quickest URLs to be filtered): ", q,
                  "\np (percentage of total shares to be reached): ", p,
                  "\ncoordination interval from estimate_coord_interval: ", coordination_interval,
                  "\nWarning: with the specified parameters p and q the median was 0 secs. The coordination interval has been automatically set to 1 secs"), file="log.txt", append=TRUE)

    return(coord_interval)
  } else {
    # get results in case of median > 0 secs
    coord_interval <- list(summary_secs, coordination_interval)

    write(paste0("\nq (quantile of quickest URLs to be filtered): ", q,
                 "\np (percentage of total shares to be reached): ", p,
                 "\ncoordination interval from estimate_coord_interval: ", coordination_interval),
        file="log.txt",
        append=TRUE)

  return(coord_interval)
  }
}
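
# A minimal usage sketch on synthetic data, kept as a comment so it does not
# run when the file is sourced. The toy URLs and timestamps below are made up
# for illustration; the columns (id, date, expanded) are the ones the function
# expects. Note that estimate_coord_interval writes a log.txt file to the
# working directory.
#
# base <- as.POSIXct("2024-01-01 00:00:00", tz = "UTC")
# toy_shares <- data.frame(
#   id = paste0("post_", 1:9),
#   # three URLs with three shares each; their second shares arrive after
#   # 5, 10 and 300 seconds respectively
#   date = base + c(0, 5, 60, 0, 10, 600, 0, 300, 900),
#   expanded = rep(c("https://example.com/a",
#                    "https://example.com/b",
#                    "https://example.com/c"), each = 3)
# )
# cord_int <- estimate_coord_interval(toy_shares, q = 0.5, p = 0.5)
# cord_int[[1]] # summary statistics for the filtered (quickest) URLs
# cord_int[[2]] # estimated coordination interval, e.g. "7.5 secs"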