R/AddText.twitter.R

Defines functions null_to_na retweet_full_text retweet_header_regex add_tweet_text extract_nested_tweet_text AddText.twomode.twitter AddText.semantic.twitter AddText.actor.twitter AddText.activity.twitter

Documented in AddText.activity.twitter AddText.actor.twitter AddText.semantic.twitter AddText.twomode.twitter

#' @title Add columns containing text data to twitter activity network dataframes
#'
#' @param net A named list of dataframes \code{nodes} and \code{edges} generated by \code{Create}.
#' @param data A dataframe generated by \code{Collect}.
#' @param hashtags Logical. Add tweet hashtags to dataframes. Default is \code{FALSE}.
#' @param ... Additional parameters passed to function. Not used in this method.
#'
#' @return Network as a named list of two dataframes containing \code{$nodes} and \code{$edges} including columns
#'   containing text data.
#'
#' @aliases AddText.activity.twitter
#' @name AddText.activity.twitter
#' @export
AddText.activity.twitter <- function(net, data, hashtags = FALSE, ...) {
  net$nodes <- add_tweet_text(net$nodes, data$tweets, hashtags)
  class(net) <- union(class(net), c("voson.text"))
  msg("Done.\n")

  net
}

#' @title Add columns containing text data to twitter actor network dataframes
#'
#' @param net A named list of dataframes \code{nodes} and \code{edges} generated by \code{Create}.
#' @param data A dataframe generated by \code{Collect}.
#' @param hashtags Logical. Add tweet hashtags to dataframes. Default is \code{FALSE}.
#' @param ... Additional parameters passed to function. Not used in this method.
#'
#' @return Network as a named list of two dataframes containing \code{$nodes} and \code{$edges} including columns
#'   containing text data.
#'
#' @aliases AddText.actor.twitter
#' @name AddText.actor.twitter
#' @export
AddText.actor.twitter <- function(net, data, hashtags = FALSE, ...) {
  net$edges <- add_tweet_text(net$edges, data$tweets, hashtags)
  class(net) <- union(class(net), c("voson.text"))
  msg("Done.\n")

  net
}

#' @title Add columns containing text data to twitter semantic network dataframes
#'
#' @param net A named list of dataframes \code{nodes} and \code{edges} generated by \code{Create}.
#' @param data A dataframe generated by \code{Collect}.
#' @param hashtags Logical. Add tweet hashtags to dataframes. Default is \code{FALSE}.
#' @param ... Additional parameters passed to function. Not used in this method.
#'
#' @return Network as a named list of two dataframes containing \code{$nodes} and \code{$edges} including columns
#'   containing text data.
#'
#' @aliases AddText.semantic.twitter
#' @name AddText.semantic.twitter
#' @export
AddText.semantic.twitter <- function(net, data, hashtags = FALSE, ...) {
  net$edges <- add_tweet_text(net$edges, data$tweets, hashtags)
  class(net) <- union(class(net), c("voson.text"))
  msg("Done.\n")

  net
}

#' @title Add columns containing text data to twitter 2mode network dataframes
#'
#' @param net A named list of dataframes \code{nodes} and \code{edges} generated by \code{Create}.
#' @param data A dataframe generated by \code{Collect}.
#' @param hashtags Logical. Add tweet hashtags to dataframes. Default is \code{FALSE}.
#' @param ... Additional parameters passed to function. Not used in this method.
#'
#' @return Network as a named list of two dataframes containing \code{$nodes} and \code{$edges} including columns
#'   containing text data.
#'
#' @aliases AddText.twomode.twitter
#' @name AddText.twomode.twitter
#' @export
AddText.twomode.twitter <- function(net, data, hashtags = FALSE, ...) {
  net$edges <- add_tweet_text(net$edges, data$tweets, hashtags)
  class(net) <- union(class(net), c("voson.text"))
  msg("Done.\n")

  net
}

# extract text from nested tweet fields
extract_nested_tweet_text <- function(x, var, hashtags = FALSE) {
  df <- x |>
    dplyr::select(parent_status_id = .data$status_id, {{ var }}) |>
    tidyr::unnest(cols = c({{ var }}))

  if (ncol(df) == 1) return(NULL)
  
  if (!"id_str" %in% colnames(df)) return(NULL) 
  
  df <- df |>
    dplyr::filter(!is.na(.data$id_str)) |>
    dplyr::select(.data$parent_status_id, status_id = .data$id_str, .data$full_text, .data$entities) |>
    dplyr::distinct(.data$parent_status_id, .keep_all = TRUE)

  if (hashtags) {
    df <- df |>
      dplyr::mutate(hashtags = .data$entities$hashtags) |>
      tidyr::hoist(
        .col = .data$hashtags,
        hashtags = list("text"),
        .remove = FALSE
      )
  }

  df <- df |> dplyr::select(-.data$entities)

  df
}

# extract and add text to twitter network nodes or edges
add_tweet_text <- function(objs, data, hashtags = FALSE) {
  objs <-
    dplyr::left_join(
      objs,
      dplyr::select(
        data,
        .data$status_id,
        t.is_reply = .data$is_reply,
        t.is_quote = .data$is_quote,
        t.is_retweet = .data$is_retweet,
        t.full_text = .data$full_text,
        entities = .data$entities
      ) |> dplyr::distinct(.data$status_id, .keep_all = TRUE),
      by = "status_id")

  if (hashtags) {
    objs <- objs |>
      tidyr::hoist(
        .col = .data$entities,
        t.hashtags = list("hashtags", "text"),
        .remove = FALSE
      )
  }

  objs <- objs |>
    dplyr::select(-.data$entities) |>
    dplyr::mutate(
      t.full_text = data.table::fifelse(!is.na(.data$t.full_text), .data$t.full_text, NA_character_)
    )

  # separate to simplify joins
  # tweets are present in data and refs are referenced tweets derived from data
  
  # tweets with refs have either TRUE or FALSE value, not NA
  objs_tweets <- objs |>
    dplyr::filter(!is.na(.data$t.is_retweet) & !is.na(.data$t.is_quote))

  # refs have NA for both values  
  objs_refs <- objs |>
    dplyr::filter(is.na(.data$t.is_retweet) & is.na(.data$t.is_quote))

  # nested quote tweets
  qs <- extract_nested_tweet_text(data, "qs", hashtags)
  if (!is.null(qs)) {
    qs <- qs |> dplyr::rename_with(function(x) paste0("t.quoted.", x))

    objs_tweets <- objs_tweets |>
      dplyr::left_join(qs, by = c("status_id" = "t.quoted.parent_status_id"))

    objs_refs <- objs_refs |>
      dplyr::left_join(
        qs |>
          dplyr::select(-.data$t.quoted.parent_status_id) |>
          dplyr::distinct(.data$t.quoted.status_id, .keep_all = TRUE),
        by = c("status_id" = "t.quoted.status_id")
      )
  }

  # nested retweets
  rts <- extract_nested_tweet_text(data, "rts", hashtags)
  if (!is.null(rts)) {
    rts <- rts |> dplyr::rename_with(function(x) paste0("t.retweeted.", x))

    objs_tweets <- objs_tweets |>
      dplyr::left_join(rts, by = c("status_id" = "t.retweeted.parent_status_id"))

    objs_refs <- objs_refs |>
      dplyr::left_join(
        rts |>
          dplyr::select(-.data$t.retweeted.parent_status_id) |>
          dplyr::distinct(.data$t.retweeted.status_id, .keep_all = TRUE),
        by = c("status_id" = "t.retweeted.status_id")
      )
  }

  objs <- dplyr::bind_rows(objs_tweets, objs_refs)

  # set a default text
  # not really interested in quoted text as it is contained within parent tweet
  if (!"t.retweeted.full_text" %in% colnames(objs)) objs$t.retweeted.full_text <- NA_character_
  if (!"t.quoted.full_text" %in% colnames(objs)) objs$t.quoted.full_text <- NA_character_
    
  objs <- objs |> dplyr::mutate(
    vosonTxt_tweet = data.table::fcase(
      .data$t.is_retweet == TRUE, .data$t.retweeted.full_text,
      is.na(.data$t.full_text) & !is.na(.data$t.retweeted.full_text), .data$t.retweeted.full_text,
      is.na(.data$t.full_text) & !is.na(.data$t.quoted.full_text), .data$t.quoted.full_text,
      !is.na(.data$t.full_text), .data$t.full_text,
      default = NA_character_
    )
  )

  if (hashtags) {
    objs$t.hashtags <- lapply(objs$t.hashtags, null_to_na)
    
    if (!"t.retweeted.hashtags" %in% colnames(objs)) objs$t.retweeted.hashtags <- NA_character_
    if (!"t.quoted.hashtags" %in% colnames(objs)) objs$t.quoted.hashtags <- NA_character_
    objs$t.quoted.hashtags <- lapply(objs$t.quoted.hashtags, null_to_na)
    objs$t.retweeted.hashtags <- lapply(objs$t.retweeted.hashtags, null_to_na)

    objs <- objs |>
      dplyr::rowwise() |>
      dplyr::mutate(
        vosonTxt_hashtags = data.table::fcase(
          .data$t.is_retweet == TRUE, paste0(.data$t.retweeted.hashtags, collapse = ","),
          is.na(.data$t.full_text) & !is.na(.data$t.retweeted.full_text),
            paste0(.data$t.retweeted.hashtags, collapse = ","),
          is.na(.data$t.full_text) & !is.na(.data$t.quoted.full_text),
            paste0(.data$t.quoted.hashtags, collapse = ","),
          !is.na(.data$t.full_text), paste0(.data$t.hashtags, collapse = ","),
          default = NA_character_
        )
      )
  }

  objs$vosonTxt_tweet <- textutils::HTMLdecode(objs$vosonTxt_tweet)

  objs <- objs |> dplyr::relocate(.data$vosonTxt_tweet, .after = dplyr::last_col())

  if (hashtags) objs <- objs |> dplyr::relocate(.data$vosonTxt_hashtags, .after = dplyr::last_col())

  objs
}

# retweet text header
retweet_header_regex <- function() {
  "^(RT\\s@[A-Za-z0-9_]+:)?\\s"
}

# replaces truncated retweet full_text with embedded entities full_text
# extracts and prepends existing retweet header to tweet text
retweet_full_text <- function(x) {
  rts <- extract_nested_tweet_text(x, "rts", FALSE)

  if (!is.null(rts)) {
    rts <- rts |> dplyr::rename_with(function(x) paste0("t.retweeted.", x)) |>
      dplyr::select(.data$t.retweeted.parent_status_id, .data$t.retweeted.full_text)

    x <- x |>
      dplyr::left_join(rts, by = c("status_id" = "t.retweeted.parent_status_id")) |>
      dplyr::mutate(full_text = ifelse(.data$is_retweet == TRUE & !is.na(.data$t.retweeted.full_text),
                                       paste(
                                         stringr::str_match(.data$full_text, retweet_header_regex()),
                                         .data$t.retweeted.full_text
                                       ),
                                       .data$full_text)) |>
      dplyr::select(-.data$rts, -.data$t.retweeted.full_text)
  }

  x
}

# joins add nulls into dataframes
null_to_na <- function(x) {
  if (length(x) < 1) return(c(NA_character_))

  x
}
vosonlab/vosonSML documentation built on April 28, 2024, 6:26 a.m.