R/CreateActorNetwork.twitter.R


#' @export
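#' Create an actor network from collected Twitter data
#'
#' Builds a directed igraph network in which an edge runs from user A to
#' user B whenever A retweets, mentions, or replies to B. Each edge carries
#' `edgeType`, `timeStamp`, and `tweet_id` attributes.
#'
#' @param x a data frame of collected tweets (one row per tweet), expected to
#'   contain the columns used below: `from_user`, `retweet_from`,
#'   `users_mentioned`, `reply_to`, `created_at`, `id`, and `hashtags_used`
#' @param writeToFile if TRUE (or "TRUE"/"true"/"T"), also write the network
#'   to a GraphML file in the current working directory; defaults to FALSE
#' @return an igraph graph object representing the actor network
#' @examples
#' \dontrun{
#' ## A minimal usage sketch. `myTwitterData` is a hypothetical data frame of
#' ## tweets collected with this package's Twitter collector; it is assumed
#' ## here, not defined.
#' g <- CreateActorNetwork.twitter(myTwitterData, writeToFile = FALSE)
#' }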
CreateActorNetwork.twitter <-
function(x, writeToFile)
{

  from <- retweet_from <- to <- edgeType <- timeStamp <- tweet_id <- users_mentioned <- reply_to <- NULL # to please the gods of R CMD check (these data.table column names are used via non-standard evaluation)

  if (missing(writeToFile)) {
    writeToFile <- FALSE # default: do not write to file
  }

  df <- x # work on a local copy named `df`, the name used throughout this function

  # if `df` is a list of dataframes, then need to convert these into one dataframe
  # CURRENTLY NOT IMPLEMENTED - there is no method for lists yet.
  # suppressWarnings(
  #   if (class(df)=="list") {
  #   df <- do.call("rbind", df)
  #   }
  # )
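
  # If list handling is implemented later, one possible sketch (kept commented
  # out and untested, like the block above) would use data.table::rbindlist(),
  # which is faster and more forgiving than do.call("rbind", ...):
  #
  # if (inherits(df, "list")) {
  #   df <- as.data.frame(rbindlist(df))
  # }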

  # The `hashtags_used` column in `df` causes problems when creating the actor
  # network, so drop it (by name, which is safer than a hard-coded column index):
  df$hashtags_used <- NULL

  # clear any odd characters
  # df <- removeOddChars(df)

  # convert df to data.table
  df <- data.table(df)

    # Now create the dfActorNetwork1, a dataframe of relations between users
    cat("Generating the network...\n")  ### DEBUG
    flush.console()

      # for speed we will pre-allocate `dataCombined` to a very large size (more rows than needed)
      # and after everything is finished we will delete the unused rows

      preAllocRows <- 20000000 # upper bound on the number of edges (unused rows are deleted later)
      dummyCol <- rep("NA_f00", preAllocRows) # "NA_f00" marks a not-yet-used row

      dataCombined <- data.table(
        from = dummyCol,
        to = dummyCol,
        edgeType = dummyCol,
        timeStamp = dummyCol,
        tweet_id = dummyCol
        )

      setkey(dataCombined, from) # set the key of the data table (note: assigning to the key column `from` by reference below will drop the key)

      nextEmptyRow <- 1 # so we can update rows in `dataCombined` in a relatively efficient way

      # We firstly do the retweet data
      for (i in seq_len(nrow(df))) { # seq_len() also handles the zero-row case safely

        if (is.na(df[i,retweet_from][[1]])) {next} # we check if there are retweets, if not skip to next row

          # nextEmptyRow <- dataCombined[  , .I[from_userID=="NA_f00"] ][1] # we get index of the next 'empty' row to put data into # NOT NEEDED NOW, BUT USEFUL FOR LATER

          dataCombined[nextEmptyRow, from := as.character(df$from_user[i][[1]])]
          dataCombined[nextEmptyRow, to := as.character(df$retweet_from[i][[1]])]
          dataCombined[nextEmptyRow, edgeType := as.character("Retweet")]
          dataCombined[nextEmptyRow, timeStamp := as.character(df$created_at[i][[1]])]
          dataCombined[nextEmptyRow, tweet_id := as.character(df$id[i][[1]])]

          nextEmptyRow <- nextEmptyRow + 1 # increment the row to update in `dataCombined`

      }

      # Next we do the mentions
      for (i in seq_len(nrow(df))) {

        if (length(df[i,users_mentioned][[1]]) < 1) {next} # we check if there are mentions, if not skip to next row

        for (j in seq_along(df$users_mentioned[i][[1]])){ # for each user mentioned in tweet i

          # nextEmptyRow <- dataCombined[  , .I[from_userID=="NA_f00"] ][1] # we get index of the next 'empty' row to put data into # NOT NEEDED NOW, BUT USEFUL FOR LATER

          dataCombined[nextEmptyRow, from := as.character(df$from_user[i][[1]])]
          dataCombined[nextEmptyRow, to := as.character(df$users_mentioned[i][[1]][j])]
          dataCombined[nextEmptyRow, edgeType := as.character("Mention")]
          dataCombined[nextEmptyRow, timeStamp := as.character(df$created_at[i][[1]])]
          dataCombined[nextEmptyRow, tweet_id := as.character(df$id[i][[1]])]

          nextEmptyRow <- nextEmptyRow + 1 # increment the row to update in `dataCombined`

        }

      }

      # Finally, we do the replies data
      for (i in seq_len(nrow(df))) {

        if (is.na(df[i,reply_to][[1]])) {next} # we check if there are replies, if not skip to next row

          # nextEmptyRow <- dataCombined[  , .I[from_userID=="NA_f00"] ][1] # we get index of the next 'empty' row to put data into # NOT NEEDED NOW, BUT USEFUL FOR LATER

          dataCombined[nextEmptyRow, from := as.character(df$from_user[i][[1]])]
          dataCombined[nextEmptyRow, to := as.character(df$reply_to[i][[1]])]
          dataCombined[nextEmptyRow, edgeType := as.character("Reply")]
          dataCombined[nextEmptyRow, timeStamp := as.character(df$created_at[i][[1]])]
          dataCombined[nextEmptyRow, tweet_id := as.character(df$id[i][[1]])]

          nextEmptyRow <- nextEmptyRow + 1 # increment the row to update in `dataCombined`

      }

      # we now delete all the unused rows at the end of `dataCombined`
      dataCombined <- dataCombined[edgeType != "NA_f00"] # keep only the rows that were overwritten with real edge data (i.e. changed from the dummy values)
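
      # NOTE: a vectorised alternative (a sketch only, assuming `retweet_from`
      # can be tested with is.na() directly) could build one table per edge
      # type and skip the pre-allocation loop entirely, e.g. for retweets:
      #
      # retweetEdges <- df[!is.na(retweet_from),
      #                    .(from = as.character(from_user),
      #                      to = as.character(retweet_from),
      #                      edgeType = "Retweet",
      #                      timeStamp = as.character(created_at),
      #                      tweet_id = as.character(id))]
      #
      # Mention and reply edges could be built the same way (mentions would
      # first need `users_mentioned` unlisted) and combined with rbindlist().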

    ## --------------------------------

    # make a vector of all the unique actors in the network
    actorsNames <- unique(c(as.character(dataCombined$from), as.character(dataCombined$to)))

#
# # cat(actorsNames) # DEBUG
#
#     # Retrieve all the user details (e.g. follower count, number of tweets, etc) and include as node attributes.
#       # NOTE: Given API rate limits, the below implementation supports up to 7500 users overall in dataset (150 requests * 50 users per request).
#       # NOTE: Future work needs to address the Twitter API rate limit for looking up user information (150 requests per 15 minutes).
#       # NOTE: Requesting 50 users at a time seems to avoid rate limit errors (it's a safe bet...).
#
#         # This function is supposed to perform the lookups in batches
#         # and mind the rate limit:
#         getUserObjects <- function(users) {
#           groups <- split(users, ceiling(seq_along(users)/50))
#           userObjects <- ldply(groups, function(group) { # ldply is a very cool function, found in plyr package.
#             objects <- lookupUsers(group)
#             out <- twListToDF(objects) # twListToDF is also a handy function, found in twitteR package. Converts weird class object to data frame.
#               # print("Waiting for 15 minutes (to 'refresh' the rate limit)...") # Don't need to use this yet. Implement later for number of users > 7500 (have to do chunked batches... chunks of chunks... urrghh)
#               # Sys.sleep(900)
#             return(out)
#           })
#           return(userObjects)
#         }
#
#     # Putting it into action:
#     usersInformationAttributes <- getUserObjects(actorsNames)
#     actorsInfoDF <- usersInformationAttributes
#
#     # Need to clean the user text collected here (get rid of odd characters):
#     # actorsInfoDF <- RemoveOddCharsUserInfo(actorsInfoDF) # uses the new function in v2_munge_tweets.R
#
#     # We sometimes have a PROBLEM of missing actors (no info could be retrieved for them - might be misspellings/errors/pun or joke, etc)
#     # So, identify which users are missing from original set to retrieved set,
#     # then ensure these users/connections are removed before proceeding onwards:
#
#     missingActors <- setdiff(actorsNames,usersInformationAttributes$screenName)
#       # NOTE: This is a horrible approach, need to optimise.
#     missingTemp <- NULL # store the indexes of "offending" edge connections (i.e. bad/missing actors)
#       # NOTE: Obviously the 'offending' users can only be found in the 2nd column
#       # NOTE: Ipso facto, if they are not real/actual users, then they can't be the source of a directed edge
#
#     for (i in 1:length(missingActors)) {
#       missingTemp <- c(missingTemp, which(missingActors[i] == dataCombined$to))
#     }
#
#     # REMOVE the offenders:
#       if(length(missingTemp) > 0) {
#       dataCombined <- dataCombined[-missingTemp,]
#       }
#
#     # REMOVE any duplicated usernames in the retrieved user information (NOT SURE HOW/WHY THIS WOULD OCCUR **NEED TO CHECK**):
#     # duplicatedUsers <- which(duplicated(actorsInfoDF$screenName))
#
#     # if(length(duplicatedUsers) > 0) {
#     #   actorsInfoDF <- actorsInfoDF[-duplicatedUsers,]
#     # }
#
#     actors <- data.frame(
#       name=actorsInfoDF$screenName,
#       userDescription=actorsInfoDF$description,
#       statusesCount=actorsInfoDF$statusesCount,
#       followersCount=actorsInfoDF$followersCount,
#       favoritesCount=actorsInfoDF$favoritesCount,
#       friendsCount=actorsInfoDF$friendsCount,
#       url=actorsInfoDF$url,
#       realName=actorsInfoDF$name,
#       dateAccountCreated=actorsInfoDF$created,
#       userLocation=actorsInfoDF$location,
#       userLanguage=actorsInfoDF$lang,
#       numberOfListsUserIsFeaturedOn=actorsInfoDF$listedCount,
#       profileImageUrl=actorsInfoDF$profileImageUrl
#       )
#
#     # actors <- actors[-which(duplicated(actors$name)),]
#     # actors <- unique(actors)
#
#     # make a dataframe of the relations between actors
#       # NOTE - FUTURE WORK: include edge attributes to specify the specific type of "mentions" (see previous comments on temporal network problem (see: approx. LINES 113-116)).
#       # NOTE - For example, "RETWEET" versus "TWEET TO" (@username specified beginning of tweet) versus "MENTION" (@username specified somewhere else in tweet text)
#
#     # return(df) # DEBUG

    relations <- data.frame(
      from=dataCombined$from,
      to=dataCombined$to,
      edgeType=dataCombined$edgeType,
      timeStamp=dataCombined$timeStamp,
      tweet_id=dataCombined$tweet_id)


    # convert into a graph
    # note: warnings are suppressed here to avoid this spurious warning:
    #     In if (class(newval) == "factor") { :
    #     the condition has length > 1 and only the first element will be used

  suppressWarnings(
    g <- graph.data.frame(relations, directed=TRUE, vertices=actorsNames) # used to be vertices=actors (when it collected user data)
  )
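
    # (In igraph >= 1.0 the same functions are also available under the newer
    # names graph_from_data_frame() and write_graph(); the dotted names used
    # here still work.)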

    # Make the node labels play nice with Gephi
    V(g)$label <- V(g)$name

    if (isTRUE(writeToFile) || writeToFile %in% c("TRUE", "true", "T")) {
      # Output the final network to a graphml file, to import directly into Gephi
      currTime <- format(Sys.time(), "%b_%d_%X_%Y_%Z")
      currTime <- gsub(":","_",currTime)
      write.graph(g,paste0(currTime,"_TwitterActorNetwork.graphml"),format="graphml")
      cat("Twitter actor network was written to current working directory, with filename:\n")
      cat(paste0(currTime,"_TwitterActorNetwork.graphml"))
    }

    cat("\nDone.\n") ### DEBUG
    flush.console()

    return(g)

}
