# emoji-analysis.R
# In tuber: Client for the YouTube API

## ----include = FALSE----------------------------------------------------------
# Document-wide knitr chunk defaults, grouped by what they affect.
knitr::opts_chunk$set(
  # Text output: show the source and fold it together with its results,
  # prefixing every output line with "#>".
  echo = TRUE,
  collapse = TRUE,
  comment = "#>",
  # Figures: where they are written and how large they are drawn/displayed.
  fig.path = "figure/",
  fig.width = 8,
  fig.height = 4,
  out.width = "100%"
)

## ----setup, message=FALSE, warning=FALSE, eval=FALSE--------------------------
# library(tuber)
# library(dplyr)
# library(ggplot2)

## ----get-comments, eval=FALSE-------------------------------------------------
# yt_oauth("your_app_id", "your_app_secret")
# 
# comments <- get_all_comments(video_id = "your_video_id", max_results = 500)

## ----basic-analysis, eval=FALSE-----------------------------------------------
# comments <- comments |>
#   mutate(
#     has_emoji = has_emoji(textDisplay),
#     emoji_count = count_emojis(textDisplay)
#   )
# 
# summary(comments$emoji_count)
# 
# emoji_rate <- mean(comments$has_emoji, na.rm = TRUE) * 100
# cat("Comments with emojis:", round(emoji_rate, 1), "%\n")

## ----distribution, eval=FALSE-------------------------------------------------
# comments |>
#   filter(emoji_count > 0) |>
#   ggplot(aes(x = emoji_count)) +
#   geom_histogram(binwidth = 1, fill = "steelblue", color = "white") +
#   labs(
#     title = "Distribution of Emojis per Comment",
#     x = "Number of Emojis",
#     y = "Number of Comments"
#   ) +
#   theme_minimal()

## ----frequency, eval=FALSE----------------------------------------------------
# all_emojis <- unlist(extract_emojis(comments$textDisplay))
# 
# emoji_freq <- as.data.frame(table(all_emojis), stringsAsFactors = FALSE)
# names(emoji_freq) <- c("emoji", "count")
# emoji_freq <- emoji_freq[order(-emoji_freq$count), ]
# 
# head(emoji_freq, 15)
# 
# emoji_freq |>
#   head(10) |>
#   ggplot(aes(x = reorder(emoji, count), y = count)) +
#   geom_col(fill = "steelblue") +
#   coord_flip() +
#   labs(
#     title = "Top 10 Most Used Emojis",
#     x = "Emoji",
#     y = "Count"
#   ) +
#   theme_minimal()

## ----temporal, eval=FALSE-----------------------------------------------------
# comments <- comments |>
#   mutate(
#     date = as.Date(publishedAt),
#     has_emoji = has_emoji(textDisplay),
#     emoji_count = count_emojis(textDisplay)
#   )
# 
# daily_emoji <- comments |>
#   group_by(date) |>
#   summarise(
#     total_comments = n(),
#     comments_with_emoji = sum(has_emoji, na.rm = TRUE),
#     total_emojis = sum(emoji_count, na.rm = TRUE),
#     emoji_rate = comments_with_emoji / total_comments * 100,
#     avg_emojis = total_emojis / total_comments
#   )
# 
# ggplot(daily_emoji, aes(x = date, y = emoji_rate)) +
#   geom_line(color = "steelblue") +
#   geom_smooth(method = "loess", se = TRUE, alpha = 0.2) +
#   labs(
#     title = "Emoji Usage Rate Over Time",
#     x = "Date",
#     y = "% of Comments with Emojis"
#   ) +
#   theme_minimal()

## ----sentiment, eval=FALSE----------------------------------------------------
# positive_emojis <- c(
#   "\U0001F600", "\U0001F601", "\U0001F602", "\U0001F603", "\U0001F604",
#   "\U0001F605", "\U0001F606", "\U0001F60A", "\U0001F60D", "\U0001F618",
#   "\U0001F44D", "\U0001F44F", "\U00002764", "\U0001F389", "\U0001F38A"
# )
# 
# negative_emojis <- c(
#   "\U0001F620", "\U0001F621", "\U0001F622", "\U0001F623", "\U0001F624",
#   "\U0001F625", "\U0001F62D", "\U0001F44E", "\U0001F4A9", "\U0001F61E"
# )
# 
# comments <- comments |>
#   mutate(
#     emojis = extract_emojis(textDisplay),
#     pos_emoji = vapply(
#       emojis, function(e) sum(e %in% positive_emojis), integer(1)
#     ),
#     neg_emoji = vapply(
#       emojis, function(e) sum(e %in% negative_emojis), integer(1)
#     ),
#     emoji_sentiment = case_when(
#       pos_emoji > neg_emoji ~ "positive",
#       neg_emoji > pos_emoji ~ "negative",
#       pos_emoji == 0 & neg_emoji == 0 ~ "none",
#       TRUE ~ "neutral"
#     )
#   )
# 
# table(comments$emoji_sentiment)

## ----engagement, eval=FALSE---------------------------------------------------
# engagement_summary <- comments |>
#   group_by(has_emoji) |>
#   summarise(
#     n = n(),
#     mean_likes = mean(likeCount, na.rm = TRUE),
#     median_likes = median(likeCount, na.rm = TRUE)
#   )
# 
# print(engagement_summary)
# 
# ggplot(comments, aes(x = has_emoji, y = likeCount + 1)) +
#   geom_boxplot(fill = "steelblue", alpha = 0.7) +
#   scale_y_log10() +
#   labs(
#     title = "Like Counts: Emoji vs Non-Emoji Comments",
#     x = "Contains Emoji",
#     y = "Likes (log scale)"
#   ) +
#   theme_minimal()

## ----comparison, eval=FALSE---------------------------------------------------
# video_ids <- c("video_id_1", "video_id_2", "video_id_3")
# 
# all_comments <- lapply(video_ids, function(vid) {
#   comments <- get_all_comments(video_id = vid, max_results = 200)
#   comments$video_id <- vid
#   comments
# })
# all_comments <- bind_rows(all_comments)
# 
# video_emoji_stats <- all_comments |>
#   mutate(emoji_count = count_emojis(textDisplay)) |>
#   group_by(video_id) |>
#   summarise(
#     total_comments = n(),
#     emoji_rate = mean(emoji_count > 0, na.rm = TRUE) * 100,
#     avg_emojis = mean(emoji_count, na.rm = TRUE)
#   )
# 
# print(video_emoji_stats)

## ----clean-text, eval=FALSE---------------------------------------------------
# comments <- comments |>
#   mutate(
#     clean_text = remove_emojis(textDisplay),
#     clean_text = trimws(gsub("\\s+", " ", clean_text))
#   )
# 
# head(comments$clean_text[comments$has_emoji], 3)

## ----performance, eval=FALSE--------------------------------------------------
# comments_sample <- comments[sample(nrow(comments), min(1000, nrow(comments))), ]
# 
# comments_sample <- comments_sample |>
#   mutate(emoji_count = count_emojis(textDisplay))
# 
# emoji_rate_estimate <- mean(comments_sample$emoji_count > 0, na.rm = TRUE) * 100