--- title: "Emoji Analysis in YouTube Comments" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Emoji Analysis in YouTube Comments} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", echo = TRUE, fig.path = "figure/", fig.width = 8, fig.height = 4, out.width = "100%" ) ``` Emojis are increasingly important in social media communication. This vignette demonstrates how to analyze emoji usage patterns in YouTube comments using `tuber`'s built-in emoji functions. ## Setup ```{r setup, message=FALSE, warning=FALSE, eval=FALSE} library(tuber) library(dplyr) library(ggplot2) ``` ## Collecting Comments ```{r get-comments, eval=FALSE} yt_oauth("your_app_id", "your_app_secret") comments <- get_all_comments(video_id = "your_video_id", max_results = 500) ``` ## Basic Emoji Analysis ### Emoji Presence and Counts ```{r basic-analysis, eval=FALSE} comments <- comments |> mutate( has_emoji = has_emoji(textDisplay), emoji_count = count_emojis(textDisplay) ) summary(comments$emoji_count) emoji_rate <- mean(comments$has_emoji, na.rm = TRUE) * 100 cat("Comments with emojis:", round(emoji_rate, 1), "%\n") ``` ### Distribution of Emoji Usage ```{r distribution, eval=FALSE} comments |> filter(emoji_count > 0) |> ggplot(aes(x = emoji_count)) + geom_histogram(binwidth = 1, fill = "steelblue", color = "white") + labs( title = "Distribution of Emojis per Comment", x = "Number of Emojis", y = "Number of Comments" ) + theme_minimal() ``` ## Emoji Frequency Analysis ### Top Emojis ```{r frequency, eval=FALSE} all_emojis <- unlist(extract_emojis(comments$textDisplay)) emoji_freq <- as.data.frame(table(all_emojis), stringsAsFactors = FALSE) names(emoji_freq) <- c("emoji", "count") emoji_freq <- emoji_freq[order(-emoji_freq$count), ] head(emoji_freq, 15) emoji_freq |> head(10) |> ggplot(aes(x = reorder(emoji, count), y = count)) + geom_col(fill = "steelblue") + coord_flip() + labs( title = "Top 10 Most Used Emojis", x = "Emoji", y = "Count" ) + theme_minimal() ``` ## Temporal Analysis ### Emoji Usage Over Time ```{r temporal, eval=FALSE} comments <- comments |> mutate( date = as.Date(publishedAt), emoji_count = count_emojis(textDisplay) ) daily_emoji <- comments |> group_by(date) |> summarise( total_comments = n(), comments_with_emoji = sum(has_emoji, na.rm = TRUE), total_emojis = sum(emoji_count, na.rm = TRUE), emoji_rate = comments_with_emoji / total_comments * 100, avg_emojis = total_emojis / total_comments ) ggplot(daily_emoji, aes(x = date, y = emoji_rate)) + geom_line(color = "steelblue") + geom_smooth(method = "loess", se = TRUE, alpha = 0.2) + labs( title = "Emoji Usage Rate Over Time", x = "Date", y = "% of Comments with Emojis" ) + theme_minimal() ``` ## Sentiment Categories Emojis can indicate sentiment. Here's a simple categorization approach: ```{r sentiment, eval=FALSE} positive_emojis <- c( "\U0001F600", "\U0001F601", "\U0001F602", "\U0001F603", "\U0001F604", "\U0001F605", "\U0001F606", "\U0001F60A", "\U0001F60D", "\U0001F618", "\U0001F44D", "\U0001F44F", "\U00002764", "\U0001F389", "\U0001F38A" ) negative_emojis <- c( "\U0001F620", "\U0001F621", "\U0001F622", "\U0001F623", "\U0001F624", "\U0001F625", "\U0001F62D", "\U0001F44E", "\U0001F4A9", "\U0001F61E" ) comments <- comments |> mutate( emojis = extract_emojis(textDisplay), pos_emoji = sapply(emojis, function(e) sum(e %in% positive_emojis)), neg_emoji = sapply(emojis, function(e) sum(e %in% negative_emojis)), emoji_sentiment = case_when( pos_emoji > neg_emoji ~ "positive", neg_emoji > pos_emoji ~ "negative", pos_emoji == 0 & neg_emoji == 0 ~ "none", TRUE ~ "neutral" ) ) table(comments$emoji_sentiment) ``` ## Engagement Correlation ### Do emoji comments get more likes? ```{r engagement, eval=FALSE} engagement_summary <- comments |> group_by(has_emoji) |> summarise( n = n(), mean_likes = mean(likeCount, na.rm = TRUE), median_likes = median(likeCount, na.rm = TRUE) ) print(engagement_summary) ggplot(comments, aes(x = has_emoji, y = likeCount + 1)) + geom_boxplot(fill = "steelblue", alpha = 0.7) + scale_y_log10() + labs( title = "Like Counts: Emoji vs Non-Emoji Comments", x = "Contains Emoji", y = "Likes (log scale)" ) + theme_minimal() ``` ## Cross-Video Comparison ### Compare emoji usage across videos ```{r comparison, eval=FALSE} video_ids <- c("video_id_1", "video_id_2", "video_id_3") all_comments <- lapply(video_ids, function(vid) { comments <- get_all_comments(video_id = vid, max_results = 200) comments$video_id <- vid comments }) all_comments <- bind_rows(all_comments) video_emoji_stats <- all_comments |> mutate(emoji_count = count_emojis(textDisplay)) |> group_by(video_id) |> summarise( total_comments = n(), emoji_rate = mean(emoji_count > 0) * 100, avg_emojis = mean(emoji_count) ) print(video_emoji_stats) ``` ## Working with Clean Text For text analysis that should exclude emojis: ```{r clean-text, eval=FALSE} comments <- comments |> mutate( clean_text = remove_emojis(textDisplay), clean_text = trimws(gsub("\\s+", " ", clean_text)) ) head(comments$clean_text[comments$has_emoji], 3) ``` ## Performance Tips For large datasets: ```{r performance, eval=FALSE} comments_sample <- comments[sample(nrow(comments), min(1000, nrow(comments))), ] comments_sample <- comments_sample |> mutate(emoji_count = count_emojis(textDisplay)) emoji_rate_estimate <- mean(comments_sample$emoji_count > 0) * 100 ``` ## Summary Key functions used in this analysis: | Function | Purpose | |----------|---------| | `has_emoji()` | Check if text contains emojis | | `count_emojis()` | Count emojis in text | | `extract_emojis()` | Get list of emojis from text | | `remove_emojis()` | Strip emojis from text | | `replace_emojis()` | Replace emojis with custom text | These functions work directly on character vectors, making them easy to use with `dplyr::mutate()` and other tidyverse workflows.