EPPS 6302 Assignment 3

  1. Read in twitterdata1.R (in Teams folder)

    a. Use your developer account to extract Joe Biden’s tweets from the last three months

    b. Analyze the tweet data:

    i. Most likes

    At the time of doing this assignment, the most likes were 6115.

    ii. Most retweets

    At the time of doing this assignment, the most retweets were 38904.

    iii. What are the hashtags in the above messages?

    [1] “#joebiden” “#ethiopia” “#tigraygenocide”

    [4] “#rrbntpc5432levelresult” “#mahsaamini” “#tigray”

    [7] “#g20indonesia” “#m7novmassacreug” “#2yrstigraygenocide”

    [10] “#کیان_پیرفلک”

  2. Collect a bag of hashtags for the following topics:

    i. Black Lives Matter

    [1] "#bnwo"                    "#nfts"                  "#blm"            
     [4] "#dailyapology"            "#blacklivesmatter"     "#blackbritishlivesmatter"
     [7] "#blacked"                 "#bbc"                  "#tiktok"                 
    [10] "#black_lives_matter" 

    ii. PLA Taiwan

    [1] “#taiwan”          “#pla”             “#china”          
    [4] “#usa”             “#beijing”         “#taipei”         
    [7] “#xijinping”       “#rocaf”           “#socialist”      
    [10] “#new_taipei_city”

    iii. COVID vaccines

    [1] "#covid-infected" "#covid"          "#longcovid"      "#covid19"       
     [5] "#masks"          "#ccp"            "#tory"           "#doctoroffice"  
     [9] "#vaccines"       "#familycare"
# Data Methods: Social media (Twitter) data
# Sample program for using rtweet, sentiment analysis
# Use vignette("auth", package = "rtweet") for authentication
# Documentation: vignette("intro", package = "rtweet")
# GitHub: https://github.com/mkearney/rtweet
# [Bob Rudis 21 Recipes for Mining Twitter Data with rtweet](https://rud.is/books/21-recipes/)

# NOTE(review): rm(list = ls()) was removed -- it is discouraged in scripts.
# It wipes the user's global objects but does NOT give a clean session
# (attached packages, options, etc. persist). Restart R for a clean state.

library(rtweet)     # Twitter API client (search_tweets, auth helpers)
library(ggplot2)    # plotting; also re-attached by tidyverse below
library(tidyverse)  # dplyr, tidyr, stringr, readr, purrr, forcats, ...
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ tibble  3.1.6      ✔ dplyr   1.0.10
✔ tidyr   1.2.0      ✔ stringr 1.4.0 
✔ readr   2.1.1      ✔ forcats 0.5.1 
✔ purrr   0.3.4      
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter()  masks stats::filter()
✖ purrr::flatten() masks rtweet::flatten()
✖ dplyr::lag()     masks stats::lag()
library(quanteda)
Warning in .recacheSubclasses(def@className, def, env): undefined subclass
"packedMatrix" of class "mMatrix"; definition not updated
Warning in .recacheSubclasses(def@className, def, env): undefined subclass
"packedMatrix" of class "replValueSp"; definition not updated
Package version: 3.2.3
Unicode version: 14.0
ICU version: 70.1
Parallel computing: 8 of 8 threads used.
See https://quanteda.io for tutorials and examples.
library(quanteda.textmodels)  # provides textmodel_lsa() used below
library(quanteda.textplots)   # plotting helpers for quanteda objects
library(readr)                # already attached via tidyverse; harmless re-attach

# Set up authentication using own Twitter account
# will save credentials to local drive as default.rds
# SECURITY: never hardcode a bearer token in source -- a committed token is
# compromised and must be revoked. Supply it outside the script instead
# (e.g. a line in ~/.Renviron: TWITTER_BEARER=<token>) and verify it here.
if (!nzchar(Sys.getenv("TWITTER_BEARER"))) {
  stop("TWITTER_BEARER is not set; add it to ~/.Renviron before running.",
       call. = FALSE)
}

# Extra fields/expansions requested from the v2 API for user lookups.
params <- list(`user.fields` = 'description',
               `expansions` = 'pinned_tweet_id')

# Uses the cached rtweet credential (default.rds) when available.
auth_setup_default()
Using default authentication available.
Reading auth from '/Users/sami_manuel/Library/Preferences/org.R-project.R/R/rtweet/default.rds'
## search for 500 tweets mentioning "JoeBiden" in English
# retryonratelimit = TRUE waits out Twitter rate limits instead of failing
jbt <- rtweet::search_tweets(q = "JoeBiden", n = 500, lang = "en", retryonratelimit = TRUE)

# i. Most likes
# na.rm = TRUE guards against missing counts (reply_count below is all NA,
# which made the unguarded max() return NA)
max(jbt$favorite_count, na.rm = TRUE)
[1] 64787
# ii. Most retweets (na.rm = TRUE: NA counts would otherwise poison max())
max(jbt$retweet_count, na.rm = TRUE)
[1] 18094
# iii. Most replies (numbering fixed: this is the third statistic)
# The transcript shows this returned NA: reply_count contains missing values,
# so max() needs na.rm = TRUE. Note: if ALL values are NA, this yields -Inf
# with a warning rather than NA.
max(jbt$reply_count, na.rm = TRUE)
[1] NA
# analysis of Joe Biden tweets: raw text -> tokens -> document-feature matrix
# (<- used for assignment per R convention, replacing =)
jbt_twt <- jbt$text          # tweet text only
jbt_toks <- tokens(jbt_twt)  # tokenize with default quanteda settings
jbttwtdfm <- dfm(jbt_toks)   # document-feature matrix

# JBT Latent Semantic Analysis (textmodel_lsa defaults to a rank-10 SVD)
jbt_sum_lsa <- textmodel_lsa(jbttwtdfm)
summary(jbt_sum_lsa)
                Length  Class     Mode   
sk                   10 -none-    numeric
docs               4820 -none-    numeric
features          21980 -none-    numeric
matrix_low_rank 1059436 -none-    numeric
data            1059436 dgCMatrix S4     
# Re-tokenize with punctuation stripped so hashtags survive as clean features
jbt_tweet_dfm <- dfm(tokens(jbt_twt, remove_punct = TRUE))
head(jbt_tweet_dfm)
Document-feature matrix of: 6 documents, 2,179 features (99.15% sparse) and 0 docvars.
       features
docs    @joebiden we're waiting for your response to execution of
  text1         1     1       1   1    1        1  2         2  1
  text2         0     0       0   1    0        0  0         0  0
  text3         1     0       0   2    0        0  0         0  0
  text4         1     0       0   0    0        0  0         0  0
  text5         1     0       0   2    0        0  0         0  0
  text6         1     0       0   0    0        0  0         0  0
       features
docs    #mohsenshekari
  text1              1
  text2              0
  text3              0
  text4              0
  text5              0
  text6              0
[ reached max_nfeat ... 2,169 more features ]
# Keep only hashtag features ("#*" glob), then report the 10 most frequent
jbt_tag_dfm <- dfm_select(jbt_tweet_dfm, pattern = "#*")
jbt_toptag <- topfeatures(jbt_tag_dfm, 50) %>% names()
head(jbt_toptag, 10)
 [1] "#7️⃣6️⃣5️⃣daysoftigraygenocide" "#joebiden"               
 [3] "#tigraygenocide"          "#766daysoftigraygenocide"
 [5] "#tigrayan"                "#brittneygriner"         
 [7] "#paulwhelan"              "#donaldtrump"            
 [9] "#viktorbout"              "#bnnus"                  
## search for 500 tweets of "PLA Taiwan" in English
# retryonratelimit = TRUE waits out Twitter rate limits instead of failing
PLA <- rtweet::search_tweets(q = "PLA Taiwan", n = 500, lang = "en", retryonratelimit = TRUE)

# analysis of tweets: text -> tokens -> document-feature matrix
# (<- used for assignment per R convention, replacing =)
PLA_twt <- PLA$text
PLA_toks <- tokens(PLA_twt)
PLAtwtdfm <- dfm(PLA_toks)

# Latent Semantic Analysis (textmodel_lsa defaults to a rank-10 SVD)
PLA_sum_lsa <- textmodel_lsa(PLAtwtdfm)
summary(PLA_sum_lsa)
                Length Class     Mode   
sk                  10 -none-    numeric
docs              5000 -none-    numeric
features         10080 -none-    numeric
matrix_low_rank 504000 -none-    numeric
data            504000 dgCMatrix S4     
# Re-tokenize with punctuation stripped so hashtags survive as clean features
PLA_tweet_dfm <- dfm(tokens(PLA_twt, remove_punct = TRUE))
head(PLA_tweet_dfm)
Document-feature matrix of: 6 documents, 992 features (97.08% sparse) and 0 docvars.
       features
docs    24 pla aircraft and 4 plan vessels around taiwan were
  text1  1   1        2   3 1    1       2      1      1    1
  text2  0   1        1   2 0    0       0      0      1    0
  text3  0   1        1   1 1    1       1      1      1    1
  text4  0   1        1   1 1    1       1      1      1    1
  text5  0   1        1   1 1    1       1      1      1    1
  text6  0   1        1   1 1    1       1      1      1    1
[ reached max_nfeat ... 982 more features ]
# Restrict features to hashtags, then take the 10 most frequent ones
PLA_tag_dfm <- dfm_select(PLA_tweet_dfm, pattern = "#*")
PLA_tag_counts <- topfeatures(PLA_tag_dfm, 50)
PLA_toptag <- names(PLA_tag_counts)
head(PLA_toptag, 10)
 [1] "#pla"                    "#taiwan"                
 [3] "#ccp"                    "#china"                 
 [5] "#taiwanstraitmedianline" "#ai"                    
 [7] "#ml"                     "#artificialintelligence"
 [9] "#machinelearning"        "#datascience"           
## search for 500 tweets of "Black Lives Matter" in English
# (comment corrected: the query is the full phrase, not the "BLM" acronym)
BLM <- rtweet::search_tweets(q = "Black Lives Matter", n = 500, lang = "en", retryonratelimit = TRUE)

# analysis of tweets: text -> tokens -> document-feature matrix
# (<- used for assignment per R convention, replacing =)
BLM_twt <- BLM$text
BLM_toks <- tokens(BLM_twt)
BLMtwtdfm <- dfm(BLM_toks)

# Latent Semantic Analysis (textmodel_lsa defaults to a rank-10 SVD)
BLM_sum_lsa <- textmodel_lsa(BLMtwtdfm)
summary(BLM_sum_lsa)
                Length  Class     Mode   
sk                   10 -none-    numeric
docs               4380 -none-    numeric
features          24130 -none-    numeric
matrix_low_rank 1056894 -none-    numeric
data            1056894 dgCMatrix S4     
# Re-tokenize with punctuation stripped so hashtags survive as clean features
BLM_tweet_dfm <- dfm(tokens(BLM_twt, remove_punct = TRUE))
head(BLM_tweet_dfm)
Document-feature matrix of: 6 documents, 2,395 features (98.95% sparse) and 0 docvars.
       features
docs    the major problem with israel is young generation of black
  text1   5     1       1    2      2  1     1          1  2     3
  text2   2     0       0    0      1  1     1          0  1     4
  text3   0     0       0    0      0  0     0          0  0     1
  text4   1     0       0    0      0  0     0          0  1     0
  text5   1     0       0    0      0  0     0          0  0     1
  text6   0     0       0    0      0  0     0          0  0     1
[ reached max_nfeat ... 2,385 more features ]
# Restrict features to hashtags, then take the 10 most frequent ones
BLM_tag_dfm <- dfm_select(BLM_tweet_dfm, pattern = "#*")
BLM_tag_counts <- topfeatures(BLM_tag_dfm, 50)
BLM_toptag <- names(BLM_tag_counts)
head(BLM_toptag, 10)
 [1] "#bnwo"                  "#blacklivesmatter"      "#blm"                  
 [4] "#brittneygriner"        "#imagainstantisemitism" "#nowplaying"           
 [7] "#dailyapology"          "#visitourhomeofhope"    "#saytheirnames"        
[10] "#babecock"             
## search for 500 tweets of "COVID" in English
# retryonratelimit = TRUE waits out Twitter rate limits instead of failing
COVID <- rtweet::search_tweets(q = "COVID", n = 500, lang = "en", retryonratelimit = TRUE)

# analysis of tweets: text -> tokens -> document-feature matrix
# (<- used for assignment per R convention, replacing =)
COVID_twt <- COVID$text
COVID_toks <- tokens(COVID_twt)
COVIDtwtdfm <- dfm(COVID_toks)

# Latent Semantic Analysis (textmodel_lsa defaults to a rank-10 SVD)
COVID_sum_lsa <- textmodel_lsa(COVIDtwtdfm)
summary(COVID_sum_lsa)
                Length  Class     Mode   
sk                   10 -none-    numeric
docs               5000 -none-    numeric
features          32880 -none-    numeric
matrix_low_rank 1644000 -none-    numeric
data            1644000 dgCMatrix S4     
# Re-tokenize with punctuation stripped so hashtags survive as clean features
COVID_tweet_dfm <- dfm(tokens(COVID_twt, remove_punct = TRUE))
head(COVID_tweet_dfm)
Document-feature matrix of: 6 documents, 3,269 features (99.31% sparse) and 0 docvars.
       features
docs    3 take for example stanford's dr jay bhattacharya @drjbhattacharya who
  text1 1    1   1       1          1  1   1            1                1   1
  text2 0    0   0       0          0  0   0            0                0   1
  text3 0    0   1       0          0  0   0            0                0   0
  text4 0    0   0       0          0  0   0            0                0   1
  text5 0    0   0       0          0  0   0            0                0   0
  text6 0    0   0       0          0  0   0            0                0   0
[ reached max_nfeat ... 3,259 more features ]
# Restrict features to hashtags, then take the 10 most frequent ones
COVID_tag_dfm <- dfm_select(COVID_tweet_dfm, pattern = "#*")
COVID_tag_counts <- topfeatures(COVID_tag_dfm, 50)
COVID_toptag <- names(COVID_tag_counts)
head(COVID_toptag, 10)
 [1] "#covid"              "#covid19"            "#diedsuddenly"      
 [4] "#039"                "#corona"             "#nationalsecurity"  
 [7] "#shippingplatform"   "#bnnus"              "#conspiracytheories"
[10] "#lo"