We’re going to run topic modeling on Tweets, with one twist: we’ll combine each user’s Tweets into a single document, essentially running a user-level topic model.
# Load the sample of Charlotte Tweets; keep text columns as character vectors
# (FALSE spelled out, since F is an ordinary variable that can be reassigned).
tweets <- read.csv('../datasets/CharlotteTweets20Sample.csv',
                   stringsAsFactors = FALSE)
# Project helper functions (presumably cvLDA and topicPlots used later — confirm).
source('../functions.R')
library(quanteda)
# One corpus document per Tweet, tagged with the author's ID so that Tweets
# can be pooled by user when the document-feature matrix is built.
twcorpus <- corpus(tweets$body)
docvars(twcorpus, "actor.id") <- as.character(tweets$actor.id)
# Build a user-level dfm of unigrams and bigrams: documents are grouped by
# actor.id, and English stopwords plus URL/retweet residue are dropped.
# NOTE(review): ignoredFeatures, removeTwitter and ngrams are argument names of
# the quanteda release this was run with; confirm them before upgrading.
twdfm <- dfm(
  twcorpus,
  groups = "actor.id",
  ignoredFeatures = c(stopwords("english"), "t.co", "https", "rt", "amp",
                      "http", "t.c", "can", "u"),
  removeTwitter = TRUE,
  ngrams = c(1, 2)
)
## Creating a dfm from a corpus ...
## ... grouping texts by variable: actor.id
## ... lowercasing
## ... tokenizing
## ... indexing documents: 9,943 documents
## ... indexing features: 341,460 feature types
## ... removed 142,037 features, from 182 supplied (glob) feature types
## ... created a 9943 x 199424 sparse dfm
## ... complete.
## Elapsed time: 28.23 seconds.
# Keep only features that occur in at least 40 of the user-level documents.
twdfm <- trim(twdfm, minDoc = 40)
## Removing features occurring in fewer than 40 documents: 198534
# Show the most frequent features that remain after trimming.
topfeatures(twdfm)
## charlotte nc just charlotte_nc love
## 5618 3324 2126 2124 1483
## like get day carolina time
## 1477 1365 1325 1239 1207
# install.packages("topicmodels")
library(topicmodels)
# Convert the quanteda dfm into the input format used by topicmodels.
dtm <- convert(twdfm, to = "topicmodels")
# Number of topics to estimate.
K <- 20
# Fit LDA by Gibbs sampling: 100 burn-in iterations followed by 500 more,
# reporting progress every 25 iterations, with a fixed seed for reproducibility.
lda <- LDA(
  dtm,
  k = K,
  method = "Gibbs",
  control = list(burnin = 100, iter = 500, seed = 123, verbose = 25L)
)
## K = 20; V = 890; M = 9434
## Sampling 600 iterations!
## Iteration 25 ...
## Iteration 50 ...
## Iteration 75 ...
## Iteration 100 ...
## Iteration 125 ...
## Iteration 150 ...
## Iteration 175 ...
## Iteration 200 ...
## Iteration 225 ...
## Iteration 250 ...
## Iteration 275 ...
## Iteration 300 ...
## Iteration 325 ...
## Iteration 350 ...
## Iteration 375 ...
## Iteration 400 ...
## Iteration 425 ...
## Iteration 450 ...
## Iteration 475 ...
## Iteration 500 ...
## Iteration 525 ...
## Iteration 550 ...
## Iteration 575 ...
## Iteration 600 ...
## Gibbs sampling completed!
# Ten highest-ranked terms for each topic (one column per topic).
term <- terms(lda, 10)
# Label the columns from the matrix itself rather than the global K, so the
# labels stay correct even if K is reassigned (it becomes a vector later on).
colnames(term) <- paste("Topic", seq_len(ncol(term)))
term
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## [1,] "lol" "charlotte" "drinking" "like" "love"
## [2,] "like" "gt" "america" "people" "much"
## [3,] "shit" "home" "bank" "just" "please"
## [4,] "got" "place" "stadium" "know" "im"
## [5,] "fuck" "hot" "america_stadium" "never" "guys"
## [6,] "just" "job" "photo" "really" "omg"
## [7,] "even" "know" "beer" "think" "always"
## [8,] "ass" "hours" "coffee" "feel" "follow"
## [9,] "get" "weather" "good" "say" "get"
## [10,] "damn" "monday" "nice" "ever" "girl"
## Topic 6 Topic 7 Topic 8 Topic 9
## [1,] "time" "happy" "tonight" "right"
## [2,] "new" "day" "party" "now"
## [3,] "hornets" "birthday" "live" "want"
## [4,] "arena" "thanks" "ciaa" "way"
## [5,] "vs" "great" "city" "get"
## [6,] "warner" "happy_birthday" "weekend" "go"
## [7,] "time_warner" "best" "night" "need"
## [8,] "just" "hope" "bar" "baby"
## [9,] "think" "friend" "friday" "right_now"
## [10,] "god" "rock" "music" "see"
## Topic 10 Topic 11 Topic 12 Topic 13 Topic 14
## [1,] "carolina" "new" "christmas" "today" "game"
## [2,] "charlotte" "year" "w" "us" "great"
## [3,] "north" "best" "house" "day" "good"
## [4,] "north_carolina" "school" "n" "new" "go"
## [5,] "charlotte_north" "see" "s" "now" "better"
## [6,] "just" "come" "try" "come" "got"
## [7,] "posted" "high" "week" "shop" "man"
## [8,] "just_posted" "first" "dinner" "sale" "play"
## [9,] "photo" "center" "chicken" "one" "big"
## [10,] "church" "family" "old" "special" "team"
## Topic 15 Topic 16 Topic 17 Topic 18 Topic 19
## [1,] "nc" "panthers" "get" "one" "just"
## [2,] "charlotte" "keeppounding" "like" "night" "good"
## [3,] "see" "now" "well" "last" "will"
## [4,] "looking" "super" "look" "time" "morning"
## [5,] "check" "game" "make" "another" "god"
## [6,] "concord" "bowl" "got" "go" "tomorrow"
## [7,] "huntersville" "win" "take" "fun" "first"
## [8,] "park" "super_bowl" "getting" "every" "running"
## [9,] "charlotte_nc" "football" "still" "life" "run"
## [10,] "matthews" "nfl" "hey" "amazing" "things"
## Topic 20
## [1,] "charlotte"
## [2,] "charlotte_nc"
## [3,] "nc"
## [4,] "clt"
## [5,] "airport"
## [6,] "douglas"
## [7,] "charlotte_douglas"
## [8,] "international"
## [9,] "douglas_international"
## [10,] "international_airport"
How well does the topic describe the Tweets of its “highest” user (the user with the largest probability for that topic)?
# Per-user topic probabilities: one row per actor ID (Twitter user), one
# column per topic.
postlist <- posterior(lda)
probtopics <- data.frame(postlist$topics)
#probtopics <- probtopics[,new.order]
# Label columns from the data itself rather than the global K, which is
# reassigned to a vector later in the script.
colnames(probtopics) <- paste("Topic", seq_len(ncol(probtopics)))
# Rank users by their Topic 1 probability, highest first; row[1] is the top user.
row <- order(-probtopics$`Topic 1`)
# Read the name directly instead of subsetting a whole row just to get it.
actorid <- rownames(probtopics)[row[1]]
# All Tweets written by that user.
filter.data <- subset(tweets, actor.id == actorid)
How well do that user’s Tweets match the topic? For which other topics does the actor have a high probability (> 10%)? Do the user’s Tweets contain content related to those topics?
BONUS. Experiment with different numbers of topics. Which value appears to be most appropriate?
BONUS. BONUS. Use KL.empirical from the entropy library to calculate the KL divergence between users’ topic distributions. This gives a similarity measure (e.g., for a recommender) showing which users are most similar to this user: the lower the divergence, the more similar the two users.
# install.packages("cvTools")
# library() stops with an error if cvTools is missing, whereas require() would
# only return FALSE and let the script fail later with a less clear message.
library(cvTools)
# Candidate numbers of topics to compare by cross-validation.
K <- c(5, 10, 20, 30, 40)
# Run cvLDA once per candidate and collect the results in the same order as K.
# NOTE(review): cvLDA is not defined in this section — presumably it comes from
# ../functions.R; confirm its return value there.
results <- lapply(K, function(k) {
  cat("\n\n\n##########\n ", k, "topics", "\n")
  cvLDA(k, dtm)
})
##
##
##
## ##########
## 5 topics
## 1 K = 5; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 5; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 2 K = 5; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 5; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 3 K = 5; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 5; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 4 K = 5; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 5; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 5 K = 5; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 5; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 6 K = 5; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 5; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 7 K = 5; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 5; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 8 K = 5; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 5; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 9 K = 5; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 5; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 10 K = 5; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 5; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
##
##
##
## ##########
## 10 topics
## 1 K = 10; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 10; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 2 K = 10; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 10; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 3 K = 10; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 10; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 4 K = 10; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 10; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 5 K = 10; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 10; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 6 K = 10; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 10; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 7 K = 10; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 10; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 8 K = 10; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 10; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 9 K = 10; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 10; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 10 K = 10; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 10; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
##
##
##
## ##########
## 20 topics
## 1 K = 20; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 20; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 2 K = 20; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 20; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 3 K = 20; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 20; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 4 K = 20; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 20; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 5 K = 20; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 20; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 6 K = 20; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 20; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 7 K = 20; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 20; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 8 K = 20; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 20; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 9 K = 20; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 20; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 10 K = 20; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 20; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
##
##
##
## ##########
## 30 topics
## 1 K = 30; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 30; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 2 K = 30; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 30; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 3 K = 30; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 30; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 4 K = 30; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 30; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 5 K = 30; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 30; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 6 K = 30; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 30; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 7 K = 30; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 30; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 8 K = 30; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 30; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 9 K = 30; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 30; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 10 K = 30; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 30; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
##
##
##
## ##########
## 40 topics
## 1 K = 40; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 40; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 2 K = 40; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 40; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 3 K = 40; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 40; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 4 K = 40; V = 890; M = 8490
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 40; V = 890; M = 944
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 5 K = 40; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 40; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 6 K = 40; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 40; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 7 K = 40; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 40; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 8 K = 40; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 40; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 9 K = 40; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 40; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## 10 K = 40; V = 890; M = 8491
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
## K = 40; V = 890; M = 943
## Sampling 100 iterations!
## Iteration 50 ...
## Iteration 100 ...
## Gibbs sampling completed!
# Summarise the cross-validation results for the candidate topic counts in K
# (presumably as plots). NOTE(review): topicPlots is not defined in this
# section — likely from ../functions.R; confirm.
topicPlots(results,K)