This notebook uses supervised machine learning (CART and Random Forest) to predict Fake vs Real News media outlets on a Twitter user-level.
Our dataset includes 83 Twitter profiles: 31 real news and 52 fake news accounts.
There are 34 features from five dictionary/sources:
Moral Foundations: 11 features
Five foundations (with two levels: virtue/vice): care/harm, fairness/cheating, loyalty/betrayal, authority/subversion, sanctity/degradation.
Also includes one “general” moral foundations category.
Biased Language zip: 6 features
Bias, hedges, implicatives, factives, assertives, and reports.
Marta Recasens, Cristian Danescu-Niculescu-Mizil, and Dan Jurafsky. 2013. Linguistic Models for Analyzing and Detecting Biased Language. Proceedings of ACL 2013.
Built from Wikipedia “bias” deletions to identify “framing” and “epistemological” biases
Subjective: 8 features (strong, weak, each with positive/negative/neural)
Emotions: 6 features
Anger, Disgush, Fear, Joy, Sadness, Surprise
Volkova (2015)
Positive/Negative/Neutral: 3 features
For the 34 features, each feature is either normalized by the number of tweets (t[variable_name]
) or the number of users’ words (n[variable_name]
). This yields 70 total features.
library(tidyverse)
tweets <- read_csv("./data/moral_foundations.csv")
Recall how many tweets by category.
tweets %>%
group_by(LABEL) %>%
summarise(Count=n())
## # A tibble: 5 x 2
## LABEL Count
## <chr> <int>
## 1 clickbait 5573
## 2 hoax 476
## 3 propaganda 23735
## 4 realnews 46425
## 5 satire 227
Let’s group tweets by real vs fake.
tweets$type <- 0
tweets$type[tweets$LABEL != "realnews"] <- 1
table(tweets$type)
##
## 0 1
## 46425 30011
tweets$word.count <- quanteda::ntoken(tweets$modified_tweets)
user <- tweets %>% group_by(LABEL, screen_name, type) %>%
summarise(count = n(),
words = sum(word.count),
HarmVirtue = sum(HarmVirtue),
HarmVice = sum(HarmVice),
FairnessVirtue = sum(FairnessVirtue),
FairnessVice = sum(FairnessVice),
IngroupVirtue = sum(IngroupVirtue),
IngroupVice = sum(IngroupVice),
AuthorityVirtue = sum(AuthorityVirtue),
AuthorityVice = sum(AuthorityVice),
PurityVirtue = sum(PurityVirtue),
PurityVice = sum(PurityVice),
MoralityGeneral = sum(MoralityGeneral)) %>%
mutate(nHarmVirtue = HarmVirtue / words,
nHarmVice = HarmVice / words,
nFairnessVirtue = FairnessVirtue / words,
nFairnessVice = FairnessVice / words,
nIngroupVirtue = IngroupVirtue / words,
nIngroupVice = IngroupVice / words,
nAuthorityVirtue = AuthorityVirtue / words,
nAuthorityVice = AuthorityVice / words,
nPurityVirtue = PurityVirtue / words,
nPurityVice = PurityVice / words,
nMoralityGeneral = MoralityGeneral / words,
tHarmVirtue = HarmVirtue / count,
tHarmVice = HarmVice / count,
tFairnessVirtue = FairnessVirtue / count,
tFairnessVice = FairnessVice / count,
tIngroupVirtue = IngroupVirtue / count,
tIngroupVice = IngroupVice / count,
tAuthorityVirtue = AuthorityVirtue / count,
tAuthorityVice = AuthorityVice / count,
tPurityVirtue = PurityVirtue / count,
tPurityVice = PurityVice / count,
tMoralityGeneral = MoralityGeneral / count)
ggplot(user, aes(x = count, fill = as.factor(type))) +
geom_density(adjust = 0.8, alpha=0.3) +
xlab("Tweets") +
ylab("Density") +
scale_fill_discrete(name = "Account Type")
Let’s include the sentiment scores…
sentiment <- read_csv("./data/sentiment-scores.csv")
user <- merge(user, sentiment, by = "screen_name")
user <- user %>%
mutate(nAnger = anger / words,
nDisgust = disgust / words,
nFear = fear / words,
nJoy = joy / words,
nSadness = sadness / words,
nSurprise = surprise / words,
nPolarity = `polarity values` / words,
nNegative = negative / words,
nNeutral = neutral / words,
nPositive = positive / words,
tAnger = anger / count,
tDisgust = disgust / count,
tFear = fear / count,
tJoy = joy / count,
tSadness = sadness / count,
tSurprise = surprise / count,
tPolarity = `polarity values` / count,
tNegative = negative / count,
tNeutral = neutral / count,
tPositive = positive / count)
Bias scores…
bias <- read_csv("./data/bias.csv")
user <- merge(user, bias, by = "screen_name")
user <- user %>%
mutate(nBias = bias / words,
tBias = bias / count,
nAssertives = assertive_score / words,
tAssertives = assertive_score / count,
nFactives = factives_score / words,
tFactives = factives_score / count,
nHedges = hedges_score / words,
tHedges = hedges_score / count,
nImplicatives = implicatives_score / words,
tImplicatives = implicatives_score / count,
nReport = report_score / words,
tReport = report_score / count)
and subjectivity scores…
subjective <- read_csv("./data/subjective_aggregation.csv")
user <- merge(user, subjective, by = "screen_name")
user <- user %>%
mutate(nStrongPositive = strong_positive / words,
tStrongPositive = strong_positive / count,
nStrongNegative = strong_negative / words,
tStrongNegative = strong_negative / count,
nStrongNeutral = strong_neutral / words,
tStrongNeutral = strong_neutral / count,
nStrongSubjective = (strong_neutral + strong_positive + strong_negative) / words,
tStrongSubjective = (strong_neutral + strong_positive + strong_negative) / count,
nWeakPositive = weak_positive / words,
tWeakPositive = weak_positive / count,
nWeakNegative = weak_negative / words,
tWeakNegative = weak_negative / count,
nWeakNeutral = weak_neutral / words,
tWeakNeutral = weak_neutral / count,
nWeakSubjective = (weak_neutral + weak_positive + weak_negative) / words,
tWeakSubjective = (weak_neutral + weak_positive + weak_negative) / count)
First, create the label.
# Real News = 0
user$y <- 0
# Fake News = 1
user$y[user$LABEL != "realnews"] <- 1
user$yLabel <- ifelse(user$y==1,"Fake News","Real News")
table(user$LABEL, user$yLabel)
##
## Fake News Real News
## clickbait 18 0
## hoax 2 0
## propaganda 30 0
## realnews 0 31
## satire 2 0
dataset <- user[,c(17:38,49:68,75:86,93:110)]
set.seed(123) # need to use for replication
inTrain = caret::createDataPartition(dataset$y, p = 0.7, list = FALSE)
dfTrain=dataset[inTrain,]
dfTest=dataset[-inTrain,]
First, plot the normalized by tweets variables…
# choose only tweet normalized terms
t <- colnames(dfTrain)[grep("^t",colnames(dfTrain))]
corr <- cor(dfTrain[,c(t,"y")]) # exclude predictor
corrplot::corrplot(corr, tl.cex = 0.6)
Show variable correlation plot.
p <- cor(dataset[,t],dataset$y)
p <- data.frame(corr = p, row.names = row.names(p))
p <- p %>% arrange(corr)
barplot(p$corr, horiz = TRUE, las = 1, main = "Variable Correlation")
Positive: Negative, Fear, and Polarity (per words)
Negative: Bias (words and tweets), and the Fairness Virtue and InGroup Virtue (both per words and tweets)
Let’s briefly explore the top factors.
First, let’s consider the Bias dictionary.
ggplot(dfTrain, aes(x = nBias, fill = as.factor(yLabel))) +
geom_density(adjust = 0.8, alpha=0.3) +
xlab("Percent of User's Words in Bias Dictionary") +
ylab("Density") +
scale_fill_discrete(name = "Account Type")
and on a per tweet bias…
ggplot(dfTrain, aes(x = tBias, fill = as.factor(yLabel))) +
geom_density(adjust = 0.8, alpha=0.3) +
xlab("Avg Bias Lexicon Words per Tweet") +
ylab("Density") +
scale_fill_discrete(name = "Account Type")
Fairness Virtue as a percent of words…
ggplot(dfTrain, aes(x = nFairnessVirtue, fill = as.factor(yLabel))) +
geom_density(adjust = 0.8, alpha=0.3) +
xlab("Percent of User's Words in Fairness (Virtue) Dictionary") +
ylab("Density") +
scale_fill_discrete(name = "Account Type")
or the Fear per tweet level…
ggplot(dfTrain, aes(x = nFear, fill = as.factor(yLabel))) +
geom_density(adjust = 0.8, alpha=0.3) +
xlab("Percent of User's Words in Fear Dictionary") +
ylab("Density") +
scale_fill_discrete(name = "Account Type")
or Joy (per tweet)…
ggplot(dfTrain, aes(x = tJoy, fill = as.factor(yLabel))) +
geom_density(adjust = 0.8, alpha=0.3) +
xlab("Percent of User's Words in Joy Dictionary") +
ylab("Density") +
scale_fill_discrete(name = "Account Type")
#install.packages("rpart")
library(rpart); library(rpart.plot); library(caret)
First, let’s use 5-fold CV to tune the model’s cp parameter.
tc <- trainControl("cv",5)
rpart.grid <- expand.grid(.cp=c(0.01,0.02,0.05,0.1,0.2))
(train.rpart <- train(as.factor(y) ~., data=dfTrain[,-72], method="rpart", trControl=tc , tuneGrid=rpart.grid))
## CART
##
## 59 samples
## 70 predictors
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 47, 47, 47, 47, 48
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.01 0.8454545 0.6819720
## 0.02 0.8454545 0.6819720
## 0.05 0.8454545 0.6819720
## 0.10 0.8787879 0.7546993
## 0.20 0.8787879 0.7546993
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.2.
Let’s run the model and plot the results
fit <- rpart(as.factor(y) ~ ., data=dfTrain[,-72], method = "class", control = rpart.control(cp = train.rpart$bestTune$cp))
rpart.plot(fit)
We can show variable importance…
vi <- fit$variable.importance
par(mar=c(3,10,5,0))
barplot(vi[order(vi)], main = "Variable Importance", horiz = TRUE, las=1, offset = 1)
Let’s run accuracy, precision, and recall.
yTrain <- predict(fit, type = "class")
table(yTrain, dfTrain$y)
##
## yTrain 0 1
## 0 24 4
## 1 0 31
tab <- table(yTrain, dfTrain$y)
print(paste0("Accuracy is ",(tab[1,1]+tab[2,2])/sum(tab)))
## [1] "Accuracy is 0.932203389830508"
print(paste0("Precision is ",(tab[2,2])/sum(tab[2,])), digits = 3)
## [1] "Precision is 1"
print(paste0("Recall is ",(tab[2,2])/sum(tab[,2])), digits = 3)
## [1] "Recall is 0.885714285714286"
What were the incorrect users?
names <- user[inTrain,c("screen_name","LABEL")]
names[(yTrain != dfTrain$y),]
## screen_name LABEL
## 43 NewsBiscuit hoax
## 53 RealNewsTarget propaganda
## 57 rinf_community propaganda
## 73 TwitchyTeam clickbait
Let’s predict for the holdout.
yTest <- predict(fit, newdata = dfTest, type = "class")
table(yTest, dfTest$y)
##
## yTest 0 1
## 0 7 4
## 1 0 13
tab <- table(yTest, dfTest$y)
print(paste0("Accuracy is ",(tab[1,1]+tab[2,2])/sum(tab)))
## [1] "Accuracy is 0.833333333333333"
print(paste0("Precision is ",(tab[2,2])/sum(tab[2,])), digits = 3)
## [1] "Precision is 1"
print(paste0("Recall is ",(tab[2,2])/sum(tab[,2])), digits = 3)
## [1] "Recall is 0.764705882352941"
What were the incorrect predicted for the out-of-sample?
names <- user[-inTrain,c("screen_name","LABEL")]
names[(yTest != dfTest$y),]
## screen_name LABEL
## 12 BIZPACReview propaganda
## 27 HealthRanger propaganda
## 31 intellihubnews clickbait
## 54 realtruthkings propaganda
library(randomForest)
fit <- randomForest(as.factor(y) ~ ., data=dfTrain[,-72], ntree = 1000, importance=TRUE)
print(fit) # view results
##
## Call:
## randomForest(formula = as.factor(y) ~ ., data = dfTrain[, -72], ntree = 1000, importance = TRUE)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 8
##
## OOB estimate of error rate: 11.86%
## Confusion matrix:
## 0 1 class.error
## 0 21 3 0.1250000
## 1 4 31 0.1142857
Mis-classification rates per trees.
plot(fit, main = "Misclassification Rates")
Variable Importance
VarImportance <- varImpPlot(fit, main = "Variable Importance", n.var = 20)
Predict out-of-sample
yTest <- predict(fit, newdata = dfTest, type = "class")
table(yTest, dfTest$y)
##
## yTest 0 1
## 0 7 3
## 1 0 14
tab <- table(yTest, dfTest$y)
print(paste0("Accuracy is ",(tab[1,1]+tab[2,2])/sum(tab)))
## [1] "Accuracy is 0.875"
print(paste0("Precision is ",(tab[2,2])/sum(tab[2,])), digits = 3)
## [1] "Precision is 1"
print(paste0("Recall is ",(tab[2,2])/sum(tab[,2])), digits = 3)
## [1] "Recall is 0.823529411764706"
names <- user[-inTrain,c("screen_name","LABEL")]
names[(yTest != dfTest$y),]
## screen_name LABEL
## 12 BIZPACReview propaganda
## 31 intellihubnews clickbait
## 54 realtruthkings propaganda
As we found in the exploratory analysis, many of the Biased Language dimensions are highly correlated. To address this, we removed the report, hedges, and assertives dicionaries as much of the information is self-contained in other Biased Language dimensions
# remove Report, Hedges and Assertives
excludes <- c(-45,-46,-49,-50,-53,-54,-72)
set.seed(1234)
fit <- randomForest(as.factor(y) ~ .,
data=dfTrain[,c(-45,-46,-49,-50,-53,-54,-72)],
ntree = 1000,
importance=TRUE)
VarImportance <- varImpPlot(fit, main = "Variable Importance", n.var = 20)
yTest <- predict(fit, newdata = dfTest[,excludes], type = "class")
table(yTest, dfTest$y)
##
## yTest 0 1
## 0 7 0
## 1 0 17
codeDict <- function(var){
case_when(
var %in% c("tBias","nBias","tImplicatives","nImplicatives","nFactives","tFactives") ~ "Bias Language",
var %in% c("nFairnessVirture","tFairnessVirtue","tIngroupVice","nIngroupVirtue","tAuthorityVice","tIngroupVirtue", "nFairnessVirtue") ~ "Moral Foundations",
var %in% c("tWeakPositive","tStrongPositive","nWeakPositive","tWeakNeutral","nStrongPositive","nWeakNeutral") ~ "Subjectivity",
var %in% c("tJoy","nAnger","nFear","nJoy") ~ "Emotions",
var %in% c("nNegative","tPositive","nPositive","tNegative") ~ "Sentiment"
)
}
VarImportance <- as.tibble(VarImportance) %>%
mutate(fieldName = row.names(VarImportance),
group = codeDict(fieldName)) %>%
arrange(desc(MeanDecreaseAccuracy))
selected <- c("tBias","nFairnessVirtue","tWeakPositive","tStrongPositive","tWeakNeutral","nFear","tLoyaltyVirtue","nAnger","nNegative")
VarImportance$fieldName[VarImportance$fieldName == "tIngroupVirtue"] <- "tLoyaltyVirtue"
VarImportance$fieldName[VarImportance$fieldName == "tIngroupVice"] <- "tLoyaltyVice"
filter(VarImportance, MeanDecreaseAccuracy > 4.5) %>%
ggplot(aes(x = forcats::fct_reorder(fieldName, MeanDecreaseAccuracy, .desc = FALSE),
y = MeanDecreaseAccuracy,
fill = group,
color = ifelse(!(fieldName %in% selected), "Not Selected", "Selected"),
width=.75)) +
geom_col() +
coord_flip() +
labs(y = "Mean Decrease in Accuracy after Removing Feature",
x = "Language Feature",
fill = "Language Group") +
scale_fill_hue(l=80, c=50) +
theme(legend.position = c(0.8,0.3)) +
scale_color_manual(values = c('grey','black'), guide = FALSE)
After removing the redundant features, we find a better out-of-sample performance (now 100%).
Let’s normalize the six most predictive factors.
normalize <- function(x){
#https://stats.stackexchange.com/questions/70801/how-to-normalize-data-to-0-1-range
norm.value <- (x - min(x)) / (max(x) - min(x))
return(norm.value)
}
norm.df <- data.frame(screen_name = user$screen_name,
label = user$yLabel,
Bias = normalize(dataset$tBias),
Fairness = normalize(dataset$nFairnessVirtue),
Loyalty = normalize(dataset$tIngroupVirtue),
WeakSubjective = normalize(dataset$tWeakSubjective),
StrongSubjective = normalize(dataset$tStrongSubjective),
Positive = normalize(dataset$nPositive),
Negative = normalize(dataset$nNegative),
Fear = normalize(dataset$nFear),
Anger = normalize(dataset$nAnger)
)
These values will be used in the interface.
Next, we want to explore the distributions (density plots) of the dimensions separated by fake and real news accounts.
p1 <- ggplot(norm.df, aes(x = Bias, fill = as.factor(label))) +
geom_density(adjust = 0.8, alpha=0.3) +
xlab("Normalized Bias") +
ylab("Density") +
scale_fill_discrete(name = "Account Type") +
theme(legend.position="none")
p2 <- ggplot(norm.df, aes(x = Fairness, fill = as.factor(label))) +
geom_density(adjust = 0.8, alpha=0.3) +
xlab("Normalized Fairness") +
ylab("Density") +
scale_fill_discrete(name = "Account Type") +
theme(legend.position="none")
p3 <- ggplot(norm.df, aes(x = Loyalty, fill = as.factor(label))) +
geom_density(adjust = 0.8, alpha=0.3) +
xlab("Normalized Loyalty") +
ylab("Density") +
scale_fill_discrete(name = "Account Type") +
theme(legend.position="none")
p4 <- ggplot(norm.df, aes(x = WeakSubjective, fill = as.factor(label))) +
geom_density(adjust = 0.8, alpha=0.3) +
xlab("Normalized Weak Subjective") +
ylab("Density") +
scale_fill_discrete(name = "Account Type") +
theme(legend.position="none")
p5 <- ggplot(norm.df, aes(x = StrongSubjective, fill = as.factor(label))) +
geom_density(adjust = 0.8, alpha=0.3) +
xlab("Normalized Strong Subjective") +
ylab("Density") +
scale_fill_discrete(name = "Account Type") +
theme(legend.position="none")
p6 <- ggplot(norm.df, aes(x = Positive, fill = as.factor(label))) +
geom_density(adjust = 0.8, alpha=0.3) +
xlab("Normalized Positive") +
ylab("Density") +
scale_fill_discrete(name = "Account Type") +
theme(legend.position=c(0.7,0.75))
p7 <- ggplot(norm.df, aes(x = Negative, fill = as.factor(label))) +
geom_density(adjust = 0.8, alpha=0.3) +
xlab("Normalized Negative") +
ylab("Density") +
scale_fill_discrete(name = "Account Type") +
theme(legend.position="none")
p8 <- ggplot(norm.df, aes(x = Fear, fill = as.factor(label))) +
geom_density(adjust = 0.8, alpha=0.3) +
xlab("Normalized Fear") +
ylab("Density") +
scale_fill_discrete(name = "Account Type") +
theme(legend.position="none")
p9 <- ggplot(norm.df, aes(x = Anger, fill = as.factor(label))) +
geom_density(adjust = 0.8, alpha=0.3) +
xlab("Normalized Anger") +
ylab("Density") +
scale_fill_discrete(name = "Account Type") +
theme(legend.position="none")
multiplot(p1, p4, p7, p2, p5, p8, p3, p6, p9, cols=3)
Alternatively, we can select only two dimensions (e.g., Bias and Fairness), and see that these two features can linearly separate the data by Real (blue) and Fake (red) accounts.
p <- ggplot(norm.df, aes(x = Bias, y = Fairness, color = label, text = screen_name)) +
geom_point() +
theme(legend.position="none")
plotly::ggplotly(p)
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] randomForest_4.6-12 caret_6.0-76 lattice_0.20-35
## [4] rpart.plot_2.1.2 rpart_4.1-11 bindrcpp_0.2
## [7] forcats_0.2.0 stringr_1.3.0 dplyr_0.7.4
## [10] purrr_0.2.4 readr_1.1.1 tidyr_0.7.2
## [13] tibble_1.4.2 ggplot2_2.2.1.9000 tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-131 pbkrtest_0.4-7 lubridate_1.7.1
## [4] httr_1.3.1 rprojroot_1.3-2 tools_3.4.3
## [7] backports_1.1.2 utf8_1.1.3 R6_2.2.2
## [10] lazyeval_0.2.1 mgcv_1.8-22 colorspace_1.3-2
## [13] nnet_7.3-12 withr_2.1.1.9000 mnormt_1.5-5
## [16] compiler_3.4.3 cli_1.0.0 rvest_0.3.2
## [19] quantreg_5.33 SparseM_1.77 xml2_1.1.1
## [22] network_1.13.0 plotly_4.7.1 labeling_0.3
## [25] scales_0.5.0.9000 psych_1.7.5 digest_0.6.15
## [28] foreign_0.8-69 minqa_1.2.4 rmarkdown_1.9.8
## [31] pkgconfig_2.0.1 htmltools_0.3.6 lme4_1.1-13
## [34] htmlwidgets_0.9 rlang_0.2.0.9000 readxl_1.0.0
## [37] rstudioapi_0.7 shiny_1.0.5 bindr_0.1
## [40] jsonlite_1.5 crosstalk_1.0.0 ModelMetrics_1.1.0
## [43] car_2.1-4 magrittr_1.5 Matrix_1.2-12
## [46] Rcpp_0.12.15 munsell_0.4.3 stringi_1.1.7
## [49] yaml_2.1.18 MASS_7.3-47 plyr_1.8.4
## [52] parallel_3.4.3 ggrepel_0.7.0 crayon_1.3.4
## [55] haven_1.1.1 splines_3.4.3 hms_0.3
## [58] knitr_1.20 pillar_1.1.0 reshape2_1.4.3
## [61] codetools_0.2-15 stopwords_0.9.0 stats4_3.4.3
## [64] fastmatch_1.1-0 glue_1.2.0 evaluate_0.10.1
## [67] data.table_1.10.4-3 RcppParallel_4.3.20 modelr_0.1.1
## [70] nloptr_1.0.4 httpuv_1.3.5 foreach_1.4.4
## [73] quanteda_1.0.0 MatrixModels_0.4-1 cellranger_1.1.0
## [76] gtable_0.2.0 assertthat_0.2.0 mime_0.5
## [79] xtable_1.8-2 broom_0.4.2 e1071_1.6-8
## [82] class_7.3-14 viridisLite_0.3.0 iterators_1.0.9
## [85] spacyr_0.9.3 corrplot_0.77