Text corpus to a sparse matrix
library(r2vec)
library(quanteda)
data(inaugTexts)
train <- inaugTexts[1:50]
train_vectors <- textVectors(
  train,
  normalize=TRUE,        #Clean the text a little
  split_token=' ',       #Split on spaces
  verbose=FALSE,
  freqCutoff=.01,        #Remove tokens in <1% of documents. 0.01 * 50 = 0.5, so absCutoff dominates here
  absCutoff=5,           #Remove tokens in <5 documents
  spellcheck=FALSE,      #Don't spellcheck (not yet supported)
  remove_stopwords=TRUE, #Remove stopwords after tokenizing
  stem=TRUE,             #Stem after stopword removal
  ngrams=3,              #Calculate 1, 2, 3 grams
  skips=1,               #Calculate skip-1-grams
  tfidf=TRUE,            #Do tf-idf transformation after tokenization and n-grams/skip-grams
  idf=NULL,              #Compute idf from the input data
  stops=NULL,            #Use the default stopword list
  pca=TRUE,              #Do PCA after n-grams and skip-grams
  pca_comp=5,            #Use 5 PCA components
  pca_rotation=NULL      #Calculate the PCA rotation from the training data
)
train_vectors$M[1:10, 20:28]
 [1,] 0.000000 0.000000 0 0.5108256 0.0000000 0.000000 0.000000 1.714798 0.000000
 [2,] 0.000000 0.000000 0 0.5108256 0.0000000 0.000000 0.000000 0.000000 0.000000
 [3,] 1.966113 0.000000 0 0.0000000 0.4780358 0.000000 0.000000 0.000000 1.609438
 [4,] 0.000000 1.966113 0 0.0000000 0.4780358 0.000000 0.000000 0.000000 0.000000
 [5,] 0.000000 1.966113 0 2.0433025 0.4780358 0.000000 0.000000 0.000000 0.000000
 [6,] 0.000000 0.000000 0 0.0000000 0.0000000 0.000000 0.000000 0.000000 0.000000
 [7,] 0.000000 0.000000 0 0.0000000 0.0000000 0.000000 0.000000 0.000000 0.000000
 [8,] 0.000000 0.000000 0 0.5108256 0.4780358 0.000000 1.347074 0.000000 0.000000
 [9,] 0.000000 1.966113 0 1.5324769 0.0000000 0.000000 0.000000 0.000000 1.609438
[10,] 3.932226 0.000000 0 0.0000000 1.4341074 2.120264 0.000000 0.000000 0.000000
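Since the whole point of this step is a sparse representation, it is worth checking the shape and density of the matrix that comes back. A quick check, assuming M is stored as a sparse matrix from the Matrix package:

library(Matrix)
dim(train_vectors$M) #Documents x features
nnzero(train_vectors$M) / prod(dim(train_vectors$M)) #Fraction of non-zero entries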
We can also apply t-SNE after PCA on the training set.
This gives a non-linear, low-dimensional embedding of the original text data.
set.seed(1)
train_vectors <- textVectors(
  inaugTexts,
  normalize=TRUE,        #Clean the text a little
  split_token=' ',       #Split on spaces
  verbose=FALSE,
  freqCutoff=.01,        #Remove tokens in <1% of documents. 0.01 * 57 = 0.57, so absCutoff dominates here
  absCutoff=5,           #Remove tokens in <5 documents
  spellcheck=FALSE,      #Don't spellcheck (not yet supported)
  remove_stopwords=TRUE, #Remove stopwords after tokenizing
  stem=TRUE,             #Stem after stopword removal
  ngrams=3,              #Calculate 1, 2, 3 grams
  skips=1,               #Calculate skip-1-grams
  tfidf=TRUE,            #Do tf-idf transformation after tokenization and n-grams/skip-grams
  idf=NULL,              #Compute idf from the input data
  stops=NULL,            #Use the default stopword list
  pca=TRUE,              #Do PCA after n-grams and skip-grams
  pca_comp=15,           #Use 15 PCA components
  pca_rotation=NULL,     #Calculate the PCA rotation from the training data
  tsne=TRUE,             #Do t-SNE after PCA
  tsne_dims=2,           #Use 2 dimensions for t-SNE
  tsne_perplexity=5      #Use a perplexity of 5 for t-SNE
)
head(train_vectors$tsne_proj)
           TSNE1     TSNE2
[1,] -16.7452469 -13.33312
[2,]   0.0658602 -10.96081
[3,] -24.4213603 -19.81892
[4,] -20.0863325 -14.32583
[5,] -23.5062821 -12.60345
[6,] -14.1275749 -13.02524
The t-SNE embeddings can make for interesting plots, but unfortunately they cannot be applied to new data.
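The tf-idf and PCA steps, by contrast, do transfer: the idf and pca_rotation arguments exist so that values learned on training data can be passed in rather than recomputed. Below is a minimal sketch of scoring held-out documents this way, assuming the fitted object saves these components as train_vectors$idf and train_vectors$pca_rotation (element names are hypothetical; check str(train_vectors) for the real ones):

#A sketch, not run: re-use components fit on the 50-document training set
#(assumes train_vectors is the earlier 50-document fit)
test <- inaugTexts[51:57]
test_vectors <- textVectors(
  test,
  normalize=TRUE, split_token=' ', verbose=FALSE,
  freqCutoff=.01, absCutoff=5, spellcheck=FALSE,
  remove_stopwords=TRUE, stem=TRUE, ngrams=3, skips=1,
  tfidf=TRUE,
  idf=train_vectors$idf,                  #Hypothetical: idf weights saved from the training fit
  stops=NULL,
  pca=TRUE, pca_comp=5,
  pca_rotation=train_vectors$pca_rotation #Hypothetical: rotation saved from the training fit
)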
library(ggplot2)
data(inaugCorpus) #Corpus version of the same data; provides Year and President metadata
df <- data.frame(
  train_vectors$tsne_proj,
  Year = inaugCorpus$documents$Year,
  President = inaugCorpus$documents$President
)
df$Label <- paste0(df$President, ' (', substr(df$Year, 3, 4), ')')
df$Year <- as.numeric(as.character(df$Year))
p1 <- ggplot(df, aes(x=TSNE1, y=TSNE2, fill=Year, label=Label)) +
  scale_fill_gradient2(low='#d73027', mid='#ffffbf', high='#4575b4', midpoint=1900) +
  geom_point(pch=21, size=5, alpha=.80) +
  geom_point(pch=21, size=5, colour="black") +
  geom_text(size=3, vjust=1.5, alpha=.80) +
  theme_bw()
print(p1)
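To keep the figure around, ggplot2's ggsave can write it to disk (the file name here is just an example):

ggsave('tsne_inaugural.png', p1, width=8, height=6)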