Text corpus to a sparse matrix

library(r2vec)
library(quanteda)
data(inaugTexts)
train <- inaugTexts[1:50]
train_vectors <- textVectors(
  train,
  normalize=TRUE, #Clean the text a little
  split_token=' ', #Split on spaces
  verbose=FALSE,
  freqCutoff=.01, #Remove tokens in <1% of documents.  0.01 * 50 = .50
  absCutoff=5, #Remove tokens in <5 documents
  spellcheck=FALSE, #Don't spellcheck (not yet supported)
  remove_stopwords=TRUE, #Remove stopwords after tokenizing
  stem=TRUE, #Stem after stopword removal
  ngrams=3, #Calculate 1, 2, 3 grams
  skips=1, #Calculate skip-1-grams
  tfidf=TRUE, #Do tfidf transformation after tokenization and n-grams/skip-grams
  idf=NULL, #Compute idf based on input data
  stops=NULL, #Use the default stopwords
  pca=TRUE, #Do PCA after n-grams and skip-grams
  pca_comp=5, #Use 5 PCA components
  pca_rotation=NULL #Calculate pca rotation based on training data
)
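
A quick, illustrative sanity check before inspecting individual columns is the overall shape of the document-term matrix (output omitted here):

dim(train_vectors$M)  #Number of documents x number of retained tokens and n-grams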
train_vectors$M[1:10, 20:28]
  acquir acquisit across       act    action action will    activ   actual       ad
0.000000 0.000000      0 0.5108256 0.0000000    0.000000 0.000000 1.714798 0.000000
0.000000 0.000000      0 0.5108256 0.0000000    0.000000 0.000000 0.000000 0.000000
1.966113 0.000000      0 0.0000000 0.4780358    0.000000 0.000000 0.000000 1.609438
0.000000 1.966113      0 0.0000000 0.4780358    0.000000 0.000000 0.000000 0.000000
0.000000 1.966113      0 2.0433025 0.4780358    0.000000 0.000000 0.000000 0.000000
0.000000 0.000000      0 0.0000000 0.0000000    0.000000 0.000000 0.000000 0.000000
0.000000 0.000000      0 0.0000000 0.0000000    0.000000 0.000000 0.000000 0.000000
0.000000 0.000000      0 0.5108256 0.4780358    0.000000 1.347074 0.000000 0.000000
0.000000 1.966113      0 1.5324769 0.0000000    0.000000 0.000000 0.000000 1.609438
3.932226 0.000000      0 0.0000000 1.4341074    2.120264 0.000000 0.000000 0.000000
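
The weights above appear to be raw term counts scaled by a natural-log inverse document frequency. As a rough check, if 'act' occurs in about 30 of the 50 training documents (a figure reverse-engineered from the values shown, not taken from the package), a single occurrence would score roughly 0.51:

log(50 / 30)      #~0.5108, one occurrence of 'act' in a document
3 * log(50 / 30)  #~1.5325, three occurrences of 'act'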

The result also includes a dense representation via PCA

head(train_vectors$x)
      PC1       PC2       PC3        PC4        PC5
11.272757 2.4875119 1.6797556  1.7921198  1.0524387
 1.104637 0.4349945 0.1366512 -0.0588726 -0.0394039
19.736642 3.5704838 2.3579615  3.2631285  5.7484499
13.923017 0.8498908 5.8851600  2.9661084 -2.0003441
19.054935 1.0983766 1.8710726  5.9759279 -0.3723459
 9.690705 0.6518111 1.1124987  3.2261019  1.5524241
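
Because the components are plain numeric columns, they drop straight into downstream models. A purely illustrative example, clustering the training speeches on their five components with base R's kmeans:

set.seed(42)
clusters <- kmeans(train_vectors$x, centers=3)$cluster  #k-means on the dense PCA features
table(clusters)  #How many speeches land in each cluster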

We can apply the same transformation pipeline to a test set

test <- inaugTexts[51:57]
test_vectors <- predict(train_vectors, test)

This returns both a sparse representation:

as.matrix(test_vectors$M[1:7, 20:28])
acquir acquisit   across       act    action action will    activ actual ad
     0        0 0.000000 1.5324769 0.9560716           0 1.347074      0  0
     0        0 6.859194 1.0216512 0.0000000           0 0.000000      0  0
     0        0 3.429597 0.0000000 0.0000000           0 0.000000      0  0
     0        0 1.714798 1.0216512 0.0000000           0 0.000000      0  0
     0        0 1.714798 2.5541281 0.0000000           0 0.000000      0  0
     0        0 5.144395 0.5108256 0.4780358           0 0.000000      0  0
     0        0 0.000000 3.0649537 0.4780358           0 0.000000      0  0

and a dense representation of the test data:

head(test_vectors$x)
     PC1        PC2      PC3       PC4       PC5
15.85477  -8.539709 15.42695 -7.024635 -3.698453
11.60606  -9.289903 13.31814 -5.355161 -4.353974
15.75377 -11.502222 16.76248 -6.057926 -6.774927
10.92370  -6.888311 10.09985 -3.322565 -1.047456
16.02856 -10.407744 14.26081 -5.019637 -3.139051
16.16879  -9.995669 13.23644 -4.085293 -3.446497
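
The point of predict is that the test documents are scored with the vocabulary, idf weights, and PCA rotation learned on the training data, so the columns should line up exactly with the training matrices. A quick check under that assumption:

stopifnot(identical(colnames(test_vectors$M), colnames(train_vectors$M)))  #Same token vocabulary
stopifnot(ncol(test_vectors$x) == ncol(train_vectors$x))  #Same number of PCA components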

We can also apply t-SNE after PCA on the training set.

This gives a non-linear, low-dimensional embedding of the original text data.

set.seed(1)
train_vectors <- textVectors(
  inaugTexts,
  normalize=TRUE, #Clean the text a little
  split_token=' ', #Split on spaces
  verbose=FALSE,
  freqCutoff=.01, #Remove tokens in <1% of documents.  0.01 * 57 = .57
  absCutoff=5, #Remove tokens in <5 documents
  spellcheck=FALSE, #Don't spellcheck (not yet supported)
  remove_stopwords=TRUE, #Remove stopwords after tokenizing
  stem=TRUE, #Stem after stopword removal
  ngrams=3, #Calculate 1, 2, 3 grams
  skips=1, #Calculate skip-1-grams
  tfidf=TRUE, #Do tfidf transformation after tokenization and n-grams/skip-grams
  idf=NULL, #Compute idf based on input data
  stops=NULL, #Use the default stopwords
  pca=TRUE, #Do PCA after n-grams and skip-grams
  pca_comp=15, #Use 15 PCA components
  pca_rotation=NULL, #Calculate pca rotation based on training data
  tsne=TRUE, #Do t-SNE after PCA
  tsne_dims=2, #Use 2 dimensions for tsne
  tsne_perplexity=5 #Use perplexity of 5 for tsne
  )
head(train_vectors$tsne_proj)
      TSNE1     TSNE2
-16.7452469 -13.33312
  0.0658602 -10.96081
-24.4213603 -19.81892
-20.0863325 -14.32583
-23.5062821 -12.60345
-14.1275749 -13.02524
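
One simple, illustrative use of the 2-dimensional embedding is to ask which speeches land near one another (rows follow the order of inaugTexts):

d <- as.matrix(dist(train_vectors$tsne_proj))  #Pairwise distances in t-SNE space
diag(d) <- Inf                                 #Ignore self-distances
which.min(d[1, ])                              #Index of the speech closest to the first one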

The t-SNE embedding can make for interesting plots, but unfortunately it cannot be applied to new data: unlike PCA, t-SNE does not learn a rotation that can project out-of-sample documents.

library(ggplot2)
data(inaugCorpus)  #Corpus with Year and President docvars, shipped by the same quanteda versions as inaugTexts
df <- data.frame(
  train_vectors$tsne_proj,
  Year = inaugCorpus$documents$Year,
  President = inaugCorpus$documents$President
)
df$Label <- paste0(df$President, ' (', substr(df$Year, 3, 4), ')')
df$Year <- as.numeric(as.character(df$Year))
p1 <- ggplot(df, aes(x=TSNE1, y=TSNE2, fill=Year, label=Label)) +
  scale_fill_gradient2(low='#d73027', mid='#ffffbf', high='#4575b4', midpoint=1900) +
  geom_point(pch=21, size=5, alpha=.80) +
  geom_point(pch=21, size=5, colour = "black") +
  geom_text(size=3, vjust=1.5, alpha=.80) +
  theme_bw()
print(p1)
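
To keep the figure, ggsave from ggplot2 writes the last plot to disk; the file name and size here are just an example:

ggsave('inaugural_tsne.png', p1, width=8, height=6)  #Example file name; width and height are in inches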