Text corpus to a sparse matrix
library(r2vec)
library(quanteda)
data(inaugTexts)
train <- inaugTexts[1:50]
train_vectors <- textVectors(
  train,
  normalize=TRUE,        #Clean the text a little
  split_token=' ',       #Split on spaces
  verbose=FALSE,
  freqCutoff=.01,        #Remove tokens in <1% of documents. 0.01 * 50 = 0.5, so absCutoff dominates here
  absCutoff=5,           #Remove tokens in <5 documents
  spellcheck=FALSE,      #Don't spellcheck (not yet supported)
  remove_stopwords=TRUE, #Remove stopwords after tokenizing
  stem=TRUE,             #Stem after stopword removal
  ngrams=3,              #Calculate 1, 2, 3 grams
  skips=1,               #Calculate skip-1-grams
  tfidf=TRUE,            #Do tf-idf transformation after tokenization and n-grams/skip-grams
  idf=NULL,              #Compute idf from the input data
  stops=NULL,            #Use the default stopword list
  pca=TRUE,              #Do PCA after n-grams and skip-grams
  pca_comp=5,            #Use 5 PCA components
  pca_rotation=NULL      #Calculate the PCA rotation from the training data
)
train_vectors$M[1:10, 20:28]
 [1,] 0.000000 0.000000 0 0.5108256 0.0000000 0.000000 0.000000 1.714798 0.000000
 [2,] 0.000000 0.000000 0 0.5108256 0.0000000 0.000000 0.000000 0.000000 0.000000
 [3,] 1.966113 0.000000 0 0.0000000 0.4780358 0.000000 0.000000 0.000000 1.609438
 [4,] 0.000000 1.966113 0 0.0000000 0.4780358 0.000000 0.000000 0.000000 0.000000
 [5,] 0.000000 1.966113 0 2.0433025 0.4780358 0.000000 0.000000 0.000000 0.000000
 [6,] 0.000000 0.000000 0 0.0000000 0.0000000 0.000000 0.000000 0.000000 0.000000
 [7,] 0.000000 0.000000 0 0.0000000 0.0000000 0.000000 0.000000 0.000000 0.000000
 [8,] 0.000000 0.000000 0 0.5108256 0.4780358 0.000000 1.347074 0.000000 0.000000
 [9,] 0.000000 1.966113 0 1.5324769 0.0000000 0.000000 0.000000 0.000000 1.609438
[10,] 3.932226 0.000000 0 0.0000000 1.4341074 2.120264 0.000000 0.000000 0.000000
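Since the whole point of this step is a sparse representation, it is worth checking the shape and density of the matrix that comes back. A quick check, assuming M is stored as a sparse matrix from the Matrix package:

library(Matrix)
dim(train_vectors$M) #Documents x features
nnzero(train_vectors$M) / prod(dim(train_vectors$M)) #Fraction of non-zero entries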
We can also apply t-SNE after PCA on the training set.
This gives a non-linear, low-dimensional embedding of the original text data.
set.seed(1)
train_vectors <- textVectors(
  inaugTexts,
  normalize=TRUE,        #Clean the text a little
  split_token=' ',       #Split on spaces
  verbose=FALSE,
  freqCutoff=.01,        #Remove tokens in <1% of documents. 0.01 * 57 = 0.57, so absCutoff dominates here
  absCutoff=5,           #Remove tokens in <5 documents
  spellcheck=FALSE,      #Don't spellcheck (not yet supported)
  remove_stopwords=TRUE, #Remove stopwords after tokenizing
  stem=TRUE,             #Stem after stopword removal
  ngrams=3,              #Calculate 1, 2, 3 grams
  skips=1,               #Calculate skip-1-grams
  tfidf=TRUE,            #Do tf-idf transformation after tokenization and n-grams/skip-grams
  idf=NULL,              #Compute idf from the input data
  stops=NULL,            #Use the default stopword list
  pca=TRUE,              #Do PCA after n-grams and skip-grams
  pca_comp=15,           #Use 15 PCA components
  pca_rotation=NULL,     #Calculate the PCA rotation from the training data
  tsne=TRUE,             #Do t-SNE after PCA
  tsne_dims=2,           #Use 2 dimensions for t-SNE
  tsne_perplexity=5      #Use a perplexity of 5 for t-SNE
)
head(train_vectors$tsne_proj)
           TSNE1     TSNE2
[1,] -16.7452469 -13.33312
[2,]   0.0658602 -10.96081
[3,] -24.4213603 -19.81892
[4,] -20.0863325 -14.32583
[5,] -23.5062821 -12.60345
[6,] -14.1275749 -13.02524
The t-SNE embeddings can make for interesting plots, but unfortunately they cannot be applied to new data.
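The tf-idf and PCA steps, by contrast, do transfer: the idf and pca_rotation arguments exist so that values learned on training data can be passed in rather than recomputed. Below is a minimal sketch of scoring held-out documents this way, assuming the fitted object saves these components as train_vectors$idf and train_vectors$pca_rotation (element names are hypothetical; check str(train_vectors) for the real ones):

#A sketch, not run: re-use components fit on the 50-document training set
#(assumes train_vectors is the earlier 50-document fit)
test <- inaugTexts[51:57]
test_vectors <- textVectors(
  test,
  normalize=TRUE, split_token=' ', verbose=FALSE,
  freqCutoff=.01, absCutoff=5, spellcheck=FALSE,
  remove_stopwords=TRUE, stem=TRUE, ngrams=3, skips=1,
  tfidf=TRUE,
  idf=train_vectors$idf,                  #Hypothetical: idf weights saved from the training fit
  stops=NULL,
  pca=TRUE, pca_comp=5,
  pca_rotation=train_vectors$pca_rotation #Hypothetical: rotation saved from the training fit
)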
library(ggplot2)
data(inaugCorpus) #Corpus version of the same data; provides Year and President metadata
df <- data.frame(
  train_vectors$tsne_proj,
  Year = inaugCorpus$documents$Year,
  President = inaugCorpus$documents$President
)
df$Label <- paste0(df$President, ' (', substr(df$Year, 3, 4), ')')
df$Year <- as.numeric(as.character(df$Year))
p1 <- ggplot(df, aes(x=TSNE1, y=TSNE2, fill=Year, label=Label)) +
  scale_fill_gradient2(low='#d73027', mid='#ffffbf', high='#4575b4', midpoint=1900) +
  geom_point(pch=21, size=5, alpha=.80) +
  geom_point(pch=21, size=5, colour="black") +
  geom_text(size=3, vjust=1.5, alpha=.80) +
  theme_bw()
print(p1)
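To keep the figure around, ggplot2's ggsave can write it to disk (the file name here is just an example):

ggsave('tsne_inaugural.png', p1, width=8, height=6)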