Intro

This is the replication script for ‘archivist: An R Package for Managing, Recording and Restoring Data Analysis Results’ (Przemyslaw Biecek, Marcin Kosinski) submitted to JSS.

First, make sure that archivist is installed.

# Install archivist only when it is not available, then attach it.
# requireNamespace() checks availability without attaching; library() stops
# with an error if the package still cannot be loaded.
if (!requireNamespace("archivist", quietly = TRUE)) {
  install.packages("archivist")
}
library(archivist)

Section 2. Motivation

Reading artifacts from GitHub

# Read a single artifact, addressed by its path and hash.
archivist::aread("pbiecek/Eseje/arepo/ba7f58fafe7373420e3ddce039558140")

Reading artifacts from package

library("archivist")
# Fetch every artifact tagged "class:lm" from pbiecek/graphGallery.
models <- asearch("pbiecek/graphGallery", patterns = "class:lm")
# vapply() always yields a named numeric vector (one BIC per model); with an
# empty search result sapply() would return list(), which sort() rejects.
modelsBIC <- vapply(models, BIC, numeric(1))
sort(modelsBIC)
## 990861c7c27812ee959f10e5f76fe2c3 2a6e492cb6982f230e48cf46023e2e4f 
##                         39.05577                         67.52735 
## 0a82efeb8250a47718cea9d7f64e5ae7 378237103bb60c58600fe69bed6c7f11 
##                        189.73593                        189.73593 
## 7f11e03539d48d35f7e7fe7780527ba7 c1b1ef7bcddefb181f79176015bc3931 
##                        189.73593                        189.73593 
## 0e213ac68a45b6cd454d06b91f991bc7 e58d2f9d50b67ce4d397bf015ec1259c 
##                        243.49450                        243.49450 
## 18a98048f0584469483afb65294ce3ed 
##                        396.16690

Reading artifacts from Shiny

# Request the app page first so that the Shiny container at shinyapps is
# awake; other Shiny apps do not need this step.
invisible(
  xml2::read_html("https://cogito.shinyapps.io/archivistShiny/")
)

# Read the artifact served by the Shiny application.
archivist::aread(
  "https://cogito.shinyapps.io/archivistShiny/arepo/ca680b829abd8f0a4bd2347dcf9fe534"
)

Section 3. Functionality

Section 3.1 Repository management

Creation of a new empty repository

# Create a repository in the directory "arepo" and make it the default one.
repo <- "arepo"
createLocalRepo(repoDir = repo, default = TRUE)

Deletion of an existing repository

# Delete the repository stored in the directory "arepo".
repo <- "arepo"
deleteLocalRepo(repoDir = repo)

Copying artifacts from other repositories

# Re-create the local repository, then copy one artifact (identified by its
# hash) from the GitHub repository pbiecek/graphGallery into it.
repo <- "arepo"
createLocalRepo(repoDir = repo, default = TRUE)
copyRemoteRepo(
  repoTo = repo,
  md5hashes = "7f3453331910e3f321ef97d87adb5bad",
  user = "pbiecek",
  repo = "graphGallery",
  repoType = "github"
)

Showing repository statistics

# List every (artifact, tag, createdDate) record stored in the repository.
showLocalRepo(repoDir = repo, method = "tags")
##                           artifact
## 1 7f3453331910e3f321ef97d87adb5bad
## 2 7f3453331910e3f321ef97d87adb5bad
## 3 7f3453331910e3f321ef97d87adb5bad
## 4 7f3453331910e3f321ef97d87adb5bad
## 5 7f3453331910e3f321ef97d87adb5bad
## 6 7f3453331910e3f321ef97d87adb5bad
## 7 7f3453331910e3f321ef97d87adb5bad
## 8 7f3453331910e3f321ef97d87adb5bad
## 9 7f3453331910e3f321ef97d87adb5bad
##                                             tag         createdDate
## 1                                    format:rda 2016-12-31 15:50:59
## 2                                      name:pl1 2016-12-31 15:50:59
## 3                                      class:gg 2016-12-31 15:50:59
## 4                                  class:ggplot 2016-12-31 15:50:59
## 5                           labelx:Sepal.Length 2016-12-31 15:50:59
## 6                           labely:Petal.Length 2016-12-31 15:50:59
## 7                      date:2016-12-31 15:50:59 2016-12-31 15:50:59
## 8 session_info:0c325724f6118fdd80e6504204b72cfa 2016-12-31 15:50:59
## 9                                    format:png 2016-12-31 15:51:00
# Summarise the example repository that ships inside the archivist package.
summaryLocalRepo(
  repoDir = system.file("graphGallery", package = "archivist")
)
## Number of archived artifacts in Repository:  7 
## Number of archived datasets in Repository:  3 
## Number of various classes archived in Repository: 
##             Number
## lm              3
## data.frame      2
## summary.lm      1
## gg              2
## ggplot          2
## Saves per day in Repository: 
##             Saves
## 2016-02-07     5
## 2016-02-08    13
## 2016-03-04     3
## 2016-12-31     4

Setting a default repository

# Default remote repository: pbiecek/graphGallery on GitHub.
setRemoteRepo(user = "pbiecek", repo = "graphGallery", repoType = "github")
# Default local repository: the graphGallery directory shipped with archivist.
setLocalRepo(repoDir = system.file("graphGallery", package = "archivist"))

Saving to the default local repository

# Make `repo` the default local repository, so the save call below does not
# need a repoDir argument.
setLocalRepo(repoDir = repo)
data(iris)
# The printed value is the hash under which iris was stored.
saveToLocalRepo(iris)
## [1] "ff575c261c949d073b2895b05d1097c3"
# Set the "repoType" option to "github"; the value is echoed back.
aoptions("repoType", "github")
## [1] "github"

Section 3.2 Artifact management

Saving an R object into a repository

library("ggplot2")
repo <- "arepo"
# Scatter plot built from the iris data.
pl <- qplot(Sepal.Length, Petal.Length, data = iris)
# The result is the hash of the plot; its "data" attribute holds the hash of
# the data set the plot was built from (the iris hash seen earlier).
saveToRepo(pl, repoDir = repo)
## [1] "b725eae07eba5c170489435e3466b760"
## attr(,"data")
## [1] "ff575c261c949d073b2895b05d1097c3"
# Tag listing after the save; the argument is named as in the earlier call.
showLocalRepo(repoDir = repo, method = "tags")
##                            artifact
## 1  7f3453331910e3f321ef97d87adb5bad
## 2  7f3453331910e3f321ef97d87adb5bad
## 3  7f3453331910e3f321ef97d87adb5bad
## 4  7f3453331910e3f321ef97d87adb5bad
## 5  7f3453331910e3f321ef97d87adb5bad
## 6  7f3453331910e3f321ef97d87adb5bad
## 7  7f3453331910e3f321ef97d87adb5bad
## 8  7f3453331910e3f321ef97d87adb5bad
## 9  7f3453331910e3f321ef97d87adb5bad
## 10 ff575c261c949d073b2895b05d1097c3
## 11 ff575c261c949d073b2895b05d1097c3
## 12 ff575c261c949d073b2895b05d1097c3
## 13 ff575c261c949d073b2895b05d1097c3
## 14 ff575c261c949d073b2895b05d1097c3
## 15 ff575c261c949d073b2895b05d1097c3
## 16 ff575c261c949d073b2895b05d1097c3
## 17 ff575c261c949d073b2895b05d1097c3
## 18 ff575c261c949d073b2895b05d1097c3
## 19 73c0af8a919ed6f073abd3ccb6a2090b
## 20 ff575c261c949d073b2895b05d1097c3
## 21 ff575c261c949d073b2895b05d1097c3
## 22 b725eae07eba5c170489435e3466b760
## 23 b725eae07eba5c170489435e3466b760
## 24 b725eae07eba5c170489435e3466b760
## 25 b725eae07eba5c170489435e3466b760
## 26 b725eae07eba5c170489435e3466b760
## 27 b725eae07eba5c170489435e3466b760
## 28 b725eae07eba5c170489435e3466b760
## 29 6bc0b4ff7194b1580cbaf6da54e749c7
## 30 b725eae07eba5c170489435e3466b760
## 31 ff575c261c949d073b2895b05d1097c3
## 32 ff575c261c949d073b2895b05d1097c3
## 33 ff575c261c949d073b2895b05d1097c3
## 34 b725eae07eba5c170489435e3466b760
##                                              tag         createdDate
## 1                                     format:rda 2016-12-31 15:50:59
## 2                                       name:pl1 2016-12-31 15:50:59
## 3                                       class:gg 2016-12-31 15:50:59
## 4                                   class:ggplot 2016-12-31 15:50:59
## 5                            labelx:Sepal.Length 2016-12-31 15:50:59
## 6                            labely:Petal.Length 2016-12-31 15:50:59
## 7                       date:2016-12-31 15:50:59 2016-12-31 15:50:59
## 8  session_info:0c325724f6118fdd80e6504204b72cfa 2016-12-31 15:50:59
## 9                                     format:png 2016-12-31 15:51:00
## 10                                    format:rda 2016-12-31 15:57:55
## 11                                     name:iris 2016-12-31 15:57:55
## 12                              class:data.frame 2016-12-31 15:57:55
## 13                          varname:Sepal.Length 2016-12-31 15:57:55
## 14                           varname:Sepal.Width 2016-12-31 15:57:55
## 15                          varname:Petal.Length 2016-12-31 15:57:55
## 16                           varname:Petal.Width 2016-12-31 15:57:55
## 17                               varname:Species 2016-12-31 15:57:55
## 18                      date:2016-12-31 15:57:55 2016-12-31 15:57:55
## 19                                    format:rda 2016-12-31 15:57:55
## 20 session_info:73c0af8a919ed6f073abd3ccb6a2090b 2016-12-31 15:57:55
## 21                                    format:txt 2016-12-31 15:57:55
## 22                                    format:rda 2016-12-31 15:57:55
## 23                                       name:pl 2016-12-31 15:57:55
## 24                                      class:gg 2016-12-31 15:57:55
## 25                                  class:ggplot 2016-12-31 15:57:55
## 26                           labelx:Sepal.Length 2016-12-31 15:57:55
## 27                           labely:Petal.Length 2016-12-31 15:57:55
## 28                      date:2016-12-31 15:57:55 2016-12-31 15:57:55
## 29                                    format:rda 2016-12-31 15:57:55
## 30 session_info:6bc0b4ff7194b1580cbaf6da54e749c7 2016-12-31 15:57:55
## 31                                    format:rda 2016-12-31 15:57:55
## 32                                    format:txt 2016-12-31 15:57:55
## 33 relationWith:b725eae07eba5c170489435e3466b760 2016-12-31 15:57:55
## 34                                    format:png 2016-12-31 15:57:55

Session info for this object

# Query the session info by the hash that saveToRepo(pl, ...) printed for the
# plot. The previously used hash (11127cc6...) does not occur in the
# repository listing, which is why the call returned NA.
# NOTE(review): the hash of `pl` depends on the installed ggplot2 version;
# if saveToRepo() prints a different value on your machine, use that one.
asession("b725eae07eba5c170489435e3466b760")
## (recorded output not available: the earlier run used a hash missing from the repository)

Serialization of an object creation event into repository

library("archivist")
createLocalRepo("arepo", default = TRUE)
library("dplyr")
# Chain the analysis steps with %a% so that ahistory() can list every step
# together with the hash of its result.
tmp <- iris %a%
  filter(Sepal.Length < 6) %a%
  lm(Petal.Length ~ Species, data = .) %a%
  summary()

ahistory(tmp)
##    iris                                  [ff575c261c949d073b2895b05d1097c3]
## -> filter(Sepal.Length < 6)              [d3696e13d15223c7d0bbccb33cc20a11]
## -> lm(Petal.Length ~ Species, data = .)  [990861c7c27812ee959f10e5f76fe2c3]
## -> summary()                             [050e41ec3bc40b3004bc6bdd356acae7]
# The same history can be retrieved from the hash of the final result alone.
ahistory(md5hash = "050e41ec3bc40b3004bc6bdd356acae7")
##    iris                                  [ff575c261c949d073b2895b05d1097c3]
## -> filter(Sepal.Length < 6)              [d3696e13d15223c7d0bbccb33cc20a11]
## -> lm(Petal.Length ~ Species, data = .)  [990861c7c27812ee959f10e5f76fe2c3]
## -> summary()                             [050e41ec3bc40b3004bc6bdd356acae7]

Loading an object from a repository

Remote, local or in a package

# Load from a remote repository; value = TRUE returns the object itself.
loadFromRemoteRepo(
  "7f3453331910e3f321ef97d87adb5bad",
  repo = "graphGallery",
  user = "pbiecek",
  value = TRUE
)

# Load from the repository shipped with the package, using an abbreviated hash.
loadFromLocalRepo(
  "7f3453",
  system.file("graphGallery", package = "archivist"),
  value = TRUE
)

# aread() takes the repository path and the hash as a single string.
archivist::aread("pbiecek/graphGallery/7f3453331910e3f321ef97d87adb5bad")

library("archivist")
setLocalRepo(system.file("graphGallery", package = "archivist"))
# loadFromLocalRepo("7f3453", value=TRUE)
# With a default local repository set, the abbreviated hash alone is enough.
archivist::aread("7f3453")

setLocalRepo(system.file("graphGallery", package = "archivist"))
# Restore a stored linear model and inspect it.
model <- aread("2a6e492cb6982f230e48cf46023e2e4f")
summary(model)
## 
## Call:
## lm(formula = Petal.Length ~ Sepal.Length + Species, data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.76390 -0.17875  0.00716  0.17461  0.79954 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.70234    0.23013  -7.397 1.01e-11 ***
## Sepal.Length       0.63211    0.04527  13.962  < 2e-16 ***
## Speciesversicolor  2.21014    0.07047  31.362  < 2e-16 ***
## Speciesvirginica   3.09000    0.09123  33.870  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2826 on 146 degrees of freedom
## Multiple R-squared:  0.9749, Adjusted R-squared:  0.9744 
## F-statistic:  1890 on 3 and 146 DF,  p-value: < 2.2e-16
# The digest of the restored model equals the hash it was read by.
digest::digest(model)
## [1] "2a6e492cb6982f230e48cf46023e2e4f"

Removal of an object from repository

# Remove the artifact with this hash from the repository in `repo`.
rmFromLocalRepo("7f3453331910e3f321ef97d87adb5bad", repoDir = repo)

Remove all artifacts older than 30 days

# Find artifacts matched by the date range 2010-01-01 .. (today - 30 days).
# The outer parentheses print the result in addition to assigning it.
(obj2rm <- searchInLocalRepo(
  list(dateFrom = "2010-01-01", dateTo = Sys.Date() - 30),
  repoDir = repo
))
## character(0)
# many = TRUE lets a whole vector of hashes be removed in one call.
rmFromLocalRepo(obj2rm, repoDir = repo, many = TRUE)

Section 3.3 Search for an artifact

Search in a local/GitHub repository

# Search by a single tag.
searchInLocalRepo(
  pattern = "class:gg",
  repoDir = system.file("graphGallery", package = "archivist")
)
## [1] "7f3453331910e3f321ef97d87adb5bad" "369227e67f9164dcbe934dadf2b53cc2"
# Search by a date range given as dateFrom / dateTo.
searchInLocalRepo(
  pattern = list(dateFrom = "2016-01-01", dateTo = "2016-02-07"),
  repoDir = system.file("graphGallery", package = "archivist")
)
## [1] "d9313a0de3e2980201a8971e3384ff26" "ff575c261c949d073b2895b05d1097c3"
## [3] "2a6e492cb6982f230e48cf46023e2e4f" "93ecfdf1436932e2860c6dbdf2abc2ad"
## [5] "afb2550d0f886f0cf3b050f04c5cd4f8"
# Search by several tags at once.
searchInLocalRepo(
  pattern = c("class:gg", "labelx:Sepal.Length"),
  repoDir = system.file("graphGallery", package = "archivist")
)
## [1] "369227e67f9164dcbe934dadf2b53cc2" "7f3453331910e3f321ef97d87adb5bad"

Retrieval of a list of R objects with given tags

setLocalRepo(system.file("graphGallery", package = "archivist"))
# Search the default local repository (no repository argument given).
models <- asearch(patterns = c("class:lm", "coefname:Sepal.Length"))

# The same search against pbiecek/graphGallery; this overwrites `models`.
models <- asearch(
  "pbiecek/graphGallery",
  patterns = c("class:lm", "coefname:Sepal.Length")
)
# The result is a list of model objects named by hash.
lapply(models, coef)
## $`18a98048f0584469483afb65294ce3ed`
##  (Intercept) Sepal.Length 
##    -7.101443     1.858433 
## 
## $`2a6e492cb6982f230e48cf46023e2e4f`
##       (Intercept)      Sepal.Length Speciesversicolor  Speciesvirginica 
##        -1.7023422         0.6321099         2.2101378         3.0900021
# Retrieve the stored plots and draw them together on one page.
plots <- asearch(patterns = c("class:gg", "labelx:Sepal.Length"))
length(plots)
## [1] 2
library("gridExtra")
do.call(grid.arrange, plots)

Section 3.4 Extensions

Archivisation of all results of a specific function

library("archivist")
# Dedicated repository for the traced models; it becomes the default one.
createLocalRepo("allModels", default = TRUE)
# Trace lm(): each call reports "Tracing ... on exit" (see the output below).
atrace("lm", "z")
## [1] "lm"
# The article shows only one call to lm(); more are made here.
lm(Sepal.Length ~ Sepal.Width, data = iris)
## Tracing lm(Sepal.Length ~ Sepal.Width, data = iris) on exit
## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width, data = iris)
## 
## Coefficients:
## (Intercept)  Sepal.Width  
##      6.5262      -0.2234
lm(Sepal.Length ~ Petal.Length, data = iris)
## Tracing lm(Sepal.Length ~ Petal.Length, data = iris) on exit
## 
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = iris)
## 
## Coefficients:
##  (Intercept)  Petal.Length  
##       4.3066        0.4089
# Same model fitted again: the search below still lists only two hashes.
lm(Sepal.Length ~ Petal.Length, data = iris)
## Tracing lm(Sepal.Length ~ Petal.Length, data = iris) on exit
## 
## Call:
## lm(formula = Sepal.Length ~ Petal.Length, data = iris)
## 
## Coefficients:
##  (Intercept)  Petal.Length  
##       4.3066        0.4089
# vapply() keeps the result a named numeric vector even if no model is found,
# where sapply() would return an empty list.
vapply(asearch(patterns = "class:lm"), BIC, numeric(1))
## 42fcf77af2c40f70c445cbba513aeabd 5c5751e36b31b2251d2767d96993320a 
##                         381.0236                         169.0723
deleteLocalRepo("allModels")

Integration with the knitr package

Requires a knitr report to work

# addHooksToPrint(class=c("ggplot", "data.frame"),
#     repoDir = "arepo",
#     repo = "Eseje", user = "pbiecek", subdir = "arepo")

Restoring older versions of packages

# Show the session info (R version, platform, package versions) recorded for
# this artifact.
asession("pbiecek/graphGallery/arepo/600bda83cb840947976bd1ce3a11879d")
##  setting  value                       
##  version  R version 3.2.2 (2015-08-14)
##  system   x86_64, darwin13.4.0        
##  ui       RStudio (0.99.441)          
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  tz       Europe/Warsaw               
##  date     2016-02-09                  
## 
##  package      * version  date       source                         
##  acepack        1.3-3.3  2013-05-03 CRAN (R 3.1.0)                 
##  archivist    * 1.9.7.2  2016-02-08 CRAN (R 3.2.2)                 
##  assertthat     0.1      2013-12-06 CRAN (R 3.1.0)                 
##  bitops         1.0-6    2013-08-17 CRAN (R 3.1.0)                 
##  car            2.1-1    2015-12-14 CRAN (R 3.2.3)                 
##  cluster        2.0.3    2015-07-21 CRAN (R 3.2.2)                 
##  colorspace     1.2-6    2015-03-11 CRAN (R 3.1.3)                 
##  DBI            0.3.1    2014-09-24 CRAN (R 3.1.1)                 
##  devtools       1.9.1    2015-09-11 CRAN (R 3.2.0)                 
##  digest         0.6.9    2016-01-08 CRAN (R 3.2.3)                 
##  dplyr        * 0.4.3    2015-09-01 CRAN (R 3.2.0)                 
##  foreign        0.8-65   2015-07-02 CRAN (R 3.2.2)                 
##  Formula        1.2-1    2015-04-07 CRAN (R 3.1.3)                 
##  ggplot2        2.0.0    2015-12-16 Github (hadley/ggplot2@11679cd)
##  gridExtra    * 2.0.0    2015-07-14 CRAN (R 3.2.0)                 
##  gtable         0.1.2    2012-12-05 CRAN (R 3.1.0)                 
##  Hmisc          3.17-0   2015-09-21 CRAN (R 3.2.0)                 
##  httr           1.0.0    2015-06-25 CRAN (R 3.2.0)                 
##  intsvy         1.8      2015-11-30 CRAN (R 3.2.2)                 
##  labeling       0.3      2014-08-23 CRAN (R 3.1.1)                 
##  lattice        0.20-33  2015-07-14 CRAN (R 3.2.2)                 
##  latticeExtra   0.6-26   2013-08-15 CRAN (R 3.1.0)                 
##  lazyeval       0.1.10   2015-01-02 CRAN (R 3.1.2)                 
##  lme4           1.1-10   2015-10-06 CRAN (R 3.2.2)                 
##  lubridate      1.5.0    2015-12-03 CRAN (R 3.2.3)                 
##  magrittr       1.5      2014-11-22 CRAN (R 3.1.2)                 
##  MASS           7.3-43   2015-07-16 CRAN (R 3.2.2)                 
##  Matrix         1.2-2    2015-07-08 CRAN (R 3.2.2)                 
##  MatrixModels   0.4-1    2015-08-22 CRAN (R 3.2.0)                 
##  memisc         0.97     2015-03-08 CRAN (R 3.1.3)                 
##  memoise        0.2.1    2014-04-22 CRAN (R 3.1.0)                 
##  mgcv           1.8-7    2015-07-23 CRAN (R 3.2.2)                 
##  minqa          1.2.4    2014-10-09 CRAN (R 3.1.1)                 
##  munsell        0.4.2    2013-07-11 CRAN (R 3.1.0)                 
##  nlme           3.1-121  2015-06-29 CRAN (R 3.2.2)                 
##  nloptr         1.0.4    2014-08-04 CRAN (R 3.1.1)                 
##  nnet           7.3-10   2015-06-29 CRAN (R 3.2.2)                 
##  pbkrtest       0.4-4    2015-12-12 CRAN (R 3.2.3)                 
##  plyr           1.8.3    2015-06-12 CRAN (R 3.2.0)                 
##  proto          0.3-10   2012-12-22 CRAN (R 3.1.0)                 
##  quantreg       5.19     2015-08-31 CRAN (R 3.2.0)                 
##  R6             2.1.2    2016-01-26 CRAN (R 3.2.3)                 
##  RColorBrewer   1.1-2    2014-12-07 CRAN (R 3.1.2)                 
##  Rcpp           0.12.3   2016-01-10 CRAN (R 3.2.3)                 
##  RCurl          1.95-4.7 2015-06-30 CRAN (R 3.2.0)                 
##  reshape        0.8.5    2014-04-23 CRAN (R 3.1.0)                 
##  rpart          4.1-10   2015-06-29 CRAN (R 3.2.2)                 
##  RSQLite        1.0.0    2014-10-25 CRAN (R 3.1.2)                 
##  scales         0.3.0    2015-08-25 CRAN (R 3.2.0)                 
##  SparseM        1.7      2015-08-15 CRAN (R 3.2.0)                 
##  stringi        1.0-1    2015-10-22 CRAN (R 3.2.0)                 
##  stringr        1.0.0    2015-04-30 CRAN (R 3.2.0)                 
##  survival       2.38-3   2015-07-02 CRAN (R 3.2.2)
# Be warned, this line will install a lot of packages in old versions
# restoreLibs("pbiecek/graphGallery/arepo/600bda83cb840947976bd1ce3a11879d")
# aread("pbiecek/graphGallery/arepo/600bda83cb840947976bd1ce3a11879d")

R Session

# Record the R version, platform and package versions of this run.
sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] gridExtra_2.2.1    dplyr_0.5.0        ggplot2_2.2.0.9000
## [4] archivist_2.1.2   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.8      plyr_1.8.4       bitops_1.0-6     tools_3.3.2     
##  [5] digest_0.6.10    lubridate_1.6.0  jsonlite_1.1     RSQLite_1.1-1   
##  [9] evaluate_0.10    memoise_1.0.0    tibble_1.2       gtable_0.2.0    
## [13] rstudioapi_0.6   shiny_0.14.2     DBI_0.5-1        curl_2.2        
## [17] yaml_2.1.14      withr_1.0.2      httr_1.2.1       stringr_1.1.0   
## [21] knitr_1.15       xml2_1.0.0       devtools_1.12.0  rprojroot_1.1   
## [25] grid_3.3.2       R6_2.2.0         rmarkdown_1.2    magrittr_1.5    
## [29] backports_1.0.4  scales_0.4.1     htmltools_0.3.5  assertthat_0.1  
## [33] mime_0.5         colorspace_1.3-1 xtable_1.8-2     httpuv_1.3.3    
## [37] labeling_0.3     stringi_1.1.2    RCurl_1.95-4.8   lazyeval_0.2.0  
## [41] munsell_0.4.3