<!DOCTYPE html>

<html xmlns="http://www.w3.org/1999/xhtml">

<head>

<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />


<meta name="author" content="Jason Willwerscheid" />


<title>Comparing initialization functions</title>

<script src="site_libs/jquery-1.11.3/jquery.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/cosmo.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<script src="site_libs/jqueryui-1.11.4/jquery-ui.min.js"></script>
<link href="site_libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="site_libs/tocify-1.9.1/jquery.tocify.js"></script>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<link href="site_libs/highlightjs-9.12.0/textmate.css" rel="stylesheet" />
<script src="site_libs/highlightjs-9.12.0/highlight.js"></script>
<link href="site_libs/font-awesome-4.5.0/css/font-awesome.min.css" rel="stylesheet" />

<style type="text/css">code{white-space: pre;}</style>
<style type="text/css">
  pre:not([class]) {
    background-color: white;
  }
</style>
<script type="text/javascript">
if (window.hljs) {
  hljs.configure({languages: []});
  hljs.initHighlightingOnLoad();
  if (document.readyState && document.readyState === "complete") {
    window.setTimeout(function() { hljs.initHighlighting(); }, 0);
  }
}
</script>



<style type="text/css">
h1 {
  font-size: 34px;
}
h1.title {
  font-size: 38px;
}
h2 {
  font-size: 30px;
}
h3 {
  font-size: 24px;
}
h4 {
  font-size: 18px;
}
h5 {
  font-size: 16px;
}
h6 {
  font-size: 12px;
}
.table th:not([align]) {
  text-align: left;
}
</style>


</head>

<body>

<style type = "text/css">
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
code {
  color: inherit;
  background-color: rgba(0, 0, 0, 0.04);
}
img {
  max-width:100%;
  height: auto;
}
.tabbed-pane {
  padding-top: 12px;
}
button.code-folding-btn:focus {
  outline: none;
}
</style>


<style type="text/css">
/* padding for bootstrap navbar */
body {
  padding-top: 51px;
  padding-bottom: 40px;
}
/* offset scroll position for anchor links (for fixed navbar)  */
.section h1 {
  padding-top: 56px;
  margin-top: -56px;
}

.section h2 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h3 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h4 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h5 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h6 {
  padding-top: 56px;
  margin-top: -56px;
}
</style>

<script>
// manage active state of menu based on current page
$(document).ready(function () {
  // active menu anchor
  href = window.location.pathname
  href = href.substr(href.lastIndexOf('/') + 1)
  if (href === "")
    href = "index.html";
  var menuAnchor = $('a[href="' + href + '"]');

  // mark it active
  menuAnchor.parent().addClass('active');

  // if it's got a parent navbar menu mark it active as well
  menuAnchor.closest('li.dropdown').addClass('active');
});
</script>


<div class="container-fluid main-container">

<!-- tabsets -->
<script>
$(document).ready(function () {
  window.buildTabsets("TOC");
});
</script>

<!-- code folding -->




<script>
$(document).ready(function ()  {

    // move toc-ignore selectors from section div to header
    $('div.section.toc-ignore')
        .removeClass('toc-ignore')
        .children('h1,h2,h3,h4,h5').addClass('toc-ignore');

    // establish options
    var options = {
      selectors: "h1,h2,h3",
      theme: "bootstrap3",
      context: '.toc-content',
      hashGenerator: function (text) {
        return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase();
      },
      ignoreSelector: ".toc-ignore",
      scrollTo: 0
    };
    options.showAndHide = true;
    options.smoothScroll = true;

    // tocify
    var toc = $("#TOC").tocify(options).data("toc-tocify");
});
</script>

<style type="text/css">

#TOC {
  margin: 25px 0px 20px 0px;
}
@media (max-width: 768px) {
#TOC {
  position: relative;
  width: 100%;
}
}


.toc-content {
  padding-left: 30px;
  padding-right: 40px;
}

div.main-container {
  max-width: 1200px;
}

div.tocify {
  width: 20%;
  max-width: 260px;
  max-height: 85%;
}

@media (min-width: 768px) and (max-width: 991px) {
  div.tocify {
    width: 25%;
  }
}

@media (max-width: 767px) {
  div.tocify {
    width: 100%;
    max-width: none;
  }
}

.tocify ul, .tocify li {
  line-height: 20px;
}

.tocify-subheader .tocify-item {
  font-size: 0.90em;
  padding-left: 25px;
  text-indent: 0;
}

.tocify .list-group-item {
  border-radius: 0px;
}


</style>

<!-- setup 3col/9col grid for toc_float and main content  -->
<div class="row-fluid">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>

<div class="toc-content col-xs-12 col-sm-8 col-md-9">




<div class="navbar navbar-default  navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar">
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="index.html">FLASHvestigations</a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li>
  <a href="index.html">Home</a>
</li>
<li>
  <a href="about.html">About</a>
</li>
      </ul>
      <ul class="nav navbar-nav navbar-right">
        <li>
  <a href="https://github.com/willwerscheid/FLASHvestigations">
    <span class="fa fa-github"></span>
     
  </a>
</li>
      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->

<!-- Add a small amount of space between sections. -->
<style type="text/css">
div.section {
  padding-top: 12px;
}
</style>

<div class="fluid-row" id="header">



<h1 class="title toc-ignore">Comparing initialization functions</h1>
<h4 class="author"><em>Jason Willwerscheid</em></h4>
<h4 class="date"><em>7/25/2018</em></h4>

</div>


<p><strong>Last updated:</strong> 2018-07-26</p>
<strong>workflowr checks:</strong> <small>(Click a bullet for more information)</small>
<ul>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>R Markdown file:</strong> up-to-date </summary></p>
<p>Great! Since the R Markdown file has been committed to the Git repository, you know the exact version of the code that produced these results.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Environment:</strong> empty </summary></p>
<p>Great job! The global environment was empty. Objects defined in the global environment can affect the analysis in your R Markdown file in unknown ways. For reproduciblity it’s best to always run the code in an empty environment.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Seed:</strong> <code>set.seed(20180714)</code> </summary></p>
<p>The command <code>set.seed(20180714)</code> was run prior to running the code in the R Markdown file. Setting a seed ensures that any results that rely on randomness, e.g. subsampling or permutations, are reproducible.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Session information:</strong> recorded </summary></p>
<p>Great job! Recording the operating system, R version, and package versions is critical for reproducibility.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Repository version:</strong> <a href="https://github.com/willwerscheid/FLASHvestigations/tree/cbae7ec4f41491f8fcb3baa7fe6ac1c7a280a2fa" target="_blank">cbae7ec</a> </summary></p>
Great! You are using Git for version control. Tracking code development and connecting the code version to the results is critical for reproducibility. The version displayed above was the version of the Git repository at the time these results were generated. <br><br> Note that you need to be careful to ensure that all relevant files for the analysis have been committed to Git prior to generating the results (you can use <code>wflow_publish</code> or <code>wflow_git_commit</code>). workflowr only checks the R Markdown file, but you know if there are other scripts or data files that it depends on. Below is the status of the Git repository when the results were generated:
<pre><code>
Ignored files:
    Ignored:    .DS_Store
    Ignored:    .Rhistory
    Ignored:    .Rproj.user/
    Ignored:    docs/.DS_Store
    Ignored:    docs/figure/.DS_Store

Untracked files:
    Untracked:  data/greedy19.rds

</code></pre>
Note that any generated files, e.g. HTML, png, CSS, etc., are not included in this status report because it is ok for generated content to have uncommitted changes. </details>
</li>
</ul>
<details> <summary> <small><strong>Expand here to see past versions:</strong></small> </summary>
<ul>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
File
</th>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
<th style="text-align:left;">
Message
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/willwerscheid/FLASHvestigations/blob/cbae7ec4f41491f8fcb3baa7fe6ac1c7a280a2fa/analysis/init_fn.Rmd" target="_blank">cbae7ec</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-26
</td>
<td style="text-align:left;">
wflow_publish(“analysis/init_fn.Rmd”)
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/willwerscheid/FLASHvestigations/11187776df01c939d232659ab0ea10eb08bfd47b/docs/init_fn.html" target="_blank">1118777</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-26
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/willwerscheid/FLASHvestigations/blob/f37ec8adc497669a6cf28c857426f6215d947027/analysis/init_fn.Rmd" target="_blank">f37ec8a</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-26
</td>
<td style="text-align:left;">
wflow_publish(“analysis/init_fn.Rmd”)
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/willwerscheid/FLASHvestigations/aae4dfcc7a5bca6b9885567d66bc183757fa133a/docs/init_fn.html" target="_blank">aae4dfc</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-25
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/willwerscheid/FLASHvestigations/blob/6417d4e9ca3d658552b99b7953213af0612c7bd9/analysis/init_fn.Rmd" target="_blank">6417d4e</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-25
</td>
<td style="text-align:left;">
wflow_publish(c(“analysis/index.Rmd”, “analysis/init_fn.Rmd”))
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/willwerscheid/FLASHvestigations/131dd854cc8baa76ae6a5eee47d032180a67a6c5/docs/init_fn.html" target="_blank">131dd85</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-25
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/willwerscheid/FLASHvestigations/blob/06676b5085b953c637220c8f8bd4f9ad572f05a3/analysis/init_fn.Rmd" target="_blank">06676b5</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-25
</td>
<td style="text-align:left;">
wflow_publish(“analysis/init_fn.Rmd”)
</td>
</tr>
</tbody>
</table>
</ul>
<p></details></p>
<hr />
<div id="introduction" class="section level2">
<h2>Introduction</h2>
<p>The default function used to initialize a new factor/loading is <code>udv_si</code>, which is a simple wrapper to <code>softImpute</code> with option <code>type = &quot;als&quot;</code>. We previously chose to use <code>type = &quot;als&quot;</code> rather than <code>type = &quot;svd&quot;</code> because initial results showed the former to be much faster. Here I investigate further, using a large GTEx dataset as an example.</p>
</div>
<div id="problem-with-type-als" class="section level2">
<h2>Problem with type = “als”</h2>
<p>I discovered an issue when investigating occasional large decreases in the FLASH objective. The dataset I use here is taken from MASH’s analysis of GTEx data. <code>flash_add_greedy</code> successfully adds 19 factors, then adds a 20th factor that improves the objective until a bad update occurs. This update increases the objective to such an extent that <code>nullcheck</code> erroneously removes the factor and FLASH terminates prematurely.</p>
<p>The last few lines of verbose output are as follows:</p>
<pre><code>(...)
Objective:-1258100.51191229
Objective:-1258097.28313664
Objective:-1258093.37994353
Objective:-1258088.30966224
Objective:-1258081.1984392
Objective:-1258219.32382352
An iteration decreased the objective. This happens occasionally, perhaps due to numeric reasons. You could ignore this warning, but you might like to check out https://github.com/stephenslab/flashr/issues/26 for more details.performing nullcheck
objective from deleting factor:-1258152.21114343
objective from keeping factor:-1258219.32382352
factor zeroed out</code></pre>
<p>But if I add 19 factors, then add the 20th separately, I get a much different result:</p>
<pre><code>(...)
Objective:-1257665.05089993
Objective:-1257664.93935874
Objective:-1257664.72526429
Objective:-1257664.56199637
Objective:-1257664.52184646
Objective:-1257664.51611008
performing nullcheck
objective from deleting factor:-1258152.21114343
objective from keeping factor:-1257664.51611008
nullcheck complete, objective:-1257664.51611008</code></pre>
<p>The reason for the difference is that <code>softImpute</code> randomly initializes <code>u</code> when option <code>type = &quot;als&quot;</code> is used. On the second try, I simply got luckier with the initialization.</p>
<p>To reproduce the above results, run the following (it will take some time):</p>
<pre class="r"><code># devtools::install_github(&quot;stephenslab/flashr&quot;)
devtools::load_all(&quot;/Users/willwerscheid/GitHub/flashr&quot;)
# devtools::install_github(&quot;stephenslab/ebnm&quot;)
devtools::load_all(&quot;/Users/willwerscheid/GitHub/ebnm&quot;)

gtex &lt;- readRDS(gzcon(url(&quot;https://github.com/stephenslab/gtexresults/blob/master/data/MatrixEQTLSumStats.Portable.Z.rds?raw=TRUE&quot;)))
strong &lt;- t(gtex$strong.z)

# 20th factor zeroed out:
fl &lt;- flash_add_greedy(strong, Kmax=50, verbose=TRUE)

# 20th factor successfully added:
fl2 &lt;- flash_add_greedy(strong, Kmax=19, verbose=TRUE)
fl3 &lt;- flash_add_greedy(strong, Kmax=1, f_init=fl2, verbose=TRUE)</code></pre>
</div>
<div id="questions-for-investigation" class="section level2">
<h2>Questions for investigation</h2>
<p>The above increase in the objective function points to a larger problem, which I will turn to in a subsequent investigation. Here, I want to revisit the choice of default <code>init_fn</code>. In particular, should we instead default to <code>udv_svd</code> (a simple wrapper to <code>svd</code>) when there is no missing data? Since <code>softImpute(type = &quot;svd&quot;)</code> gives the same result as <code>svd</code>, and <code>softImpute(type = &quot;als&quot;)</code> begins by calling <code>svd</code> on a random matrix anyway, using <code>udv_svd</code> can only speed things up (in addition to eliminating an annoying source of randomness). Next, when there is missing data, what are the differences between <code>&quot;svd&quot;</code> and <code>&quot;als&quot;</code> in terms of speed and the final objective attained? Preliminary results suggested that <code>&quot;als&quot;</code> would be much faster, but it would be useful to verify those results on the GTEx dataset.</p>
</div>
<div id="results-no-missing-data" class="section level2">
<h2>Results: no missing data</h2>
<p>Since the 20th factor causes a problem for <code>udv_si</code>, I fit 19 factors greedily. I run it once with <code>udv_si_svd</code> and <code>udv_svd</code> and I run it twice with <code>udv_si</code> (using a different seed for the second run).</p>
<p>To reproduce the results in this section, run the following (make sure to load the <code>trackObj</code> branch of <code>flashr</code>). Since the fits take several minutes each, I pre-run the code and then load the results from file.</p>
<pre class="r"><code># This block was run in advance.

# devtools::install_github(&quot;stephenslab/flashr&quot;, ref=&quot;trackObj&quot;)
devtools::load_all(&quot;/Users/willwerscheid/GitHub/flashr&quot;)

res.udv_si &lt;- flash_add_greedy(strong, Kmax=19, verbose=TRUE)
res.udv_si_svd &lt;- flash_add_greedy(strong, Kmax=19, init_fn=&quot;udv_si_svd&quot;, verbose=TRUE)
res.udv_svd &lt;- flash_add_greedy(strong, Kmax=19, init_fn=&quot;udv_svd&quot;, verbose=TRUE)

# Change seed
res.udv_si666 &lt;- flash_add_greedy(strong, Kmax=19, verbose=TRUE, seed=666)

all_res &lt;- list(udv_si = res.udv_si,
                udv_si666 = res.udv_si666,
                udv_si_svd = res.udv_si_svd,
                udv_svd = res.udv_svd)

saveRDS(all_res, &quot;../data/init_fn/all_res.rds&quot;)</code></pre>
<pre class="r"><code># devtools::install_github(&quot;stephenslab/flashr&quot;, ref=&quot;trackObj&quot;)
devtools::load_all(&quot;/Users/willwerscheid/GitHub/flashr&quot;)</code></pre>
<pre><code>Loading flashr</code></pre>
<pre class="r"><code>gtex &lt;- readRDS(gzcon(url(&quot;https://github.com/stephenslab/gtexresults/blob/master/data/MatrixEQTLSumStats.Portable.Z.rds?raw=TRUE&quot;)))
strong &lt;- t(gtex$strong.z)

all_res &lt;- readRDS(&quot;./data/init_fn/all_res.rds&quot;)</code></pre>
<div id="initialization-time" class="section level3">
<h3>Initialization time</h3>
<p>The total time (in seconds) spent initializing factors is as follows.</p>
<pre class="r"><code>init_t &lt;- lapply(all_res, function(res) {unlist(res$init_t)})
lapply(init_t, sum)</code></pre>
<pre><code>$udv_si
[1] 13.78793

$udv_si666
[1] 12.13256

$udv_si_svd
[1] 8.962511

$udv_svd
[1] 2.968564</code></pre>
<p>As expected, <code>udv_svd</code> is fastest, but there is only about 10 seconds difference between <code>udv_svd</code> and <code>udv_si</code> (the slowest method). This difference is not hugely important given that the overall fit takes several minutes.</p>
<p>The initialization time per factor is as follows.</p>
<pre class="r"><code>K &lt;- length(init_t[[1]])
data &lt;- data.frame(t = unlist(init_t), 
                   init_fn = rep(names(init_t), each=K))
data$init_fn[data$init_fn == &quot;udv_si666&quot;] &lt;- &quot;udv_si&quot;
data$init_fn &lt;- factor(data$init_fn)

boxplot(t ~ init_fn, data, ylim=c(0, 2), 
        main=&quot;Initialization time (s)&quot;)</code></pre>
<p><img src="figure/init_fn.Rmd/nomissing_time-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of nomissing_time-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/willwerscheid/FLASHvestigations/blob/11187776df01c939d232659ab0ea10eb08bfd47b/docs/figure/init_fn.Rmd/nomissing_time-1.png" target="_blank">1118777</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-26
</td>
</tr>
<tr>
<td style="text-align:left;">
<a href="https://github.com/willwerscheid/FLASHvestigations/blob/131dd854cc8baa76ae6a5eee47d032180a67a6c5/docs/figure/init_fn.Rmd/nomissing_time-1.png" target="_blank">131dd85</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-25
</td>
</tr>
</tbody>
</table>
<p></details></p>
</div>
<div id="number-of-iterations" class="section level3">
<h3>Number of iterations</h3>
<p>Since the optimization step takes much more time than the initialization step, I check to see whether any initialization method is “better” in the sense that it requires less iterations to optimize. (I suppress results for <code>udv_si_svd</code> because they are identical to <code>udv_svd</code>.)</p>
<pre class="r"><code>niter &lt;- lapply(all_res, 
                function(res) {
                  sapply(res$obj, function(i) {length(i$after_tau)})
                })
niter$udv_si_svd &lt;- NULL

K &lt;- length(niter[[1]])
data &lt;- data.frame(niter = unlist(niter), 
                   init_fn = rep(names(niter), each=K))
data$init_fn[data$init_fn == &quot;udv_si666&quot;] &lt;- &quot;udv_si&quot;
data$init_fn &lt;- factor(data$init_fn)

boxplot(niter ~ init_fn, data, ylim=c(0, 120),
        main=&quot;Number of iterations per factor&quot;)</code></pre>
<p><img src="figure/init_fn.Rmd/nomissing_niter-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of nomissing_niter-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/willwerscheid/FLASHvestigations/blob/11187776df01c939d232659ab0ea10eb08bfd47b/docs/figure/init_fn.Rmd/nomissing_niter-1.png" target="_blank">1118777</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-26
</td>
</tr>
<tr>
<td style="text-align:left;">
<a href="https://github.com/willwerscheid/FLASHvestigations/blob/131dd854cc8baa76ae6a5eee47d032180a67a6c5/docs/figure/init_fn.Rmd/nomissing_niter-1.png" target="_blank">131dd85</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-25
</td>
</tr>
</tbody>
</table>
<p></details></p>
<p>These two plots look very similar to me.</p>
</div>
<div id="objective-attained" class="section level3">
<h3>Objective attained</h3>
<p>There is some variation in the final objective. <code>softImpute(type = &quot;als&quot;)</code> can do better than <code>svd</code>, but it can also do worse.</p>
<pre class="r"><code>lapply(all_res, function(res) {flash_get_objective(strong, res$f)})</code></pre>
<pre><code>$udv_si
[1] -1258152

$udv_si666
[1] -1257939

$udv_si_svd
[1] -1257996

$udv_svd
[1] -1257996</code></pre>
<p>The difference in objective function after adding each factor/loading pair is as follows. (Again, I suppress results for <code>udv_si_svd</code> because they are identical to <code>udv_svd</code>.)</p>
<pre class="r"><code>final.obj &lt;- lapply(all_res, 
                    function(res) {
                      sapply(res$obj, function(obj) {
                        max(unlist(obj))
                      })
                    })
si.diff &lt;- final.obj$udv_si - final.obj$udv_svd
si666.diff &lt;- final.obj$udv_si666 - final.obj$udv_svd
plot(si.diff, type=&#39;l&#39;, col=&quot;darkblue&quot;,
     xlab=&quot;Factor/loading index&quot;,
     ylab=&quot;Diff. in obj. from udv_svd&quot;,
     main=&quot;Diff. in obj. using udv_si rather than udv_svd&quot;)
lines(si666.diff, col=&quot;lightblue&quot;)
legend(&quot;topright&quot;, c(&quot;seed = 123&quot;, &quot;seed = 666&quot;), 
       lty=c(1, 1), col=c(&quot;darkblue&quot;, &quot;lightblue&quot;))
abline(0, 0, lty=2)</code></pre>
<p><img src="figure/init_fn.Rmd/nomissing_objeach-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of nomissing_objeach-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/willwerscheid/FLASHvestigations/blob/11187776df01c939d232659ab0ea10eb08bfd47b/docs/figure/init_fn.Rmd/nomissing_objeach-1.png" target="_blank">1118777</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-26
</td>
</tr>
<tr>
<td style="text-align:left;">
<a href="https://github.com/willwerscheid/FLASHvestigations/blob/131dd854cc8baa76ae6a5eee47d032180a67a6c5/docs/figure/init_fn.Rmd/nomissing_objeach-1.png" target="_blank">131dd85</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-25
</td>
</tr>
</tbody>
</table>
<p></details></p>
<p>Note that <code>udv_svd</code> is doing much worse at factor/loading 6, but catches up at factor/loading 7. In fact, both methods are actually adding the “same” factor/loading pairs here, but they do so in a different order:</p>
<pre class="r"><code>par(mfrow=c(2, 2))
missing.tissues &lt;- c(7, 8, 19, 20, 24, 25, 31, 34, 37)
gtex.colors &lt;- read.table(&quot;https://github.com/stephenslab/gtexresults/blob/master/data/GTExColors.txt?raw=TRUE&quot;, 
                          sep = &#39;\t&#39;, 
                          comment.char = &#39;&#39;)[-missing.tissues, 2]
ls &lt;- list(all_res$udv_si$f$EL[, 6], all_res$udv_si$f$EL[, 7],
        all_res$udv_svd$f$EL[, 6], all_res$udv_svd$f$EL[, 7])
mains &lt;- list(&quot;udv_si loading 6&quot;, &quot;udv_si loading 7&quot;,
           &quot;udv_svd loading 6&quot;, &quot;udv_svd loading 7&quot;)
for (i in 1:4) {
  barplot(ls[[i]], main=mains[[i]], las=2, cex.names = 0.4, 
          col=as.character(gtex.colors), names=&quot;&quot;)
}</code></pre>
<p><img src="figure/init_fn.Rmd/plot_factors-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of plot_factors-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/willwerscheid/FLASHvestigations/blob/11187776df01c939d232659ab0ea10eb08bfd47b/docs/figure/init_fn.Rmd/plot_factors-1.png" target="_blank">1118777</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-26
</td>
</tr>
</tbody>
</table>
<p></details></p>
<p>I zoom in on the last ten factor/loading pairs:</p>
<pre class="r"><code>plot(10:19, si.diff[10:19], type=&#39;l&#39;, col=&quot;darkblue&quot;,
     xlab=&quot;Factor/loading index&quot;,
     ylab=&quot;Diff. in obj. from udv_svd&quot;)
lines(10:19, si666.diff[10:19], col=&quot;lightblue&quot;)
legend(&quot;bottomleft&quot;, c(&quot;seed = 123&quot;, &quot;seed = 666&quot;), 
       lty=c(1, 1), col=c(&quot;darkblue&quot;, &quot;lightblue&quot;))
abline(0, 0, lty=2)</code></pre>
<p><img src="figure/init_fn.Rmd/nomissing_objeach2-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of nomissing_objeach2-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/willwerscheid/FLASHvestigations/blob/11187776df01c939d232659ab0ea10eb08bfd47b/docs/figure/init_fn.Rmd/nomissing_objeach2-1.png" target="_blank">1118777</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-26
</td>
</tr>
</tbody>
</table>
<p></details></p>
<p>While there are differences, there is no clear pattern here.</p>
</div>
</div>
<div id="results-missing-data" class="section level2">
<h2>Results: missing data</h2>
<p>I delete 5% of the entries in the dataset and repeat the experiment. Since <code>svd</code> cannot handle missing data, I only compare <code>udv_si</code> and <code>udv_si_svd</code>.</p>
<pre class="r"><code>set.seed(1)
missing &lt;- rbinom(length(strong), 1, prob=0.05)
strong.missing &lt;- strong
strong.missing[missing] &lt;- NA
fl_data &lt;- flash_set_data(strong.missing)</code></pre>
<pre class="r"><code># This block was run in advance.

res.missing.udv_si &lt;- flash_add_greedy(fl_data, Kmax=19, verbose=TRUE)
res.missing.udv_si666 &lt;- flash_add_greedy(fl_data, Kmax=19, verbose=TRUE, seed=666)
res.missing.udv_si_svd &lt;- flash_add_greedy(fl_data, Kmax=19, init_fn=&quot;udv_si_svd&quot;, verbose=TRUE)

all_missing &lt;- list(udv_si = res.missing.udv_si,
                    udv_si666 = res.missing.udv_si666,
                    udv_si_svd = res.missing.udv_si_svd)

saveRDS(all_missing, &quot;../data/init_fn/all_missing.rds&quot;)</code></pre>
<pre class="r"><code>all_missing &lt;- readRDS(&quot;./data/init_fn/all_missing.rds&quot;)</code></pre>
<div id="initialization-time-1" class="section level3">
<h3>Initialization time</h3>
<p>The total time (in seconds) spent initializing factors is as follows.</p>
<pre class="r"><code>init_t &lt;- lapply(all_missing, function(res) {unlist(res$init_t)})
lapply(init_t, sum)</code></pre>
<pre><code>$udv_si
[1] 15.04474

$udv_si666
[1] 11.99873

$udv_si_svd
[1] 7.355583</code></pre>
<p>The time per factor is as follows.</p>
<pre class="r"><code>K &lt;- length(init_t[[1]])
data &lt;- data.frame(t = unlist(init_t), 
                   init_fn = rep(names(init_t), each=K))
data$init_fn[data$init_fn == &quot;udv_si666&quot;] &lt;- &quot;udv_si&quot;
data$init_fn &lt;- factor(data$init_fn)

boxplot(t ~ init_fn, data, ylim=c(0, 2),
        main=&quot;Initialization time (s)&quot;)</code></pre>
<p><img src="figure/init_fn.Rmd/missing_time-1.png" width="672" style="display: block; margin: auto;" /></p>
<details> <summary><em>Expand here to see past versions of missing_time-1.png:</em></summary>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
<a href="https://github.com/willwerscheid/FLASHvestigations/blob/11187776df01c939d232659ab0ea10eb08bfd47b/docs/figure/init_fn.Rmd/missing_time-1.png" target="_blank">1118777</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-26
</td>
</tr>
<tr>
<td style="text-align:left;">
<a href="https://github.com/willwerscheid/FLASHvestigations/blob/131dd854cc8baa76ae6a5eee47d032180a67a6c5/docs/figure/init_fn.Rmd/missing_time-1.png" target="_blank">131dd85</a>
</td>
<td style="text-align:left;">
Jason Willwerscheid
</td>
<td style="text-align:left;">
2018-07-25
</td>
</tr>
</tbody>
</table>
<p></details></p>
<p>Interestingly, <code>udv_si_svd</code> seems to be reliably faster. This result contradicts our earlier results showing that using option <code>type = &quot;als&quot;</code> with <code>softImpute</code> was faster than using option <code>type = &quot;svd&quot;</code>. I’m not sure whether any code has changed in the meantime, or whether our preliminary investigations were simply faulty.</p>
</div>
<div id="number-of-iterations-objective-attained" class="section level3">
<h3>Number of iterations, objective attained</h3>
<p>With only 5% of data missing, the number of iterations per factor and the increase in objective per factor were nearly identical to the above, so I have suppressed them here.</p>
<p>To verify for yourself, run the following lines.</p>
<pre class="r"><code>niter &lt;- lapply(all_missing, 
                function(res) {
                  sapply(res$obj, function(i) {length(i$after_tau)})
                })

K &lt;- length(niter[[1]])
data &lt;- data.frame(niter = unlist(niter), 
                   init_fn = rep(names(niter), each=K))
data$init_fn[data$init_fn == &quot;udv_si666&quot;] &lt;- &quot;udv_si&quot;
data$init_fn &lt;- factor(data$init_fn)

boxplot(niter ~ init_fn, data, ylim=c(0, 120),
        main=&quot;Number of iterations per factor&quot;)</code></pre>
<pre class="r"><code>lapply(all_missing, function(res) {flash_get_objective(fl_data, res$f)})</code></pre>
<pre class="r"><code>final.obj &lt;- lapply(all_missing, 
                    function(res) {
                      sapply(res$obj, function(obj) {
                        max(unlist(obj))
                      })
                    })
si.diff &lt;- final.obj$udv_si - final.obj$udv_si_svd
si666.diff &lt;- final.obj$udv_si666 - final.obj$udv_si_svd
plot(si.diff, type=&#39;l&#39;, col=&quot;darkblue&quot;,
     xlab=&quot;Factor/loading index&quot;,
     ylab=&quot;Diff. in obj. from udv_si_svd&quot;,
     main=&quot;Diff. in obj. using udv_si rather than udv_si_svd&quot;)
lines(si666.diff, col=&quot;lightblue&quot;)
legend(&quot;topright&quot;, c(&quot;seed = 123&quot;, &quot;seed = 666&quot;), 
       lty=c(1, 1), col=c(&quot;darkblue&quot;, &quot;lightblue&quot;))
abline(0, 0, lty=2)</code></pre>
</div>
</div>
<div id="conclusions" class="section level2">
<h2>Conclusions</h2>
<p>There does not seem to be any advantage to using <code>softImpute</code> when there is no missing data. Indeed, <code>udv_si</code> is relatively slow and has the additional disadvantage that results depend upon a random initialization. For this reason, I recommend changing the default <code>init_fn</code> to <code>udv_svd</code> when there is no missing data.</p>
<p>More surprisingly, I was unable to verify earlier results that concluded that <code>udv_si</code> (that is, <code>softImpute</code> with option <code>type = &quot;als&quot;</code>) is faster than <code>udv_si_svd</code> (<code>softImpute</code> with option <code>type = &quot;svd&quot;</code>). On the contrary, <code>udv_si_svd</code> appears to be slightly faster, and again has the advantage that it is not random. Further, the algorithm is somewhat simpler to understand. Thus I recommend setting the default <code>init_fn</code> to <code>udv_si_svd</code> when data is missing.</p>
</div>
<div id="session-information" class="section level2">
<h2>Session information</h2>
<pre class="r"><code>sessionInfo()</code></pre>
<pre><code>R version 3.4.3 (2017-11-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.6

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] flashr_0.5-12

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.17        pillar_1.2.1        plyr_1.8.4         
 [4] compiler_3.4.3      git2r_0.21.0        workflowr_1.0.1    
 [7] R.methodsS3_1.7.1   R.utils_2.6.0       iterators_1.0.9    
[10] tools_3.4.3         testthat_2.0.0      digest_0.6.15      
[13] tibble_1.4.2        evaluate_0.10.1     memoise_1.1.0      
[16] gtable_0.2.0        lattice_0.20-35     rlang_0.2.0        
[19] Matrix_1.2-12       foreach_1.4.4       commonmark_1.4     
[22] yaml_2.1.17         parallel_3.4.3      ebnm_0.1-12        
[25] withr_2.1.1.9000    stringr_1.3.0       roxygen2_6.0.1.9000
[28] xml2_1.2.0          knitr_1.20          devtools_1.13.4    
[31] rprojroot_1.3-2     grid_3.4.3          R6_2.2.2           
[34] rmarkdown_1.8       ggplot2_2.2.1       ashr_2.2-10        
[37] magrittr_1.5        whisker_0.3-2       backports_1.1.2    
[40] scales_0.5.0        codetools_0.2-15    htmltools_0.3.6    
[43] MASS_7.3-48         assertthat_0.2.0    softImpute_1.4     
[46] colorspace_1.3-2    stringi_1.1.6       lazyeval_0.2.1     
[49] munsell_0.4.3       doParallel_1.0.11   pscl_1.5.2         
[52] truncnorm_1.0-8     SQUAREM_2017.10-1   R.oo_1.21.0        </code></pre>
</div>

<!-- Adjust MathJax settings so that all math formulae are shown using
TeX fonts only; see
http://docs.mathjax.org/en/latest/configuration.html.  This will make
the presentation more consistent at the cost of the webpage sometimes
taking slightly longer to load. Note that this only works because the
footer is added to webpages before the MathJax javascript. -->
<script type="text/x-mathjax-config">
  MathJax.Hub.Config({
    "HTML-CSS": { availableFonts: ["TeX"] }
  });
</script>

<hr>
<p>
  This reproducible <a href="http://rmarkdown.rstudio.com">R Markdown</a>
  analysis was created with
  <a href="https://github.com/jdblischak/workflowr">workflowr</a> 1.0.1
</p>
<hr>


</div>
</div>

</div>

<script>

// add bootstrap table styles to pandoc tables
function bootstrapStylePandocTables() {
  $('tr.header').parent('thead').parent('table').addClass('table table-condensed');
}
$(document).ready(function () {
  bootstrapStylePandocTables();
});


</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    var script = document.createElement("script");
    script.type = "text/javascript";
    script.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    document.getElementsByTagName("head")[0].appendChild(script);
  })();
</script>

</body>
</html>