<!DOCTYPE html>

<html xmlns="http://www.w3.org/1999/xhtml">

<head>

<meta charset="utf-8" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="generator" content="pandoc" />


<meta name="author" content="Lambda Moses" />

<meta name="date" content="2019-02-14" />

<title>Processing kallisto bus Output (10x v3 chemistry)</title>

<script src="site_libs/jquery-1.11.3/jquery.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/cosmo.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<script src="site_libs/jqueryui-1.11.4/jquery-ui.min.js"></script>
<link href="site_libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="site_libs/tocify-1.9.1/jquery.tocify.js"></script>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<link href="site_libs/highlightjs-9.12.0/textmate.css" rel="stylesheet" />
<script src="site_libs/highlightjs-9.12.0/highlight.js"></script>
<link href="site_libs/font-awesome-5.1.0/css/all.css" rel="stylesheet" />
<link href="site_libs/font-awesome-5.1.0/css/v4-shims.css" rel="stylesheet" />

<style type="text/css">code{white-space: pre;}</style>
<style type="text/css">
  pre:not([class]) {
    background-color: white;
  }
</style>
<script type="text/javascript">
if (window.hljs) {
  hljs.configure({languages: []});
  hljs.initHighlightingOnLoad();
  if (document.readyState && document.readyState === "complete") {
    window.setTimeout(function() { hljs.initHighlighting(); }, 0);
  }
}
</script>



<style type="text/css">
h1 {
  font-size: 34px;
}
h1.title {
  font-size: 38px;
}
h2 {
  font-size: 30px;
}
h3 {
  font-size: 24px;
}
h4 {
  font-size: 18px;
}
h5 {
  font-size: 16px;
}
h6 {
  font-size: 12px;
}
.table th:not([align]) {
  text-align: left;
}
</style>


</head>

<body>

<style type = "text/css">
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
code {
  color: inherit;
  background-color: rgba(0, 0, 0, 0.04);
}
img {
  max-width:100%;
  height: auto;
}
.tabbed-pane {
  padding-top: 12px;
}
.html-widget {
  margin-bottom: 20px;
}
button.code-folding-btn:focus {
  outline: none;
}
summary {
  display: list-item;
}
</style>


<style type="text/css">
/* padding for bootstrap navbar */
body {
  padding-top: 51px;
  padding-bottom: 40px;
}
/* offset scroll position for anchor links (for fixed navbar)  */
.section h1 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h2 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h3 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h4 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h5 {
  padding-top: 56px;
  margin-top: -56px;
}
.section h6 {
  padding-top: 56px;
  margin-top: -56px;
}
.dropdown-submenu {
  position: relative;
}
.dropdown-submenu>.dropdown-menu {
  top: 0;
  left: 100%;
  margin-top: -6px;
  margin-left: -1px;
  border-radius: 0 6px 6px 6px;
}
.dropdown-submenu:hover>.dropdown-menu {
  display: block;
}
.dropdown-submenu>a:after {
  display: block;
  content: " ";
  float: right;
  width: 0;
  height: 0;
  border-color: transparent;
  border-style: solid;
  border-width: 5px 0 5px 5px;
  border-left-color: #cccccc;
  margin-top: 5px;
  margin-right: -10px;
}
.dropdown-submenu:hover>a:after {
  border-left-color: #ffffff;
}
.dropdown-submenu.pull-left {
  float: none;
}
.dropdown-submenu.pull-left>.dropdown-menu {
  left: -100%;
  margin-left: 10px;
  border-radius: 6px 0 6px 6px;
}
</style>

<script>
// manage active state of menu based on current page
$(document).ready(function () {
  // active menu anchor
  href = window.location.pathname
  href = href.substr(href.lastIndexOf('/') + 1)
  if (href === "")
    href = "index.html";
  var menuAnchor = $('a[href="' + href + '"]');

  // mark it active
  menuAnchor.parent().addClass('active');

  // if it's got a parent navbar menu mark it active as well
  menuAnchor.closest('li.dropdown').addClass('active');
});
</script>

<div class="container-fluid main-container">

<!-- tabsets -->

<style type="text/css">
.tabset-dropdown > .nav-tabs {
  display: inline-table;
  max-height: 500px;
  min-height: 44px;
  overflow-y: auto;
  background: white;
  border: 1px solid #ddd;
  border-radius: 4px;
}

.tabset-dropdown > .nav-tabs > li.active:before {
  content: "";
  font-family: 'Glyphicons Halflings';
  display: inline-block;
  padding: 10px;
  border-right: 1px solid #ddd;
}

.tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
  content: "&#xe258;";
  border: none;
}

.tabset-dropdown > .nav-tabs.nav-tabs-open:before {
  content: "";
  font-family: 'Glyphicons Halflings';
  display: inline-block;
  padding: 10px;
  border-right: 1px solid #ddd;
}

.tabset-dropdown > .nav-tabs > li.active {
  display: block;
}

.tabset-dropdown > .nav-tabs > li > a,
.tabset-dropdown > .nav-tabs > li > a:focus,
.tabset-dropdown > .nav-tabs > li > a:hover {
  border: none;
  display: inline-block;
  border-radius: 4px;
}

.tabset-dropdown > .nav-tabs.nav-tabs-open > li {
  display: block;
  float: none;
}

.tabset-dropdown > .nav-tabs > li {
  display: none;
}
</style>

<script>
$(document).ready(function () {
  window.buildTabsets("TOC");
});

$(document).ready(function () {
  $('.tabset-dropdown > .nav-tabs > li').click(function () {
    $(this).parent().toggleClass('nav-tabs-open')
  });
});
</script>

<!-- code folding -->



<script>
$(document).ready(function ()  {

    // move toc-ignore selectors from section div to header
    $('div.section.toc-ignore')
        .removeClass('toc-ignore')
        .children('h1,h2,h3,h4,h5').addClass('toc-ignore');

    // establish options
    var options = {
      selectors: "h1,h2,h3",
      theme: "bootstrap3",
      context: '.toc-content',
      hashGenerator: function (text) {
        return text.replace(/[.\\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase();
      },
      ignoreSelector: ".toc-ignore",
      scrollTo: 0
    };
    options.showAndHide = true;
    options.smoothScroll = true;

    // tocify
    var toc = $("#TOC").tocify(options).data("toc-tocify");
});
</script>

<style type="text/css">

#TOC {
  margin: 25px 0px 20px 0px;
}
@media (max-width: 768px) {
#TOC {
  position: relative;
  width: 100%;
}
}


.toc-content {
  padding-left: 30px;
  padding-right: 40px;
}

div.main-container {
  max-width: 1200px;
}

div.tocify {
  width: 20%;
  max-width: 260px;
  max-height: 85%;
}

@media (min-width: 768px) and (max-width: 991px) {
  div.tocify {
    width: 25%;
  }
}

@media (max-width: 767px) {
  div.tocify {
    width: 100%;
    max-width: none;
  }
}

.tocify ul, .tocify li {
  line-height: 20px;
}

.tocify-subheader .tocify-item {
  font-size: 0.90em;
  padding-left: 25px;
  text-indent: 0;
}

.tocify .list-group-item {
  border-radius: 0px;
}


</style>

<!-- setup 3col/9col grid for toc_float and main content  -->
<div class="row-fluid">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>

<div class="toc-content col-xs-12 col-sm-8 col-md-9">




<div class="navbar navbar-default  navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar">
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="index.html">BUS Notebooks R</a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li>
  <a href="index.html">Home</a>
</li>
<li>
  <a href="about.html">About</a>
</li>
<li>
  <a href="license.html">License</a>
</li>
      </ul>
      <ul class="nav navbar-nav navbar-right">
        <li>
  <a href="https://github.com/BUStools/BUS_notebooks_R">
    <span class="fa fa-github"></span>
     
  </a>
</li>
      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->

<!-- Add a small amount of space between sections. -->
<style type="text/css">
div.section {
  padding-top: 12px;
}
</style>

<div class="fluid-row" id="header">



<h1 class="title toc-ignore">Processing kallisto bus Output (10x v3 chemistry)</h1>
<h4 class="author"><em>Lambda Moses</em></h4>
<h4 class="date"><em>2019-02-14</em></h4>

</div>


<p><strong>Last updated:</strong> 2019-02-14</p>
<strong>workflowr checks:</strong> <small>(Click a bullet for more information)</small>
<ul>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>R Markdown file:</strong> up-to-date </summary></p>
<p>Great! Since the R Markdown file has been committed to the Git repository, you know the exact version of the code that produced these results.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Environment:</strong> empty </summary></p>
<p>Great job! The global environment was empty. Objects defined in the global environment can affect the analysis in your R Markdown file in unknown ways. For reproduciblity it’s best to always run the code in an empty environment.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Seed:</strong> <code>set.seed(20181214)</code> </summary></p>
<p>The command <code>set.seed(20181214)</code> was run prior to running the code in the R Markdown file. Setting a seed ensures that any results that rely on randomness, e.g. subsampling or permutations, are reproducible.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Session information:</strong> recorded </summary></p>
<p>Great job! Recording the operating system, R version, and package versions is critical for reproducibility.</p>
</details>
</li>
<li>
<p><details> <summary> <strong style="color:blue;">✔</strong> <strong>Repository version:</strong> <a href="https://github.com/BUStools/BUS_notebooks_R/tree/92aa9159e1d9e192d401e974d8551a06bfc52c72" target="_blank">92aa915</a> </summary></p>
Great! You are using Git for version control. Tracking code development and connecting the code version to the results is critical for reproducibility. The version displayed above was the version of the Git repository at the time these results were generated. <br><br> Note that you need to be careful to ensure that all relevant files for the analysis have been committed to Git prior to generating the results (you can use <code>wflow_publish</code> or <code>wflow_git_commit</code>). workflowr only checks the R Markdown file, but you know if there are other scripts or data files that it depends on. Below is the status of the Git repository when the results were generated:
<pre><code>
Ignored files:
    Ignored:    .Rhistory
    Ignored:    .Rproj.user/
    Ignored:    BUSpaRse_notebooks.Rproj
    Ignored:    analysis/10xv2_cache/
    Ignored:    analysis/10xv3_cache/
    Ignored:    data/fastqs/
    Ignored:    data/hgmm_100_fastqs.tar
    Ignored:    data/hgmm_1k_fastqs.tar
    Ignored:    data/hgmm_1k_v3_fastqs.tar
    Ignored:    data/hgmm_1k_v3_fastqs/
    Ignored:    data/hs_cdna.fa.gz
    Ignored:    data/mm_cdna.fa.gz
    Ignored:    data/whitelist_v2.txt
    Ignored:    data/whitelist_v3.txt.gz
    Ignored:    docs/figure/
    Ignored:    output/hs_mm_tr_index.idx
    Ignored:    output/out_hgmm1k/
    Ignored:    output/out_hgmm1k_v3/

Untracked files:
    Untracked:  Rplots.pdf

</code></pre>
Note that any generated files, e.g. HTML, png, CSS, etc., are not included in this status report because it is ok for generated content to have uncommitted changes. </details>
</li>
</ul>
<details> <summary> <small><strong>Expand here to see past versions:</strong></small> </summary>
<ul>
<table style="border-collapse:separate; border-spacing:5px;">
<thead>
<tr>
<th style="text-align:left;">
File
</th>
<th style="text-align:left;">
Version
</th>
<th style="text-align:left;">
Author
</th>
<th style="text-align:left;">
Date
</th>
<th style="text-align:left;">
Message
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/BUStools/BUS_notebooks_R/blob/92aa9159e1d9e192d401e974d8551a06bfc52c72/analysis/10xv3.Rmd" target="_blank">92aa915</a>
</td>
<td style="text-align:left;">
Lambda Moses
</td>
<td style="text-align:left;">
2019-02-14
</td>
<td style="text-align:left;">
Named chunks with images
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/BUStools/BUS_notebooks_R/87d15f5ac8538c38c74af4f0763f79887c2a5e4d/docs/10xv3.html" target="_blank">87d15f5</a>
</td>
<td style="text-align:left;">
Lambda Moses
</td>
<td style="text-align:left;">
2019-02-14
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/BUStools/BUS_notebooks_R/blob/8d9fe9a5ef4f9ee3a3123a4db61029831dacc2e3/analysis/10xv3.Rmd" target="_blank">8d9fe9a</a>
</td>
<td style="text-align:left;">
Lambda Moses
</td>
<td style="text-align:left;">
2019-02-14
</td>
<td style="text-align:left;">
More detailed explanations for kallisto bus workshop
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/BUStools/BUS_notebooks_R/b7b21a0eab033a957a93102ee2222a1d1139d1dd/docs/10xv3.html" target="_blank">b7b21a0</a>
</td>
<td style="text-align:left;">
Lambda Moses
</td>
<td style="text-align:left;">
2019-02-02
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/BUStools/BUS_notebooks_R/85a5770b53befbfc968af9e488b546e123a8eab6/docs/10xv3.html" target="_blank">85a5770</a>
</td>
<td style="text-align:left;">
Lambda Moses
</td>
<td style="text-align:left;">
2018-12-14
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/BUStools/BUS_notebooks_R/blob/ca1d6ce5bd620c25609b92c6d098229c1b713bb6/analysis/10xv3.Rmd" target="_blank">ca1d6ce</a>
</td>
<td style="text-align:left;">
Lambda Moses
</td>
<td style="text-align:left;">
2018-12-14
</td>
<td style="text-align:left;">
Clean up
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/BUStools/BUS_notebooks_R/6276894f775df80018e05467e79c54a750722e4f/docs/10xv3.html" target="_blank">6276894</a>
</td>
<td style="text-align:left;">
Lambda Moses
</td>
<td style="text-align:left;">
2018-12-14
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/BUStools/BUS_notebooks_R/166854fe0508a3781bd961e329b843464151201e/docs/10xv3.html" target="_blank">166854f</a>
</td>
<td style="text-align:left;">
Lambda Moses
</td>
<td style="text-align:left;">
2018-12-14
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/BUStools/BUS_notebooks_R/blob/ef5cfc5bd78a314089158309ce9ee482384d549b/analysis/10xv3.Rmd" target="_blank">ef5cfc5</a>
</td>
<td style="text-align:left;">
Lambda Moses
</td>
<td style="text-align:left;">
2018-12-14
</td>
<td style="text-align:left;">
Corrected stated file size
</td>
</tr>
<tr>
<td style="text-align:left;">
html
</td>
<td style="text-align:left;">
<a href="https://cdn.rawgit.com/BUStools/BUS_notebooks_R/e0e7d0a7267201f05098eb6245cf9e754646c3de/docs/10xv3.html" target="_blank">e0e7d0a</a>
</td>
<td style="text-align:left;">
Lambda Moses
</td>
<td style="text-align:left;">
2018-12-14
</td>
<td style="text-align:left;">
Build site.
</td>
</tr>
<tr>
<td style="text-align:left;">
Rmd
</td>
<td style="text-align:left;">
<a href="https://github.com/BUStools/BUS_notebooks_R/blob/aab422a6b0dc03205e25c321268bf0959dd4d69c/analysis/10xv3.Rmd" target="_blank">aab422a</a>
</td>
<td style="text-align:left;">
Lambda Moses
</td>
<td style="text-align:left;">
2018-12-14
</td>
<td style="text-align:left;">
Added 10xv3 notebook
</td>
</tr>
</tbody>
</table>
</ul>
<p></details></p>
<hr />
<p>In this vignette, we process fastq data from scRNA-seq (10x v2 chemistry) with to make a sparse matrix that can be used in downstream analysis. Then we will start that standard downstream analysis with <code>Seurat</code>.</p>
<div id="setup" class="section level1">
<h1>Setup</h1>
<p>First, if you want to run this notebook, please make sure you have <a href="https://git-scm.com/book/en/v2/Getting-Started-Installing-Git"><code>git</code></a> installed, go to the terminal and</p>
<pre class="bash"><code>git clone https://github.com/BUStools/BUS_notebooks_R.git</code></pre>
<p>This notebook is in the <code>analysis</code> directory. Also, in RStudio, go to Tools -&gt; Global Options -&gt; R Markdown, and set “Evaluate chunks in directory” to “Project”. The reason why is that this website was built with <a href="https://jdblischak.github.io/workflowr/"><code>workflowr</code></a>, which renders the notebooks in the project directory rather than the document directory.</p>
<p>Alternatively, you can directly download this notebook <a href="https://github.com/BUStools/BUS_notebooks_R/raw/master/analysis/10xv3.Rmd">here</a> and open it in RStudio, but make sure to adjust the paths where files are downloaded and where output files are stored to match those in your environment.</p>
<div id="install-packages" class="section level2">
<h2>Install packages</h2>
<p>The primary analysis section of this notebook demonstrates the use of command line tools <code>kallisto</code> and <code>bustools</code>. The binary of <code>bustools</code> can be found <a href="https://github.com/BUStools/bustools/releases">here</a>. We need the development branch of <code>kallisto</code> for this notebook, as the current release version does not support 10x v3 chemistry. This needs to be built from source. The instruction is below.</p>
<p>Note for Windows users: <code>bustools</code> does not have a Windows binary, but you can use a <a href="https://docs.microsoft.com/en-us/windows/wsl/install-win10">Linux subsystem in Windows 10</a>. If you use earlier version of Windows, then unfortunately you need to either use Linux dual boot or virtual machine or find a Linux or Mac computer such as a server.</p>
<div id="install-devel-branch-of-kallisto" class="section level3">
<h3>Install devel branch of kallisto</h3>
<p>Here we use <a href="https://pachterlab.github.io/kallisto/starting"><code>kallisto</code></a> to pseudoalign the reads to the transcriptome and then to create the <code>bus</code> file to be converted to a sparse matrix.</p>
<p>Note that for 10x v3 chemistry, we need the development branch of <code>kallisto</code>; 10xv3 is not supported by the current release version. See <a href="https://pachterlab.github.io/kallisto/source">this link</a> for an instruction to build <code>kallisto</code> from source. I will also demonstrate how to install the development version here:</p>
<pre class="bash"><code>cd ~
git clone https://github.com/pachterlab/kallisto.git
cd kallisto
# Switch to devel branch
git checkout devel
# Run autoconf, only done once, not run again when you recompile
cd ext/htslib
autoheader
autoconf
# Get back to kallisto root directory
cd ../..
# Build kallisto
mkdir build
cd build
# Run cmake
cmake -DCMAKE_INSTALL_PREFIX=&lt;where you want the kallisto binary to be&gt; ..
make
make install</code></pre>
<p>Note that if you installed the development version of <code>kallisto</code> in your personal directory (if you don’t have root privilege), you need to add the directory with the binary of the development version to the environment variable <code>PATH</code> and add the directory containing any dynamic library dependency to the environment variable <code>LD_LIBRARY_PATH</code> (e.g. <code>~/anaconda3/lib</code>, if you used <code>conda</code> to install the package). If you see error like <code>unable to load dynamic library, libhdf5.so.103 not found</code>, while you are sure that you have installed <code>hdf5</code>, then you should find <code>libhdf5.so.103</code> and add the directory containing it to <code>LD_LIBRARY_PATH</code>.</p>
<p>How to add something to a variable in <code>bash</code>? For example, in each <code>bash</code> chunk in RStudio:</p>
<pre class="bash"><code>export PATH=$PATH:/home/lambda/mylibs/bin
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/lambda/miniconda3/lib
# Other bash commands...</code></pre>
<p>The <code>$PATH</code> means the existing content of the environment variable <code>PATH</code>, and here we are adding something new to the existing content, without overwriting the existing content. The same applies for <code>LD_LIBRARY_PATH</code>.</p>
<p>In RStudio, each <code>bash</code> chunk is a separate session, so you will need to add those directories to <code>PATH</code> and <code>LD_LIBRARY_PATH</code> in every single <code>bash</code> chunk, which is quite annoying. Also note that, if you use Linux, while every time you log in, the file <code>.bashrc</code> is sourced, adding non-default directories to variables like <code>PATH</code>, the <code>bash</code> chunks in R are not affected by this. The <code>PATH</code> and other variables are different from those you see in the terminal outside RStudio. So you will have to <code>source ~/.bashrc</code> in every single <code>bash chunk</code>, which is also quite annoying.</p>
<p>A way to work around this is to create a file in your home directory called <code>.Renviron</code>, such as in Linux terminal, with <code>vim .Renviron</code>. Alternatively, you can use in R <code>file.create(&quot;~/.Renviron&quot;)</code>, and then open that file in RStudio to edit it. Then add all the paths to command line tools you want R to find there. Then restart the R session; the <code>.Renviron</code> file is sourced when R starts up. Below is the content of my <code>.Renviron</code>:</p>
<pre class="bash"><code>PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/home/lambda/mylibs/bin
LD_LIBRARY_PATH=/home/lambda/mylibs/lib:/home/lambda/mylibs/lib64:/usr/lib:/usr/lib64:/home/lambda/miniconda3/lib</code></pre>
<p>You can see the numerous paths in my personal directory added to the environment variables. Perhaps there’s a better way, but so far, this works. The default version of <code>kallisto</code> in the server of our group is the release version, so for the rest of the notebook, the path <code>~/mylibs/bin</code> signifies the devel version.</p>
</div>
<div id="install-r-dependencies" class="section level3">
<h3>Install R dependencies</h3>
<p>We will be using the R packages below. <code>BUSpaRse</code> is not yet on CRAN or Bioconductor. For Mac users, see the <a href="https://bustools.github.io/BUS_notebooks_R/index.html">installation note for <code>BUSpaRse</code></a>. We will also use <code>Seurat</code> 3.0, which is not yet on CRAN (the CRAN version is 2.3.4), for the standard downstream analysis. <code>Seurat</code> 3.0 has some cool new features such as getting batch corrected gene count matrix and cell label transfer across datasets. See <a href="https://www.biorxiv.org/content/10.1101/460147v1">this paper</a> for more detail. Though we will not get into those new features here, it’s worthwhile to check them out.</p>
<pre class="r"><code># Install devtools if it&#39;s not already installed
if (!require(devtools)) {
  install.packages(&quot;devtools&quot;)
}
# Install from GitHub
devtools::install_github(&quot;BUStools/BUSpaRse&quot;)
devtools::install_github(&quot;satijalab/seurat&quot;, ref = &quot;release/3.0&quot;)</code></pre>
<p>The package <code>DropletUtils</code> will be used to estimate the number of real cells as opposed to empty droplets. It’s on Bioconductor, and here is how it should be installed:</p>
<pre class="r"><code>if (!require(BiocManager)) {
  install.packages(&quot;BiocManager&quot;)
}
BiocManager::install(&quot;DropletUtils&quot;)</code></pre>
<p>The other R packages below are on CRAN, and can be installed with <code>install.packages</code>.</p>
<pre class="r"><code>library(BUSpaRse)
library(data.table)
library(Seurat)
library(tidyverse)
library(DropletUtils)
library(Matrix)</code></pre>
</div>
</div>
<div id="download-data" class="section level2">
<h2>Download data</h2>
<p>The data set we are using here is 1k 1:1 Mixture of Fresh Frozen Human (HEK293T) and Mouse (NIH3T3) Cells from the 10x website. First, we download the fastq files (4.54 GB).</p>
<pre class="r"><code>download.file(&quot;http://cf.10xgenomics.com/samples/cell-exp/3.0.0/hgmm_1k_v3/hgmm_1k_v3_fastqs.tar&quot;, destfile = &quot;./data/hgmm_1k_v3_fastqs.tar&quot;, quiet = TRUE)</code></pre>
<p>Then untar this file</p>
<pre class="bash"><code>cd ./data
tar -xvf ./hgmm_1k_v3_fastqs.tar</code></pre>
<pre><code>#&gt; hgmm_1k_v3_fastqs/
#&gt; hgmm_1k_v3_fastqs/hgmm_1k_v3_S1_L001_R2_001.fastq.gz
#&gt; hgmm_1k_v3_fastqs/hgmm_1k_v3_S1_L002_I1_001.fastq.gz
#&gt; hgmm_1k_v3_fastqs/hgmm_1k_v3_S1_L001_R1_001.fastq.gz
#&gt; hgmm_1k_v3_fastqs/hgmm_1k_v3_S1_L002_R1_001.fastq.gz
#&gt; hgmm_1k_v3_fastqs/hgmm_1k_v3_S1_L002_R2_001.fastq.gz
#&gt; hgmm_1k_v3_fastqs/hgmm_1k_v3_S1_L001_I1_001.fastq.gz</code></pre>
</div>
</div>
<div id="primary-analysis" class="section level1">
<h1>Primary analysis</h1>
<div id="build-the-kallisto-index" class="section level2">
<h2>Build the <code>kallisto</code> index</h2>
<p>Here we use <a href="https://pachterlab.github.io/kallisto/about">kallisto</a> to pseudoalign the reads to the transcriptome and then to create the <code>bus</code> file to be converted to a sparse matrix. The first step is to build an index of the transcriptome. This data set has both human and mouse cells, so we need both human and mouse transcriptomes. The transcriptomes downloaded here are from Ensembl version 95, released in January 2019. As of the writing of this notebook, this is the most recent Ensembl release.</p>
<pre class="r"><code># Human transcriptome
download.file(&quot;ftp://ftp.ensembl.org/pub/release-95/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz&quot;, &quot;./data/hs_cdna.fa.gz&quot;, quiet = TRUE)
# Mouse transcriptome
download.file(&quot;ftp://ftp.ensembl.org/pub/release-95/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz&quot;, &quot;./data/mm_cdna.fa.gz&quot;, quiet = TRUE)</code></pre>
<pre class="bash"><code>~/mylibs/bin/kallisto version</code></pre>
<pre><code>#&gt; kallisto, version 0.45.0</code></pre>
<p>Actually, we don’t need to unzip the fasta files</p>
<pre class="bash"><code>~/mylibs/bin/kallisto index -i ./output/hs_mm_tr_index.idx ./data/hs_cdna.fa.gz ./data/mm_cdna.fa.gz</code></pre>
<pre><code>#&gt; 
#&gt; [build] loading fasta file ./data/hs_cdna.fa.gz
#&gt; [build] loading fasta file ./data/mm_cdna.fa.gz
#&gt; [build] k-mer length: 31
#&gt; [build] warning: clipped off poly-A tail (longer than 10)
#&gt;         from 2083 target sequences
#&gt; [build] warning: replaced 8 non-ACGUT characters in the input sequence
#&gt;         with pseudorandom nucleotides
#&gt; [build] counting k-mers ... done.
#&gt; [build] building target de Bruijn graph ...  done 
#&gt; [build] creating equivalence classes ...  done
#&gt; [build] target de Bruijn graph has 2145397 contigs and contains 206575877 k-mers</code></pre>
</div>
<div id="run-kallisto-bus" class="section level2">
<h2>Run <code>kallisto bus</code></h2>
<p>Here we will generate the <code>bus</code> file. Here <code>bus</code> stands for <em>B</em>arbode, <em>U</em>MI, <em>S</em>et (i.e. equivalent class). In text form, it is a table whose first column is the barcode. The second column is the UMI that are associated with the barcode. The third column is the index of the equivalence class reads with the UMI maps to (equivalence class will be explained later). The fourth column is count of reads with this barcode, UMI, and equivalence class combination, which is ignored as one UMI should stand for one molecule. See <a href="https://www.biorxiv.org/content/10.1101/472571v2">this paper</a> for more detail.</p>
<p>These are the technologies supported by <code>kallisto bus</code>:</p>
<pre class="r"><code>system(&quot;~/mylibs/bin/kallisto bus --list&quot;, intern = TRUE)</code></pre>
<pre><code>#&gt; Warning in system(&quot;~/mylibs/bin/kallisto bus --list&quot;, intern = TRUE):
#&gt; running command &#39;~/mylibs/bin/kallisto bus --list&#39; had status 1</code></pre>
<pre><code>#&gt;  [1] &quot;List of supported single-cell technologies&quot;
#&gt;  [2] &quot;&quot;                                          
#&gt;  [3] &quot;short name       description&quot;              
#&gt;  [4] &quot;----------       -----------&quot;              
#&gt;  [5] &quot;10xv1            10x version 1 chemistry&quot;  
#&gt;  [6] &quot;10xv2            10x version 2 chemistry&quot;  
#&gt;  [7] &quot;10xv3            10x version 3 chemistry&quot;  
#&gt;  [8] &quot;CELSeq           CEL-Seq&quot;                  
#&gt;  [9] &quot;CELSeq2          CEL-Seq version 2&quot;        
#&gt; [10] &quot;DropSeq          DropSeq&quot;                  
#&gt; [11] &quot;inDrops          inDrops&quot;                  
#&gt; [12] &quot;SCRBSeq          SCRB-Seq&quot;                 
#&gt; [13] &quot;SureCell         SureCell for ddSEQ&quot;       
#&gt; [14] &quot;&quot;                                          
#&gt; attr(,&quot;status&quot;)
#&gt; [1] 1</code></pre>
<p>Here we see 10xv3 support. Here we have 2 samples. Each sample has 3 files: <code>I1</code> means sample index, <code>R1</code> means barcode and UMI, and <code>R2</code> means the piece of cDNA. The <code>-i</code> argument specifies the index file we just built. The <code>-o</code> argument specifies the output directory. The <code>-x</code> argument specifies the sequencing technology used to generate this data set. The <code>-t</code> argument specifies the number of threads used, and 8 threads have been used in this example.</p>
<pre class="bash"><code>cd ./data
~/mylibs/bin/kallisto bus -i ../output/hs_mm_tr_index.idx \
-o ../output/out_hgmm1k_v3 -x 10xv3 -t8 \
./hgmm_1k_v3_fastqs/hgmm_1k_v3_S1_L001_R1_001.fastq.gz \
./hgmm_1k_v3_fastqs/hgmm_1k_v3_S1_L001_R2_001.fastq.gz \
./hgmm_1k_v3_fastqs/hgmm_1k_v3_S1_L002_R1_001.fastq.gz \
./hgmm_1k_v3_fastqs/hgmm_1k_v3_S1_L002_R2_001.fastq.gz</code></pre>
<pre><code>#&gt; 
#&gt; [index] k-mer length: 31
#&gt; [index] number of targets: 303,693
#&gt; [index] number of k-mers: 206,575,877
#&gt; [index] number of equivalence classes: 1,256,267
#&gt; [quant] will process sample 1: ./hgmm_1k_v3_fastqs/hgmm_1k_v3_S1_L001_R1_001.fastq.gz
#&gt;                                ./hgmm_1k_v3_fastqs/hgmm_1k_v3_S1_L001_R2_001.fastq.gz
#&gt; [quant] will process sample 2: ./hgmm_1k_v3_fastqs/hgmm_1k_v3_S1_L002_R1_001.fastq.gz
#&gt;                                ./hgmm_1k_v3_fastqs/hgmm_1k_v3_S1_L002_R2_001.fastq.gz
#&gt; [quant] finding pseudoalignments for the reads ... done
#&gt; [quant] processed 63,105,786 reads, 46,343,056 reads pseudoaligned</code></pre>
<p>See what the outputs are</p>
<pre class="r"><code>list.files(&quot;./output/out_hgmm1k_v3/&quot;)</code></pre>
<pre><code>#&gt; [1] &quot;matrix.ec&quot;         &quot;output.bus&quot;        &quot;output.sorted&quot;    
#&gt; [4] &quot;output.sorted.txt&quot; &quot;run_info.json&quot;     &quot;transcripts.txt&quot;</code></pre>
</div>
<div id="run-bustools" class="section level2">
<h2>Run <code>BUStools</code></h2>
<p>The <code>output.bus</code> file is a binary. In order to make R parse it, we need to convert it into a sorted text file. There’s a command line tool <a href="https://github.com/BUStools/bustools"><code>bustools</code></a> for this. The reason why <code>kallisto bus</code> outputs this binary rather than the sorted text file directly is that the binary can be very compressed to save disk space. The such compression is achieved, a future version of R package <code>BUSpaRse</code> may support directly reading the binary to convert to sparse matrix. At present, as <code>bustools</code> is still under development, such compression has not been achieved, and as a result, the binary is slightly larger than the text file.</p>
<pre class="bash"><code># Sort, with 8 threads
bustools sort -o ./output/out_hgmm1k_v3/output.sorted -t8 ./output/out_hgmm1k_v3/output.bus
# Convert sorted file to text
bustools text -o ./output/out_hgmm1k_v3/output.sorted.txt ./output/out_hgmm1k_v3/output.sorted</code></pre>
<pre><code>#&gt; Read in 46343056 number of busrecords
#&gt; All sorted
#&gt; Read in 37843256 number of busrecords</code></pre>
</div>
<div id="map-transcripts-to-genes" class="section level2">
<h2>Map transcripts to genes</h2>
<p>For the sparse matrix, most people are interested in how many UMIs per gene per cell, we here we will quantify this from the <code>bus</code> output, and to do so, we need to find which gene corresponds to each transcript. Remember in the output of <code>kallisto bus</code>, there’s the file <code>transcripts.txt</code>. Those are the transcripts in the transcriptome index. Now we’ll only keep the transcripts present there and make sure that the transcripts in <code>tr2g</code> are in the same order as those in the index. You will soon see why the order is important.</p>
<p>Remember that we downloaded transcriptome FASTA files from Ensembl just now. In FASTA files, each entry is a sequence with a name. In Ensembl FASTA files, the sequence name has genome annotation of the corresponding sequence, so we can extract transcript IDs and corresponding gene IDs and gene names from there.</p>
<pre class="r"><code>tr2g &lt;- transcript2gene(fasta_file = c(&quot;./data/hs_cdna.fa.gz&quot;, &quot;./data/mm_cdna.fa.gz&quot;),
                        kallisto_out_path = &quot;./output/out_hgmm1k_v3&quot;)</code></pre>
<pre><code>#&gt; Reading FASTA file.
#&gt; Reading FASTA file.</code></pre>
<pre><code>#&gt; Sorting transcripts</code></pre>
<pre class="r"><code>head(tr2g) %&gt;% knitr::kable()</code></pre>
<table>
<thead>
<tr class="header">
<th align="left">transcript</th>
<th align="left">gene</th>
<th align="left">gene_name</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="left">ENST00000632684.1</td>
<td align="left">ENSG00000282431.1</td>
<td align="left">TRBD1</td>
</tr>
<tr class="even">
<td align="left">ENST00000434970.2</td>
<td align="left">ENSG00000237235.2</td>
<td align="left">TRDD2</td>
</tr>
<tr class="odd">
<td align="left">ENST00000448914.1</td>
<td align="left">ENSG00000228985.1</td>
<td align="left">TRDD3</td>
</tr>
<tr class="even">
<td align="left">ENST00000415118.1</td>
<td align="left">ENSG00000223997.1</td>
<td align="left">TRDD1</td>
</tr>
<tr class="odd">
<td align="left">ENST00000631435.1</td>
<td align="left">ENSG00000282253.1</td>
<td align="left">TRBD1</td>
</tr>
<tr class="even">
<td align="left">ENST00000390583.1</td>
<td align="left">ENSG00000211923.1</td>
<td align="left">IGHD3-10</td>
</tr>
</tbody>
</table>
<p>Alternative ways of getting <code>tr2g</code> have been implemented in the <code>BUSpaRse</code> package. You may use <code>tr2g_ensembl</code> to query Ensembl with biomart to get transcript and gene IDs. If you use this method, then please make sure that the Ensembl version used in the query matches that of the transcriptome. This method is convenient for the user since you only need to input species names, but it can be slow since biomart database query can be slow. You may also use <code>tr2g_gtf</code> for GTF files and <code>tr2g_gff3</code> for GFF3 files, which are more useful for non-model organisms absent from Ensemble. After calling the <code>tr2g_*</code> family of functions, you should sort the transcripts from those functions with <code>sort_tr2g</code> so the transcripts are in the same order as those in the kallisto index. The FASTA way is the fastest, as reading FASTA files is faster than reading GTF and GFF files. Whichever way you choose, this shouldn’t take much more than half a minute per species.</p>
<p>The function <code>transcript2gene</code> not only gets the transcript and gene IDs but also sorts the transcripts. This function only supports Ensembl biomart query and Ensembl fasta files as the source of transcript and gene IDs, as the attribute field of GTF and GFF3 files can differ between sources and further cleanup may be needed.</p>
</div>
<div id="map-ecs-to-genes" class="section level2">
<h2>Map ECs to genes</h2>
<p>The 3rd column in the <code>output.sorted.txt</code> is the equivalence class (EC) index of each UMI for each cell barcode. Equivalence class (EC) means the set of transcripts in the transcriptome that the read is compatible to. While in most cases, an EC only has transcripts for the same gene, there are some ECs that have transcripts for different genes. The file in the <code>kallisto bus</code> output, <code>matrix.ec</code>, maps the EC index in <code>output.sorted.txt</code> to sets of line numbers in the transcriptome assembly. That’s why we ensured that the <code>tr2g</code> data frame has the same order as the transcripts in the index.</p>
<p>Also note that while this function supports multithreading, unless the <code>matrix.ec</code> file is really large, setting <code>ncores &gt; 1</code> will not improve performance since only 1 core will be used anyway. It may even <em>compromise</em> performance since creating multiple processes has overhead. For this dataset, where <code>matrix.ec</code> is 68.8MB, it is not large enough to warrant the use of more than 1 core.</p>
<pre class="r"><code>genes &lt;- EC2gene(tr2g, &quot;./output/out_hgmm1k_v3&quot;, ncores = 1)</code></pre>
<pre><code>#&gt; Reading matrix.ec</code></pre>
<pre><code>#&gt; Processing genes</code></pre>
<p>Now for each EC, we have a set of genes the EC is compatible to.</p>
<pre class="r"><code>head(genes)</code></pre>
<pre><code>#&gt; [[1]]
#&gt; [1] &quot;ENSG00000282431.1&quot;
#&gt; 
#&gt; [[2]]
#&gt; [1] &quot;ENSG00000237235.2&quot;
#&gt; 
#&gt; [[3]]
#&gt; [1] &quot;ENSG00000228985.1&quot;
#&gt; 
#&gt; [[4]]
#&gt; [1] &quot;ENSG00000223997.1&quot;
#&gt; 
#&gt; [[5]]
#&gt; [1] &quot;ENSG00000282253.1&quot;
#&gt; 
#&gt; [[6]]
#&gt; [1] &quot;ENSG00000211923.1&quot;</code></pre>
<pre class="r"><code>tail(genes)</code></pre>
<pre><code>#&gt; [[1]]
#&gt; [1] &quot;ENSG00000196418.12&quot; &quot;ENSG00000178199.13&quot; &quot;ENSG00000011523.13&quot;
#&gt; 
#&gt; [[2]]
#&gt; [1] &quot;ENSMUSG00000044026.2&quot;  &quot;ENSMUSG00000026470.14&quot; &quot;ENSMUSG00000048424.17&quot;
#&gt; [4] &quot;ENSMUSG00000110746.3&quot; 
#&gt; 
#&gt; [[3]]
#&gt;  [1] &quot;ENSG00000121413.12&quot; &quot;ENSG00000117115.12&quot; &quot;ENSG00000117475.13&quot;
#&gt;  [4] &quot;ENSG00000197380.10&quot; &quot;ENSG00000158220.13&quot; &quot;ENSG00000163378.13&quot;
#&gt;  [7] &quot;ENSG00000196417.12&quot; &quot;ENSG00000149311.18&quot; &quot;ENSG00000167984.17&quot;
#&gt; [10] &quot;ENSG00000274897.2&quot;  &quot;ENSG00000265863.1&quot; 
#&gt; 
#&gt; [[4]]
#&gt; [1] &quot;ENSG00000070814.19&quot; &quot;ENSG00000166847.9&quot; 
#&gt; 
#&gt; [[5]]
#&gt;  [1] &quot;ENSG00000135930.13&quot; &quot;ENSG00000204842.15&quot; &quot;ENSG00000141665.12&quot;
#&gt;  [4] &quot;ENSG00000163877.10&quot; &quot;ENSG00000197580.11&quot; &quot;ENSG00000131686.14&quot;
#&gt;  [7] &quot;ENSG00000159023.21&quot; &quot;ENSG00000183323.12&quot; &quot;ENSG00000113558.18&quot;
#&gt; [10] &quot;ENSG00000277868.4&quot; 
#&gt; 
#&gt; [[6]]
#&gt; [1] &quot;ENSG00000148734.7&quot;  &quot;ENSG00000187715.13&quot;</code></pre>
<p>Apparently some ECs map to many genes.</p>
</div>
<div id="make-the-sparse-matrix" class="section level2">
<h2>Make the sparse matrix</h2>
<p>For 10x, we do have a file with all valid cell barcodes that comes with CellRanger.</p>
<pre class="bash"><code># Copy v3 chemistry whitelist to working directory
cp ~/cellranger-3.0.1/cellranger-cs/3.0.1/lib/python/cellranger/barcodes/3M-february-2018.txt.gz \
./data/whitelist_v3.txt.gz</code></pre>
<pre class="r"><code># Read in the whitelist
whitelist_v3 &lt;- fread(&quot;./data/whitelist_v3.txt.gz&quot;, header = FALSE)$V1
length(whitelist_v3)</code></pre>
<pre><code>#&gt; [1] 6794880</code></pre>
<p>That’s an order of magnitude more than the 737K in v2 chemistry.</p>
<p>Now we have everything we need to make the sparse matrix. This function reads in <code>output.sorted.txt</code> line by line and processes them. It does not do barcode correction for now, so the barcode must exactly match those in the whitelist if one is provided. It took 5 to 6 minutes to construct the sparse matrix in the hgmm6k dataset, which has over 280 million lines in <code>output.sorted.txt</code>, which is over 9GB. Here the data set is smaller, and it takes less than a minute. A future version of <code>BUSpaRse</code> will parallelize this process to make it more scalable for huge datasets. For now, it’s single threaded.</p>
<p>Note that the arguments <code>est_ncells</code> (estimated number of cells) and <code>est_ngenes</code> (estimated number of genes) are important. With the estimate, this function reserves memory for the data to be added into, reducing the need of reallocation, which will slow the function down. Since the vast majority of “cells” you get in this sparse matrix are empty droplets rather than cells, please put at least 200 times more “cells” than you actually expect in <code>est_ncells</code>.</p>
<p>If you do not have a whitelist of barcodes, then it’s fine; the <code>whitelist</code> argument is optional.</p>
<pre class="r"><code>res_mat &lt;- make_sparse_matrix(&quot;./output/out_hgmm1k_v3/output.sorted.txt&quot;,
                              genes = genes, est_ncells = 3e5,
                              est_ngenes = nrow(tr2g),
                              whitelist = whitelist_v3)</code></pre>
<pre><code>#&gt; Reading data
#&gt; Read 5 million lines
#&gt; Read 10 million lines
#&gt; Read 15 million lines
#&gt; Read 20 million lines
#&gt; Read 25 million lines
#&gt; Read 30 million lines
#&gt; Read 35 million lines
#&gt; Constructing sparse matrix</code></pre>
<p>The matrix we get has genes in rows and barcode in columns. The row names are the gene IDs (not using human readable gene names since they’re not guaranteed to be unique), and the column names are cell barcodes.</p>
<p>For Ensembl transcriptomes, all the 3 steps in <code>BUSpaRse</code> we have just done can be condensed into one step, with the <code>busparse_gene_count</code> function.</p>
</div>
</div>
<div id="explore-the-data" class="section level1">
<h1>Explore the data</h1>
<div id="remove-empty-droplets" class="section level2">
<h2>Remove empty droplets</h2>
<p>Cool, so now we have the sparse matrix. What does it look like?</p>
<pre class="r"><code>dim(res_mat)</code></pre>
<pre><code>#&gt; [1]  47332 416360</code></pre>
<p>That’s way more cells than we expect, which is about 1000. So what’s going on?</p>
<p>How many UMIs per barcode?</p>
<pre class="r"><code>tot_counts &lt;- Matrix::colSums(res_mat)
summary(tot_counts)</code></pre>
<pre><code>#&gt;      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
#&gt;      1.00      1.00      1.00     82.63      4.00 154919.00</code></pre>
<p>The vast majority of “cells” have only a few UMI detected. Those are empty droplets. 10x claims to have cell capture rate of up to 65%, but in practice, depending on how many cells are in fact loaded, the rate can be much lower. A commonly used method to estimate the number of empty droplets is barcode ranking knee and inflection points, as those are often assumed to represent transition between two components of a distribution. While more sophisticated method exist (e.g. see <a href="https://www.bioconductor.org/packages/devel/bioc/vignettes/DropletUtils/inst/doc/DropletUtils.html#detecting-empty-droplets"><code>emptyDrops</code> in <code>DropletUtils</code></a>), for simplicity, we will use the barcode ranking method here. However, whichever way we go, we don’t have the ground truth.</p>
<pre class="r"><code># Compute barcode rank
bc_rank &lt;- barcodeRanks(res_mat)</code></pre>
<pre class="r"><code>qplot(bc_rank$rank, bc_rank$total, geom = &quot;line&quot;) +
  geom_hline(yintercept = bc_rank$knee, color = &quot;blue&quot;, linetype = 2) +
  geom_hline(yintercept = bc_rank$inflection, color = &quot;green&quot;, linetype = 2) +
  annotate(&quot;text&quot;, x = 1000, y = 1.5 * c(bc_rank$knee, bc_rank$inflection),
           label = c(&quot;knee&quot;, &quot;inflection&quot;), color = c(&quot;blue&quot;, &quot;green&quot;)) +
  scale_x_log10() +
  scale_y_log10() +
  labs(x = &quot;Rank&quot;, y = &quot;Total reads&quot;) +
  theme_bw()</code></pre>
<p><img src="figure/10xv3.Rmd/rank-1.svg" width="672" style="display: block; margin: auto;" /></p>
<p>The inflection point looks like a reasonable number of cells.</p>
<pre class="r"><code># Filter the matrix
res_mat &lt;- res_mat[, tot_counts &gt; bc_rank$inflection]
dim(res_mat)</code></pre>
<pre><code>#&gt; [1] 47332  1020</code></pre>
</div>
<div id="cell-species" class="section level2">
<h2>Cell species</h2>
<p>How many cells are from humans and how many from mice? The number of cells with mixed species indicates doublet rate.</p>
<pre class="r"><code>gene_species &lt;- ifelse(str_detect(rownames(res_mat), &quot;^ENSMUSG&quot;), &quot;mouse&quot;, &quot;human&quot;)
mouse_inds &lt;- gene_species == &quot;mouse&quot;
human_inds &lt;- gene_species == &quot;human&quot;
# mark cells as mouse or human
cell_species &lt;- tibble(n_mouse_umi = Matrix::colSums(res_mat[mouse_inds,]),
                       n_human_umi = Matrix::colSums(res_mat[human_inds,]),
                       tot_umi = Matrix::colSums(res_mat),
                       prop_mouse = n_mouse_umi / tot_umi,
                       prop_human = n_human_umi / tot_umi)</code></pre>
<pre class="r"><code># Classify species based on proportion of UMI
cell_species &lt;- cell_species %&gt;% 
  mutate(species = case_when(
    prop_mouse &gt; 0.9 ~ &quot;mouse&quot;,
    prop_human &gt; 0.9 ~ &quot;human&quot;,
    TRUE ~ &quot;mixed&quot;
  ))</code></pre>
<pre class="r"><code>ggplot(cell_species, aes(n_human_umi, n_mouse_umi, color = species)) +
  geom_point(size = 0.5) +
  theme_bw()</code></pre>
<p><img src="figure/10xv3.Rmd/barn-1.svg" width="672" style="display: block; margin: auto;" /></p>
<p>Great, looks like the vast majority of cells are not mixed.</p>
<pre class="r"><code>cell_species %&gt;% 
  dplyr::count(species) %&gt;% 
  mutate(proportion = n / ncol(res_mat))</code></pre>
<pre><code>#&gt; # A tibble: 3 x 3
#&gt;   species     n proportion
#&gt;   &lt;chr&gt;   &lt;int&gt;      &lt;dbl&gt;
#&gt; 1 human     488    0.478  
#&gt; 2 mixed       2    0.00196
#&gt; 3 mouse     530    0.520</code></pre>
<p>Great, only about 0.2% of cells here are doublets, which is lower than the ~1% 10x lists. Also, it seems from the plot that most “doublets” have very few UMIs. Doublet rate tends to be lower when cell concentration is lower. However, doublets can still be formed with cells from the same species, so the number of mixed species “cells” is only a lower bound of doublet rate.</p>
</div>
<div id="dimension-reduction" class="section level2">
<h2>Dimension reduction</h2>
<p>Note: <a href="https://github.com/satijalab/seurat/tree/release/3.0">Seurat 3.0</a>, which is not yet on CRAN, is used in this notebook.</p>
<pre class="r"><code>seu &lt;- CreateSeuratObject(res_mat, min.cells = 3) %&gt;% 
  NormalizeData(verbose = FALSE) %&gt;% 
  ScaleData(verbose = FALSE) %&gt;% 
  FindVariableFeatures(verbose = FALSE)</code></pre>
<pre class="r"><code># Add species to meta data
seu &lt;- AddMetaData(seu, metadata = cell_species$species, col.name = &quot;species&quot;)</code></pre>
<p>See how number of total counts and number of genes expressed are distributed.</p>
<pre class="r"><code>VlnPlot(seu, c(&quot;nCount_RNA&quot;, &quot;nFeature_RNA&quot;), group.by = &quot;species&quot;)</code></pre>
<p><img src="figure/10xv3.Rmd/vln-1.svg" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>seu &lt;- RunPCA(seu, verbose = FALSE, npcs = 30)
ElbowPlot(seu, ndims = 30)</code></pre>
<p><img src="figure/10xv3.Rmd/elbow-1.svg" width="672" style="display: block; margin: auto;" /></p>
<pre class="r"><code>DimPlot(seu, reduction = &quot;pca&quot;, pt.size = 0.5, group.by = &quot;species&quot;)</code></pre>
<p><img src="figure/10xv3.Rmd/pca-1.svg" width="672" style="display: block; margin: auto;" /></p>
<p>The first PC separates species, as expected. Here the 2 mixed species “cells” seem to be more similar to mouse cells than human cells.</p>
<pre class="r"><code>seu &lt;- RunTSNE(seu, dims = 1:20, check_duplicates = FALSE)
DimPlot(seu, reduction = &quot;tsne&quot;, pt.size = 0.5, group.by = &quot;species&quot;)</code></pre>
<p><img src="figure/10xv3.Rmd/tsne-1.svg" width="672" style="display: block; margin: auto;" /></p>
</div>
<div id="session-information" class="section level2">
<h2>Session information</h2>
<pre class="r"><code>sessionInfo()</code></pre>
<pre><code>#&gt; R version 3.5.1 (2018-07-02)
#&gt; Platform: x86_64-redhat-linux-gnu (64-bit)
#&gt; Running under: CentOS Linux 7 (Core)
#&gt; 
#&gt; Matrix products: default
#&gt; BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
#&gt; 
#&gt; locale:
#&gt;  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#&gt;  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#&gt;  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#&gt;  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#&gt;  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#&gt; [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#&gt; 
#&gt; attached base packages:
#&gt; [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
#&gt; [8] methods   base     
#&gt; 
#&gt; other attached packages:
#&gt;  [1] bindrcpp_0.2.2              Matrix_1.2-15              
#&gt;  [3] DropletUtils_1.2.2          SingleCellExperiment_1.4.1 
#&gt;  [5] SummarizedExperiment_1.12.0 DelayedArray_0.8.0         
#&gt;  [7] matrixStats_0.54.0          Biobase_2.42.0             
#&gt;  [9] GenomicRanges_1.34.0        GenomeInfoDb_1.18.1        
#&gt; [11] IRanges_2.16.0              S4Vectors_0.20.1           
#&gt; [13] BiocGenerics_0.28.0         BiocParallel_1.16.5        
#&gt; [15] forcats_0.3.0               stringr_1.4.0              
#&gt; [17] dplyr_0.7.8                 purrr_0.3.0                
#&gt; [19] readr_1.3.1                 tidyr_0.8.2                
#&gt; [21] tibble_2.0.1                ggplot2_3.1.0              
#&gt; [23] tidyverse_1.2.1             Seurat_3.0.0.9000          
#&gt; [25] data.table_1.12.0           BUSpaRse_0.99.0            
#&gt; 
#&gt; loaded via a namespace (and not attached):
#&gt;   [1] utf8_1.1.4               reticulate_1.10         
#&gt;   [3] R.utils_2.7.0            tidyselect_0.2.5        
#&gt;   [5] RSQLite_2.1.1            AnnotationDbi_1.44.0    
#&gt;   [7] htmlwidgets_1.3          grid_3.5.1              
#&gt;   [9] Rtsne_0.15               devtools_2.0.1          
#&gt;  [11] munsell_0.5.0            codetools_0.2-15        
#&gt;  [13] ica_1.0-2                future_1.11.1.1         
#&gt;  [15] withr_2.1.2              colorspace_1.4-0        
#&gt;  [17] highr_0.7                knitr_1.21              
#&gt;  [19] rstudioapi_0.9.0         ROCR_1.0-7              
#&gt;  [21] gbRd_0.4-11              listenv_0.7.0           
#&gt;  [23] labeling_0.3             Rdpack_0.10-1           
#&gt;  [25] git2r_0.23.0             GenomeInfoDbData_1.2.0  
#&gt;  [27] bit64_0.9-7              rhdf5_2.26.0            
#&gt;  [29] rprojroot_1.3-2          generics_0.0.2          
#&gt;  [31] xfun_0.4                 R6_2.3.0                
#&gt;  [33] doParallel_1.0.14        rsvd_1.0.0              
#&gt;  [35] locfit_1.5-9.1           bitops_1.0-6            
#&gt;  [37] assertthat_0.2.0         SDMTools_1.1-221        
#&gt;  [39] scales_1.0.0             gtable_0.2.0            
#&gt;  [41] npsurv_0.4-0             globals_0.12.4          
#&gt;  [43] processx_3.2.1           workflowr_1.1.1         
#&gt;  [45] rlang_0.3.1              zeallot_0.1.0           
#&gt;  [47] splines_3.5.1            rtracklayer_1.42.1      
#&gt;  [49] lazyeval_0.2.1           broom_0.5.1             
#&gt;  [51] plyranges_1.2.0          yaml_2.2.0              
#&gt;  [53] modelr_0.1.2             backports_1.1.2         
#&gt;  [55] tools_3.5.1              usethis_1.4.0           
#&gt;  [57] gplots_3.0.1.1           RColorBrewer_1.1-2      
#&gt;  [59] sessioninfo_1.1.1        ggridges_0.5.1          
#&gt;  [61] Rcpp_1.0.0               plyr_1.8.4              
#&gt;  [63] progress_1.2.0           zlibbioc_1.28.0         
#&gt;  [65] RCurl_1.95-4.11          ps_1.2.1                
#&gt;  [67] prettyunits_1.0.2        pbapply_1.4-0           
#&gt;  [69] cowplot_0.9.4            zoo_1.8-4               
#&gt;  [71] haven_2.0.0              ggrepel_0.8.0           
#&gt;  [73] cluster_2.0.7-1          fs_1.2.6                
#&gt;  [75] magrittr_1.5             lmtest_0.9-36           
#&gt;  [77] RANN_2.6.1               whisker_0.3-2           
#&gt;  [79] fitdistrplus_1.0-14      pkgload_1.0.2           
#&gt;  [81] hms_0.4.2                lsei_1.2-0              
#&gt;  [83] evaluate_0.12            XML_3.98-1.17           
#&gt;  [85] readxl_1.1.0             testthat_2.0.1          
#&gt;  [87] compiler_3.5.1           biomaRt_2.38.0          
#&gt;  [89] KernSmooth_2.23-15       crayon_1.3.4            
#&gt;  [91] R.oo_1.22.0              htmltools_0.3.6         
#&gt;  [93] lubridate_1.7.4          DBI_1.0.0               
#&gt;  [95] MASS_7.3-51.1            cli_1.0.1               
#&gt;  [97] R.methodsS3_1.7.1        gdata_2.18.0            
#&gt;  [99] metap_1.0                bindr_0.1.1             
#&gt; [101] igraph_1.2.2             pkgconfig_2.0.2         
#&gt; [103] GenomicAlignments_1.18.1 plotly_4.8.0            
#&gt; [105] xml2_1.2.0               foreach_1.4.4           
#&gt; [107] XVector_0.22.0           bibtex_0.4.2            
#&gt; [109] rvest_0.3.2              callr_3.1.1             
#&gt; [111] digest_0.6.18            tsne_0.1-3              
#&gt; [113] Biostrings_2.50.2        rmarkdown_1.11          
#&gt; [115] cellranger_1.1.0         edgeR_3.24.3            
#&gt; [117] Rsamtools_1.34.1         gtools_3.8.1            
#&gt; [119] nlme_3.1-137             jsonlite_1.6            
#&gt; [121] Rhdf5lib_1.4.0           fansi_0.4.0             
#&gt; [123] desc_1.2.0               viridisLite_0.3.0       
#&gt; [125] limma_3.38.3             pillar_1.3.1            
#&gt; [127] lattice_0.20-38          httr_1.4.0              
#&gt; [129] pkgbuild_1.0.2           survival_2.43-3         
#&gt; [131] glue_1.3.0               remotes_2.0.2           
#&gt; [133] png_0.1-7                iterators_1.0.10        
#&gt; [135] bit_1.1-14               stringi_1.3.1           
#&gt; [137] HDF5Array_1.10.1         blob_1.1.1              
#&gt; [139] caTools_1.17.1.1         memoise_1.1.0           
#&gt; [141] irlba_2.3.2              future.apply_1.1.0      
#&gt; [143] ape_5.2</code></pre>
</div>
</div>

<!-- Adjust MathJax settings so that all math formulae are shown using
TeX fonts only; see
http://docs.mathjax.org/en/latest/configuration.html.  This will make
the presentation more consistent at the cost of the webpage sometimes
taking slightly longer to load. Note that this only works because the
footer is added to webpages before the MathJax javascript. -->
<script type="text/x-mathjax-config">
  MathJax.Hub.Config({
    "HTML-CSS": { availableFonts: ["TeX"] }
  });
</script>

<hr>
<p>
  This reproducible <a href="http://rmarkdown.rstudio.com">R Markdown</a>
  analysis was created with
  <a href="https://github.com/jdblischak/workflowr">workflowr</a> 1.1.1
</p>
<hr>


</div>
</div>

</div>

<script>

// add bootstrap table styles to pandoc tables
function bootstrapStylePandocTables() {
  $('tr.header').parent('thead').parent('table').addClass('table table-condensed');
}
$(document).ready(function () {
  bootstrapStylePandocTables();
});


</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    var script = document.createElement("script");
    script.type = "text/javascript";
    script.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    document.getElementsByTagName("head")[0].appendChild(script);
  })();
</script>

</body>
</html>