<!doctype html>
<html lang="en">
|
||
<head>
  <!-- Document metadata and stylesheets for the reveal.js 3.2.0 slide deck. -->
  <meta charset="utf-8">
  <title>Druid pour l'analyse de données en temps réel</title>
  <meta name="description" content="Druid pour l'analyse de données en temps réel">
  <meta name="author" content="Yann Esposito">
  <meta name="apple-mobile-web-app-capable" content="yes">
  <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
  <!-- Zoom is deliberately left enabled (no maximum-scale / user-scalable=no)
       so that readers can enlarge the slides. -->
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <link rel="stylesheet" href="../.reveal.js-3.2.0/css/reveal.css">
  <link rel="stylesheet" href="../.reveal.js-3.2.0/css/theme/solarized.css" id="theme">
  <!-- For syntax highlighting -->
  <link rel="stylesheet" href="../.reveal.js-3.2.0/lib/css/zenburn.css">
  <!-- Print stylesheet: 'pdf' when the query string contains 'print-pdf',
       'paper' otherwise. Added through the DOM instead of document.write. -->
  <script>
    (function () {
      var sheet = window.location.search.match( /print-pdf/gi ) ? 'pdf' : 'paper';
      var link = document.createElement( 'link' );
      link.rel = 'stylesheet';
      link.type = 'text/css';
      link.media = 'print';
      link.href = '../.reveal.js-3.2.0/css/print/' + sheet + '.css';
      // getElementsByTagName keeps this working on the old browsers the
      // html5shiv fallback below is aimed at.
      document.getElementsByTagName( 'head' )[0].appendChild( link );
    })();
  </script>
  <!--[if lt IE 9]>
  <script src="../.reveal.js-3.2.0/lib/js/html5shiv.js"></script>
  <![endif]-->
</head>
|
||
<body>
|
||
|
||
<div class="reveal">
|
||
|
||
<!-- Any section element inside of this container is displayed as a slide -->
|
||
<div class="slides">
|
||
|
||
<section>
|
||
<h1>Druid pour l'analyse de données en temps réel</h1>
|
||
<h3>Yann Esposito</h3>
|
||
<h4>7 Avril 2016</h4>
|
||
</section>
|
||
|
||
|
||
<section id="intro" class="level1">
|
||
<h1>Intro</h1>
|
||
<section id="plan" class="level2">
|
||
<h2>Plan</h2>
|
||
<ul>
|
||
<li>Introduction; why?</li>
|
||
<li>How?</li>
|
||
</ul>
|
||
</section>
|
||
<section id="experience" class="level2">
|
||
<h2>Experience</h2>
|
||
<ul>
|
||
<li>Real Time Social Media Analytics</li>
|
||
</ul>
|
||
</section>
|
||
<section id="real-time" class="level2">
|
||
<h2>Real Time?</h2>
|
||
<ul>
|
||
<li>Ingestion Latency: seconds</li>
|
||
<li>Query Latency: seconds</li>
|
||
</ul>
|
||
</section>
|
||
<section id="demande" class="level2">
|
||
<h2>Demande</h2>
|
||
<ul>
|
||
<li>Twitter: <code>20k msg/s</code>, <code>1msg = 10ko</code> pendant 24h</li>
|
||
<li>Facebook public: 1000 à 2000 msg/s en continu</li>
|
||
</ul>
|
||
</section>
|
||
<section id="en-pratique" class="level2">
|
||
<h2>En pratique</h2>
|
||
<ul>
|
||
<li>Twitter: 400 msg/s en continu, pics à 1500</li>
|
||
</ul>
|
||
</section>
|
||
<section id="origine-php" class="level2">
|
||
<h2>Origine (PHP)</h2>
|
||
<p><img src="img/bad_php.jpg" alt="History" /> </p>
|
||
</section>
|
||
<section id="introduction" class="level2">
  <!-- Slide: problem statement, followed by a link to the live Pulse
       dashboard. The link opens in a new tab, so rel="noopener noreferrer"
       keeps the external page from reaching back into this window. -->
  <h2>Introduction</h2>
  <ul>
    <li>Traitement de données gros volume + faible latence</li>
    <li>Typiquement <code>pulse</code></li>
  </ul>
  <p><a href="http://pulse.vigiglo.be/#/vigiglobe/Earthquake/dashboard" target="_blank" rel="noopener noreferrer">DEMO</a></p>
</section>
|
||
<section id="pre-considerations" class="level2">
|
||
<h2>Pre Considerations</h2>
|
||
<p>Discovered vs Invented</p>
|
||
</section>
|
||
<section id="try-to-conceptualize-events" class="level2">
|
||
<h2>Try to conceptualize (events)</h2>
|
||
<p>Scalable + Real Time + Fail safe</p>
|
||
<ul>
|
||
<li>timeseries</li>
|
||
<li>alerting system</li>
|
||
<li>top N</li>
|
||
<li>etc...</li>
|
||
</ul>
|
||
</section>
|
||
<section id="in-the-end" class="level2">
|
||
<h2>In the End</h2>
|
||
<p>Druid concepts are always emerging naturally</p>
|
||
</section>
|
||
</section>
|
||
<section id="druid" class="level1">
|
||
<h1>Druid</h1>
|
||
<section id="who" class="level2">
|
||
<h2>Who</h2>
|
||
<p>Metamarkets</p>
|
||
</section>
|
||
<section id="goal" class="level2">
|
||
<h2>Goal</h2>
|
||
<blockquote>
|
||
<p>Druid is an open source store designed for real-time exploratory analytics on large data sets.</p>
|
||
</blockquote>
|
||
<blockquote>
|
||
<p>hosted dashboard that would allow users to arbitrarily explore and visualize event streams.</p>
|
||
</blockquote>
|
||
</section>
|
||
<section id="concepts" class="level2">
|
||
<h2>Concepts</h2>
|
||
<ul>
|
||
<li>Column-oriented storage layout</li>
|
||
<li>distributed, shared-nothing architecture</li>
|
||
<li>advanced indexing structure</li>
|
||
</ul>
|
||
</section>
|
||
<section id="features" class="level2">
|
||
<h2>Features</h2>
|
||
<ul>
|
||
<li>fast aggregations</li>
|
||
<li>flexible filters</li>
|
||
<li>low latency data ingestion</li>
|
||
</ul>
|
||
<p><strong>arbitrary exploration of billion-row tables with sub-second latencies</strong></p>
|
||
</section>
|
||
<section id="storage" class="level2">
|
||
<h2>Storage</h2>
|
||
<ul>
|
||
<li>Columnar</li>
|
||
<li>Inverted Index</li>
|
||
<li>Immutable Segments</li>
|
||
</ul>
|
||
</section>
|
||
<section id="columnar-storage" class="level2">
|
||
<h2>Columnar Storage</h2>
|
||
</section>
|
||
<section id="index" class="level2">
|
||
<h2>Index</h2>
|
||
<ul>
|
||
<li>Values are dictionary encoded</li>
|
||
</ul>
|
||
<p><code>{"USA" 1, "Canada" 2, "Mexico" 3, ...}</code></p>
|
||
<ul>
|
||
<li>Bitmap for every dimension value (used by filters)</li>
|
||
</ul>
|
||
<p><code>"USA" -> [0 1 0 0 1 1 0 0 0]</code></p>
|
||
<ul>
|
||
<li>Column values (used by aggregation queries)</li>
|
||
</ul>
|
||
<p><code>[2,1,3,15,1,1,2,8,7]</code></p>
|
||
</section>
|
||
<section id="data-segments" class="level2">
|
||
<h2>Data Segments</h2>
|
||
<ul>
|
||
<li>Per time interval</li>
|
||
<li>skip segments when querying</li>
|
||
<li>Immutable</li>
|
||
<li>Cache friendly</li>
|
||
<li>No locking</li>
|
||
<li>Versioned</li>
|
||
<li>No locking</li>
|
||
<li>Read-write concurrency</li>
|
||
</ul>
|
||
</section>
|
||
<section id="real-time-ingestion" class="level2">
|
||
<h2>Real-time ingestion</h2>
|
||
<ul>
|
||
<li>Via Real-Time Node and Firehose</li>
|
||
<li>No redundancy or HA, thus not recommended</li>
|
||
<li>Via Indexing Service and Tranquility API</li>
|
||
<li>Core API</li>
|
||
<li>Integration with Streaming Frameworks</li>
|
||
<li>HTTP Server</li>
|
||
<li><strong>Kafka Consumer</strong></li>
|
||
</ul>
|
||
</section>
|
||
<section id="batch-ingestion" class="level2">
|
||
<h2>Batch Ingestion</h2>
|
||
<ul>
|
||
<li>File based (HDFS, S3, ...)</li>
|
||
</ul>
|
||
</section>
|
||
<section id="real-time-ingestion-1" class="level2">
|
||
<h2>Real-time Ingestion</h2>
|
||
<pre><code>Task 1: [ Interval ][ Window ]
|
||
Task 2: [ ]
|
||
--------------------------------------->
|
||
time</code></pre>
|
||
<p>Minimum indexing slots =<br />
|
||
Data Sources × Partitions × Replicas × 2</p>
|
||
</section>
|
||
</section>
|
||
<section id="querying" class="level1">
|
||
<h1>Querying</h1>
|
||
<section id="query-types" class="level2">
|
||
<h2>Query types</h2>
|
||
<ul>
|
||
<li>Group by: group by multiple dimensions</li>
|
||
<li>Top N: like grouping by a single dimension</li>
|
||
<li>Timeseries: without grouping over dimensions</li>
|
||
<li>Search: Dimensions lookup</li>
|
||
<li>Time Boundary: Find available data timeframe</li>
|
||
<li>Metadata queries</li>
|
||
</ul>
|
||
</section>
|
||
<section id="tip" class="level2">
|
||
<h2>Tip</h2>
|
||
<ul>
|
||
<li>Prefer <code>topN</code> over <code>groupBy</code></li>
|
||
<li>Prefer <code>timeseries</code> over <code>topN</code></li>
|
||
<li>Use limits (and priorities)</li>
|
||
</ul>
|
||
</section>
|
||
<section id="query-spec" class="level2">
|
||
<h2>Query Spec</h2>
|
||
<ul>
|
||
<li>Data source</li>
|
||
<li>Dimensions</li>
|
||
<li>Interval</li>
|
||
<li>Filters</li>
|
||
<li>Aggregations</li>
|
||
<li>Post Aggregations</li>
|
||
<li>Granularity</li>
|
||
<li>Context (query configuration)</li>
|
||
<li>Limit</li>
|
||
</ul>
|
||
</section>
|
||
<section id="examples" class="level2">
|
||
<h2>Example(s)</h2>
|
||
<p>TODO</p>
|
||
</section>
|
||
<section id="caching" class="level2">
|
||
<h2>Caching</h2>
|
||
<ul>
|
||
<li>Historical node level</li>
|
||
<li>By segment</li>
|
||
<li>Broker Level</li>
|
||
<li>By segment and query</li>
|
||
<li><code>groupBy</code> is disabled on purpose!</li>
|
||
<li>By default - local caching</li>
|
||
</ul>
|
||
</section>
|
||
<section id="load-rules" class="level2">
|
||
<h2>Load Rules</h2>
|
||
<ul>
|
||
<li>Can be defined</li>
|
||
<li>What can be set</li>
|
||
</ul>
|
||
</section>
|
||
</section>
|
||
<section id="components" class="level1">
|
||
<h1>Components</h1>
|
||
<section id="druid-components" class="level2">
|
||
<h2>Druid Components</h2>
|
||
<ul>
|
||
<li>Real-time Nodes</li>
|
||
<li>Historical Nodes</li>
|
||
<li>Broker Nodes</li>
|
||
<li>Coordinator</li>
|
||
<li>For indexing:</li>
|
||
<li>Overlord</li>
|
||
<li><p>Middle Manager</p></li>
|
||
<li>Deep Storage</li>
|
||
<li><p>Metadata Storage</p></li>
|
||
<li>Load Balancer</li>
|
||
<li><p>Cache</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="coordinator" class="level2">
|
||
<h2>Coordinator</h2>
|
||
<p>Manage Segments</p>
|
||
</section>
|
||
<section id="real-time-nodes" class="level2">
|
||
<h2>Real-time Nodes</h2>
|
||
<ul>
|
||
<li>Pulling data in real-time</li>
|
||
<li>Indexing it</li>
|
||
</ul>
|
||
</section>
|
||
<section id="historical-nodes" class="level2">
|
||
<h2>Historical Nodes</h2>
|
||
<ul>
|
||
<li>Keep historical segments</li>
|
||
</ul>
|
||
</section>
|
||
<section id="overlord" class="level2">
|
||
<h2>Overlord</h2>
|
||
<ul>
|
||
<li>Accepts tasks and distributes them to middle manager</li>
|
||
</ul>
|
||
</section>
|
||
<section id="middle-manager" class="level2">
|
||
<h2>Middle Manager</h2>
|
||
<ul>
|
||
<li>Execute submitted tasks via Peons</li>
|
||
</ul>
|
||
</section>
|
||
<section id="broker-nodes" class="level2">
|
||
<h2>Broker Nodes</h2>
|
||
<ul>
|
||
<li>Route query to Real-time and Historical nodes</li>
|
||
<li>Merge results</li>
|
||
</ul>
|
||
</section>
|
||
<section id="deep-storage" class="level2">
|
||
<h2>Deep Storage</h2>
|
||
<ul>
|
||
<li>Segments backup (HDFS, S3, ...)</li>
|
||
</ul>
|
||
</section>
|
||
</section>
|
||
<section id="considerations-tools" class="level1">
|
||
<h1>Considerations &amp; Tools</h1>
|
||
<section id="when-not-to-choose-druid" class="level2">
|
||
<h2>When <em>not</em> to choose Druid</h2>
|
||
<ul>
|
||
<li>Data is not time-series</li>
|
||
<li>Cardinality is <em>very</em> high</li>
|
||
<li>Number of dimensions is high</li>
|
||
<li>Setup cost must be avoided</li>
|
||
</ul>
|
||
</section>
|
||
<section id="graphite-metrics" class="level2">
|
||
<h2>Graphite (metrics)</h2>
|
||
<p><img src="img/graphite.png" alt="Graphite" /> </p>
|
||
<p><a href="http://graphite.wikidot.com">Graphite</a></p>
|
||
</section>
|
||
<section id="pivot-exploring-data" class="level2">
|
||
<h2>Pivot (exploring data)</h2>
|
||
<p><img src="img/pivot.gif" alt="Pivot" /> </p>
|
||
<p><a href="https://github.com/implydata/pivot">Pivot</a></p>
|
||
</section>
|
||
<section id="caravel-exploring-data" class="level2">
|
||
<h2>Caravel (exploring data)</h2>
|
||
<p><img src="img/caravel.png" alt="caravel" /> </p>
|
||
<p><a href="https://github.com/airbnb/caravel">Caravel</a></p>
|
||
</section>
|
||
</section>
|
||
</div>
|
||
|
||
<script src="../.reveal.js-3.2.0/lib/js/head.min.js"></script>
|
||
<script src="../.reveal.js-3.2.0/js/reveal.js"></script>
|
||
|
||
<script>
  // Start the slide deck once reveal.js (loaded by the preceding script tags)
  // is available.
  // Full list of configuration options available here:
  // https://github.com/hakimel/reveal.js#configuration
  Reveal.initialize({
    controls: true,
    progress: true,
    history: true,
    center: false,

    // available themes are in /css/theme
    // A `theme` value in the query string takes precedence over the default.
    theme: Reveal.getQueryHash().theme || 'solarized',

    // default/cube/page/concave/zoom/linear/fade/none
    // A `transition` value in the query string takes precedence over the default.
    transition: Reveal.getQueryHash().transition || 'linear',

    // Optional libraries used to extend on reveal.js.
    // Paths use the same '../.reveal.js-3.2.0/' prefix as the stylesheet and
    // script tags elsewhere in this file.
    dependencies: [
      // Only loaded when the browser has no native classList support.
      { src: '../.reveal.js-3.2.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },
      // Markdown support is only loaded when a [data-markdown] element exists.
      { src: '../.reveal.js-3.2.0/plugin/markdown/showdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
      { src: '../.reveal.js-3.2.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
      { src: '../.reveal.js-3.2.0/plugin/highlight/highlight.js', async: true, callback: function() { hljs.initHighlightingOnLoad(); } },
      // The zoom and notes plugins are only loaded when classList is available.
      { src: '../.reveal.js-3.2.0/plugin/zoom-js/zoom.js', async: true, condition: function() { return !!document.body.classList; } },
      { src: '../.reveal.js-3.2.0/plugin/notes/notes.js', async: true, condition: function() { return !!document.body.classList; } }
    ]
  });
</script>
|
||
|
||
</body>
|
||
</html>
|