latest commit
This commit is contained in:
parent
3602f53cfc
commit
585d84325c
17 changed files with 584 additions and 506 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -3,3 +3,4 @@ build/
|
|||
compile
|
||||
*.hi
|
||||
*.o
|
||||
.DS_Store
|
||||
|
|
|
@ -84,7 +84,8 @@ body {
|
|||
|
||||
.reveal strong,
|
||||
.reveal b {
|
||||
font-weight: bold; }
|
||||
font-weight: bold;
|
||||
color: #b58900; }
|
||||
|
||||
.reveal em {
|
||||
font-style: italic; }
|
||||
|
|
Binary file not shown.
BIN
README.pdf
BIN
README.pdf
Binary file not shown.
Binary file not shown.
304
druid/druid.html
304
druid/druid.html
|
@ -60,52 +60,43 @@
|
|||
</ul></li>
|
||||
<li><a href="#data">Data</a><ul>
|
||||
<li><a href="#concepts-1">Concepts</a></li>
|
||||
<li><a href="#indexing">Indexing</a></li>
|
||||
<li><a href="#loading">Loading</a></li>
|
||||
<li><a href="#querying">Querying</a></li>
|
||||
<li><a href="#segments">Segments</a></li>
|
||||
</ul></li>
|
||||
<li><a href="#roll-up">Roll-up</a><ul>
|
||||
<li><a href="#example">Example</a></li>
|
||||
<li><a href="#as-sql">as SQL</a></li>
|
||||
</ul></li>
|
||||
<li><a href="#sharding">Sharding</a><ul>
|
||||
<li><a href="#segments">Segments</a></li>
|
||||
<li><a href="#segments-1">Segments</a><ul>
|
||||
<li><a href="#sharding">Sharding</a></li>
|
||||
<li><a href="#core-data-structure">Core Data Structure</a></li>
|
||||
<li><a href="#dictionary">Dictionary</a></li>
|
||||
<li><a href="#columnn-data">Columnn Data</a></li>
|
||||
<li><a href="#bitmaps">Bitmaps</a></li>
|
||||
</ul></li>
|
||||
<li><a href="#data-1">Data</a><ul>
|
||||
<li><a href="#indexing-data">Indexing Data</a></li>
|
||||
<li><a href="#loading-data">Loading data</a></li>
|
||||
<li><a href="#querying-the-data">Querying the data</a></li>
|
||||
<li><a href="#columnar-storage">Columnar Storage</a></li>
|
||||
<li><a href="#index">Index</a></li>
|
||||
<li><a href="#data-segments">Data Segments</a></li>
|
||||
<li><a href="#example-1">Example</a></li>
|
||||
<li><a href="#example-multiple-matches">Example (multiple matches)</a></li>
|
||||
<li><a href="#real-time-ingestion">Real-time ingestion</a></li>
|
||||
<li><a href="#batch-ingestion">Batch Ingestion</a></li>
|
||||
<li><a href="#real-time-ingestion-1">Real-time Ingestion</a></li>
|
||||
</ul></li>
|
||||
<li><a href="#querying">Querying</a><ul>
|
||||
<li><a href="#querying-1">Querying</a><ul>
|
||||
<li><a href="#query-types">Query types</a></li>
|
||||
<li><a href="#tip">Tip</a></li>
|
||||
<li><a href="#query-spec">Query Spec</a></li>
|
||||
<li><a href="#examples">Example(s)</a></li>
|
||||
<li><a href="#result">Result</a></li>
|
||||
<li><a href="#caching">Caching</a></li>
|
||||
<li><a href="#load-rules">Load Rules</a></li>
|
||||
</ul></li>
|
||||
<li><a href="#components">Components</a><ul>
|
||||
<li><a href="#druid-components">Druid Components</a></li>
|
||||
<li><a href="#druid-components">Druid Components</a><ul>
|
||||
<li><a href="#druid-1">Druid</a></li>
|
||||
<li><a href="#also">Also</a></li>
|
||||
<li><a href="#coordinator">Coordinator</a></li>
|
||||
<li><a href="#real-time-nodes">Real-time Nodes</a></li>
|
||||
<li><a href="#historical-nodes">Historical Nodes</a></li>
|
||||
<li><a href="#overlord">Overlord</a></li>
|
||||
<li><a href="#middle-manager">Middle Manager</a></li>
|
||||
<li><a href="#broker-nodes">Broker Nodes</a></li>
|
||||
<li><a href="#deep-storage">Deep Storage</a></li>
|
||||
</ul></li>
|
||||
<li><a href="#considerations-tools">Considerations & Tools</a><ul>
|
||||
<li><a href="#when-not-to-choose-druid">When <em>not</em> to choose Druid</a></li>
|
||||
<li><a href="#graphite-metrics">Graphite (metrics)</a></li>
|
||||
<li><a href="#pivot-exploring-data">Pivot (exploring data)</a></li>
|
||||
<li><a href="#caravel-exploring-data">Caravel (exploring data)</a></li>
|
||||
<li><a href="#caravel">Caravel</a></li>
|
||||
<li><a href="#conclusions">Conclusions</a><ul>
|
||||
<li><a href="#precompute-your-time-series">Precompute your time series?</a></li>
|
||||
<li><a href="#dont-reinvent-it">Don’t reinvent it</a></li>
|
||||
<li><a href="#druid-way-is-the-right-way">Druid way is the right way!</a></li>
|
||||
</ul></li>
|
||||
</ul>
|
||||
</nav>
|
||||
|
@ -155,7 +146,7 @@
|
|||
<li>Manually coded HyperLogLog in js</li>
|
||||
</ul>
|
||||
<h1 id="return-of-experience">Return of Experience</h1>
|
||||
<p><img src="img/mongoDB.png" alt="MongoDB the destroyer" /> </p>
|
||||
<p><img src="img/MongoDB.png" alt="MongoDB the destroyer" /> </p>
|
||||
<h1 id="return-of-experience-1">Return of Experience</h1>
|
||||
<ul>
|
||||
<li>Ingestion still in PHP (600 msg/s max)</li>
|
||||
|
@ -294,88 +285,24 @@
|
|||
<li><strong>Dimension columns</strong>: strings (used to filter or to group)</li>
|
||||
<li><strong>Metric columns</strong>: used for aggregations (count, sum, mean, etc…)</li>
|
||||
</ul>
|
||||
<h1 id="roll-up">Roll-up</h1>
|
||||
<h2 id="example">Example</h2>
|
||||
<pre><code>timestamp page ... added deleted
|
||||
2011-01-01T00:01:35Z Justin Bieber 10 65
|
||||
2011-01-01T00:03:63Z Justin Bieber 15 62
|
||||
2011-01-01T01:04:51Z Justin Bieber 32 45
|
||||
2011-01-01T01:01:00Z Ke$ha 17 87
|
||||
2011-01-01T01:02:00Z Ke$ha 43 99
|
||||
2011-01-01T02:03:00Z Ke$ha 12 53</code></pre>
|
||||
<pre><code>timestamp page ... nb added deleted
|
||||
2011-01-01T00:00:00Z Justin Bieber 2 25 127
|
||||
2011-01-01T01:00:00Z Justin Bieber 1 32 45
|
||||
2011-01-01T01:00:00Z Ke$ha 2 60 186
|
||||
2011-01-01T02:00:00Z Ke$ha 1 12 53</code></pre>
|
||||
<h2 id="as-sql">as SQL</h2>
|
||||
<pre><code>GROUP BY timestamp, page, nb, added, deleted
|
||||
:: nb = COUNT(1)
|
||||
, added = SUM(added)
|
||||
, deleted = SUM(deleted)</code></pre>
|
||||
<p>In practice can dramatically reduce the size (up to x100)</p>
|
||||
<h1 id="sharding">Sharding</h1>
|
||||
<h2 id="segments">Segments</h2>
|
||||
<p><small><code>sampleData_2011-01-01T01:00:00:00Z_2011-01-01T02:00:00:00Z_v1_0</code></small></p>
|
||||
<pre><code>2011-01-01T01:00:00Z Justin Bieber 1 20 45
|
||||
2011-01-01T01:00:00Z Ke$ha 1 30 106</code></pre>
|
||||
<p><small><code>sampleData_2011-01-01T01:00:00:00Z_2011-01-01T02:00:00:00Z_v1_0</code></small></p>
|
||||
<pre><code>2011-01-01T01:00:00Z Justin Bieber 1 12 45
|
||||
2011-01-01T01:00:00Z Ke$ha 2 30 80</code></pre>
|
||||
<h2 id="core-data-structure">Core Data Structure</h2>
|
||||
<p><img src="img/druid-column-types.png" alt="Segment" /> </p>
|
||||
<ul>
|
||||
<li>dictionary</li>
|
||||
<li>a bitmap for each value</li>
|
||||
<li>a list of the columns values encoded using the dictionary</li>
|
||||
</ul>
|
||||
<h2 id="dictionary">Dictionary</h2>
|
||||
<pre><code>{ "Justin Bieber": 0
|
||||
, "Ke$ha": 1
|
||||
}</code></pre>
|
||||
<h2 id="columnn-data">Columnn Data</h2>
|
||||
<pre><code>[ 0
|
||||
, 0
|
||||
, 1
|
||||
, 1
|
||||
]</code></pre>
|
||||
<h2 id="bitmaps">Bitmaps</h2>
|
||||
<p>one for each value of the column</p>
|
||||
<pre><code>value="Justin Bieber": [1,1,0,0]
|
||||
value="Ke$ha": [0,0,1,1]</code></pre>
|
||||
<h1 id="data-1">Data</h1>
|
||||
<h2 id="indexing-data">Indexing Data</h2>
|
||||
<h2 id="indexing">Indexing</h2>
|
||||
<ul>
|
||||
<li>Immutable snapshots of data</li>
|
||||
<li>data structure highly optimized for analytic queries</li>
|
||||
<li>Each column is stored separately</li>
|
||||
<li>Indexes data on a per shard (segment) level</li>
|
||||
</ul>
|
||||
<h2 id="loading-data">Loading data</h2>
|
||||
<h2 id="loading">Loading</h2>
|
||||
<ul>
|
||||
<li>Real-Time</li>
|
||||
<li>Batch</li>
|
||||
</ul>
|
||||
<h2 id="querying-the-data">Querying the data</h2>
|
||||
<h2 id="querying">Querying</h2>
|
||||
<ul>
|
||||
<li>JSON over HTTP</li>
|
||||
<li>Single Table Operations, no joins.</li>
|
||||
</ul>
|
||||
<h2 id="columnar-storage">Columnar Storage</h2>
|
||||
<h2 id="index">Index</h2>
|
||||
<ul>
|
||||
<li>Values are dictionary encoded</li>
|
||||
</ul>
|
||||
<p><code>{"USA" 1, "Canada" 2, "Mexico" 3, ...}</code></p>
|
||||
<ul>
|
||||
<li>Bitmap for every dimension value (used by filters)</li>
|
||||
</ul>
|
||||
<p><code>"USA" -> [0 1 0 0 1 1 0 0 0]</code></p>
|
||||
<ul>
|
||||
<li>Column values (used by aggregation queries)</li>
|
||||
</ul>
|
||||
<p><code>[2,1,3,15,1,1,2,8,7]</code></p>
|
||||
<h2 id="data-segments">Data Segments</h2>
|
||||
<h2 id="segments">Segments</h2>
|
||||
<ul>
|
||||
<li>Per time interval
|
||||
<ul>
|
||||
|
@ -392,6 +319,61 @@ value="Ke$ha": [0,0,1,1]</code></pre>
|
|||
<li>Read-write concurrency</li>
|
||||
</ul></li>
|
||||
</ul>
|
||||
<h1 id="roll-up">Roll-up</h1>
|
||||
<h2 id="example">Example</h2>
|
||||
<pre><code>timestamp page ... added deleted
|
||||
2011-01-01T00:01:35Z Cthulhu 10 65
|
||||
2011-01-01T00:03:63Z Cthulhu 15 62
|
||||
2011-01-01T01:04:51Z Cthulhu 32 45
|
||||
2011-01-01T01:01:00Z Azatoth 17 87
|
||||
2011-01-01T01:02:00Z Azatoth 43 99
|
||||
2011-01-01T02:03:00Z Azatoth 12 53</code></pre>
|
||||
<pre><code>timestamp page ... nb added deleted
|
||||
2011-01-01T00:00:00Z Cthulhu 2 25 127
|
||||
2011-01-01T01:00:00Z Cthulhu 1 32 45
|
||||
2011-01-01T01:00:00Z Azatoth 2 60 186
|
||||
2011-01-01T02:00:00Z Azatoth 1 12 53</code></pre>
|
||||
<h2 id="as-sql">as SQL</h2>
|
||||
<pre><code>GROUP BY timestamp, page, nb, added, deleted
|
||||
:: nb = COUNT(1)
|
||||
, added = SUM(added)
|
||||
, deleted = SUM(deleted)</code></pre>
|
||||
<p>In practice can dramatically reduce the size (up to x100)</p>
|
||||
<h1 id="segments-1">Segments</h1>
|
||||
<h2 id="sharding">Sharding</h2>
|
||||
<p><small><code>sampleData_2011-01-01T01:00:00:00Z_2011-01-01T02:00:00:00Z_v1_0</code></small></p>
|
||||
<pre><code>timestamp page ... nb added deleted
|
||||
2011-01-01T01:00:00Z Cthulhu 1 20 45
|
||||
2011-01-01T01:00:00Z Azatoth 1 30 106</code></pre>
|
||||
<p><small><code>sampleData_2011-01-01T01:00:00:00Z_2011-01-01T02:00:00:00Z_v1_1</code></small></p>
|
||||
<pre><code>timestamp page ... nb added deleted
|
||||
2011-01-01T01:00:00Z Cthulhu 1 12 45
|
||||
2011-01-01T01:00:00Z Azatoth 2 30 80</code></pre>
|
||||
<h2 id="core-data-structure">Core Data Structure</h2>
|
||||
<p><img src="img/druid-column-types.png" alt="Segment" /> </p>
|
||||
<ul>
|
||||
<li>dictionary</li>
|
||||
<li>a bitmap for each value</li>
|
||||
<li>a list of the columns values encoded using the dictionary</li>
|
||||
</ul>
|
||||
<h2 id="example-1">Example</h2>
|
||||
<pre><code>dictionary: { "Cthulhu": 0
|
||||
, "Azatoth": 1 }
|
||||
|
||||
column data: [0, 0, 1, 1]
|
||||
|
||||
bitmaps (one for each value of the column):
|
||||
value="Cthulhu": [1,1,0,0]
|
||||
value="Azatoth": [0,0,1,1]</code></pre>
|
||||
<h2 id="example-multiple-matches">Example (multiple matches)</h2>
|
||||
<pre><code>dictionary: { "Cthulhu": 0
|
||||
, "Azatoth": 1 }
|
||||
|
||||
column data: [0, [0,1], 1, 1]
|
||||
|
||||
bitmaps (one for each value of the column):
|
||||
value="Cthulhu": [1,1,0,0]
|
||||
value="Azatoth": [0,1,1,1]</code></pre>
|
||||
<h2 id="real-time-ingestion">Real-time ingestion</h2>
|
||||
<ul>
|
||||
<li>Via Real-Time Node and Firehose
|
||||
|
@ -411,13 +393,11 @@ value="Ke$ha": [0,0,1,1]</code></pre>
|
|||
<li>File based (HDFS, S3, …)</li>
|
||||
</ul>
|
||||
<h2 id="real-time-ingestion-1">Real-time Ingestion</h2>
|
||||
<pre><code>Task 1: [ Interval ][ Window ]
|
||||
Task 2: [ ]
|
||||
--------------------------------------->
|
||||
time</code></pre>
|
||||
<p>Minimum indexing slots =<br />
|
||||
Data Sources × Partitions × Replicas × 2</p>
|
||||
<h1 id="querying">Querying</h1>
|
||||
<pre><code>Task 1: [ Interval ][ Window ]
|
||||
Task 2: [ ]
|
||||
----------------------------------------------------->
|
||||
time</code></pre>
|
||||
<h1 id="querying-1">Querying</h1>
|
||||
<h2 id="query-types">Query types</h2>
|
||||
<ul>
|
||||
<li>Group by: group by multiple dimensions</li>
|
||||
|
@ -427,26 +407,27 @@ Task 2: [ ]
|
|||
<li>Time Boundary: Find available data timeframe</li>
|
||||
<li>Metadata queries</li>
|
||||
</ul>
|
||||
<h2 id="tip">Tip</h2>
|
||||
<ul>
|
||||
<li>Prefer <code>topN</code> over <code>groupBy</code></li>
|
||||
<li>Prefer <code>timeseries</code> over <code>topN</code></li>
|
||||
<li>Use limits (and priorities)</li>
|
||||
</ul>
|
||||
<h2 id="query-spec">Query Spec</h2>
|
||||
<ul>
|
||||
<li>Data source</li>
|
||||
<li>Dimensions</li>
|
||||
<li>Interval</li>
|
||||
<li>Filters</li>
|
||||
<li>Aggregations</li>
|
||||
<li>Post Aggregations</li>
|
||||
<li>Granularity</li>
|
||||
<li>Context (query configuration)</li>
|
||||
<li>Limit</li>
|
||||
</ul>
|
||||
<h2 id="examples">Example(s)</h2>
|
||||
<p>TODO</p>
|
||||
<pre><code>{"queryType": "groupBy",
|
||||
"dataSource": "druidtest",
|
||||
"granularity": "all",
|
||||
"dimensions": [],
|
||||
"aggregations": [
|
||||
{"type": "count", "name": "rows"},
|
||||
{"type": "longSum", "name": "imps", "fieldName": "impressions"},
|
||||
{"type": "doubleSum", "name": "wp", "fieldName": "wp"}
|
||||
],
|
||||
"intervals": ["2010-01-01T00:00/2020-01-01T00"]}</code></pre>
|
||||
<h2 id="result">Result</h2>
|
||||
<pre><code>[ {
|
||||
"version" : "v1",
|
||||
"timestamp" : "2010-01-01T00:00:00.000Z",
|
||||
"event" : {
|
||||
"imps" : 5,
|
||||
"wp" : 15000.0,
|
||||
"rows" : 5
|
||||
}
|
||||
} ]</code></pre>
|
||||
<h2 id="caching">Caching</h2>
|
||||
<ul>
|
||||
<li>Historical node level
|
||||
|
@ -458,15 +439,10 @@ Task 2: [ ]
|
|||
<li>By segment and query</li>
|
||||
<li><code>groupBy</code> is disabled on purpose!</li>
|
||||
</ul></li>
|
||||
<li>By default - local caching</li>
|
||||
<li>By default: local caching</li>
|
||||
</ul>
|
||||
<h2 id="load-rules">Load Rules</h2>
|
||||
<ul>
|
||||
<li>Can be defined</li>
|
||||
<li>What can be set</li>
|
||||
</ul>
|
||||
<h1 id="components">Components</h1>
|
||||
<h2 id="druid-components">Druid Components</h2>
|
||||
<h1 id="druid-components">Druid Components</h1>
|
||||
<h2 id="druid-1">Druid</h2>
|
||||
<ul>
|
||||
<li>Real-time Nodes</li>
|
||||
<li>Historical Nodes</li>
|
||||
|
@ -477,56 +453,60 @@ Task 2: [ ]
|
|||
<li>Overlord</li>
|
||||
<li>Middle Manager</li>
|
||||
</ul></li>
|
||||
<li>Deep Storage</li>
|
||||
<li><p>Metadata Storage</p></li>
|
||||
</ul>
|
||||
<h2 id="also">Also</h2>
|
||||
<ul>
|
||||
<li>Deep Storage (S3, HDFS, …)</li>
|
||||
<li>Metadata Storage (SQL)</li>
|
||||
<li>Load Balancer</li>
|
||||
<li><p>Cache</p></li>
|
||||
<li>Cache</li>
|
||||
</ul>
|
||||
<h2 id="coordinator">Coordinator</h2>
|
||||
<p>Manage Segments</p>
|
||||
<h2 id="real-time-nodes">Real-time Nodes</h2>
|
||||
<ul>
|
||||
<li>Pulling data in real-time</li>
|
||||
<li>Indexing it</li>
|
||||
</ul>
|
||||
<h2 id="historical-nodes">Historical Nodes</h2>
|
||||
<li>Real-time Nodes (pull data, index it)</li>
|
||||
<li>Historical Nodes (keep old segments)</li>
|
||||
<li>Broker Nodes (route queries to RT & Hist. nodes, merge)</li>
|
||||
<li>Coordinator (manage segments)</li>
|
||||
<li>For indexing:
|
||||
<ul>
|
||||
<li>Keep historical segments</li>
|
||||
<li>Overlord (distribute task to the middle manager)</li>
|
||||
<li>Middle Manager (execute tasks via Peons)</li>
|
||||
</ul></li>
|
||||
</ul>
|
||||
<h2 id="overlord">Overlord</h2>
|
||||
<ul>
|
||||
<li>Accepts tasks and distributes them to middle manager</li>
|
||||
</ul>
|
||||
<h2 id="middle-manager">Middle Manager</h2>
|
||||
<ul>
|
||||
<li>Execute submitted tasks via Peons</li>
|
||||
</ul>
|
||||
<h2 id="broker-nodes">Broker Nodes</h2>
|
||||
<ul>
|
||||
<li>Route query to Real-time and Historical nodes</li>
|
||||
<li>Merge results</li>
|
||||
</ul>
|
||||
<h2 id="deep-storage">Deep Storage</h2>
|
||||
<ul>
|
||||
<li>Segments backup (HDFS, S3, …)</li>
|
||||
</ul>
|
||||
<h1 id="considerations-tools">Considerations & Tools</h1>
|
||||
<h2 id="when-not-to-choose-druid">When <em>not</em> to choose Druid</h2>
|
||||
<h1 id="when-not-to-choose-druid">When <em>not</em> to choose Druid</h1>
|
||||
<ul>
|
||||
<li>Data is not time-series</li>
|
||||
<li>Cardinality is <em>very</em> high</li>
|
||||
<li>Number of dimensions is high</li>
|
||||
<li>Setup cost must be avoided</li>
|
||||
</ul>
|
||||
<h2 id="graphite-metrics">Graphite (metrics)</h2>
|
||||
<h1 id="graphite-metrics">Graphite (metrics)</h1>
|
||||
<p><img src="img/graphite.png" alt="Graphite" /> </p>
|
||||
<p><a href="http://graphite.wikidot.com">Graphite</a></p>
|
||||
<h2 id="pivot-exploring-data">Pivot (exploring data)</h2>
|
||||
<h1 id="pivot-exploring-data">Pivot (exploring data)</h1>
|
||||
<p><img src="img/pivot.gif" alt="Pivot" /> </p>
|
||||
<p><a href="https://github.com/implydata/pivot">Pivot</a></p>
|
||||
<h2 id="caravel-exploring-data">Caravel (exploring data)</h2>
|
||||
<h1 id="caravel">Caravel</h1>
|
||||
<p><img src="img/caravel.png" alt="caravel" /> </p>
|
||||
<p><a href="https://github.com/airbnb/caravel">Caravel</a></p>
|
||||
<h1 id="conclusions">Conclusions</h1>
|
||||
<h2 id="precompute-your-time-series">Precompute your time series?</h2>
|
||||
<p><img src="img/wrong.jpg" alt="You’re doing it wrong" /> </p>
|
||||
<h2 id="dont-reinvent-it">Don’t reinvent it</h2>
|
||||
<ul>
|
||||
<li>need a user facing API</li>
|
||||
<li>need time series on many dimensions</li>
|
||||
<li>need real-time</li>
|
||||
<li>big volume of data</li>
|
||||
</ul>
|
||||
<h2 id="druid-way-is-the-right-way">Druid way is the right way!</h2>
|
||||
<ol type="1">
|
||||
<li>Push in kafka</li>
|
||||
<li>Add the right dimensions</li>
|
||||
<li>Push in druid</li>
|
||||
<li>???</li>
|
||||
<li>Profit!</li>
|
||||
</ol>
|
||||
<div id="footer">
|
||||
<a href="https://yannesposito.com">Y</a>
|
||||
</div>
|
||||
|
|
292
druid/druid.md
292
druid/druid.md
|
@ -57,7 +57,7 @@ date: 7 Avril 2016
|
|||
|
||||
# Return of Experience
|
||||
|
||||
![MongoDB the destroyer](img/mongoDB.png)\
|
||||
![MongoDB the destroyer](img/MongoDB.png)\
|
||||
|
||||
# Return of Experience
|
||||
|
||||
|
@ -214,26 +214,54 @@ Store data in custom column format highly optimized for aggregation & filter.
|
|||
- **Dimension columns**: strings (used to filter or to group)
|
||||
- **Metric columns**: used for aggregations (count, sum, mean, etc...)
|
||||
|
||||
## Indexing
|
||||
|
||||
- Immutable snapshots of data
|
||||
- data structure highly optimized for analytic queries
|
||||
- Each column is stored separately
|
||||
- Indexes data on a per shard (segment) level
|
||||
|
||||
## Loading
|
||||
|
||||
- Real-Time
|
||||
- Batch
|
||||
|
||||
## Querying
|
||||
|
||||
- JSON over HTTP
|
||||
- Single Table Operations, no joins.
|
||||
|
||||
## Segments
|
||||
|
||||
- Per time interval
|
||||
- skip segments when querying
|
||||
- Immutable
|
||||
- Cache friendly
|
||||
- No locking
|
||||
- Versioned
|
||||
- No locking
|
||||
- Read-write concurrency
|
||||
|
||||
# Roll-up
|
||||
|
||||
## Example
|
||||
|
||||
~~~
|
||||
timestamp page ... added deleted
|
||||
2011-01-01T00:01:35Z Justin Bieber 10 65
|
||||
2011-01-01T00:03:63Z Justin Bieber 15 62
|
||||
2011-01-01T01:04:51Z Justin Bieber 32 45
|
||||
2011-01-01T01:01:00Z Ke$ha 17 87
|
||||
2011-01-01T01:02:00Z Ke$ha 43 99
|
||||
2011-01-01T02:03:00Z Ke$ha 12 53
|
||||
timestamp page ... added deleted
|
||||
2011-01-01T00:01:35Z Cthulhu 10 65
|
||||
2011-01-01T00:03:63Z Cthulhu 15 62
|
||||
2011-01-01T01:04:51Z Cthulhu 32 45
|
||||
2011-01-01T01:01:00Z Azatoth 17 87
|
||||
2011-01-01T01:02:00Z Azatoth 43 99
|
||||
2011-01-01T02:03:00Z Azatoth 12 53
|
||||
~~~
|
||||
|
||||
~~~
|
||||
timestamp page ... nb added deleted
|
||||
2011-01-01T00:00:00Z Justin Bieber 2 25 127
|
||||
2011-01-01T01:00:00Z Justin Bieber 1 32 45
|
||||
2011-01-01T01:00:00Z Ke$ha 2 60 186
|
||||
2011-01-01T02:00:00Z Ke$ha 1 12 53
|
||||
timestamp page ... nb added deleted
|
||||
2011-01-01T00:00:00Z Cthulhu 2 25 127
|
||||
2011-01-01T01:00:00Z Cthulhu 1 32 45
|
||||
2011-01-01T01:00:00Z Azatoth 2 60 186
|
||||
2011-01-01T02:00:00Z Azatoth 1 12 53
|
||||
~~~
|
||||
|
||||
## as SQL
|
||||
|
@ -247,22 +275,25 @@ GROUP BY timestamp, page, nb, added, deleted
|
|||
|
||||
In practice can dramatically reduce the size (up to x100)
|
||||
|
||||
# Sharding
|
||||
|
||||
## Segments
|
||||
# Segments
|
||||
|
||||
## Sharding
|
||||
|
||||
<small>`sampleData_2011-01-01T01:00:00:00Z_2011-01-01T02:00:00:00Z_v1_0`</small>
|
||||
|
||||
~~~
|
||||
2011-01-01T01:00:00Z Justin Bieber 1 20 45
|
||||
2011-01-01T01:00:00Z Ke$ha 1 30 106
|
||||
timestamp page ... nb added deleted
|
||||
2011-01-01T01:00:00Z Cthulhu 1 20 45
|
||||
2011-01-01T01:00:00Z Azatoth 1 30 106
|
||||
~~~
|
||||
|
||||
<small>`sampleData_2011-01-01T01:00:00:00Z_2011-01-01T02:00:00:00Z_v1_1`</small>
|
||||
|
||||
~~~
|
||||
2011-01-01T01:00:00Z Justin Bieber 1 12 45
|
||||
2011-01-01T01:00:00Z Ke$ha 2 30 80
|
||||
timestamp page ... nb added deleted
|
||||
2011-01-01T01:00:00Z Cthulhu 1 12 45
|
||||
2011-01-01T01:00:00Z Azatoth 2 30 80
|
||||
~~~
|
||||
|
||||
## Core Data Structure
|
||||
|
@ -273,78 +304,32 @@ In practice can dramatically reduce the size (up to x100)
|
|||
- a bitmap for each value
|
||||
- a list of the columns values encoded using the dictionary
|
||||
|
||||
## Dictionary
|
||||
## Example
|
||||
|
||||
~~~
|
||||
{ "Justin Bieber": 0
|
||||
, "Ke$ha": 1
|
||||
}
|
||||
dictionary: { "Cthulhu": 0
|
||||
, "Azatoth": 1 }
|
||||
|
||||
column data: [0, 0, 1, 1]
|
||||
|
||||
bitmaps (one for each value of the column):
|
||||
value="Cthulhu": [1,1,0,0]
|
||||
value="Azatoth": [0,0,1,1]
|
||||
~~~
|
||||
|
||||
## Columnn Data
|
||||
## Example (multiple matches)
|
||||
|
||||
~~~
|
||||
[ 0
|
||||
, 0
|
||||
, 1
|
||||
, 1
|
||||
]
|
||||
dictionary: { "Cthulhu": 0
|
||||
, "Azatoth": 1 }
|
||||
|
||||
column data: [0, [0,1], 1, 1]
|
||||
|
||||
bitmaps (one for each value of the column):
|
||||
value="Cthulhu": [1,1,0,0]
|
||||
value="Azatoth": [0,1,1,1]
|
||||
~~~
|
||||
|
||||
## Bitmaps
|
||||
|
||||
one for each value of the column
|
||||
|
||||
~~~
|
||||
value="Justin Bieber": [1,1,0,0]
|
||||
value="Ke$ha": [0,0,1,1]
|
||||
~~~
|
||||
|
||||
# Data
|
||||
|
||||
## Indexing Data
|
||||
|
||||
- Immutable snapshots of data
|
||||
- data structure highly optimized for analytic queries
|
||||
- Each column is stored separately
|
||||
- Indexes data on a per shard (segment) level
|
||||
|
||||
## Loading data
|
||||
|
||||
- Real-Time
|
||||
- Batch
|
||||
|
||||
## Querying the data
|
||||
|
||||
- JSON over HTTP
|
||||
- Single Table Operations, no joins.
|
||||
|
||||
## Columnar Storage
|
||||
|
||||
## Index
|
||||
|
||||
- Values are dictionary encoded
|
||||
|
||||
`{"USA" 1, "Canada" 2, "Mexico" 3, ...}`
|
||||
|
||||
- Bitmap for every dimension value (used by filters)
|
||||
|
||||
`"USA" -> [0 1 0 0 1 1 0 0 0]`
|
||||
|
||||
- Column values (used by aggregation queries)
|
||||
|
||||
`[2,1,3,15,1,1,2,8,7]`
|
||||
|
||||
## Data Segments
|
||||
|
||||
- Per time interval
|
||||
- skip segments when querying
|
||||
- Immutable
|
||||
- Cache friendly
|
||||
- No locking
|
||||
- Versioned
|
||||
- No locking
|
||||
- Read-write concurrency
|
||||
|
||||
## Real-time ingestion
|
||||
|
||||
|
@ -363,15 +348,12 @@ value="Ke$ha": [0,0,1,1]
|
|||
## Real-time Ingestion
|
||||
|
||||
~~~
|
||||
Task 1: [ Interval ][ Window ]
|
||||
Task 2: [ ]
|
||||
--------------------------------------->
|
||||
time
|
||||
Task 1: [ Interval ][ Window ]
|
||||
Task 2: [ ]
|
||||
----------------------------------------------------->
|
||||
time
|
||||
~~~
|
||||
|
||||
Minimum indexing slots =
|
||||
Data Sources × Partitions × Replicas × 2
|
||||
|
||||
# Querying
|
||||
|
||||
## Query types
|
||||
|
@ -383,27 +365,34 @@ Minimum indexing slots =
|
|||
- Time Boundary: Find available data timeframe
|
||||
- Metadata queries
|
||||
|
||||
## Tip
|
||||
|
||||
- Prefer `topN` over `groupBy`
|
||||
- Prefer `timeseries` over `topN`
|
||||
- Use limits (and priorities)
|
||||
|
||||
## Query Spec
|
||||
|
||||
- Data source
|
||||
- Dimensions
|
||||
- Interval
|
||||
- Filters
|
||||
- Aggregations
|
||||
- Post Aggregations
|
||||
- Granularity
|
||||
- Context (query configuration)
|
||||
- Limit
|
||||
|
||||
## Example(s)
|
||||
|
||||
TODO
|
||||
~~~
|
||||
{"queryType": "groupBy",
|
||||
"dataSource": "druidtest",
|
||||
"granularity": "all",
|
||||
"dimensions": [],
|
||||
"aggregations": [
|
||||
{"type": "count", "name": "rows"},
|
||||
{"type": "longSum", "name": "imps", "fieldName": "impressions"},
|
||||
{"type": "doubleSum", "name": "wp", "fieldName": "wp"}
|
||||
],
|
||||
"intervals": ["2010-01-01T00:00/2020-01-01T00"]}
|
||||
~~~
|
||||
|
||||
## Result
|
||||
|
||||
~~~
|
||||
[ {
|
||||
"version" : "v1",
|
||||
"timestamp" : "2010-01-01T00:00:00.000Z",
|
||||
"event" : {
|
||||
"imps" : 5,
|
||||
"wp" : 15000.0,
|
||||
"rows" : 5
|
||||
}
|
||||
} ]
|
||||
~~~
|
||||
|
||||
## Caching
|
||||
|
||||
|
@ -412,16 +401,11 @@ TODO
|
|||
- Broker Level
|
||||
- By segment and query
|
||||
- `groupBy` is disabled on purpose!
|
||||
- By default - local caching
|
||||
- By default: local caching
|
||||
|
||||
## Load Rules
|
||||
# Druid Components
|
||||
|
||||
- Can be defined
|
||||
- What can be set
|
||||
|
||||
# Components
|
||||
|
||||
## Druid Components
|
||||
## Druid
|
||||
|
||||
- Real-time Nodes
|
||||
- Historical Nodes
|
||||
|
@ -431,65 +415,65 @@ TODO
|
|||
- Overlord
|
||||
- Middle Manager
|
||||
|
||||
+ Deep Storage
|
||||
+ Metadata Storage
|
||||
## Also
|
||||
|
||||
+ Load Balancer
|
||||
+ Cache
|
||||
- Deep Storage (S3, HDFS, ...)
|
||||
- Metadata Storage (SQL)
|
||||
- Load Balancer
|
||||
- Cache
|
||||
|
||||
## Coordinator
|
||||
|
||||
Manage Segments
|
||||
- Real-time Nodes (pull data, index it)
|
||||
- Historical Nodes (keep old segments)
|
||||
- Broker Nodes (route queries to RT & Hist. nodes, merge)
|
||||
- Coordinator (manage segments)
|
||||
- For indexing:
|
||||
- Overlord (distribute task to the middle manager)
|
||||
- Middle Manager (execute tasks via Peons)
|
||||
|
||||
## Real-time Nodes
|
||||
|
||||
- Pulling data in real-time
|
||||
- Indexing it
|
||||
|
||||
## Historical Nodes
|
||||
|
||||
- Keep historical segments
|
||||
|
||||
## Overlord
|
||||
|
||||
- Accepts tasks and distributes them to middle manager
|
||||
|
||||
## Middle Manager
|
||||
|
||||
- Execute submitted tasks via Peons
|
||||
|
||||
## Broker Nodes
|
||||
|
||||
- Route query to Real-time and Historical nodes
|
||||
- Merge results
|
||||
|
||||
## Deep Storage
|
||||
|
||||
- Segments backup (HDFS, S3, ...)
|
||||
|
||||
# Considerations & Tools
|
||||
|
||||
## When *not* to choose Druid
|
||||
# When *not* to choose Druid
|
||||
|
||||
- Data is not time-series
|
||||
- Cardinality is _very_ high
|
||||
- Number of dimensions is high
|
||||
- Setup cost must be avoided
|
||||
|
||||
## Graphite (metrics)
|
||||
# Graphite (metrics)
|
||||
|
||||
![Graphite](img/graphite.png)\
|
||||
|
||||
[Graphite](http://graphite.wikidot.com)
|
||||
|
||||
## Pivot (exploring data)
|
||||
# Pivot (exploring data)
|
||||
|
||||
![Pivot](img/pivot.gif)\
|
||||
|
||||
[Pivot](https://github.com/implydata/pivot)
|
||||
|
||||
## Caravel (exploring data)
|
||||
# Caravel
|
||||
|
||||
![caravel](img/caravel.png)\
|
||||
|
||||
[Caravel](https://github.com/airbnb/caravel)
|
||||
|
||||
# Conclusions
|
||||
|
||||
## Precompute your time series?
|
||||
|
||||
![You're doing it wrong](img/wrong.jpg)\
|
||||
|
||||
## Don't reinvent it
|
||||
|
||||
- need a user facing API
|
||||
- need time series on many dimensions
|
||||
- need real-time
|
||||
- big volume of data
|
||||
|
||||
## Druid way is the right way!
|
||||
|
||||
1. Push in kafka
|
||||
2. Add the right dimensions
|
||||
3. Push in druid
|
||||
4. ???
|
||||
5. Profit!
|
||||
|
|
BIN
druid/druid.pdf
BIN
druid/druid.pdf
Binary file not shown.
|
@ -101,7 +101,7 @@
|
|||
</section>
|
||||
<section id="return-of-experience" class="level1">
|
||||
<h1>Return of Experience</h1>
|
||||
<p><img src="img/mongoDB.png" alt="MongoDB the destroyer" /> </p>
|
||||
<p><img src="img/MongoDB.png" alt="MongoDB the destroyer" /> </p>
|
||||
</section>
|
||||
<section id="return-of-experience-1" class="level1">
|
||||
<h1>Return of Experience</h1>
|
||||
|
@ -292,78 +292,8 @@
|
|||
<li><strong>Metric columns</strong>: used for aggregations (count, sum, mean, etc...)</li>
|
||||
</ul>
|
||||
</section>
|
||||
</section>
|
||||
<section id="roll-up" class="level1">
|
||||
<h1>Roll-up</h1>
|
||||
<section id="example" class="level2">
|
||||
<h2>Example</h2>
|
||||
<pre><code>timestamp page ... added deleted
|
||||
2011-01-01T00:01:35Z Justin Bieber 10 65
|
||||
2011-01-01T00:03:63Z Justin Bieber 15 62
|
||||
2011-01-01T01:04:51Z Justin Bieber 32 45
|
||||
2011-01-01T01:01:00Z Ke$ha 17 87
|
||||
2011-01-01T01:02:00Z Ke$ha 43 99
|
||||
2011-01-01T02:03:00Z Ke$ha 12 53</code></pre>
|
||||
<pre><code>timestamp page ... nb added deleted
|
||||
2011-01-01T00:00:00Z Justin Bieber 2 25 127
|
||||
2011-01-01T01:00:00Z Justin Bieber 1 32 45
|
||||
2011-01-01T01:00:00Z Ke$ha 2 60 186
|
||||
2011-01-01T02:00:00Z Ke$ha 1 12 53</code></pre>
|
||||
</section>
|
||||
<section id="as-sql" class="level2">
|
||||
<h2>as SQL</h2>
|
||||
<pre><code>GROUP BY timestamp, page, nb, added, deleted
|
||||
:: nb = COUNT(1)
|
||||
, added = SUM(added)
|
||||
, deleted = SUM(deleted)</code></pre>
|
||||
<p>In practice can dramatically reduce the size (up to x100)</p>
|
||||
</section>
|
||||
</section>
|
||||
<section id="sharding" class="level1">
|
||||
<h1>Sharding</h1>
|
||||
<section id="segments" class="level2">
|
||||
<h2>Segments</h2>
|
||||
<p><small><code>sampleData_2011-01-01T01:00:00:00Z_2011-01-01T02:00:00:00Z_v1_0</code></small></p>
|
||||
<pre><code>2011-01-01T01:00:00Z Justin Bieber 1 20 45
|
||||
2011-01-01T01:00:00Z Ke$ha 1 30 106</code></pre>
|
||||
<p><small><code>sampleData_2011-01-01T01:00:00:00Z_2011-01-01T02:00:00:00Z_v1_0</code></small></p>
|
||||
<pre><code>2011-01-01T01:00:00Z Justin Bieber 1 12 45
|
||||
2011-01-01T01:00:00Z Ke$ha 2 30 80</code></pre>
|
||||
</section>
|
||||
<section id="core-data-structure" class="level2">
|
||||
<h2>Core Data Structure</h2>
|
||||
<p><img src="img/druid-column-types.png" alt="Segment" /> </p>
|
||||
<ul>
|
||||
<li>dictionary</li>
|
||||
<li>a bitmap for each value</li>
|
||||
<li>a list of the columns values encoded using the dictionary</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="dictionary" class="level2">
|
||||
<h2>Dictionary</h2>
|
||||
<pre><code>{ "Justin Bieber": 0
|
||||
, "Ke$ha": 1
|
||||
}</code></pre>
|
||||
</section>
|
||||
<section id="columnn-data" class="level2">
|
||||
<h2>Columnn Data</h2>
|
||||
<pre><code>[ 0
|
||||
, 0
|
||||
, 1
|
||||
, 1
|
||||
]</code></pre>
|
||||
</section>
|
||||
<section id="bitmaps" class="level2">
|
||||
<h2>Bitmaps</h2>
|
||||
<p>one for each value of the column</p>
|
||||
<pre><code>value="Justin Bieber": [1,1,0,0]
|
||||
value="Ke$ha": [0,0,1,1]</code></pre>
|
||||
</section>
|
||||
</section>
|
||||
<section id="data-1" class="level1">
|
||||
<h1>Data</h1>
|
||||
<section id="indexing-data" class="level2">
|
||||
<h2>Indexing Data</h2>
|
||||
<section id="indexing" class="level2">
|
||||
<h2>Indexing</h2>
|
||||
<ul>
|
||||
<li>Immutable snapshots of data</li>
|
||||
<li>data structure highly optimized for analytic queries</li>
|
||||
|
@ -371,40 +301,22 @@ value="Ke$ha": [0,0,1,1]</code></pre>
|
|||
<li>Indexes data on a per shard (segment) level</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="loading-data" class="level2">
|
||||
<h2>Loading data</h2>
|
||||
<section id="loading" class="level2">
|
||||
<h2>Loading</h2>
|
||||
<ul>
|
||||
<li>Real-Time</li>
|
||||
<li>Batch</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="querying-the-data" class="level2">
|
||||
<h2>Querying the data</h2>
|
||||
<section id="querying" class="level2">
|
||||
<h2>Querying</h2>
|
||||
<ul>
|
||||
<li>JSON over HTTP</li>
|
||||
<li>Single Table Operations, no joins.</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="columnar-storage" class="level2">
|
||||
<h2>Columnar Storage</h2>
|
||||
</section>
|
||||
<section id="index" class="level2">
|
||||
<h2>Index</h2>
|
||||
<ul>
|
||||
<li>Values are dictionary encoded</li>
|
||||
</ul>
|
||||
<p><code>{"USA" 1, "Canada" 2, "Mexico" 3, ...}</code></p>
|
||||
<ul>
|
||||
<li>Bitmap for every dimension value (used by filters)</li>
|
||||
</ul>
|
||||
<p><code>"USA" -> [0 1 0 0 1 1 0 0 0]</code></p>
|
||||
<ul>
|
||||
<li>Column values (used by aggregation queries)</li>
|
||||
</ul>
|
||||
<p><code>[2,1,3,15,1,1,2,8,7]</code></p>
|
||||
</section>
|
||||
<section id="data-segments" class="level2">
|
||||
<h2>Data Segments</h2>
|
||||
<section id="segments" class="level2">
|
||||
<h2>Segments</h2>
|
||||
<ul>
|
||||
<li>Per time interval
|
||||
<ul>
|
||||
|
@ -422,6 +334,77 @@ value="Ke$ha": [0,0,1,1]</code></pre>
|
|||
</ul></li>
|
||||
</ul>
|
||||
</section>
|
||||
</section>
|
||||
<section id="roll-up" class="level1">
|
||||
<h1>Roll-up</h1>
|
||||
<section id="example" class="level2">
|
||||
<h2>Example</h2>
|
||||
<pre><code>timestamp page ... added deleted
|
||||
2011-01-01T00:01:35Z Cthulhu 10 65
|
||||
2011-01-01T00:03:53Z Cthulhu 15 62
|
||||
2011-01-01T01:04:51Z Cthulhu 32 45
|
||||
2011-01-01T01:01:00Z Azatoth 17 87
|
||||
2011-01-01T01:02:00Z Azatoth 43 99
|
||||
2011-01-01T02:03:00Z Azatoth 12 53</code></pre>
|
||||
<pre><code>timestamp page ... nb added deleted
|
||||
2011-01-01T00:00:00Z Cthulhu 2 25 127
|
||||
2011-01-01T01:00:00Z Cthulhu 1 32 45
|
||||
2011-01-01T01:00:00Z Azatoth 2 60 186
|
||||
2011-01-01T02:00:00Z Azatoth 1 12 53</code></pre>
|
||||
</section>
|
||||
<section id="as-sql" class="level2">
|
||||
<h2>as SQL</h2>
|
||||
<pre><code>GROUP BY timestamp, page, nb, added, deleted
|
||||
:: nb = COUNT(1)
|
||||
, added = SUM(added)
|
||||
, deleted = SUM(deleted)</code></pre>
|
||||
<p>In practice can dramatically reduce the size (up to x100)</p>
|
||||
</section>
|
||||
</section>
|
||||
<section id="segments-1" class="level1">
|
||||
<h1>Segments</h1>
|
||||
<section id="sharding" class="level2">
|
||||
<h2>Sharding</h2>
|
||||
<p><small><code>sampleData_2011-01-01T01:00:00:00Z_2011-01-01T02:00:00:00Z_v1_0</code></small></p>
|
||||
<pre><code>timestamp page ... nb added deleted
|
||||
2011-01-01T01:00:00Z Cthulhu 1 20 45
|
||||
2011-01-01T01:00:00Z Azatoth 1 30 106</code></pre>
|
||||
<p><small><code>sampleData_2011-01-01T01:00:00:00Z_2011-01-01T02:00:00:00Z_v1_1</code></small></p>
|
||||
<pre><code>timestamp page ... nb added deleted
|
||||
2011-01-01T01:00:00Z Cthulhu 1 12 45
|
||||
2011-01-01T01:00:00Z Azatoth 2 30 80</code></pre>
|
||||
</section>
|
||||
<section id="core-data-structure" class="level2">
|
||||
<h2>Core Data Structure</h2>
|
||||
<p><img src="img/druid-column-types.png" alt="Segment" /> </p>
|
||||
<ul>
|
||||
<li>dictionary</li>
|
||||
<li>a bitmap for each value</li>
|
||||
<li>a list of the column values encoded using the dictionary</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="example-1" class="level2">
|
||||
<h2>Example</h2>
|
||||
<pre><code>dictionary: { "Cthulhu": 0
|
||||
, "Azatoth": 1 }
|
||||
|
||||
column data: [0, 0, 1, 1]
|
||||
|
||||
bitmaps (one for each value of the column):
|
||||
value="Cthulhu": [1,1,0,0]
|
||||
value="Azatoth": [0,0,1,1]</code></pre>
|
||||
</section>
|
||||
<section id="example-multiple-matches" class="level2">
|
||||
<h2>Example (multiple matches)</h2>
|
||||
<pre><code>dictionary: { "Cthulhu": 0
|
||||
, "Azatoth": 1 }
|
||||
|
||||
column data: [0, [0,1], 1, 1]
|
||||
|
||||
bitmaps (one for each value of the column):
|
||||
value="Cthulhu": [1,1,0,0]
|
||||
value="Azatoth": [0,1,1,1]</code></pre>
|
||||
</section>
|
||||
<section id="real-time-ingestion" class="level2">
|
||||
<h2>Real-time ingestion</h2>
|
||||
<ul>
|
||||
|
@ -446,15 +429,13 @@ value="Ke$ha": [0,0,1,1]</code></pre>
|
|||
</section>
|
||||
<section id="real-time-ingestion-1" class="level2">
|
||||
<h2>Real-time Ingestion</h2>
|
||||
<pre><code>Task 1: [ Interval ][ Window ]
|
||||
Task 2: [ ]
|
||||
--------------------------------------->
|
||||
time</code></pre>
|
||||
<p>Minimum indexing slots =<br />
|
||||
Data Sources × Partitions × Replicas × 2</p>
|
||||
<pre><code>Task 1: [ Interval ][ Window ]
|
||||
Task 2: [ ]
|
||||
----------------------------------------------------->
|
||||
time</code></pre>
|
||||
</section>
|
||||
</section>
|
||||
<section id="querying" class="level1">
|
||||
<section id="querying-1" class="level1">
|
||||
<h1>Querying</h1>
|
||||
<section id="query-types" class="level2">
|
||||
<h2>Query types</h2>
|
||||
|
@ -467,31 +448,30 @@ Task 2: [ ]
|
|||
<li>Metadata queries</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="tip" class="level2">
|
||||
<h2>Tip</h2>
|
||||
<ul>
|
||||
<li>Prefer <code>topN</code> over <code>groupBy</code></li>
|
||||
<li>Prefer <code>timeseries</code> over <code>topN</code></li>
|
||||
<li>Use limits (and priorities)</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="query-spec" class="level2">
|
||||
<h2>Query Spec</h2>
|
||||
<ul>
|
||||
<li>Data source</li>
|
||||
<li>Dimensions</li>
|
||||
<li>Interval</li>
|
||||
<li>Filters</li>
|
||||
<li>Aggregations</li>
|
||||
<li>Post Aggregations</li>
|
||||
<li>Granularity</li>
|
||||
<li>Context (query configuration)</li>
|
||||
<li>Limit</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="examples" class="level2">
|
||||
<h2>Example(s)</h2>
|
||||
<p>TODO</p>
|
||||
<pre><code>{"queryType": "groupBy",
|
||||
"dataSource": "druidtest",
|
||||
"granularity": "all",
|
||||
"dimensions": [],
|
||||
"aggregations": [
|
||||
{"type": "count", "name": "rows"},
|
||||
{"type": "longSum", "name": "imps", "fieldName": "impressions"},
|
||||
{"type": "doubleSum", "name": "wp", "fieldName": "wp"}
|
||||
],
|
||||
"intervals": ["2010-01-01T00:00/2020-01-01T00"]}</code></pre>
|
||||
</section>
|
||||
<section id="result" class="level2">
|
||||
<h2>Result</h2>
|
||||
<pre><code>[ {
|
||||
"version" : "v1",
|
||||
"timestamp" : "2010-01-01T00:00:00.000Z",
|
||||
"event" : {
|
||||
"imps" : 5,
|
||||
"wp" : 15000.0,
|
||||
"rows" : 5
|
||||
}
|
||||
} ]</code></pre>
|
||||
</section>
|
||||
<section id="caching" class="level2">
|
||||
<h2>Caching</h2>
|
||||
|
@ -505,21 +485,14 @@ Task 2: [ ]
|
|||
<li>By segment and query</li>
|
||||
<li><code>groupBy</code> is disabled on purpose!</li>
|
||||
</ul></li>
|
||||
<li>By default - local caching</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="load-rules" class="level2">
|
||||
<h2>Load Rules</h2>
|
||||
<ul>
|
||||
<li>Can be defined</li>
|
||||
<li>What can be set</li>
|
||||
<li>By default: local caching</li>
|
||||
</ul>
|
||||
</section>
|
||||
</section>
|
||||
<section id="components" class="level1">
|
||||
<h1>Components</h1>
|
||||
<section id="druid-components" class="level2">
|
||||
<h2>Druid Components</h2>
|
||||
<section id="druid-components" class="level1">
|
||||
<h1>Druid Components</h1>
|
||||
<section id="druid-1" class="level2">
|
||||
<h2>Druid</h2>
|
||||
<ul>
|
||||
<li>Real-time Nodes</li>
|
||||
<li>Historical Nodes</li>
|
||||
|
@ -530,59 +503,34 @@ Task 2: [ ]
|
|||
<li>Overlord</li>
|
||||
<li>Middle Manager</li>
|
||||
</ul></li>
|
||||
<li>Deep Storage</li>
|
||||
<li><p>Metadata Storage</p></li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="also" class="level2">
|
||||
<h2>Also</h2>
|
||||
<ul>
|
||||
<li>Deep Storage (S3, HDFS, ...)</li>
|
||||
<li>Metadata Storage (SQL)</li>
|
||||
<li>Load Balancer</li>
|
||||
<li><p>Cache</p></li>
|
||||
<li>Cache</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="coordinator" class="level2">
|
||||
<h2>Coordinator</h2>
|
||||
<p>Manage Segments</p>
|
||||
</section>
|
||||
<section id="real-time-nodes" class="level2">
|
||||
<h2>Real-time Nodes</h2>
|
||||
<ul>
|
||||
<li>Pulling data in real-time</li>
|
||||
<li>Indexing it</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="historical-nodes" class="level2">
|
||||
<h2>Historical Nodes</h2>
|
||||
<li>Real-time Nodes (pull data, index it)</li>
|
||||
<li>Historical Nodes (keep old segments)</li>
|
||||
<li>Broker Nodes (route queries to RT & Hist. nodes, merge)</li>
|
||||
<li>Coordinator (manage segments)</li>
|
||||
<li>For indexing:
|
||||
<ul>
|
||||
<li>Keep historical segments</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="overlord" class="level2">
|
||||
<h2>Overlord</h2>
|
||||
<ul>
|
||||
<li>Accepts tasks and distributes them to middle manager</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="middle-manager" class="level2">
|
||||
<h2>Middle Manager</h2>
|
||||
<ul>
|
||||
<li>Execute submitted tasks via Peons</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="broker-nodes" class="level2">
|
||||
<h2>Broker Nodes</h2>
|
||||
<ul>
|
||||
<li>Route query to Real-time and Historical nodes</li>
|
||||
<li>Merge results</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="deep-storage" class="level2">
|
||||
<h2>Deep Storage</h2>
|
||||
<ul>
|
||||
<li>Segments backup (HDFS, S3, ...)</li>
|
||||
<li>Overlord (distribute task to the middle manager)</li>
|
||||
<li>Middle Manager (execute tasks via Peons)</li>
|
||||
</ul></li>
|
||||
</ul>
|
||||
</section>
|
||||
</section>
|
||||
<section id="considerations-tools" class="level1">
|
||||
<h1>Considerations & Tools</h1>
|
||||
<section id="when-not-to-choose-druid" class="level2">
|
||||
<h2>When <em>not</em> to choose Druid</h2>
|
||||
<section id="when-not-to-choose-druid" class="level1">
|
||||
<h1>When <em>not</em> to choose Druid</h1>
|
||||
<ul>
|
||||
<li>Data is not time-series</li>
|
||||
<li>Cardinality is <em>very</em> high</li>
|
||||
|
@ -590,21 +538,46 @@ Task 2: [ ]
|
|||
<li>Setup cost must be avoided</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="graphite-metrics" class="level2">
|
||||
<h2>Graphite (metrics)</h2>
|
||||
<section id="graphite-metrics" class="level1">
|
||||
<h1>Graphite (metrics)</h1>
|
||||
<p><img src="img/graphite.png" alt="Graphite" /> </p>
|
||||
<p><a href="http://graphite.wikidot.com">Graphite</a></p>
|
||||
</section>
|
||||
<section id="pivot-exploring-data" class="level2">
|
||||
<h2>Pivot (exploring data)</h2>
|
||||
<section id="pivot-exploring-data" class="level1">
|
||||
<h1>Pivot (exploring data)</h1>
|
||||
<p><img src="img/pivot.gif" alt="Pivot" /> </p>
|
||||
<p><a href="https://github.com/implydata/pivot">Pivot</a></p>
|
||||
</section>
|
||||
<section id="caravel-exploring-data" class="level2">
|
||||
<h2>Caravel (exploring data)</h2>
|
||||
<section id="caravel" class="level1">
|
||||
<h1>Caravel</h1>
|
||||
<p><img src="img/caravel.png" alt="caravel" /> </p>
|
||||
<p><a href="https://github.com/airbnb/caravel">Caravel</a></p>
|
||||
</section>
|
||||
<section id="conclusions" class="level1">
|
||||
<h1>Conclusions</h1>
|
||||
<section id="precompute-your-time-series" class="level2">
|
||||
<h2>Precompute your time series?</h2>
|
||||
<p><img src="img/wrong.jpg" alt="You're doing it wrong" /> </p>
|
||||
</section>
|
||||
<section id="dont-reinvent-it" class="level2">
|
||||
<h2>Don't reinvent it</h2>
|
||||
<ul>
|
||||
<li>need a user facing API</li>
|
||||
<li>need time series on many dimensions</li>
|
||||
<li>need real-time</li>
|
||||
<li>big volume of data</li>
|
||||
</ul>
|
||||
</section>
|
||||
<section id="druid-way-is-the-right-way" class="level2">
|
||||
<h2>Druid way is the right way!</h2>
|
||||
<ol type="1">
|
||||
<li>Push in kafka</li>
|
||||
<li>Add the right dimensions</li>
|
||||
<li>Push in druid</li>
|
||||
<li>???</li>
|
||||
<li>Profit!</li>
|
||||
</ol>
|
||||
</section>
|
||||
</section>
|
||||
</div>
|
||||
|
||||
|
|
Binary file not shown.
Before Width: | Height: | Size: 300 KiB After Width: | Height: | Size: 516 KiB |
BIN
druid/img/wrong.jpg
Normal file
BIN
druid/img/wrong.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 70 KiB |
|
@ -1,3 +1,3 @@
|
|||
<div id="footer">
|
||||
<a href="yannesposito.com">Y</a>
|
||||
<a href="http://yannesposito.com">Y</a>
|
||||
</div>
|
||||
|
|
BIN
index.beamer.pdf
Normal file
BIN
index.beamer.pdf
Normal file
Binary file not shown.
36
index.html
Normal file
36
index.html
Normal file
|
@ -0,0 +1,36 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="generator" content="pandoc">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
|
||||
<title></title>
|
||||
<style type="text/css">code{white-space: pre;}</style>
|
||||
<!--[if lt IE 9]>
|
||||
<script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
|
||||
<![endif]-->
|
||||
<link rel="stylesheet" href="styling.css">
|
||||
</head>
|
||||
<body>
|
||||
<p>Generated documents:</p>
|
||||
<ul>
|
||||
<li>Druid:
|
||||
<ul>
|
||||
<li><a href="druid/druid.html">druid doc</a></li>
|
||||
<li><a href="druid/druid.pdf">druid pdf</a></li>
|
||||
<li><a href="druid/druid.reveal.html">druid html pres</a></li>
|
||||
<li><a href="druid/druid.beamer.pdf">druid pdf pres</a></li>
|
||||
</ul></li>
|
||||
<li>README:
|
||||
<ul>
|
||||
<li><a href="README.html">README doc</a></li>
|
||||
<li><a href="README.pdf">README pdf</a></li>
|
||||
<li><a href="README.reveal.html">README html pres</a></li>
|
||||
<li><a href="README.beamer.pdf">README pdf pres</a></li>
|
||||
</ul></li>
|
||||
</ul>
|
||||
<div id="footer">
|
||||
<a href="http://yannesposito.com">Y</a>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
12
index.md
Normal file
12
index.md
Normal file
|
@ -0,0 +1,12 @@
|
|||
Generated documents:
|
||||
|
||||
- Druid:
|
||||
- [druid doc](druid/druid.html)
|
||||
- [druid pdf](druid/druid.pdf)
|
||||
- [druid html pres](druid/druid.reveal.html)
|
||||
- [druid pdf pres](druid/druid.beamer.pdf)
|
||||
- README:
|
||||
- [README doc](README.html)
|
||||
- [README pdf](README.pdf)
|
||||
- [README html pres](README.reveal.html)
|
||||
- [README pdf pres](README.beamer.pdf)
|
BIN
index.pdf
Normal file
BIN
index.pdf
Normal file
Binary file not shown.
91
index.reveal.html
Normal file
91
index.reveal.html
Normal file
|
@ -0,0 +1,91 @@
|
|||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title></title>
|
||||
<meta name="description" content="">
|
||||
<meta name="apple-mobile-web-app-capable" content="yes" />
|
||||
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
|
||||
<link rel="stylesheet" href=".reveal.js-3.2.0/css/reveal.css">
|
||||
<link rel="stylesheet" href=".reveal.js-3.2.0/css/theme/default.css" id="theme">
|
||||
<!-- For syntax highlighting -->
|
||||
<link rel="stylesheet" href=".reveal.js-3.2.0/lib/css/zenburn.css">
|
||||
<!-- If the query includes 'print-pdf', use the PDF print sheet -->
|
||||
<script>
|
||||
document.write( '<link rel="stylesheet" href=".reveal.js-3.2.0/css/print/' +
|
||||
( window.location.search.match( /print-pdf/gi ) ? 'pdf' : 'paper' ) +
|
||||
'.css" type="text/css" media="print">' );
|
||||
</script>
|
||||
<!--[if lt IE 9]>
|
||||
<script src=".reveal.js-3.2.0/lib/js/html5shiv.js"></script>
|
||||
<![endif]-->
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<div class="reveal">
|
||||
|
||||
<!-- Any section element inside of this container is displayed as a slide -->
|
||||
<div class="slides">
|
||||
|
||||
<section>
|
||||
<h1></h1>
|
||||
|
||||
<p>
|
||||
<h4></h4>
|
||||
</p>
|
||||
</section>
|
||||
|
||||
|
||||
<p>Generated documents:</p>
|
||||
<ul>
|
||||
<li>Druid:
|
||||
<ul>
|
||||
<li><a href="druid/druid.html">druid doc</a></li>
|
||||
<li><a href="druid/druid.pdf">druid pdf</a></li>
|
||||
<li><a href="druid/druid.reveal.html">druid html pres</a></li>
|
||||
<li><a href="druid/druid.beamer.pdf">druid pdf pres</a></li>
|
||||
</ul></li>
|
||||
<li>README:
|
||||
<ul>
|
||||
<li><a href="README.html">README doc</a></li>
|
||||
<li><a href="README.pdf">README pdf</a></li>
|
||||
<li><a href="README.reveal.html">README html pres</a></li>
|
||||
<li><a href="README.beamer.pdf">README pdf pres</a></li>
|
||||
</ul></li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
<script src=".reveal.js-3.2.0/lib/js/head.min.js"></script>
|
||||
<script src=".reveal.js-3.2.0/js/reveal.js"></script>
|
||||
|
||||
<script>
|
||||
// Full list of configuration options available here:
|
||||
// https://github.com/hakimel/reveal.js#configuration
|
||||
Reveal.initialize({
|
||||
controls: true,
|
||||
progress: true,
|
||||
history: true,
|
||||
center: false,
|
||||
|
||||
// available themes are in /css/theme
|
||||
theme: Reveal.getQueryHash().theme || 'default',
|
||||
|
||||
// default/cube/page/concave/zoom/linear/fade/none
|
||||
transition: Reveal.getQueryHash().transition || 'linear',
|
||||
|
||||
// Optional libraries used to extend on reveal.js
|
||||
dependencies: [
|
||||
{ src: '/.reveal.js-3.2.0/lib/js/classList.js', condition: function() { return !document.body.classList; } },
|
||||
{ src: '/.reveal.js-3.2.0/plugin/markdown/showdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
|
||||
{ src: '/.reveal.js-3.2.0/plugin/markdown/markdown.js', condition: function() { return !!document.querySelector( '[data-markdown]' ); } },
|
||||
{ src: '/.reveal.js-3.2.0/plugin/highlight/highlight.js', async: true, callback: function() { hljs.initHighlightingOnLoad(); } },
|
||||
{ src: '/.reveal.js-3.2.0/plugin/zoom-js/zoom.js', async: true, condition: function() { return !!document.body.classList; } },
|
||||
{ src: '/.reveal.js-3.2.0/plugin/notes/notes.js', async: true, condition: function() { return !!document.body.classList; } }
|
||||
]
|
||||
});
|
||||
|
||||
</script>
|
||||
|
||||
</body>
|
||||
</html>
|
Loading…
Reference in a new issue