Move riemann conf to ansible
This commit is contained in:
parent
4362ecca89
commit
9da6874eb1
1 changed files with 0 additions and 193 deletions
193
riemann.config
193
riemann.config
|
@ -1,193 +0,0 @@
|
||||||
; -*- mode: clojure; -*-
|
|
||||||
; vim: filetype=clojure
|
|
||||||
|
|
||||||
; Send Riemann's own log output to riemann.log.
(logging/init {:file "riemann.log"})

; Start the TCP, UDP and websocket listeners with one shared option map.
; 0.0.0.0 binds every interface. No port is given, so each server keeps its
; default (5555 for TCP and UDP, 5556 for websockets).
(let [listen-opts {:host "0.0.0.0"}]
  (tcp-server listen-opts)
  (udp-server listen-opts)
  (ws-server listen-opts))
|
|
||||||
|
|
||||||
; Graphite listeners: one over TCP and one over UDP, both on port 2004 of
; every interface, both using the same line parser.
; NOTE(review): Carbon's plaintext listener is conventionally 2003 and 2004 is
; conventionally the pickle port - confirm that senders really target 2004.

(defn- graphite-metric->event-fields
  "Turns the dotted Graphite metric name found in the event's :service into
  Riemann event fields.

  The name is split on \".\" into at most three parts,
  source.hostname.metricname:
    :host    hostname with every underscore replaced by a dot
    :service metricname (whatever follows the second dot)
    :tags    a one-element vector holding source
    :metric and :time are copied from the incoming event, :ttl is 30.

  Names with fewer than three segments are not guarded: hostname is then nil
  and clojure.string/replace throws, exactly as the inline parsers did."
  [{:keys [service] :as event}]
  (let [[source hostname metricname] (clojure.string/split service #"\." 3)]
    {:host    (clojure.string/replace hostname #"_" ".")
     :service metricname
     :metric  (:metric event)
     ; Riemann tags are a collection of strings. A bare string here would be
     ; treated as a sequence of characters by tag-based streams.
     :tags    [source]
     :time    (:time event)
     :ttl     30}))

; Graphite listener over TCP.
(def graphite-server-tcp
  (graphite-server :host "0.0.0.0" :port 2004
                   :protocol :tcp
                   :parser-fn graphite-metric->event-fields))

; Same listener as graphite-server-tcp, but for UDP packets.
(def graphite-server-udp
  (graphite-server :host "0.0.0.0" :port 2004
                   :protocol :udp
                   :parser-fn graphite-metric->event-fields))
|
|
||||||
|
|
||||||
; Sweep the index for expired events every 5 seconds.
(periodically-expire 5)

(let [event-index (index)]
  ; Every inbound event is handed to the streams below.
  (streams
    (where (service #"e.+ .+x bytes")
      ; Matching services have their metric scaled by 1/1048576
      ; (presumably bytes to MiB) and are then indexed.
      (scale 1/1048576 event-index)
      (else
        ; Everything else is given :ttl 60 as a default.
        (default :ttl 60
          ; Index the event straight away.
          event-index

          ; Write expired events to the log.
          (expired #(info "expired" %)))))))
|
|
||||||
|
|
||||||
; ------------------------------------------------------------------------------
|
|
||||||
; STREAM ADDITIONS
|
|
||||||
(defn add-stream-by
  "Registers a stream that aggregates the events of `service-name`.

  Events whose service matches `service-name` are coalesced, `f` is applied
  (through smap) to what coalesce emits, and the result is indexed with its
  :service set to `service-name` and its :host set to
  \"Aggregator <suffix>\"."
  [service-name f suffix]
  (let [event-index     (index)
        aggregator-host (str "Aggregator " suffix)]
    (streams
      (where (service service-name)
        ; Read bottom-up: coalesce -> smap f -> with -> index.
        (->> event-index
             (with {:service service-name
                    :host    aggregator-host})
             (smap f)
             (coalesce))))))
|
|
||||||
|
|
||||||
(defn add-sum-stream
  "Registers an aggregate stream for `service-name` that folds events with
  folds/sum and indexes the result under host \"Aggregator sum\"."
  [service-name]
  (add-stream-by service-name
                 folds/sum
                 "sum"))
|
|
||||||
(defn add-min-stream
  "Registers an aggregate stream for `service-name` that folds events with
  folds/minimum and indexes the result under host \"Aggregator min\"."
  [service-name]
  (add-stream-by service-name
                 folds/minimum
                 "min"))
|
|
||||||
(defn add-max-stream
  "Registers an aggregate stream for `service-name` that folds events with
  folds/maximum and indexes the result under host \"Aggregator max\"."
  [service-name]
  (add-stream-by service-name
                 folds/maximum
                 "max"))
|
|
||||||
(defn add-mean-stream
  "Registers an aggregate stream for `service-name` that folds events with
  folds/mean and indexes the result under host \"Aggregator mean\"."
  [service-name]
  (add-stream-by service-name
                 folds/mean
                 "mean"))
|
|
||||||
(defn add-median-stream
  "Registers an aggregate stream for `service-name` that folds events with
  folds/median and indexes the result under host \"Aggregator median\"."
  [service-name]
  (add-stream-by service-name
                 folds/median
                 "median"))
|
|
||||||
(defn add-count-stream
  "Registers an aggregate stream for `service-name` that folds events with
  folds/count and indexes the result under host \"Aggregator count\"."
  [service-name]
  (add-stream-by service-name
                 folds/count
                 "count"))
|
|
||||||
|
|
||||||
|
|
||||||
(defn add-all-grouping-streams
  "Registers the sum, min, max, mean, median and count aggregate streams for
  `service-name`, in that order. Returns what the last registration (count)
  returned."
  [service-name]
  (->> [add-sum-stream
        add-min-stream
        add-max-stream
        add-mean-stream
        add-median-stream
        add-count-stream]
       ; mapv is eager, so every registrar runs, in vector order.
       (mapv #(% service-name))
       peek))
|
|
||||||
|
|
||||||
; Sum the 'haarp nb' and 'haarp events' metrics over all hosts. Each sum is
; indexed under its original service name with host "Aggregator sum".
(doseq [haarp-service ["haarp nb" "haarp events"]]
  (add-sum-stream haarp-service))
|
|
||||||
|
|
||||||
; helper for supercell
(defn add-aggregates-for-supercell
  "For each source (twitter, facebook) registers, for environment `env`:
  a sum of 'supercell <env> <source> nb', a sum of
  'supercell <env> <source> msg' and a max of
  'supercell <env> <source> latency'.
  Returns a realised sequence with one entry per source."
  [env]
  (doall
    (for [source ["twitter" "facebook"]
          :let [prefix (str "supercell " env " " source)]]
      (do
        (add-sum-stream (str prefix " nb"))
        (add-sum-stream (str prefix " msg"))
        (add-max-stream (str prefix " latency"))))))
|
|
||||||
|
|
||||||
; helper for lightning
(defn add-aggregates-for-lightning
  "For each server kind (public, admin, system) registers, for environment
  `env`: a sum of 'lightning <env> <kind> nb', plus a max, a mean and a median
  of 'lightning <env> <kind> total'.
  Returns a realised sequence with one entry per server kind."
  [env]
  (doall
    (for [server-kind ["public" "admin" "system"]
          :let [prefix        (str "lightning " env " " server-kind)
                total-service (str prefix " total")]]
      (do
        (add-sum-stream (str prefix " nb"))
        (add-max-stream total-service)
        (add-mean-stream total-service)
        (add-median-stream total-service)))))
|
|
||||||
|
|
||||||
; Register the supercell and lightning aggregates for every environment.
(doseq [env ["dev" "pp" "prod"]]
  (add-aggregates-for-supercell env)
  (add-aggregates-for-lightning env))
|
|
||||||
|
|
||||||
; WARN: should be deprecated post v0.3.0
; Aggregates for the 'supercell prod' service names that carry no source
; segment.
(doseq [legacy-service ["supercell prod nb" "supercell prod msg"]]
  (add-sum-stream legacy-service))
(add-max-stream "supercell prod latency")
; /WARN
|
|
||||||
|
|
||||||
; ------------------------------------------------------------------------------
|
|
||||||
;; -- ALERTING
|
|
||||||
(def notify-sysadmin-team
  ; Stream that mails dreamteam@vigiglobe.com from riemann@vigiglobe.com.
  ; The mailer is wrapped in (rollup 5 3600 ...), which is itself wrapped in
  ; (throttle 1000 3600 ...). The thread below reads innermost stream first.
  (let [send-mail (mailer {:from "riemann@vigiglobe.com"})]
    (->> (send-mail "dreamteam@vigiglobe.com")
         (rollup 5 3600)
         (throttle 1000 3600))))
|
|
||||||
|
|
||||||
;; Basic alerting. You generally don't want this
|
|
||||||
; (streams (where (state "critical")) alert-everyone)
|
|
||||||
|
|
||||||
(defn critial-fraction
  "Summarises a window of `events` as a single event map.

  :metric  fraction of events whose :state is \"critical\" (0 for an empty
           window; otherwise the result of dividing the two counts)
  :state   \"critical\" when the fraction is >= `critial-threshold`,
           \"warning\" when it is >= `warn-threshold`, otherwise \"ok\"
  :service \"fail <service-name>\"
  :host    \"MCP\"
  :time    :time of the first event (nil for an empty window)"
  [service-name warn-threshold critial-threshold events]
  (let [total     (count events)
        criticals (->> events
                       (filter #(= "critical" (:state %)))
                       count)
        ; Guard the division: an empty window counts as no failures.
        fraction  (if (zero? total)
                    0
                    (/ criticals total))
        state     (cond
                    (<= critial-threshold fraction) "critical"
                    (<= warn-threshold fraction)    "warning"
                    :else                           "ok")]
    {:service (str "fail " service-name)
     :host    "MCP"
     :metric  fraction
     :time    (:time (first events))
     :state   state}))
|
|
||||||
|
|
||||||
(defn alert-if-critical-too-long-with-thresholds
  "Registers an alerting stream for `service-name`.

  Matching events are grouped into fixed windows of `nb-sec` seconds. Each
  window is summarised by critial-fraction with the two thresholds. When the
  summary's state is \"critical\" it is indexed and passed to
  notify-sysadmin-team."
  [service-name nb-sec warn-threshold critial-threshold]
  (let [event-index (index)
        summarise   (partial critial-fraction
                             service-name
                             warn-threshold
                             critial-threshold)]
    (streams
      (where (service service-name)
        (fixed-time-window nb-sec
          (smap summarise
            (where (state "critical")
              event-index
              notify-sysadmin-team)))))))
|
|
||||||
|
|
||||||
(defn alert-if-critical-too-long-with-thresholds-for-host
  "Like alert-if-critical-too-long-with-thresholds, but only for events whose
  host also matches `host-regex`.

  Matching events are grouped into fixed windows of `nb-sec` seconds. Each
  window is summarised by critial-fraction with the two thresholds. When the
  summary's state is \"critical\" it is indexed and passed to
  notify-sysadmin-team."
  [service-name host-regex nb-sec warn-threshold critial-threshold]
  ; NOTE(review): `where` is believed to compare string patterns by equality
  ; and only regex patterns by match - confirm what callers pass as
  ; host-regex.
  (let [event-index (index)
        summarise   (partial critial-fraction
                             service-name
                             warn-threshold
                             critial-threshold)]
    (streams
      (where (and (service service-name)
                  (host host-regex))
        (fixed-time-window nb-sec
          (smap summarise
            (where (state "critical")
              event-index
              notify-sysadmin-team)))))))
|
|
||||||
|
|
||||||
(defn alert-if-critical-too-long
  "Registers the alerting stream for `service-name` over windows of `nb-sec`
  seconds, with a warning threshold of 0.3 and a critical threshold of 0.9."
  [service-name nb-sec]
  (alert-if-critical-too-long-with-thresholds service-name
                                              nb-sec
                                              0.3
                                              0.9))
|
|
||||||
|
|
||||||
|
|
||||||
; ------------------------------------------------------------------------------
|
|
||||||
|
|
||||||
; Application alerts: notify the sysadmin team when, over a 30 second window,
; the critical fraction of one of these services reaches the default
; threshold.
(doseq [app-service ["supercell prod twitter nb"
                     "supercell prod facebook nb"
                     "haarp nb"]]
  (alert-if-critical-too-long app-service 30))

; Kafka throughput on host "tornado": 10 second windows, warning at 0.1,
; critical at 0.9.
(alert-if-critical-too-long-with-thresholds-for-host
  "kafka message/sec" "tornado" 10 0.1 0.9)

; System alerts, evaluated over 120 second windows.
(doseq [system-service ["cpu" "load" "memory" "disk /"]]
  (alert-if-critical-too-long system-service 120))
|
|
||||||
|
|
||||||
; Graphite client stream pointing at localhost.
(def graph
  (graphite {:host "localhost"}))

; Hand every inbound event to the Graphite client as well.
(streams
  graph)
|
|
Loading…
Reference in a new issue