From 9da6874eb1e8811c00708ec914954df091bc4e72 Mon Sep 17 00:00:00 2001 From: Matthieu Sprunck Date: Fri, 3 Apr 2015 10:53:05 +0200 Subject: [PATCH] Move riemann conf to ansible --- riemann.config | 193 ------------------------------------------------- 1 file changed, 193 deletions(-) delete mode 100644 riemann.config diff --git a/riemann.config b/riemann.config deleted file mode 100644 index bd79e6c..0000000 --- a/riemann.config +++ /dev/null @@ -1,193 +0,0 @@ -; -*- mode: clojure; -*- -; vim: filetype=clojure - -(logging/init {:file "riemann.log"}) - -; Listen on the local interface over TCP (5555), UDP (5555), and websockets -; (5556) -(let [host "0.0.0.0"] - (tcp-server {:host host}) - (udp-server {:host host}) - (ws-server {:host host})) - -; ;Start a Graphite server on the usual TCP port for Carbon, port 2004: -(def graphite-server-tcp (graphite-server :host "0.0.0.0" :port 2004 - :protocol :tcp - :parser-fn - (fn [{:keys [service] :as event}] - (let [[source hostname metricname] (clojure.string/split service #"\." 3)] - {:host (clojure.string/replace hostname #"_" ".") - :service metricname - :metric (:metric event) - :tags source - :time (:time event) - :ttl 30})))) - -; Like the graphite-server-tcp function above, but listening for UDP packets instead of TCP: -(def graphite-server-udp (graphite-server :host "0.0.0.0" :port 2004 - :protocol :udp - :parser-fn - (fn [{:keys [service] :as event}] - (let [[source hostname metricname] (clojure.string/split service #"\." 3)] - { :host (clojure.string/replace hostname #"_" ".") - :service metricname - :metric (:metric event) - :tags source - :time (:time event) - :ttl 30})))) - -; Expire old events from the index every 5 seconds. -(periodically-expire 5) - -(let [index (index)] - ; Inbound events will be passed to these streams: - (streams - (where (service #"e.+ .+x bytes") - (scale 1/1048576 index) - (else - (default :ttl 60 - ; Index all events immediately. - index - - ; Log expired events. - (expired - (fn [event] (info "expired" event)))))))) - -; ------------------------------------------------------------------------------ -; STREAM ADDITIONS -(defn add-stream-by [service-name f suffix] - (let [index (index)] - (streams - (where (service service-name) - (coalesce - (smap f - (with {:service service-name - :host (str "Aggregator " suffix)} index))))))) - -(defn add-sum-stream [service-name] - (add-stream-by service-name folds/sum "sum")) -(defn add-min-stream [service-name] - (add-stream-by service-name folds/minimum "min")) -(defn add-max-stream [service-name] - (add-stream-by service-name folds/maximum "max")) -(defn add-mean-stream [service-name] - (add-stream-by service-name folds/mean "mean")) -(defn add-median-stream [service-name] - (add-stream-by service-name folds/median "median")) -(defn add-count-stream [service-name] - (add-stream-by service-name folds/count "count")) - - -(defn add-all-grouping-streams [service-name] - (add-sum-stream service-name) - (add-min-stream service-name) - (add-max-stream service-name) - (add-mean-stream service-name) - (add-median-stream service-name) - (add-count-stream service-name)) - -; this create the service "haarp nb sum", -; the sum on all hosts of the 'haarp nb' metrics -(add-sum-stream "haarp nb") -(add-sum-stream "haarp events") - -; helper for supercell -(defn add-aggregates-for-supercell [env] - (doall - (map - (fn [source] - (add-sum-stream (str "supercell " env " " source " nb") ) - (add-sum-stream (str "supercell " env " " source " msg")) - (add-max-stream (str "supercell " env " " source " latency"))) - ["twitter" "facebook"]))) - -; helper for lightning -(defn add-aggregates-for-lightning [env] - (doall - (map (fn [server-kind] - (add-sum-stream (str "lightning " env " " server-kind " nb" )) - (add-max-stream (str "lightning " env " " server-kind " total")) - (add-mean-stream (str "lightning " env " " server-kind " total")) - (add-median-stream (str "lightning " env " " server-kind " total"))) - ["public" "admin" "system"]))) - -(doall - (map (fn [env] - (add-aggregates-for-supercell env) - (add-aggregates-for-lightning env)) - ["dev" "pp" "prod"])) - -; WARN: should be deprecated post v0.3.0 -(add-sum-stream "supercell prod nb") -(add-sum-stream "supercell prod msg") -(add-max-stream "supercell prod latency") -; /WARN - -; ------------------------------------------------------------------------------ -;; -- ALERTING -(def notify-sysadmin-team - (let [email (mailer {:from "riemann@vigiglobe.com"})] - (throttle 1000 3600 (rollup 5 3600 (email "dreamteam@vigiglobe.com"))))) - -;; Basic alerting. You generally don't want this -; (streams (where (state "critical")) alert-everyone) - -(defn critial-fraction [service-name warn-threshold critial-threshold events] - (let [nb-events (count events) - nb-critical-events (count (filter #(= "critical" (:state %)) events)) - fraction (if (= nb-events 0) 0 (/ nb-critical-events nb-events))] - {:service (str "fail " service-name) - :host "MCP" - :metric fraction - :time (:time (first events)) - :state (condp <= fraction - critial-threshold "critical" - warn-threshold "warning" - "ok")})) - -(defn alert-if-critical-too-long-with-thresholds - [service-name nb-sec warn-threshold critial-threshold] - (let [index (index)] - (streams - (where (service service-name) - (fixed-time-window - nb-sec - (smap #(critial-fraction service-name warn-threshold critial-threshold %) - (where (state "critical") - index - notify-sysadmin-team))))))) - -(defn alert-if-critical-too-long-with-thresholds-for-host - [service-name host-regex nb-sec warn-threshold critial-threshold] - (let [index (index)] - (streams - (where (and (service service-name) - (host host-regex)) - (fixed-time-window - nb-sec - (smap #(critial-fraction service-name warn-threshold critial-threshold %) - (where (state "critical") - index - notify-sysadmin-team))))))) - -(defn alert-if-critical-too-long [service-name nb-sec] - (alert-if-critical-too-long-with-thresholds service-name nb-sec 0.3 0.9)) - - -; ------------------------------------------------------------------------------ - -; Alert everybody if 'supercell prod twitter nb' is critical for 30 seconds -(alert-if-critical-too-long "supercell prod twitter nb" 30) -(alert-if-critical-too-long "supercell prod facebook nb" 30) -(alert-if-critical-too-long "haarp nb" 30) - -(alert-if-critical-too-long-with-thresholds-for-host - "kafka message/sec" "tornado" 10 0.1 0.9) - -(alert-if-critical-too-long "cpu" 120) -(alert-if-critical-too-long "load" 120) -(alert-if-critical-too-long "memory" 120) -(alert-if-critical-too-long "disk /" 120) - -(def graph (graphite {:host "localhost"})) -(streams graph)