93 lines
3.2 KiB
Clojure
93 lines
3.2 KiB
Clojure
; -*- mode: clojure; -*-
|
|
; vim: filetype=clojure
|
|
|
|
; Initialise logging, with "riemann.log" as the log file.
; NOTE(review): the path is relative — presumably resolved against the
; process working directory; confirm where the file actually ends up.
(logging/init {:file "riemann.log"})
|
|
|
|
; Listen on all interfaces (0.0.0.0) over TCP (5555), UDP (5555), and
; websockets (5556)
|
|
; Start the TCP, UDP and websocket servers. All three share one option map so
; they always bind to the same address; no ports are given here, so each
; server uses its own default.
(let [listen-opts {:host "0.0.0.0"}]
  (tcp-server listen-opts)
  (udp-server listen-opts)
  (ws-server listen-opts))
|
|
|
|
; Expire old events from the index every 5 seconds.
|
|
; The argument is the interval, in seconds, between expiry sweeps.
(periodically-expire 5)
|
|
|
|
; Main event pipeline. `event-index` holds the value returned by (index) and
; is used below as a stream that stores events.
(let [event-index (index)]
  ; Inbound events will be passed to these streams:
  (streams
    ; Events whose service matches the regex have their metric multiplied by
    ; 1/1048576 (i.e. bytes -> MiB, presumably network byte counters such as
    ; "eth0 rx bytes" — confirm) and are then indexed. Note that this branch
    ; does not go through the :ttl default below.
    (where (service #"e.+ .+x bytes")
      (scale 1/1048576 event-index)
      (else
        ; Every other event gets a :ttl of 60 unless it already carries one.
        (default :ttl 60
          ; Index all events immediately.
          event-index

          ; Log expired events.
          (expired #(info "expired" %)))))))
|
|
|
|
;; -- ALERTING
|
|
; Stream that e-mails the sysadmin team.
; Built inside-out: a mailer addressed to the team, wrapped in
; (rollup 5 3600 ...), wrapped in (throttle 1000 3600 ...). Events enter
; through the throttle first, then the rollup, then the mailer.
(def notify-sysadmin-team
  (let [send-mail (mailer {:from "riemann@vigiglobe.com"})
        mail-team (send-mail "dreamteam@vigiglobe.com")
        digest    (rollup 5 3600 mail-team)]
    (throttle 1000 3600 digest)))
|
|
|
|
;; Basic alerting: notify on every single critical event. You generally don't
;; want this.
; (streams (where (state "critical") notify-sysadmin-team))
|
|
|
|
(defn critial-fraction
  "Summarise a window of events into a single synthetic event.

  service-name       - name of the monitored service; the summary's :service
                       is \"fail \" followed by this name.
  warn-threshold     - fraction at or above which the state is \"warning\".
  critial-threshold  - fraction at or above which the state is \"critical\"
                       (checked before warn-threshold).
  events             - sequence of event maps; may be empty.

  Returns a map with :service, :host (always \"MCP\"), :metric (the fraction
  of events whose :state is \"critical\", or 0 for an empty window), :time
  and :state (\"critical\", \"warning\" or \"ok\")."
  [service-name warn-threshold critial-threshold events]
  (let [total    (count events)
        critical (->> events
                      (map :state)
                      (filter #{"critical"})
                      count)
        ; Guard against dividing by zero when the window is empty.
        ratio    (if (zero? total)
                   0
                   (/ critical total))
        severity (cond
                   (<= critial-threshold ratio) "critical"
                   (<= warn-threshold ratio)    "warning"
                   :else                        "ok")]
    {:service (str "fail " service-name)
     :host    "MCP"
     :metric  ratio
     ; NOTE(review): the summary is stamped with the time of the FIRST event
     ; in the window (nil when the window is empty) — confirm this is the
     ; intended timestamp rather than the most recent one.
     :time    (-> events first :time)
     :state   severity}))
|
|
|
|
(defn alert-if-critical-too-long-with-thresholds
  "Register a stream that alerts when a service is critical for too large a
  share of a time window.

  Events whose service matches service-name are grouped into fixed windows of
  nb-sec seconds. Each window is summarised by critial-fraction using
  warn-threshold and critial-threshold; only summaries whose state is
  \"critical\" are indexed and passed to notify-sysadmin-team.

  NOTE(review): (index) is called on every invocation of this function —
  confirm that the Riemann version in use hands back the core's existing
  index instead of replacing it."
  [service-name nb-sec warn-threshold critial-threshold]
  (let [summary-index (index)
        summarise     (fn [window]
                        (critial-fraction service-name
                                          warn-threshold
                                          critial-threshold
                                          window))]
    (streams
      (where (service service-name)
        (fixed-time-window nb-sec
          (smap summarise
            (where (state "critical")
              summary-index
              notify-sysadmin-team)))))))
|
|
|
|
(defn alert-if-critical-too-long-with-thresholds-for-host
  "Register a stream that alerts when a service on particular hosts is
  critical for too large a share of a time window.

  Only events matching both service-name (on :service) and host-regex (on
  :host) are considered. They are grouped into fixed windows of nb-sec
  seconds, each window is summarised by critial-fraction using warn-threshold
  and critial-threshold, and only summaries whose state is \"critical\" are
  indexed and passed to notify-sysadmin-team.

  NOTE(review): despite its name, host-regex is given a plain string by the
  caller in this file; a string is presumably compared for equality rather
  than used as a pattern — confirm which is intended."
  [service-name host-regex nb-sec warn-threshold critial-threshold]
  (let [summary-index (index)
        ; One-argument function: window of events -> summary event.
        summarise     (partial critial-fraction
                               service-name
                               warn-threshold
                               critial-threshold)]
    (streams
      (where (and (service service-name)
                  (host host-regex))
        (fixed-time-window nb-sec
          (smap summarise
            (where (state "critical")
              summary-index
              notify-sysadmin-team)))))))
|
|
|
|
(defn alert-if-critical-too-long
  "Same as alert-if-critical-too-long-with-thresholds, using a warning
  threshold of 0.3 and a critical threshold of 0.9.

  service-name - service to watch.
  nb-sec       - window length in seconds."
  [service-name nb-sec]
  (let [warn-at     0.3
        critical-at 0.9]
    (alert-if-critical-too-long-with-thresholds
      service-name nb-sec warn-at critical-at)))
|
|
|
|
|
|
; ------------------------------------------------------------------------------
|
|
|
|
; Alert the sysadmin team when at least 90% of the 'supercell prod twitter nb'
; events seen in a 30-second window are critical
|
|
; Application-level services, watched over 30-second windows with the
; default thresholds.
(doseq [svc ["supercell prod twitter nb"
             "supercell prod facebook nb"
             "haarp nb"]]
  (alert-if-critical-too-long svc 30))

; Kafka throughput for host "tornado": 10-second windows, warning threshold
; 0.1, critical threshold 0.9.
(alert-if-critical-too-long-with-thresholds-for-host
  "kafka message/sec" "tornado" 10 0.1 0.9)

; System-level services, watched over 120-second windows with the default
; thresholds.
(doseq [svc ["cpu" "load" "memory" "disk /"]]
  (alert-if-critical-too-long svc 120))
|