


IF [DOES] NOT EXIST [[<X>] <Y> CYCLES]    THEN     action    [ELSE IF SUCCEEDED [[<X>] <Y> CYCLES] THEN action]

action 可选的动作:ALERT(告警)、RESTART(重启)、START(启动)、STOP(关闭)、EXEC(执行命令)、UNMONITOR(不监控)


# Watch the "named" daemon through its pidfile.
check process named with pidfile /var/run/named.pid
  # Program strings must use plain ASCII double quotes; typographic quotes
  # are not string delimiters in a Monit control file.
  start program = "/etc/init.d/named start"
  stop program  = "/etc/init.d/named stop"
  # Restart when the DNS protocol test over UDP port 53 fails.
  if failed port 53 use type udp protocol dns then restart
  # After 3 restarts within 5 cycles, apply the timeout action.
  if 3 restarts within 5 cycles then timeout





IF  resource  operator value [[<X>] <Y> CYCLES]    THEN    action [ELSE IF SUCCEEDED [[<X>] <Y> CYCLES] THEN action]

resource:就是监控的对象,如"CPU", "TOTALCPU", "CPU([user|system|wait])", "MEMORY", "SWAP", "CHILDREN", "TOTALMEMORY", "LOADAVG([1min|5min|15min])".

operator:比较运算符,如 >、=、< 等

 # System-wide resource thresholds for this host; every rule raises an alert.
 check system myhost.mydomain.tld
   # Load average
   if loadavg (1min) > 4 then alert
   if loadavg (5min) > 2 then alert
   # Memory
   if memory usage > 75% then alert
   # CPU, split by user / system / wait time
   if cpu usage (user) > 70% then alert
   if cpu usage (system) > 30% then alert
   if cpu usage (wait) > 20% then alert




  check file apache_bin with path /usr/local/apache/bin/httpd
if failed checksum and
  expect the sum 8f7f419955cefa0b33a2ba316cba3659 then unmonitor
if failed permission 755 then unmonitor
if failed uid root then unmonitor
if failed gid root then unmonitor
alert security@foo.bar on {
  checksum, permission, uid, gid, unmonitor
} with the mail-format { subject: Alarm! }
group server


# Watch a single file. A check statement needs a unique service name
# ("test_txt" here) between the check type and "with path".
check file test_txt with path /home/laicb/test.txt
  # Alert when the file has been missing for 5 cycles.
  if does not exist for 5 cycles then alert
  # Alert when the file size changes (evaluated over 1 cycle).
  # Comments in a Monit control file start with "#", not "//".
  # Original note (translated): if no cycle count is given, inspecting the
  # service reportedly shows "for 5 times within 5 cycles".
  # NOTE(review): confirm this default against the installed Monit version.
  if changed size for 1 cycles then alert


# Stop monitoring /etc/passwd if its owner is no longer root.
check file passwd with path /etc/passwd
  if failed uid root then unmonitor
# Stop monitoring /etc/shadow if its group is no longer root.
check file shadow with path /etc/shadow
      if failed gid root then unmonitor


# Supervise myapp through its pidfile and restart it once its uptime
# exceeds 3 days.
check process myapp with pidfile /var/run/myapp.pid
  start program = "/etc/init.d/myapp start"
  stop program  = "/etc/init.d/myapp stop"
  if uptime > 3 days then restart


# Filesystem check on /dev/sdb1; start/stop are the mount and umount of /data.
check filesystem datafs with path /dev/sdb1
  group server
  start program = "/bin/mount /data"
  stop program  = "/bin/umount /data"

  # Mode and ownership: a mismatch stops monitoring.
  if failed permission 660 then unmonitor
  if failed uid root then unmonitor
  if failed gid disk then unmonitor

  # Disk space: alert above 80 %, run the stop program above 94 %.
  if space usage > 80 % then alert
  if space usage > 94 % then stop

  # Inodes: same two thresholds as disk space.
  if inode usage > 80 % then alert
  if inode usage > 94 % then stop

  alert root@localhost


# Remote host check: alert when the ICMP echo test (5 requests, 15 second
# timeout) fails.
check host www.tildeslash.com with address www.tildeslash.com
  if failed icmp type echo count 5 with timeout 15 seconds then alert


