# The original version of this file generates an approximately
# 1400-line NetSaint configuration in about 12 seconds (most of that
# time is spent finding the parents of hosts with the findparent
# script).

# The basic time period: all seven days, around the clock.  This can
# be made more complicated with different hours on different days,
# etc.
timeperiod t24x7 "All day, every day" {
    sunday, monday, tuesday, wednesday, thursday, friday, saturday 00:00-24:00;
}

# Me :-)
# Host problems are mailed with the hostmail command and service
# problems with the servmail command, both at any time (t24x7).
contact mcguire "Tommy McGuire" {
    email "mcguire@cs.utexas.edu";
    host hostmail:t24x7 down, recovery;
    service servmail:t24x7 critical, recovery;
}

# SSH, checked every 15 minutes, 3 retries at 5m intervals.  The
# command "cshosts -a linux" lists the department's Linux machines.
# NOTE(review): the trailing /0 on the contact is presumably "no
# repeat notifications" (compare the /180 on the https service) --
# confirm against the nscc documentation.
service ssh "SSH" {
    check t24x7/15;
    retry 3/5;
    contact mcguire:t24x7/0;
    command checkssh;
    critical;
    recovery;
    hostclass linux "|cshosts -a linux";
}

# NTP, on the same schedule and host list as SSH.
service ntp "NTP" {
    check t24x7/15;
    retry 3/5;
    contact mcguire:t24x7/0;
    command checkntp;
    critical;
    recovery;
    hostclass linux "|cshosts -a linux";
}

# We've had problems with cron, but this only checks once an hour.
service cron "cron" {
    check t24x7/60;
    retry 3/5;
    contact mcguire:t24x7/0;
    command checkcron;
    critical;
    recovery;
    hostclass linux "|cshosts -a linux";
}

# Horatio on spinach2.cs.utexas.edu.  While it stays down, this will
# re-notify me every 3 hours (180 minutes).
service https "Horatio HTTPS" {
    check t24x7/15;
    retry 2/5;
    contact mcguire:t24x7/180;
    command checkhttps;
    critical;
    warning;
    recovery;
    hostclass linux "spinach2";
}

# I'm only interested in the switches when a host is unreachable, so
# the switches get just this dummy service; their real job in this
# configuration is to be the parent of a host.
service null "NULL service" {
    check t24x7/15;
    retry 1/5;
    contact mcguire:t24x7/0;
    command check-dummy;
    hostclass switches "|cshosts switches";
}

# The department has every host's network parent in the DNS, and the
# findparent script prints the parent when given a hostname.
hostclass linux "CS production Linux" {
    command check-host-alive;
    parentcommand "/u/mcguire/src/nscc/findparent";
    down;
    recovery;
    contact mcguire:t24x7/0;
}

# Same as the linux class, but the switches also notify on
# "unreachable".
hostclass switches "CS network switches" {
    command check-host-alive;
    parentcommand "/u/mcguire/src/nscc/findparent";
    unreachable;
    down;
    recovery;
    contact mcguire:t24x7/0;
}

# Newlines inside a command string will be translated to \n, which is
# what puts each field of the mail body on its own line: the printf
# format below is deliberately split across lines.
# The commands would be better if backslash-escaped newlines were
# translated to spaces, but I haven't done that yet.  Until then,
# everything after the format string (arguments, pipe, mail command)
# has to stay on one long line, because a line break there would be
# turned into \n as well.

# Service notification: mail the details of a service state change.
command servmail "/usr/bin/printf \"Notification Type: %s
Service: %s
Host: %s
Address: %s
State: %s
Date/Time: %s
Additional Info: %s
Check execution time: %s
Latency: %s
\" '$NOTIFICATIONTYPE$' '$SERVICEDESC$' '$HOSTALIAS$' '$HOSTADDRESS$' '$SERVICESTATE$' '$DATETIME$' '$OUTPUT$' '$EXECUTIONTIME$' '$LATENCY$' | /lusr/bin/mail -s 'NETSAINT: $HOSTALIAS$/$SERVICEDESC$ $SERVICESTATE$ ($NOTIFICATIONNUMBER$)' $CONTACTEMAIL$";

# Host notification: mail the details of a host state change.
command hostmail "/usr/bin/printf \"Notification Type: %s
Host: %s
State: %s
Address: %s
Info: %s
Date/Time: %s
\" '$NOTIFICATIONTYPE$' '$HOSTNAME$' '$HOSTSTATE$' '$HOSTADDRESS$' '$OUTPUT$' '$DATETIME$' | /lusr/bin/mail -s 'NETSAINT: $HOSTNAME$ $HOSTSTATE$ ($NOTIFICATIONNUMBER$)' $CONTACTEMAIL$";

# Host liveness check (ping), used by both host classes.
command check-host-alive "/lusr/etc/netsaint/check_ping -H $HOSTADDRESS$ -w 10,10% -c 5000,100% -p 1";

# Per-service check commands, referenced by name from the service
# definitions.
command checkssh "/lusr/etc/netsaint/check_ssh $HOSTADDRESS$";
command checkntp "/lusr/etc/netsaint/check_time -H $HOSTADDRESS$ -w10 -c 60";

# Used by the "null" service on the switches.
# NOTE(review): "check_dummy 0" presumably always reports OK --
# confirm against the plugin's documentation.
command check-dummy "/lusr/etc/netsaint/check_dummy 0";
command checkhttps "/lusr/etc/netsaint/check_http -S $HOSTADDRESS$";

# Runs check_procs on the target host through check_by_ssh, looking
# at processes whose command name is "cron".
command checkcron "/lusr/etc/netsaint/check_by_ssh -H $HOSTADDRESS$ /lusr/etc/netsaint/check_procs -C cron -w :3 -c 1:";