#!/usr/bin/perl -w
#
# Scan the system log for NetBackup / Media Manager messages saying that a
# tape drive or robot has been DOWNed by a daemon (as opposed to by an
# operator) and report each such event.
#
# Output goes to STDOUT unless --email is given, in which case the report is
# mailed to the --recipient address.  With --openview every event is also
# raised as an OpenView alert through opcmsg.
#
# Exit status: 0 on success (including --help); dies with a non-zero status
# if the syslog file cannot be opened.

use strict;
use POSIX;
use Getopt::Long;
use LWP;

my $send_email = 0;
#my $email_target = 'backupandrecovery2@radian.biz';
my $email_target = 'gabriel.rosenkoetter@radian.biz';
my $send_opcmsg = 0;
my $help = 0;
my $debug = 0;

# --policy is accepted so existing invocations keep working, but nothing in
# the log scan below consults it.  It was previously used without ever being
# declared, which is fatal under "use strict".
my $pol_prefix = '';

# Remember the built-in defaults so --help can show them even when the
# options were overridden on the same command line.
my $pp_def = $pol_prefix;
my $et_def = $email_target;

GetOptions('policy=s' => \$pol_prefix,
           'email' => \$send_email,
           'recipient=s' => \$email_target,
           'openview' => \$send_opcmsg,
           'help' => \$help,
           'debug' => \$debug);

if ($help) {
    print " --policy, -p ..... Specify policy [prefix] to watch.\n",
          " [Default: $pp_def]\n",
          " --email, -e ...... Send email notification.\n",
          " --recipient, -r .. Specify email recipient.\n",
          " [Default: $et_def]\n",
          " --openview, -o ... Use opcmsg to raise an OpenView alert.\n",
          " --help, -h ....... Display this message.\n\n",
          " In absence of -e or -o, output is to STDOUT.\n";
    exit 0;
}

$ENV{PATH} .= ':/usr/openv/netbackup/bin:/usr/openv/netbackup/bin/admincmd:/usr/openv/netbackup/bin/goodies:/usr/openv/volmgr/bin:/usr/openv/volmgr/bin/goodies:/usr/openv/java:/opt/OV/bin';

my $date = strftime('%c', localtime()); # %c is "locale appropriate"

# Node name of this host; the second element of POSIX::uname() is what
# "uname -n" prints, without spawning a shell.
my $hostname = (uname())[1];

# XXX we want to:
#   1 check the last time we ran (lock file with $date as
#     sec-since-epoch?
#   2 only check log messages since then (INCLUDING ROTATED LOGS!)
#
# considering 1 more carefully later...

my $email_body = '';

# XXX
#   1 figure out what MM uses for logging
#   2 parse the appropriate log file out of syslog.conf
#   3 bonus points: figure out what log rotation we're using and parse
#     "yesterday's" file out of THAT
my $syslog_file = "/var/adm/syslog/syslog.log";

# Three-argument open with a real read mode and a lexical handle; fail
# loudly instead of silently scanning nothing.
open my $syslog, '<', $syslog_file
    or die "Cannot open $syslog_file: $!\n";

while (my $line = <$syslog>) {
    # Drives can be downed by bptm like this:
    # Feb 13 08:54:44 pa2ibks2 bptm[10416]: Application (NetBackup) has DOWN'ed drive index 2, see application error log for further information
    # (for more than 3 I/O failures in the last N hours)
    #
    # or more directly by robot daemons (tldd, acsd, ...?) like this:
    # Feb 16 09:10:58 pa2ibks2 tldd[17447]: TLD(1) drive 3 (device 14) is being DOWNED, status: Robotic mount failure
    # (could also be dismount failure)
    #
    # We care about these.
    #
    # They can also get downed by vmoprcmd through ltid like this:
    # Feb 15 10:32:56 pa2ibks2 ltid[17353]: Operator has DOWN'ed drive P2R1E4-9940b-00-ndmp (device 12)
    #
    # We don't care about these.
    #
    # The consistent ID here is "(device X)", where X is the index
    # number for that host, so we want to use that and tpconfig to
    # find the rest of the drive's information.
    #
    # Robots can be downed by robot daemons like this:
    # Feb 12 22:21:49 pa2ibks2 tldd[1886]: TLD(1) going to DOWN state, status: Unable to sense robotic device
    # (need some more samples... ACS especially may give more info on the cause)
    next unless $line =~ m/DOWN/;
    chomp $line;

    # Fifth whitespace-separated field is "daemon[pid]:" in the syslog
    # format shown above.
    my $proc = (split(' ', $line))[4]; # XXX ugh, portabiliwhat?
    next unless defined $proc;

    # Skip anything not logged by a daemon we care about (e.g. ltid, which
    # records operator-initiated DOWNs).
    my $what = classify_down_source($proc);
    next unless defined $what;

    # The raw syslog line is included because it carries the event's own
    # timestamp, the device index and the failure status.
    my $message = sprintf "NetBackup %s DOWN event on %s (reported %s): %s\n",
        $what, $hostname, $date, $line;
    $email_body .= $message;
    chomp($message);

    if ($send_opcmsg) {
        # Built as an argument list and exec'd directly: the message holds
        # text copied from syslog, which must never be parsed by a shell.
        my @command = ('opcmsg', 'severity=major', 'application=Netbackup',
                       'msg_grp=SAN', 'object=Drive or Robot Down',
                       "msg_text=$message");
        $email_body .= "Notifying OpenView with command:\n "
                     . join(' ', @command) . "\n";
        my ($output, $status) = run_capturing(@command);
        # Only opcmsg failures are worth repeating in the report.
        $email_body .= $output if ($status > 0);
    }

    $email_body .= "\n";
}

close $syslog;

if ($send_email && length($email_body) > 0) {
    # LWP hands mailto: URLs to the local mail system.
    my $ua = LWP::UserAgent->new;
    my $req = HTTP::Request->new(POST => "mailto:$email_target");
    $req->header(Subject => "NetBackup drive/robot DOWN events for $hostname");
    $req->content($email_body);
    my $res = $ua->request($req);
    print STDERR $res->status_line, "\n" unless ($res->is_success);
} else {
    print $email_body;
}

exit 0;

# classify_down_source($proc)
#
# Map the "daemon[pid]:" field of a syslog line to a description of what was
# DOWNed.  Returns 'drive' for bptm, 'drive or robot' for the robotic
# daemons, and undef for any other source.
sub classify_down_source {
    my ($proc) = @_;

    # bptm only ever downs drives.
    return 'drive' if $proc =~ m/^bptm/;

    # Robotic daemons can down either a drive or the robot itself.
    # XXX relevant? (odld, tsdd)
    return 'drive or robot'
        if $proc =~ m/^(?:tldd|acsd|tl4d|tl8d|tlhd|tlmd|ts8d|odld|tsdd)/;

    return;
}

# run_capturing(@command)
#
# Run @command without a shell, capturing STDOUT and STDERR together.
# Returns (output, status), where status is the child's wait status as left
# in $? by close(), or -1 if the fork itself failed.
sub run_capturing {
    my @command = @_;

    # Forking open: the child's STDOUT is the pipe the parent reads from.
    my $pid = open(my $from_child, '-|');
    return ("Cannot fork to run $command[0]: $!\n", -1)
        unless defined $pid;

    if ($pid == 0) {
        # Child: fold STDERR into the pipe (the equivalent of "2>&1").
        open STDERR, '>&', \*STDOUT
            or die "Cannot redirect STDERR: $!\n";
        exec { $command[0] } @command
            or die "Cannot exec $command[0]: $!\n";
    }

    my $output = do { local $/; <$from_child> };
    $output = '' unless defined $output;
    close $from_child;    # reaps the child and sets $?
    return ($output, $?);
}