#!/usr/bin/perl
#
# pflogsumm.pl - Produce summaries of Postfix/VMailer MTA in logfile -
#	Copyright (C) 1998-99 by James S. Seymour (jseymour@jimsun.LinxNet.com)
#	(See "License", below.)  Version 19990121-01
#
# Usage:
#    pflogsumm.pl -[emq] [-d <today|yesterday>] [-c <cnt>] [file1 [filen]]
#
# Options:
#    -c <cnt>       top <cnt> to display in each traffic catagory
#
#    -d today       means just today
#    -d yesterday   means just "yesterday"
#
#    -e             extended (extreme? excessive?) detail - emit detailed
#                   reports.  At present, this includes only a per-message
#                   report, sorted by sender domain, then user-in-domain,
#                   then by queue i.d.
#
#                   WARNING: the data built to generate this report can
#                   quickly consume very large amounts of memory if a lot
#                   of log entries are processed!
#
#    -m             modify (mung?) UUCP-style bang-paths
#
#                   This is for use when you have a mix of Internet-style
#                   domain addresses and UUCP-style bang-paths in the log.
#                   Upstream UUCP feeds sometimes mung Internet domain
#                   style address into bang-paths.  This option can
#                   sometimes undo the "damage".  For example:
#                   "somehost.dom!username@foo" (where "foo" is the next
#                   host upstream and "somehost.dom" was whence the email
#                   originated) will get converted to
#                   "foo!username@somehost.dom".  This also affects the
#                   extended detail report (-e), to help ensure that by-
#		    domain-by-name sorting is more accurate.
#
#    -q             quiet - don't print headings for empty reports (note:
#                   headings for warning, fatal, and "master" messages will
#                   always be printed.)
#
#    If no file(s) specified, reads from stdin.  Output is to stdout.
#
# Typical usage:
#    Produce a report of previous day's activities:
#        pflogsumm.pl -d yesterday /var/log/syslog
#    A report of prior week's activities (after logs rotated):
#        pflogsumm.pl /var/log/syslog.1
#    What's happened so far today:
#        pflogsumm.pl -d today /var/log/syslog
#
# Notes:
#
#    For display purposes: integer values are munged into "kilo" and
#    "mega" notation as they exceed certain values.  I chose the
#    admittedly arbitrary boundaries of 512k and 512m as the points
#    at which to do this--my thinking being 512x was the largest
#    number (of digits) that most folks can comfortably grok
#    at-a-glance.  These are "computer" "k" and "m", not 1000 and
#    1,000,000.  You can easily change all of this with some
#    constants near the beginning of the program.
#
#    The "messages-per-day" report is not generated for single-day
#    reports.  For multiple-day reports: "messages-per-hour" numbers
#    are daily averages (reflected in the report heading).
#
#    It's important that the logs are presented to pflogsumm in
#    chronological order so that message sizes are available when
#    needed.
#
#    A note to the software-savvy: there will appear to be a lot of
#    duplication (code-wise) herein.  There is.  This is generally
#    done for the performance benefit of avoiding subroutine call
#    overhead in loops.
#
# License:
#    This program is free software; you can redistribute it and/or
#    modify it under the terms of the GNU General Public License
#    as published by the Free Software Foundation; either version 2
#    of the License, or (at your option) any later version.
#    
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#    
#    You may have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
#    USA.
#    
#    An on-line copy of the GNU General Public License can be found
#    http://www.fsf.org/copyleft/gpl.html.
#
# Thanks To:
#    Paul D. Robertson <proberts@clark.net>
#    Simon J Mudd <simon.mudd@alltrading.com>

use strict;
use Getopt::Std;

# Variables and constants used throughout pflogsumm
use vars qw(
    $progName
    $usageMsg
    $opt_c $opt_d $opt_e $opt_m $opt_q
    $divByOneKAt $divByOneMegAt $oneK $oneMeg
    @monthNames %monthNums $thisYr $thisMon
);

# Some constants used by display routines.  I arbitrarily chose to
# display in kilobytes and megabytes at the 512k and 512m boundaries,
# respectively.  Season to taste.
$divByOneKAt   = 524288;	# 512k
$divByOneMegAt = 536870912;	# 512m
$oneK          = 1024;		# 1k
$oneMeg        = 1048576;	# 1m

# Constants used throughout pflogsumm
@monthNames = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
%monthNums = qw(
    Jan  0 Feb  1 Mar  2 Apr  3 May  4 Jun  5
    Jul  6 Aug  7 Sep  8 Oct  9 Nov 10 Dec 11);
($thisMon, $thisYr) = (localtime(time()))[4,5];

# Variables used only in main loop
my (
    $cmd, $qid, $addr, $size, $relay, $status,
    $dateStr,
    %fatals, %warnings, %masterMsgs,
    %fromCnt, %fromSize, %msgSizes, %toCnt, %toSize,
    %deferred, %bounced,
    %noMsgSize, %msgDetail,
    $senderCnt, $recipCnt, $msgsRcvd, $msgsDlvrd, $sizeRcvd, $sizeDlvrd,
    $msgMonStr, $msgMon, $msgDay, $msgTimeStr, $msgHr, $msgMin, $msgSec,
    $msgYr,
    $revMsgDateStr, $dayCnt, %msgsPerDay,
    %rejects,
    %rcvdMsg, $fwdCnt
);

# Messages received and delivered per hour
my @rcvPerHr = qw(0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0);
my @dlvPerHr = @rcvPerHr;
my $lastMsgDay = 0;

$progName = "pflogsumm.pl";
$usageMsg =
    "usage: $progName -[emq] [-d <today|yesterday>] [-c <cnt>] [file(s)]";

# I know: this is ugly
$opt_c = 0;
$opt_e = 0;
$opt_m = 0;
$opt_q = 0;
getopts('c:d:eqm') ||
    die "$usageMsg\n";

$dateStr = get_datestr($opt_d) if(defined($opt_d));

# debugging
#open(UNPROCD, "> unprocessed") ||
#    die "couldn't open \"unprocessed\": $!\n";

while(<>) {
    next if(defined($dateStr) && ! /^$dateStr/o);
    ($msgMonStr, $msgDay, $msgTimeStr, $cmd, $qid) =
	m#^(...)\s+([0-9]+)\s(..:..:..)\s.*?(?:vmailer|postfix)[-/]([^\[:]*).*?: ([^:]+)#o;
    ($msgMonStr, $msgDay, $msgTimeStr, $cmd, $qid) =
	m#^(...)\s+([0-9]+)\s(..:..:..)\s.*?(vmailer|postfix[^\[:]*).*?: ([^:]+)#o unless($cmd);
    next unless($cmd);
    chomp;

    # snatch out log entry date & time
    ($msgHr, $msgMin, $msgSec) = split(/:/, $msgTimeStr);
    $msgMon = $monthNums{$msgMonStr};
    $msgYr = $thisYr; --$msgYr if($msgMon > $thisMon);

    # the following test depends on one getting more than one message a
    # month--or at least that successive messages don't arrive on the
    # same month-day in successive months :-)
    unless($msgDay == $lastMsgDay) {
	$lastMsgDay = $msgDay;
	$revMsgDateStr = sprintf "%d%02d%02d", $msgYr + 1900, $msgMon, $msgDay;
	++$dayCnt;
    }

    if($qid eq 'warning') {
	++$warnings{$cmd};
    } elsif($qid eq 'fatal') {
	++$fatals{$cmd};
    } elsif($qid eq 'reject') {
	# This could get real ugly!
	# First: get everything following the "reject: " token
	my ($rejTyp, $rejFrom, $rejRmdr) =
	    /^.* reject: ([^ ]+) from ([^:]+): (.*)$/o;
	# Next: get the reject "reason"
	(my $rejReas = $rejRmdr) =~ s/^(?:.*[:;] )?([^,]+).*$/$1/o;
	# stash in "triple-subscripted-array"
	++$rejects{$rejTyp}{$rejReas}{gimme_domain($rejFrom)};
    } elsif($cmd eq 'master') {
	++$masterMsgs{(split(/^.*master.*: /))[1]};
    } else {
	if((($addr, $size) = /from=<([^>]*)>, size=([0-9]+)/o) == 2)
	{
	    next if($msgSizes{$qid});	# avoid double-counting!
	    if($addr) {
		if($opt_m && $addr =~ /^(.*!)*([^!]+)!([^!@]+)@([^\.]+)$/o) {
		    $addr = "$4!" . ($1? "$1" : "") . $3 . "\@$2";
		}
		$addr =~ s/(@.+)/\L$1/o;
	    } else {
		$addr = "from=<>"
	    }
	    $msgSizes{$qid} = $size;
	    push(@{$msgDetail{$qid}}, $addr) if($opt_e);
	    # Avoid counting forwards
	    if($rcvdMsg{$qid}) {
	        ++$senderCnt unless($fromCnt{$addr});
		++$fromCnt{$addr};
		$fromSize{$addr} += $size;
		$sizeRcvd += $size;
		delete($rcvdMsg{$qid});		# limit hash size
	    }
	}
	elsif((($addr, $relay, $status, my $rmdr) =
		/to=<([^>]*)>, relay=([^,]+),.*? status=([^ ]+)(.*)$/o) >= 3)
	{
	    if($status eq 'sent') {
		# was it actually forwarded, rather than delivered?
		if($rmdr =~ /forwarded as /o) {
		    ++$fwdCnt;
		    next;
		}
		if($opt_m && $addr =~ /^(.*!)*([^!]+)!([^!@]+)@([^\.]+)$/o) {
		    $addr = "$4!" . ($1? "$1" : "") . $3 . "\@$2";
		}
		$addr =~ s/(@.+)/\L$1/o;
		++$recipCnt unless($toCnt{$addr});
		++$toCnt{$addr};
		++$dlvPerHr[$msgHr];
		++${$msgsPerDay{$revMsgDateStr}}[1];
		++$msgsDlvrd;
		if($msgSizes{$qid}) {
		    $toSize{$addr} += $msgSizes{$qid};
		    $sizeDlvrd += $msgSizes{$qid};
		} else {
		    $noMsgSize{$qid} = $addr;
		    push(@{$msgDetail{$qid}}, "(sender not in log)") if($opt_e);
		    # put this back later? mebbe with -v?
		    # msg_warn("no message size for qid: $qid");
		}
		push(@{$msgDetail{$qid}}, $addr) if($opt_e);
	    } elsif($status eq 'deferred') {
		my ($deferredReas) = /, status=deferred \(([^:\)\/]+)/o;
		++$deferred{$cmd}{$deferredReas};
	    } elsif($status eq 'bounced') {
		my ($bounceReas) = /, status=bounced \(([^.:;,\)]+)/o;
		++$bounced{$relay}{$bounceReas};
	    } else {
#		print UNPROCD "$_\n";
	    }
	}
	elsif(($cmd eq 'pickup' && /: (sender|uid)=/o) ||
	      ($cmd eq 'smtpd'  && /: client=/o))
	{
	    ++$rcvPerHr[$msgHr];
	    ++${$msgsPerDay{$revMsgDateStr}}[0];
	    ++$msgsRcvd;
	    ++$rcvdMsg{$qid};	# quick-set a flag
	}
	else
	{
#	    print UNPROCD "$_\n";
	}
    }
}

# debugging
#close(UNPROCD) ||
#    die "problem closing \"unprocessed\": $!\n";

if(defined($dateStr)) {
    print "Postfix log summaries for $dateStr\n";
}

print "\nGrand Totals\n------------\n";
print_val_with_title($msgsRcvd,  "messages received");
print_val_with_title($msgsDlvrd, "messages delivered");
print_val_with_title($sizeRcvd,  "bytes received");
print_val_with_title($sizeDlvrd, "bytes delivered");
print_val_with_title($senderCnt, "senders");
print_val_with_title($recipCnt,  "recipients");
print_val_with_title($fwdCnt,    "forwarded");

print_per_day_summary(\%msgsPerDay) if($dayCnt > 1);
print_per_hour_summary(\@rcvPerHr, \@dlvPerHr, $dayCnt);

print_hash_by_cnt_vals(\%fromCnt, "Senders by message count", $opt_c, $opt_q);
print_hash_by_cnt_vals(\%toCnt, "Recipients by message count", $opt_c, $opt_q);
print_hash_by_cnt_vals(\%fromSize, "Senders by message size", $opt_c, $opt_q);
print_hash_by_cnt_vals(\%toSize, "Recipients by message size", $opt_c, $opt_q);

print_hash_by_key(\%noMsgSize, "Messages with no size data", $opt_c, 1);

print_nested_hash(\%deferred, "messages deferred", $opt_q);
print_nested_hash(\%bounced, "messages bounced (by relay)", $opt_q);
print_nested_hash(\%rejects, "messages rejected", $opt_q);

print_hash_by_cnt_vals(\%warnings, "Warnings", 0, $opt_q);
print_hash_by_cnt_vals(\%fatals, "Fatal Errors", 0, $opt_q);
print_hash_by_cnt_vals(\%masterMsgs,"Master daemon messages", 0, $opt_q);

print_detailed_msg_data(\%msgDetail, "Message detail", $opt_q) if($opt_e);


# print an integer value with associated title
sub print_val_with_title {
    my($value, $title) = @_;
    $value = 0 unless($value);
    my $units = ' ';
    if($value > $divByOneMegAt) {
	$value /= $oneMeg;
	$units = 'm'
    } elsif($value > $divByOneKAt) {
	$value /= $oneK;
	$units = 'k'
    }
    printf " %6d%s  %s\n", $value, $units, $title;
}

# print "per-day" traffic summary
# (done in a subroutine only to keep main-line code clean)
sub print_per_day_summary {
    my($msgsPerDay) = @_;
    print <<End_Of_Per_Day_Heading;

Per-Day Traffic Summary
    date          received  delivered
    ---------------------------------
End_Of_Per_Day_Heading

    foreach (sort { $a <=> $b } keys(%$msgsPerDay)) {
	my ($msgYr, $msgMon, $msgDay) = unpack("A4 A2 A2", $_);
	my $msgMonStr = $monthNames[$msgMon];
	printf "    $msgMonStr %2d $msgYr", $msgDay;
	foreach my $value (@{$msgsPerDay->{$_}}) {
	    $value = 0 unless($value);
	    my $units = ' ';
	    if($value > $divByOneMegAt) {
		$value /= $oneMeg;
		$units = 'm'
	    } elsif($value > $divByOneKAt) {
		$value /= $oneK;
		$units = 'k'
	    }
	    printf "    %6d%s", $value, $units;
	}
	print "\n";
    }
}

# print "per-hour" traffic summary
# (done in a subroutine only to keep main-line code clean)
sub print_per_hour_summary {
    my($rcvPerHr, $dlvPerHr, $dayCnt) = @_;
    my $reportType = $dayCnt > 1? 'Daily Average' : 'Summary';
    print <<End_Of_Per_Hour_Heading;

Per-Hour Traffic $reportType
    time        received  delivered
    -------------------------------
End_Of_Per_Hour_Heading

    for(my $hour = 0; $hour < 24; ++$hour) {
	printf "    %02d00-%02d00", $hour, $hour + 1;
	foreach my $value (@$rcvPerHr[$hour], @$dlvPerHr[$hour]) {
	    my $units = ' ';
	    $value = ($value / $dayCnt) + 0.5 if($dayCnt);
	    if($value > $divByOneMegAt) {
		$value /= $oneMeg;
		$units = 'm'
	    } elsif($value > $divByOneKAt) {
		$value /= $oneK;
		$units = 'k'
	    }
	    printf "    %6d%s", $value, $units;
	}
	print "\n";
    }
}

# print hash contents sorted by numeric values in descending
# order (i.e.: highest first)
sub print_hash_by_cnt_vals {
    my($hashRef, $title, $cnt, $quiet) = @_;
    my $dottedLine;
    $title = sprintf "%s%s", $cnt? "top $cnt " : "", $title;
    unless(%$hashRef) {
	return if($quiet);
	$dottedLine = ": none";
    } else {
	$dottedLine = "\n" . "-" x length($title);
    }
    printf "\n$title$dottedLine\n";
    really_print_hash_by_cnt_vals($hashRef, $cnt, ' ');
}

# print hash contents sorted by key in ascending order
sub print_hash_by_key {
    my($hashRef, $title, $cnt, $quiet) = @_;
    my $dottedLine;
    $title = sprintf "%s%s", $cnt? "first $cnt " : "", $title;
    unless(%$hashRef) {
	return if($quiet);
	$dottedLine = ": none";
    } else {
	$dottedLine = "\n" . "-" x length($title);
    }
    printf "\n$title$dottedLine\n";
    foreach (sort keys(%$hashRef))
    {
	printf " %s  %s\n", $_, $hashRef->{$_};
	last if --$cnt == 0;
    }
}

# print "nested" hashes
sub print_nested_hash {
    my($hashRef, $title, $quiet) = @_;
    my $dottedLine;
    unless(%$hashRef) {
	return if($quiet);
	$dottedLine = ": none";
    } else {
	$dottedLine = "\n" . "-" x length($title);
    }
    printf "\n$title$dottedLine\n";
    walk_nested_hash($hashRef, 0);
}

# "walk" a "nested" hash
sub walk_nested_hash {
    my ($hashRef, $level) = @_;
    $level += 2;
    my $indents = ' ' x $level;
    my ($keyName, $hashVal) = each(%$hashRef);

    if(ref($hashVal) eq 'HASH') {
	foreach (sort keys %$hashRef) {
	    print "$indents$_\n";
	    walk_nested_hash($hashRef->{$_}, $level);
	}
    } else {
	really_print_hash_by_cnt_vals($hashRef, 0, $indents);
#	print "\n"
    }
}

# print per-message info in excruciating detail :-)
sub print_detailed_msg_data {
    use vars '$hashRef';
    local($hashRef) = $_[0];
    my($title, $quiet) = @_[1,2];
    my $dottedLine;
    unless(%$hashRef) {
	return if($quiet);
	$dottedLine = ": none";
    } else {
	$dottedLine = "\n" . "-" x length($title);
    }
    printf "\n$title$dottedLine\n";
    foreach (sort by_domain_then_user keys(%$hashRef))
    {
	printf " %s  %s\n", $_, shift(@{$hashRef->{$_}});
	foreach (@{$hashRef->{$_}}) {
	    print "   $_\n";
	}
	print "\n";
    }
}

# *really* print hash contents sorted by numeric values in descending
# order (i.e.: highest first) :-)
sub really_print_hash_by_cnt_vals {
    my($hashRef, $cnt, $indents) = @_;

    foreach (reverse sort { $hashRef->{$a} <=> $hashRef->{$b} }
	keys(%$hashRef))
    {
	my $value = $hashRef->{$_};
	my $units = ' ';
	if($value > $divByOneMegAt) {
	    $value /= $oneMeg;
	    $units = 'm'
	} elsif($value > $divByOneKAt) {
	    $value /= $oneK;
	    $units = 'k'
	}
	printf "$indents%6d%s  %s\n", $value, $units, $_;
	last if --$cnt == 0;
    }
}

# subroutine to sort by domain, then user in domain, then by queue i.d.
# Note: mixing Internet-style domain names and UUCP-style bang-paths
# may confuse this thing.  An attempt is made to use the first host
# preceding the username in the bang-path as the "domain" if none is
# found otherwise.
sub by_domain_then_user {
    # first see if we can get "user@somedomain"
    my($userNameA, $domainA) = split(/\@/, ${$hashRef->{$a}}[0]);
    my($userNameB, $domainB) = split(/\@/, ${$hashRef->{$b}}[0]);

    # try "somedomain!user"?
    ($userNameA, $domainA) = (split(/!/, ${$hashRef->{$a}}[0]))[-1,-2]
	unless($domainA);
    ($userNameB, $domainB) = (split(/!/, ${$hashRef->{$b}}[0]))[-1,-2]
	unless($domainB);

    # now re-order "mach.host.dom"/"mach.host.do.co" to
    # "host.dom.mach"/"host.do.co.mach"
    $domainA =~ s/^(.*)\.([^\.]+)\.([^\.]{3}|[^\.]{2,3}\.[^\.]{2})$/$2.$3.$1/o
	if($domainA);
    $domainB =~ s/^(.*)\.([^\.]+)\.([^\.]{3}|[^\.]{2,3}\.[^\.]{2})$/$2.$3.$1/o
	if($domainB);

    # oddly enough, doing this here is marginally faster than doing
    # an "if-else", above.  go figure.
    $domainA = "" unless($domainA);
    $domainB = "" unless($domainB);

    if($domainA lt $domainB) {
	return -1;
    } elsif($domainA gt $domainB) {
	return 1;
    } else {
	# disregard leading bang-path
	$userNameA =~ s/^.*!//o;
	$userNameB =~ s/^.*!//o;
	if($userNameA lt $userNameB) {
	    return -1;
	} elsif($userNameA gt $userNameB) {
	    return 1;
	} else {
	    if($a lt $b) {
		return -1;
	    } elsif($a gt $b) {
		return 1;
	    }
	}
    }
    return 0;
}

# return a date string to match in log
sub get_datestr {
    my $dateOpt = $_[0];

    my $aDay = 60 * 60 * 24;

    my $time = time();
    if($dateOpt eq "yesterday") {
	$time -= $aDay;
    } elsif($dateOpt ne "today") {
	die "$usageMsg\n";
    }
    my ($t_mday, $t_mon) = (localtime($time))[3,4];

    return sprintf("%s %2d", $monthNames[$t_mon], $t_mday);
}

# if there's a real domain: uses that.  Otherwise uses the first
# three octets of the IP addr.  (In the latter case: usually pretty
# safe to assume it's a dialup with a class C IP addr.)  Lower-
# cases returned domain name.
sub gimme_domain {
    $_ = $_[0];
 
    # split domain/ipaddr into separates
    my($domain, $ipAddr) = /^([^\[]+)\[([^\]]+)\]:?$/o;
 
    # now re-order "mach.host.dom"/"mach.host.do.co" to
    # "host.dom.mach"/"host.do.co.mach"
    if($domain eq 'unknown') {
        ($domain = $ipAddr) =~ s/\.[0-9]+$//o;
    } else {
        $domain =~
            s/^(.*)\.([^\.]+)\.([^\.]{3}|[^\.]{2,3}\.[^\.]{2})$/\L$2.$3/o;
    }
 
    return $domain;
}

###
### Warning and Error Routines
###

# Emit warning message to stderr
sub msg_warn {
    print STDERR "warning: $progName: $_[0]\n";
}

