#!/usr/bin/perl
#
my $revision = '$Id: sanitizer.pl,v 1.21 2000/07/22 16:30:03 bre Exp $';
my $version = 'Anomy 0.1.0 : sanitizer.pl';
#
##  Copyright (c) 2000 Bjarni R. Einarsson. All rights reserved.
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 2 of the License, or
##  (at your option) any later version.
#
# This is an email sanitizer.  Stick it in your .procmailrc!
# Implemented features:
#
#   + Minimal resource consumption.
#   + Can truncates all MIME headers to a reasonable length, to avoid 
#     buffer overflows in buggy mail readers.  Also attempts to protect
#     against overflows based on information parsed from other headers.
#   + Can renames attachments so they won't get auto-executed by naughty
#     mail readers.
#   + Defangs active HTML content.
#   + Can attach a log of what was done to the message (if anything
#     interesting was done, that is, and we can find a place to put
#     the log w/o breaking the message).
#   + Supports external virus scanners.
#   + Includes a crude built in scanner designed to guess whether macros in 
#     Microsoft documents are hostile or not.
#
# TODO:
#
#   + Copy Inflex's incoming/outgoing mail hack.
#   + More flexible logging, e.g. by email, files or syslog.
#   + Allow rejection of messages, resulting in replies etc.
#
# Most of the ideas in this script were borrowed from John D. Hardin's 
# "security through procmail" ruleset, which is available here:
# ftp://ftp.rubyriver.com/pub/jhardin/antispam/procmail-security.html
#
# This script never loads the entire message into memory and creates no
# temporary files unless virus scanning is enabled, which should help it
# scale.
#
# Note that this script is a little differently licensed from the rest 
# of the Anomy tools because I borrowed GPL'd code from John's script.
#
# Documentation and new versions are here: http://mailtools.anomy.net/
#
BEGIN { push @INC, $ENV{"ANOMY"} . "/bin"; };
use strict;
use Anomy::MIMEStream;
use IO::File;


##[ Default configuration ]###################################################

my $conf = {

    # Features.
    # Disable stuff by replacing 1s with 0s.
    #
    "feat_verbose"    => 1,  # Warn user about unscanned parts and Other Stuff.
    "feat_log_inline" => 1,  # Attach log to message.
    "feat_log_stderr" => 1,  # Print log to stderr.
    "feat_files"      => 1,  # Enable filename-based policy decisions.
    "feat_boundaries" => 0,  # Replace all boundary strings with our own.
                             # NOTE:  Always breaks PGP/MIME messages!
    "feat_lengths"    => 1,  # Protect against buffer overflows.
    "feat_scripts"    => 1,  # Defang incoming shell scripts.
    "feat_html"       => 1,  # Defang active HTML content.
    "feat_trust_pgp"  => 0,  # Trust PGP signed messages -> don't scan them
	"feat_uuencoded"  => 1,  # Sanitized UU encoded attachments.
	"feat_forwards"   => 1,  # Sanitize forwarded messages.

	# Score thresholds.
	# Use 0 to disable each threshold.
    #
    "score_bad"      => 100, # Any message exceeding this value will cause
                             # the sanitizer to return a non-zero exit code
                             # after processing the entire message.

    "score_panic"    => 0,   # If the sanitizer's internal score exceeds this
                             # value, the sanitizer will terminate immediately
                             # with a non-zero exit code.

    ##########################################################################
    # If feat_files is non-zero, the following rules will be used to decide
    # what to do with an attachment.  The rules are all filename based, each
    # "list" being a regulaur expression.
    #
    # The file is compared to each list in order (1 to file_list_max) and on
    # the first match the defined policy is enforced.  If a file matches no
    # lists the default policy is used.
    #
    # Valid policies are:
    #
    #   mangle  - Completely ofbuscates the file name.
    #   defang  - Defangs the file name, without making it completely 
    #             illegible.
    #   accept  - Attachment is accepted as-is (possibly subject to 
    #             HTML or shell script defanging though).
    #   save    - Save the attachment to the "file_save_dir" directory,
    #             replace it with an informative message.
    #   drop    - Delete the attachment
    #   unknown - Indeterminate result, check the next policy.
	#   panic   - Return immediately with a non-zero exit code.
    #
	# In addition, if an exclamation mark (!) is appended to a policy, then
	# in addition to enforcing the policy, the internal bug score will be
	# incremented past the "score_bad" value, causing the sanitizer to 
	# return a non-zero exit code whnt it is finished.
	#
    # If a policy has four values, e.g. "save:save!:drop:save", then the file 
    # will be scanned for viruses using an external virus scanner.  Which of 
    # the four policies is used then depends on whether the result is "clean"
    # (1st), "successfully disinfected" (2nd), "unsuccessfully disinfected" 
    # (3rd) or "scan failed" (4th).
    #
    # The scanner definitions are as follows:
    #
    #   "e1:e2:e3:e4:/path/to/scanner args ... %FILENAME ..."
    #
    # The e1, e2, e3, e4 are comma-delimited lists of exit codes that match
    # the four different "interesting" return values we exped scanners to 
    # return.  Unexpected values are assumed to be in the "scan failed" 
    # category.
    #

    "file_list_rules" => 3,

    # This is the file name template, used for creating (temporary?) files
    # when scanning or saving attachments.  The following substitutions are
    # supported:
    #               $d - Day of month (01-31)
    #               $m - Month number (01-12)
    #               $y - Two digit year (00-99)
    #               $Y - Four digit year
    #               $H - Hour (00-23)
    #               $M - Minute (00-59)
    #               $S - Second (00-59)
    #
    #               $P - This process's PID, in hex.
    #               $T - The current Unix time, in hex.
    #               $F - A safe version of the original file name.
    #               $  - A random character, from [A-Z0-9].
    #
    # It's recommended that all file name templates contain a few '$'
    # characters, since a new name will be generated (up to five times, 
    # after that it will give up) if the chosen one is already in use.  
    # More '$' substitions will mean fewer collisions.  Note that any
    # directories must exist, the sanitizer will NOT create them for you.
    # So if you are using random directory hashing make sure to create 
    # all the directories ahead of time!
    #
    "file_name_tpl" => '/tmp/att-$F-$T.$$',

    # This defines file names which always get mangled.
    # FIXME:  This list is very incomplete!
    "file_list_1_scanner" => 0,
    "file_list_1_policy" => "mangle",
    "file_list_1" => '(?i)'.
        '((happy99|x-mas|setup|aol4free|ie0199).exe)',

    # This defines file names which don't get mangled, unless they are on
    # the blacklist.  Anything not on this list gets mangled.
    #
    "file_list_2_scanner" => 0,
    "file_list_2_policy" => "accept",
    "file_list_2" => '(?i)\.'.
        '(gif|jpe?g|pn[mg]|x[pb]m|dvi|e?ps|p(df|cx)'.       # Graphics
        '|mp[32]|wav|au|ram?'.                              # Sound
        '|avi|mov|mpe?g'.                                   # Movies
        '|z(ip|oo)|ar[cj]|lha|[tr]ar|rpm|deb|slp|tgz'.      # Archives
        '|t(xt|ex)|csv|l(og|yx)'.                           # Text
        '|[ch](pp|\+\+)?|s|inc|asm|pa(tch|s)|java|php\d?'.  # Uncompiled code
        '|[ja]sp'.                                          # ...
    #
    # The following depends on the HTML sanitizer doing it's job properly.
    #
        '|[sp]?html?'.                                      # HTML
        ')(\.g?z|\.bz2?)*$',                                # Compressed?

    # Scan attachments that look like they're Microsoft documents for
    # viruses, using John D. Hardin's macro scanning code.  A score of 
    # 25 or above is considered "infected" and the file is quarantined.
    #
    "file_list_3_scanner" => '0:1:2:builtin 25',
    "file_list_3_policy" => "accept:save:save:defang",
    "file_list_3"        => '(?i)\.(do[tc]|xl[sw]|p[po]t|rtf)$',

    # Define some empty lists, so people don't have to edit the source to
    # define policies with more lists.
    "file_list_4_policy"  => 0,
    "file_list_4_scanner" => 0,
    "file_list_4"         => 0,
    "file_list_5_policy"  => 0,
    "file_list_5_scanner" => 0,
    "file_list_5"         => 0,

    # This defines the default policy, for filenames that don't match
    # any of the preceding lists.
    "file_default_policy" => "defang",

    ##########################################################################

    # Messages.  Translate?
    #
    "header_info" => "X-Sanitizer: This message has been sanitized!",
    "header_url"  => "X-Sanitizer-URL: http://mailtools.anomy.net/",
    "header_rev"  => "X-Sanitizer-Rev: $revision",

    "msg_file_drop" => 
        "****\nNOTE:  An attachment was deleted from this part of the message,\n".
        "because it failed one or more checks by the virus scanning system.\n".
        "See the attached sanitization log for more details or contact your\n".
        "system administrator.\n\n".
        "The removed attachment's name was:\n\n".
        "\t%FILENAME\n\n".
        "It might be a good idea to contact the sender and warn them that\n".
        "their system is infected.\n****\n",

    "msg_file_save" => 
        "****\nNOTE:  An attachment was deleted from this part of the message,\n".
        "because it failed one or more checks by the virus scanning system.\n".
        "The file has been quarantined on the mail server, with the following\n".
        "file name:\n\n".
        "\t%SAVEDNAME\n\n".
        "The removed attachment's original name was:\n\n".
        "\t%FILENAME\n\n".
        "It is recommended that you contact your system administrator if you\n".
        "need access to the file.  It might also be a good idea to contact the\n".
        "sender, and warn them that their system may be infected.\n****\n",

    "msg_pgp_warning" => 
        "WARNING:  The following data has NOT been sanitized, to ensure\n".
        "          that the signature remains intact, if valid.  Please\n".
        "          be careful if you open any enclosed attachments.\n\n",

    "msg_log_prefix" =>
        "This message has been 'sanitized'.  This means that potentially\n".
        "dangerous content has been rewritten or removed.  The following\n".
        "log describes which actions were taken.\n",

    "msg_usage" =>
        "$version\n$revision\n\n".
        "Usage: sanitizer.pl [ 'variable op value' | 'filename' ] ... \n".
        "\n".
        "FIXME:  unwritten\n",

    "msg_current" =>
        "Current configuration:\n",

    # Limits
    "max_conf_recursion" => 5,
};


##[ Main ]####################################################################

my $pid = $$;
my $log = undef;
my $bugscore = 0;
my $boringlog = undef;
my @charray = ('A'..'Z', 0..9);

# Read command line arguements, configure sanitizer.
#
# On error, print out usage instructions and a copy of the
# current configuration to STDERR.
#
while (my $arg = shift @ARGV)
{
	if (my $err = ReadConfigLine($arg))
	{
		print STDERR $conf->{"msg_usage"}, "\n";
		print STDERR $conf->{"msg_current"}, "\n";
		
		for my $key (sort(keys(%{ $conf })))
		{
			my $val = $conf->{$key};
			$key = sprintf("%-20s", $key);
			
			$val =~ s/\\/\\\\/g;
			$val =~ s/\n/\\n\n$key += $1/g;
			$val =~ s/\t/\\t/g;
			$val =~ s/\#/\\#/g;
#			$val =~ s/(.{55})/$1/g;
			$val =~ s/\+=  /+= \\s/g;
			$val =~ s/ \n/\\s\n/g;
			$val =~ s/\n\S+\s+[\+\.]?=\s*$//ms;

			print STDERR $key, "  = ", $val, "\n";
		}
		
		print STDERR "\n", $err, "\n";
		exit(1);
	}
}

# Sanity check for scoring system
#
if (($conf->{"score_panic"}) && 
    ($conf->{"score_panic"} < $conf->{"score_bad"}))
{
	$conf->{"score_bad"} = $conf->{"score_panic"};
}

# Set up parsers
#
my $parsers = {
	# Search text parts for inline uuencoded attachments, so we can mangle
	# their names and possibly scan the attachments themselves.  This also
	# takes care of defanging HTML.
	"text/*" => \&CleanText,
	"application/pgp" => \&CleanText,
	"inline/text/*" => \&CleanText,
	
	# Check headers, parse contents...
	"multipart/*" => \&CleanMultipart,

	# Sanitize encapsulated messages.
	"message/rfc822" => \&CleanRFC822,

    # We don't recognize this content-type, see if we can figure anything
    # out from the headers themselves.  Sanitize headers (at least).
    "DEFAULT" => \&CleanUnknown,
};

if ($conf->{"feat_trust_pgp"})
{
    # Only scan headers of signed stuff, if we decide that we
    # trust messages that are signed or encrypted.
    #
	$parsers->{"multipart/signed"} = \&CleanHeaders;
	$parsers->{"multipart/encrypted"} = \&CleanHeaders;
}

# Parse message header
#
my $message = Anomy::MIMEStream->New(*STDIN, *STDOUT, $parsers);
$message->ParseHeader();

# Append blurb to header.
{
	my $t;
	chomp $message->{"rawheader"};
	$message->{"rawheader"} .= "$t\n" if ($t = $conf->{"header_info"});
	$message->{"rawheader"} .= "$t\n" if ($t = $conf->{"header_url"});
	$message->{"rawheader"} .= "$t\n" if ($t = $conf->{"header_rev"});
	$message->{"rawheader"} .= "\n";
}

# Go!
$message->ParseBody();

DumpLog(undef, 1);

exit(1) if (($conf->{"score_bad"}) && ($bugscore >= $conf->{"score_bad"}));
exit(0);


##[ Configuration routines ]##################################################

# Parses a single line from a configuration file.
# Configuration files look like this:
#
#	# this is a comment
#	# set some variables
#	variable = value
#	variable += value
#	# load another configuration file
#	/path/to/another/configuration/file
#
# All white space in the value is replaced with spaces.   A " #" sequence
# (white space followed by '#') marks the beginning of a comment, and is
# ignored.  
#
# The following escape sequences are expanded in the value string to let
# you get around these "features":
#
#		\\	->	\
#		\#	->	#
#		\n	->	newline
#		\t	->	tab
#		\s	->	space
#
# Using the .= or += instead of just = will append the string to the
# variable, instead of resetting it.
#
my $config_recursion = 0;
sub ReadConfigLine
{
	my $line = shift;
	
	if ($line =~ /^\s*([a-z0-9_]+)\s*([\+\.]*=)\s*(.*)\s*$/si)
	{
		# OK, this lookes like a variable configuration
		
		my ($var, $op, $val) = (lc($1), $2, $3);

		unless (defined $conf->{$var}) 
		{
			return "Unknown configuration variable: $var\n";
		}

		$val =~ s/^\s+#.*$//;
		$val =~ tr/\t\n/  /;
		$val =~ s/\\([\\nts#])/ { $_=$1; tr|\\nts#|\\\n\t #|; $_ } /eg;

		if ($op eq '=')
		{			
			$conf->{$var} = $val;
		}
		else
		{
			$conf->{$var} .= $val;
		}
	}
	elsif ($line !~ /^\s*(#.*)?$/)
	{
		# Ooh, we're supposed to include another configuration file!
		
		my $fn = $line;
		$fn =~ s/^\s*(.*)\s*$/$1/;

		$config_recursion++;
		return "Configuration files nested too deep!\n"
			if ($config_recursion > $conf->{"max_conf_recursion"});

		local *CF;
		return "Cannot read $fn: $!\n"
			unless ((-r $fn) && (open (CF, "< $fn")));

		my $ln = 0;
		while (my $cl = <CF>)
		{
			$ln++;
			if (my $err = ReadConfigLine($cl))
			{
				$config_recursion--;
				return "[$config_recursion] Error in $fn, line $ln:\n" . $err;
			}
		}
		$config_recursion--;
	}
	return undef;
}

##[ Sanitizers, output, etc. ]#################################################


# Add a single line to our log, update the bug score.
#
sub Log
{
	my ($pinc, $sinc, $msg) = @_;
	$bugscore += $sinc;
		
	$log .= "[ score: $bugscore ]\n" if ($sinc);
	$log .= sprintf "%5.5d\t%s", $pid, $msg;

	die "Too few argumenets to Log(), stopped" unless ($msg);

	if (($conf->{"score_panic"}) && ($bugscore >= $conf->{"score_panic"}))
	{
		$log .= sprintf "%5.5d\tPanic threshold %s exceeded, terminating.\n", 
					$pid, $conf->{"score_panic"};

		DumpLog(undef, 1);
		exit(2);
	}

	$pid += $pinc;
}

# Yup.
#
sub QuoteHTML
{
	my $data = shift;
	$data =~ s/\&/&amp;/g;
	$data =~ s/\</&lt;/g;
	$data =~ s/\>/&gt;/g;
	return $data;
}


# This will print out the contents of $log, if the time looks right.
#
# Unfortunately, we can't guarantee that the time will /ever/ be right,
# so this may fail to embed the logs in the message - they are guaranteed
# to go to stderr though.  The logs try to be as unobtrusive as possible.
#
sub DumpLog
{
	my $part = shift;
	my $force = shift;
	my $type = $part->{"mime"}->{"type"} if ($part);

	return unless ($log);
	my $inline = (($conf->{"feat_log_inline"}) && ($part));

	my $ppart;
	if (($inline) && ($ppart = $part->{"parent"}))
	{
	 	$inline = 0 if ($ppart->{"parent"});
		$inline = 0 if ($ppart->{"mime"}->{"type"} =~ /^(multipart|text)\//i);
		$inline = 0 if ($type !~ /^text\/(plain|html)/i);
	}
	
	my $prelog = \$conf->{"msg_log_prefix"};

	my $logged = 0;
	if ($inline)
	{

		$logged = 1;
		if ($type =~ /^multipart\//i) 
		{
			$part->Write(
				"--". $part->{"mime"}->{"boundary"} ."\n".
				"Content-Type: text/sanitizer-log; charset=\"iso-8859-1\"\n".
				"Content-Transfer-Encoding: 8bit\n".
				"Content-Disposition: attachment; filename=\"sanitizer.log\"\n\n".
				"$$prelog\n$log\n$boringlog\n$version\n$revision\n\n");
		}
		elsif ($type =~ /^(text\/plain|application\/pgp)/i)
		{
			$part->Write("-- \n$$prelog\n$log\n$version\n$revision\n");
		}
		elsif ($type =~ /^text\/html/i) 
		{
			$part->Write("<HR><B>$$prelog</B><P><PRE>\n".
						QuoteHTML($log) ."\n$version\n$revision\n</PRE>\n");
		}
		else
		{
			# Failed..
			$logged = 0;
		}
	}
	if ($logged || $force)
	{
		if ($conf->{"feat_log_stderr"})
		{
			print STDERR "\n", $$prelog, "\n", $log, "\n\n";
		}

		# Clear log.
		$log = undef;
	}
}


# This will create a new file based on the "file_name_tpl" template.
# Returns undef on failure.
#
sub CreateAttFile
{
  	my $fn = shift;
	my $ofn = shift;
	my $fh = undef;
	my $cnt = 0;

	$ofn = Anomy::MIMEStream::Encode7bit(undef, $ofn);
	$ofn =~ s/[^A-Za-z0-9\._=-]/_/g;

	do
	{
		my $T = time();
		my ($S, $M, $H, $d, $m, $y) = localtime($T);
		$$fn = $conf->{"file_name_tpl"} || return undef;

		# Date stuff
		$$fn =~ s/\$T/ sprintf("%x", $T) /eg;
		$$fn =~ s/\$S/ sprintf("%2.2d", $S) /eg;
		$$fn =~ s/\$M/ sprintf("%2.2d", $M) /eg;
		$$fn =~ s/\$H/ sprintf("%2.2d", $H) /eg;
		$$fn =~ s/\$d/ sprintf("%2.2d", $d) /eg;
		$$fn =~ s/\$m/ sprintf("%2.2d", $m) /eg;
		$$fn =~ s/\$y/ sprintf("%2.2d", $y % 100) /eg;
		$$fn =~ s/\$Y/ sprintf("%d", $y + 1900) /eg;

		# PID
		$$fn =~ s/\$P/ sprintf("%x", $$) /eg;
		
		# Safe file name
		$$fn =~ s/\$F/$ofn/g;

		# Random characters
		$$fn =~ s/\$/ $charray[ int(rand(35.99)) ] /eg;
	}
	while (($cnt++ < 5) && 
		   (!defined ($fh = IO::File->new($$fn, O_CREAT|O_EXCL|O_RDWR))));

	return $fh;
}


# This is the built in macro scanner, based on John D. Hardin's code.
#
# It has been improved to use only a fixed, small amount of memory
# even when reading binary data.  The scanner checks a sliding 256-byte
# window - first for a sign that this is an MS document, then for stuff 
# that looks like macros.
#
# When macro stuff is seen, a score is incremented by an amount hopefully
# reflecting how "dangerous" it is.  If the total score exceeds a user 
# defined value, the attachment is considered poisoned and the scan 
# terminates immediately with a nonzero return value.
#
sub MacroScan
{
	my $fh = shift;
	my $poison_score = shift;

	my $score = 0;
	my $msapp = 0;	
	
	# Read relatively small chunks at a time, to minimize the extra
	# work done by the pattern maching.  We just trust the OS to make
	# this efficient...
	#
	my $chunksize = 128;
	local $/ = \$chunksize;
	
	# Initialize buffer.
	$fh->seek(0, SEEK_SET);
	my $buff = <$fh> . <$fh>;

	Log 0, 0, "Scanning attachment:\n";

	while ($buff)
	{	
		unless ($msapp)	
		{
			if ($buff =~ /\000(Microsoft (Word Document|Excel Spreadsheet)|MSWordDoc|Microsoft Excel|Word\.Document\.[0-9]+|Excel\.Sheet\.[0-9]+)\000/)
			{
				$msapp = 1;

				Log 0, 0, "\tThis appears to be a Microsoft document, restarting scan...\n";
				
				# Restart scan
				$fh->seek(0, SEEK_SET);
				$buff = <$fh> . <$fh>;
				next;
			}
		}
		else
		{
			# Lots of while loops here - we replace the leading \000 boundary
		   	# with 'x' characters to ensure this eventually completes.
			#
			$score += 99 while ($buff =~ s/\000(VirusProtection)/x$1/i);
			$score += 99 while ($buff =~ s/\000(select\s[^\000]*shell\s*\()/x$1/i);
			$score +=  9 while ($buff =~ s/\000(regedit|SaveNormalPrompt|Outlook.Application\000)/x$1/i);
			$score +=  4 while ($buff =~ s/\000(ID="{[-0-9A-F]+)$/x$1/i);
			$score +=  4 while ($buff =~ s/\000(CreateObject)/x$1/i);
			$score +=  4 while ($buff =~ s/(?:\000|\004)(([a-z0-9_]\.)*(Autoexec|Workbook_(Open|BeforeClose)|Document_(Open|New|Close)))/x$1/i);
			$score +=  4 while ($buff =~ s/(?:\000|\004)(Logon|AddressLists|AddressEntries|Recipients|Subject|Body|Attachments|Logoff)/x$1/i);
			$score +=  2 while ($buff =~ s/\000(Shell|Options|CodeModule)/x$1/i);
			$score +=  2 while ($buff =~ s/\000(([a-z]+\.)?Application\000)/x$1/i);
			$score +=  2 while ($buff =~ s/(?:\000|\004)(stdole|NormalTemplate)/x$1/i);
			$score +=  1 while ($buff =~ s/\000(ID="{[-0-9A-F]+}"|ThisWorkbook\000|PrivateProfileString)/x$1/i);
			$score +=  1 while ($buff =~ s/(?:\000|\004)(ActiveDocument|ThisDocument)/x$1/i);
			$score +=  1 while ($buff =~ s/\000(\[?HKEY_(CLASSES_ROOT|CURRENT_USER|LOCAL_MACHINE))/x$1/);

			# Save cycles!  Return early!
			if ($score > $poison_score) 
			{
				Log 1, 0,	"Score exceeded $poison_score after scanning ". 
							($chunksize * $.) ." bytes!\n\n";
				return 1;
			}
		}

		# Read on...
		$buff = substr($buff, $chunksize, $chunksize) . <$fh>;
	}

	Log 1, 0,	"Attachment passed macro scan.\n\n"; 
	return 0;
}


# Scan a file for viruses, using the given command string.
#
sub ScanFile
{
	my ($e1, $e2, $e3, $cmd) = split(/:/, shift);
	my $filename = shift;
	my $fh = shift;

	return 0 unless (defined $cmd);

	$cmd =~ s/%FILENAME/$filename/gi;
	my @args = split(/\s+/, $cmd);						
	my $spid = undef;
	my $sleeps = 0;

	# Use built-in scanner if requested.
	return MacroScan($fh, $args[1]) if (lc($args[0]) eq "builtin");

	# Flush buffers, before forking.
 	STDOUT->flush();
	STDERR->flush();

	do
	{
		unless (defined ($spid = open(SCANNER, "-|")))
		{
			Log 0, 0, "Cannot fork: $!";
			return 3 if ($sleeps++ > 6);
			sleep(10);
		}
	} until (defined $spid);

	if (!$spid) # Are we the kid?
	{
		print STDOUT "Scan cmd: ", join(' ', @args), "\n";
		
		# We want the scanner's stderr to be sent to stdout!
		close(STDERR) && 
			open(STDERR, ">&STDOUT") || 
				print STDOUT "WARNING:  Couldn't dup STDOUT!\n";

		unless (exec { $args[0] } @args)
		{
			print STDOUT "Exec failed: $!\n";
			die "Exec failed: $!";
		}
	}

	# Not the kid, read the scanner's output.
	Log 0, 0, "Scanning attachment:\n";
	while (my $l = <SCANNER>)
	{
		Log 0, 0, "\t" . $l;
	}
	close (SCANNER);

	# Was file clean?
	for my $v (split(/,/, $e1))
	{
		if ($? == 256*$v)
		{
			Log 0, 0, "Scan succeeded, file is clean.\n";
			return 0;
		}
	}

	# Was file dirty, but is now clean?
	for my $v (split(/,/, $e2))
	{
		if ($? == 256*$v)
		{
			Log 0, 0, "File was infected, but the virus checker fixed it.\n";
			return 1;
		}
	}

	# Was file dirty and unfixable?
	for my $v (split(/,/, $e3))
	{
		if ($? == 256*$v)
		{
			Log 0, 0, "File was infected, virus checker couldn't fix it.\n";
			return 2;
		}
	}

	Log 0, 0, "Unknown exit code: $?\n";
	return 3;
}


# Clean/scan a file, sanitize the file name.  
#
# This will change the part's reader pointer to point to a virtual on-disk
# part, if a virus scanner is used.
#
sub SanitizeFile
{
	my $part = shift;
	
	my $unknown = undef;
	my $fnp = \$unknown;
	my $typep = \$unknown;
	
	for my $fhn ("name", "filename")
	{
		$fnp = \$part->{"mime"}->{$fhn} if ($part->{"mime"}->{$fhn}); 
	}
	$typep = \$part->{"mime"}->{"type"} if ($part->{"mime"}->{"type"}); 
	
	# Abort if the part has no file name.
	return undef unless ($$fnp);

	# Store original file name & type
	my $ofn = $$fnp;
	my $otype = $$typep;
	my $pol = undef;
	my $sinc = 0;
	my $fh = undef;

	for my $i (1..$conf->{"file_list_rules"}, -1)
	{
		if (($i < 0) || ($$fnp =~ $conf->{"file_list_$i"}))
		{
			my @policy = split(":", $conf->{"file_default_policy"});
			my $filename = undef;

			@policy = split(":", $conf->{"file_list_${i}_policy"})
				if ($conf->{"file_list_${i}_policy"});

			my $scan_result = 0;
			if ((@policy == 1) && (lc($policy[0]) =~ /^drop\!?$/))
			{
				if (!$fh)
				{
					while ($part->Read()) { };
				}
			}
			elsif ((@policy == 4) || (lc($policy[0]) =~ /^save\!?$/))
			{
				# Create a file name from our template.
				my $ofh = $fh;
				if (($fh) || ($fh = CreateAttFile(\$filename, $$fnp)))
				{
					if (!$ofh)
					{
						# Save the attachment to disk...
						while (my $l = $part->Read())
						{
							# FIXME:  need better error handling!
							$fh->print($l) || die;
						}

						$fh->flush();
					}

					# We want to scan this attachment
					if (@policy == 4)
					{
						$scan_result = 
							ScanFile($conf->{"file_list_${i}_scanner"},
									 $filename, $fh);
					}
				}
				else
				{
					# Error :(
					Log 0, 0, "Failed to create temporary file for scanning attachment!\n";
					$scan_result = 3;
				}
			}
			$pol = lc($policy[$scan_result]);
			$sinc += $conf->{"score_bad"} if ($pol =~ s/\!$//);

			# Enforce policy, based on scan result.
			if ($pol eq "mangle")
			{
				$sinc += 100;
				$$fnp = "BLACKLISTED.DEFANGED-$pid";
				$$typep = "application/DEFANGED-$pid";
			}
			elsif ($pol eq "defang")
			{
				$$fnp =~ s/\./_/g;
				$$fnp .= ".DEFANGED-$pid";
				$$typep = "application/DEFANGED-$pid";
			}
			elsif ($pol =~ /^(drop|save)$/i)
			{
				my $msg = $conf->{"msg_file_". lc($1)};
				my $fn = $filename;

				$fn =~ s/^.*\///g;
				$msg =~ s/%FILENAME/$$fnp/gi;
				$msg =~ s/%SAVEDNAME/$fn/gi;
				$part->UnRead($msg);

				# Fix encoding etc.
				$part->{"mime"}->{"encoding"} = "8bit";
				$part->{"mime"}->{"disposition"} = "inline";
				$part->{"encoder"} = $part->{"encoders"}->{"8bit"};
				if ($part->{"uupart"})
				{
					$part->{"postponed"} = undef;
					$part->{"rawheaders"} = undef;
					$part->{"uupart"} = 0;
				}
				$$typep = "text/plain";
				$$fnp = "DEFANGED-$pid.txt";

				# This keeps the file from getting reincluded.
				$fh = undef;

				# This may keep the file from getting deleted.
				$filename = undef if ($pol eq "save");
			}
			elsif ($pol eq "panic")
			{
				$$fnp .= "-PANIC";
				if ($conf->{"score_panic"})
				{
					$sinc += $conf->{"score_panic"};
				}
				else
				{
					$conf->{"score_panic"} = 10000;
					$sinc += 10000;
				}
			}

			if (defined $fh)
			{
				# Prepare to re-read attachment from disk.
				# I suppose this is bad OO-form.
				$fh->seek(0, SEEK_SET);
				my $reader = Anomy::MIMEStream->New(*$fh);
				$part->{"reader"} = $reader;
				$reader->{"writer"} = $part;
			}

			# Keep trying?
			next if ($pol eq "unknown");

			# Cleanup.
			unlink $filename if ($filename);

			last;
		}
	}

	# Remove spooky characters from file name.
	$$fnp = Anomy::MIMEStream::Encode7bit(undef, $$fnp);
	$$fnp =~ s/[^A-Za-z0-9\._=-]/_/g;

	# Truncate file name length, by chopping off anything preceding the
	# last 80 characters - the extension matters more than the beginning
	# of the name.
	$$fnp =~ s/^.*(.{80})$/$1/;

	# Log change...
	if (($ofn) && ($$fnp ne $ofn))
	{
		Log 0, 0, "Enforced policy: $pol (attachment $ofn)\n";
		Log 0, 0, "Replaced MIME type:\n\t>>$otype<<\t\nwith\t>>$$typep<<\n" 
			if (($otype) && ($otype ne $$typep));
		Log 1, $sinc, "Rewrote filename:\n\t>>$ofn<<\nas\t\>>$$fnp<<\n\n";
	}
}


# Truncate just about anything, printing out a blurb at the same
# time.  The fourth argument is a replacement value to use instead
# of the overly-long original one.  Omitting it just truncates the
# field.
#
sub Truncate
{
	my $dataname = shift;
	my $data = shift;
	my $maxlen = shift;
	my $safeval = shift;

	if ($maxlen < length($$data))
	{
		$safeval = substr($$data, 0, $maxlen) unless ($safeval);

		Log 1, 100,	"Rewrote long ( >$maxlen bytes ) $dataname:\n\t>>". 
					$$data ."<<\nas\t>>$safeval<<\n\n";

		$$data = $safeval;
	}
}


# This is a helper function for SanitizeHTML
#
sub SanitizeTag
{
	my $tag = shift;
	my $otag = $tag;

	# Don't defang email addresses!
	return $tag if ($tag =~ /<\S+\@/);

	# Defang dangerous tags & attributes
	$tag =~ s/(<(?:!--\s*)?)(META|APP|SCRIPT|OBJECT|EMBED|I?FRAME|STYLE)/${1}DEFANGED_$2/sgi;
	$tag =~ s/STYLE\s*=/DEFANGED_STYLE=/sgi;
	$tag =~ s/On(Abort|Blur|Change|Click|DblClick|DragDrop|Error|Focus|KeyDown|KeyPress|KeyUp|Load|MouseDown|MouseMove|MouseOut|MouseOver|MouseUp|Move|Reset|Resize|Select|Submit|Unload)/DEFANGED_On$1/sgi;

	# Unquote things that have no good reason to be quoted - potentially
	# dangerous obfuscated URLs, for the most part.
	if ($tag =~ /["'][^"'\s]*&#x?[1-9][0-9a-f]/si) 
	{
		while ($tag =~ /["'][^"'\s]*&#((4[6-9]|5[0-8]|6[4-9]|[78][0-9]|9[07-9]|1[0-1][0-9]|12[0-2]))/s) 
		{
			my $char = chr($1);
			$tag =~ s/&#$1;?/$char/sg;
		}
		while ($tag =~ /["'][^"'\s]*&#(x(2[ef]|3[0-9a]|4[0-9a-f]|5[0-9a]|6[1-9a-f]|7[0-9a]))/si) 
		{
			my $char = chr(hex("0$1"));
			$tag =~ s/&#$1;?/$char/sgi;
		}
	}
	if ($tag =~ /["'][^"'\s]*%[2-7][0-9a-f]/si) {
		while ($tag =~ /["'][^"'\s]*%((2[ef]|3[0-9a]|4[0-9a-f]|5[0-9a]|6[1-9a-f]|7[0-9a]))/si)
		{
			my $char = chr(hex("0x$1"));
			$tag =~ s/%$1/$char/sgi;
		}
	}

	# Defang javascript: urls.
	$tag =~ s/(["'])([a-z]+script|mocha):/${1}DEFANGED_$2:/sgi;
	$tag =~ s/(["'])&{/${1}DEFANGED_&{/sg;

	# Logging
	if ($otag ne $tag)
	{
		my $ntag = $tag;

		$ntag =~ s/\s+/ /gs;
		$ntag =~ s/[<>]/_/gs;
		$otag =~ s/\s+/ /gs;
		$otag =~ s/[<>]/_/gs;
		Log 1, 1, "Rewrote HTML tag:\n\t$otag\nas\t$ntag\n\n";
	}

	return $tag;
} 

# This routine sanitizes a snippet of HTML.
#
# If the tag is incomplete, it is returned as a "leftover" for later
# scanning, else the function returns undef.  The calling function is
# expected to detect overly long leftovers, and truncate them if 
# necessary.
#
# This is based on John's HTML sanitization code.  I may replace this 
# later with a more complete parser which defangs any attributes not 
# specifically white-listed.
#
sub SanitizeHTML
{
	my $lp = shift;
	my $leftovers = undef;

	# Check for trailing tags...
	if ($$lp =~ s/(<(?:!--\s+)?[A-Za-z]+[^>]+)$//s)
	{
		$leftovers = $1;
		if (length($leftovers) > 2048) 
		{
			my $lo = $leftovers;
			
			$lo =~ s/[<>]/_/gs;
			$lo =~ s/\s+/ /gs;

			Log 0, 1,	"Split really long tag (over 2k):\n\t".
						substr($lo, 0, 25) ." ... ". 
						substr($lo, -25, -1) . " [snip!]\n\n";
			
			$$lp .= $leftovers . '>';
			$leftovers = "<DEFANGED.$pid ";
			$pid++;
		}
	}

	# Defang active HTML content
	$$lp =~ s/(<[A-Za-z]+.*?>)/ SanitizeTag($1) /gse;

	return $leftovers;
}

# This routine will truncate and rewrite message headers, to block 
# buffer-overflow exploits and filename-based trojans.
#
sub SanitizeHeaders
{
	my $part = shift;
	my $boundfix = shift;
	my $headers = $part->{"headers"};
	my $mime = $part->{"mime"};

	my $oldbugscore = $bugscore;

	# Header length checks, more strict than the generic tests below.
	if ($conf->{"feat_lengths"})
	{
		Truncate("MIME content-type", \$mime->{"type"},     80, "application/octet-stream");
		Truncate("MIME charset",      \$mime->{"charset"},  40, "iso-8859-1");
		Truncate("MIME encoding",     \$mime->{"encoding"}, 40, "8bit");
		

		# This is more strict than the following header trick, and is 
		# designed to explicitly catch the Outlook Date: overflow.
		if ($part->{"rawheader"} =~ s/(Date:\s*[^\n]{35,35})([^\n]+)/$1\nX-Defanged-Date: [$pid] $2/gs)
		{
			Log 1, 10, "Split unusually long Date: header.\n";
		}
		# This will limit the length of each individual word in the headers
		# to 128 characters, inserting a space when longer words are 
		# encountered.  This is designed to foil attacks based on 
		# vulnerabilities like those described in various bugtraq posts 
		# in July 2000, including the USSR-2000050 advisory.
		if ($part->{"rawheader"} =~ s/(\S{120,120})(\S)/$1 [Split:$pid] $2/gs)
		{
			Log 1, 10, "Split long word(s) in header.\n";
		}		
	}

	# File name sanity checks...
	SanitizeFile($part) if ($conf->{"feat_files"});

	# Replace the boundary string: we know ours is sane! :-)
	if (($conf->{"feat_boundaries"}) && ($boundfix) && ($mime->{"boundary"}))
	{
		$part->{"bad-mime-boundary"} = $mime->{"boundary"};
		$mime->{"boundary"} = Anomy::MIMEStream::MakeBoundary();

		$boringlog .= "Replaced original MIME boundary:\n\t>>".
					  $part->{"bad-mime-boundary"} ."<<\nwith\t>>".
					  $mime->{"boundary"} ."<<\n\n";
	}

	# Rebuild the MIME headers with sane/safe values.
	my $newheaders = undef;
	for my $header ("Content-Type", 
					"Content-Transfer-Encoding",
					"Content-Disposition")
	{
		next unless (defined $headers->{lc($header)});

		my @fields = split(/\s+/, $part->{"mime-headers"}->{lc($header)} );
		my $t = undef;

		for my $field (@fields)
		{
			my $value = $mime->{lc($field)};

			Truncate("MIME $field", \$value, 100) if ($conf->{"feat_lengths"});
			
			my $oval = $value;
			if (($conf->{"mangle"}) && ($value =~ s/[\'\"\`\$]/_/g))
			{
				Log 1, 10, "Rewrote MIME $field:\n\t>>$oval<<\nas\t>>$value<<\n\n";
			}

			if ($t)
			{
				$t .= "; $field=\"$value\"";
			}
			else
			{
				$t = $value;
			}
		}
		$newheaders .= $header .": ". $t ."\n";

		# Modifying the $headers directly here should be okay, since we are
		# inserting equivalent values (not identical) unless new bugs are 
		# found, in which case the raw headers will be updated.
		$headers->{lc($header)} = $t;
	}

	# Only modify part header if absolutely necessary.	
	if (($conf->{"bounds"}) || ($bugscore > $oldbugscore))
	{
		$part->KillRawMimeHeaders();
		chomp $part->{"rawheader"};
		if (!$part->{"parent"})
		{
			$part->{"rawheader"} .= "MIME-Version: 1.0\n";
			$headers->{"mime-version"} = "1.0\n";
		}
		$part->{"rawheader"} .= $newheaders . "\n";
	}
	
	# Remove any active HTML from header.
	my $rhl = SanitizeHTML(\$part->{"rawheader"});
	$part->{"rawheader"} .= $rhl;
}


##[ Parsers ]##################################################################


# This sanitizes a text part, and it's headers.
#
sub CleanText
{
 	my $part = shift;
	my $reader = $part;

	my $ishtml = 0;
	if (($part->{"mime"}->{"filename"} =~ /html?$/i) ||
	    ($part->{"mime"}->{"name"} =~ /html?$/i) ||
	    ($part->{"mime"}->{"type"} =~ /html?$/i))
	{
		$ishtml = 1;
	}

	my $in_trusted_text = 0;
    $in_trusted_text = 100
		if (($conf->{"feat_trust_pgp"}) &&
		    ($part->{"mime"}->{"type"} =~ /multipart\/signed/i));

	SanitizeHeaders($part, 1);
	$part->WriteHeader();

	# Some sanitizations only make sense at the very top of a file.	
	# This determines how many non-blank lines are "near the top".
	my $neartop = 5;

	my $leftovers = undef;
	while (my $l = $reader->Read())
	{
		$l = $leftovers . $l if ($leftovers);

		if ($l =~ /^-+BEGIN.*?SIGNED\s+MESSAGE-+\s*$/smi)
		{
			if ($conf->{"feat_trust_pgp"})
			{
				$in_trusted_text++;

				if ($conf->{"feat_verbose"})
				{
					Log 1, 1, "Disabled scanning for signed part of message.\n\n";
					$part->Write($conf->{"msg_pgp_warning"});
				}
				else
				{
					Log 1, 1, "Silently disabled scanning for signed part of message.\n\n";
				}
			}
		}

		if ($in_trusted_text)
		{
			$in_trusted_text-- if ($l =~ /^-+END.*?SIGNATURE-+\s*$/smi);
		}
		# This "else" is safe, since a PGP boundary is harmless.
		else
   		{
			# Check for inline forwarded messages.
		    if (($conf->{"feat_forwards"}) && 
				($l =~ s/^(---+.*?Forward.*?---+\s*)$//smi))
			{
				my $fwd = $1;

				# Deal with leftover html snippets by closing them.
				if (($conf->{"feat_html"}) && ($l !~ /^\s*$/))
				{
					$l .= "DEFANGED.$pid>\n";
					Log 0, 1, "Closed open HTML tag preceding forwarded message.\n\n";

					SanitizeHTML(\$l);
					$part->Write($l);
				}
				
			   	$part->Write($fwd);
				$reader->ParserForwardedMessage();
				$l = $reader->{"postponed"};
				undef $reader->{"postponed"};
			}
			# Check for inline uuencoded attachments.  
			# Sanitize their contents.
			if (($conf->{"feat_uuencoded"}) && 
				($l =~ s/^(begin \d\d\d\d? \S+\s*)$//smi))
			{
				$reader->UnRead($1);
				
				# Deal with leftover html snippets by closing them.
				if (($conf->{"feat_html"}) && ($l !~ /^\s*$/))
				{
					$l .= "DEFANGED.$pid>\n";
					Log 0, 1, "Closed open HTML tag preceding uuencoded attachment.\n\n";

					SanitizeHTML(\$l);
					$part->Write($l);
				}

				Anomy::MIMEStream::ParserUUAttachment($reader);
				$l = $reader->{"postponed"};
			}
			elsif ($neartop)
			{
				# Check for Unix shell scripts.
				if (($conf->{"feat_scripts"}) && 
					($l =~ s/^#!/#!\/bin\/sh\necho DEFANGED.$pid\nexit\n#!/gsm))
				{
					Log 1, 1, "Defanged UNIX shell script(s).\n\n";
				}
			}

			# Sanitize embedded HTML - we do this last so our hiding stuff
			# in $leftovers won't disable the other checks.
			$leftovers = SanitizeHTML(\$l) if ($conf->{"feat_html"});
		}

		$neartop-- if (($neartop) && ($l !~ /^\s*$/));

		if ((!$in_trusted_text) && ($ishtml) && 
			($l =~ s/^(.*)(<\/(?:BODY|HTML))/$2/si))
		{
			$part->Write($1);
			DumpLog($part);
		}
	   	$part->Write($l);
	}
	$part->Write($leftovers) if ($leftovers);
	
	# Append log "signature style"
	DumpLog($part) if (!$in_trusted_text);

	# Flush buffers
	$part->Write(undef);
}


# This sanitizes the headers of an otherwise unfamiliar part.
#
sub CleanUnknown
{
	my $part = shift;
	my $guess = $part->GuessMimeType();

	if ($guess =~ /text\/html/i)
	{
		# Why wasn't this marked as text/html ?
		return CleanHTML($part);
	}
	
	if ($guess =~ /text\/plain/)
	{
		# Why wasn't this marked as text/plain ?
		return CleanText($part);
	}

	return CleanHeaders($part);
}


# This sanitizes only the headers of a part.
#
sub CleanHeaders
{
	my $part = shift;
	my $reader = $part;

	SanitizeHeaders($part, 0);
	Anomy::MIMEStream::ParserCat($reader);

	# BUG: Can't dump logs here, could royally screw up message.
}


# This sanitizes the headers of a message/rfc822 part, before processing
# the part itself with the default handlers.
#
sub CleanRFC822
{
	my $part = shift;

	SanitizeHeaders($part, 0);
	Anomy::MIMEStream::ParserRFC822($part);

	# BUG: Can't dump logs here, could royally screw up message.
}


# This sanitizes the headers of a multipart part, before processing the
# part itself with the default handlers.
#
sub CleanMultipart
{
 	my $reader = shift;
	
	my $mime = { };
	%{ $mime } = %{ $reader->{"mime"} };

	# Create an output writer with clean headers.
	my $writer = Anomy::MIMEStream->Writer($reader, $mime);
	SanitizeHeaders($writer, 1);

	Anomy::MIMEStream::ParserUnclosedMultipart($reader);
	
	# Append sanitization log as seperate text part, if this is the end 
	# of the outermost multipart/* part.
	#
	DumpLog($writer);

	# Garbage collection & cleanup.
	$writer->Write(undef);
	$writer->Close();
	$writer->Amputate();
}
