#!/bin/sh

# Mon May 23 01:05:59 BST 2005

# Generates a couple of plots with regard to spam detection over time.

# You have to change the following three variables as appropriate.
# You may also need to change the fields selected by the cut command, to match
# the date of receipt

# Common parent of the two KMail folders used below.
mailRoot='/home/serzan/.kde/share/apps/kmail/mail'
# Folder holding the spam that was NOT recognised as spam.
unsureDir="${mailRoot}/.spam.directory/manual/cur"
# Folder holding the spam that was correctly recognised as spam.
certainDir="${mailRoot}/spam/cur"
# The e-mail address the messages were delivered to (yours).
address='serzan@hellug.gr'

# An empty IFS switches off field splitting, so unquoted expansions of
# multi-line values further down keep their newlines.
IFS=

# Scratch files live in a private directory created by mktemp instead of
# under predictable /tmp/<name><pid> paths, which another local user could
# pre-create or symlink. The EXIT trap removes the directory on every exit
# path; the signal trap turns an interrupt into a normal exit so that the
# EXIT trap still runs.
workDir=$(mktemp -d "${TMPDIR:-/tmp}/spamplot.XXXXXX") || exit 1
trap 'rm -rf -- "$workDir"' EXIT
trap 'exit 1' HUP INT TERM

tmp="$workDir/dates.all"    # every extracted date, one per line
tmp2="$workDir/dates.uniq"  # distinct dates
out="$workDir/first.dat"    # rows written by the first counting loop
out2="$workDir/second.dat"  # rows written by the second counting loop

# Both data files are appended to later, so start them out empty.
: >"$out"
: >"$out2"

# Pull the receipt date out of the Received header line that names the
# recipient, which should look something like:
# for <serzan@hellug.gr>; Mon, 16 May 2005 01:37:41 +0300
# In a line of that shape, space-separated fields 4-6 are the day, the month
# and the year. uniq drops adjacent identical hits before the cut.
cDates=$(grep "for <$address>" "$certainDir"/* | uniq | cut -d' ' -f 4,5,6)
uDates=$(grep "for <$address>" "$unsureDir"/*  | uniq | cut -d' ' -f 4,5,6)

# One date per line; printf keeps the embedded newlines whatever IFS is.
printf '%s\n' "$cDates" "$uDates" >"$tmp"

# Build the list of distinct dates in chronological order. A plain text sort
# would put "01 Jun 2005" before "16 May 2005" and scramble the day index that
# is later used as the x axis. awk prepends a YYYYMMDD key, sort -u orders and
# de-duplicates the keyed lines, and cut strips the key again.
# Blank lines (produced when a directory yields no matches) are dropped so
# that an empty date never becomes a day of its own.
awk '
	BEGIN {
		n = split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", names, " ")
		for (i = 1; i <= n; i++)
			month[names[i]] = i
	}
	NF { printf "%04d%02d%02d %s\n", $3, month[$2], $1, $0 }
' "$tmp" | LC_ALL=C sort -u | cut -d' ' -f 2- >"$tmp2"

# First data file: for every distinct date, count how many of the dates
# extracted into cDates and into uDates are equal to it.
# Each row appended to $out is "<day index> <cDates count> <uDates count>".
day=0
while IFS= read -r date; do
	# Progress output: the date currently being counted.
	echo "$date"
	# -x -F compares whole lines literally, so "6 May 2005" is not also
	# counted for "16 May 2005" or "26 May 2005"; -c prints the count.
	cCount=$(printf '%s\n' "$cDates" | grep -c -x -F -- "$date")
	uCount=$(printf '%s\n' "$uDates" | grep -c -x -F -- "$date")
	echo "$day $cCount $uCount" >>"$out"
	day=$((day + 1))
done <"$tmp2"

# Second data file: for every distinct date compute
#   dayCount - matching Received lines found in both directories,
#   pCount   - mails among those that carry the '{Possible Spam}' subject tag,
#   uCount   - the remainder (dayCount - pCount).
# Each row appended to $out2 is "<day index> <dayCount> <pCount> <uCount>".
day=0
# Back to default field splitting: the file list below is split on whitespace.
unset IFS
while read -r date; do
	# Progress output: the date currently being counted.
	echo "$date"
	# The [^0-9] guards stop "6 May 2005" from also matching "16 May 2005".
	# Only Received lines that end in the +0300 offset are considered.
	pattern="for <$address>.*[^0-9]$date[^0-9].*+0300\$"
	dayCount=$(grep -- "$pattern" "$unsureDir"/* "$certainDir"/* | uniq | wc -l)
	pCount=0
	# grep -l prints each matching file name once, so nothing has to be
	# parsed back out of "file:line" output. File names are assumed to be
	# free of whitespace, since the list is word-split.
	for mail in $(grep -l -- "$pattern" "$unsureDir"/* "$certainDir"/*); do
		# -q: only the exit status matters, nothing is printed.
		if grep -q 'Subject: {Possible Spam} ' "$mail"; then
			pCount=$((pCount + 1))
		fi
	done
	uCount=$((dayCount - pCount))
	echo "$day $dayCount $pCount $uCount" >>"$out2"
	day=$((day + 1))
done <"$tmp2"

# Feed the plotting commands to gnuplot on standard input. A here-document is
# used instead of "echo -e", whose option and escape handling differ between
# /bin/sh implementations. The shell expands $out and $out2 inside it and
# joins the lines that end in a backslash, so each plot command reaches
# gnuplot as a single line.
# Two PNG files are written: /tmp/detected.png and /tmp/undetected.png.
gnuplot <<EOF
set terminal png
set out '/tmp/detected.png'
set xlabel 'day'
set ylabel 'number of spam mails'
set key top left
set title 'spam detection over time'
plot '$out2' using 1:2 title 'total spam' w linespoints, \
     '$out' using 1:2 title 'detected by bogofilter' w linespoints, \
     '$out2' using 1:3 title 'detected by ISP filter' w linespoints

set terminal png
set out '/tmp/undetected.png'
set xlabel 'day'
set ylabel 'number of spam mails'
set key top left
set title 'spam detection over time'
plot '$out2' using 1:2 title 'total spam' w linespoints, \
     '$out2' using 1:4 title 'false negatives (ISP filter)' w linespoints, \
     '$out' using 1:3 title 'false negatives (bogofilter)' w linespoints
EOF

# Remove the scratch files. The paths are quoted so each one reaches rm as a
# single argument, and -- keeps a path from ever being read as an option.
rm -f -- "$tmp" "$tmp2" "$out" "$out2"
