#!/usr/bin/perl -w

# Copyright (C) 2001 Simon Huggins

# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc., 59
# Temple Place, Suite 330, Boston, MA 02111-1307  USA

use strict;

# State shared by the rest of the script:
#   @tags      - the lines read from the tag file
#   %words     - word => number of times it occurs in the message
#   %blacklist - words from the __DATA__ section that are never counted
my (@tags,%words,%blacklist);

# Open the optional debug log.  Debugging is best-effort: if the log cannot
# be opened we warn and switch grep_debug off, so that the later
# "print DEBUG ... if $cfg{'grep_debug'}" statements do not write to an
# unopened handle.
if ($cfg{'grep_debug'}) {
	# Copy into a lexical so the three-argument open always sees an
	# ordinary scalar as the file name.
	my $debugfile = $cfg{'grep_debugfile'};
	if (not defined $debugfile) {
		warn "$0: grep_debug is set but grep_debugfile is not\n";
		$cfg{'grep_debug'} = 0;
	} elsif (not open(DEBUG, '>', $debugfile)) {
		warn "$0: Could not open $debugfile: $!\n";
		$cfg{'grep_debug'} = 0;
	}
}

# Read the whole tag file into @tags, one element per line (newlines kept).
# Three-argument open with a lexical handle: whitespace or mode characters
# in the configured path are treated as part of the file name instead of
# being interpreted by open.
my $tagfile = $cfg{'tagfile'};
open(my $tagfh, '<', $tagfile)
	or htagdie("Could not open $tagfile: $!\n");
@tags=<$tagfh>;
close($tagfh);

# Load the stop words listed after __DATA__, one per line.  All whitespace
# (including the newline and any trailing blanks) is stripped so entries
# compare equal to the words extracted from the message.
while (my $stopword = <DATA>) {
	$stopword =~ s/\s+//g;
	$blacklist{$stopword}++;
}

# Count how often each candidate word occurs in the message file.
# A message that cannot be opened is reported through htagdie, in the same
# way as the tag file and the output file, instead of silently producing
# an empty word list.
my $msgfile = $cfg{'msgfile'};
open(my $msgfh, '<', $msgfile)
	or htagdie("$0: Could not open $msgfile: $!\n");
while (my $line = <$msgfh>) {
	$line =~ s/\s+/ /g;          # collapse every run of whitespace
	$line =~ tr/A-Za-z0-9 //dc;  # delete non-alphanumeric
	$line =~ s/\b\d+\b//g;       # drop tokens made up solely of digits
	foreach my $word (split ' ', lc $line) {
		# Words longer than nine characters are not counted.
		next if length($word)>9;
		$words{$word}++ if not exists $blacklist{$word};
	}
}
close($msgfh);

# Gather candidate tags.  Message words are visited from most to least
# frequent; every tag line containing the word (whole-word match, case
# ignored) is added to @goodtags.
my @goodtags;
my $count=0;
my @ranked = sort { $words{$b} <=> $words{$a} } keys %words;
for my $word (@ranked) {
	my @matches = grep { /\b$word\b/i } @tags;

	if ($cfg{'grep_debug'}) {
		print DEBUG "$word occurred $words{$word} times\n";
		print DEBUG join("\n", @matches);
	}

	# A tag matched by several words is pushed once per matching word,
	# so it can appear in @goodtags more than once.
	push @goodtags, @matches;

	# Stop after the 21st word has been processed.
	last if ++$count > 20;
}

# Write one randomly chosen matching tag to the temporary tag file.
# Three-argument open: the configured path is always taken literally as a
# file name.  OUT stays a package filehandle because the END block closes
# it.
my $tmptagfile = $cfg{'tmptagfile'};
open(OUT, '>', $tmptagfile)
	or htagdie("$0: Could not open $tmptagfile: $!\n");
# NOTE(review): reg_deletion is defined outside this file; presumably it
# registers the file for later removal -- confirm against the caller.
reg_deletion($tmptagfile);

# Nothing matched: leave the (empty) output file behind and exit with
# status 5.
exit(5) unless @goodtags;

# rand(@goodtags) is truncated to an integer index by the subscript, so
# every element of @goodtags is equally likely to be chosen.
print OUT $goodtags[rand(@goodtags)];

# Cleanup run when the interpreter exits, whether the script reached its
# end, called exit(), or died.  Either handle may never have been opened
# (for example when an earlier open failed), so each is closed only if it
# is actually open.
END {
	# fileno() returns undef, without warning, for an unopened handle.
	if (defined fileno(OUT)) {
		# Buffered write errors only surface at close time.
		close(OUT) or warn "$0: Could not close tag output file: $!\n";
	}

	if (defined fileno(DEBUG)) {
		close(DEBUG) or warn "$0: Could not close debug file: $!\n";
	}
}

__DATA__
a 
about 
again 
all 
am 
an 
and 
another 
any 
apr
are 
arent 
as 
at 
aug
be 
because 
been 
before 
being 
but 
by 
can 
cant 
cat 
could 
dec
did 
do 
doesnt 
dont 
down 
ehlo
esmtp
even 
every 
feb
for 
fri
from 
gmt
go
great 
had
hadnt 
has 
have 
he 
her
here 
hers
herself 
him 
himself
his 
how 
however 
i 
id
if 
im 
in 
instead 
into 
is 
it 
its 
itself 
ive 
jan
jul
jun
know 
like 
lots 
mar
may
maybe
me
might
might 
mine
mon
more 
must 
my 
near 
need 
new 
no 
not 
nov
now 
oct
of 
off 
oh 
on 
or 
ought
ours
out 
over 
please 
quite 
received
said 
same 
sat
seem
seemed 
seems
sep
she
should
should 
smtp
so 
some 
such 
sun
than 
that 
thats 
the 
their
theirs
them
then 
there 
theres 
these 
they
this 
thu
to 
tom
too 
tue
up 
us
very 
want 
was 
we 
wed
well 
went 
were
what 
when 
which 
while 
who 
why 
will 
with 
wont 
would
would 
yes
yet 
you 
your 
youre 
yours
youve 
