#!/usr/bin/perl -w
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

use strict;
use FindBin;
use Getopt::Std;
getopts("fm:M:X:l:L:pxhc:at:s:ioTSdP");

use vars qw {
  $opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c
  $opt_a $opt_t $opt_s $opt_i $sorting $opt_o $opt_T $opt_S $opt_X
  $opt_d $opt_P
};

# as per http://wiki.apache.org/spamassassin/RulesProjPromotion, for -P
my $promote_so_min      = 0.95;
my $promote_hitrate_min = 0.02;
my $promote_fprate_max  = 1.00;

sub usage {
  die "hit-frequencies [-c rules dir] [-f] [-m RE] [-M RE] [-X RE] [-l LC]
          [-s SC] [-a] [-p] [-x] [-i] [-T] [-S] [-o] [-d] [spam log] [ham log]

    -c p   use p as the rules directory
    -f     falses. count only false-negative or false-positive matches
    -m RE  print rules matching regular expression
    -t RE  print rules with tflags matching regular expression
    -M RE  only consider log entries matching regular expression
    -X RE  don't consider log entries matching regular expression
    -l LC  also print language specific rules for lang code LC (or 'all')
    -L LC  only print language specific rules for lang code LC (or 'all')
    -a     display all tests
    -p     percentages. implies -x
    -x     extended output, with S/O ratio and scores
    -s SC  which scoreset to use
    -i     use IG (information gain) for ranking
    -T     display rule times. implies -x, -p
    -o     display hit overlaps against all other rules
    -S     display score-map of hits
    -P     flag which rules pass the promotion criteria
    -d     XML output.  conflicts with -x, -p

    options -l and -L are mutually exclusive.

    options -M and -X are *not* mutually exclusive.

    if either the spam or and ham logs are unspecified, the defaults
    are \"spam.log\" and \"ham.log\" in the cwd.

";
}

usage() if($opt_h || ($opt_l && $opt_L));
usage() if($opt_d && ($opt_x || $opt_p));

if ($opt_p) {
  $opt_x = 1;
}

if ($opt_d) {
  $opt_x = $opt_p = 1;
}

$opt_s = 0 if ( !defined $opt_s );

my $cffile = $opt_c || "$FindBin::Bin/../rules";

# "our" so that the require'd file can overwrite them
my $rules_pl_unparseable;
our %rules = ();
our %scores = ();

my %soratio = ();
my %freq_spam = ();
my %freq_ham = ();
my %hmap_spam = ();
my %hmap_ham = ();
my %scoremap_spam = ();
my %scoremap_ham = ();
my %freq = ();
my $num_spam = 0;
my $num_ham = 0;
my %ranking = ();
my $ok_lang = '';

my %rule_times = ();

readscores($cffile);

$ok_lang = lc ($opt_l || $opt_L || '');
if ($ok_lang eq 'all') { $ok_lang = '.'; }

if ($opt_t && $rules_pl_unparseable) {
  die "-t requires rules.pl to be parseable";
}

foreach my $key (keys %rules) {

  if ( ($opt_L && !$rules{$key}->{lang}) ||
       ($rules{$key}->{lang} &&
         (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/io)
     ) ) {
    delete $rules{$key} ; next;
  }

  $freq_spam{$key} = 0;
  $freq_ham{$key} = 0;
  if ($opt_o) {
    $hmap_spam{$key} = '';
    $hmap_ham{$key} = '';
  }
}

readlogs();

my $hdr_all = $num_spam + $num_ham;
my $hdr_spam = $num_spam;
my $hdr_ham = $num_ham;

my $sorting = $opt_i ? "IG" : "RANK";

if ($opt_d) {
  $hdr_all ||= 0.00001;     # avoid div by 0 in the next 2 statements
  $hdr_spam = ($num_spam / $hdr_all) * 100.0;
  $hdr_ham = ($num_ham / $hdr_all) * 100.0;
  $opt_P = 1;

  print qq{

    <freqs>
      <allmessages>
        <count class='spam'>$num_spam</count>
        <count class='ham'>$num_spam</count>
        <pc class='spam'>$hdr_spam</pc>
        <pc class='ham'>$hdr_spam</pc>
      </allmessages>

  };

}
elsif ($opt_p) {
  printf "%7s  %7s  %7s  %6s  %6s  %6s  %s\n",
  	"MSECS", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPO%":"HAM%",
        "S/O", $sorting, "SCORE", "NAME";

  printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
  	0, $hdr_spam, $hdr_ham,
        soratio ($num_spam,$num_ham), 0, 0;

  $hdr_all ||= 0.00001;     # avoid div by 0 in the next 2 statements
  $hdr_spam = ($num_spam / $hdr_all) * 100.0;
  $hdr_ham = ($num_ham / $hdr_all) * 100.0;
  $hdr_all = 100.0;             # this is obvious

  printf "%7.5f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
  	0, $hdr_spam, $hdr_ham,
        soratio ($num_spam,$num_ham), 0, 0;

}
elsif ($opt_p) {
  printf "%8s %7s  %7s  %6s  %6s  %6s  %s\n",
  	"OVERALL%", $opt_f?"FNEG%":"SPAM%", $opt_f?"FPO%":"HAM%",
        "S/O", $sorting, "SCORE", "NAME";

  printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
  	$hdr_all, $hdr_spam, $hdr_ham,
        soratio ($num_spam,$num_ham), 0, 0;

  $hdr_all ||= 0.00001;     # avoid div by 0 in the next 2 statements
  $hdr_spam = ($num_spam / $hdr_all) * 100.0;
  $hdr_ham = ($num_ham / $hdr_all) * 100.0;
  $hdr_all = 100.0;             # this is obvious

  printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
  	$hdr_all, $hdr_spam, $hdr_ham,
        soratio ($num_spam,$num_ham), 0, 0;

}
elsif ($opt_x) {
  printf "%7s %7s  %7s  %6s  %6s %6s  %s\n",
  	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
  printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  (all messages)\n",
  	$hdr_all, $hdr_spam, $hdr_ham,
        soratio ($num_spam,$num_ham), 0, 0;

} else {
  printf "%10s  %10s  %10s  %s\n",
  	"OVERALL", "SPAM", "HAM", "NAME";
  printf "%10d  %10d  %10d  (all messages)\n",
  	$hdr_all, $hdr_spam, $hdr_ham;
}

my %done = ();
my @tests = ();
my $rank_hi = 0;
my $rank_lo = 9999999;

# variables for wanted/unwanted RANK
my %wanted;
my %unwanted;
my %isnice;
my %wranks;
my %uranks;

# rules that we want to look at
$freq{$_}++ for keys %freq_ham;
$freq{$_}++ for keys %freq_spam;

my $test;
foreach $test (keys %freq) {
  my $parsed_rules_entry = $rules{$test};

  # do not require 'tmp/rules.pl' to have been built from the
  # exact same ruleset version; this assumption screws up nightly
  # mass-check reports if they are generated with a different SVN rev
  # next unless (exists $rules{$test});

  next if (!$opt_a && $test =~ /^__/);

  next if $done{$test}; $done{$test} = 1;
  push (@tests, $test);

  my $isnice = 0;
  if ($parsed_rules_entry) {
    if ($parsed_rules_entry->{tflags} &&
            $parsed_rules_entry->{tflags} =~ /\bnice\b/)
    {
      $isnice = 1;
    }
  }
  $isnice{$test} = $isnice;

  my $fs = $freq_spam{$test}; $fs ||= 0;
  my $fn = $freq_ham{$test}; $fn ||= 0;
  my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
  my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;

  my $soratio = $soratio{$test} = soratio ($fsadj, $fnadj);

  if ($isnice) {
    $soratio = 1.0 - $soratio;
    my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp;
  }

  if ($opt_i) {
    # come up with a ranking
    my $rank;

    # IG system: from "Learning to Filter Unsolicited Commercial E-Mail",
    # Ion Androutsopoulos et al: determine the information gain IG(X, C) of the
    # Boolean attributes (ie. the rules). Measures "the average reduction in
    # the entropy of C (classification) given the value of X (the rule)". Makes
    # a good ranking measure with a proper statistical basis. ;)
    #
    # Still would like to get an entropy measure in, too.
    #
    #             sum                                    P(X = x ^ C = c)
    # IG(X,C) = x in [0, 1]    P(X = x ^ C = c) . log2( ------------------- )
    #           c in [Ch, Cs]                           P(X = x) . P(C = c)
    #
    my $safe_nspam = $num_spam || 0.0000001;
    my $safe_nham = $num_ham || 0.0000001;

    my $num_all = ($num_spam + $num_ham);
    my $safe_all = $num_all || 0.0000001;
    my $f_all = $fs+$fn;

    my $px0 = (($num_all - $f_all) / $safe_all);         # P(X = 0)
    my $px1 = ($f_all / $safe_all);                      # P(X = 1)
    my $pccs = ($num_spam / $safe_all);                  # P(C = Cs)
    my $pcch = ($num_ham / $safe_all);                   # P(C = Ch)
    my $px1ccs = ($fs / $safe_nspam);                   # P(X = 1 ^ C = Cs)
    my $px1cch = ($fn / $safe_nham);                    # P(X = 1 ^ C = Ch)
    my $px0ccs = (($num_spam - $fs) / $safe_nspam);     # P(X = 0 ^ C = Cs)
    my $px0cch = (($num_ham - $fn) / $safe_nham);       # P(X = 0 ^ C = Ch)
    my $safe_px0_dot_pccs = ($px0 * $pccs) || 0.00000001;
    my $safe_px0_dot_pcch = ($px0 * $pcch) || 0.00000001;
    my $safe_px1_dot_pccs = ($px1 * $pccs) || 0.00000001;
    my $safe_px1_dot_pcch = ($px1 * $pcch) || 0.00000001;

    sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931...

    my $safe_px0ccs = ($px0ccs || 0.0000001);
    my $safe_px0cch = ($px0cch || 0.0000001);
    my $safe_px1ccs = ($px1ccs || 0.0000001);
    my $safe_px1cch = ($px1cch || 0.0000001);
    $rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) +
                    ( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) +
                    ( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) +
                    ( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) );

    $ranking{$test} = $rank;
    $rank_hi = $rank if ($rank > $rank_hi);
    $rank_lo = $rank if ($rank < $rank_lo);
  }
  else {
    # RANK: basic wanted/unwanted ranking
    #
    # The rank of each test based on two ranks: (1) the number of wanted
    # hits and (2) the number of unwanted hits.  Each test is ranked
    # positionally for both its wanted and unwanted hits (ties are
    # allowed) and the two ranks are normalized to have the same range.
    # Those two ranks are added together, producing a single RANK number
    # that is then normalized to [0, 1].  The result is equivalent to:
    #
    # RANK(rule) = (percentile(wanted) + percentile(unwanted))/2
    #
    $wanted{$test} = $isnice ? $fn : $fs;
    $unwanted{$test} = $isnice ? $fs : $fn;
    # count number of ranks of each type
    $wranks{$wanted{$test}} = 1;
    $uranks{$unwanted{$test}} = 1;
  }
}

# finish basic wanted/unwanted ranking
if (! $opt_i) {
  my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted;
  my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted;

  # first half of ranking is the wanted rank
  my $position = 0;
  my $last = undef;
  for my $test (@wanted) {
    $position++ if defined $last && $last != $wanted{$test};
    $ranking{$test} += $position;
    $last = $wanted{$test}
  }

  # second half of ranking is the unwanted rank
  my $normalize = (scalar keys %wranks) / (scalar keys %uranks || 0.001);
  $position = 0;
  $last = undef;
  for my $test (@unwanted) {
    $position++ if defined $last && $last != $unwanted{$test};
    $ranking{$test} += ($position * $normalize);
    $last = $unwanted{$test};
    $rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi);
    $rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo);
  }
}

{
  # now normalise the rankings to [0, 1]
  $rank_hi -= $rank_lo;
  foreach my $test (@tests) {
    $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi);
  }
}

if ($opt_T) {
  read_timings();
}

foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) {
  my $parsed_rules_entry = $rules{$test};

  # do not require 'tmp/rules.pl' to have been built from the
  # exact same ruleset version; this assumption screws up nightly
  # mass-check reports if they are generated with a different SVN rev
  # next unless (exists $rules{$test});
  
  next if (!$opt_a && $test =~ /^__/);

  my $fs = $freq_spam{$test}; $fs ||= 0;
  my $fn = $freq_ham{$test}; $fn ||= 0;
  my $fa = $fs+$fn;
  my $num_fs = $fs;
  my $num_fn = $fn;
  my $num_fa = $fa;

  my $tflags = '';
  if ($parsed_rules_entry) {
    $tflags = $parsed_rules_entry->{tflags};
  }

  # match certain tests
  next if ($opt_m && $test !~ m/$opt_m/);
  # match tflags
  next if ($opt_t && (!$tflags || $tflags !~ /$opt_t/));

  if (!$opt_a && !$opt_t && $tflags) {
    # not net tests
    next if ($tflags =~ /\bnet\b/ && ($opt_s % 2 == 0));

    # not userconf
    # Jul 13 2005 jm: removed.  this blocks SPF_PASS from showing up!
    # why should userconf rules not be visible in freqs output?
    # next if ($tflags =~ /\buserconf\b/);
  }

  # adjust based on corpora sizes (and cvt to % while we're at it)
  my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
  my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;

  if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; }

  if ($opt_p) {
    my $denom = ($num_spam + $num_ham) || 0.000001; # avoid / by 0
    $fa = ($fa / $denom) * 100.0;
    $fs = $fsadj;
    $fn = $fnadj;
  }

  my $soratio = $soratio{$test};
  if (!defined $soratio) {
    $soratio{$test} = soratio ($fsadj, $fnadj);
  }

  my $promotable;
  if ($opt_P) {
    $promotable = 1;

    if ($isnice{$test}) {
      if (($soratio{$test} > (1.0 - $promote_so_min))
          || ($fn < $promote_hitrate_min)
          || ($fs >= $promote_fprate_max))
      {
        $promotable = 0;
      }
    } else {
      if (($soratio{$test} < $promote_so_min)
          || ($fs < $promote_hitrate_min)
          || ($fn >= $promote_fprate_max))
      {
        $promotable = 0;
      }
    }
  }
  my $promotable_str = $opt_P ? ($promotable ? '+ ' : '- ') : '';

  if ($opt_d) {
    print qq{
      <rule>
        <time>}.($rule_times{$test}||0).qq{</time>
        <count class='all'>$num_fa</count>
        <count class='spam'>$num_fs</count>
        <count class='ham'>$num_fn</count>
        <pc class='all'>}.sprintf("%.5f", $fa).qq{</pc>
        <pc class='spam'>}.sprintf("%.5f", $fs).qq{</pc>
        <pc class='ham'>}.sprintf("%.5f", $fn).qq{</pc>
        <so>}.sprintf("%.8f", $soratio).qq{</so>
        <rank>}.sprintf("%.8f", $ranking{$test}).qq{</rank>
        <score set='$opt_s'>}.($scores{$test}||0).qq{</score>
        <promotable>$promotable</promotable>
        <test>$test</test>
    };

  } elsif ($opt_T) {
    printf "%7.5f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  %s%s\n",
  	$rule_times{$test}||0, $fs, $fn, $soratio, $ranking{$test},
        $scores{$test}||0,
        $promotable_str, $test;

  } elsif ($opt_p) {
    printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  %s%s\n",
  	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}||0,
        $promotable_str, $test;

  } elsif ($opt_x) {
    printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  %s%s\n",
  	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}||0,
        $promotable_str, $test;

  } else {
    printf "%10d  %10d  %10d  %s\n", $fa, $fs, $fn, $test;
  }

  if ($opt_S) {
    _print_scoremap("ham", $scoremap_ham{$test});
    _print_scoremap("spam", $scoremap_spam{$test});
  }

  if ($opt_o) {
    compute_overlaps_for_rule($test);
  }

  if ($opt_d) {
    print qq{ </rule> };
  }
}

if ($opt_d) {
  print qq{
    </freqs>
  };
}
exit;


sub _print_scoremap {
  my ($name, $smap) = @_;

  if ($opt_d) {
    print qq{ <scoremap class='$name'> };
  }

  $smap ||= { };
  my @scores = (sort { $a <=> $b } keys %{$smap});

  my $total = 0;
  foreach my $score (@scores) {
    $total += $smap->{$score};
  }

  foreach my $score (@scores) {
    my $num = $smap->{$score};
    my $pc = sprintf("%.4f", ($num / ($total||0.0001)) * 100);

    if ($opt_d) {
      print qq{
        <si score='$score' pc='$pc' count='$num' /> };

    }
    else {
      printf "  scoremap %4s: %2d %6.2f%% %4d %s\n",
          $name, $score, $pc, $num, _scoremap_graph($pc);

    }
  }

  if ($opt_d) {
    print qq{ </scoremap> };

  } else {
    print "\n";
  }
}

sub _scoremap_graph {
  my ($pc) = @_;
  return '*' x ($pc * (40/100));
}


sub readlogs {
  my $spam = $ARGV[0] || "spam.log";
  my $ham = $ARGV[1] || "ham.log";

  foreach my $file ($spam, $ham) {
    open (IN, "<$file") || die "Could not open file '$file': $!";

    my $isspam = ($file eq $spam);
    my $caught;
    my $restofline;
    my $rules;
    my $score;

    # this is very speed-sensitive code.  remove all possible
    # conditionals using an eval('..').
    my $evalstr = '
      while (<IN>) {
    ';

    if ($opt_M) {
      $evalstr .= '
        next unless /$opt_M/o;
      ';
    }
    if ($opt_X) {
      $evalstr .= '
        next if /$opt_X/o;
      ';
    }

    # note: doing the match with a regexp shaves off no less than
    # 7 opcodes. nice!

    # the additional split() is for this case:
    # ".  -20 /path  time=1112116980,scantime=0,format=f,reuse=no"
    # in other words, no hits.  split(' ') cannot deal with this
    # correctly, seeing (".", "-20", "/path", "time=...etc").  Work
    # around this by using a literal / / regexp split to discard
    # the csv stuff we don't want out of the rest of the line.


    $evalstr .= '
        ($caught, $score, $restofline) = split(\' \', $_, 3);
        next unless ($caught =~ /^[Y\.]$/ && $restofline);
        (undef, $rules) = split(/ /, $restofline, 3);
    ';

    if ($opt_f) {
      $evalstr .= '
        next if (!(($caught eq "Y") xor $isspam));
      ';
    }

    if ($opt_S) {
      $evalstr .= '
        $score = int $score;
      ';
    }

    my $hmapstr = '';
    my $smapstr = '';
    if ($isspam) {
      if ($opt_o) {
        $hmapstr = '
          if (!exists $hmap_spam{$r}) {
            $hmap_spam{$r} = "";
          }
          vec ($hmap_spam{$r}, $num_spam, 1) = 1;
        ';
      }

      if ($opt_S) {
        $smapstr = ' $scoremap_spam{$r}{$score}++; ';
      }

      $evalstr .= '
        $num_spam++;
        foreach my $r (split(/,/, $rules)) {
          $freq_spam{$r}++;
          '.$hmapstr.$smapstr.'
        }
      ';
    } else {
      if ($opt_o) {
        $hmapstr = '
          if (!exists $hmap_ham{$r}) {
            $hmap_ham{$r} = "";
          }
          vec ($hmap_ham{$r}, $num_ham, 1) = 1;
        ';
      }

      if ($opt_S) {
        $smapstr = ' $scoremap_ham{$r}{$score}++; ';
      }

      $evalstr .= '
        $num_ham++;
        foreach my $r (split(/,/, $rules)) {
          $freq_ham{$r}++;
          '.$hmapstr.$smapstr.'
        }
      ';
    }
    $evalstr .= '
      }
    ';

    eval $evalstr;

    if ($@) {
      die $@;
    }

    close IN;
  }
  # paranoia: remove zero length rules
  delete $freq_spam{''};
  delete $hmap_spam{''};
  delete $freq_ham{''};
  delete $hmap_ham{''};
}

sub compute_overlaps_for_rule {
  my ($r1) = @_;

  my %overlaps_spam = ();
  foreach my $r2 (keys %hmap_spam) {
    next if $r1 eq $r2;

    my $ratio = _hmap_to_overlap_ratio ($r1, $r2,
            $hmap_spam{$r1}, $hmap_spam{$r2}, $freq_spam{$r1}, $freq_spam{$r2});
    
    if (exists $overlaps_spam{$ratio}) {
      $overlaps_spam{$ratio} .= " ".$r2;
    } else {
      $overlaps_spam{$ratio} = $r2;
    }
  }
  my %overlaps_ham = ();
  foreach my $r2 (keys %hmap_ham) {
    next if $r1 eq $r2;

    my $ratio = _hmap_to_overlap_ratio ($r1, $r2,
            $hmap_ham{$r1}, $hmap_ham{$r2}, $freq_ham{$r1}, $freq_ham{$r2});
    
    if (exists $overlaps_ham{$ratio}) {
      $overlaps_ham{$ratio} .= " ".$r2;
    } else {
      $overlaps_ham{$ratio} = $r2;
    }
  }

  _print_overlap_ratios($r1, \%overlaps_spam, "spam");
  _print_overlap_ratios($r1, \%overlaps_ham, " ham");
}

sub _print_overlap_ratios {
  my ($r1, $hash, $type) = @_;

  if ($opt_d) {
    print qq{ <overlap class='$type'> };
  }

  foreach my $full (sort { $b <=> $a } keys %$hash) {
    my $ratio = $full * 100;

    last if ($ratio < 30);     # 30% cutoff
    my $rules = _prettify_overlap_rules($r1, $hash->{$full});
    next if ($rules eq '');

    if ($opt_d) {
      print qq{
        <overlaprules ratio='$ratio'>$rules</overlaprules>
      };

    } else {
      printf "  overlap %s: %3d%% %s\n", $type, $ratio, $rules;
    }
  }

  if ($opt_d) {
    print qq{ </overlap };
  }
}

sub _prettify_overlap_rules {
  my $rule = shift;
  my $str = shift;

  my @rules = sort split(' ', $str);
  if ($rules{$rule} && $rules{$rule}->{type} eq 'meta') {
    # ignore meta-subrules that match the rule they make up.
    # TODO: this is simplistic; it doesn't look to see if those subrules
    # are in turn meta rules with further subrules that should be ignored.
    # but it works well enough...

    my $code = $rules{$rule}->{code};
    @rules = grep {
      $code !~ /\b\Q$_\E\b/;
    } @rules;
  }
  return join (' ', @rules);
}

sub _hmap_to_overlap_ratio {
  my ($r1, $r2, $hmap1, $hmap2, $freq1, $freq2) = @_;

  $hmap1 ||= '';
  $hmap2 ||= '';
  if ($hmap1 !~ /[^\000]/ && $hmap2 !~ /[^\000]/) {
    # no hits on either!  this would normally give a 100% hitrate match,
    # but that's misleading -- so hide it by giving it a 0% overlap
    return 0;
  }

  #my $i; for ($i = 0; $i < length($hmap1)*8; $i++) { print vec($hmap1,$i,1); } print "\n"; for ($i = 0; $i < length($hmap2)*8; $i++) { print vec($hmap2,$i,1); } print "\n";
  # ah, nifty.  this could have been very slow, but with perl's support
  # for bitstring ops, we get C speed.  yay!
  my $both_bits = $hmap1 | $hmap2;
  my $diff_bits = $hmap1 ^ $hmap2;

  # now, count the set bits in both bitstrings. thank you unpack()!
  my $both_count = unpack("%32b*", $both_bits);
  my $diff_count = unpack("%32b*", $diff_bits);

  if ($both_count == 0) {
    warn "oops! no hits on either but didn't get caught in RE";
    return 0;
  }

  return ($both_count - $diff_count) / $both_count;
}


sub readscores {
  my($cffile) = @_;
  if (system ("$FindBin::Bin/parse-rules-for-masses -d \"$cffile\" -s $opt_s")) {
    warn "parse-rules-for-masses failed!";
  }
  eval {
    require "./tmp/rules.pl";
  };
  if ($@) {
    warn "tmp/rules.pl is unparseable: $@";
    $rules_pl_unparseable = 1;
    # but carry on anyway (for most uses)
  } else {
    $rules_pl_unparseable = 0;
  }
}

sub soratio {
  my ($s, $n) = @_;

  $s ||= 0;
  $n ||= 0;

  if ($s + $n > 0) {
      return $s / ($s + $n);
  } else {
      return 0.5;		# no results -> not effective
  }
}

sub read_timings {
  if (!open (IN, "<timing.log")) {
    warn "hit-frequencies: cannot read 'timing.log', timings will be 0";
    return;
  }
  my $ver = <IN>;
  if ($ver !~ /^v1/) {
    warn "hit-frequencies: unknown version in 'timing.log', timings will be 0";
    close IN;
    return;
  }
  while (<IN>) {
    if (/^T\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/) {
      my ($name, $duration, $max, $runs) = ($1,$2,$3,$4);
      $rule_times{$name} = ($duration / ($runs||0.00001)) * 1000;
    }
  }
  close IN;
}

