#!/usr/bin/perl

use strict;
use warnings;

# Collect per-size statistics for the word lists under final/.
#
# For every size level, count the lines of the word-like lists (words,
# abbreviations, contractions) and of the name-like lists (proper-names,
# upper) for both the american and english files, then append one row to
# @data:
#
#   [ size, words, names, running total, running harmonic sum ]
#
# After the loop $c holds the grand total of entries and $hc holds the
# harmonic sum 1/1 + 1/2 + ... + 1/$c.
my @data;

my $c = 0;   # running total of entries over all sizes seen so far
my $hc = 0;  # running harmonic sum over 1 .. $c
foreach my $f (10,20,35,40,50,55,60,70,80,95) {
  # Files that do not exist for this size are silently skipped (stderr is
  # discarded), so either count may legitimately be 0.  The "+ 0" turns
  # the wc output into a plain number.
  my $n = `bash -c 'cat final/{american,english}-{words,abbreviations,contractions}.$f 2> /dev/null | wc -l'` + 0;
  my $p = `bash -c 'cat final/{american,english}-{proper-names,upper}.$f 2> /dev/null | wc -l'` + 0;
  my $c0 = $c;
  $c = $c + $n + $p;
  # Harmonic contribution of the entries added at this size, i.e. the
  # ranks $c0+1 .. $c.  Start from 0 so that a size adding no entries
  # contributes 0 instead of an undefined value (which would warn below).
  my $h = 0;
  for my $i ($c0 + 1 .. $c) {
    $h += 1/$i;
  }
  $hc += $h;
  push @data, [$f, $n, $p, $c, $hc];
}

# Return the argument with a comma inserted between every group of three
# digits of its leading integer part (an optional sign is preserved),
# e.g. 1234567 -> "1,234,567".  Anything after the leading digits is left
# untouched.
sub commify {
    my ($text) = @_;
    # Each pass splits off the last three digits of the leading run of
    # digits; repeat until no run of four or more digits remains.
    while ($text =~ s/^([-+]?\d+)(\d{3})/$1,$2/) {
        # nothing to do: the substitution itself does the work
    }
    return $text;
}


# Print the size table: one line per row of @data with the size level, the
# word count, the name count, the running total and the running total as a
# percentage of the grand total $c.
#
# The "Zipf's law" column (running harmonic sum as a percentage of $hc) is
# disabled; the commented-out lines below show how to re-enable it.
#print "  Size   Words       Names    Running Total  %        Zipf's law\n";
print "  Size   Words       Names    Running Total  %\n";

foreach my $row (@data) {
    my ($f, $n, $p, $c0) = @$row;
    # Guard the division: when no word lists were found the grand total is
    # 0 and an unguarded 100*$c0/$c dies with "Illegal division by zero".
    my $cp = $c ? 100*$c0/$c : 0;
    #my $hp = $hc ? 100*$row->[4]/$hc : 0;
    #printf("   %2d  %7s     %7s      %7s   %5.1f         %6.2f\n", $f, commify($n), commify($p), commify($c0), $cp, $hp);
    printf("   %2d  %7s     %7s      %7s   %5.1f\n", $f, commify($n), commify($p), commify($c0), $cp);
    #print "$f $n $p $c0   $cp $hp\n";
}
