#!/bin/sh
###########################################################################
##                                                                       ##
##                  Language Technologies Institute                      ##
##                     Carnegie Mellon University                        ##
##                         Copyright (c) 2004                            ##
##                        All Rights Reserved.                           ##
##                                                                       ##
##  Permission is hereby granted, free of charge, to use and distribute  ##
##  this software and its documentation without restriction, including   ##
##  without limitation the rights to use, copy, modify, merge, publish,  ##
##  distribute, sublicense, and/or sell copies of this work, and to      ##
##  permit persons to whom this work is furnished to do so, subject to   ##
##  the following conditions:                                            ##
##   1. The code must retain the above copyright notice, this list of    ##
##      conditions and the following disclaimer.                         ##
##   2. Any modifications must be clearly marked as such.                ##
##   3. Original authors' names are not deleted.                         ##
##   4. The authors' names are not used to endorse or promote products   ##
##      derived from this software without specific prior written        ##
##      permission.                                                      ##
##                                                                       ##
##  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         ##
##  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      ##
##  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   ##
##  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      ##
##  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    ##
##  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   ##
##  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          ##
##  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       ##
##  THIS SOFTWARE.                                                       ##
##                                                                       ##
###########################################################################
##             Author:  Alan W Black (awb@cs.cmu.edu)                    ##
##               Date:  December 2004                                    ##
###########################################################################
##                                                                       ##
##  Make a Huffman table data                                            ##
##    done by finding the top singletons, bigrams, trigrams ... and byte ##
##    coding them                                                        ##
##                                                                       ##
##  But this isn't full huffman coding yet                               ##
##                                                                       ##
##  Hmm, this should probably be written in something other than shell   ##
###########################################################################

##  There are N stages
##    Find the alphabet (usually 256 things)
##        This is done by finding the top singletons, bigrams, trigrams ...
##    Find the frequencies  (currently ignored, as codes are always 1 byte)
##    Compress the data, and output the lookup tables

## There is some octal conversion in here that probably is unnecessary

# Tidy up the tmp files if we get interrupted
# The handler removes the numbered per-pass working files (names matching
# huff.tmp.corpus.<digit>..., which includes the matching .best files) and
# then exits.  It is registered for condition 0 (shell exit) as well as for
# the listed signal numbers, so the same clean-up also runs when the script
# finishes normally.  Files without a numeric suffix (huff.tmp.corpus,
# huff.tmp.corpus.best, ...) do not match the glob and are left in place.
trap 'rm -f huff.tmp.corpus.[0-9]* ; exit' 0 2 3 5 10 13 15

# Newer versions of Linux have a not very useful value for LANG by default
# NOTE(review): only LANG is set here; an LC_ALL or LC_* value inherited
# from the caller would still take precedence for sort/awk -- confirm
# whether that matters for the callers of this script.
LANG=C
export LANG

## 0 is reserved, 1 is reserved too
# Target number of distinct symbols: the n-gram merging loop below keeps
# joining pairs until the corpus alphabet has exactly this many symbols.
# Presumably 256 one-byte codes minus the two reserved values.
ALPHABET_SIZE=254

# if [ $1 = "xentries" ]
# then 
#    infile=$2
#    outfile=$3
#    # Normalize the string to be compressed
#    cat $infile | awk '{print $1}' |
#    tr "A-Z" "a-z" |
#    awk '{for (i=1; i<=length($1); i++)
#             printf("%s ",substr($1,i,1));
#          printf("\n");}' >huff.tmp.corpus
# fi

# "entries" mode
#   $2 - input file; only the first whitespace-separated field of each
#        line is used
#   $3 - name of the table file written at the end of the script
# The first field of every input line is wrapped in double quotes and
# written to huff.entries.corpus; festival is then run on make_lex.scm with
# a utf8entries call naming huff.entries.corpus and huff.tmp.corpus
# (presumably it writes the latter, which the rest of the script reads).
# ESTDIR and FLITEDIR must point at the speech tools and flite trees.
# All expansions are quoted so a missing mode argument or a path containing
# blanks cannot break the test or the commands.
if [ "$1" = "entries" ]
then
   # actually works for non-utf8 entries too
   infile=$2
   outfile=$3
   # Normalize the string to be compressed
   awk '{printf("\"%s\"\n",$1)}' "$infile" >huff.entries.corpus
   "$ESTDIR/../festival/bin/festival" -b "$FLITEDIR/tools/make_lex.scm" '(utf8entries "huff.entries.corpus" "huff.tmp.corpus")'
fi

# "phones" mode
#   $2 - input file; only the last whitespace-separated field of each line
#        is used
#   $3 - name of the table file written at the end of the script
# Writes huff.tmp.corpus.  Expansions are quoted so a missing mode argument
# or an input path containing blanks cannot break the test or the pipeline.
if [ "$1" = "phones" ]
then
   infile=$2
   outfile=$3
   # Normalize the text to be compressed: every backslash becomes a blank,
   # so each backslash-introduced code ends up as a separate token
   awk '{print $NF}' "$infile" |
   sed 's/\\/ /g' >huff.tmp.corpus
fi

# "residual" mode (experimental)
#   $2 - input file
#   $3 - name of the output file (stored in outfile)
# A line whose fifth field contains "_res_" marks the following line as a
# data row.  For a data row the first value is printed as is and every
# later value is replaced by (previous value - value), wrapped into 0..255.
# Commas and a closing "};" are stripped afterwards; the result is written
# to huff.tmp.corpus.
if [ "$1" = "residual" ]
then
   # This really doesn't work: just some tests to see what's worthwhile
   infile=$2
   outfile=$3
   # Normalize the text to be compressed
   # The row is seeded from $1.  (Seeding it from $i before the loop had
   # set i used the whole line for the first data row and an empty field
   # for every later one.)
   awk '{if ($5 ~ /.*_res_/)
            expect_data=1;
         else if (expect_data==1)
         {
            printf("%s ",$1);
            prev=$1;
            for (i=2; i<=NF; i++)
            {
                delta=prev-$i;
                if (delta < 0)
                   delta+=256;
                printf("%s ",delta);
                prev=$i
            }
            printf("\n");
            expect_data=0
         }}' "$infile" |
   sed 's/,//g;s/};//' >huff.tmp.corpus
fi

# "other" mode: the input file ($2) is already a corpus of
# whitespace-separated symbols and is copied unchanged to huff.tmp.corpus.
# $3 is stored in outfile.  Expansions are quoted so a missing mode
# argument or a path containing blanks cannot break the test or the copy.
if [ "$1" = "other" ]
then
   infile=$2
   outfile=$3
   cat "$infile" >huff.tmp.corpus
fi

# Size of the normalized corpus, counted in whitespace-separated symbols
CORPUS_SIZE=$(cat huff.tmp.corpus | wc -w)
printf 'Original Corpus size = %s\n' "$CORPUS_SIZE"

# Base alphabet: every distinct symbol of the corpus, one per line
awk '{ k = 0
       while (++k <= NF)
          print $k }' <huff.tmp.corpus |
sort -u >huff.tmp.base.alphabet

BASE_ALPHABET_SIZE=$(cat huff.tmp.base.alphabet | wc -l)
printf 'Base alphabet = %s\n' "$BASE_ALPHABET_SIZE"

# Find the additional best n grams
#
# Starting from the normalized corpus (pass 000), each pass finds the most
# frequent pair of adjacent symbols in huff.tmp.corpus.<PASS> and writes
# huff.tmp.corpus.<PASS+1> with every occurrence of that pair joined into a
# single "first+second" symbol.
#
# Reads:  huff.tmp.corpus, BASE_ALPHABET_SIZE, ALPHABET_SIZE, CORPUS_SIZE
# Writes: huff.tmp.corpus.best (the corpus of the last completed pass); the
#         numbered per-pass files are removed at the end.
PASS=000
cp -p huff.tmp.corpus "huff.tmp.corpus.$PASS"

PASS_ALPHABET_SIZE=$BASE_ALPHABET_SIZE
# Numeric comparison, so the operands can be quoted and any blank padding
# in the counts cannot make two equal sizes look different.
while [ "$PASS_ALPHABET_SIZE" -ne "$ALPHABET_SIZE" ]
do
   # Loop until we get to the desired alphabet size

   # Count every adjacent pair and record the most frequent one as
   # "first second count count".  Nothing is written when the corpus has no
   # adjacent pair at all.
   # NOTE(review): the pair key joins the two symbols with "_", so symbols
   # that themselves contain "_" can end up sharing a key.
   awk '{for (i=1; i<NF; i++)
         {
           bb = sprintf("%s_%s",$i,$(i+1));
           freq[bb]++;
           part1[bb] = $i;
           part2[bb] = $(i+1);
         }}
       END {best="";
            bestscore=0;
            for (i in freq)
            {
               score = freq[i];
               if (score > bestscore)
               {
                  bestscore = score;
                  best = i;
               }
            }
            if (bestscore > 0)
               printf("%s %s %d %d\n",part1[best],part2[best],bestscore,freq[best])
           }' "huff.tmp.corpus.$PASS" >"huff.tmp.corpus.$PASS.best"

   # Without a pair to join the corpus can no longer change, so the target
   # size would never be reached: stop with what we have.
   if [ ! -s "huff.tmp.corpus.$PASS.best" ]
   then
      echo "$PASS no pair left to join, stopping at alphabet size $PASS_ALPHABET_SIZE"
      break
   fi

   # Substitute the best bigram to give next corpus
   # The pair is read from the .best file (first operand) rather than being
   # pasted into the program text, so quotes or backslashes in a symbol
   # cannot corrupt the awk program.  Appending "" keeps the comparison a
   # string comparison even for symbols that look like numbers.
   NPASS=$(echo "$PASS" | awk '{printf("%03d",$1+1)}')
   awk 'NR == FNR { p1 = $1 ""; p2 = $2 ""; next }
        {for (i=1; i<=NF; i++)
         {
            if (($i == p1) && ($(i+1) == p2))
            {
               printf("%s+%s ",$i,$(i+1));
               i++;
            }
            else
               printf("%s ",$i);
         }
         printf("\n");}' "huff.tmp.corpus.$PASS.best" "huff.tmp.corpus.$PASS" >"huff.tmp.corpus.$NPASS"

   # Summary
   echo "$PASS joining $(cat "huff.tmp.corpus.$PASS.best")"
   echo "  PAS $PASS_ALPHABET_SIZE AS $ALPHABET_SIZE"
   CORPUS_WAS=$(wc -w <"huff.tmp.corpus.$PASS")
   CORPUS_IS=$(wc -w <"huff.tmp.corpus.$NPASS")
   # the counts are left unquoted on purpose: that drops any blank padding
   echo "  Corpus was" $CORPUS_WAS
   echo "  Corpus is" $CORPUS_IS
   echo "$CORPUS_SIZE" "$CORPUS_IS" |
      awk '{printf("  Compression %0.2f\n",100*($2*1.0)/$1)}'

   PASS=$NPASS
   PASS_ALPHABET_SIZE=$(awk '{for (i=1; i<=NF; i++)
                                 printf("%s\n",$i);}' "huff.tmp.corpus.$PASS" |
                        sort -u | wc -l | tr -d ' ')
done

# Now build the map
# PASS names the corpus of the last completed pass; it is also valid when
# the loop body never ran (still 000) or stopped early.
mv "huff.tmp.corpus.$PASS" huff.tmp.corpus.best
rm -f huff.tmp.corpus.[0-9]*

# Rank the symbols of the merged corpus by relative frequency (ascending)
# and number them 1..N in that order.  Every symbol gets one line of awk
# source defining both directions of the mapping:
#    map["sym"] = code; unmap[code] = "sym";
awk '{ total += NF
       for (k = 1; k <= NF; k++)
          count[$k]++
     }
     END { for (sym in count)
             printf("%f %s\n", count[sym] / total, sym) }' <huff.tmp.corpus.best |
sort -n |
awk '{ code = NR
       printf("map[\"%s\"] = %d; unmap[%d] = \"%s\";\n", $2, code, code, $2) }' >huff.tmp.corpus.maptable

# The generated assignments are pasted into the BEGIN blocks of the
# compress/uncompress steps
maptable=$(cat huff.tmp.corpus.maptable)

# compress the corpus
# Every symbol of huff.tmp.corpus.best is replaced by a backslash followed
# by its code as three octal digits; line structure is kept.  The code
# table comes from the awk assignments held in $maptable.
awk 'BEGIN {'"$maptable"'}
     function enoctal(code)
     {
        return sprintf("%03o", code % 512);
     }
     {
        line = "";
        for (k = 1; k <= NF; k++)
            line = line "\\" enoctal(map[$k]);
        print line;
     }' <huff.tmp.corpus.best > huff.tmp.corpus.compressed

# uncompress the corpus
# Reverse of the compression step: the backslashes are turned into blanks
# so every three-digit octal code becomes a field, and each code is looked
# up in the unmap table from $maptable.  The result goes to
# huff.tmp.corpus.uncompressed.
sed 's/\\/ /g' huff.tmp.corpus.compressed |
awk 'BEGIN {'"$maptable"'}
     function unenoctal(digits)
     {
        return 64 * substr(digits,1,1) + 8 * substr(digits,2,1) + substr(digits,3,1);
     }
     {
        for (k = 1; k <= NF; k++)
            printf("%s ", unmap[unenoctal($k)]);
        print "";
     }' > huff.tmp.corpus.uncompressed

# "phones" mode output
# Writes one C string literal per symbol of huff.tmp.corpus.best to
# $outfile, ordered by ascending relative frequency, with the frequency in
# a trailing comment.  Inside a literal every "+" joining the parts of a
# merged symbol becomes a backslash, and the literal itself starts with a
# backslash.  The compressed corpus is kept as huff.phones.compressed.
#
# The symbols are sorted BEFORE the "+" is rewritten: this is the same
# input and the same sort that numbered the codes in the maptable, so
# symbols with equal frequency appear here in code order.  (Sorting after
# the rewrite could order such ties differently, because "+" and "\" sort
# at different places.)
if [ "$1" = "phones" ]
then
   awk '{ for (i=1; i<=NF; i++)
          {
              freq[$i]++
              tot++;
          }
        }
        END { for (i in freq)
                printf("%f %s\n",freq[i]/tot,i);}' huff.tmp.corpus.best |
   sort -n |
   awk '{ sym = $2;
          gsub(/[+]/, "\\\\", sym);
          printf("   \"\\%s\" , /* %f */ \n",sym,$1);}' >"$outfile"
   mv huff.tmp.corpus.compressed huff.phones.compressed
fi

# "entries" mode output
# Writes one C string literal per symbol of huff.tmp.corpus.best to
# $outfile, ordered by ascending relative frequency, with the frequency in
# a trailing comment.  The "+" joining the parts of a merged symbol is
# dropped, so a merged symbol is written as the plain concatenation of its
# parts.  The compressed corpus is kept as huff.entries.compressed.
#
# The symbols are sorted BEFORE the "+" is removed: this is the same input
# and the same sort that numbered the codes in the maptable, so symbols
# with equal frequency appear here in code order.  (Sorting after the
# removal could order such ties differently.)
if [ "$1" = "entries" ]
then
   awk '{ for (i=1; i<=NF; i++)
          {
              freq[$i]++
              tot++;
          }
        }
        END { for (i in freq)
                printf("%f %s\n",freq[i]/tot,i);}' huff.tmp.corpus.best |
   sort -n |
   awk '{ sym = $2;
          gsub(/[+]/, "", sym);
          printf("   \"%s\" , /* %f */ \n",sym,$1);}' >"$outfile"
   mv huff.tmp.corpus.compressed huff.entries.compressed
fi

# Done.  Leaving the script also runs the handler registered with trap for
# condition 0, which removes any remaining numbered huff.tmp.corpus.* files.
exit



