#--- The HuffmanTree class, which implements a Huffman tree algorithm for the compression of text. The algorithm comes from: http://q-lang.sourceforge.net/examples/huffman.q It was translated in dodo to illustrate functional style programming. Usage: def message = "Alice in Wonderland" def H = HuffmanTree.newWithWeights(weights(message)) def code = H.encode(message) console!out.Puts(""(code.count)" bits: "(H.decode(code))) Original header: /* huffman.q: Huffman encoding trees */ /* written by Albert Graef, 05-08-1993 (see also Abelson/Sussman: Structure and Interpretation of Computer Programs, MIT Press, 1985) revised 11-26-1993, 3-27-1997, 12-19-2000, 03-02-2002, 01-30-04 AG */ /* NOTE: Alphabets should consist of two symbols at least. Otherwise the single symbol will be encoded as the empty list, which leads to infinite recursion when decoded. This could be considered a bug. ;-) For instance, try the following: ==> def message = chars "Alice in Wonderland" ==> def H = huffman_tree (weights message) ==> def code = encode H message ==> strcat (decode H code) A more amusing example is perhaps the following exercise from Abelson/Sussman 1985, p. 125: "The following eight-symbol alphabet with associated relative frequencies was designed to efficiently encode the lyrics of 1950s rock songs. (Note that the "symbols" of an "alphabet" need not be individual letters.) A 2 NA 16 BOOM 1 SHA 3 GET 2 YIP 10 JOB 2 WAH 1 Generate a corresponding Huffman tree, and use it to encode the following message: Get a job Sha na na na na na na na na Get a job Sha na na na na na na na na Wah yip yip yip yip yip yip yip yip Sha boom How many bits are required for the encoding? What is the smallest number of bits that would be needed to encode this song if we used a fixed-length code for the eight-symbol alphabet?" */ +++ __Module__ huffmantree (version: "1.0") __Header__ class HuffmanTree { __DOC "The decoding function. It takes as its arguments a list of "zeros and ones, and reconstructs the original message." def decode(flag[]) -> String __DOC "The encoding function. It takes as its arguments a list of "symbols, and returns the coded message." def encode(char[]) -> flag[] __DOC "Construct a Huffman tree, starting from a list of "(symbol,weight) pairs." def HuffmanTree.newWithWeights(XW[]) -> Self } struct XW __DOC "Determine the (symbol,weight) pairs for a particular message" def weights(char[]) -> XW[] __Main__ #--- Huffman encoding trees are represented as binary trees whose leaves (represented by the tip symbol) carry individual symbols and their weights (frequencies), and whose interior nodes (represented with the bin symbol) store the sets of symbols (represented as lists) found in the corresponding subtrees, together with the corresponding weights (which are the sums of the subtree weights). The nil symbol denotes an empty tree. +++ HuffmanTree nil def leafSet(HW[] xws) = foreach(ls = [], h in xws) addTree(HTtip.newWithXW(h), ls) def addTree(HuffmanTree h1, HuffmanTree[] hs) = if (hs.null) [h1] #hs is empty else hs[1] -> h2 if (h1.weight > h2.weight) [h2] + addTree(h1, hs[2...]) else [h1] + hs; class HuffmanTree { def syms -> get(char[]) = [] def weight() = 0 def decode(flag[] bs) = decode1(self, bs) def encode(char[] s) = foreach(bs = [], c in s) encode1(c) + bs def HuffmanTree.newWithWeights(XW[] xws) = newWithLeafSet(leafSet(xws)) def decode1(HuffmanTree h, flag[] bs) = if (bs.null) "" #bs is empty else throw(IllegalArgument.newWithMessage( "Too many bits")) def encode1(char x) = throw(IllegalArgument.newWithMessage( "Cannot encode with empty tree")) def HuffmanTree.newWithLeafSet(HuffmanTree[] hs) = hs ~ match( [] => nil [$h] => h [$h1, $h2, ...$hs] => newWithLeafSet( addTree( HTbin.newWithLeftRight(h1, h2), hs)) ) } def HTtip = new HuffmanTree() { char x int w def ^syms() = [x] def ^weight() = w def ^decode(flag[] bs) = if (bs.null) x #fix the infinite recursion else throw(IllegalArgument.newWithMessage( "Too many bits")) def HTtip.newWithXW(XW xw) = new instance(x: xw.x, w: xw.w) def ^decode1(HuffmanTree h, flag[] bs) = x + h.decode(bs) def ^encode1(char x) = [] } def HTbin = new HuffmanTree() { char[] xs int w HuffmanTree h1, h2 def ^syms() = xs def ^weight() = w def HTbin.newWithLeftRight(HuffmanTree h1, h2) = new instance( xs: h1.syms + h2.syms, w: h1.weight + h2.weight, h1: h1, h2: h2) def ^decode1(HuffmanTree h, flag[] bs) = if (bs[1] = 0) h1.decode1(h, bs[2...]) else h2.decode1(h, bs[2...]) def ^encode1(char x) = if (h1.syms.contains(x)) [0] + h1.encode1(x) else if (h2.syms.contains(x)) [1] + h2.encode1(x) else throw(IllegalArgument.newWithMessage( "Unknown symbol: "(x))) } struct XW: char x; int w. def weights(char[] xs) = foreach(xws = [], x in xs.sort) addWeights(x, xws) def addWeights(char x, XW[] xws) = if (xws.null) [new XW.instance(x: x, w: 1)] else xws[1] -> xw if (x = xw.x) [new XW.instance(x: x, w: ++xw.w)] + xws[2...] else [new XW.instance(x: x, w: 1), xw] + xws[2...]