// Burrows Wheeler Transform Encoder/Decoder
#ifdef unix
#define __cdecl
#endif
#include <stdlib.h>
#include <memory.h>
#include <string.h>
// these two values are stored together
// to improve processor cache hits
typedef struct {
unsigned prefix, offset;
} KeyPrefix;
// link to suffix sort module
extern KeyPrefix *bwtsort(unsigned char *, unsigned);
// these functions link bit-level I/O
void arc_put1 (int bit);
void arc_put8 (int byte);
int arc_get1 ();
int arc_get8 ();
// define 1/2 rle zero alphabet bits
#define HUFF_bit0 256
#define HUFF_bit1 257
// the size of the Huffman alphabet
#define HUFF_size 258
// store these values together for bwt decoding
typedef struct {
unsigned code:8;
unsigned cnt:24;
} Xform;
// the HuffMan table for each alphabet character
typedef struct {
unsigned len;
unsigned code;
} HuffTable;
HuffTable HuffCode[HUFF_size];
// used to construct the HuffCode table
struct Node {
struct Node *left, *right;
unsigned freq;
};
unsigned Freq[HUFF_size], ZeroCnt; // alphabet counts
unsigned char MtfOrder[256]; // move-to-front
// enumerate coding tree depths
unsigned enumerate (unsigned *codes, struct Node *node, unsigned depth)
{
unsigned one, two;
if( !node->right ) { // leaf node?
HuffCode[(int)(node->left)].len = depth;
codes[depth]++;
return depth;
}
one = enumerate (codes, node->left, depth + 1);
two = enumerate (codes, node->right, depth + 1);
// return the max depth of the two sub-trees
return one > two ? one : two;
}
int __cdecl comp_node (const void *left, const void *right)
{
return ((struct Node *)left)->freq - ((struct Node *)right)->freq;
}
// construct Huffman coding tree
void encode_tree ()
{
struct Node tree[2 * HUFF_size], *base = tree, *left, *right;
unsigned codes[32], rank[32], weight;
int idx, max;
int size;
// the node tree is built with all the base symbols
// then constructed nodes are appended
memset (HuffCode, 0, sizeof(HuffCode));
memset (tree, 0, sizeof(tree));
// sort base symbol nodes by their frequencies
for( size = 0; size < HUFF_size; size++ ) {
tree[size].left = (void *)size;
tree[size].freq = Freq[size];
tree[size].right = NULL; // indicates a base node
}
qsort (tree, HUFF_size, sizeof(struct Node), comp_node);
// repeatedly combine & remove two lowest freq nodes
// then construct a new node w/sum of these two freq
// and insert from the end of the tree (base + size)
while( size-- > 1 ) {
left = base;
if( weight = (base++)->freq )
weight += base->freq;
else
continue; // skip over zero freq nodes
right = base++;
idx = size;
// sort new node into place
while( --idx )
if( base[idx-1].freq > weight )
base[idx] = base[idx-1];
else
break;
// construct the new internal node
base[idx].freq = weight;
base[idx].right = right;
base[idx].left = left;
}
// base points at root of tree (size == 1)
// construct the Huffman code down from here
memset (codes, 0, sizeof(codes));
memset (rank, 0, sizeof(rank));
// enumerate the left & right subtrees,
// returns the deepest path to leaves
max = enumerate (rank, base, 0);
// use cannonical Huffman coding technique
// (from Steve Pigeon)
for( idx = 0; idx <= max; idx++ )
codes[idx + 1] = (codes[idx] + rank[idx]) << 1, rank[idx] = 0;
// set the code for each non-zero freq alphabet symbol
for( idx = 0; idx < HUFF_size; idx++ )
if( HuffCode[idx].len )
HuffCode[idx].code = codes[HuffCode[idx].len] + rank[HuffCode[idx].len]++;
}
// output code bits for one alphabet symbol
unsigned huff_encode (unsigned val)
{
unsigned mask = 1 << HuffCode[val].len;
unsigned code = HuffCode[val].code;
while( mask >>= 1 )
arc_put1 (code & mask);
return code;
}
// perform run-length-encoding
// using two new Huffman codes
// for RLE count bits 0 & 1
void rle_encode (unsigned char *out, unsigned size)
{
unsigned count, idx;
unsigned mask, code;
// transmit cannonical huff coding tree by
// sending 5 bits for each symbol's length
for( idx = 0; idx < HUFF_size; idx++ ) {
count = HuffCode[idx].len;
mask = 1 << 5;
while( mask >>= 1 )
arc_put1 (count & mask);
}
// accumulate RLE counts and encode BWT
// repeated zeroes are first counted,
// this count is transmitted in binary
// using 2 special HUFF alphabet symbols
// HUFF_bit0 and HUFF_bit1, representing
// count values 1 & 2:
// transmit HUFF_bit0 = count of 1
// transmit HUFF_bit1 = count of 2
// transmit HUFF_bit0, HUFF_bit0 = count of 3
// transmit HUFF_bit0, HUFF_bit1 = count of 4
// transmit HUFF_bit1, HUFF_bit0 = count of 5
// transmit HUFF_bit1, HUFF_bit1 = count of 6 ...
// to make decoding simpler, transmit final
// zero code separately from RLE
count = 0;
while( size-- ) {
if( !(code = *out++) && size ) {
count++; // accumulate RLE count
continue; // except for trailing RLE
}
while( count ) // transmit any RLE count bits
huff_encode (HUFF_bit0 + (--count & 0x1)), count >>= 1;
huff_encode (code);
}
}
// Move-to-Front decoder
unsigned mtf_decode (unsigned nxt)
{
unsigned char code;
// Pull the char
//
code = MtfOrder[nxt];
//
// Now shuffle the order array
//
memmove (MtfOrder + 1, MtfOrder, nxt);
return MtfOrder[0] = code;
}
// expand BWT into the supplied buffer
void rle_decode (Xform *xform, unsigned size, unsigned last)
{
unsigned xlate[HUFF_size], length[HUFF_size];
unsigned codes[32], rank[32], base[32], bits;
unsigned nxt, count, lvl, idx, out = 0, zero;
unsigned char prev;
// construct decode table
memset (codes, 0, sizeof(codes));
memset (rank, 0, sizeof(rank));
// retrieve code lengths, 5 bits each
for( idx = 0; idx < HUFF_size; idx++ ) {
for( length[idx] = bits = 0; bits < 5; bits++ )
length[idx] <<= 1, length[idx] |= arc_get1();
rank[length[idx]]++;
}
// construct cannonical Huffman code groups
// one group range for each bit length
base[0] = base[1] = 0;
for( idx = 1; idx < 30; idx++ ) {
codes[idx + 1] = (codes[idx] + rank[idx]) << 1;
base[idx + 1] = base[idx] + rank[idx];
rank[idx] = 0;
}
// fill in the translated Huffman codes
// filling in ranks for each code group
for( nxt = idx = 0; idx < HUFF_size; idx++ )
if( lvl = length[idx] )
xlate[base[lvl] + rank[lvl]++] = idx;
zero = prev = count = bits = lvl = 0;
// fill supplied buffer by reading the input
// one bit at a time and assembling codes
while( out < size ) {
bits <<= 1, bits |= arc_get1 ();
if( rank[++lvl] )
if( bits < codes[lvl] + rank[lvl] )
nxt = xlate[base[lvl] + bits - codes[lvl]];
else
continue; // the code is above the range for this length
else
continue; // no symbols with this code length
// nxt = the recognized symbol
// reset code accumulator
bits = lvl = 0;
// process RLE count bits as 1 or 2
if( nxt > 255 ) {
count += ( nxt - 255 ) << zero++;
continue;
}
// expand previous RLE count
while( count ) {
if( out != last )
xform[out].cnt = Freq[prev]++;
xform[out++].code = prev;
count--;
}
zero = 0;
prev = mtf_decode (nxt); // translate mtf of the symbol
if( out != last )
xform[out].cnt = Freq[prev]++;
xform[out++].code = prev; // store next symbol
}
}
// Move-to-Front encoder, and
// accumulate frequency counts
// using RLE coding (not for flush)
unsigned char mtf_encode (unsigned char val, int flush)
{
unsigned code;
code = (unsigned char *)memchr (MtfOrder, val, 256) - MtfOrder;
memmove (MtfOrder + 1, MtfOrder, code);
MtfOrder[0] = val;
if( !flush && !code )
return ZeroCnt++, code;
// accumulate the frequency counts for the
// new code and the previous zero run
Freq[code]++;
while( ZeroCnt )
Freq[HUFF_bit0 + (--ZeroCnt & 0x1)]++, ZeroCnt >>= 1;
return code;
}
// initialize Move-to-Front symbols
void mtf_init ()
{
unsigned idx;
for( idx = 0 ; idx < 256 ; idx++ )
MtfOrder[idx] = (unsigned char)idx;
}
// unpack next bwt segment from current stream into buffer
void bwt_decode (unsigned char *outbuff, unsigned buflen)
{
unsigned last, idx = 0;
Xform *xform;
unsigned ch;
xform = malloc ((buflen + 1 ) * sizeof(Xform));
mtf_init ();
// retrieve last row number
last = arc_get8 () << 16;
last |= arc_get8 () << 8;
last |= arc_get8 ();
// To determine a character's position in the output string given
// its position in the input string, we can use the knowledge about
// the fact that the output string is sorted. Each character 'c' will
// show up in the output stream in in position i, where i is the sum
// total of all characters in the input buffer that precede c in the
// alphabet (kept in the count array), plus the count of all
// occurences of 'c' previously in the block (kept in xform.cnt)
// The first part of this code calculates the running totals for all
// the characters in the alphabet. That satisfies the first part of the
// equation needed to determine where each 'c' will go in the output
// stream. Remember that the character pointed to by 'last' is a special
// end-of-buffer character that needs to be larger than any other char
// so we just skip over it while tallying counts
memset (Freq, 0, sizeof(Freq));
rle_decode (xform, buflen + 1, last);
for( idx = 1 ; idx < 256 ; idx++ )
Freq[idx] += Freq[idx-1];
// Once the transformation vector is in place, writing the
// output is just a matter of computing the indices. Note
// that we ignore the EOB from the end of data first, and
// process the array backwards from there
last = idx = buflen;
while( idx-- ) {
ch = outbuff[idx] = xform[last].code;
last = xform[last].cnt;
if( ch-- )
last += Freq[ch];
}
free (xform);
}
// pack next bwt segment into current stream
void bwt_encode (unsigned char *buff, unsigned max)
{
unsigned idx, last, off, size = 0;
unsigned char *out;
KeyPrefix *keys;
// zero freq counts
memset (Freq, 0, sizeof(Freq));
ZeroCnt = 0;
keys = bwtsort (buff, max);
out = malloc (max + 1);
// Finally, write out column L. Column L consists of all
// the prefix characters to the sorted strings, in order.
// It's easy to get the prefix character, but offset 0
// is handled with care, since its prefix character
// is the imaginary end-of-buffer character. Save the
// positions in L of the end-of-buffer character and
// and write it out at the end to the output stream.
mtf_init ();
idx = 0;
do if( off = keys[idx].offset )
out[size++] = mtf_encode (buff[--off], 0);
else
last = idx, out[size++] = mtf_encode (MtfOrder[0], 0);
while( ++idx < max );
out[size++] = mtf_encode (buff[max - 1], 1);
// transmit where the EOB is located
arc_put8 ((unsigned char)(last >> 16));
arc_put8 ((unsigned char)(last >> 8));
arc_put8 ((unsigned char)(last));
// construct huff coding tree and transmit code-lengths
encode_tree ();
// encode and transmit output
rle_encode (out, size);
free (keys);
free (out);
}
#ifdef CODERSTANDALONE
#include <stdio.h>
unsigned char ArcBit = 0, ArcChar = 0;
FILE *In, *Out;
int main (int argc, char **argv)
{
int mode, max, size, nxt;
unsigned char *buff;
if( argc > 1 )
mode = argv[1][0];
else
return 1;
if( !(In = fopen (argv[2], "rb")) )
return 1;
if( !(Out = fopen (argv[3], "wb")) )
return 1;
// decompression
while( mode == 'd' ) {
size = getc (In);
if( size < 0 )
return 0;
for( nxt = 0; nxt < 2; nxt++ )
size <<= 8, size |= getc (In);
ArcBit = 0;
if( size ) {
buff = malloc (size);
bwt_decode (buff, size);
}
for( nxt = 0; nxt < size; nxt++ )
putc (buff[nxt], Out);
if( size )
free (buff);
}
// compression
fseek(In, 0, 2);
size = ftell(In);
fseek (In, 0, 0);
do {
if( max = size > 900000 ? 900000 : size )
buff = malloc (max);
putc ((unsigned char)(max >> 16), Out);
putc ((unsigned char)(max >> 8), Out);
putc ((unsigned char)(max), Out);
for( nxt = 0; nxt < max; nxt++ )
buff[nxt] = getc(In);
if( max )
bwt_encode (buff, max), free (buff);
while( ArcBit ) // flush last few bits
arc_put1 (0);
} while( size -= max );
return 0;
}
void arc_put1 (int bit)
{
ArcChar <<= 1;
if( bit )
ArcChar |= 1;
if( ++ArcBit < 8 )
return;
putc (ArcChar, Out);
ArcChar = ArcBit = 0;
}
void arc_put8 (int ch)
{
int idx = 8;
while( idx-- )
arc_put1 (ch & 1 << idx);
}
int arc_get1 ()
{
if( !ArcBit )
ArcChar = getc (In), ArcBit = 8;
return ArcChar >> --ArcBit & 1;
}
int arc_get8 ()
{
int idx, result = 0;
for( idx = 0; idx < 8; idx++ )
result <<= 1, result |= arc_get1();
return result;
}
#endif