From bfb62d93180d32713757c5692ef3d0efc8ef413e Mon Sep 17 00:00:00 2001
From: David Bryant <david@wavpack.com>
Date: Thu, 23 Jul 2020 23:58:49 -0700
Subject: [PATCH] new algorithm for selective dictionary string recycling
 change default setting for filter to 16-bit codes (-8)

---
 README      |  40 +++--
 lzwfilter.c |   6 +-
 lzwlib.c    | 429 +++++++++++++++++++++++++++++++++++++---------------
 3 files changed, 340 insertions(+), 135 deletions(-)

diff --git a/README b/README
index 98ca4929..e77444e6 100644
--- a/README
+++ b/README
@@ -12,16 +12,27 @@ high speed compression or decompression facilities where lots of RAM for
 large dictionaries might not be available. I have used this in several
 projects for storing compressed firmware images, and once I even coded the
 decompressor in Z-80 assembly language for speed! Depending on the maximum
-symbol size selected, the implementation can require from 1024 to 261120
+symbol size selected, the implementation can require from 2368 to 335616
 bytes of RAM for decoding (and about half again more for encoding).
 
+This is a streaming compressor in that the data is not divided into blocks
+and no context information like dictionaries or Huffman tables are sent
+ahead of the compressed data (except for one byte to signal the maximum
+bit depth). This limits the maximum possible compression ratio compared to
+algorithms that significantly preprocess the data, but with the help of
+some enhancements to the LZW algorithm (described below) it is able to
+compress better than the UNIX "compress" utility (which is also LZW) and
+is in fact closer to and sometimes beats the compression level of "gzip".
+
 The symbols are stored in "adjusted binary" which provides somewhat better
 compression (with virtually no speed penalty) compared to the fixed word
-sizes normally used. To ensure good performance on data with varying
-characteristics (like executable images) the encoder resets as soon as the
-dictionary is full. Also, worst-case performance is limited to about 8%
-inflation by catching poor performance and forcing an early reset before
-longer symbols are sent.
+sizes normally used. Once the dictionary is full, the encoder returns to
+the beginning and recycles string codes that have not been used yet for
+longer strings. In this way the dictionary constantly "churns" based on the
+the incoming stream, thereby improving and adapting to optimal compression.
+The compression performance is constantly monitored and a dictionary flush
+is forced on stretches of negative compression which limits worst-case
+performance to about 8% inflation.
 
 LZW-AB consists of three standard C files: the library, a command-line
 filter demo using pipes, and a command-line test harness. Each program
@@ -42,7 +53,7 @@ cl -O2 lzwfilter.c lzwlib.c
 cl -O2 lzwtester.c lzwlib.c
 
 There are Windows binaries (built on MinGW) for the filter and the tester on the
-GitHub release page (v2). The "help" display for the filter looks like this:
+GitHub release page (v3). The "help" display for the filter looks like this:
 
  Usage:     lzwfilter [-options] [< infile] [> outfile]
 
@@ -53,9 +64,20 @@ GitHub release page (v2). The "help" display for the filter looks like this:
            -1     = maximum symbol size = 9 bits
            -2     = maximum symbol size = 10 bits
            -3     = maximum symbol size = 11 bits
-           -4     = maximum symbol size = 12 bits (default)
+           -4     = maximum symbol size = 12 bits
            -5     = maximum symbol size = 13 bits
            -6     = maximum symbol size = 14 bits
            -7     = maximum symbol size = 15 bits
-           -8     = maximum symbol size = 16 bits
+           -8     = maximum symbol size = 16 bits (default)
            -v     = verbose (display ratio and checksum)
+
+Here's the "help" display for the tester:
+
+ Usage:     lzwtester [options] file [...]
+
+ Options:   -1 ... -8 = test using only specified max symbol size (9 - 16)
+            -0        = cycle through all maximum symbol sizes (default)
+            -e        = exhaustive test (by successive truncation)
+            -f        = fuzz test (randomly corrupt compressed data)
+            -q        = quiet mode (only reports errors and summary)
+
diff --git a/lzwfilter.c b/lzwfilter.c
index 8793139e..92ece7fc 100644
--- a/lzwfilter.c
+++ b/lzwfilter.c
@@ -32,11 +32,11 @@ static const char *usage =
 "           -1     = maximum symbol size = 9 bits\n"
 "           -2     = maximum symbol size = 10 bits\n"
 "           -3     = maximum symbol size = 11 bits\n"
-"           -4     = maximum symbol size = 12 bits (default)\n"
+"           -4     = maximum symbol size = 12 bits\n"
 "           -5     = maximum symbol size = 13 bits\n"
 "           -6     = maximum symbol size = 14 bits\n"
 "           -7     = maximum symbol size = 15 bits\n"
-"           -8     = maximum symbol size = 16 bits\n"
+"           -8     = maximum symbol size = 16 bits (default)\n"
 "           -v     = verbose (display ratio and checksum)\n\n"
 " Web:       Visit www.github.com/dbry/lzw-ab for latest version and info\n\n";
 
@@ -87,7 +87,7 @@ static void write_buff (int value, void *ctx)
 
 int main (int argc, char **argv)
 {
-    int decompress = 0, maxbits = 12, verbose = 0, error = 0;
+    int decompress = 0, maxbits = 16, verbose = 0, error = 0;
     streamer reader, writer;
 
     memset (&reader, 0, sizeof (reader));
diff --git a/lzwlib.c b/lzwlib.c
index 0dba30f1..cee135ca 100644
--- a/lzwlib.c
+++ b/lzwlib.c
@@ -23,42 +23,91 @@
  * basic technique, and this implementation is no different. The target of
  * the present implementation is embedded systems, and so emphasis was placed
  * on simplicity, fast execution, and minimal RAM usage.
+ * 
+ * This is a streaming compressor in that the data is not divided into blocks
+ * and no context information like dictionaries or Huffman tables are sent
+ * ahead of the compressed data (except for one byte to signal the maximum
+ * bit depth). This limits the maximum possible compression ratio compared to
+ * algorithms that significantly preprocess the data, but with the help of
+ * some enhancements to the LZW algorithm (described below) it is able to
+ * compress better than the UNIX "compress" utility (which is also LZW) and
+ * is in fact closer to and sometimes beats the compression level of "gzip".
  *
  * The symbols are stored in "adjusted binary" which provides somewhat better
- * compression (with virtually no speed penalty) compared to the fixed word
- * sizes normally used. To ensure good performance on data with varying
- * characteristics (like executable images) the encoder resets as soon as the
- * dictionary is full. Also, worst-case performance is limited to about 8%
- * inflation by catching poor performance and forcing an early reset before
- * longer symbols are sent.
+ * compression, with virtually no speed penalty, compared to the fixed word
+ * sizes normally used. These are sometimes called "phased-in" binary codes
+ * and their use in LZW is described here:
+ *
+ *   R. N. Horspool, "Improving LZW (data compression algorithm)", Data
+ *   Compression Conference, pp. 332-341, 1991.
+ *
+ * Earlier versions of this compressor would reset as soon as the dictionary
+ * became full to ensure good performance on heterogenous data (such as tar
+ * files or executable images). While trivial to implement, this is not
+ * particularly efficient with homogeneous data (or in general) because we
+ * spend a lot of time sending short symbols where the compression is poor.
+ *
+ * This newer version utilizes a technique such that once the dictionary is
+ * full, we restart at the beginning and recycle only those codes that were
+ * seen only once. We know this because they are not referenced by longer
+ * strings, and are easy to replace in the dictionary for the same reason.
+ * Since they have only been seen once it's also more likely that we will
+ * be replacing them with a more common string, and this is especially
+ * true if the data characteristics are changing.
+ *
+ * Replacing string codes in this manner has the interesting side effect that
+ * some older shorter strings that the removed strings were based on will
+ * possibly become unreferenced themselves and be recycled on the next pass.
+ * In this way, the entire dictionary constantly "churns" based on the
+ * incoming stream, thereby improving and adapting to optimal compression.
+ *
+ * Even with this technique there is still a possibility that a sudden change
+ * in the data characteristics will appear, resulting in significant negative
+ * compression (up to 100% for 16-bit codes). To detect this case we generate
+ * an exponentially decaying average of the current compression ratio and reset
+ * when this hits about 1.06, which limits worst case inflation to about 8%.
  *
- * The maximum symbol size is configurable on the encode side (from 9 bits
- * to 16 bits) and determines the RAM footprint required by both sides and,
- * to a large extent, the compression performance. This information is
- * communicated to the decoder in the first stream byte so that it can
- * allocate accordingly. The RAM requirements are as follows:
+ * The maximum symbol size is configurable on the encode side (from 9 bits to
+ * 16 bits) and determines the RAM footprint required by both sides and, to a
+ * large extent, the compression performance. This information is communicated
+ * to the decoder in the first stream byte so that it can allocate accordingly.
+ * The RAM requirements are as follows:
  *
  *    maximum    encoder RAM   decoder RAM
  *  symbol size  requirement   requirement
  * -----------------------------------------
- *     9-bit     1792 bytes    1024 bytes
- *    10-bit     4352 bytes    3072 bytes
- *    11-bit     9472 bytes    7168 bytes
- *    12-bit     19712 bytes   15360 bytes
- *    13-bit     40192 bytes   31744 bytes
- *    14-bit     81152 bytes   64512 bytes
- *    15-bit     163072 bytes  130048 bytes
- *    16-bit     326912 bytes  261120 bytes
- * 
+ *     9-bit      4096 bytes    2368 bytes
+ *    10-bit      8192 bytes    4992 bytes
+ *    11-bit     16384 bytes   10240 bytes
+ *    12-bit     32768 bytes   20736 bytes
+ *    13-bit     65536 bytes   41728 bytes
+ *    14-bit    131072 bytes   83712 bytes
+ *    15-bit    262144 bytes  167680 bytes
+ *    16-bit    524288 bytes  335616 bytes
+ *
  * This implementation uses malloc(), but obviously an embedded version could
  * use static arrays instead if desired (assuming that the maxbits was
  * controlled outside).
  */
 
-#define NULL_CODE       -1      // indicates a NULL prefix
+#define NULL_CODE       65535   // indicates a NULL prefix (must be unsigned short)
 #define CLEAR_CODE      256     // code to flush dictionary and restart decoder
 #define FIRST_STRING    257     // code of first dictionary string
 
+/* This macro determines the number of bits required to represent the given value,
+ * not counting the implied MSB. For GNU C it will use the provided built-in,
+ * otherwise a comparison tree is employed. Note that in the non-GNU case, only
+ * values up to 65535 (15 bits) are supported.
+ */
+
+#ifdef __GNUC__
+#define CODE_BITS(n) (31 - __builtin_clz(n))
+#else
+#define CODE_BITS(n) ((n) < 4096 ?                                      \
+            ((n) < 1024  ? 8  + ((n) >= 512)  : 10 + ((n) >= 2048)) :   \
+            ((n) < 16384 ? 12 + ((n) >= 8192) : 14 + ((n) >= 32768)))
+#endif
+
 /* This macro writes the adjusted-binary symbol "code" given the maximum
  * symbol "maxcode". A macro is used here just to avoid the duplication in
  * the lzw_compress() function. The idea is that if "maxcode" is not one
@@ -71,24 +120,21 @@
  * the 4 codes from 254 to 257 take 9 bits.
  */
 
-#define WRITE_CODE(code,maxcode) do {                           \
-    int code_bits = (maxcode) < 4096 ?                          \
-        ((maxcode) < 1024  ? 8  + ((maxcode) >= 512)   :        \
-                             10 + ((maxcode) >= 2048)) :        \
-        ((maxcode) < 16384 ? 12 + ((maxcode) >= 8192)  :        \
-                             14 + ((maxcode) >= 32768));        \
-    int extras = (1 << (code_bits + 1)) - (maxcode) - 1;        \
-    if ((code) < extras) {                                      \
-        shifter |= ((long)(code) << bits);                      \
-        bits += code_bits;                                      \
-    }                                                           \
-    else {                                                      \
-        shifter |= ((long)(((code) + extras) >> 1) << bits);    \
-        bits += code_bits;                                      \
-        shifter |= ((long)(((code) + extras) & 1) << bits++);   \
-    }                                                           \
-    do { (*dst)(shifter,dstctx); shifter >>= 8; output_bytes++; \
-    } while ((bits -= 8) >= 8);                                 \
+#define WRITE_CODE(code,maxcode) do {                               \
+    unsigned int code_bits = CODE_BITS (maxcode);                   \
+    unsigned int extras = (2 << code_bits) - (maxcode) - 1;         \
+    if ((code) < extras) {                                          \
+        shifter |= ((code) << bits);                                \
+        bits += code_bits;                                          \
+    }                                                               \
+    else {                                                          \
+        shifter |= ((((code) + extras) >> 1) << bits);              \
+        bits += code_bits;                                          \
+        shifter |= ((((code) + extras) & 1) << bits++);             \
+    }                                                               \
+    do { (*dst)(shifter,dstctx); shifter >>= 8;                     \
+        output_bytes += 256;                                        \
+    } while ((bits -= 8) >= 8);                                     \
 } while (0)
 
 /* LZW compression function. Bytes (8-bit) are read and written through callbacks and the
@@ -100,13 +146,19 @@
  * multiple instances of the compression operation (but simple applications can ignore these).
  */
 
+typedef struct {
+    unsigned short first_reference, next_reference, back_reference;
+    unsigned char terminator;
+} encoder_entry_t;
+
 int lzw_compress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void *srcctx, int maxbits)
 {
-    int next = FIRST_STRING, prefix = NULL_CODE, bits = 0, total_codes, c;
-    unsigned long input_bytes = 0, output_bytes = 0;
-    unsigned short *first_references, *next_references;
-    unsigned char *terminators;
-    unsigned long shifter = 0;
+    unsigned int maxcode = FIRST_STRING, next_string = FIRST_STRING, prefix = NULL_CODE, total_codes;
+    unsigned int dictionary_full = 0, available_entries, max_available_entries, max_available_code;
+    unsigned int input_bytes = 65536, output_bytes = 65536;
+    unsigned int shifter = 0, bits = 0;
+    encoder_entry_t *dictionary;
+    int c;
 
     if (maxbits < 9 || maxbits > 16)    // check for valid "maxbits" setting
         return 1;
@@ -114,18 +166,17 @@ int lzw_compress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void
     // based on the "maxbits" parameter, compute total codes and allocate dictionary storage
 
     total_codes = 1 << maxbits;
-    first_references = malloc (total_codes * sizeof (first_references [0]));
-    next_references = malloc ((total_codes - 256) * sizeof (next_references [0]));
-    terminators = malloc ((total_codes - 256) * sizeof (terminators [0]));
+    dictionary = malloc (total_codes * sizeof (encoder_entry_t));
+    max_available_entries = total_codes - FIRST_STRING - 1;
+    max_available_code = total_codes - 2;
 
-    if (!first_references || !next_references || !terminators)
+    if (!dictionary)
         return 1;                       // failed malloc()
 
     // clear the dictionary
 
-    memset (first_references, 0, total_codes * sizeof (first_references [0]));
-    memset (next_references, 0, (total_codes - 256) * sizeof (next_references [0]));
-    memset (terminators, 0, (total_codes - 256) * sizeof (terminators [0]));
+    available_entries = max_available_entries;
+    memset (dictionary, 0, 256 * sizeof (encoder_entry_t));
 
     (*dst)(maxbits - 9, dstctx);    // first byte in output stream indicates the maximum symbol bits
 
@@ -135,58 +186,123 @@ int lzw_compress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void
     // variables and are sent to the output every time 8 bits are available (done in the macro).
 
     while ((c = (*src)(srcctx)) != EOF) {
-        int cti;                            // coding table index
+        unsigned int cti;                   // coding table index
 
-        input_bytes++;
+        input_bytes += 256;
 
         if (prefix == NULL_CODE) {          // this only happens the very first byte when we don't yet have a prefix
             prefix = c;
             continue;
         }
 
-        if ((cti = first_references [prefix])) {    // if any longer strings are built on the current prefix...
+        memset (dictionary + next_string, 0, sizeof (encoder_entry_t));
+
+        if ((cti = dictionary [prefix].first_reference)) {          // if any longer strings are built on the current prefix...
             while (1)
-                if (terminators [cti - 256] == c) { // we found a matching string, so we just update the prefix
-                    prefix = cti;                   // to that string and continue without sending anything
+                if (dictionary [cti].terminator == c) {             // we found a matching string, so we just update the prefix
+                    prefix = cti;                                   // to that string and continue without sending anything
                     break;
                 }
-                else if (!next_references [cti - 256]) {    // this string did not match the new character and
-                    next_references [cti - 256] = next;     // there aren't any more, so we'll add a new string
-                    cti = 0;                                // and point to it with "next_reference"
+                else if (!dictionary [cti].next_reference) {        // this string did not match the new character and
+                    dictionary [cti].next_reference = next_string;  // there aren't any more, so we'll add a new string,
+                                                                    // point to it with "next_reference", and also make the
+                    dictionary [next_string].back_reference = cti;  // "back_reference" which is used for recycling entries
+                    cti = 0;
                     break;
                 }
                 else
-                    cti = next_references [cti - 256];      // there are more possible matches to check, so loop back
+                    cti = dictionary [cti].next_reference;          // there are more possible matches to check, so loop back
+        }
+        else {                                                      // no longer strings are based on the current prefix, so now
+            dictionary [prefix].first_reference = next_string;      // the current prefix plus the new byte will be the next string
+            dictionary [next_string].back_reference = prefix;       // also make the back_reference used for recycling
+            if (prefix >= FIRST_STRING) available_entries--;        // the codes 0-255 are never available for recycling
         }
-        else                                        // no longer strings are based on the current prefix, so now
-            first_references [prefix] = next;       // the current prefix plus the new byte will be the next string
 
         // If "cti" is zero, we could not simply extend our "prefix" to a longer string because we did not find a
         // dictionary match, so we send the symbol representing the current "prefix" and add the new string to the
         // dictionary. Since the current byte "c" was not included in the prefix, that now becomes our new prefix.
 
         if (!cti) {
-            WRITE_CODE (prefix, next);              // send symbol for current prefix (0 to next-1)
-            terminators [next - 256] = c;           // newly created string has current byte as the terminator
-            prefix = c;                             // current byte also becomes new prefix for next string
-
-            // This is where we bump the next string index and decide whether to clear the dictionary and start over.
-            // The triggers for that are either the dictionary is full or we've been outputting too many bytes and
-            // decide to cut our losses before the symbols get any larger. Note that for the dictionary full case we
-            // do NOT send the CLEAR_CODE because the decoder knows about this and we don't want to be redundant.
-
-            if (++next == total_codes || output_bytes > 8 + input_bytes + (input_bytes >> 4)) {
-                if (next < total_codes)
-                    WRITE_CODE (CLEAR_CODE, next);
-
-                // clear the dictionary and reset the byte counters -- basically everything starts over
-                // except that we keep the last pending "prefix" (which, of course, was never sent)
-
-                memset (first_references, 0, total_codes * sizeof (first_references [0]));
-                memset (next_references, 0, (total_codes - 256) * sizeof (next_references [0]));
-                memset (terminators, 0, (total_codes - 256) * sizeof (terminators [0]));
-                input_bytes = output_bytes = 0;
-                next = FIRST_STRING;
+            WRITE_CODE (prefix, maxcode);               // send symbol for current prefix (0 to maxcode-1)
+            dictionary [next_string].terminator = c;    // newly created string has current byte as the terminator
+            prefix = c;                                 // current byte also becomes new prefix for next string
+
+            // If the dictionary is not full yet, we bump the maxcode and next_string and check to see if the
+            // dictionary is now full. If it is we set the dictionary_full flag and leave maxcode set to two
+            // less than total_codes because every string entry is now available for matching, but the actual
+            // maximum code is reserved for EOF.
+
+            if (!dictionary_full) {
+                dictionary_full = (++next_string > max_available_code);
+                maxcode++;
+            }
+
+            // If the dictionary is full we look for an entry to recycle starting at next_string (the one we
+            // just created or recycled) plus one (with check for wrap check). We know there is one because at
+            // a minimum the string we just added. This also takes care of removing the entry to be recycled
+            // (which is possible/easy because no longer strings have been based on it).
+
+            if (dictionary_full) {
+                for (next_string++; next_string <= max_available_code || (next_string = FIRST_STRING); next_string++)
+                    if (!dictionary [next_string].first_reference)
+                        break;
+
+                cti = dictionary [next_string].back_reference;  // dictionary [cti] references the entry we're
+                                                                // trying to recycle (either as a first or a next)
+
+                if (dictionary [cti].first_reference == next_string) {
+                    dictionary [cti].first_reference = dictionary [next_string].next_reference;
+
+                    // if we just cleared a first reference, and that string is not 0-255,
+                    // then that's a newly available entry
+                    if (!dictionary [cti].first_reference && cti >= FIRST_STRING)
+                        available_entries++;
+                }
+                else if (dictionary [cti].next_reference == next_string)    // fixup a "next_reference"
+                    dictionary [cti].next_reference = dictionary [next_string].next_reference;
+
+                // If the entry we're recycling had a next reference, then update the back reference
+                // so it's completely out of the chain. Of course we know it didn't have a first
+                // reference because then we wouldn't be recycling it.
+
+                if (dictionary [next_string].next_reference)
+                    dictionary [dictionary [next_string].next_reference].back_reference = cti;
+
+                // This check is technically not needed because there will always be an available entry
+                // (the last string we added at a minimum) but we don't want to get in a situation where
+                // we only have a few entries that we're cycling though. I pulled the limits (16 entries
+                // or 1% of total) out of a hat.
+
+                if (available_entries < 16 || available_entries * 100 < max_available_entries) {
+                    // clear the dictionary and reset the byte counters -- basically everything starts over
+                    // except that we keep the last pending "prefix" (which, of course, was never sent)
+
+                    WRITE_CODE (CLEAR_CODE, maxcode);
+                    memset (dictionary, 0, 256 * sizeof (encoder_entry_t));
+                    available_entries = max_available_entries;
+                    next_string = maxcode = FIRST_STRING;
+                    input_bytes = output_bytes = 65536;
+                    dictionary_full = 0;
+                }
+            }
+
+            // This is similar to the above check, except that it's used whether the dictionary is full or not.
+            // It uses an exponentially decaying average of the current compression ratio, so it can terminate
+            // very early if the incoming data is uncompressible or it can terminate any later time that the
+            // dictionary no longer compresses the incoming stream.
+
+            if (output_bytes > input_bytes + (input_bytes >> 4)) {
+                WRITE_CODE (CLEAR_CODE, maxcode);
+                memset (dictionary, 0, 256 * sizeof (encoder_entry_t));
+                available_entries = max_available_entries;
+                next_string = maxcode = FIRST_STRING;
+                input_bytes = output_bytes = 65536;
+                dictionary_full = 0;
+            }
+            else {
+                output_bytes -= output_bytes >> 8;
+                input_bytes -= input_bytes >> 8;
             }
         }
     }
@@ -194,18 +310,18 @@ int lzw_compress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void
     // we're done with input, so if we've received anything we still need to send that pesky pending prefix...
 
     if (prefix != NULL_CODE) {
-        WRITE_CODE (prefix, next);
+        WRITE_CODE (prefix, maxcode);
 
-        if (++next == total_codes)  // watch for clearing to the first string to stay in step with the decoder!
-            next = FIRST_STRING;    // (this was actually a corner-case bug that did not trigger often)
+        if (!dictionary_full)
+            maxcode++;
     }
 
-    WRITE_CODE (next, next);        // the maximum possible code is always reserved for our END_CODE
+    WRITE_CODE (maxcode, maxcode);  // the maximum possible code is always reserved for our END_CODE
 
     if (bits)                       // finally, flush any pending bits from the shifter
         (*dst)(shifter, dstctx);
 
-    free (terminators); free (next_references); free (first_references);
+    free (dictionary);
     return 0;
 }
 
@@ -220,12 +336,18 @@ int lzw_compress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void
  * (but simple applications can ignore these).
  */
 
+typedef struct {
+    unsigned char terminator, extra_references;
+    unsigned short prefix;
+} decoder_entry_t;
+
 int lzw_decompress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void *srcctx)
 {
-    int read_byte, next = FIRST_STRING, prefix = CLEAR_CODE, bits = 0, total_codes;
-    unsigned char *terminators, *reverse_buffer;
-    unsigned long shifter = 0;
-    unsigned short *prefixes;
+    unsigned int maxcode = FIRST_STRING, next_string = FIRST_STRING - 1, prefix = CLEAR_CODE;
+    unsigned int dictionary_full = 0, max_available_code, total_codes;
+    unsigned int shifter = 0, bits = 0, read_byte, i;
+    unsigned char *reverse_buffer, *referenced;
+    decoder_entry_t *dictionary;
 
     if ((read_byte = ((*src)(srcctx))) == EOF || (read_byte & 0xf8))  //sanitize first byte
         return 1;
@@ -233,36 +355,47 @@ int lzw_decompress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), voi
     // based on the "maxbits" parameter, compute total codes and allocate dictionary storage
 
     total_codes = 512 << (read_byte & 0x7);
-    reverse_buffer = malloc ((total_codes - 256) * sizeof (reverse_buffer [0]));
-    prefixes = malloc ((total_codes - 256) * sizeof (prefixes [0]));
-    terminators = malloc ((total_codes - 256) * sizeof (terminators [0]));
-
-    if (!reverse_buffer || !prefixes || !terminators)       // check for malloc() failure
+    max_available_code = total_codes - 2;
+    dictionary = malloc (total_codes * sizeof (decoder_entry_t));
+    reverse_buffer = malloc (total_codes - 256);
+    referenced = malloc (total_codes / 8);  // bitfield indicating code is referenced at least once
+
+    // Note that to implement the dictionary entry recycling we have to keep track of how many
+    // longer strings are based on each string in the dictionary. This can be between 0 (no
+    // references) to 256 (every possible next byte), but unfortunately that's one more value
+    // than what can be stored in a byte. The solution is to have a single bit for each entry
+    // indicating any references (i.e., the code cannot be recycled) and an additional byte
+    // in the dictionary entry struct counting the "extra" references (beyond one).
+
+    if (!reverse_buffer || !dictionary)         // check for malloc() failure
         return 1;
 
+    for (i = 0; i < 256; ++i) {                 // these never change
+        dictionary [i].prefix = NULL_CODE;
+        dictionary [i].terminator = i;
+    }
+
     // This is the main loop where we read input symbols. The values range from 0 to the code value
     // of the "next" string in the dictionary (although the actual "next" code cannot be used yet,
     // and so we reserve that code for the END_CODE). Note that receiving an EOF from the input
     // stream is actually an error because we should have gotten the END_CODE first.
 
     while (1) {
-        int code_bits = next < 4096 ?
-            (next < 1024  ? 8  + (next >= 512)  : 10 + (next >= 2048)) :
-            (next < 16384 ? 12 + (next >= 8192) : 14 + (next >= 32768));
-        int extras = (1 << (code_bits + 1)) - next - 1, code;
+        unsigned int code_bits = CODE_BITS (maxcode), code;
+        unsigned int extras = (2 << code_bits) - maxcode - 1;
 
         do {
             if ((read_byte = ((*src)(srcctx))) == EOF) {
-                free (terminators); free (prefixes); free (reverse_buffer);
+                free (dictionary); free (reverse_buffer); free (referenced);
                 return 1;
             }
 
-            shifter |= (long) read_byte << bits;
+            shifter |= read_byte << bits;
         } while ((bits += 8) < code_bits);
 
         // first we assume the code will fit in the minimum number of required bits
 
-        code = (int) shifter & ((1 << code_bits) - 1);
+        code = shifter & ((1 << code_bits) - 1);
         shifter >>= code_bits;
         bits -= code_bits;
 
@@ -272,11 +405,11 @@ int lzw_decompress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), voi
         if (code >= extras) {
             if (!bits) {
                 if ((read_byte = ((*src)(srcctx))) == EOF) {
-                    free (terminators); free (prefixes); free (reverse_buffer);
+                    free (dictionary); free (reverse_buffer); free (referenced);
                     return 1;
                 }
 
-                shifter = (long) read_byte;
+                shifter = read_byte;
                 bits = 8;
             }
 
@@ -285,46 +418,96 @@ int lzw_decompress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), voi
             bits--;
         }
 
-        if (code == next)                   // sending the maximum code is reserved for the end of the file
+        if (code == maxcode)                // sending the maximum code is reserved for the end of the file
             break;
-        else if (code == CLEAR_CODE)        // otherwise check for a CLEAR_CODE to start over early
-            next = FIRST_STRING;
+        else if (code == CLEAR_CODE) {      // otherwise check for a CLEAR_CODE to start over early
+            next_string = FIRST_STRING - 1;
+            maxcode = FIRST_STRING;
+            dictionary_full = 0;
+        }
         else if (prefix == CLEAR_CODE) {    // this only happens at the first symbol which is always sent
             (*dst)(code, dstctx);           // literally and becomes our initial prefix
-            next++;
+            next_string++;
+            maxcode++;
         }
         // Otherwise we have a valid prefix so we step through the string from end to beginning storing the
         // bytes in the "reverse_buffer", and then we send them out in the proper order. One corner-case
         // we have to handle here is that the string might be the same one that is actually being defined
-        // now (code == next-1). Also, the first 256 entries of "terminators" and "prefixes" are fixed and
-        // not allocated, so that messes things up a bit.
+        // now (code == next_string).
         else {
-            int cti = (code == next-1) ? prefix : code;
+            unsigned int cti = (code == next_string) ? prefix : code;
             unsigned char *rbp = reverse_buffer, c;
 
-            do *rbp++ = cti < 256 ? cti : terminators [cti - 256];      // step backward through string...
-            while ((cti = (cti < 256) ? NULL_CODE : prefixes [cti - 256]) != NULL_CODE);
+            do {
+                *rbp++ = dictionary [cti].terminator;
+                if (rbp == reverse_buffer + total_codes - 256) {
+                    free (dictionary); free (reverse_buffer); free (referenced);
+                    return 1;
+                }
+            } while ((cti = dictionary [cti].prefix) != NULL_CODE);
 
             c = *--rbp;     // the first byte in this string is the terminator for the last string, which is
                             // the one that we'll create a new dictionary entry for this time
 
-            do (*dst)(*rbp, dstctx);                // send string in corrected order (except for the terminator
-            while (rbp-- != reverse_buffer);        // which we don't know yet)
+            do      // send string in corrected order (except for the terminator which we don't know yet)
+                (*dst)(*rbp, dstctx);
+            while (rbp-- != reverse_buffer);
 
-            if (code == next-1)
+            if (code == next_string) {
                 (*dst)(c,dstctx);
+            }
+
+            // This should always execute (the conditional is to catch corruptions) and is where we add a new string to
+            // the dictionary, either at the end or elsewhere when we are "recycling" entries that were never referenced
 
-            prefixes [next - 1 - 256] = prefix;     // now update the next dictionary entry with the new string
-            terminators [next - 1 - 256] = c;       // (but we're always one behind, so it's not the string just sent)
+            if (next_string >= FIRST_STRING && next_string < total_codes) {
+                if (referenced [prefix >> 3] & (1 << (prefix & 7)))     // increment reference count on prefix
+                    dictionary [prefix].extra_references++;
+                else
+                    referenced [prefix >> 3] |= 1 << (prefix & 7);
 
-            if (++next == total_codes)              // check for full dictionary, which forces a reset (and, BTW,
-                next = FIRST_STRING;                // means we'll never use the dictionary entry we just wrote)
+                dictionary [next_string].prefix = prefix;       // now update the next dictionary entry with the new string
+                dictionary [next_string].terminator = c;        // (but we're always one behind, so it's not the string just sent)
+                dictionary [next_string].extra_references = 0;  // newly created string has not been referenced
+                referenced [next_string >> 3] &= ~(1 << (next_string & 7));
+            }
+
+            // If the dictionary is not full yet, we bump the maxcode and next_string and check to see if the
+            // dictionary is now full. If it is we set the dictionary_full flag and set next_string back to the
+            // beginning of the dictionary strings to start recycling them. Note that then maxcode will remain
+            // two less than total_codes because every string entry is available for matching, and the actual
+            // maximum code is reserved for EOF.
+
+            if (!dictionary_full) {
+                maxcode++;
+
+                if (++next_string > max_available_code) {
+                    dictionary_full = 1;
+                    maxcode--;
+                }
+            }
+
+            // If the dictionary is full we look for an entry to recycle starting at next_string (the one we
+            // created or recycled) plus one. We know there is one because at a minimum the string we just added
+            // has not been referenced). This also takes care of removing the entry to be recycled (which is
+            // possible/easy because no longer strings have been based on it).
+
+            if (dictionary_full) {
+                for (next_string++; next_string <= max_available_code || (next_string = FIRST_STRING); next_string++)
+                    if (!(referenced [next_string >> 3] & (1 << (next_string & 7))))
+                        break;
+
+                if (dictionary [dictionary [next_string].prefix].extra_references)
+                    dictionary [dictionary [next_string].prefix].extra_references--;
+                else
+                    referenced [dictionary [next_string].prefix >> 3] &= ~(1 << (dictionary [next_string].prefix & 7));
+            }
         }
 
         prefix = code;      // the code we just received becomes the prefix for the next dictionary string entry
                             // (which we'll create once we find out the terminator)
     }
 
-    free (terminators); free (prefixes); free (reverse_buffer);
+    free (dictionary); free (reverse_buffer); free (referenced);
     return 0;
 }