220 lines
7.5 KiB
C++
220 lines
7.5 KiB
C++
/*
|
|
Copyright (c) 2009-2010 Christopher A. Taylor. All rights reserved.
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are met:
|
|
|
|
* Redistributions of source code must retain the above copyright notice,
|
|
this list of conditions and the following disclaimer.
|
|
* Redistributions in binary form must reproduce the above copyright notice,
|
|
this list of conditions and the following disclaimer in the documentation
|
|
and/or other materials provided with the distribution.
|
|
* Neither the name of LibCat nor the names of its contributors may be used
|
|
to endorse or promote products derived from this software without
|
|
specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
#ifndef CAT_RANGE_CODER_HPP
|
|
#define CAT_RANGE_CODER_HPP
|
|
|
|
#include <cat/Platform.hpp>
|
|
#include <ostream>
|
|
#include <map>
|
|
#include <string>
|
|
|
|
namespace cat {
|
|
|
|
|
|
/*
|
|
TextStatsCollector
|
|
|
|
Collects order-1 statistics of the text given one character at a time.
|
|
Order-1 statistics include the likelihood of a character given the previous one.
|
|
|
|
This is intended to be used on a large sample of text (of unlimited length)
|
|
to come up with statistics that most text should follow. When the resulting
|
|
table is used with a range coder, compression should be reliably achieved,
|
|
though a bit should be allocated for the case where the result of the coder
|
|
would be longer than encoding with uniform ranges, like if someone enters "ZZZZ".
|
|
|
|
The RangeEncoder class has a text compressor based on the output of this class.
|
|
|
|
I opted for a static table because this is to be run on a network server
|
|
where many clients are connected. The memory needed for this type of
|
|
compression does not increase with the number of clients.
|
|
*/
|
|
class TextStatsCollector
|
|
{
|
|
u32 last, total, frequencies[256][256];
|
|
u8 seen[256];
|
|
|
|
public:
|
|
TextStatsCollector();
|
|
|
|
public:
|
|
// Record a character occurance
|
|
// 0 = end of line, so next character counts towards initial character frequency
|
|
void Tally(u8 x);
|
|
|
|
#if defined(CAT_PRAGMA_PACK)
|
|
#pragma pack(push)
|
|
#pragma pack(1)
|
|
#endif
|
|
|
|
struct TableFormat
|
|
{
|
|
// MurmurHash2 of remainder, with seed = 0
|
|
u32 hash;
|
|
|
|
// Total symbols in the table <= 256
|
|
u16 total;
|
|
|
|
// Fraction of a byte represented by total, scaled from [0, 2^15]
|
|
u16 log2total;
|
|
|
|
// ASCII character code -> Table index map
|
|
u8 char2index[256];
|
|
|
|
// Table index -> ASCII character code map
|
|
u8 index2char[256];
|
|
|
|
/*
|
|
Start of frequency table
|
|
|
|
The first 32 entries are used for reverse lookup (freq->symbol):
|
|
GET_SYMBOL_LUT() will get this address:
|
|
frequencies[0..15] = array of 16 bytes creating a lookup table (LUT) given
|
|
the high 4 bits of the frequency, for the low range
|
|
frequencies[16..31] = array of 16 bytes creating a lookup table (LUT) given
|
|
the high 4 bits of the frequency, for the high range
|
|
|
|
GET_SYMBOL_BASE() will get this address:
|
|
frequencies[32] = cumulative frequency for (last=0, this=1) out of 2^16 trials
|
|
frequencies[33] = cumulative frequency for (last=0, this=2) out of 2^16 trials
|
|
frequencies[34] = ...
|
|
|
|
Note: (0, 0) is implicitly zero, and (0, TOTAL) is implicitly 2^16
|
|
So these tables don't include those implicit entries.
|
|
*/
|
|
u16 frequencies[1];
|
|
} CAT_PACKED;
|
|
|
|
#if defined(CAT_PRAGMA_PACK)
|
|
#pragma pack(pop)
|
|
#endif
|
|
|
|
// Returns code that creates a table in the above format
|
|
bool GenerateMinimalStaticTable(const char *TableName, std::ostream &osout);
|
|
|
|
// Check for errors in the in-memory version of the table that was generated
|
|
static bool VerifyTableIntegrity(const TableFormat *table);
|
|
};
|
|
|
|
|
|
/*
|
|
Range Encoder
|
|
|
|
Encodes a single message one field at a time using the minimum
|
|
number of bits, rounded up to the next highest byte.
|
|
|
|
To ensure that the message does not grow in size, provide limited
|
|
space for the output buffer and check .Fail() at the end. If it
|
|
failed, this means it ran out of space during encoding.
|
|
|
|
If encoding succeeded, check .Used() to determine the number of
|
|
bytes used by the output buffer.
|
|
*/
|
|
class RangeEncoder
|
|
{
|
|
u8 *output;
|
|
int limit, remaining;
|
|
|
|
u64 low, range;
|
|
|
|
// Write out bytes as needed
|
|
CAT_INLINE void Normalize();
|
|
|
|
CAT_INLINE void GetTableSymbol(const TextStatsCollector::TableFormat *stats, u32 &last, u8 ch, u16 &symbol_low, u16 &symbol_range);
|
|
|
|
public:
|
|
// Ctors
|
|
RangeEncoder(void *output_i, int limit_i);
|
|
RangeEncoder(RangeEncoder &cp);
|
|
|
|
// Overwrite one context with another. Using context references instead
|
|
// of working on the contexts directly is probably more efficient.
|
|
RangeEncoder &operator=(RangeEncoder &cp);
|
|
|
|
// State accessors
|
|
bool Fail() { return output == 0; }
|
|
int Used() { return limit - remaining; }
|
|
|
|
public:
|
|
// Encode the given text with the given statistics
|
|
// NOTE: May be up to one byte longer than the original message
|
|
void Text(const char *msg, const TextStatsCollector::TableFormat *stats);
|
|
|
|
// Encode a biased bit given the frequency it is 0
|
|
// frequency = average times out of 2^32 trials the bit will be 0
|
|
void BiasedBit(u32 b, u32 frequency);
|
|
|
|
// Encode a field that takes on [0, total) values with equal likelihood
|
|
void Range(u32 symbol, u32 total);
|
|
|
|
// Emit the final byte(s) needed to encode the symbols
|
|
void Finish();
|
|
};
|
|
|
|
|
|
/*
|
|
Range Decoder
|
|
|
|
Interprets buffers produced by RangeEncoder
|
|
*/
|
|
class RangeDecoder
|
|
{
|
|
const u8 *input;
|
|
int remaining;
|
|
u64 code, low, range;
|
|
|
|
// Read in bytes as needed
|
|
CAT_INLINE void Normalize();
|
|
|
|
// Grab symbol low frequency and range from the table
|
|
CAT_INLINE u8 GetTableSymbol(const TextStatsCollector::TableFormat *stats, u32 &last, u16 freq, u16 &symbol_low, u16 &symbol_range);
|
|
|
|
public:
|
|
// Initializing constructor
|
|
RangeDecoder(const void *message, int bytes);
|
|
|
|
int Remaining() { return remaining; }
|
|
|
|
public:
|
|
// Decode the given text with the given statistics
|
|
int Text(char *msg, int buffer_size, const TextStatsCollector::TableFormat *stats);
|
|
|
|
// Decode a biased bit given the frequency it is 0
|
|
// frequency = average times out of 2^32 trials the bit will be 0
|
|
u32 BiasedBit(u32 frequency);
|
|
|
|
// Decode a field that takes on [0, total) values with equal likelihood
|
|
u32 Range(u32 total);
|
|
};
|
|
|
|
|
|
} // namespace cat
|
|
|
|
#endif // CAT_RANGE_CODER_HPP
|