mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-06-08 05:27:14 +09:00
LibCompress: Refactor deflate de/compressor using zlib
Also remove two crash tests that are not relevant anymore because the implementation changed substantially.
This commit is contained in:
parent
2baa7977a4
commit
1c2b373e9c
Notes:
github-actions[bot]
2025-03-19 12:48:41 +00:00
Author: https://github.com/devgianlu
Commit: 1c2b373e9c
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/3755
Reviewed-by: https://github.com/gmta ✅
Reviewed-by: https://github.com/trflynn89
8 changed files with 56 additions and 1018 deletions
|
@ -1,25 +1,19 @@
|
|||
/*
|
||||
* Copyright (c) 2020, the SerenityOS developers.
|
||||
* Copyright (c) 2021, Idan Horowitz <idan.horowitz@serenityos.org>
|
||||
* Copyright (c) 2025, Altomani Gianluca <altomanigianluca@gmail.com>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <AK/Array.h>
|
||||
#include <AK/Assertions.h>
|
||||
#include <AK/BinarySearch.h>
|
||||
#include <AK/MemoryStream.h>
|
||||
#include <LibCompress/Deflate.h>
|
||||
#include <LibCompress/Huffman.h>
|
||||
#include <LibCompress/DeflateTables.h>
|
||||
|
||||
#include <zlib.h>
|
||||
|
||||
namespace Compress {
|
||||
|
||||
static constexpr u8 deflate_special_code_length_copy = 16;
|
||||
static constexpr u8 deflate_special_code_length_zeros = 17;
|
||||
static constexpr u8 deflate_special_code_length_long_zeros = 18;
|
||||
|
||||
static constexpr int EndOfBlock = 256;
|
||||
|
||||
CanonicalCode const& CanonicalCode::fixed_literal_codes()
|
||||
{
|
||||
static CanonicalCode code;
|
||||
|
@ -165,807 +159,28 @@ ErrorOr<u32> CanonicalCode::read_symbol(LittleEndianInputBitStream& stream) cons
|
|||
return Error::from_string_literal("Symbol exceeds maximum symbol number");
|
||||
}
|
||||
|
||||
DeflateDecompressor::CompressedBlock::CompressedBlock(DeflateDecompressor& decompressor, CanonicalCode literal_codes, Optional<CanonicalCode> distance_codes)
|
||||
: m_decompressor(decompressor)
|
||||
, m_literal_codes(literal_codes)
|
||||
, m_distance_codes(distance_codes)
|
||||
{
|
||||
}
|
||||
|
||||
ErrorOr<bool> DeflateDecompressor::CompressedBlock::try_read_more()
|
||||
{
|
||||
if (m_eof == true)
|
||||
return false;
|
||||
|
||||
auto const symbol = TRY(m_literal_codes.read_symbol(*m_decompressor.m_input_stream));
|
||||
|
||||
if (symbol >= 286)
|
||||
return Error::from_string_literal("Invalid deflate literal/length symbol");
|
||||
|
||||
if (symbol < EndOfBlock) {
|
||||
u8 byte_symbol = symbol;
|
||||
m_decompressor.m_output_buffer.write({ &byte_symbol, sizeof(byte_symbol) });
|
||||
return true;
|
||||
}
|
||||
|
||||
if (symbol == EndOfBlock) {
|
||||
m_eof = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!m_distance_codes.has_value())
|
||||
return Error::from_string_literal("Distance codes have not been initialized");
|
||||
|
||||
auto const length = TRY(m_decompressor.decode_length(symbol));
|
||||
auto const distance_symbol = TRY(m_distance_codes.value().read_symbol(*m_decompressor.m_input_stream));
|
||||
if (distance_symbol >= 30)
|
||||
return Error::from_string_literal("Invalid deflate distance symbol");
|
||||
|
||||
auto const distance = TRY(m_decompressor.decode_distance(distance_symbol));
|
||||
|
||||
auto copied_length = TRY(m_decompressor.m_output_buffer.copy_from_seekback(distance, length));
|
||||
|
||||
// TODO: What should we do if the output buffer is full?
|
||||
VERIFY(copied_length == length);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
DeflateDecompressor::UncompressedBlock::UncompressedBlock(DeflateDecompressor& decompressor, size_t length)
|
||||
: m_decompressor(decompressor)
|
||||
, m_bytes_remaining(length)
|
||||
{
|
||||
}
|
||||
|
||||
ErrorOr<bool> DeflateDecompressor::UncompressedBlock::try_read_more()
|
||||
{
|
||||
if (m_bytes_remaining == 0)
|
||||
return false;
|
||||
|
||||
if (m_decompressor.m_input_stream->is_eof())
|
||||
return Error::from_string_literal("Input data ends in the middle of an uncompressed DEFLATE block");
|
||||
|
||||
Array<u8, 4096> temporary_buffer;
|
||||
auto readable_bytes = temporary_buffer.span().trim(min(m_bytes_remaining, m_decompressor.m_output_buffer.empty_space()));
|
||||
auto read_bytes = TRY(m_decompressor.m_input_stream->read_some(readable_bytes));
|
||||
auto written_bytes = m_decompressor.m_output_buffer.write(read_bytes);
|
||||
VERIFY(read_bytes.size() == written_bytes);
|
||||
|
||||
m_bytes_remaining -= written_bytes;
|
||||
return true;
|
||||
}
|
||||
|
||||
ErrorOr<NonnullOwnPtr<DeflateDecompressor>> DeflateDecompressor::construct(MaybeOwned<LittleEndianInputBitStream> stream)
|
||||
{
|
||||
auto output_buffer = TRY(CircularBuffer::create_empty(32 * KiB));
|
||||
return TRY(adopt_nonnull_own_or_enomem(new (nothrow) DeflateDecompressor(move(stream), move(output_buffer))));
|
||||
}
|
||||
|
||||
DeflateDecompressor::DeflateDecompressor(MaybeOwned<LittleEndianInputBitStream> stream, CircularBuffer output_buffer)
|
||||
: m_input_stream(move(stream))
|
||||
, m_output_buffer(move(output_buffer))
|
||||
{
|
||||
}
|
||||
|
||||
DeflateDecompressor::~DeflateDecompressor()
|
||||
{
|
||||
if (m_state == State::ReadingCompressedBlock)
|
||||
m_compressed_block.~CompressedBlock();
|
||||
if (m_state == State::ReadingUncompressedBlock)
|
||||
m_uncompressed_block.~UncompressedBlock();
|
||||
}
|
||||
|
||||
ErrorOr<Bytes> DeflateDecompressor::read_some(Bytes bytes)
|
||||
{
|
||||
size_t total_read = 0;
|
||||
while (total_read < bytes.size()) {
|
||||
auto slice = bytes.slice(total_read);
|
||||
|
||||
if (m_state == State::Idle) {
|
||||
if (m_read_final_block)
|
||||
break;
|
||||
|
||||
m_read_final_block = TRY(m_input_stream->read_bit());
|
||||
auto const block_type = TRY(m_input_stream->read_bits(2));
|
||||
|
||||
if (block_type == 0b00) {
|
||||
m_input_stream->align_to_byte_boundary();
|
||||
|
||||
u16 length = TRY(m_input_stream->read_value<LittleEndian<u16>>());
|
||||
u16 negated_length = TRY(m_input_stream->read_value<LittleEndian<u16>>());
|
||||
|
||||
if ((length ^ 0xffff) != negated_length)
|
||||
return Error::from_string_literal("Calculated negated length does not equal stored negated length");
|
||||
|
||||
m_state = State::ReadingUncompressedBlock;
|
||||
new (&m_uncompressed_block) UncompressedBlock(*this, length);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (block_type == 0b01) {
|
||||
m_state = State::ReadingCompressedBlock;
|
||||
new (&m_compressed_block) CompressedBlock(*this, CanonicalCode::fixed_literal_codes(), CanonicalCode::fixed_distance_codes());
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (block_type == 0b10) {
|
||||
CanonicalCode literal_codes;
|
||||
Optional<CanonicalCode> distance_codes;
|
||||
TRY(decode_codes(literal_codes, distance_codes));
|
||||
|
||||
m_state = State::ReadingCompressedBlock;
|
||||
new (&m_compressed_block) CompressedBlock(*this, literal_codes, distance_codes);
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
return Error::from_string_literal("Unhandled block type for Idle state");
|
||||
}
|
||||
|
||||
if (m_state == State::ReadingCompressedBlock) {
|
||||
auto nread = m_output_buffer.read(slice).size();
|
||||
|
||||
while (nread < slice.size() && TRY(m_compressed_block.try_read_more())) {
|
||||
nread += m_output_buffer.read(slice.slice(nread)).size();
|
||||
}
|
||||
|
||||
total_read += nread;
|
||||
if (nread == slice.size())
|
||||
break;
|
||||
|
||||
m_compressed_block.~CompressedBlock();
|
||||
m_state = State::Idle;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
if (m_state == State::ReadingUncompressedBlock) {
|
||||
auto nread = m_output_buffer.read(slice).size();
|
||||
|
||||
while (nread < slice.size() && TRY(m_uncompressed_block.try_read_more())) {
|
||||
nread += m_output_buffer.read(slice.slice(nread)).size();
|
||||
}
|
||||
|
||||
total_read += nread;
|
||||
if (nread == slice.size())
|
||||
break;
|
||||
|
||||
m_uncompressed_block.~UncompressedBlock();
|
||||
m_state = State::Idle;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
VERIFY_NOT_REACHED();
|
||||
}
|
||||
|
||||
return bytes.slice(0, total_read);
|
||||
}
|
||||
|
||||
bool DeflateDecompressor::is_eof() const { return m_state == State::Idle && m_read_final_block; }
|
||||
|
||||
ErrorOr<size_t> DeflateDecompressor::write_some(ReadonlyBytes)
|
||||
{
|
||||
return Error::from_errno(EBADF);
|
||||
}
|
||||
|
||||
bool DeflateDecompressor::is_open() const
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
void DeflateDecompressor::close()
|
||||
ErrorOr<NonnullOwnPtr<DeflateDecompressor>> DeflateDecompressor::create(MaybeOwned<Stream> stream)
|
||||
{
|
||||
auto buffer = TRY(AK::FixedArray<u8>::create(16 * 1024));
|
||||
auto zstream = TRY(GenericZlibDecompressor::new_z_stream(-MAX_WBITS));
|
||||
return adopt_nonnull_own_or_enomem(new (nothrow) DeflateDecompressor(move(buffer), move(stream), zstream));
|
||||
}
|
||||
|
||||
ErrorOr<ByteBuffer> DeflateDecompressor::decompress_all(ReadonlyBytes bytes)
|
||||
{
|
||||
FixedMemoryStream memory_stream { bytes };
|
||||
LittleEndianInputBitStream bit_stream { MaybeOwned<Stream>(memory_stream) };
|
||||
auto deflate_stream = TRY(DeflateDecompressor::construct(MaybeOwned<LittleEndianInputBitStream>(bit_stream)));
|
||||
return deflate_stream->read_until_eof(4096);
|
||||
return ::Compress::decompress_all<DeflateDecompressor>(bytes);
|
||||
}
|
||||
|
||||
ErrorOr<u32> DeflateDecompressor::decode_length(u32 symbol)
|
||||
ErrorOr<NonnullOwnPtr<DeflateCompressor>> DeflateCompressor::create(MaybeOwned<Stream> stream, GenericZlibCompressionLevel compression_level)
|
||||
{
|
||||
if (symbol <= 264)
|
||||
return symbol - 254;
|
||||
|
||||
if (symbol <= 284) {
|
||||
auto extra_bits = (symbol - 261) / 4;
|
||||
return (((symbol - 265) % 4 + 4) << extra_bits) + 3 + TRY(m_input_stream->read_bits(extra_bits));
|
||||
}
|
||||
|
||||
if (symbol == 285)
|
||||
return DeflateDecompressor::max_back_reference_length;
|
||||
|
||||
VERIFY_NOT_REACHED();
|
||||
auto buffer = TRY(AK::FixedArray<u8>::create(16 * 1024));
|
||||
auto zstream = TRY(GenericZlibCompressor::new_z_stream(-MAX_WBITS, compression_level));
|
||||
return adopt_nonnull_own_or_enomem(new (nothrow) DeflateCompressor(move(buffer), move(stream), zstream));
|
||||
}
|
||||
|
||||
ErrorOr<u32> DeflateDecompressor::decode_distance(u32 symbol)
|
||||
ErrorOr<ByteBuffer> DeflateCompressor::compress_all(ReadonlyBytes bytes, GenericZlibCompressionLevel compression_level)
|
||||
{
|
||||
if (symbol <= 3)
|
||||
return symbol + 1;
|
||||
|
||||
if (symbol <= 29) {
|
||||
auto extra_bits = (symbol / 2) - 1;
|
||||
return ((symbol % 2 + 2) << extra_bits) + 1 + TRY(m_input_stream->read_bits(extra_bits));
|
||||
}
|
||||
|
||||
VERIFY_NOT_REACHED();
|
||||
}
|
||||
|
||||
ErrorOr<void> DeflateDecompressor::decode_codes(CanonicalCode& literal_code, Optional<CanonicalCode>& distance_code)
|
||||
{
|
||||
auto literal_code_count = TRY(m_input_stream->read_bits(5)) + 257;
|
||||
auto distance_code_count = TRY(m_input_stream->read_bits(5)) + 1;
|
||||
auto code_length_count = TRY(m_input_stream->read_bits(4)) + 4;
|
||||
|
||||
// First we have to extract the code lengths of the code that was used to encode the code lengths of
|
||||
// the code that was used to encode the block.
|
||||
|
||||
u8 code_lengths_code_lengths[19] = { 0 };
|
||||
for (size_t i = 0; i < code_length_count; ++i) {
|
||||
code_lengths_code_lengths[code_lengths_code_lengths_order[i]] = TRY(m_input_stream->read_bits(3));
|
||||
}
|
||||
|
||||
// Now we can extract the code that was used to encode the code lengths of the code that was used to
|
||||
// encode the block.
|
||||
auto const code_length_code = TRY(CanonicalCode::from_bytes({ code_lengths_code_lengths, sizeof(code_lengths_code_lengths) }));
|
||||
|
||||
// Next we extract the code lengths of the code that was used to encode the block.
|
||||
Vector<u8, 286> code_lengths;
|
||||
while (code_lengths.size() < literal_code_count + distance_code_count) {
|
||||
auto symbol = TRY(code_length_code.read_symbol(*m_input_stream));
|
||||
|
||||
if (symbol < deflate_special_code_length_copy) {
|
||||
code_lengths.append(static_cast<u8>(symbol));
|
||||
} else if (symbol == deflate_special_code_length_copy) {
|
||||
if (code_lengths.is_empty())
|
||||
return Error::from_string_literal("Found no codes to copy before a copy block");
|
||||
auto nrepeat = 3 + TRY(m_input_stream->read_bits(2));
|
||||
for (size_t j = 0; j < nrepeat; ++j)
|
||||
code_lengths.append(code_lengths.last());
|
||||
} else if (symbol == deflate_special_code_length_zeros) {
|
||||
auto nrepeat = 3 + TRY(m_input_stream->read_bits(3));
|
||||
for (size_t j = 0; j < nrepeat; ++j)
|
||||
code_lengths.append(0);
|
||||
} else {
|
||||
VERIFY(symbol == deflate_special_code_length_long_zeros);
|
||||
auto nrepeat = 11 + TRY(m_input_stream->read_bits(7));
|
||||
for (size_t j = 0; j < nrepeat; ++j)
|
||||
code_lengths.append(0);
|
||||
}
|
||||
}
|
||||
|
||||
if (code_lengths.size() != literal_code_count + distance_code_count)
|
||||
return Error::from_string_literal("Number of code lengths does not match the sum of codes");
|
||||
|
||||
// Now we extract the code that was used to encode literals and lengths in the block.
|
||||
literal_code = TRY(CanonicalCode::from_bytes(code_lengths.span().trim(literal_code_count)));
|
||||
|
||||
// Now we extract the code that was used to encode distances in the block.
|
||||
|
||||
if (distance_code_count == 1) {
|
||||
auto length = code_lengths[literal_code_count];
|
||||
|
||||
if (length == 0)
|
||||
return {};
|
||||
else if (length != 1)
|
||||
return Error::from_string_literal("Length for a single distance code is longer than 1");
|
||||
}
|
||||
|
||||
distance_code = TRY(CanonicalCode::from_bytes(code_lengths.span().slice(literal_code_count)));
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
ErrorOr<NonnullOwnPtr<DeflateCompressor>> DeflateCompressor::construct(MaybeOwned<Stream> stream, CompressionLevel compression_level)
|
||||
{
|
||||
auto bit_stream = TRY(try_make<LittleEndianOutputBitStream>(move(stream)));
|
||||
auto deflate_compressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) DeflateCompressor(move(bit_stream), compression_level)));
|
||||
return deflate_compressor;
|
||||
}
|
||||
|
||||
DeflateCompressor::DeflateCompressor(NonnullOwnPtr<LittleEndianOutputBitStream> stream, CompressionLevel compression_level)
|
||||
: m_compression_level(compression_level)
|
||||
, m_compression_constants(compression_constants[static_cast<int>(m_compression_level)])
|
||||
, m_output_stream(move(stream))
|
||||
{
|
||||
m_symbol_frequencies.fill(0);
|
||||
m_distance_frequencies.fill(0);
|
||||
}
|
||||
|
||||
DeflateCompressor::~DeflateCompressor() = default;
|
||||
|
||||
ErrorOr<Bytes> DeflateCompressor::read_some(Bytes)
|
||||
{
|
||||
return Error::from_errno(EBADF);
|
||||
}
|
||||
|
||||
ErrorOr<size_t> DeflateCompressor::write_some(ReadonlyBytes bytes)
|
||||
{
|
||||
VERIFY(!m_finished);
|
||||
|
||||
size_t total_written = 0;
|
||||
while (!bytes.is_empty()) {
|
||||
auto n_written = bytes.copy_trimmed_to(pending_block().slice(m_pending_block_size));
|
||||
m_pending_block_size += n_written;
|
||||
|
||||
if (m_pending_block_size == block_size)
|
||||
TRY(flush());
|
||||
|
||||
bytes = bytes.slice(n_written);
|
||||
total_written += n_written;
|
||||
}
|
||||
return total_written;
|
||||
}
|
||||
|
||||
bool DeflateCompressor::is_eof() const
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
bool DeflateCompressor::is_open() const
|
||||
{
|
||||
return m_output_stream->is_open();
|
||||
}
|
||||
|
||||
void DeflateCompressor::close()
|
||||
{
|
||||
}
|
||||
|
||||
// Knuth's multiplicative hash on 4 bytes
|
||||
u16 DeflateCompressor::hash_sequence(u8 const* bytes)
|
||||
{
|
||||
constexpr u32 const knuth_constant = 2654435761; // shares no common factors with 2^32
|
||||
return ((bytes[0] | bytes[1] << 8 | bytes[2] << 16 | bytes[3] << 24) * knuth_constant) >> (32 - hash_bits);
|
||||
}
|
||||
|
||||
size_t DeflateCompressor::compare_match_candidate(size_t start, size_t candidate, size_t previous_match_length, size_t maximum_match_length)
|
||||
{
|
||||
VERIFY(previous_match_length < maximum_match_length);
|
||||
|
||||
// We firstly check that the match is at least (prev_match_length + 1) long, we check backwards as there's a higher chance the end mismatches
|
||||
for (ssize_t i = previous_match_length; i >= 0; i--) {
|
||||
if (m_rolling_window[start + i] != m_rolling_window[candidate + i])
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Find the actual length
|
||||
auto match_length = previous_match_length + 1;
|
||||
while (match_length < maximum_match_length && m_rolling_window[start + match_length] == m_rolling_window[candidate + match_length]) {
|
||||
match_length++;
|
||||
}
|
||||
|
||||
VERIFY(match_length > previous_match_length);
|
||||
VERIFY(match_length <= maximum_match_length);
|
||||
return match_length;
|
||||
}
|
||||
|
||||
size_t DeflateCompressor::find_back_match(size_t start, u16 hash, size_t previous_match_length, size_t maximum_match_length, size_t& match_position)
|
||||
{
|
||||
auto max_chain_length = m_compression_constants.max_chain;
|
||||
if (previous_match_length == 0)
|
||||
previous_match_length = min_match_length - 1; // we only care about matches that are at least min_match_length long
|
||||
if (previous_match_length >= maximum_match_length)
|
||||
return 0; // we can't improve a maximum length match
|
||||
if (previous_match_length >= m_compression_constants.max_lazy_length)
|
||||
return 0; // the previous match is already pretty, we shouldn't waste another full search
|
||||
if (previous_match_length >= m_compression_constants.good_match_length)
|
||||
max_chain_length /= 4; // we already have a pretty good much, so do a shorter search
|
||||
|
||||
auto candidate = m_hash_head[hash];
|
||||
auto match_found = false;
|
||||
while (max_chain_length--) {
|
||||
if (candidate == empty_slot)
|
||||
break; // no remaining candidates
|
||||
|
||||
VERIFY(candidate < start);
|
||||
if (start - candidate > window_size)
|
||||
break; // outside the window
|
||||
|
||||
auto match_length = compare_match_candidate(start, candidate, previous_match_length, maximum_match_length);
|
||||
|
||||
if (match_length != 0) {
|
||||
match_found = true;
|
||||
match_position = candidate;
|
||||
previous_match_length = match_length;
|
||||
|
||||
if (match_length == maximum_match_length)
|
||||
return match_length; // bail if we got the maximum possible length
|
||||
}
|
||||
|
||||
candidate = m_hash_prev[candidate % window_size];
|
||||
}
|
||||
if (!match_found)
|
||||
return 0; // we didn't find any matches
|
||||
return previous_match_length; // we found matches, but they were at most previous_match_length long
|
||||
}
|
||||
|
||||
ALWAYS_INLINE u8 DeflateCompressor::distance_to_base(u16 distance)
|
||||
{
|
||||
return (distance <= 256) ? distance_to_base_lo[distance - 1] : distance_to_base_hi[(distance - 1) >> 7];
|
||||
}
|
||||
|
||||
void DeflateCompressor::lz77_compress_block()
|
||||
{
|
||||
for (auto& slot : m_hash_head) { // initialize chained hash table
|
||||
slot = empty_slot;
|
||||
}
|
||||
|
||||
auto insert_hash = [&](auto pos, auto hash) {
|
||||
auto window_pos = pos % window_size;
|
||||
m_hash_prev[window_pos] = m_hash_head[hash];
|
||||
m_hash_head[hash] = window_pos;
|
||||
};
|
||||
|
||||
auto emit_literal = [&](auto literal) {
|
||||
VERIFY(m_pending_symbol_size <= block_size + 1);
|
||||
auto index = m_pending_symbol_size++;
|
||||
m_symbol_buffer[index].distance = 0;
|
||||
m_symbol_buffer[index].literal = literal;
|
||||
m_symbol_frequencies[literal]++;
|
||||
};
|
||||
|
||||
auto emit_back_reference = [&](auto distance, auto length) {
|
||||
VERIFY(m_pending_symbol_size <= block_size + 1);
|
||||
auto index = m_pending_symbol_size++;
|
||||
m_symbol_buffer[index].distance = distance;
|
||||
m_symbol_buffer[index].length = length;
|
||||
m_symbol_frequencies[length_to_symbol[length]]++;
|
||||
m_distance_frequencies[distance_to_base(distance)]++;
|
||||
};
|
||||
|
||||
size_t previous_match_length = 0;
|
||||
size_t previous_match_position = 0;
|
||||
|
||||
VERIFY(m_compression_constants.great_match_length <= max_match_length);
|
||||
|
||||
// our block starts at block_size and is m_pending_block_size in length
|
||||
auto block_end = block_size + m_pending_block_size;
|
||||
size_t current_position;
|
||||
for (current_position = block_size; current_position < block_end - min_match_length + 1; current_position++) {
|
||||
auto hash = hash_sequence(&m_rolling_window[current_position]);
|
||||
size_t match_position;
|
||||
auto match_length = find_back_match(current_position, hash, previous_match_length,
|
||||
min(m_compression_constants.great_match_length, block_end - current_position), match_position);
|
||||
|
||||
insert_hash(current_position, hash);
|
||||
|
||||
// if the previous match is as good as the new match, just use it
|
||||
if (previous_match_length != 0 && previous_match_length >= match_length) {
|
||||
emit_back_reference((current_position - 1) - previous_match_position, previous_match_length);
|
||||
|
||||
// skip all the bytes that are included in this match
|
||||
for (size_t j = current_position + 1; j < min(current_position - 1 + previous_match_length, block_end - min_match_length + 1); j++) {
|
||||
insert_hash(j, hash_sequence(&m_rolling_window[j]));
|
||||
}
|
||||
current_position = (current_position - 1) + previous_match_length - 1;
|
||||
previous_match_length = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (match_length == 0) {
|
||||
VERIFY(previous_match_length == 0);
|
||||
emit_literal(m_rolling_window[current_position]);
|
||||
continue;
|
||||
}
|
||||
|
||||
// if this is a lazy match, and the new match is better than the old one, output previous as literal
|
||||
if (previous_match_length != 0) {
|
||||
emit_literal(m_rolling_window[current_position - 1]);
|
||||
}
|
||||
|
||||
previous_match_length = match_length;
|
||||
previous_match_position = match_position;
|
||||
}
|
||||
|
||||
// clean up leftover lazy match
|
||||
if (previous_match_length != 0) {
|
||||
emit_back_reference((current_position - 1) - previous_match_position, previous_match_length);
|
||||
current_position = (current_position - 1) + previous_match_length;
|
||||
}
|
||||
|
||||
// output remaining literals
|
||||
while (current_position < block_end) {
|
||||
emit_literal(m_rolling_window[current_position++]);
|
||||
}
|
||||
}
|
||||
|
||||
size_t DeflateCompressor::huffman_block_length(Array<u8, max_huffman_literals> const& literal_bit_lengths, Array<u8, max_huffman_distances> const& distance_bit_lengths)
|
||||
{
|
||||
size_t length = 0;
|
||||
|
||||
for (size_t i = 0; i < 286; i++) {
|
||||
auto frequency = m_symbol_frequencies[i];
|
||||
length += literal_bit_lengths[i] * frequency;
|
||||
|
||||
if (i >= 257) // back reference length symbols
|
||||
length += packed_length_symbols[i - 257].extra_bits * frequency;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < 30; i++) {
|
||||
auto frequency = m_distance_frequencies[i];
|
||||
length += distance_bit_lengths[i] * frequency;
|
||||
length += packed_distances[i].extra_bits * frequency;
|
||||
}
|
||||
|
||||
return length;
|
||||
}
|
||||
|
||||
size_t DeflateCompressor::uncompressed_block_length()
|
||||
{
|
||||
auto padding = 8 - ((m_output_stream->bit_offset() + 3) % 8);
|
||||
// 3 bit block header + align to byte + 2 * 16 bit length fields + block contents
|
||||
return 3 + padding + (2 * 16) + m_pending_block_size * 8;
|
||||
}
|
||||
|
||||
size_t DeflateCompressor::fixed_block_length()
|
||||
{
|
||||
// block header + fixed huffman encoded block contents
|
||||
return 3 + huffman_block_length(fixed_literal_bit_lengths, fixed_distance_bit_lengths);
|
||||
}
|
||||
|
||||
size_t DeflateCompressor::dynamic_block_length(Array<u8, max_huffman_literals> const& literal_bit_lengths, Array<u8, max_huffman_distances> const& distance_bit_lengths, Array<u8, 19> const& code_lengths_bit_lengths, Array<u16, 19> const& code_lengths_frequencies, size_t code_lengths_count)
|
||||
{
|
||||
// block header + literal code count + distance code count + code length count
|
||||
auto length = 3 + 5 + 5 + 4;
|
||||
|
||||
// 3 bits per code_length
|
||||
length += 3 * code_lengths_count;
|
||||
|
||||
for (size_t i = 0; i < code_lengths_frequencies.size(); i++) {
|
||||
auto frequency = code_lengths_frequencies[i];
|
||||
length += code_lengths_bit_lengths[i] * frequency;
|
||||
|
||||
if (i == deflate_special_code_length_copy) {
|
||||
length += 2 * frequency;
|
||||
} else if (i == deflate_special_code_length_zeros) {
|
||||
length += 3 * frequency;
|
||||
} else if (i == deflate_special_code_length_long_zeros) {
|
||||
length += 7 * frequency;
|
||||
}
|
||||
}
|
||||
|
||||
return length + huffman_block_length(literal_bit_lengths, distance_bit_lengths);
|
||||
}
|
||||
|
||||
ErrorOr<void> DeflateCompressor::write_huffman(CanonicalCode const& literal_code, Optional<CanonicalCode> const& distance_code)
|
||||
{
|
||||
auto has_distances = distance_code.has_value();
|
||||
for (size_t i = 0; i < m_pending_symbol_size; i++) {
|
||||
if (m_symbol_buffer[i].distance == 0) {
|
||||
TRY(literal_code.write_symbol(*m_output_stream, m_symbol_buffer[i].literal));
|
||||
continue;
|
||||
}
|
||||
VERIFY(has_distances);
|
||||
auto symbol = length_to_symbol[m_symbol_buffer[i].length];
|
||||
TRY(literal_code.write_symbol(*m_output_stream, symbol));
|
||||
// Emit extra bits if needed
|
||||
TRY(m_output_stream->write_bits<u16>(m_symbol_buffer[i].length - packed_length_symbols[symbol - 257].base_length, packed_length_symbols[symbol - 257].extra_bits));
|
||||
|
||||
auto base_distance = distance_to_base(m_symbol_buffer[i].distance);
|
||||
TRY(distance_code.value().write_symbol(*m_output_stream, base_distance));
|
||||
// Emit extra bits if needed
|
||||
TRY(m_output_stream->write_bits<u16>(m_symbol_buffer[i].distance - packed_distances[base_distance].base_distance, packed_distances[base_distance].extra_bits));
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
size_t DeflateCompressor::encode_huffman_lengths(ReadonlyBytes lengths, Array<code_length_symbol, max_huffman_literals + max_huffman_distances>& encoded_lengths)
|
||||
{
|
||||
size_t encoded_count = 0;
|
||||
size_t i = 0;
|
||||
while (i < lengths.size()) {
|
||||
if (lengths[i] == 0) {
|
||||
auto zero_count = 0;
|
||||
for (size_t j = i; j < min(lengths.size(), i + 138) && lengths[j] == 0; j++)
|
||||
zero_count++;
|
||||
|
||||
if (zero_count < 3) { // below minimum repeated zero count
|
||||
encoded_lengths[encoded_count++].symbol = 0;
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (zero_count <= 10) {
|
||||
encoded_lengths[encoded_count].symbol = deflate_special_code_length_zeros;
|
||||
encoded_lengths[encoded_count++].count = zero_count;
|
||||
} else {
|
||||
encoded_lengths[encoded_count].symbol = deflate_special_code_length_long_zeros;
|
||||
encoded_lengths[encoded_count++].count = zero_count;
|
||||
}
|
||||
i += zero_count;
|
||||
continue;
|
||||
}
|
||||
|
||||
encoded_lengths[encoded_count++].symbol = lengths[i++];
|
||||
|
||||
auto copy_count = 0;
|
||||
for (size_t j = i; j < min(lengths.size(), i + 6) && lengths[j] == lengths[i - 1]; j++)
|
||||
copy_count++;
|
||||
|
||||
if (copy_count >= 3) {
|
||||
encoded_lengths[encoded_count].symbol = deflate_special_code_length_copy;
|
||||
encoded_lengths[encoded_count++].count = copy_count;
|
||||
i += copy_count;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return encoded_count;
|
||||
}
|
||||
|
||||
size_t DeflateCompressor::encode_block_lengths(Array<u8, max_huffman_literals> const& literal_bit_lengths, Array<u8, max_huffman_distances> const& distance_bit_lengths, Array<code_length_symbol, max_huffman_literals + max_huffman_distances>& encoded_lengths, size_t& literal_code_count, size_t& distance_code_count)
|
||||
{
|
||||
literal_code_count = max_huffman_literals;
|
||||
distance_code_count = max_huffman_distances;
|
||||
|
||||
VERIFY(literal_bit_lengths[EndOfBlock] != 0); // Make sure at least the EndOfBlock marker is present
|
||||
while (literal_bit_lengths[literal_code_count - 1] == 0)
|
||||
literal_code_count--;
|
||||
|
||||
// Drop trailing zero lengths, keeping at least one
|
||||
while (distance_bit_lengths[distance_code_count - 1] == 0 && distance_code_count > 1)
|
||||
distance_code_count--;
|
||||
|
||||
Array<u8, max_huffman_literals + max_huffman_distances> all_lengths {};
|
||||
for (size_t i = 0; i < literal_code_count; i++)
|
||||
all_lengths[i] = literal_bit_lengths[i];
|
||||
for (size_t i = 0; i < distance_code_count; i++)
|
||||
all_lengths[literal_code_count + i] = distance_bit_lengths[i];
|
||||
|
||||
return encode_huffman_lengths(all_lengths.span().trim(literal_code_count + distance_code_count), encoded_lengths);
|
||||
}
|
||||
|
||||
ErrorOr<void> DeflateCompressor::write_dynamic_huffman(CanonicalCode const& literal_code, size_t literal_code_count, Optional<CanonicalCode> const& distance_code, size_t distance_code_count, Array<u8, 19> const& code_lengths_bit_lengths, size_t code_length_count, Array<code_length_symbol, max_huffman_literals + max_huffman_distances> const& encoded_lengths, size_t encoded_lengths_count)
|
||||
{
|
||||
TRY(m_output_stream->write_bits(literal_code_count - 257, 5));
|
||||
TRY(m_output_stream->write_bits(distance_code_count - 1, 5));
|
||||
TRY(m_output_stream->write_bits(code_length_count - 4, 4));
|
||||
|
||||
for (size_t i = 0; i < code_length_count; i++) {
|
||||
TRY(m_output_stream->write_bits(code_lengths_bit_lengths[code_lengths_code_lengths_order[i]], 3));
|
||||
}
|
||||
|
||||
auto code_lengths_code = MUST(CanonicalCode::from_bytes(code_lengths_bit_lengths));
|
||||
for (size_t i = 0; i < encoded_lengths_count; i++) {
|
||||
auto encoded_length = encoded_lengths[i];
|
||||
TRY(code_lengths_code.write_symbol(*m_output_stream, encoded_length.symbol));
|
||||
if (encoded_length.symbol == deflate_special_code_length_copy) {
|
||||
TRY(m_output_stream->write_bits<u8>(encoded_length.count - 3, 2));
|
||||
} else if (encoded_length.symbol == deflate_special_code_length_zeros) {
|
||||
TRY(m_output_stream->write_bits<u8>(encoded_length.count - 3, 3));
|
||||
} else if (encoded_length.symbol == deflate_special_code_length_long_zeros) {
|
||||
TRY(m_output_stream->write_bits<u8>(encoded_length.count - 11, 7));
|
||||
}
|
||||
}
|
||||
|
||||
TRY(write_huffman(literal_code, distance_code));
|
||||
return {};
|
||||
}
|
||||
|
||||
ErrorOr<void> DeflateCompressor::flush()
|
||||
{
|
||||
TRY(m_output_stream->write_bits(m_finished, 1));
|
||||
|
||||
// if this is just an empty block to signify the end of the deflate stream use the smallest block possible (10 bits total)
|
||||
if (m_pending_block_size == 0) {
|
||||
VERIFY(m_finished); // we shouldn't be writing empty blocks unless this is the final one
|
||||
TRY(m_output_stream->write_bits(0b01u, 2)); // fixed huffman codes
|
||||
TRY(m_output_stream->write_bits(0b0000000u, 7)); // end of block symbol
|
||||
TRY(m_output_stream->align_to_byte_boundary());
|
||||
return {};
|
||||
}
|
||||
|
||||
auto write_uncompressed = [&]() -> ErrorOr<void> {
|
||||
TRY(m_output_stream->write_bits(0b00u, 2)); // no compression
|
||||
TRY(m_output_stream->align_to_byte_boundary());
|
||||
TRY(m_output_stream->write_value<LittleEndian<u16>>(m_pending_block_size));
|
||||
TRY(m_output_stream->write_value<LittleEndian<u16>>(~m_pending_block_size));
|
||||
TRY(m_output_stream->write_until_depleted(pending_block().slice(0, m_pending_block_size)));
|
||||
return {};
|
||||
};
|
||||
|
||||
if (m_compression_level == CompressionLevel::STORE) { // disabled compression fast path
|
||||
TRY(write_uncompressed());
|
||||
m_pending_block_size = 0;
|
||||
return {};
|
||||
}
|
||||
|
||||
// The following implementation of lz77 compression and huffman encoding is based on the reference implementation by Hans Wennborg https://www.hanshq.net/zip.html
|
||||
|
||||
// this reads from the pending block and writes to m_symbol_buffer
|
||||
lz77_compress_block();
|
||||
|
||||
// insert EndOfBlock marker to the symbol buffer
|
||||
m_symbol_buffer[m_pending_symbol_size].distance = 0;
|
||||
m_symbol_buffer[m_pending_symbol_size++].literal = EndOfBlock;
|
||||
m_symbol_frequencies[EndOfBlock]++;
|
||||
|
||||
// generate optimal dynamic huffman code lengths
|
||||
Array<u8, max_huffman_literals> dynamic_literal_bit_lengths {};
|
||||
Array<u8, max_huffman_distances> dynamic_distance_bit_lengths {};
|
||||
generate_huffman_lengths(dynamic_literal_bit_lengths, m_symbol_frequencies, 15); // deflate data huffman can use up to 15 bits per symbol
|
||||
generate_huffman_lengths(dynamic_distance_bit_lengths, m_distance_frequencies, 15);
|
||||
|
||||
// encode literal and distance lengths together in deflate format
|
||||
Array<code_length_symbol, max_huffman_literals + max_huffman_distances> encoded_lengths {};
|
||||
size_t literal_code_count;
|
||||
size_t distance_code_count;
|
||||
auto encoded_lengths_count = encode_block_lengths(dynamic_literal_bit_lengths, dynamic_distance_bit_lengths, encoded_lengths, literal_code_count, distance_code_count);
|
||||
|
||||
// count code length frequencies
|
||||
Array<u16, 19> code_lengths_frequencies { 0 };
|
||||
for (size_t i = 0; i < encoded_lengths_count; i++) {
|
||||
code_lengths_frequencies[encoded_lengths[i].symbol]++;
|
||||
}
|
||||
// generate optimal huffman code lengths code lengths
|
||||
Array<u8, 19> code_lengths_bit_lengths {};
|
||||
generate_huffman_lengths(code_lengths_bit_lengths, code_lengths_frequencies, 7); // deflate code length huffman can use up to 7 bits per symbol
|
||||
// calculate actual code length code lengths count (without trailing zeros)
|
||||
auto code_lengths_count = code_lengths_bit_lengths.size();
|
||||
while (code_lengths_bit_lengths[code_lengths_code_lengths_order[code_lengths_count - 1]] == 0)
|
||||
code_lengths_count--;
|
||||
|
||||
auto uncompressed_size = uncompressed_block_length();
|
||||
auto fixed_huffman_size = fixed_block_length();
|
||||
auto dynamic_huffman_size = dynamic_block_length(dynamic_literal_bit_lengths, dynamic_distance_bit_lengths, code_lengths_bit_lengths, code_lengths_frequencies, code_lengths_count);
|
||||
|
||||
// If the compression somehow didn't reduce the size enough, just write out the block uncompressed as it allows for much faster decompression
|
||||
if (uncompressed_size <= min(fixed_huffman_size, dynamic_huffman_size)) {
|
||||
TRY(write_uncompressed());
|
||||
} else if (fixed_huffman_size <= dynamic_huffman_size) {
|
||||
// If the fixed and dynamic huffman codes come out the same size, prefer the fixed version, as it takes less time to decode fixed huffman codes.
|
||||
TRY(m_output_stream->write_bits(0b01u, 2));
|
||||
TRY(write_huffman(CanonicalCode::fixed_literal_codes(), CanonicalCode::fixed_distance_codes()));
|
||||
} else {
|
||||
// dynamic huffman codes
|
||||
TRY(m_output_stream->write_bits(0b10u, 2));
|
||||
auto literal_code = MUST(CanonicalCode::from_bytes(dynamic_literal_bit_lengths));
|
||||
auto distance_code_or_error = CanonicalCode::from_bytes(dynamic_distance_bit_lengths);
|
||||
Optional<CanonicalCode> distance_code;
|
||||
if (!distance_code_or_error.is_error())
|
||||
distance_code = distance_code_or_error.release_value();
|
||||
TRY(write_dynamic_huffman(literal_code, literal_code_count, distance_code, distance_code_count, code_lengths_bit_lengths, code_lengths_count, encoded_lengths, encoded_lengths_count));
|
||||
}
|
||||
if (m_finished)
|
||||
TRY(m_output_stream->align_to_byte_boundary());
|
||||
|
||||
// reset all block specific members
|
||||
m_pending_block_size = 0;
|
||||
m_pending_symbol_size = 0;
|
||||
m_symbol_frequencies.fill(0);
|
||||
m_distance_frequencies.fill(0);
|
||||
// On the final block this copy will potentially produce an invalid search window, but since its the final block we dont care
|
||||
pending_block().copy_trimmed_to({ m_rolling_window, block_size });
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
ErrorOr<void> DeflateCompressor::final_flush()
|
||||
{
|
||||
VERIFY(!m_finished);
|
||||
m_finished = true;
|
||||
TRY(flush());
|
||||
TRY(m_output_stream->flush_buffer_to_stream());
|
||||
return {};
|
||||
}
|
||||
|
||||
ErrorOr<ByteBuffer> DeflateCompressor::compress_all(ReadonlyBytes bytes, CompressionLevel compression_level)
|
||||
{
|
||||
auto output_stream = TRY(try_make<AllocatingMemoryStream>());
|
||||
auto deflate_stream = TRY(DeflateCompressor::construct(MaybeOwned<Stream>(*output_stream), compression_level));
|
||||
|
||||
TRY(deflate_stream->write_until_depleted(bytes));
|
||||
TRY(deflate_stream->final_flush());
|
||||
|
||||
auto buffer = TRY(ByteBuffer::create_uninitialized(output_stream->used_buffer_size()));
|
||||
TRY(output_stream->read_until_filled(buffer));
|
||||
|
||||
return buffer;
|
||||
return ::Compress::compress_all<DeflateCompressor>(bytes, compression_level);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
/*
|
||||
* Copyright (c) 2020, the SerenityOS developers.
|
||||
* Copyright (c) 2021, Idan Horowitz <idan.horowitz@serenityos.org>
|
||||
* Copyright (c) 2025, Altomani Gianluca <altomanigianluca@gmail.com>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
@ -8,14 +9,8 @@
|
|||
#pragma once
|
||||
|
||||
#include <AK/BitStream.h>
|
||||
#include <AK/ByteBuffer.h>
|
||||
#include <AK/CircularBuffer.h>
|
||||
#include <AK/Endian.h>
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/MaybeOwned.h>
|
||||
#include <AK/Stream.h>
|
||||
#include <AK/Vector.h>
|
||||
#include <LibCompress/DeflateTables.h>
|
||||
#include <LibCompress/Zlib.h>
|
||||
|
||||
namespace Compress {
|
||||
|
||||
|
@ -60,172 +55,28 @@ ALWAYS_INLINE ErrorOr<void> CanonicalCode::write_symbol(LittleEndianOutputBitStr
|
|||
return {};
|
||||
}
|
||||
|
||||
class DeflateDecompressor final : public Stream {
|
||||
private:
|
||||
class CompressedBlock {
|
||||
public:
|
||||
CompressedBlock(DeflateDecompressor&, CanonicalCode literal_codes, Optional<CanonicalCode> distance_codes);
|
||||
|
||||
ErrorOr<bool> try_read_more();
|
||||
|
||||
private:
|
||||
bool m_eof { false };
|
||||
|
||||
DeflateDecompressor& m_decompressor;
|
||||
CanonicalCode m_literal_codes;
|
||||
Optional<CanonicalCode> m_distance_codes;
|
||||
};
|
||||
|
||||
class UncompressedBlock {
|
||||
public:
|
||||
UncompressedBlock(DeflateDecompressor&, size_t);
|
||||
|
||||
ErrorOr<bool> try_read_more();
|
||||
|
||||
private:
|
||||
DeflateDecompressor& m_decompressor;
|
||||
size_t m_bytes_remaining;
|
||||
};
|
||||
|
||||
enum class State {
|
||||
Idle,
|
||||
ReadingCompressedBlock,
|
||||
ReadingUncompressedBlock
|
||||
};
|
||||
|
||||
class DeflateDecompressor final : public GenericZlibDecompressor {
|
||||
public:
|
||||
friend CompressedBlock;
|
||||
friend UncompressedBlock;
|
||||
|
||||
static ErrorOr<NonnullOwnPtr<DeflateDecompressor>> construct(MaybeOwned<LittleEndianInputBitStream> stream);
|
||||
~DeflateDecompressor();
|
||||
|
||||
virtual ErrorOr<Bytes> read_some(Bytes) override;
|
||||
virtual ErrorOr<size_t> write_some(ReadonlyBytes) override;
|
||||
virtual bool is_eof() const override;
|
||||
virtual bool is_open() const override;
|
||||
virtual void close() override;
|
||||
|
||||
static ErrorOr<NonnullOwnPtr<DeflateDecompressor>> create(MaybeOwned<Stream>);
|
||||
static ErrorOr<ByteBuffer> decompress_all(ReadonlyBytes);
|
||||
|
||||
private:
|
||||
DeflateDecompressor(MaybeOwned<LittleEndianInputBitStream> stream, CircularBuffer buffer);
|
||||
|
||||
ErrorOr<u32> decode_length(u32);
|
||||
ErrorOr<u32> decode_distance(u32);
|
||||
ErrorOr<void> decode_codes(CanonicalCode& literal_code, Optional<CanonicalCode>& distance_code);
|
||||
|
||||
static constexpr u16 max_back_reference_length = 258;
|
||||
|
||||
bool m_read_final_block { false };
|
||||
|
||||
State m_state { State::Idle };
|
||||
union {
|
||||
CompressedBlock m_compressed_block;
|
||||
UncompressedBlock m_uncompressed_block;
|
||||
};
|
||||
|
||||
MaybeOwned<LittleEndianInputBitStream> m_input_stream;
|
||||
CircularBuffer m_output_buffer;
|
||||
DeflateDecompressor(AK::FixedArray<u8> buffer, MaybeOwned<Stream> stream, z_stream* zstream)
|
||||
: GenericZlibDecompressor(move(buffer), move(stream), zstream)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
class DeflateCompressor final : public Stream {
|
||||
class DeflateCompressor final : public GenericZlibCompressor {
|
||||
public:
|
||||
static constexpr size_t block_size = 32 * KiB - 1; // TODO: this can theoretically be increased to 64 KiB - 2
|
||||
static constexpr size_t window_size = block_size * 2;
|
||||
static constexpr size_t hash_bits = 15;
|
||||
static constexpr size_t max_huffman_literals = 288;
|
||||
static constexpr size_t max_huffman_distances = 32;
|
||||
static constexpr size_t min_match_length = 4; // matches smaller than these are not worth the size of the back reference
|
||||
static constexpr size_t max_match_length = 258; // matches longer than these cannot be encoded using huffman codes
|
||||
static constexpr u16 empty_slot = UINT16_MAX;
|
||||
|
||||
struct CompressionConstants {
|
||||
size_t good_match_length; // Once we find a match of at least this length (a good enough match) we reduce max_chain to lower processing time
|
||||
size_t max_lazy_length; // If the match is at least this long we dont defer matching to the next byte (which takes time) as its good enough
|
||||
size_t great_match_length; // Once we find a match of at least this length (a great match) we can just stop searching for longer ones
|
||||
size_t max_chain; // We only check the actual length of the max_chain closest matches
|
||||
};
|
||||
|
||||
// These constants were shamelessly "borrowed" from zlib
|
||||
static constexpr CompressionConstants compression_constants[] = {
|
||||
{ 0, 0, 0, 0 },
|
||||
{ 4, 4, 8, 4 },
|
||||
{ 8, 16, 128, 128 },
|
||||
{ 32, 258, 258, 4096 },
|
||||
{ max_match_length, max_match_length, max_match_length, 1 << hash_bits } // disable all limits
|
||||
};
|
||||
|
||||
enum class CompressionLevel : int {
|
||||
STORE = 0,
|
||||
FAST,
|
||||
GOOD,
|
||||
GREAT,
|
||||
BEST // WARNING: this one can take an unreasonable amount of time!
|
||||
};
|
||||
|
||||
static ErrorOr<NonnullOwnPtr<DeflateCompressor>> construct(MaybeOwned<Stream>, CompressionLevel = CompressionLevel::GOOD);
|
||||
~DeflateCompressor();
|
||||
|
||||
virtual ErrorOr<Bytes> read_some(Bytes) override;
|
||||
virtual ErrorOr<size_t> write_some(ReadonlyBytes) override;
|
||||
virtual bool is_eof() const override;
|
||||
virtual bool is_open() const override;
|
||||
virtual void close() override;
|
||||
ErrorOr<void> final_flush();
|
||||
|
||||
static ErrorOr<ByteBuffer> compress_all(ReadonlyBytes bytes, CompressionLevel = CompressionLevel::GOOD);
|
||||
static ErrorOr<NonnullOwnPtr<DeflateCompressor>> create(MaybeOwned<Stream>, GenericZlibCompressionLevel = GenericZlibCompressionLevel::Default);
|
||||
static ErrorOr<ByteBuffer> compress_all(ReadonlyBytes, GenericZlibCompressionLevel = GenericZlibCompressionLevel::Default);
|
||||
|
||||
private:
|
||||
DeflateCompressor(NonnullOwnPtr<LittleEndianOutputBitStream>, CompressionLevel = CompressionLevel::GOOD);
|
||||
|
||||
Bytes pending_block() { return { m_rolling_window + block_size, block_size }; }
|
||||
|
||||
// LZ77 Compression
|
||||
static u16 hash_sequence(u8 const* bytes);
|
||||
size_t compare_match_candidate(size_t start, size_t candidate, size_t prev_match_length, size_t max_match_length);
|
||||
size_t find_back_match(size_t start, u16 hash, size_t previous_match_length, size_t max_match_length, size_t& match_position);
|
||||
void lz77_compress_block();
|
||||
|
||||
// Huffman Coding
|
||||
struct code_length_symbol {
|
||||
u8 symbol;
|
||||
u8 count; // used for special symbols 16-18
|
||||
};
|
||||
static u8 distance_to_base(u16 distance);
|
||||
size_t huffman_block_length(Array<u8, max_huffman_literals> const& literal_bit_lengths, Array<u8, max_huffman_distances> const& distance_bit_lengths);
|
||||
ErrorOr<void> write_huffman(CanonicalCode const& literal_code, Optional<CanonicalCode> const& distance_code);
|
||||
static size_t encode_huffman_lengths(ReadonlyBytes lengths, Array<code_length_symbol, max_huffman_literals + max_huffman_distances>& encoded_lengths);
|
||||
size_t encode_block_lengths(Array<u8, max_huffman_literals> const& literal_bit_lengths, Array<u8, max_huffman_distances> const& distance_bit_lengths, Array<code_length_symbol, max_huffman_literals + max_huffman_distances>& encoded_lengths, size_t& literal_code_count, size_t& distance_code_count);
|
||||
ErrorOr<void> write_dynamic_huffman(CanonicalCode const& literal_code, size_t literal_code_count, Optional<CanonicalCode> const& distance_code, size_t distance_code_count, Array<u8, 19> const& code_lengths_bit_lengths, size_t code_length_count, Array<code_length_symbol, max_huffman_literals + max_huffman_distances> const& encoded_lengths, size_t encoded_lengths_count);
|
||||
|
||||
size_t uncompressed_block_length();
|
||||
size_t fixed_block_length();
|
||||
size_t dynamic_block_length(Array<u8, max_huffman_literals> const& literal_bit_lengths, Array<u8, max_huffman_distances> const& distance_bit_lengths, Array<u8, 19> const& code_lengths_bit_lengths, Array<u16, 19> const& code_lengths_frequencies, size_t code_lengths_count);
|
||||
ErrorOr<void> flush();
|
||||
|
||||
bool m_finished { false };
|
||||
CompressionLevel m_compression_level;
|
||||
CompressionConstants m_compression_constants;
|
||||
NonnullOwnPtr<LittleEndianOutputBitStream> m_output_stream;
|
||||
|
||||
u8 m_rolling_window[window_size];
|
||||
size_t m_pending_block_size { 0 };
|
||||
|
||||
struct [[gnu::packed]] {
|
||||
u16 distance; // back reference length
|
||||
union {
|
||||
u16 literal; // literal byte or on of block symbol
|
||||
u16 length; // back reference length (if distance != 0)
|
||||
};
|
||||
} m_symbol_buffer[block_size + 1];
|
||||
size_t m_pending_symbol_size { 0 };
|
||||
Array<u16, max_huffman_literals> m_symbol_frequencies; // there are 286 valid symbol values (symbols 286-287 never occur)
|
||||
Array<u16, max_huffman_distances> m_distance_frequencies; // there are 30 valid distance values (distances 30-31 never occur)
|
||||
|
||||
// LZ77 Chained hash table
|
||||
u16 m_hash_head[1 << hash_bits];
|
||||
u16 m_hash_prev[window_size];
|
||||
DeflateCompressor(AK::FixedArray<u8> buffer, MaybeOwned<Stream> stream, z_stream* zstream)
|
||||
: GenericZlibCompressor(move(buffer), move(stream), zstream)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
|
||||
#include <AK/BitStream.h>
|
||||
#include <AK/Debug.h>
|
||||
#include <AK/HashMap.h>
|
||||
#include <AK/HashTable.h>
|
||||
#include <AK/MemoryStream.h>
|
||||
#include <AK/QuickSort.h>
|
||||
|
|
|
@ -33,7 +33,7 @@ WebIDL::ExceptionOr<GC::Ref<CompressionStream>> CompressionStream::construct_imp
|
|||
case Bindings::CompressionFormat::Deflate:
|
||||
return TRY(Compress::ZlibCompressor::create(move(input_stream)));
|
||||
case Bindings::CompressionFormat::DeflateRaw:
|
||||
return TRY(Compress::DeflateCompressor::construct(make<LittleEndianInputBitStream>(move(input_stream))));
|
||||
return TRY(Compress::DeflateCompressor::create(move(input_stream)));
|
||||
case Bindings::CompressionFormat::Gzip:
|
||||
return TRY(Compress::GzipCompressor::create(move(input_stream)));
|
||||
}
|
||||
|
@ -166,16 +166,9 @@ ErrorOr<ByteBuffer> CompressionStream::compress(ReadonlyBytes bytes, Finish fini
|
|||
}));
|
||||
|
||||
if (finish == Finish::Yes) {
|
||||
TRY(m_compressor.visit(
|
||||
[&](NonnullOwnPtr<Compress::ZlibCompressor> const& compressor) {
|
||||
return compressor->finish();
|
||||
},
|
||||
[&](NonnullOwnPtr<Compress::DeflateCompressor> const& compressor) {
|
||||
return compressor->final_flush();
|
||||
},
|
||||
[&](NonnullOwnPtr<Compress::GzipCompressor> const& compressor) {
|
||||
return compressor->finish();
|
||||
}));
|
||||
TRY(m_compressor.visit([](auto const& compressor) {
|
||||
return compressor->finish();
|
||||
}));
|
||||
}
|
||||
|
||||
auto buffer = TRY(ByteBuffer::create_uninitialized(m_output_stream->used_buffer_size()));
|
||||
|
|
|
@ -34,7 +34,7 @@ WebIDL::ExceptionOr<GC::Ref<DecompressionStream>> DecompressionStream::construct
|
|||
case Bindings::CompressionFormat::Deflate:
|
||||
return TRY(Compress::ZlibDecompressor::create(move(input_stream)));
|
||||
case Bindings::CompressionFormat::DeflateRaw:
|
||||
return TRY(Compress::DeflateDecompressor::construct(make<LittleEndianInputBitStream>(move(input_stream))));
|
||||
return TRY(Compress::DeflateDecompressor::create(move(input_stream)));
|
||||
case Bindings::CompressionFormat::Gzip:
|
||||
return TRY(Compress::GzipDecompressor::create((move(input_stream))));
|
||||
}
|
||||
|
|
|
@ -4,17 +4,12 @@
|
|||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <LibTest/TestCase.h>
|
||||
|
||||
#include <AK/Array.h>
|
||||
#include <AK/BitStream.h>
|
||||
#include <AK/MemoryStream.h>
|
||||
#include <AK/Random.h>
|
||||
#include <LibCompress/Deflate.h>
|
||||
#include <LibCore/File.h>
|
||||
#include <cstring>
|
||||
|
||||
#define TEST_INPUT(x) ("deflate-test-files/" x)
|
||||
#include <LibTest/TestCase.h>
|
||||
|
||||
TEST_CASE(canonical_code_simple)
|
||||
{
|
||||
|
@ -30,7 +25,7 @@ TEST_CASE(canonical_code_simple)
|
|||
0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d, 0x15
|
||||
};
|
||||
|
||||
auto const huffman = Compress::CanonicalCode::from_bytes(code).value();
|
||||
auto const huffman = TRY_OR_FAIL(Compress::CanonicalCode::from_bytes(code));
|
||||
auto memory_stream = MUST(try_make<FixedMemoryStream>(input));
|
||||
LittleEndianInputBitStream bit_stream { move(memory_stream) };
|
||||
|
||||
|
@ -50,7 +45,7 @@ TEST_CASE(canonical_code_complex)
|
|||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05
|
||||
};
|
||||
|
||||
auto const huffman = Compress::CanonicalCode::from_bytes(code).value();
|
||||
auto const huffman = TRY_OR_FAIL(Compress::CanonicalCode::from_bytes(code));
|
||||
auto memory_stream = MUST(try_make<FixedMemoryStream>(input));
|
||||
LittleEndianInputBitStream bit_stream { move(memory_stream) };
|
||||
|
||||
|
@ -75,8 +70,8 @@ TEST_CASE(deflate_decompress_compressed_block)
|
|||
|
||||
u8 const uncompressed[] = "This is a simple text file :)";
|
||||
|
||||
auto const decompressed = Compress::DeflateDecompressor::decompress_all(compressed);
|
||||
EXPECT(decompressed.value().bytes() == ReadonlyBytes({ uncompressed, sizeof(uncompressed) - 1 }));
|
||||
auto const decompressed = TRY_OR_FAIL(Compress::DeflateDecompressor::decompress_all(compressed));
|
||||
EXPECT(decompressed.bytes() == ReadonlyBytes({ uncompressed, sizeof(uncompressed) - 1 }));
|
||||
}
|
||||
|
||||
TEST_CASE(deflate_decompress_uncompressed_block)
|
||||
|
@ -88,13 +83,13 @@ TEST_CASE(deflate_decompress_uncompressed_block)
|
|||
|
||||
u8 const uncompressed[] = "Hello, World!";
|
||||
|
||||
auto const decompressed = Compress::DeflateDecompressor::decompress_all(compressed);
|
||||
EXPECT(decompressed.value().bytes() == (ReadonlyBytes { uncompressed, sizeof(uncompressed) - 1 }));
|
||||
auto const decompressed = TRY_OR_FAIL(Compress::DeflateDecompressor::decompress_all(compressed));
|
||||
EXPECT(decompressed.bytes() == (ReadonlyBytes { uncompressed, sizeof(uncompressed) - 1 }));
|
||||
}
|
||||
|
||||
TEST_CASE(deflate_decompress_multiple_blocks)
|
||||
{
|
||||
Array<u8, 84> const compressed {
|
||||
Array<u8, 72> const compressed {
|
||||
0x00, 0x1f, 0x00, 0xe0, 0xff, 0x54, 0x68, 0x65, 0x20, 0x66, 0x69, 0x72,
|
||||
0x73, 0x74, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x20, 0x69, 0x73, 0x20,
|
||||
0x75, 0x6e, 0x63, 0x6f, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73, 0x65, 0x64,
|
||||
|
@ -105,8 +100,8 @@ TEST_CASE(deflate_decompress_multiple_blocks)
|
|||
|
||||
u8 const uncompressed[] = "The first block is uncompressed and the second block is compressed.";
|
||||
|
||||
auto const decompressed = Compress::DeflateDecompressor::decompress_all(compressed);
|
||||
EXPECT(decompressed.value().bytes() == (ReadonlyBytes { uncompressed, sizeof(uncompressed) - 1 }));
|
||||
auto const decompressed = TRY_OR_FAIL(Compress::DeflateDecompressor::decompress_all(compressed));
|
||||
EXPECT(decompressed.bytes() == (ReadonlyBytes { uncompressed, sizeof(uncompressed) - 1 }));
|
||||
}
|
||||
|
||||
TEST_CASE(deflate_decompress_zeroes)
|
||||
|
@ -118,15 +113,15 @@ TEST_CASE(deflate_decompress_zeroes)
|
|||
|
||||
Array<u8, 4096> const uncompressed { 0 };
|
||||
|
||||
auto const decompressed = Compress::DeflateDecompressor::decompress_all(compressed);
|
||||
EXPECT(uncompressed == decompressed.value().bytes());
|
||||
auto const decompressed = TRY_OR_FAIL(Compress::DeflateDecompressor::decompress_all(compressed));
|
||||
EXPECT(uncompressed == decompressed.bytes());
|
||||
}
|
||||
|
||||
TEST_CASE(deflate_round_trip_store)
|
||||
{
|
||||
auto original = ByteBuffer::create_uninitialized(1024).release_value();
|
||||
fill_with_random(original);
|
||||
auto compressed = TRY_OR_FAIL(Compress::DeflateCompressor::compress_all(original, Compress::DeflateCompressor::CompressionLevel::STORE));
|
||||
auto compressed = TRY_OR_FAIL(Compress::DeflateCompressor::compress_all(original));
|
||||
auto uncompressed = TRY_OR_FAIL(Compress::DeflateDecompressor::decompress_all(compressed));
|
||||
EXPECT(uncompressed == original);
|
||||
}
|
||||
|
@ -136,19 +131,21 @@ TEST_CASE(deflate_round_trip_compress)
|
|||
auto original = ByteBuffer::create_zeroed(2048).release_value();
|
||||
fill_with_random(original.bytes().trim(1024)); // we pre-filled the second half with 0s to make sure we test back references as well
|
||||
// Since the different levels just change how much time is spent looking for better matches, just use fast here to reduce test time
|
||||
auto compressed = TRY_OR_FAIL(Compress::DeflateCompressor::compress_all(original, Compress::DeflateCompressor::CompressionLevel::FAST));
|
||||
auto compressed = TRY_OR_FAIL(Compress::DeflateCompressor::compress_all(original));
|
||||
auto uncompressed = TRY_OR_FAIL(Compress::DeflateDecompressor::decompress_all(compressed));
|
||||
EXPECT(uncompressed == original);
|
||||
}
|
||||
|
||||
TEST_CASE(deflate_round_trip_compress_large)
|
||||
{
|
||||
auto size = Compress::DeflateCompressor::block_size * 2;
|
||||
auto original = ByteBuffer::create_uninitialized(size).release_value(); // Compress a buffer larger than the maximum block size to test the sliding window mechanism
|
||||
// Compress a buffer larger than the maximum block size to test the sliding window mechanism
|
||||
auto original = TRY_OR_FAIL(ByteBuffer::create_uninitialized((32 * KiB - 1) * 2));
|
||||
fill_with_random(original);
|
||||
|
||||
// Since the different levels just change how much time is spent looking for better matches, just use fast here to reduce test time
|
||||
auto compressed = TRY_OR_FAIL(Compress::DeflateCompressor::compress_all(original, Compress::DeflateCompressor::CompressionLevel::FAST));
|
||||
auto compressed = TRY_OR_FAIL(Compress::DeflateCompressor::compress_all(original));
|
||||
auto uncompressed = TRY_OR_FAIL(Compress::DeflateDecompressor::decompress_all(compressed));
|
||||
EXPECT(uncompressed.span() == original.span().slice(0, uncompressed.size()));
|
||||
EXPECT(uncompressed == original);
|
||||
}
|
||||
|
||||
|
@ -156,23 +153,5 @@ TEST_CASE(deflate_compress_literals)
|
|||
{
|
||||
// This byte array is known to not produce any back references with our lz77 implementation even at the highest compression settings
|
||||
Array<u8, 0x13> test { 0, 0, 0, 0, 0x72, 0, 0, 0xee, 0, 0, 0, 0x26, 0, 0, 0, 0x28, 0, 0, 0x72 };
|
||||
auto compressed = TRY_OR_FAIL(Compress::DeflateCompressor::compress_all(test, Compress::DeflateCompressor::CompressionLevel::GOOD));
|
||||
}
|
||||
|
||||
TEST_CASE(ossfuzz_63183)
|
||||
{
|
||||
auto path = TEST_INPUT("clusterfuzz-testcase-minimized-FuzzDeflateCompression-6163230961303552.fuzz"sv);
|
||||
auto test_file = MUST(Core::File::open(path, Core::File::OpenMode::Read));
|
||||
auto test_data = MUST(test_file->read_until_eof());
|
||||
auto compressed = TRY_OR_FAIL(Compress::DeflateCompressor::compress_all(test_data, Compress::DeflateCompressor::CompressionLevel::GOOD));
|
||||
auto decompressed = TRY_OR_FAIL(Compress::DeflateDecompressor::decompress_all(compressed));
|
||||
EXPECT(test_data == decompressed);
|
||||
}
|
||||
|
||||
TEST_CASE(ossfuzz_58046)
|
||||
{
|
||||
auto path = TEST_INPUT("clusterfuzz-testcase-minimized-FuzzDeflateDecompression-5523852259360768.fuzz"sv);
|
||||
auto test_file = TRY_OR_FAIL(Core::File::open(path, Core::File::OpenMode::Read));
|
||||
auto test_data = TRY_OR_FAIL(test_file->read_until_eof());
|
||||
EXPECT(Compress::DeflateDecompressor::decompress_all(test_data).is_error());
|
||||
auto compressed = TRY_OR_FAIL(Compress::DeflateCompressor::compress_all(test));
|
||||
}
|
||||
|
|
Binary file not shown.
|
@ -1 +0,0 @@
|
|||
I˙żž I˙żIŔž I˙żž żžĽž
|
Loading…
Add table
Add a link
Reference in a new issue