Odin/core/compress/zlib/zlib.odin

package zlib

/*
	Copyright 2021 Jeroen van Rijn <nom@duclavier.com>.
	Made available under Odin's BSD-2 license.

	List of contributors:
		Jeroen van Rijn: Initial implementation, optimization.
		Ginger Bill:     Cosmetic changes.
*/

import "core:compress"

import "core:mem"
import "core:io"
import "core:hash"
import "core:bytes"

/*
	zlib.inflate decompresses a ZLIB stream passed in as a []u8 or io.Stream.
	Returns: Error.
*/

/*
	Do we do Adler32 as we write bytes to output?
	It used to be faster to do it inline, now it's faster to do it at the end of `inflate`.

	We'll see what's faster after more optimization, and might end up removing
	`Context.rolling_hash` if not inlining it is still faster.

*/

Compression_Method :: enum u8 {
	DEFLATE  = 8,
	Reserved = 15,
}

Compression_Level :: enum u8 {
	Fastest = 0,
	Fast    = 1,
	Default = 2,
	Maximum = 3,
}

Options :: struct {
	window_size: u16,
	level: u8,
}

Error     :: compress.Error;
E_General :: compress.General_Error;
E_ZLIB    :: compress.ZLIB_Error;
E_Deflate :: compress.Deflate_Error;

DEFLATE_MAX_CHUNK_SIZE   :: 65535;
DEFLATE_MAX_LITERAL_SIZE :: 65535;
DEFLATE_MAX_DISTANCE     :: 32768;
DEFLATE_MAX_LENGTH       :: 258;

HUFFMAN_MAX_BITS  :: 16;
HUFFMAN_FAST_BITS :: 9;
HUFFMAN_FAST_MASK :: ((1 << HUFFMAN_FAST_BITS) - 1);

Z_LENGTH_BASE := [31]u16{
	3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,
	67,83,99,115,131,163,195,227,258,0,0,
};

Z_LENGTH_EXTRA := [31]u8{
	0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0,
};

Z_DIST_BASE := [32]u16{
	1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
	257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0,
};

Z_DIST_EXTRA := [32]u8{
	0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,0,0,
};

Z_LENGTH_DEZIGZAG := []u8{
	16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15,
};

Z_FIXED_LENGTH := [288]u8{
	8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
	8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
	8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
	8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
	8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
	9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
	9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
	9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,
};

Z_FIXED_DIST := [32]u8{
	5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
};

/*
	Accelerate all cases in default tables.
*/
ZFAST_BITS :: 9;
ZFAST_MASK :: ((1 << ZFAST_BITS) - 1);

/*
	ZLIB-style Huffman encoding.
	JPEG packs from left, ZLIB from right. We can't share code.
*/
Huffman_Table :: struct {
	fast:        [1 << ZFAST_BITS]u16,
	firstcode:   [16]u16,
	maxcode:     [17]int,
	firstsymbol: [16]u16,
	size:        [288]u8,
	value:       [288]u16,
};

// Implementation starts here
@(optimization_mode="speed")
z_bit_reverse :: #force_inline proc(n: u16, bits: u8) -> (r: u16) {
	assert(bits <= 16);
	// NOTE: Can optimize with llvm.bitreverse.i64 or some bit twiddling
	// by reversing all of the bits and masking out the unneeded ones.
	r = n;
	r = ((r & 0xAAAA) >>  1) | ((r & 0x5555) << 1);
	r = ((r & 0xCCCC) >>  2) | ((r & 0x3333) << 2);
	r = ((r & 0xF0F0) >>  4) | ((r & 0x0F0F) << 4);
	r = ((r & 0xFF00) >>  8) | ((r & 0x00FF) << 8);

	r >>= (16 - bits);
	return;
}


@(optimization_mode="speed")
grow_buffer :: proc(buf: ^[dynamic]u8) -> (err: compress.Error) {
	/*
		That we get here at all means that we didn't pass an expected output size,
		or that it was too little.
	*/

	/*
		Double until we reach the maximum allowed.
	*/
	new_size := min(len(buf) << 1, compress.COMPRESS_OUTPUT_ALLOCATE_MAX);
	resize(buf, new_size);
	if len(buf) != new_size {
		/*
			Resize failed.
		*/
		return .Resize_Failed;
	}

	return nil;
}

/*
	TODO: Make these return compress.Error.
*/

@(optimization_mode="speed")
write_byte :: #force_inline proc(z: ^$C, c: u8) -> (err: io.Error) #no_bounds_check {
	/*
		Resize if needed.
	*/
	if int(z.bytes_written) + 1 >= len(z.output.buf) {
		e := grow_buffer(&z.output.buf);
		if e != nil {
			return .Short_Write;
		}
	}

	#no_bounds_check {
		z.output.buf[z.bytes_written] = c;
	}
	z.bytes_written += 1;
	return .None;
}

@(optimization_mode="speed")
repl_byte :: proc(z: ^$C, count: u16, c: u8) -> (err: io.Error) 	#no_bounds_check {
	/*
		TODO(Jeroen): Once we have a magic ring buffer, we can just peek/write into it
		without having to worry about wrapping, so no need for a temp allocation to give to
		the output stream, just give it _that_ slice.
	*/

	/*
	Resize if needed.
	*/
	if int(z.bytes_written) + int(count) >= len(z.output.buf) {
		e := grow_buffer(&z.output.buf);
		if e != nil {
			return .Short_Write;
		}
	}

	#no_bounds_check {
		for _ in 0..<count {
			z.output.buf[z.bytes_written] = c;
			z.bytes_written += 1;
		}
	}

	return .None;
}

@(optimization_mode="speed")
repl_bytes :: proc(z: ^$C, count: u16, distance: u16) -> (err: io.Error) {
	/*
		TODO(Jeroen): Once we have a magic ring buffer, we can just peek/write into it
		without having to worry about wrapping, so no need for a temp allocation to give to
		the output stream, just give it _that_ slice.
	*/

	offset := i64(distance);

	if int(z.bytes_written) + int(count) >= len(z.output.buf) {
		e := grow_buffer(&z.output.buf);
		if e != nil {
			return .Short_Write;
		}
	}

	#no_bounds_check {
		for _ in 0..<count {
			c := z.output.buf[z.bytes_written - offset];
			z.output.buf[z.bytes_written] = c;
			z.bytes_written += 1;
		}
	}

	return .None;
}


allocate_huffman_table :: proc(allocator := context.allocator) -> (z: ^Huffman_Table, err: Error) {
	return new(Huffman_Table, allocator), nil;
}

@(optimization_mode="speed")
build_huffman :: proc(z: ^Huffman_Table, code_lengths: []u8) -> (err: Error) {
	sizes:     [HUFFMAN_MAX_BITS+1]int;
	next_code: [HUFFMAN_MAX_BITS]int;

	k := int(0);

	mem.zero_slice(sizes[:]);
	mem.zero_slice(z.fast[:]);

	for v in code_lengths {
		sizes[v] += 1;
	}
	sizes[0] = 0;

	for i in 1..<(HUFFMAN_MAX_BITS+1) {
		if sizes[i] > (1 << uint(i)) {
			return E_Deflate.Huffman_Bad_Sizes;
		}
	}
	code := int(0);

	for i in 1..<HUFFMAN_MAX_BITS {
		next_code[i]     = code;
		z.firstcode[i]   = u16(code);
		z.firstsymbol[i] = u16(k);
		code = code + sizes[i];
		if sizes[i] != 0 {
			if code - 1 >= (1 << u16(i)) {
				return E_Deflate.Huffman_Bad_Code_Lengths;
			}
		}
		z.maxcode[i] = code << (HUFFMAN_MAX_BITS - uint(i));
		code <<= 1;
		k += int(sizes[i]);
	}

	z.maxcode[HUFFMAN_MAX_BITS] = 0x10000; // Sentinel
	c: int;

	for v, ci in code_lengths {
		if v != 0 {
			c = next_code[v] - int(z.firstcode[v]) + int(z.firstsymbol[v]);
			fastv := u16((u16(v) << 9) | u16(ci));
			z.size[c]  = u8(v);
			z.value[c] = u16(ci);
			if v <= ZFAST_BITS {
				j := z_bit_reverse(u16(next_code[v]), v);
				for j < (1 << ZFAST_BITS) {
					z.fast[j] = fastv;
					j += (1 << v);
				}
			}
			next_code[v] += 1;
		}
	}
	return nil;
}

@(optimization_mode="speed")
decode_huffman_slowpath :: proc(z: ^$C, t: ^Huffman_Table) -> (r: u16, err: Error) #no_bounds_check {
	code := u16(compress.peek_bits_lsb(z,16));

	k := int(z_bit_reverse(code, 16));
	s: u8;

	#no_bounds_check for s = HUFFMAN_FAST_BITS+1; ; {
		if k < t.maxcode[s] {
			break;
		}
		s += 1;
	}
	if s >= 16 {
		return 0, E_Deflate.Bad_Huffman_Code;
	}
	// code size is s, so:
	b := (k >> (16-s)) - int(t.firstcode[s]) + int(t.firstsymbol[s]);
	if b >= size_of(t.size) {
		return 0, E_Deflate.Bad_Huffman_Code;
	}
	if t.size[b] != s {
		return 0, E_Deflate.Bad_Huffman_Code;
	}

	compress.consume_bits_lsb(z, s);

	r = t.value[b];
	return r, nil;
}

@(optimization_mode="speed")
decode_huffman :: proc(z: ^$C, t: ^Huffman_Table) -> (r: u16, err: Error) #no_bounds_check {
	if z.num_bits < 16 {
		if z.num_bits > 63 {
			return 0, E_ZLIB.Code_Buffer_Malformed;
		}
		compress.refill_lsb(z);
		if z.num_bits > 63 {
			return 0, E_General.Stream_Too_Short;
		}
	}
	#no_bounds_check b := t.fast[z.code_buffer & ZFAST_MASK];
	if b != 0 {
		s := u8(b >> ZFAST_BITS);
		compress.consume_bits_lsb(z, s);
		return b & 511, nil;
	}
	return decode_huffman_slowpath(z, t);
}

@(optimization_mode="speed")
parse_huffman_block :: proc(z: ^$C, z_repeat, z_offset: ^Huffman_Table) -> (err: Error) #no_bounds_check {
	#no_bounds_check for {
		value, e := decode_huffman(z, z_repeat);
		if e != nil {
			return err;
		}
		if value < 256 {
			e := write_byte(z, u8(value));
			if e != .None {
				return E_General.Output_Too_Short;
			}
		} else {
			if value == 256 {
					// End of block
					return nil;
			}

			value -= 257;
			length := Z_LENGTH_BASE[value];
			if Z_LENGTH_EXTRA[value] > 0 {
				length += u16(compress.read_bits_lsb(z, Z_LENGTH_EXTRA[value]));
			}

			value, e = decode_huffman(z, z_offset);
			if e != nil {
				return E_Deflate.Bad_Huffman_Code;
			}

			distance := Z_DIST_BASE[value];
			if Z_DIST_EXTRA[value] > 0 {
				distance += u16(compress.read_bits_lsb(z, Z_DIST_EXTRA[value]));
			}

			if z.bytes_written < i64(distance) {
				// Distance is longer than we've decoded so far.
				return E_Deflate.Bad_Distance;
			}

			/*
				These might be sped up with a repl_byte call that copies
				from the already written output more directly, and that
				update the Adler checksum once after.

				That way we'd suffer less Stream vtable overhead.
			*/
			if distance == 1 {
				/*
					Replicate the last outputted byte, length times.
				*/
				if length > 0 {
					c := z.output.buf[z.bytes_written - i64(distance)];
					e := repl_byte(z, length, c);
					if e != .None {
						return E_General.Output_Too_Short;
					}
				}
			} else {
				if length > 0 {
					e := repl_bytes(z, length, distance);
					if e != .None {
						return E_General.Output_Too_Short;
					}
				}
			}
		}
	}
}

@(optimization_mode="speed")
inflate_from_context :: proc(using ctx: ^compress.Context_Memory_Input, raw := false, expected_output_size := -1, allocator := context.allocator) -> (err: Error) #no_bounds_check {
	/*
		ctx.output must be a bytes.Buffer for now. We'll add a separate implementation that writes to a stream.

		raw determines whether the ZLIB header is processed, or we're inflating a raw
		DEFLATE stream.
	*/

	if !raw {
		size, size_err := compress.input_size(ctx);
		if size < 6 || size_err != nil {
			return E_General.Stream_Too_Short;
		}

		cmf, _ := compress.read_u8(ctx);

		method := Compression_Method(cmf & 0xf);
		if method != .DEFLATE {
			return E_General.Unknown_Compression_Method;
		}

		if cinfo := (cmf >> 4) & 0xf; cinfo > 7 {
			return E_ZLIB.Unsupported_Window_Size;
		}
		flg, _ := compress.read_u8(ctx);

		fcheck := flg & 0x1f;
		fcheck_computed := (cmf << 8 | flg) & 0x1f;
		if fcheck != fcheck_computed {
			return E_General.Checksum_Failed;
		}

		/*
			We don't handle built-in dictionaries for now.
			They're application specific and PNG doesn't use them.
		*/
		if fdict := (flg >> 5) & 1; fdict != 0 {
			return E_ZLIB.FDICT_Unsupported;
		}

		// flevel  := Compression_Level((flg >> 6) & 3);
		/*
			Inflate can consume bits belonging to the Adler checksum.
			We pass the entire stream to Inflate and will unget bytes if we need to
			at the end to compare checksums.
		*/

	}

	// Parse ZLIB stream without header.
	inflate_raw(z=ctx, expected_output_size=expected_output_size) or_return;

	if !raw {
		compress.discard_to_next_byte_lsb(ctx);

		adler_b: [4]u8;
		for _, i in adler_b {
			adler_b[i], _ = compress.read_u8_prefer_code_buffer_lsb(ctx);
		}
		adler := transmute(u32be)adler_b;

		output_hash := hash.adler32(ctx.output.buf[:]);

		if output_hash != u32(adler) {
			return E_General.Checksum_Failed;
		}
	}
	return nil;
}

// TODO: Check alignment of reserve/resize.

@(optimization_mode="speed")
inflate_raw :: proc(z: ^$C, expected_output_size := -1, allocator := context.allocator) -> (err: Error) #no_bounds_check {
	expected_output_size := expected_output_size;

	/*
		Always set up a minimum allocation size.
	*/
	expected_output_size = max(max(expected_output_size, compress.COMPRESS_OUTPUT_ALLOCATE_MIN), 512);

	// fmt.printf("\nZLIB: Expected Payload Size: %v\n\n", expected_output_size);

	if expected_output_size > 0 && expected_output_size <= compress.COMPRESS_OUTPUT_ALLOCATE_MAX {
		/*
			Try to pre-allocate the output buffer.
		*/
		reserve(&z.output.buf, expected_output_size);
		resize (&z.output.buf, expected_output_size);
	};

	if len(z.output.buf) != expected_output_size {
		return .Resize_Failed;
	}

	z.num_bits    = 0;
	z.code_buffer = 0;

	z_repeat:      ^Huffman_Table;
	z_offset:      ^Huffman_Table;
	codelength_ht: ^Huffman_Table;
	defer free(z_repeat);
	defer free(z_offset);
	defer free(codelength_ht);

	z_repeat      = allocate_huffman_table(allocator=context.allocator) or_return;
	z_offset      = allocate_huffman_table(allocator=context.allocator) or_return;
	codelength_ht = allocate_huffman_table(allocator=context.allocator) or_return;

	final := u32(0);
	type  := u32(0);

	for {
		final = compress.read_bits_lsb(z, 1);
		type  = compress.read_bits_lsb(z, 2);

		// fmt.printf("Final: %v | Type: %v\n", final, type);

		switch type {
		case 0:
			// Uncompressed block

			// Discard bits until next byte boundary
			compress.discard_to_next_byte_lsb(z);

			uncompressed_len := i16(compress.read_bits_lsb(z, 16));
			length_check     := i16(compress.read_bits_lsb(z, 16));

			// fmt.printf("LEN: %v, ~LEN: %v, NLEN: %v, ~NLEN: %v\n", uncompressed_len, ~uncompressed_len, length_check, ~length_check);


			if ~uncompressed_len != length_check {
				return E_Deflate.Len_Nlen_Mismatch;
			}

			/*
				TODO: Maybe speed this up with a stream-to-stream copy (read_from)
				and a single Adler32 update after.
			*/
			#no_bounds_check for uncompressed_len > 0 {
				compress.refill_lsb(z);
				lit := compress.read_bits_lsb(z, 8);
				write_byte(z, u8(lit));
				uncompressed_len -= 1;
			}
		case 3:
			return E_Deflate.BType_3;
		case:
			// log.debugf("Err: %v | Final: %v | Type: %v\n", err, final, type);
			if type == 1 {
				// Use fixed code lengths.
				build_huffman(z_repeat, Z_FIXED_LENGTH[:]) or_return;
				build_huffman(z_offset, Z_FIXED_DIST[:])  or_return;
			} else {
				lencodes: [286+32+137]u8;
				codelength_sizes: [19]u8;

				//i: u32;
				n: u32;

				compress.refill_lsb(z, 14);
				hlit  := compress.read_bits_no_refill_lsb(z, 5) + 257;
				hdist := compress.read_bits_no_refill_lsb(z, 5) + 1;
				hclen := compress.read_bits_no_refill_lsb(z, 4) + 4;
				ntot  := hlit + hdist;

				#no_bounds_check for i in 0..<hclen {
					s := compress.read_bits_lsb(z, 3);
					codelength_sizes[Z_LENGTH_DEZIGZAG[i]] = u8(s);
				}
				build_huffman(codelength_ht, codelength_sizes[:]) or_return;

				n = 0;
				c: u16;

				for n < ntot {
					c = decode_huffman(z, codelength_ht) or_return;

					if c < 0 || c >= 19 {
						return E_Deflate.Huffman_Bad_Code_Lengths;
					}
					if c < 16 {
						lencodes[n] = u8(c);
						n += 1;
					} else {
						fill := u8(0);
						compress.refill_lsb(z, 7);
						switch c {
						case 16:
							c = u16(compress.read_bits_no_refill_lsb(z, 2) + 3);
							if n == 0 {
								return E_Deflate.Huffman_Bad_Code_Lengths;
							}
							fill = lencodes[n - 1];
						case 17:
							c = u16(compress.read_bits_no_refill_lsb(z, 3) + 3);
						case 18:
							c = u16(compress.read_bits_no_refill_lsb(z, 7) + 11);
						case:
								return E_Deflate.Huffman_Bad_Code_Lengths;
						}

						if ntot - n < u32(c) {
							return E_Deflate.Huffman_Bad_Code_Lengths;
						}

						nc := n + u32(c);
						#no_bounds_check for ; n < nc; n += 1 {
							lencodes[n] = fill;
						}
					}
				}

				if n != ntot {
					return E_Deflate.Huffman_Bad_Code_Lengths;
				}

				build_huffman(z_repeat, lencodes[:hlit])     or_return;
				build_huffman(z_offset, lencodes[hlit:ntot]) or_return;
			}
			parse_huffman_block(z, z_repeat, z_offset) or_return;
		}
		if final == 1 {
			break;
		}
	}

	if int(z.bytes_written) != len(z.output.buf) {
		resize(&z.output.buf, int(z.bytes_written));
	}

	return nil;
}

inflate_from_byte_array :: proc(input: []u8, buf: ^bytes.Buffer, raw := false, expected_output_size := -1) -> (err: Error) {
	ctx := compress.Context_Memory_Input{};

	ctx.input_data = input;
	ctx.output = buf;

	return inflate_from_context(ctx=&ctx, raw=raw, expected_output_size=expected_output_size);
}

inflate_from_byte_array_raw :: proc(input: []u8, buf: ^bytes.Buffer, raw := false, expected_output_size := -1) -> (err: Error) {
	ctx := compress.Context_Memory_Input{};

	ctx.input_data = input;
	ctx.output = buf;

	return inflate_raw(z=&ctx, expected_output_size=expected_output_size);
}

inflate :: proc{inflate_from_context, inflate_from_byte_array};