(lz) example project

2023-11-14 18:54:31 -08:00 · 2023-11-14 18:54:31 -08:00 · 6080a8f68d
parent 7528540caf
commit 6080a8f68d
5 changed files with 431 additions and 0 deletions
--- a/cli/lz/.clang-format
+++ b/cli/lz/.clang-format
@ -0,0 +1,22 @@
+AlignAfterOpenBracket: DontAlign
+AlignEscapedNewlines: DontAlign
+AlignOperands: DontAlign
+AllowShortBlocksOnASingleLine: Always
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortEnumsOnASingleLine: true
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: true
+AlwaysBreakAfterDefinitionReturnType: TopLevel
+BreakBeforeTernaryOperators: false
+BinPackArguments: false
+BinPackParameters: false
+BreakBeforeBraces: WebKit
+IndentCaseLabels: false
+TabWidth: 4
+IndentWidth: 4
+ContinuationIndentWidth: 4
+UseTab: ForContinuationAndIndentation
+ColumnLimit: 0
+ReflowComments: false
+SortIncludes: false
+SpaceBeforeParens: false
--- a/cli/lz/example.txt
+++ b/cli/lz/example.txt
@ -0,0 +1,86 @@
+Yo, listen up here's a story
+About a little guy
+That lives in a blue world
+And all day and all night
+And everything he sees is just blue
+Like him inside and outside
+Blue his house
+With a blue little window
+And a blue corvette
+And everything is blue for him
+And himself and everybody around
+Cause he ain't got nobody to listen to
+I'm blue
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+I'm blue
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+I have a blue house
+With a blue window
+Blue is the colour of all that I wear
+Blue are the streets
+And all the trees are too
+I have a girlfriend and she is so blue
+Blue are the people here
+That walk around
+Blue like my corvette its in and outside
+Blue are the words I say
+And what I think
+Blue are the feelings
+That live inside me
+I'm blue
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+I'm blue
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+I have a blue house
+With a blue window
+Blue is the colour of all that I wear
+Blue are the streets
+And all the trees are too
+I have a girlfriend and she is so blue
+Blue are the people here
+That walk around
+Blue like my corvette, its in and outside
+Blue are the words I say
+And what I think
+Blue are the feelings
+That live inside me
+I'm blue
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+I'm blue
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
+Da ba dee da ba di
--- a/cli/lz/explanation.md
+++ b/cli/lz/explanation.md
@ -0,0 +1,56 @@
+Simple Uxn LZ Format
+====================
+
+Goals:
+
+* Anyone can implement it
+* Small source code size
+* Easy to implement from Uxn
+* Mildly better than RLE
+
+Non-goals:
+
+* High compression ratio
+* High compression speed
+
+Format
+------
+
+It's a stream of commands. The first byte encodes the first command. Read the commands from the input until there's no more input.
+
+There are two commands: literal and dictionary.
+
+
+```
+                      Byte 1             Byte 2+n
+                 ┌─────────────────┐   ┌─────
+Literal          │ 0 x x x x x x x │   │ ....
+(Always 1 byte)  └─────────────────┘   └─────
+                  Length of literal    Bytes to copy to output
+                 (Adjust by adding 1)
+
+
+                      Byte 1               Byte 2
+Dictionary       ┌─────────────────┐  ┌─────────────────┐
+(2 bytes version)│ 1 0 x x x x x x │  │ x x x x x x x x │
+                 └─────────────────┘  └─────────────────┘
+                      Length of           Offset into
+                   dictionary match       dictionary
+                 (Adjust by adding 4) (Adjust by adding 1)
+
+
+                      Byte 1            Byte 2              Byte 3
+Dictionary       ┌─────────────────┬─────────────────┐ ┌─────────────────┐
+(3 bytes version)│ 1 1 x x x x x x │ x x x x x x x x │ │ x x x x x x x x │
+                 └─────────────────┴─────────────────┘ └─────────────────┘
+                       Length of dictionary match          Offset into
+                          (Adjust by adding 4)             dictionary
+                                                       (Adjust by adding 1)
+```
+
+* The maximum dictionary history size is 256 bytes.
+* Dictionary offsets should be treated as the distance back from the end of the last byte that was output.
+	* Example: an offset of 0 means go back by 1 byte into the history.
+		* `a b c d e f|g`
+	* Example: an offset of 5 means go back by 6 bytes into the history.
+		* `a|b c d e f g`
--- a/cli/lz/in.txt
+++ b/cli/lz/in.txt
@ -0,0 +1 @@
+abracadabra
--- a/cli/lz/lz_main.c
+++ b/cli/lz/lz_main.c
@ -0,0 +1,266 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lz.h"
+
+enum { MinMatchLength = 4 }; /* Shortest match emitted as a dictionary command; encoded match lengths are stored minus this value. */
+
+/*
+ * Count how many leading bytes of `in` equal the bytes produced by repeating
+ * the `dict_len` bytes at `dict`, stopping once `limit` bytes have matched.
+ * The repetition models a copy whose source overlaps its destination.
+ */
+static int
+lz_match_length(const unsigned char *in, const unsigned char *dict, int dict_len, int limit)
+{
+	int n;
+	for(n = 0; n != limit && in[n] == dict[n % dict_len]; n++)
+		;
+	return n;
+}
+
+/*
+ * Compress `input_size` bytes from `input` into `output`, which can hold
+ * `output_size` bytes.
+ *
+ * Every position is compared against up to 256 bytes of preceding input.
+ * A match of at least MinMatchLength bytes becomes a dictionary command;
+ * anything shorter is appended to a literal run of at most 128 bytes.
+ *
+ * Returns the number of bytes written, or -1 when `output` is too small.
+ */
+int
+uxn_lz_compress(void *output, int output_size, const void *input, int input_size)
+{
+	const unsigned char *const first = input;
+	const unsigned char *const last = first + input_size;
+	const unsigned char *src = first;
+	unsigned char *dst = output;
+	unsigned char *literal_ctl = 0; /* Control byte of the literal run still open, if any */
+
+	while(src != last) {
+		const unsigned char *candidate, *best = 0;
+		int window, limit, len, ctl, best_len = 0;
+
+		/* History available behind the cursor, capped at 256 bytes */
+		window = (int)(src - first);
+		if(window > 256)
+			window = 256;
+
+		/* Longest match representable: 14 bits plus the minimum match length */
+		limit = (int)(last - src);
+		if(limit > 0x3FFF + MinMatchLength)
+			limit = 0x3FFF + MinMatchLength;
+
+		/* Try every start position, oldest first; ties keep the oldest */
+		for(candidate = src - window; candidate != src; candidate++) {
+			len = lz_match_length(src, candidate, (int)(src - candidate), limit);
+			if(len > best_len) {
+				best_len = len;
+				best = candidate;
+			}
+			if(len == limit)
+				break; /* Nothing can beat a match of the full length */
+		}
+
+		if(best_len < MinMatchLength) {
+			/* Too short for a dictionary command: emit one literal byte */
+			if(literal_ctl) {
+				/* Grow the open run by one byte */
+				if((output_size -= 1) < 0)
+					return -1;
+				if(++*literal_ctl == 127)
+					literal_ctl = 0; /* Run is at its maximum size; close it */
+			} else {
+				/* Open a new run: control byte plus the first data byte */
+				if((output_size -= 2) < 0)
+					return -1;
+				literal_ctl = dst++;
+				*literal_ctl = 0; /* 0 encodes a run of length 1 */
+			}
+			*dst++ = *src++;
+			continue;
+		}
+
+		/* Dictionary command: at least a control byte and an offset byte */
+		if((output_size -= 2) < 0)
+			return -1;
+		ctl = best_len - MinMatchLength; /* Stored biased: 0 means 4, 1 means 5, ... */
+		if(ctl > 0x3F) {
+			/* Does not fit in 6 bits: use the form with a two-byte length */
+			if((output_size -= 1) < 0)
+				return -1;
+			*dst++ = ctl >> 8 | 0xC0; /* High bits of the length, both flags set */
+			*dst++ = ctl;             /* Low byte of the length */
+		} else {
+			*dst++ = ctl | 0x80; /* Only the "dictionary" flag set */
+		}
+		*dst++ = src - best - 1; /* Distance back into history, biased by 1 */
+		src += best_len;
+		literal_ctl = 0; /* A following literal must start a fresh run */
+	}
+	return (int)(dst - (unsigned char *)output);
+}
+
+/*
+ * Decompress the `input_size`-byte command stream at `input` into `output`,
+ * which can hold `output_size` bytes.
+ *
+ * Every byte taken from the input (control byte, optional second length
+ * byte, offset byte, literal data) is subtracted from `input_size` before it
+ * is read, so the loop stops exactly at the end of a well-formed stream and
+ * never reads beyond it.
+ *
+ * Returns the number of bytes written, or -1 when the stream is truncated,
+ * when a dictionary offset points before the start of the output, or when
+ * the result does not fit in `output_size` bytes.
+ */
+int
+uxn_lz_expand(void *output, int output_size, const void *input, int input_size)
+{
+	int num, offset, written = 0;
+	unsigned char *out = output;
+	const unsigned char *from, *in = input;
+	while(input_size > 0) {
+		num = *in++;
+		input_size -= 1; /* Account for the control byte just read */
+		if(num > 127) /* Dictionary */
+		{
+			num &= 0x7F;
+			if(num & 0x40) {
+				/* Two-byte length: a second length byte follows */
+				if((input_size -= 1) < 0)
+					goto malformed;
+				num = *in++ | (num << 8 & 0x3FFF); /* Mask drops the 0x40 flag */
+			}
+			if((input_size -= 1) < 0) /* Offset byte */
+				goto malformed;
+			num += MinMatchLength;
+			offset = *in++ + 1;
+			if(offset > written)
+				goto malformed; /* Refers to data that was never output */
+			from = out + written - offset;
+		} else /* Literal */
+		{
+			num += 1; /* Stored length is biased by 1 */
+			if((input_size -= num) < 0)
+				goto malformed;
+			from = in, in += num;
+		}
+		if(num > output_size - written)
+			goto overflow;
+		/* Byte-by-byte on purpose: a dictionary source may overlap the
+		 * bytes being written, which repeats the pattern. */
+		while(num--)
+			out[written++] = *from++;
+	}
+	return written;
+overflow:
+malformed:
+	return -1;
+}
+
+int
+uxn_lz_expand_stream(struct uxn_lz_expand_t *a)
+{
+	/* Resumable decoder: consumes up to a->avail_in bytes from a->next_in and
+	 * produces up to a->avail_out bytes at a->next_out, then returns so the
+	 * caller can refill or drain the buffers and call again. Progress is kept
+	 * in a->state, whose value selects the `case` label to re-enter:
+	 *   0  between commands (read a new control byte)
+	 *   1  waiting for the low byte of a two-byte length
+	 *   2  waiting for the offset byte of a dictionary command
+	 *   3  in the middle of a dictionary copy, out of output space
+	 *   4  in the middle of a literal copy, out of input or output space
+	 *   5  a malformed offset was seen; later calls do no work
+	 * Returns -1 on the call that detects an offset larger than the history
+	 * length (dict_len), otherwise 0.
+	 * NOTE(review): a call made while already in state 5 also returns 0 --
+	 * confirm callers do not rely on the error being reported again.
+	 *
+	 * Copy struct to stack variables for compiler optimizations */
+	unsigned char *next_in = a->next_in, *next_out = a->next_out;
+	int avail_in = a->avail_in, avail_out = a->avail_out;
+	int dict_len = a->dict_len, copy_num = a->copy_num;
+	unsigned char dict_read_pos = a->dict_read_pos,
+				  dict_write_pos = a->dict_write_pos, *dict = a->dict; /* Positions are unsigned char, so ++ wraps around; presumably dict is a 256-byte ring declared in lz.h -- TODO confirm */
+	int result = 0;
+	switch(a->state) {
+	case 0:
+		for(; avail_in;) {
+			copy_num = *next_in++;
+			avail_in--;
+			if(copy_num > 127) /* Dictionary */
+			{
+				copy_num &= 0x7F;
+				if(copy_num & 0x40) {
+				case 1: /* Resume point: second length byte still needed */
+					if(!avail_in) {
+						a->state = 1;
+						goto need_more;
+					}
+					avail_in--;
+					copy_num = *next_in++ | copy_num << 8 & 0x3FFF; /* 14-bit length; the mask drops the 0x40 flag bit */
+				}
+				copy_num += MinMatchLength; /* Stored length is biased by the minimum match length */
+			case 2: /* Resume point: offset byte still needed */
+				if(!avail_in) {
+					a->state = 2;
+					goto need_more;
+				}
+				avail_in--;
+				dict_read_pos = *next_in++ + 1; /* NOTE(review): an offset byte of 0xFF gives 256, which truncates to 0 in an 8-bit unsigned char, so the range check below cannot reject it -- confirm intended */
+				if(dict_read_pos > dict_len) {
+					a->state = 5;
+					result = -1;
+					goto flush;
+				} /* Malformed */
+				dict_read_pos = dict_write_pos - dict_read_pos; /* Turn the distance into a read index behind the write index */
+				if((dict_len += copy_num) > 256) /* History length saturates at 256 */
+					dict_len = 256;
+			case 3: /* Resume point: dictionary copy in progress */
+				do {
+					if(!avail_out) {
+						a->state = 3;
+						goto need_more;
+					}
+					*next_out++ = dict[dict_write_pos++] = dict[dict_read_pos++]; /* Emit the byte and append it to the history */
+					avail_out--;
+				} while(--copy_num);
+			} else /* Literal */
+			{
+				copy_num++; /* Stored length is biased by 1 */
+				if((dict_len += copy_num) > 256)
+					dict_len = 256;
+			case 4: /* Resume point: literal copy in progress */
+				do {
+					if(!avail_in || !avail_out) {
+						a->state = 4;
+						goto need_more;
+					}
+					*next_out++ = dict[dict_write_pos++] = *next_in++; /* Emit the byte and append it to the history */
+					avail_in--, avail_out--;
+				} while(--copy_num);
+			}
+		}
+		a->state = 0; /* Input ran out exactly at a command boundary */
+	case 5:; /* Malformed on an earlier call: nothing to do */
+	}
+need_more:
+flush:
+	/* Flush stack variables back to struct (a->state was already stored at
+	 * the point where decoding stopped; the dict pointer itself is unchanged) */
+	a->next_in = next_in, a->next_out = next_out;
+	a->avail_in = avail_in, a->avail_out = avail_out;
+	a->dict_len = dict_len, a->copy_num = copy_num;
+	a->dict_read_pos = dict_read_pos, a->dict_write_pos = dict_write_pos;
+	return result;
+}
+
+/*
+ * Fold `bytes_size` bytes at `bytes` into a running checksum.
+ *
+ * `seed` carries two accumulators: one in its upper 16 bits and one taken
+ * from the whole value. For every input byte, each accumulator is multiplied
+ * by its own constant and the byte, duplicated into both halves of a 16-bit
+ * word, is added. The result packs the first accumulator above the low
+ * 16 bits of the second, so it can be passed back in as the next `seed`.
+ */
+unsigned int
+uxn_checksum(unsigned int seed, void *bytes, unsigned int bytes_size)
+{
+	const unsigned char *data = bytes;
+	unsigned int hi = seed >> 16;
+	unsigned int lo = seed;
+	unsigned int k, word;
+
+	for(k = 0; k < bytes_size; k++) {
+		word = (unsigned int)data[k] << 8 | data[k];
+		hi = hi * 0x2443 + word;
+		lo = lo * 0x118d + word;
+	}
+	return hi << 16 | (lo & 0xFFFF);
+}
+
+/*
+ * Round-trip demo: read up to 1000000 bytes of "example.txt", print each
+ * byte in hex, compress the data, expand the compressed result again and
+ * print the sizes of each step followed by the expanded bytes.
+ *
+ * Returns 0 on success and 1 when the file cannot be opened, memory cannot
+ * be allocated, or either codec step reports an error.
+ */
+int
+main(int argc, char *argv[])
+{
+	enum { BufferSize = 1000000 };
+	unsigned char *plain = NULL, *packed = NULL, *unpacked = NULL;
+	FILE *fp = NULL;
+	size_t i;
+	int c, res, res2, status = 1;
+
+	(void)argc;
+	(void)argv;
+
+	/* Heap buffers: a megabyte array would not fit every default stack */
+	plain = malloc(BufferSize);
+	packed = malloc(BufferSize);
+	unpacked = malloc(BufferSize);
+	if(!plain || !packed || !unpacked) {
+		fprintf(stderr, "out of memory\n");
+		goto done;
+	}
+
+	fp = fopen("example.txt", "rb");
+	if(!fp) {
+		perror("example.txt");
+		goto done;
+	}
+
+	for(i = 0; i < BufferSize; ++i) {
+		c = getc(fp);
+		if(c == EOF)
+			break;
+		plain[i] = (unsigned char)c;
+		printf("%02x\n", c);
+	}
+
+	res = uxn_lz_compress(packed, BufferSize, plain, (int)i);
+	if(res < 0) {
+		printf("ERROR\n");
+		goto done;
+	}
+	printf("!!!%zu -> %d\n", i, res);
+
+	/* Other way: expand the compressed bytes, not the original input */
+	res2 = uxn_lz_expand(unpacked, BufferSize, packed, res);
+	if(res2 < 0) {
+		printf("ERROR\n");
+		goto done;
+	}
+	printf("!!!%d -> %d\n", res, res2);
+
+	/* The expanded data carries no terminating NUL, so write it by length */
+	fwrite(unpacked, 1, (size_t)res2, stdout);
+	putchar('\n');
+	status = 0;
+
+done:
+	if(fp)
+		fclose(fp);
+	free(unpacked);
+	free(packed);
+	free(plain);
+	return status;
+}