liblzma: Creates Non-resumable and Resumable modes for lzma_decoder.

The new decoder resumes the first decoder loop in the Resumable mode. Then, the code executes in Non-resumable mode until it detects that it cannot guarantee to have enough input/output to decode another symbol. The Resumable mode is how the decoder has always worked. Before decoding every input bit, it checks if there is enough space and will save its location to be resumed later. When the decoder has more input/output, it jumps back to the correct sequence in the Resumable mode code. When the input/output buffers are large, the Resumable mode is much slower than the Non-resumable because it has more branches and is harder for the compiler to optimize since it is in a large switch block. Early benchmarking shows significant time improvement (8-10% on gcc and clang x86) by using the Non-resumable code as much as possible.
2024-02-12 17:09:10 +02:00 · 2024-02-12 17:09:10 +02:00 · de5c5e4176
parent e446ab7a18
commit de5c5e4176
2 changed files with 532 additions and 224 deletions
--- a/src/liblzma/lz/lz_decoder.h
+++ b/src/liblzma/lz/lz_decoder.h
@ -180,12 +180,22 @@ dict_repeat(lzma_dict *dict, uint32_t distance, uint32_t *len)
 }


+static inline void
+dict_put(lzma_dict *dict, uint8_t byte)
+{
+	dict->buf[dict->pos++] = byte;
+
+	if (dict->pos > dict->full)
+		dict->full = dict->pos;
+}
+
+
 /// Puts one byte into the dictionary. Returns true if the dictionary was
 /// already full and the byte couldn't be added.
 static inline bool
-dict_put(lzma_dict *dict, uint8_t byte)
+dict_put_safe(lzma_dict *dict, uint8_t byte)
 {
-	if (unlikely(dict->pos == dict->limit))
+	if (dict->pos == dict->limit)
 		return true;

 	dict->buf[dict->pos++] = byte;
--- a/src/liblzma/lzma/lzma_decoder.c
+++ b/src/liblzma/lzma/lzma_decoder.c
@ -7,6 +7,7 @@
 ///
 //  Authors:    Igor Pavlov
 //              Lasse Collin
+//              Jia Tan
 //
 ///////////////////////////////////////////////////////////////////////////////

@ -21,8 +22,12 @@
 #	pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
 #endif

+// Minimum number of input bytes to safely decode one LZMA symbol.
+// The worst case is that we decode 22 bits using probabilities and 26
+// direct bits. This may decode at maximum 20 bytes of input plus one
+// extra byte after the final EOPM normalization.
+#define LZMA_IN_REQUIRED 21

-#ifdef HAVE_SMALL

 // Macros for (somewhat) size-optimized code.
 // This is used to decode the match length (how many bytes must be repeated
@ -193,22 +198,26 @@ typedef struct {
 	enum {
 		SEQ_NORMALIZE,
 		SEQ_IS_MATCH,
-		seq_8(SEQ_LITERAL),
-		seq_8(SEQ_LITERAL_MATCHED),
+		SEQ_LITERAL,
+		SEQ_LITERAL_MATCHED,
 		SEQ_LITERAL_WRITE,
 		SEQ_IS_REP,
-		seq_len(SEQ_MATCH_LEN),
-		seq_6(SEQ_DIST_SLOT),
+		SEQ_MATCH_LEN_CHOICE,
+		SEQ_MATCH_LEN_CHOICE2,
+		SEQ_MATCH_LEN_BITTREE,
+		SEQ_DIST_SLOT,
 		SEQ_DIST_MODEL,
 		SEQ_DIRECT,
-		seq_4(SEQ_ALIGN),
+		SEQ_ALIGN,
 		SEQ_EOPM,
 		SEQ_IS_REP0,
 		SEQ_SHORTREP,
 		SEQ_IS_REP0_LONG,
 		SEQ_IS_REP1,
 		SEQ_IS_REP2,
-		seq_len(SEQ_REP_LEN),
+		SEQ_REP_LEN_CHOICE,
+		SEQ_REP_LEN_CHOICE2,
+		SEQ_REP_LEN_BITTREE,
 		SEQ_COPY,
 	} sequence;

@ -309,8 +318,42 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 		might_finish_without_eopm = true;
 	}

-	// The main decoder loop. The "switch" is used to restart the decoder at
-	// correct location. Once restarted, the "switch" is no longer used.
+	// Lookup table used to update the literal state.
+	// Compared to other state updates, this would need two branches.
+	// The lookup table is used by both Resumable and Non-resumable modes.
+	static const lzma_lzma_state next_state[] = {
+		STATE_LIT_LIT,
+		STATE_LIT_LIT,
+		STATE_LIT_LIT,
+		STATE_LIT_LIT,
+		STATE_MATCH_LIT_LIT,
+		STATE_REP_LIT_LIT,
+		STATE_SHORTREP_LIT_LIT,
+		STATE_MATCH_LIT,
+		STATE_REP_LIT,
+		STATE_SHORTREP_LIT,
+		STATE_MATCH_LIT,
+		STATE_REP_LIT
+	};
+
+	// The main decoder loop. The "switch" is used to resume the decoder at
+	// correct location. Once resumed, the "switch" is no longer used.
+	// The decoder loops is split into two modes:
+	//
+	// 1 - Non-resumable mode (fast). This is used when it is guaranteed
+	//     there is enough input to decode the next symbol. If the output
+	//     limit is reached, then the decoder loop will save the place
+	//     for the resumable mode to continue. This mode is not used if
+	//     HAVE_SMALL is defined. This is faster than Resumable mode
+	//     because it reduces the number of branches needed and allows
+	//     for more compiler optimizations.
+	//
+	// 2 - Resumable mode (slow). This is used when a previous decoder
+	//     loop did not have enough space in the input or output buffers
+	//     to complete. It uses sequence enum values to set remind
+	//     coder->sequence where to resume in the decoder loop. This
+	//     is the only mode used when HAVE_SMALL is defined.
+
 	switch (coder->sequence)
 	while (true) {
 		// Calculate new pos_state. This is skipped on the first loop
@ -318,13 +361,440 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 		// variables.
 		pos_state = dict.pos & pos_mask;

+#ifndef HAVE_SMALL
+
+		///////////////////////////////
+		// Non-resumable Mode (fast) //
+		///////////////////////////////
+
+		// If there is not enough room for another LZMA symbol
+		// go to Resumable mode.
+		if (rc_in_pos + LZMA_IN_REQUIRED > in_size
+			|| dict.pos == dict.limit)
+			goto slow;
+
+		// Decode the first bit from the next LZMA symbol.
+		// If the bit is a 0, then we handle it as a literal.
+		// If the bit is a 1, then it is a match of previously
+		// decoded data.
+		rc_if_0(coder->is_match[state][pos_state]) {
+			/////////////////////
+			// Decode literal. //
+			/////////////////////
+
+			// Update the RC that we have decoded a 0.
+			rc_update_0(coder->is_match[state][pos_state]);
+
+			// Get the correct probability array from lp and
+			// lc params.
+			probs = literal_subcoder(coder->literal,
+					literal_context_bits, literal_pos_mask,
+					dict.pos, dict_get(&dict, 0));
+			symbol = 1;
+
+			if (is_literal_state(state)) {
+				// Decode literal without match byte.
+				// We need to decode 8 bits, so instead
+				// of looping from 1 - 8, we unroll the
+				// loop for a speed optimization.
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+			} else {
+				// Decode literal with match byte.
+				//
+				// We store the byte we compare against
+				// ("match byte") to "len" to minimize the
+				// number of variables we need to store
+				// between decoder calls.
+
+				len = (uint32_t)(dict_get(&dict, rep0)) << 1;
+
+				// The usage of "offset" allows omitting some
+				// branches, which should give tiny speed
+				// improvement on some CPUs. "offset" gets
+				// set to zero if match_bit didn't match.
+				offset = 0x100;
+
+				// Unroll the loop.
+				uint32_t match_bit;
+				uint32_t subcoder_index;
+
+#	define decode_with_match_bit \
+			match_bit = len & offset; \
+			subcoder_index = offset + match_bit + symbol; \
+			rc_bit(probs[subcoder_index], \
+					offset &= ~match_bit, \
+					offset &= match_bit)
+
+				decode_with_match_bit;
+				len <<= 1;
+				decode_with_match_bit;
+				len <<= 1;
+				decode_with_match_bit;
+				len <<= 1;
+				decode_with_match_bit;
+				len <<= 1;
+				decode_with_match_bit;
+				len <<= 1;
+				decode_with_match_bit;
+				len <<= 1;
+				decode_with_match_bit;
+				len <<= 1;
+				decode_with_match_bit;
+#	undef decode_match_bit
+			}
+
+			state = next_state[state];
+
+			// Write decoded literal to dictionary
+			dict_put(&dict, symbol);
+			continue;
+		}
+
+		///////////////////
+		// Decode match. //
+		///////////////////
+
+		// Instead of a new byte we are going to decode a
+		// distance-length pair. The distance represents how far
+		// back in the dictionary to begin copying. The length
+		// represents how many bytes to copy.
+
+		rc_update_1(coder->is_match[state][pos_state]);
+
+		rc_if_0(coder->is_rep[state]) {
+			///////////////////
+			// Simple match. //
+			///////////////////
+
+			// Not a repeated match. In this case,
+			// the length (how many bytes to copy) must be
+			// decoded first. Then, the distance (where to
+			// start copying) is decoded.
+			//
+			// This is also how we know when we are done
+			// decoding. If the distance decodes to UINT32_MAX,
+			// then we know to stop decoding (end of payload
+			// marker).
+
+			rc_update_0(coder->is_rep[state]);
+			update_match(state);
+
+			// The latest three match distances are kept in
+			// memory in case there are repeated matches.
+			rep3 = rep2;
+			rep2 = rep1;
+			rep1 = rep0;
+
+			// Decode the length of the match.
+			len_decode_fast(len, coder->match_len_decoder,
+					pos_state);
+
+			// Next, decode the distance into rep0.
+
+			// The next 6 bits determine how to decode the
+			// rest of the distance.
+			probs = coder->dist_slot[get_dist_state(len)];
+			symbol = 1;
+
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+
+			// Get rid of the highest bit that was needed for
+			// indexing of the probability array.
+			symbol -= DIST_SLOTS;
+			assert(symbol <= 63);
+
+			if (symbol < DIST_MODEL_START) {
+				// If the decoded symbol is < DIST_MODEL_START
+				// then we use its value directly as the
+				// match distance. No other bits are needed.
+				// The only possible distance values
+				// are [0, 3].
+				rep0 = symbol;
+			} else {
+				// Use the first two bits of symbol as the
+				// highest bits of the match distance.
+
+				// "limit" represents the number of low bits
+				// to decode.
+				limit = (symbol >> 1) - 1;
+				assert(limit >= 1 && limit <= 30);
+				rep0 = 2 + (symbol & 1);
+
+				if (symbol < DIST_MODEL_END) {
+					// When symbol is > DIST_MODEL_START,
+					// but symbol < DIST_MODEL_END, then
+					// it can decode distances between
+					// [4, 127].
+					assert(limit <= 5);
+					rep0 <<= limit;
+					assert(rep0 <= 96);
+					// -1 is fine, because we start
+					// decoding at probs[1], not probs[0].
+					// NOTE: This violates the C standard,
+					// since we are doing pointer
+					// arithmetic past the beginning of
+					// the array.
+					assert((int32_t)(rep0 - symbol - 1)
+							>= -1);
+					assert((int32_t)(rep0 - symbol - 1)
+							<= 82);
+					probs = coder->pos_special + rep0
+							- symbol - 1;
+					symbol = 1;
+					offset = 0;
+
+					switch (limit) {
+					case 5:
+						assert(offset == 0);
+						rc_bit(probs[symbol], ,
+							rep0 += 1U);
+						++offset;
+						--limit;
+					case 4:
+						rc_bit(probs[symbol], ,
+							rep0 += 1U << offset);
+						++offset;
+						--limit;
+					case 3:
+						rc_bit(probs[symbol], ,
+							rep0 += 1U << offset);
+						++offset;
+						--limit;
+					case 2:
+						rc_bit(probs[symbol], ,
+							rep0 += 1U << offset);
+						++offset;
+						--limit;
+					case 1:
+						// We need "symbol" only for
+						// indexing the probability
+						// array, thus we can use
+						// rc_bit_last() here to
+						// omit the unneeded updating
+						// of "symbol".
+						rc_bit_last(probs[symbol], ,
+							rep0 += 1U << offset);
+					}
+				} else {
+					// The distance is >= 128. Decode the
+					// lower bits without probabilities
+					// except the lowest four bits.
+					assert(symbol >= 14);
+					assert(limit >= 6);
+					limit -= ALIGN_BITS;
+					assert(limit >= 2);
+
+					// Not worth manual unrolling
+					do {
+						rc_direct(rep0);
+					} while (--limit > 0);
+
+					// Decode the lowest four bits using
+					// probabilities.
+					rep0 <<= ALIGN_BITS;
+					symbol = 1;
+
+					rc_bit(coder->pos_align[symbol], ,
+							rep0 += 1);
+
+					rc_bit(coder->pos_align[symbol], ,
+							rep0 += 2);
+
+					rc_bit(coder->pos_align[symbol], ,
+							rep0 += 4);
+
+					// Like when distance [4, 127], we
+					// don't need "symbol" for anything
+					// other than indexing the probability
+					// array.
+					rc_bit_last(
+						coder->pos_align[symbol], ,
+						rep0 += 8);
+
+					if (rep0 == UINT32_MAX) {
+						///////////////////////////
+						// End of payload marker //
+						///////////////////////////
+
+						// End of payload marker was
+						// found. It may only be
+						// present if
+						//   - uncompressed size is
+						//     unknown or
+						//   - after known uncompressed
+						//     size amount of bytes has
+						//     been decompressed and
+						//     caller has indicated
+						//     that EOPM might be used
+						//     (it's not allowed in
+						//     LZMA2).
+						if (!eopm_is_valid) {
+							ret = LZMA_DATA_ERROR;
+							goto out;
+						}
+
+						// LZMA1 stream with
+						// end-of-payload marker.
+						rc_normalize();
+						ret = rc_is_finished(rc)
+							? LZMA_STREAM_END
+							: LZMA_DATA_ERROR;
+						goto out;
+					}
+				}
+			}
+
+			// Validate the distance we just decoded.
+			if (unlikely(!dict_is_distance_valid(&dict, rep0))) {
+				ret = LZMA_DATA_ERROR;
+				goto out;
+			}
+
+		} else {
+			rc_update_1(coder->is_rep[state]);
+
+			/////////////////////
+			// Repeated match. //
+			/////////////////////
+
+			// The match distance is a value that we have decoded
+			// recently. The latest four match distances are
+			// available as rep0, rep1, rep2 and rep3. We will
+			// now decode which of them is the new distance.
+			//
+			// There cannot be a match if we haven't produced
+			// any output, so check that first.
+			if (unlikely(!dict_is_distance_valid(&dict, 0))) {
+				ret = LZMA_DATA_ERROR;
+				goto out;
+			}
+
+			rc_if_0(coder->is_rep0[state]) {
+				rc_update_0(coder->is_rep0[state]);
+				// The distance is rep0.
+
+				// Decode the next bit to determine if 1 byte
+				// should be copied from rep0 distance or
+				// if the number of bytes needs to be decoded.
+
+				// If the next bit is 0, then it is a
+				// "Short Rep Match" and only 1 bit is copied.
+				// Otherwise, the length of the match is
+				// decoded after the "else" statement.
+				rc_if_0(coder->is_rep0_long[state][pos_state]) {
+					rc_update_0(coder->is_rep0_long[
+							state][pos_state]);
+
+					update_short_rep(state);
+					dict_put(&dict, dict_get(&dict, rep0));
+					continue;
+				}
+
+				// Repeating more than one byte at
+				// distance of rep0.
+				rc_update_1(coder->is_rep0_long[
+						state][pos_state]);
+
+			} else {
+				rc_update_1(coder->is_rep0[state]);
+
+				// The distance is rep1, rep2 or rep3. Once
+				// we find out which one of these three, it
+				// is stored to rep0 and rep1, rep2 and rep3
+				// are updated accordingly. There is no
+				// "Short Rep Match" option, so the length
+				// of the match must always be decoded next.
+				rc_if_0(coder->is_rep1[state]) {
+					// The distance is rep1.
+					rc_update_0(coder->is_rep1[state]);
+
+					const uint32_t distance = rep1;
+					rep1 = rep0;
+					rep0 = distance;
+
+				} else {
+					rc_update_1(coder->is_rep1[state]);
+
+					rc_if_0(coder->is_rep2[state]) {
+						// The distance is rep2.
+						rc_update_0(coder->is_rep2[
+								state]);
+
+						const uint32_t distance = rep2;
+						rep2 = rep1;
+						rep1 = rep0;
+						rep0 = distance;
+
+					} else {
+						// The distance is rep3.
+						rc_update_1(coder->is_rep2[
+								state]);
+
+						const uint32_t distance = rep3;
+						rep3 = rep2;
+						rep2 = rep1;
+						rep1 = rep0;
+						rep0 = distance;
+					}
+				}
+			}
+
+			update_long_rep(state);
+
+			// Decode the length of the repeated match.
+			len_decode_fast(len, coder->rep_len_decoder,
+					pos_state);
+		}
+
+		/////////////////////////////////
+		// Repeat from history buffer. //
+		/////////////////////////////////
+
+		// The length is always between these limits. There is no way
+		// to trigger the algorithm to set len outside this range.
+		assert(len >= MATCH_LEN_MIN);
+		assert(len <= MATCH_LEN_MAX);
+
+		// Repeat len bytes from distance of rep0.
+		if (unlikely(dict_repeat(&dict, rep0, &len))) {
+			coder->sequence = SEQ_COPY;
+			goto out;
+		}
+
+		continue;
+
+slow:
+#endif
+	///////////////////////////
+	// Resumable Mode (slow) //
+	///////////////////////////
+
+	// This is very similar to Non-resumable Mode, so most of the
+	// comments are not repeated. The main differences are:
+	// - case labels are used to resume at the correct location.
+	// - Loops are not unrolled.
+	// - Range coder macros take an extra sequence argument
+	//   so they can save to coder->sequence the location to
+	//   resume in case there is not enough input.
 	case SEQ_NORMALIZE:
 	case SEQ_IS_MATCH:
 		if (unlikely(might_finish_without_eopm
 				&& dict.pos == dict.limit)) {
 			// In rare cases there is a useless byte that needs
 			// to be read anyway.
-			rc_normalize(SEQ_NORMALIZE);
+			rc_normalize_safe(SEQ_NORMALIZE);

 			// If the range decoder state is such that we can
 			// be at the end of the LZMA stream, then the
@ -347,10 +817,12 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 			eopm_is_valid = true;
 		}

-		rc_if_0(coder->is_match[state][pos_state], SEQ_IS_MATCH) {
-			rc_update_0(coder->is_match[state][pos_state]);
+		rc_if_0_safe(coder->is_match[state][pos_state], SEQ_IS_MATCH) {
+			/////////////////////
+			// Decode literal. //
+			/////////////////////

-			// It's a literal i.e. a single 8-bit byte.
+			rc_update_0(coder->is_match[state][pos_state]);

 			probs = literal_subcoder(coder->literal,
 					literal_context_bits, literal_pos_mask,
@ -359,37 +831,19 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,

 			if (is_literal_state(state)) {
 				// Decode literal without match byte.
-#ifdef HAVE_SMALL
+				// The "slow" version does not unroll
+				// the loop.
 	case SEQ_LITERAL:
 				do {
-					rc_bit(probs[symbol], , , SEQ_LITERAL);
+					rc_bit_safe(probs[symbol], , ,
+							SEQ_LITERAL);
 				} while (symbol < (1 << 8));
-#else
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL0);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL1);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL2);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL3);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL4);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL5);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL6);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL7);
-#endif
 			} else {
 				// Decode literal with match byte.
-				//
-				// We store the byte we compare against
-				// ("match byte") to "len" to minimize the
-				// number of variables we need to store
-				// between decoder calls.
 				len = (uint32_t)(dict_get(&dict, rep0)) << 1;

-				// The usage of "offset" allows omitting some
-				// branches, which should give tiny speed
-				// improvement on some CPUs. "offset" gets
-				// set to zero if match_bit didn't match.
 				offset = 0x100;

-#ifdef HAVE_SMALL
 	case SEQ_LITERAL_MATCHED:
 				do {
 					const uint32_t match_bit
@ -398,7 +852,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 							= offset + match_bit
 							+ symbol;

-					rc_bit(probs[subcoder_index],
+					rc_bit_safe(probs[subcoder_index],
 							offset &= ~match_bit,
 							offset &= match_bit,
 							SEQ_LITERAL_MATCHED);
@ -411,61 +865,12 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 					len <<= 1;

 				} while (symbol < (1 << 8));
-#else
-				// Unroll the loop.
-				uint32_t match_bit;
-				uint32_t subcoder_index;
-
-#	define d(seq) \
-		case seq: \
-			match_bit = len & offset; \
-			subcoder_index = offset + match_bit + symbol; \
-			rc_bit(probs[subcoder_index], \
-					offset &= ~match_bit, \
-					offset &= match_bit, \
-					seq)
-
-				d(SEQ_LITERAL_MATCHED0);
-				len <<= 1;
-				d(SEQ_LITERAL_MATCHED1);
-				len <<= 1;
-				d(SEQ_LITERAL_MATCHED2);
-				len <<= 1;
-				d(SEQ_LITERAL_MATCHED3);
-				len <<= 1;
-				d(SEQ_LITERAL_MATCHED4);
-				len <<= 1;
-				d(SEQ_LITERAL_MATCHED5);
-				len <<= 1;
-				d(SEQ_LITERAL_MATCHED6);
-				len <<= 1;
-				d(SEQ_LITERAL_MATCHED7);
-#	undef d
-#endif
 			}

-			//update_literal(state);
-			// Use a lookup table to update to literal state,
-			// since compared to other state updates, this would
-			// need two branches.
-			static const lzma_lzma_state next_state[] = {
-				STATE_LIT_LIT,
-				STATE_LIT_LIT,
-				STATE_LIT_LIT,
-				STATE_LIT_LIT,
-				STATE_MATCH_LIT_LIT,
-				STATE_REP_LIT_LIT,
-				STATE_SHORTREP_LIT_LIT,
-				STATE_MATCH_LIT,
-				STATE_REP_LIT,
-				STATE_SHORTREP_LIT,
-				STATE_MATCH_LIT,
-				STATE_REP_LIT
-			};
 			state = next_state[state];

 	case SEQ_LITERAL_WRITE:
-			if (unlikely(dict_put(&dict, symbol))) {
+			if (dict_put_safe(&dict, symbol)) {
 				coder->sequence = SEQ_LITERAL_WRITE;
 				goto out;
 			}
@ -473,64 +878,47 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 			continue;
 		}

-		// Instead of a new byte we are going to get a byte range
-		// (distance and length) which will be repeated from our
-		// output history.
+		///////////////////
+		// Decode match. //
+		///////////////////

 		rc_update_1(coder->is_match[state][pos_state]);

 	case SEQ_IS_REP:
-		rc_if_0(coder->is_rep[state], SEQ_IS_REP) {
-			// Not a repeated match
+		rc_if_0_safe(coder->is_rep[state], SEQ_IS_REP) {
+			///////////////////
+			// Simple match. //
+			///////////////////
+
 			rc_update_0(coder->is_rep[state]);
 			update_match(state);

-			// The latest three match distances are kept in
-			// memory in case there are repeated matches.
 			rep3 = rep2;
 			rep2 = rep1;
 			rep1 = rep0;

-			// Decode the length of the match.
 			len_decode(len, coder->match_len_decoder,
 					pos_state, SEQ_MATCH_LEN);

-			// Prepare to decode the highest two bits of the
-			// match distance.
 			probs = coder->dist_slot[get_dist_state(len)];
 			symbol = 1;

-#ifdef HAVE_SMALL
 	case SEQ_DIST_SLOT:
 			do {
-				rc_bit(probs[symbol], , , SEQ_DIST_SLOT);
+				rc_bit_safe(probs[symbol], , , SEQ_DIST_SLOT);
 			} while (symbol < DIST_SLOTS);
-#else
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT0);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT1);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT2);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT3);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT4);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT5);
-#endif
-			// Get rid of the highest bit that was needed for
-			// indexing of the probability array.
+
 			symbol -= DIST_SLOTS;
 			assert(symbol <= 63);

 			if (symbol < DIST_MODEL_START) {
-				// Match distances [0, 3] have only two bits.
 				rep0 = symbol;
 			} else {
-				// Decode the lowest [1, 29] bits of
-				// the match distance.
 				limit = (symbol >> 1) - 1;
 				assert(limit >= 1 && limit <= 30);
 				rep0 = 2 + (symbol & 1);

 				if (symbol < DIST_MODEL_END) {
-					// Prepare to decode the low bits for
-					// a distance of [4, 127].
 					assert(limit <= 5);
 					rep0 <<= limit;
 					assert(rep0 <= 96);
@ -549,118 +937,43 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 					symbol = 1;
 					offset = 0;
 	case SEQ_DIST_MODEL:
-#ifdef HAVE_SMALL
 					do {
-						rc_bit(probs[symbol], ,
+						rc_bit_safe(probs[symbol], ,
 							rep0 += 1U << offset,
 							SEQ_DIST_MODEL);
 					} while (++offset < limit);
-#else
-					switch (limit) {
-					case 5:
-						assert(offset == 0);
-						rc_bit(probs[symbol], ,
-							rep0 += 1U,
-							SEQ_DIST_MODEL);
-						++offset;
-						--limit;
-					case 4:
-						rc_bit(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
-						++offset;
-						--limit;
-					case 3:
-						rc_bit(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
-						++offset;
-						--limit;
-					case 2:
-						rc_bit(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
-						++offset;
-						--limit;
-					case 1:
-						// We need "symbol" only for
-						// indexing the probability
-						// array, thus we can use
-						// rc_bit_last() here to omit
-						// the unneeded updating of
-						// "symbol".
-						rc_bit_last(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
-					}
-#endif
 				} else {
-					// The distance is >= 128. Decode the
-					// lower bits without probabilities
-					// except the lowest four bits.
 					assert(symbol >= 14);
 					assert(limit >= 6);
 					limit -= ALIGN_BITS;
 					assert(limit >= 2);
 	case SEQ_DIRECT:
-					// Not worth manual unrolling
 					do {
-						rc_direct(rep0, SEQ_DIRECT);
+						rc_direct_safe(rep0,
+								SEQ_DIRECT);
 					} while (--limit > 0);

-					// Decode the lowest four bits using
-					// probabilities.
 					rep0 <<= ALIGN_BITS;
 					symbol = 1;
-#ifdef HAVE_SMALL
+
 					offset = 0;
 	case SEQ_ALIGN:
 					do {
-						rc_bit(coder->pos_align[
+						rc_bit_safe(coder->pos_align[
 								symbol], ,
 							rep0 += 1U << offset,
 							SEQ_ALIGN);
 					} while (++offset < ALIGN_BITS);
-#else
-	case SEQ_ALIGN0:
-					rc_bit(coder->pos_align[symbol], ,
-							rep0 += 1, SEQ_ALIGN0);
-	case SEQ_ALIGN1:
-					rc_bit(coder->pos_align[symbol], ,
-							rep0 += 2, SEQ_ALIGN1);
-	case SEQ_ALIGN2:
-					rc_bit(coder->pos_align[symbol], ,
-							rep0 += 4, SEQ_ALIGN2);
-	case SEQ_ALIGN3:
-					// Like in SEQ_DIST_MODEL, we don't
-					// need "symbol" for anything else
-					// than indexing the probability array.
-					rc_bit_last(coder->pos_align[symbol], ,
-							rep0 += 8, SEQ_ALIGN3);
-#endif

+					// End of payload marker
 					if (rep0 == UINT32_MAX) {
-						// End of payload marker was
-						// found. It may only be
-						// present if
-						//   - uncompressed size is
-						//     unknown or
-						//   - after known uncompressed
-						//     size amount of bytes has
-						//     been decompressed and
-						//     caller has indicated
-						//     that EOPM might be used
-						//     (it's not allowed in
-						//     LZMA2).
 						if (!eopm_is_valid) {
 							ret = LZMA_DATA_ERROR;
 							goto out;
 						}

 	case SEQ_EOPM:
-						// LZMA1 stream with
-						// end-of-payload marker.
-						rc_normalize(SEQ_EOPM);
+						rc_normalize_safe(SEQ_EOPM);
 						ret = rc_is_finished(rc)
 							? LZMA_STREAM_END
 							: LZMA_DATA_ERROR;
@ -669,36 +982,30 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				}
 			}

-			// Validate the distance we just decoded.
 			if (unlikely(!dict_is_distance_valid(&dict, rep0))) {
 				ret = LZMA_DATA_ERROR;
 				goto out;
 			}

 		} else {
+			/////////////////////
+			// Repeated match. //
+			/////////////////////
+
 			rc_update_1(coder->is_rep[state]);

-			// Repeated match
-			//
-			// The match distance is a value that we have had
-			// earlier. The latest four match distances are
-			// available as rep0, rep1, rep2 and rep3. We will
-			// now decode which of them is the new distance.
-			//
-			// There cannot be a match if we haven't produced
-			// any output, so check that first.
 			if (unlikely(!dict_is_distance_valid(&dict, 0))) {
 				ret = LZMA_DATA_ERROR;
 				goto out;
 			}

 	case SEQ_IS_REP0:
-			rc_if_0(coder->is_rep0[state], SEQ_IS_REP0) {
+			rc_if_0_safe(coder->is_rep0[state], SEQ_IS_REP0) {
 				rc_update_0(coder->is_rep0[state]);
-				// The distance is rep0.

 	case SEQ_IS_REP0_LONG:
-				rc_if_0(coder->is_rep0_long[state][pos_state],
+				rc_if_0_safe(coder->is_rep0_long
+						[state][pos_state],
 						SEQ_IS_REP0_LONG) {
 					rc_update_0(coder->is_rep0_long[
 							state][pos_state]);
@ -706,8 +1013,9 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 					update_short_rep(state);

 	case SEQ_SHORTREP:
-					if (unlikely(dict_put(&dict, dict_get(
-							&dict, rep0)))) {
+					if (dict_put_safe(&dict,
+							dict_get(&dict,
+							rep0))) {
 						coder->sequence = SEQ_SHORTREP;
 						goto out;
 					}
@ -715,8 +1023,6 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 					continue;
 				}

-				// Repeating more than one byte at
-				// distance of rep0.
 				rc_update_1(coder->is_rep0_long[
 						state][pos_state]);

@ -724,11 +1030,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				rc_update_1(coder->is_rep0[state]);

 	case SEQ_IS_REP1:
-				// The distance is rep1, rep2 or rep3. Once
-				// we find out which one of these three, it
-				// is stored to rep0 and rep1, rep2 and rep3
-				// are updated accordingly.
-				rc_if_0(coder->is_rep1[state], SEQ_IS_REP1) {
+				rc_if_0_safe(coder->is_rep1[state], SEQ_IS_REP1) {
 					rc_update_0(coder->is_rep1[state]);

 					const uint32_t distance = rep1;
@ -738,7 +1040,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				} else {
 					rc_update_1(coder->is_rep1[state]);
 	case SEQ_IS_REP2:
-					rc_if_0(coder->is_rep2[state],
+					rc_if_0_safe(coder->is_rep2[state],
 							SEQ_IS_REP2) {
 						rc_update_0(coder->is_rep2[
 								state]);
@ -763,7 +1065,6 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,

 			update_long_rep(state);

-			// Decode the length of the repeated match.
 			len_decode(len, coder->rep_len_decoder,
 					pos_state, SEQ_REP_LEN);
 		}
@ -772,13 +1073,10 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 		// Repeat from history buffer. //
 		/////////////////////////////////

-		// The length is always between these limits. There is no way
-		// to trigger the algorithm to set len outside this range.
 		assert(len >= MATCH_LEN_MIN);
 		assert(len <= MATCH_LEN_MAX);

 	case SEQ_COPY:
-		// Repeat len bytes from distance of rep0.
 		if (unlikely(dict_repeat(&dict, rep0, &len))) {
 			coder->sequence = SEQ_COPY;
 			goto out;