xz-analysis-mirror/src/xz/list.c

///////////////////////////////////////////////////////////////////////////////
//
/// \file       list.c
/// \brief      Listing information about .xz files
//
//  Author:     Lasse Collin
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////

#include "private.h"
#include "tuklib_integer.h"


/// Running sums over every file that has been listed successfully.
/// They are printed as the totals once all files have been processed.
/// print_info_adv() additionally reads the "files" counter to show
/// the number of the file being listed.
static struct {
	/// Number of files listed so far
	uint64_t files;

	/// Sum of the Stream counts
	uint64_t streams;

	/// Sum of the Block counts
	uint64_t blocks;

	/// Sum of the compressed file sizes
	uint64_t compressed_size;

	/// Sum of the uncompressed sizes
	uint64_t uncompressed_size;

	/// Bit mask of every Check ID seen in any file
	uint32_t checks;
} totals = {
	.files = 0,
	.streams = 0,
	.blocks = 0,
	.compressed_size = 0,
	.uncompressed_size = 0,
	.checks = 0,
};


/// \brief      Parse the Index(es) from the given .xz file
///
/// The file is walked backwards starting from its end. Each pass of the
/// main loop handles one Stream: it skips Stream Padding, decodes the
/// Stream Footer, decodes the Index that the Footer's Backward Size points
/// to, and finally decodes the Stream Header and compares its Stream Flags
/// against the Footer. The decoded Indexes are concatenated so that the
/// result describes the Streams in file order.
///
/// All errors are reported with message_error() before returning.
///
/// \param      idx     If decoding is successful, *idx will be set to point
///                     to lzma_index containing the decoded information.
///                     On error, *idx is not modified.
/// \param      pair    Input file. pair->src_st.st_size is used as the file
///                     size and pair->src_name in the error messages.
///
/// \return     On success, false is returned. On error, true is returned.
///
// TODO: This function is pretty big. liblzma should have a function that
// takes a callback function to parse the Index(es) from a .xz file to make
// it easy for applications.
static bool
parse_indexes(lzma_index **idx, file_pair *pair)
{
	if (pair->src_st.st_size <= 0) {
		message_error(_("%s: File is empty"), pair->src_name);
		return true;
	}

	// A Stream needs at least a Stream Header and a Stream Footer,
	// which are both LZMA_STREAM_HEADER_SIZE bytes.
	if (pair->src_st.st_size < 2 * LZMA_STREAM_HEADER_SIZE) {
		message_error(_("%s: Too small to be a valid .xz file"),
				pair->src_name);
		return true;
	}

	// Buffer for everything that is read from the file with io_pread()
	io_buf buf;
	lzma_stream_flags header_flags;
	lzma_stream_flags footer_flags;
	lzma_ret ret;

	// lzma_stream for the Index decoder
	lzma_stream strm = LZMA_STREAM_INIT;

	// All Indexes decoded so far
	lzma_index *combined_index = NULL;

	// The Index currently being decoded
	lzma_index *this_index = NULL;

	// Current position in the file. We parse the file backwards so
	// initialize it to point to the end of the file.
	off_t pos = pair->src_st.st_size;

	// Each loop iteration decodes one Index.
	do {
		// Check that there is enough data left to contain at least
		// the Stream Header and Stream Footer. This check cannot
		// fail in the first pass of this loop.
		if (pos < 2 * LZMA_STREAM_HEADER_SIZE) {
			message_error("%s: %s", pair->src_name,
					message_strm(LZMA_DATA_ERROR));
			goto error;
		}

		pos -= LZMA_STREAM_HEADER_SIZE;
		lzma_vli stream_padding = 0;

		// Locate the Stream Footer. There may be Stream Padding which
		// we must skip when reading backwards.
		while (true) {
			if (pos < LZMA_STREAM_HEADER_SIZE) {
				message_error("%s: %s", pair->src_name,
						message_strm(
							LZMA_DATA_ERROR));
				goto error;
			}

			if (io_pread(pair, &buf,
					LZMA_STREAM_HEADER_SIZE, pos))
				goto error;

			// Stream Padding is always a multiple of four bytes.
			// If the last 32-bit word of what we read is
			// non-zero, buf holds the Stream Footer and pos
			// points to its first byte.
			int i = 2;
			if (buf.u32[i] != 0)
				break;

			// To avoid calling io_pread() for every four bytes
			// of Stream Padding, take advantage that we read
			// 12 bytes (LZMA_STREAM_HEADER_SIZE) already and
			// check them too before calling io_pread() again.
			// Every zero word moves pos back by four bytes, so
			// the next io_pread() ends right after the first
			// word that wasn't found to be zero.
			do {
				stream_padding += 4;
				pos -= 4;
				--i;
			} while (i >= 0 && buf.u32[i] == 0);
		}

		// Decode the Stream Footer.
		ret = lzma_stream_footer_decode(&footer_flags, buf.u8);
		if (ret != LZMA_OK) {
			message_error("%s: %s", pair->src_name,
					message_strm(ret));
			goto error;
		}

		// Check that the size of the Index field looks sane.
		// Before the Stream Footer there has to be room for
		// the Index and the Stream Header.
		lzma_vli index_size = footer_flags.backward_size;
		if ((lzma_vli)(pos) < index_size + LZMA_STREAM_HEADER_SIZE) {
			message_error("%s: %s", pair->src_name,
					message_strm(LZMA_DATA_ERROR));
			goto error;
		}

		// Set pos to the beginning of the Index.
		pos -= index_size;

		// See how much memory we can use for decoding this Index.
		// The memory used by the already decoded Indexes is
		// subtracted from the limit.
		uint64_t memlimit = hardware_memlimit_get();
		uint64_t memused = 0;
		if (combined_index != NULL) {
			memused = lzma_index_memused(combined_index);
			if (memused > memlimit)
				message_bug();

			memlimit -= memused;
		}

		// Decode the Index.
		ret = lzma_index_decoder(&strm, &this_index, memlimit);
		if (ret != LZMA_OK) {
			message_error("%s: %s", pair->src_name,
					message_strm(ret));
			goto error;
		}

		// Feed the Index to the decoder one buffer at a time.
		// pos moves forward and index_size counts down as the
		// input is read.
		do {
			// Don't give the decoder more input than the
			// Index size.
			strm.avail_in = MIN(IO_BUFFER_SIZE, index_size);
			if (io_pread(pair, &buf, strm.avail_in, pos))
				goto error;

			pos += strm.avail_in;
			index_size -= strm.avail_in;

			strm.next_in = buf.u8;
			ret = lzma_code(&strm, LZMA_RUN);

		} while (ret == LZMA_OK);

		// If the decoding seems to be successful, check also that
		// the Index decoder consumed as much input as indicated
		// by the Backward Size field.
		if (ret == LZMA_STREAM_END)
			if (index_size != 0 || strm.avail_in != 0)
				ret = LZMA_DATA_ERROR;

		if (ret != LZMA_STREAM_END) {
			// LZMA_BUF_ERROR means that the Index decoder
			// would have liked more input than what the Index
			// size should be according to Stream Footer.
			// The message for LZMA_DATA_ERROR makes more
			// sense in that case.
			if (ret == LZMA_BUF_ERROR)
				ret = LZMA_DATA_ERROR;

			message_error("%s: %s", pair->src_name,
					message_strm(ret));

			// If the error was too low memory usage limit,
			// show also how much memory would have been needed.
			// The sum is clamped to UINT64_MAX instead of
			// letting it wrap around.
			if (ret == LZMA_MEMLIMIT_ERROR) {
				uint64_t needed = lzma_memusage(&strm);
				if (UINT64_MAX - needed < memused)
					needed = UINT64_MAX;
				else
					needed += memused;

				message_mem_needed(V_ERROR, needed);
			}

			goto error;
		}

		// Decode the Stream Header and check that its Stream Flags
		// match the Stream Footer.
		//
		// At this point pos is at the end of the Index. Step back
		// over the Index and over the size of the Stream Header,
		// then verify that the Blocks described by the Index fit
		// into what is left before stepping over them too.
		pos -= footer_flags.backward_size + LZMA_STREAM_HEADER_SIZE;
		if ((lzma_vli)(pos) < lzma_index_total_size(this_index)) {
			message_error("%s: %s", pair->src_name,
					message_strm(LZMA_DATA_ERROR));
			goto error;
		}

		pos -= lzma_index_total_size(this_index);
		if (io_pread(pair, &buf, LZMA_STREAM_HEADER_SIZE, pos))
			goto error;

		ret = lzma_stream_header_decode(&header_flags, buf.u8);
		if (ret != LZMA_OK) {
			message_error("%s: %s", pair->src_name,
					message_strm(ret));
			goto error;
		}

		ret = lzma_stream_flags_compare(&header_flags, &footer_flags);
		if (ret != LZMA_OK) {
			message_error("%s: %s", pair->src_name,
					message_strm(ret));
			goto error;
		}

		// Store the decoded Stream Flags into this_index. This is
		// needed so that we can print which Check is used in each
		// Stream.
		ret = lzma_index_stream_flags(this_index, &footer_flags);
		if (ret != LZMA_OK)
			message_bug();

		// Store also the size of the Stream Padding field. It is
		// needed to show the offsets of the Streams correctly.
		ret = lzma_index_stream_padding(this_index, stream_padding);
		if (ret != LZMA_OK)
			message_bug();

		if (combined_index != NULL) {
			// Append the earlier decoded Indexes
			// after this_index.
			ret = lzma_index_cat(
					this_index, combined_index, NULL);
			if (ret != LZMA_OK) {
				message_error("%s: %s", pair->src_name,
						message_strm(ret));
				goto error;
			}
		}

		// this_index now holds everything decoded so far. Clear
		// this_index so that the error path won't free the same
		// Index twice.
		combined_index = this_index;
		this_index = NULL;

	// pos now points to the beginning of the Stream that was just
	// handled. Continue if there is still data before it.
	} while (pos > 0);

	lzma_end(&strm);

	// All OK. Make combined_index available to the caller.
	*idx = combined_index;
	return false;

error:
	// Something went wrong, free the allocated memory.
	lzma_end(&strm);
	lzma_index_end(combined_index, NULL);
	lzma_index_end(this_index, NULL);
	return true;
}


/// \brief      Format the compression ratio as a string
///
/// The format differs slightly from the one used in message.c.
///
/// \param      compressed_size     Size of the compressed data
/// \param      uncompressed_size   Size of the uncompressed data
///
/// \return     The ratio with three decimals, or "---" if the ratio cannot
///             be calculated (uncompressed_size is zero) or is bigger than
///             9.999. A formatted ratio is stored in a static buffer, so it
///             is overwritten by the next call.
static const char *
get_ratio(uint64_t compressed_size, uint64_t uncompressed_size)
{
	// Big enough for "9.999" and the terminating '\0'.
	static char text[6];

	// Avoid dividing by zero.
	if (uncompressed_size == 0)
		return "---";

	const double value = (double)compressed_size
			/ (double)uncompressed_size;

	// Anything bigger wouldn't fit into the five-character column.
	if (value > 9.999)
		return "---";

	snprintf(text, sizeof(text), "%.3f", value);
	return text;
}


/// Names of the integrity Checks, indexed by the Check ID. The IDs that
/// have no name here are shown as "Unknown-N" where N is the Check ID.
static const char check_names[LZMA_CHECK_ID_MAX + 1][12] = {
	[0]  = "None",
	[1]  = "CRC32",
	[2]  = "Unknown-2",
	[3]  = "Unknown-3",
	[4]  = "CRC64",
	[5]  = "Unknown-5",
	[6]  = "Unknown-6",
	[7]  = "Unknown-7",
	[8]  = "Unknown-8",
	[9]  = "Unknown-9",
	[10] = "SHA-256",
	[11] = "Unknown-11",
	[12] = "Unknown-12",
	[13] = "Unknown-13",
	[14] = "Unknown-14",
	[15] = "Unknown-15",
};


/// \brief      Get a comma-separated list of Check names
///
/// \param      checks  Bit mask of Checks to print: bit N selects
///                     check_names[N]. Zero is treated the same as
///                     having only the bit for "None" set.
/// \param      space_after_comma
///                     It's better to not use spaces in table-like listings,
///                     but in more verbose formats a space after a comma
///                     is good for readability.
///
/// \return     Pointer to a static buffer holding the list. The buffer
///             is overwritten by the next call.
static const char *
get_check_names(uint32_t checks, bool space_after_comma)
{
	// The totals line of the robot mode is printed even when no file
	// was listed successfully, in which case totals.checks is still
	// zero. Show "None" then instead of failing an assertion or
	// returning the text left in the buffer by an earlier call.
	if (checks == 0)
		checks = 1;

	static char buf[sizeof(check_names)];
	char *pos = buf;
	size_t left = sizeof(buf);

	// Start with an empty string so that the result never depends on
	// what the previous call wrote, even if only bits that have no
	// name in check_names[] are set.
	buf[0] = '\0';

	const char *sep = space_after_comma ? ", " : ",";
	bool comma = false;

	for (size_t i = 0; i <= LZMA_CHECK_ID_MAX; ++i) {
		if (checks & (UINT32_C(1) << i)) {
			// The separator is omitted before the first name.
			my_snprintf(&pos, &left, "%s%s",
					comma ? sep : "", check_names[i]);
			comma = true;
		}
	}

	return buf;
}


/// \brief      Print the Check value of one Block
///
/// The value is read from the .xz file, which requires a seek for each
/// Block. Thus listing the Check values of all Blocks can be slow.
///
/// \param      pair    Input file
/// \param      iter    Iterator positioned at the Block whose Check value
///                     should be printed.
///
/// \return     False on success, true on I/O error.
static bool
print_check_value(file_pair *pair, const lzma_index_iter *iter)
{
	// With no integrity Check there is nothing to read from the file.
	if (iter->stream.flags->check == LZMA_CHECK_NONE) {
		printf("---");
		return false;
	}

	// The Check field is the last field of the Block.
	const uint32_t check_size
			= lzma_check_size(iter->stream.flags->check);
	const off_t check_pos = iter->block.compressed_file_offset
			+ iter->block.total_size - check_size;

	io_buf data;
	if (io_pread(pair, &data, check_size, check_pos))
		return true;

	switch (check_size) {
	case 4:
		// CRC32 is little endian. Guess that the possible future
		// 32-bit Checks are little endian too. A wrong guess
		// shouldn't be a big problem.
		printf("%08" PRIx32, conv32le(data.u32[0]));
		break;

	case 8:
		// The same guess is used for CRC64 and other 64-bit Checks.
		printf("%016" PRIx64, conv64le(data.u64[0]));
		break;

	default:
		// Everything else is printed byte by byte.
		for (size_t i = 0; i < check_size; ++i)
			printf("%02x", data.u8[i]);

		break;
	}

	return false;
}


/// \brief      Print the one-line summary of a file (plain --list)
///
/// The column headings are printed before the first file only.
///
/// \param      idx     Decoded Index(es) of the file
/// \param      pair    Input file; only the filename is used
static void
print_info_basic(const lzma_index *idx, file_pair *pair)
{
	static bool headings_displayed = false;
	if (!headings_displayed) {
		// TRANSLATORS: These are column titles. From Strms (Streams)
		// to Ratio, the columns are right aligned. Check and Filename
		// are left aligned. If you need longer words, it's OK to
		// use two lines here. Test with xz --list.
		puts(_("Strms  Blocks   Compressed Uncompressed  Ratio  "
				"Check   Filename"));
		headings_displayed = true;
	}

	const uint64_t file_size = lzma_index_file_size(idx);
	const uint64_t uncompressed_size = lzma_index_uncompressed_size(idx);

	// Every number uses its own string buffer slot (0-3) because
	// the strings need to stay valid until printf() has been called.
	const char *streams = uint64_to_str(lzma_index_stream_count(idx), 0);
	const char *blocks = uint64_to_str(lzma_index_block_count(idx), 1);
	const char *compressed = uint64_to_nicestr(file_size,
			NICESTR_B, NICESTR_TIB, false, 2);
	const char *uncompressed = uint64_to_nicestr(uncompressed_size,
			NICESTR_B, NICESTR_TIB, false, 3);
	const char *ratio = get_ratio(file_size, uncompressed_size);
	const char *checks = get_check_names(lzma_index_checks(idx), false);

	printf("%5s %7s  %11s  %11s  %5s  %-7s %s\n",
			streams, blocks, compressed, uncompressed,
			ratio, checks, pair->src_name);
}


/// \brief      Print the overall information lines of the verbose listing
///
/// This is shared by the per-file output and the totals.
///
/// \param      stream_count        Number of Streams
/// \param      block_count         Number of Blocks
/// \param      compressed_size     Compressed size in bytes
/// \param      uncompressed_size   Uncompressed size in bytes
/// \param      checks              Bit mask of the Checks in use
static void
print_adv_helper(uint64_t stream_count, uint64_t block_count,
		uint64_t compressed_size, uint64_t uncompressed_size,
		uint32_t checks)
{
	// These two have buffers of their own, so they can be
	// prepared up front.
	const char *ratio = get_ratio(compressed_size, uncompressed_size);
	const char *check_list = get_check_names(checks, true);

	// The numbers all use the string buffer slot 0, thus each of
	// them has to be printed before the next one is converted.
	printf(_("  Stream count:       %s\n"),
			uint64_to_str(stream_count, 0));

	printf(_("  Block count:        %s\n"),
			uint64_to_str(block_count, 0));

	printf(_("  Compressed size:    %s\n"),
			uint64_to_nicestr(compressed_size,
				NICESTR_B, NICESTR_TIB, true, 0));

	printf(_("  Uncompressed size:  %s\n"),
			uint64_to_nicestr(uncompressed_size,
				NICESTR_B, NICESTR_TIB, true, 0));

	printf(_("  Ratio:              %s\n"), ratio);
	printf(_("  Check:              %s\n"), check_list);
}


static void
print_info_adv(const lzma_index *idx, file_pair *pair)
{
	// Print an empty line between files.
	static bool first_filename_printed = false;
	if (!first_filename_printed)
		first_filename_printed = true;
	else
		putchar('\n');

	// Print the filename and overall information.
	printf("%s (%" PRIu64 "):\n", pair->src_name, totals.files);
	print_adv_helper(lzma_index_stream_count(idx),
			lzma_index_block_count(idx),
			lzma_index_file_size(idx),
			lzma_index_uncompressed_size(idx),
			lzma_index_checks(idx));

	// TODO: The rest of this function needs some work. Currently
	// the offsets are not printed, which could be useful even when
	// printed in a less accurate format. On the other hand, maybe
	// this should print the information with exact byte values,
	// or maybe there should be at least an option to do that.
	//
	// We could also display some other info. E.g. it could be useful
	// to quickly see how big is the biggest Block (uncompressed size)
	// and if all Blocks have Compressed Size and Uncompressed Size
	// fields present, which can be used e.g. for multithreaded
	// decompression.

	// Avoid printing Stream and Block lists when they wouldn't be useful.
	bool show_blocks = false;
	if (lzma_index_stream_count(idx) > 1) {
		puts(_("  Streams:"));
		puts(_("      Number      Blocks    Compressed   "
				"Uncompressed   Ratio   Check"));

		lzma_index_iter iter;
		lzma_index_iter_init(&iter, idx);
		while (!lzma_index_iter_next(&iter, LZMA_INDEX_ITER_STREAM)) {
			if (iter.stream.block_count > 1)
				show_blocks = true;

			printf("    %8s  %10s   %11s    %11s   %5s   %s\n",
				uint64_to_str(iter.stream.number, 0),
				uint64_to_str(iter.stream.block_count, 1),
				uint64_to_nicestr(
					iter.stream.compressed_size,
					NICESTR_B, NICESTR_TIB, false, 2),
				uint64_to_nicestr(
					iter.stream.uncompressed_size,
					NICESTR_B, NICESTR_TIB, false, 3),
				get_ratio(iter.stream.compressed_size,
					iter.stream.uncompressed_size),
				check_names[iter.stream.flags->check]);
		}
	}

	if (show_blocks || lzma_index_block_count(idx)
				> lzma_index_stream_count(idx)
			|| message_verbosity_get() >= V_DEBUG) {
		puts(_("  Blocks:"));
		// FIXME: Number in Stream/file, which one is better?
		puts(_("      Stream      Number    Compressed   "
				"Uncompressed   Ratio   Check"));

		lzma_index_iter iter;
		lzma_index_iter_init(&iter, idx);
		while (!lzma_index_iter_next(&iter, LZMA_INDEX_ITER_BLOCK)) {
			printf("    %8s  %10s   %11s    %11s   %5s   %-7s",
				uint64_to_str(iter.stream.number, 0),
				uint64_to_str(iter.block.number_in_stream, 1),
				uint64_to_nicestr(iter.block.total_size,
					NICESTR_B, NICESTR_TIB, false, 2),
				uint64_to_nicestr(
					iter.block.uncompressed_size,
					NICESTR_B, NICESTR_TIB, false, 3),
				get_ratio(iter.block.total_size,
					iter.block.uncompressed_size),
				check_names[iter.stream.flags->check]);

			if (message_verbosity_get() >= V_DEBUG)
				if (print_check_value(pair, &iter))
					return;

			putchar('\n');
		}
	}
}


/// \brief      Print the machine-readable listing of a file (--robot)
///
/// A "file" line is always printed. At the V_VERBOSE verbosity level and
/// above, one "stream" line per Stream and one "block" line per Block
/// follow. At V_DEBUG the Check value is appended to each "block" line;
/// if reading a Check value fails, printing stops in the middle of the line.
///
/// \param      idx     Decoded Index(es) of the file
/// \param      pair    Input file
static void
print_info_robot(const lzma_index *idx, file_pair *pair)
{
	const uint64_t file_size = lzma_index_file_size(idx);
	const uint64_t uncompressed_size = lzma_index_uncompressed_size(idx);

	printf("file\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64
			"\t%s\t%s\t%s\n",
			lzma_index_stream_count(idx),
			lzma_index_block_count(idx),
			file_size,
			uncompressed_size,
			get_ratio(file_size, uncompressed_size),
			get_check_names(lzma_index_checks(idx), false),
			pair->src_name);

	// The details are shown only when asked with --verbose.
	if (message_verbosity_get() < V_VERBOSE)
		return;

	lzma_index_iter iter;
	lzma_index_iter_init(&iter, idx);

	// First all the Streams...
	while (!lzma_index_iter_next(&iter, LZMA_INDEX_ITER_STREAM)) {
		const char *ratio = get_ratio(iter.stream.compressed_size,
				iter.stream.uncompressed_size);

		printf("stream\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64
				"\t%" PRIu64 "\t%" PRIu64
				"\t%s\t%" PRIu64 "\t%s\n",
				iter.stream.number,
				iter.stream.compressed_offset,
				iter.stream.uncompressed_offset,
				iter.stream.compressed_size,
				iter.stream.uncompressed_size,
				ratio,
				iter.stream.padding,
				check_names[iter.stream.flags->check]);
	}

	// ...then start over and go through all the Blocks.
	lzma_index_iter_rewind(&iter);
	while (!lzma_index_iter_next(&iter, LZMA_INDEX_ITER_BLOCK)) {
		const char *ratio = get_ratio(iter.block.total_size,
				iter.block.uncompressed_size);

		printf("block\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64
				"\t%" PRIu64 "\t%" PRIu64
				"\t%" PRIu64 "\t%" PRIu64 "\t%s\t%s",
				iter.stream.number,
				iter.block.number_in_stream,
				iter.block.number_in_file,
				iter.block.compressed_file_offset,
				iter.block.uncompressed_file_offset,
				iter.block.total_size,
				iter.block.uncompressed_size,
				ratio,
				check_names[iter.stream.flags->check]);

		// The Check value is an extra column of the debug level.
		if (message_verbosity_get() >= V_DEBUG) {
			putchar('\t');
			if (print_check_value(pair, &iter))
				return;
		}

		putchar('\n');
	}
}


/// \brief      Add two uint64_t values without wrapping around
///
/// \return     a + b, or UINT64_MAX if the sum doesn't fit into uint64_t.
static uint64_t
add_saturating(uint64_t a, uint64_t b)
{
	return b > UINT64_MAX - a ? UINT64_MAX : a + b;
}


/// \brief      Add the information of one file to the totals
///
/// The sums are clamped to UINT64_MAX. The sizes stored in an Index come
/// from the input file, so a few files claiming huge sizes could otherwise
/// make the totals wrap around and show misleadingly small values.
///
/// \param      idx     Decoded Index(es) of the file that was listed
static void
update_totals(const lzma_index *idx)
{
	// The file count grows by one per call, so it isn't clamped.
	++totals.files;

	totals.streams = add_saturating(totals.streams,
			lzma_index_stream_count(idx));
	totals.blocks = add_saturating(totals.blocks,
			lzma_index_block_count(idx));
	totals.compressed_size = add_saturating(totals.compressed_size,
			lzma_index_file_size(idx));
	totals.uncompressed_size = add_saturating(totals.uncompressed_size,
			lzma_index_uncompressed_size(idx));

	// Collect every Check type that has been seen.
	totals.checks |= lzma_index_checks(idx);
}


/// \brief      Print the totals line of the plain --list output
///
/// A separator line of dashes is printed first. The totals use the same
/// columns as the lines printed by print_info_basic(), except that the
/// number of files is shown in place of the filename.
static void
print_totals_basic(void)
{
	// Print a separator line.
	char line[80];
	memset(line, '-', sizeof(line));
	line[sizeof(line) - 1] = '\0';
	puts(line);

	// Print the totals except the file count, which needs
	// special handling.
	printf("%5s %7s  %11s  %11s  %5s  %-7s ",
			uint64_to_str(totals.streams, 0),
			uint64_to_str(totals.blocks, 1),
			uint64_to_nicestr(totals.compressed_size,
				NICESTR_B, NICESTR_TIB, false, 2),
			uint64_to_nicestr(totals.uncompressed_size,
				NICESTR_B, NICESTR_TIB, false, 3),
			get_ratio(totals.compressed_size,
				totals.uncompressed_size),
			get_check_names(totals.checks, false));

	// Since we print totals only when there are at least two files,
	// the English message will always use "%s files". But some other
	// languages need different forms for different plurals so we
	// have to translate this string still.
	//
	// Both forms have to end with a newline: a language may use the
	// form matching the first string also for numbers other than one,
	// and the line has to be terminated in that case too.
	//
	// The number that selects the plural form is unsigned long. If
	// the file count doesn't fit, a smaller number of at least one
	// million with the same last six digits is used instead.
	//
	// TRANSLATORS: This simply indicates the number of files shown
	// by --list even though the format string uses %s.
	printf(N_("%s file\n", "%s files\n",
			totals.files <= ULONG_MAX ? totals.files
				: (totals.files % 1000000) + 1000000),
			uint64_to_str(totals.files, 0));

	return;
}


/// \brief      Print the totals of the verbose listing
///
/// The totals are separated from the last file with an empty line and
/// use the same layout as the overall information of a single file.
static void
print_totals_adv(void)
{
	putchar('\n');
	puts(_("Totals:"));

	const char *file_count = uint64_to_str(totals.files, 0);
	printf(_("  Number of files:    %s\n"), file_count);

	print_adv_helper(totals.streams, totals.blocks,
			totals.compressed_size, totals.uncompressed_size,
			totals.checks);
}


/// \brief      Print the machine-readable "totals" line (--robot)
///
/// The columns are the Stream count, Block count, compressed size,
/// uncompressed size, ratio, Check names, and finally the file count.
static void
print_totals_robot(void)
{
	const char *ratio = get_ratio(totals.compressed_size,
			totals.uncompressed_size);
	const char *check_list = get_check_names(totals.checks, false);

	printf("totals\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64
			"\t%s\t%s\t%" PRIu64 "\n",
			totals.streams,
			totals.blocks,
			totals.compressed_size,
			totals.uncompressed_size,
			ratio,
			check_list,
			totals.files);
}


/// \brief      Print the totals after all files have been listed
///
/// In the robot mode the totals are always printed. Otherwise they are
/// printed only if more than one file was listed, using the format that
/// matches the verbosity level.
extern void
list_totals(void)
{
	// Always print totals in --robot mode. It can be convenient
	// in some cases and doesn't complicate usage of the
	// single-file case much.
	if (opt_robot) {
		print_totals_robot();
		return;
	}

	// With a single file the totals would only repeat what
	// was already shown.
	if (totals.files <= 1)
		return;

	if (message_verbosity_get() > V_WARNING)
		print_totals_adv();
	else
		print_totals_basic();
}


/// \brief      List information about one .xz file
///
/// The Index(es) of the file are parsed, the totals are updated, and the
/// information is printed in the format selected by --robot and the
/// verbosity level. Nothing is printed to stdout if the file cannot be
/// opened or parsed.
///
/// \param      filename    Name of the file to list. "-" (standard input)
///                         is not supported.
extern void
list_file(const char *filename)
{
	const bool format_ok = opt_format == FORMAT_XZ
			|| opt_format == FORMAT_AUTO;
	if (!format_ok)
		message_fatal(_("--list works only on .xz files "
				"(--format=xz or --format=auto)"));

	if (strcmp(filename, "-") == 0) {
		message_error(_("--list does not support reading from "
				"standard input"));
		return;
	}

	if (is_empty_filename(filename))
		return;

	// Set opt_stdout so that io_open() won't create a new file.
	// Disable also sparse mode so that it doesn't remove O_APPEND
	// from stdout.
	opt_stdout = true;
	io_no_sparse();

	file_pair *pair = io_open(filename);
	if (pair == NULL)
		return;

	lzma_index *idx;
	if (parse_indexes(&idx, pair)) {
		// Parsing failed, so idx wasn't set and there is
		// nothing to list or free.
		io_close(pair, false);
		return;
	}

	// Update the totals that are displayed after all
	// the individual files have been listed.
	update_totals(idx);

	// We have three main modes:
	//  - --robot, which has submodes if --verbose is specified
	//     once or twice
	//  - Normal --list without --verbose
	//  - --list with one or two --verbose
	if (opt_robot)
		print_info_robot(idx, pair);
	else if (message_verbosity_get() <= V_WARNING)
		print_info_basic(idx, pair);
	else
		print_info_adv(idx, pair);

	lzma_index_end(idx, NULL);
	io_close(pair, false);
}