From 9ffdb5f006b622d79b1562c29669c5b515d7ca02 Mon Sep 17 00:00:00 2001 From: Jia Tan Date: Wed, 1 Nov 2023 14:47:05 +0800 Subject: [PATCH] xz: Parse directories in recursive mode. This directory parsing method prioritizes lower memory usage and file descriptor utilization at the cost of more complicated code and a higher number of small allocations. This method makes no recursive calls and instead keeps a queue of directories to parse. Only one directory file descriptor is ever needed at one time. The directory_iterator abstracts the implementation of the directory parsing to allow for an easy interface for both POSIX and MSVC. Currently the MSVC builds suffer from MAX_PATH being limited to 260 by default. This restricts the usefulness of recursive mode on Windows. A user can edit a registry config in Windows 10, Version 1607 and later to remove this low path limit. Alternatively, we can prefix the absolute path with "\\?\" to also remove the restriction. Note, this restriction also applies to the compatibility functions so MSVC builds cannot read or write to files with paths longer than 260 characters. 
--- src/xz/file_io.c | 176 ++++++++++++++++++++++++++++++++++++++++-- src/xz/file_io.h | 44 +++++++++++ src/xz/main.c | 195 ++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 400 insertions(+), 15 deletions(-) diff --git a/src/xz/file_io.c b/src/xz/file_io.c index cea83969..680412c1 100644 --- a/src/xz/file_io.c +++ b/src/xz/file_io.c @@ -21,6 +21,10 @@ static bool warn_fchown; #endif +#ifdef HAVE_DIRENT_H +# include <dirent.h> +#endif + #if defined(HAVE_FUTIMES) || defined(HAVE_FUTIMESAT) || defined(HAVE_UTIMES) # include <sys/time.h> #elif defined(HAVE__FUTIME) @@ -794,7 +798,7 @@ io_open_src_real(file_pair *pair) if (was_symlink) message_warning(_("%s: Is a symbolic link, " "skipping"), pair->src_name); - else { + else #endif { #ifdef _WIN32 @@ -805,7 +809,7 @@ io_open_src_real(file_pair *pair) if (errno == EACCES) { pair->is_directory = should_parse_dir_windows( pair->src_name); - return pair->is_directory; + return !pair->is_directory; } #else // Something else than O_NOFOLLOW failing @@ -815,7 +819,6 @@ io_open_src_real(file_pair *pair) strerror(errno)); #endif } - return true; } @@ -847,9 +850,13 @@ io_open_src_real(file_pair *pair) // Do not allow symlinks with recursive mode because this // could lead to a loop in the file system and thus infinite // recursion. If a symlink is detected, skip it. 
- // S_ISLNK and lstat() are not available with MSVC so these need - // to be in an #ifdef + // S_ISLNK and lstat() are not available with MSVC so these + // need to be in an #ifdef if (follow_symlinks) { +#ifdef _WIN32 + if (!should_parse_dir_windows(pair->src_name)) + goto error; +#else if (lstat(pair->src_name, &pair->src_st) != 0) goto error_msg; @@ -858,6 +865,7 @@ io_open_src_real(file_pair *pair) "directory, skipping"), pair->src_name); goto error; } +#endif } (void)close(pair->src_fd); @@ -1567,3 +1575,161 @@ io_write(file_pair *pair, const io_buf *buf, size_t size) return io_write_buf(pair, buf->u8, size); } + + +#if defined(_MSC_VER) || defined(HAVE_DIRENT_H) +struct directory_iter_s { +#if defined(_MSC_VER) + HANDLE dir; + + // The path must be saved because the call to + // directory_iterator_init() does not actually open + // the directory HANDLE. There is not a way to open + // the directory without reading the first entry. + // Instead, the search path is prepared in + // directory_iterator_init() so the first call to + // directory_iter_next() will be able to use the saved + // path. + char *path; + + // Windows uses FindFirstFile() to do the first search and + // open the HANDLE to the directory. After that, FindNextFile() + // must be used to continue the search. So this flag marks if + // FindFirstFile() or FindNextFile() should be used. + bool first; +#elif defined(HAVE_DIRENT_H) + DIR *dir; +#endif +}; + + +extern directory_iter * +directory_iterator_init(const char *path) +{ + directory_iter *iter = xmalloc(sizeof(directory_iter)); + +#ifdef _MSC_VER + iter->first = true; + + const size_t path_len = strlen(path); + char* path_search = xmalloc(path_len + 3); + memcpy(path_search, path, path_len); + + // The Windows directory search functions take a wildcard pattern + // instead of just the directory name. Since we want all files in + // the directory, we need to append the wildcard character (*) to + // the end of the path. 
+ // + // Note: It does not matter if the path parameter ends with the + // path separator. The search path is not displayed and the + // proper path name extension is handled elsewhere. + path_search[path_len] = PATH_SEP; + path_search[path_len + 1] = '*'; + path_search[path_len + 2] = '\0'; + + iter->path = path_search; +#else + // On some platforms, opendir() can be interrupted so it is safest + // to block signals here. + signals_block(); + iter->dir = opendir(path); + signals_unblock(); + + if (iter->dir == NULL) { + message_error(_("%s: Error opening the directory: %s"), + path, strerror(errno)); + free(iter); + return NULL; + } +#endif + return iter; +} + + +extern bool +directory_iter_next(directory_iter *iter, char *entry, size_t *entry_len) +{ + bool next = true; + char *next_entry; + +#ifdef _MSC_VER + WIN32_FIND_DATA dir_entry; + if (iter->first) { + iter->dir = FindFirstFile(iter->path, &dir_entry); + + // The existence of the directory is checked in + // io_open_src_real() so it's most likely this + // is an empty directory. + if (iter->dir == INVALID_HANDLE_VALUE) + next = false; + + iter->first = false; + } + else { + next = FindNextFile(iter->dir, &dir_entry); + } + + next_entry = dir_entry.cFileName; +#else + // readdir() returns NULL both on error and when the directory + // has been read to completion. POSIX says to set errno to 0 + // before the call and to check it afterwards to tell the two + // cases apart. + errno = 0; + struct dirent *dir_entry = readdir(iter->dir); + + if (dir_entry == NULL) { + // errno is still 0 at the end of the directory. Comparing + // against a previously saved errno would miss an error + // whose code equals the stale value, so only the value + // left behind by this readdir() call is inspected. 
+ if (errno != 0) + message_error(_("Error reading directory entry: %s"), + strerror(errno)); + next = false; + } + + next_entry = next ? dir_entry->d_name : NULL; +#endif + + if (next) { + const size_t next_entry_len = strlen(next_entry); + + if (*entry_len <= next_entry_len) { + message_error(_("Unexpected directory entry " + "length.")); + *entry_len = 0; + return true; + } + + // Copy the name together with its null terminator. + memcpy(entry, next_entry, next_entry_len + 1); + *entry_len = next_entry_len; + } + + return next; +} + + +extern void +directory_iter_close(directory_iter *iter) +{ + if (iter != NULL) { +#ifdef _MSC_VER + if (iter->dir != INVALID_HANDLE_VALUE + && !FindClose(iter->dir)) { + DWORD err = GetLastError(); + message_windows_error("Error closing directory", err); + } + + free(iter->path); +#else + if(closedir(iter->dir)) + message_error(_("Error closing directory: %s"), + strerror(errno)); +#endif + free(iter); + } +} + +#endif //defined(_MSC_VER) || defined(HAVE_DIRENT_H) diff --git a/src/xz/file_io.h b/src/xz/file_io.h index 812b677f..40ed7577 100644 --- a/src/xz/file_io.h +++ b/src/xz/file_io.h @@ -26,6 +26,9 @@ # define stat _stat64 # define fstat _fstat64 # define off_t __int64 +# define PATH_SEP '\\' +#else +# define PATH_SEP '/' #endif @@ -193,3 +196,44 @@ extern bool io_pread(file_pair *pair, io_buf *buf, size_t size, uint64_t pos); /// \return On success, zero is returned. On error, -1 is returned /// and error message printed. extern bool io_write(file_pair *pair, const io_buf *buf, size_t size); + +/// Opaque struct representing a directory iterator. This should be used +/// with directory_iterator_init(), directory_iter_next(), and +/// directory_iter_close(). +typedef struct directory_iter_s directory_iter; + + +/// @brief Creates a Directory Iterator +/// +/// This will create and initialize a directory_iter structure. 
+/// The pointer should not be freed and should instead be passed +/// to directory_iter_close() when it is no longer needed. 
+/// +/// @param path String path to a directory +/// +/// @return On success, a pointer to the directory iterator. +/// On error, NULL. +extern directory_iter * directory_iterator_init(const char* path); + + +/// @brief Iterate to the next directory entry +/// +/// @param iter Pointer to the iterator +/// @param entry Buffer to receive the next directory entry +/// @param entry_len Set this to the size of the entry buffer. On +/// success this is set to the string length of +/// the copied entry (without the terminator); it +/// is set to 0 if the name did not fit. +/// +/// @return Returns true if there may be more entries. +/// Returns false otherwise. +extern bool directory_iter_next(directory_iter *iter, char *entry, + size_t *entry_len); + +/// @brief Close the Directory Iterator +/// +/// This cleans up the iterator by closing files and freeing +/// all associated memory. +/// +/// @param iter Pointer to the iterator to close +extern void directory_iter_close(directory_iter *iter); diff --git a/src/xz/main.c b/src/xz/main.c index 05e9f5e3..a6e1c840 100644 --- a/src/xz/main.c +++ b/src/xz/main.c @@ -19,6 +19,24 @@ # include #endif +/// The directory_list type is used in recursive mode to keep track of all +/// the directories that need processing. It is used as a queue to process +/// directories in the order they are discovered. Files, on the other hand +/// are processed right away to reduce the size of the queue and hence the +/// amount of memory needed to be allocated at any one time. +typedef struct directory_list_s { + /// Path to the directory. This is used as a pointer since it is + /// likely that most directories do not need the full possible file + /// path length allowed by systems. This saves memory in cases where + /// many directories need to be on the queue at the same time. + char *dir_path; + + /// Pointer to the next directory in the queue. This is only a + /// singly linked list since we only ever need to process the queue + /// in one direction. 
+ struct directory_list_s *next; +} directory_list; + /// Exit status to use. This can be changed with set_exit_status(). static enum exit_status_type exit_status = E_SUCCESS; @@ -149,27 +167,184 @@ read_name(const args_info *args) static void process_entry(const char *path) { - // Set and possibly print the filename for the progress message. - message_filename(path); +#ifdef HAVE_DECODERS + if (opt_mode == MODE_LIST && path == stdin_filename) { + message_error(_("--list does not support reading from " + "standard input")); + return; + } +#endif // Open the entry file_pair *pair = io_open_src(path); if (pair == NULL) return; -#ifdef HAVE_DECODERS - if (opt_mode == MODE_LIST) { - if (path == stdin_filename) { - message_error(_("--list does not support reading from " - "standard input")); - return; +#if defined(_MSC_VER) || defined(HAVE_DIRENT_H) + // io_open_src() will return NULL if the path points to a directory + // and we aren't in recursive mode. So there is no need to check + // for recursive mode here. + if (pair->is_directory) { + // Create the queue of directories to process. The first + // item in the queue will be the base entry. The first item + // is dynamically allocated to simplify the memory freeing + // code later on. + directory_list *dir_list = xmalloc(sizeof(directory_list)); + + dir_list->dir_path = xstrdup(path); + + // Strip trailing path separators from the directory but keep + // a lone root separator. This makes the path compatible with + // Windows MSVC search functions and makes the output look nicer. + for (size_t len = strlen(path); len > 1 + && dir_list->dir_path[len - 1] == PATH_SEP; len--) { + dir_list->dir_path[len - 1] = '\0'; } - list_file(pair); + dir_list->next = NULL; + + // The current pointer represents the directory we are + // currently processing. To start, it is initialized as the + // base entry. 
+ directory_list *current = dir_list; + + // The pointer to the last item in the queue is used to + // append new directories. 
+ directory_list *last = dir_list; + do { + directory_list *next; + + // The iterator initialization will return NULL and + // print an error message if there is any kind of + // problem. In this case, we can simply continue on + // to the next directory to process. + directory_iter *iter = directory_iterator_init( + current->dir_path); + + // The error message is printed during + // directory_iterator_init(), so no need to print + // anything before proceeding to the next iteration. + if (iter == NULL) + goto next_iteration; + + const size_t dir_path_len = strlen(current->dir_path); + + // Set ENTRY_LEN_MAX depending on the system. On + // POSIX systems, NAME_MAX will be defined in + // <limits.h>. On Windows, the directory parsing + // functions have buffers of size MAX_PATH. +#ifdef TUKLIB_DOSLIKE +# define ENTRY_LEN_MAX MAX_PATH +#else +# define ENTRY_LEN_MAX NAME_MAX +#endif + char entry[ENTRY_LEN_MAX + 1]; + size_t entry_len; + + // The entry_len must be reset to the buffer size each + // iteration because directory_iter_next() will only + // write to the entry buffer if it can write the entire + // entry name. If the value is not reset each time, it + // will limit the next entry size to the last entry's size. + while ((entry_len = sizeof(entry)) + && directory_iter_next(iter, entry, + &entry_len)) { + // directory_iter_next() sets entry_len to 0 when + // it could not copy the name; skip that entry. + if (entry_len == 0) + continue; + + // Check for '.' and '..' since there is no + // point in processing them. + if (entry[0] == '.' && ((entry[1] == '.' + && entry[2] == '\0') + || entry[1] == '\0')) + continue; + + // The total entry size needs the "+2" to + // make room for the directory path separator + // and the NULL terminator. 
+ const size_t total_size = entry_len + dir_path_len + 2; + char *entry_path = xmalloc(total_size); + + memcpy(entry_path, current->dir_path, dir_path_len); + + char *entry_copy_start = entry_path + dir_path_len; + + entry_path[dir_path_len] = PATH_SEP; + entry_copy_start++; + + memcpy(entry_copy_start, entry, entry_len + 1); + + // Try to open the next entry. If it is a file + // it will be processed immediately. If it is a + // directory it will be added to the queue to + // be processed later. Processing files right + // away reduces the amount of memory needed + // for queue nodes and stored file paths. + // Exploring directories only increases the + // amount of memory needed so it's better to + // prioritize processing files early. + pair = io_open_src(entry_path); + + if (pair == NULL) { + free(entry_path); + continue; + } + + if (pair->is_directory) { + directory_list *next_dir = xmalloc( + sizeof(directory_list)); + next_dir->dir_path = entry_path; + next_dir->next = NULL; + last->next = next_dir; + last = next_dir; + } else if (entry[0] == '.' + && opt_mode == MODE_COMPRESS + && !opt_keep_original) { + message_warning(_("%s: Hidden file " + "skipped during recursive " + "compression mode. Use --keep " + "to process these files."), + entry_path); + // Release the pair opened above. + io_close(pair, false); + free(entry_path); + } else { + message_filename(entry_path); +#ifdef HAVE_DECODERS + if (opt_mode == MODE_LIST) + list_file(pair); + else +#endif + coder_run(pair); + free(entry_path); + } + } + + directory_iter_close(iter); +next_iteration: + next = current->next; + + free(current->dir_path); + free(current); + + current = next; + } while (current != NULL); + return; } -#endif +#endif // defined(_MSC_VER) || defined(HAVE_DIRENT_H) + + // Set and possibly print the filename for the progress message. + message_filename(path); + +#ifdef HAVE_DECODERS + if (opt_mode == MODE_LIST) + list_file(pair); + else +#endif coder_run(pair); }