fs: make dedupe work again after a really unfortunate build fix

In commit 14ce81c08 "fs: get rid of silly base class that causes build failures now" I neglected to set the dest_count field in the ioctl arg structure, so bees master hasn't been deduping anything for about three weeks. I'd put a THROW_CHECK in here to catch this kind of bug in the future, but it would be placed at exactly the point where this fix is. Fixes: 14ce81c08 Signed-off-by: Zygo Blaxell <bees@furryterror.org>
fs: update btrfs compatibility header: add csum types, BTRFS_FS_INFO_FLAG_GENERATION and _METADATA_UUID
2025-08-02 13:53:28 +02:00 · 2022-11-05 13:43:21 -04:00 · 2022-10-25 12:56:16 -04:00 · 2022-10-25 12:56:16 -04:00 · 2022-10-25 12:56:16 -04:00 · 2022-10-25 12:56:16 -04:00
38 changed files with 763 additions and 560 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,8 @@
 *.[ao]
 *.bak
-*.new
 *.dep
+*.new
+*.tmp
 *.so*
 Doxyfile
 README.html
--- a/Makefile
+++ b/Makefile
@@ -61,7 +61,7 @@ install_bees: src $(RUN_INSTALL_TESTS)
 install_scripts: ## Install scipts
 install_scripts: scripts
 	install -Dm755 scripts/beesd $(DESTDIR)$(PREFIX)/sbin/beesd
-	install -Dm644 scripts/beesd.conf.sample $(DESTDIR)/$(ETC_PREFIX)/bees/beesd.conf.sample
+	install -Dm644 scripts/beesd.conf.sample $(DESTDIR)$(ETC_PREFIX)/bees/beesd.conf.sample
 ifneq ($(SYSTEMD_SYSTEM_UNIT_DIR),)
 	install -Dm644 scripts/beesd@.service $(DESTDIR)$(SYSTEMD_SYSTEM_UNIT_DIR)/beesd@.service
 endif
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ Strengths
 * Space-efficient hash table and matching algorithms - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
 * Daemon incrementally dedupes new data using btrfs tree search
 * Works with btrfs compression - dedupe any combination of compressed and uncompressed files
- * **NEW** [Works around `btrfs send` problems with dedupe and incremental parent shapshots](docs/options.md)
+ * **NEW** [Works around `btrfs send` problems with dedupe and incremental parent snapshots](docs/options.md)
 * Works around btrfs filesystem structure to free more disk space
 * Persistent hash table for rapid restart after shutdown
 * Whole-filesystem dedupe - including snapshots
@@ -70,6 +70,6 @@ You can also use Github:
 Copyright & License
 -------------------

-Copyright 2015-2018 Zygo Blaxell <bees@furryterror.org>.
+Copyright 2015-2022 Zygo Blaxell <bees@furryterror.org>.

 GPL (version 3 or later).
--- a/docs/btrfs-kernel.md
+++ b/docs/btrfs-kernel.md
@@ -9,7 +9,7 @@ This issue is fixed in kernel 5.4.14 and later.

 **Recommended kernel versions for bees are 4.19, 5.4, 5.10, 5.11, or 5.12,
 with recent LTS and -stable updates.**  The latest released kernel as
-of this writing is 5.12.3.
+of this writing is 5.18.18.

 4.14, 4.9, and 4.4 LTS kernels with recent updates are OK with
 some issues.  Older kernels will be slower (a little slower or a lot
@@ -31,7 +31,7 @@ In some future bees release, this API version may become mandatory.
 Kernel Bug Tracking Table
 -------------------------

-These bugs are particularly popular among bees users:
+These bugs are particularly popular among bees users, though not all are specifically relevant to bees:

 | First bad kernel | Last bad kernel | Issue Description | Fixed Kernel Versions | Fix Commit
 | :---: | :---: | --- | :---: | ---
@@ -61,7 +61,11 @@ These bugs are particularly popular among bees users:
 | 5.4 | 5.11 | spurious tree checker failures on extent ref hash | 5.11.5, 5.12 and later | 1119a72e223f btrfs: tree-checker: do not error out if extent ref hash doesn't match
 | - | 5.11 | tree mod log issue #5 | 4.4.263, 4.9.263, 4.14.227, 4.19.183, 5.4.108, 5.10.26, 5.11.9, 5.12 and later | dbcc7d57bffc btrfs: fix race when cloning extent buffer during rewind of an old root
 | - | 5.12 | tree mod log issue #6 | 4.14.233, 4.19.191, 5.4.118, 5.10.36, 5.11.20, 5.12.3, 5.13 and later | f9690f426b21 btrfs: fix race when picking most recent mod log operation for an old root
-| 4.15 | - | spurious warnings from `fs/fs-writeback.c` when `flushoncommit` is enabled | - | workaround:  comment out the `WARN_ON`
+| 4.15 | 5.16 | spurious warnings from `fs/fs-writeback.c` when `flushoncommit` is enabled | 5.15.27, 5.16.13, 5.17 and later | a0f0cf8341e3 btrfs: get rid of warning on transaction commit when using flushoncommit
+| - | 5.17 | crash during device removal can make filesystem unmountable | 5.15.54, 5.16.20, 5.17.3, 5.18 and later | bbac58698a55 btrfs: remove device item and update super block in the same transaction
+| - | 5.18 | wrong superblock num_devices makes filesystem unmountable | 4.14.283, 4.19.247, 5.4.198, 5.10.121, 5.15.46, 5.17.14, 5.18.3, 5.19 and later | d201238ccd2f btrfs: repair super block num_devices automatically
+| 5.18 | 5.19 | parent transid verify failed during log tree replay after a crash during a rename operation | 5.18.18, 5.19.2, 6.0 and later | 723df2bcc9e1 btrfs: join running log transaction when logging new name
+| 5.4 | - | kernel hang when multiple threads are running `LOGICAL_INO` and dedupe ioctl | - | workaround: reduce bees thread count to 1 with `-c1`

 "Last bad kernel" refers to that version's last stable update from
 kernel.org.  Distro kernels may backport additional fixes.  Consult
@@ -77,7 +81,7 @@ A "-" for "first bad kernel" indicates the bug has been present since
 the relevant feature first appeared in btrfs.

 A "-" for "last bad kernel" indicates the bug has not yet been fixed as
-of 5.8.14.
+of 5.18.18.

 In cases where issues are fixed by commits spread out over multiple
 kernel versions, "fixed kernel version" refers to the version that
@@ -87,6 +91,11 @@ contains all components of the fix.
 Workarounds for known kernel bugs
 ---------------------------------

+* **Hangs with high worker thread counts**:  On kernels 5.4 and
+  later, multiple threads running `LOGICAL_INO` and dedupe ioctls
+  at the same time can lead to a kernel hang.  The workaround is
+  to reduce the thread count to 1 with `-c1`.
+
 * **Tree mod log issues**:  bees will detect that a btrfs balance is
  running, and pause bees activity until the balance is done.  This avoids
  running both the `LOGICAL_INO` ioctl and btrfs balance at the same time,
@@ -128,7 +137,7 @@ Workarounds for known kernel bugs
 Unfixed kernel bugs
 -------------------

-As of 5.12.3:
+As of 5.18.18:

 * **The kernel does not permit `btrfs send` and dedupe to run at the
  same time**.  Recent kernels no longer crash, but now refuse one
@@ -151,22 +160,3 @@ As of 5.12.3:
  still saves some IO.

  `btrfs receive` is not affected by this issue.
-
-* **Spurious warnings in `fs/fs-writeback.c`** on kernel 4.15 and later
-  when filesystem is mounted with `flushoncommit`.  These
-  seem to be harmless (there are other locks which prevent
-  concurrent umount of the filesystem), but the underlying
-  problems that trigger the `WARN_ON` are [not trivial to
-  fix](https://www.spinics.net/lists/linux-btrfs/msg87752.html).
-
-  The warnings can be especially voluminous when bees is running.
-
-  Workarounds:
-
-  1. mount with `-o noflushoncommit`
-  2. patch kernel to remove warning in `fs/fs-writeback.c`.
-
-  Note that using kernels 4.14 and earlier is *not* a viable workaround
-  for this issue, because kernels 4.14 and earlier will eventually
-  deadlock when a filesystem is mounted with `-o flushoncommit` (a single
-  commit fixes one bug and introduces the other).
--- a/docs/event-counters.md
+++ b/docs/event-counters.md
@@ -67,11 +67,12 @@ The `adjust` event group consists of operations related to translating stored vi
 * `adjust_exact`: A block address from the hash table corresponding to an uncompressed data block was processed to find its `(root, inode, offset)` references.
 * `adjust_exact_correct`: A block address corresponding to an uncompressed block was retrieved from the hash table and resolved to a physical block containing data that matches another block bees has already read.
 * `adjust_exact_wrong`: A block address corresponding to an uncompressed block was retrieved from the hash table and resolved to a physical block containing data that matches the hash but not the data from another block bees has already read (i.e. there was a hash collision).
- * `adjust_hit`: A block address was retrieved from the hash table and resolved to a physical block containing data that matches the data from another block bees has already read (i.e. a duplicate match was found).
+ * `adjust_hit`: A block address was retrieved from the hash table and resolved to a physical block in an uncompressed extent containing data that matches the data from another block bees has already read (i.e. a duplicate match was found).
 * `adjust_miss`: A block address was retrieved from the hash table and resolved to a physical block containing a hash that does not match the hash from another block bees has already read (i.e. the hash table contained a stale entry and the data it referred to has since been overwritten in the filesystem).
 * `adjust_needle_too_long`: A block address was retrieved from the hash table, but when the corresponding extent item was retrieved, its offset or length were out of range to be a match (i.e. the hash table contained a stale entry and the data it referred to has since been overwritten in the filesystem).
 * `adjust_no_match`: A hash collision occurred (i.e. a block on disk was located with the same hash as the hash table entry but different data) .  Effectively an alias for `hash_collision` as it is not possible to have one event without the other.
 * `adjust_offset_high`: The `LOGICAL_INO` ioctl gave an extent item that does not overlap with the desired block because the extent item ends before the desired block in the extent data.
+ * `adjust_offset_hit`: A block address was retrieved from the hash table and resolved to a physical block in a compressed extent containing data that matches the data from another block bees has already read (i.e. a duplicate match was found).
 * `adjust_offset_low`: The `LOGICAL_INO` ioctl gave an extent item that does not overlap with the desired block because the extent item begins after the desired block in the extent data.
 * `adjust_try`: A block address and extent item candidate were passed to `BeesResolver::adjust_offset` for processing.

@@ -299,6 +300,7 @@ The `resolve` event group consists of operations related to translating a btrfs
 * `resolve_large`: The `LOGICAL_INO` ioctl returned more than 2730 results (the limit of the v1 ioctl).
 * `resolve_ms`: Total time spent in the `LOGICAL_INO` ioctl (i.e. wallclock time, not kernel CPU time).
 * `resolve_ok`: The `LOGICAL_INO` ioctl returned success.
+ * `resolve_overflow`: The `LOGICAL_INO` ioctl returned more than 655050 extents (the limit of the v2 ioctl).
 * `resolve_toxic`: The `LOGICAL_INO` ioctl took more than 0.1 seconds of kernel CPU time.

 root
@@ -333,6 +335,7 @@ The `scan` event group consists of operations related to scanning incoming data.
 * `scan_eof`: Scan past EOF was attempted.
 * `scan_erase_redundant`: Blocks in the hash table were removed because they were removed from the filesystem by dedupe.
 * `scan_extent`: An extent was scanned (`scan_one_extent`).
+ * `scan_extent_tiny`: An extent below 128K that was not the beginning or end of a file was scanned.  No action is currently taken for these--they are merely counted.
 * `scan_forward`: A logical byte range was scanned (`scan_forward`).
 * `scan_found`: An entry was found in the hash table matching a scanned block from the filesystem.
 * `scan_hash_hit`: A block was found on the filesystem corresponding to a block found in the hash table.
--- a/docs/gotchas.md
+++ b/docs/gotchas.md
@@ -45,7 +45,7 @@ bees will loop billions of times considering all possibilities.  This is
 a waste of time, so an exception is currently used to break out of such
 loops early.  The exception text in this case is:

-	`FIXME: bailing out here, need to fix this further up the call stack`
+	`FIXME: too many duplicate candidates, bailing out here`


 Terminating bees with SIGTERM
--- a/docs/index.md
+++ b/docs/index.md
@@ -17,7 +17,7 @@ Strengths
 * Space-efficient hash table and matching algorithms - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
 * Daemon incrementally dedupes new data using btrfs tree search
 * Works with btrfs compression - dedupe any combination of compressed and uncompressed files
- * **NEW** [Works around `btrfs send` problems with dedupe and incremental parent shapshots](options.md)
+ * **NEW** [Works around `btrfs send` problems with dedupe and incremental parent snapshots](options.md)
 * Works around btrfs filesystem structure to free more disk space
 * Persistent hash table for rapid restart after shutdown
 * Whole-filesystem dedupe - including snapshots
@@ -70,6 +70,6 @@ You can also use Github:
 Copyright & License
 -------------------

-Copyright 2015-2018 Zygo Blaxell <bees@furryterror.org>.
+Copyright 2015-2022 Zygo Blaxell <bees@furryterror.org>.

 GPL (version 3 or later).
--- a/docs/wrong.md
+++ b/docs/wrong.md
@@ -134,7 +134,7 @@ ulimit -c 0

 # If there were core files, generate reports for them
 for x in core*; do
-	if [ -e "$x" ]; then 
+	if [ -e "$x" ]; then
 		gdb --core="$x" \
 		--eval-command='set pagination off' \
 		--eval-command='info shared' \
--- a/include/crucible/btrfs.h
+++ b/include/crucible/btrfs.h
@@ -216,7 +216,28 @@ enum btrfs_compression_type {
 	#define BTRFS_FS_INFO_FLAG_CSUM_INFO                    (1 << 0)
 #endif

-struct btrfs_ioctl_fs_info_args_v2 {
+#ifndef BTRFS_FS_INFO_FLAG_GENERATION
+/* Request information about filesystem generation */
+#define BTRFS_FS_INFO_FLAG_GENERATION                   (1 << 1)
+#endif
+
+#ifndef BTRFS_FS_INFO_FLAG_METADATA_UUID
+/* Request information about filesystem metadata UUID */
+#define BTRFS_FS_INFO_FLAG_METADATA_UUID                (1 << 2)
+#endif
+
+// BTRFS_CSUM_TYPE_CRC32 was a #define from 2008 to 2019.
+// After that, it's an enum with the other 3 types.
+// So if we do _not_ have CRC32 defined, it means we have the other 3;
+// if we _do_ have CRC32 defined, it means we need the other 3.
+// This seems likely to break some day.
+#ifdef BTRFS_CSUM_TYPE_CRC32
+	#define BTRFS_CSUM_TYPE_XXHASH 1
+	#define BTRFS_CSUM_TYPE_SHA256 2
+	#define BTRFS_CSUM_TYPE_BLAKE2 3
+#endif
+
+struct btrfs_ioctl_fs_info_args_v3 {
 	__u64 max_id;                           /* out */
 	__u64 num_devices;                      /* out */
 	__u8 fsid[BTRFS_FSID_SIZE];             /* out */
@@ -227,7 +248,9 @@ struct btrfs_ioctl_fs_info_args_v2 {
 	__u16 csum_type;                        /* out */
 	__u16 csum_size;                        /* out */
 	__u64 flags;                            /* in/out */
-	__u8 reserved[968];                     /* pad to 1k */
+	__u64 generation;                       /* out */
+	__u8 metadata_uuid[BTRFS_FSID_SIZE];    /* out */
+	__u8 reserved[944];                     /* pad to 1k */
 };

 #endif // CRUCIBLE_BTRFS_H
--- a/include/crucible/bytevector.h
+++ b/include/crucible/bytevector.h
@@ -0,0 +1,84 @@
+#ifndef CRUCIBLE_BYTEVECTOR_H
+#define CRUCIBLE_BYTEVECTOR_H
+
+#include <algorithm>
+#include <memory>
+#include <new>
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+namespace crucible {
+	using namespace std;
+	// new[] is a little slower than malloc
+	// shared_ptr is about 2x slower than unique_ptr
+	// vector<uint8_t> is ~160x slower
+	// so we won't bother with unique_ptr because we can't do shared copies with it
+
+	// Byte buffer whose storage is held through a shared_ptr<uint8_t>.
+	class ByteVector {
+	public:
+		using Pointer = shared_ptr<uint8_t>;
+		using value_type = Pointer::element_type;
+		using iterator = value_type*;
+
+		ByteVector() = default;
+		ByteVector(size_t size);
+		ByteVector(const ByteVector &that, size_t start, size_t length);
+		ByteVector(iterator begin, iterator end, size_t min_size = 0);
+
+		ByteVector at(size_t start, size_t length) const;
+
+		value_type& at(size_t) const;
+		iterator begin() const;
+		void clear();
+		value_type* data() const;
+		bool empty() const;
+		iterator end() const;
+		value_type& operator[](size_t) const;
+		size_t size() const;
+		bool operator==(const ByteVector &that) const;
+
+		// this version of erase only works at the beginning or end of the buffer, else throws exception
+		void erase(iterator first);
+		void erase(iterator first, iterator last);
+
+		// An important use case is ioctls that have a fixed-size header struct
+		// followed by a buffer for further arguments.  These templates avoid
+		// doing reinterpret_casts every time.
+		template <class T> ByteVector(const T& object, size_t min_size);
+		template <class T> T* get() const;
+	private:
+		Pointer m_ptr;
+		size_t m_size = 0;
+	};
+
+	// Allocate max(min_size, sizeof(T)) bytes and copy object into the
+	// first sizeof(T) bytes.  Bytes past sizeof(T) are left uninitialized.
+	// Throws bad_alloc if the allocation fails.
+	template <class T>
+	ByteVector::ByteVector(const T& object, size_t min_size)
+	{
+		const auto size = max(min_size, sizeof(T));
+		m_ptr = Pointer(static_cast<value_type*>(malloc(size)), free);
+		// malloc returns null on failure; don't memcpy into a null buffer
+		if (!m_ptr) {
+			throw bad_alloc();
+		}
+		memcpy(m_ptr.get(), &object, sizeof(T));
+		m_size = size;
+	}
+
+	// Return data() reinterpreted as a pointer to T.  No size or alignment
+	// check is made here.
+	template <class T>
+	T*
+	ByteVector::get() const
+	{
+		return reinterpret_cast<T*>(data());
+	}
+
+}
+
+#endif // CRUCIBLE_BYTEVECTOR_H
--- a/include/crucible/endian.h
+++ b/include/crucible/endian.h
@@ -28,7 +28,7 @@ namespace crucible {
 	};

 	template<> struct le_to_cpu_helper<uint16_t> {
-		uint16_t operator()(const uint16_t v) { return le64toh(v); }
+		uint16_t operator()(const uint16_t v) { return le16toh(v); }
 	};

 	template<> struct le_to_cpu_helper<uint8_t> {
--- a/include/crucible/error.h
+++ b/include/crucible/error.h
@@ -126,6 +126,13 @@ namespace crucible {
 	} \
 } while(0)

+#define THROW_CHECK4(type, value1, value2, value3, value4, expr) do { \
+	if (!(expr)) { \
+		THROW_ERROR(type, #value1 << " = " << (value1) << ", " #value2 << " = " << (value2) << ", " #value3 << " = " << (value3) << ", " #value4 << " = " << (value4) \
+			<< " failed constraint check (" << #expr << ")"); \
+	} \
+} while(0)
+
 #define THROW_CHECK_BIN_OP(type, value1, op, value2) do { \
 	if (!((value1) op (value2))) { \
 		THROW_ERROR(type, "failed constraint check " << #value1 << " (" << (value1) << ") " << #op << " " << #value2 << " (" << (value2) << ")"); \
--- a/include/crucible/extentwalker.h
+++ b/include/crucible/extentwalker.h
@@ -42,9 +42,6 @@ namespace crucible {
 		uint64_t bytenr() const;
 		bool operator==(const Extent &that) const;
 		bool operator!=(const Extent &that) const { return !(*this == that); }
-
-		Extent() = default;
-		Extent(const Extent &e) = default;
 	};

 	class ExtentWalker {
--- a/include/crucible/fd.h
+++ b/include/crucible/fd.h
@@ -1,6 +1,7 @@
 #ifndef CRUCIBLE_FD_H
 #define CRUCIBLE_FD_H

+#include "crucible/bytevector.h"
 #include "crucible/namedptr.h"

 #include <cstring>
@@ -125,11 +126,14 @@ namespace crucible {
 	// Specialization for strings which reads/writes the string content, not the struct string
 	template<> void write_or_die<string>(int fd, const string& str);
 	template<> void pread_or_die<string>(int fd, string& str, off_t offset);
-	template<> void pread_or_die<vector<char>>(int fd, vector<char>& str, off_t offset);
-	template<> void pread_or_die<vector<uint8_t>>(int fd, vector<uint8_t>& str, off_t offset);
 	template<> void pwrite_or_die<string>(int fd, const string& str, off_t offset);
-	template<> void pwrite_or_die<vector<char>>(int fd, const vector<char>& str, off_t offset);
-	template<> void pwrite_or_die<vector<uint8_t>>(int fd, const vector<uint8_t>& str, off_t offset);
+	template<> void pread_or_die<ByteVector>(int fd, ByteVector& str, off_t offset);
+	template<> void pwrite_or_die<ByteVector>(int fd, const ByteVector& str, off_t offset);
+	// Removed: the vector specializations are deleted; use the ByteVector ones above
+	template<> void pread_or_die<vector<uint8_t>>(int fd, vector<uint8_t>& str, off_t offset) = delete;
+	template<> void pwrite_or_die<vector<uint8_t>>(int fd, const vector<uint8_t>& str, off_t offset) = delete;
+	template<> void pread_or_die<vector<char>>(int fd, vector<char>& str, off_t offset) = delete;
+	template<> void pwrite_or_die<vector<char>>(int fd, const vector<char>& str, off_t offset) = delete;

 	// A different approach to reading a simple string
 	string read_string(int fd, size_t size);
--- a/include/crucible/fs.h
+++ b/include/crucible/fs.h
@@ -1,9 +1,9 @@
 #ifndef CRUCIBLE_FS_H
 #define CRUCIBLE_FS_H

+#include "crucible/bytevector.h"
 #include "crucible/endian.h"
 #include "crucible/error.h"
-#include "crucible/spanner.h"

 // Terribly Linux-specific FS-wrangling functions

@@ -31,12 +31,14 @@ namespace crucible {
 		BtrfsExtentInfo(int dst_fd, off_t dst_offset);
 	};

-	struct BtrfsExtentSame : public btrfs_ioctl_same_args {
+	struct BtrfsExtentSame {
 		virtual ~BtrfsExtentSame();
 		BtrfsExtentSame(int src_fd, off_t src_offset, off_t src_length);
 		void add(int fd, off_t offset);
 		virtual void do_ioctl();

+		uint64_t m_logical_offset = 0;
+		uint64_t m_length = 0;
 		int m_fd;
 		vector<BtrfsExtentInfo> m_info;
 	};
@@ -53,17 +55,17 @@ namespace crucible {

 	ostream & operator<<(ostream &os, const BtrfsInodeOffsetRoot &p);

-	struct BtrfsDataContainer : public btrfs_data_container {
+	struct BtrfsDataContainer {
 		BtrfsDataContainer(size_t size = 64 * 1024);
 		void *prepare(size_t size);

 		size_t get_size() const;
-		decltype(bytes_left) get_bytes_left() const;
-		decltype(bytes_missing) get_bytes_missing() const;
-		decltype(elem_cnt) get_elem_cnt() const;
-		decltype(elem_missed) get_elem_missed() const;
+		decltype(btrfs_data_container::bytes_left) get_bytes_left() const;
+		decltype(btrfs_data_container::bytes_missing) get_bytes_missing() const;
+		decltype(btrfs_data_container::elem_cnt) get_elem_cnt() const;
+		decltype(btrfs_data_container::elem_missed) get_elem_missed() const;

-		vector<uint8_t> m_data;
+		ByteVector m_data;
 	};

 	struct BtrfsIoctlLogicalInoArgs : public btrfs_ioctl_logical_ino_args {
@@ -141,16 +143,26 @@ namespace crucible {
 		off_t end() const;
 	};

-	struct Fiemap : public fiemap {
+	struct Fiemap {
+
+		// because fiemap.h insists on giving FIEMAP_MAX_OFFSET
+		// a different type from the struct fiemap members
+		static const uint64_t s_fiemap_max_offset = FIEMAP_MAX_OFFSET;

 		// Get entire file
-		Fiemap(uint64_t start = 0, uint64_t length = FIEMAP_MAX_OFFSET);
+		Fiemap(uint64_t start = 0, uint64_t length = s_fiemap_max_offset);

 		void do_ioctl(int fd);

 		vector<FiemapExtent> m_extents;
-		uint64_t m_min_count = (4096 - sizeof(fiemap)) / sizeof(fiemap_extent);
-		uint64_t m_max_count = 16 * 1024 * 1024 / sizeof(fiemap_extent);
+		decltype(fiemap::fm_extent_count) m_min_count = (4096 - sizeof(fiemap)) / sizeof(fiemap_extent);
+		decltype(fiemap::fm_extent_count) m_max_count = 16 * 1024 * 1024 / sizeof(fiemap_extent);
+		uint64_t m_start;
+		uint64_t m_length;
+		// FIEMAP is slow and full of lies.
+		// This makes FIEMAP even slower, but reduces the lies a little.
+		decltype(fiemap::fm_flags) m_flags = FIEMAP_FLAG_SYNC;
+	friend ostream &operator<<(ostream &, const Fiemap &);
 	};

 	ostream & operator<<(ostream &os, const fiemap_extent *info);
@@ -166,8 +178,8 @@ namespace crucible {

 	struct BtrfsIoctlSearchHeader : public btrfs_ioctl_search_header {
 		BtrfsIoctlSearchHeader();
-		Spanner<const uint8_t> m_data;
-		size_t set_data(const vector<uint8_t> &v, size_t offset);
+		ByteVector m_data;
+		size_t set_data(const ByteVector &v, size_t offset);
 		bool operator<(const BtrfsIoctlSearchHeader &that) const;
 	};

@@ -181,17 +193,18 @@ namespace crucible {
 	ostream & operator<<(ostream &os, const BtrfsIoctlSearchHeader &hdr);

 	struct BtrfsIoctlSearchKey : public btrfs_ioctl_search_key {
-		BtrfsIoctlSearchKey(size_t buf_size = 4096);
-		virtual bool do_ioctl_nothrow(int fd);
-		virtual void do_ioctl(int fd);
+		BtrfsIoctlSearchKey(size_t buf_size = 1024);
+		bool do_ioctl_nothrow(int fd);
+		void do_ioctl(int fd);

 		// Copy objectid/type/offset so we move forward
 		void next_min(const BtrfsIoctlSearchHeader& ref);

-		size_t m_buf_size;
-		vector<uint8_t> m_ioctl_arg;
-		set<BtrfsIoctlSearchHeader> m_result;
+		// move forward to next object of a single type
+		void next_min(const BtrfsIoctlSearchHeader& ref, const uint8_t type);

+		size_t m_buf_size;
+		set<BtrfsIoctlSearchHeader> m_result;
 	};

 	ostream & operator<<(ostream &os, const btrfs_ioctl_search_key &key);
@@ -235,11 +248,12 @@ namespace crucible {

 	template<class V> ostream &hexdump(ostream &os, const V &v);

-	struct BtrfsIoctlFsInfoArgs : public btrfs_ioctl_fs_info_args_v2 {
+	struct BtrfsIoctlFsInfoArgs : public btrfs_ioctl_fs_info_args_v3 {
 		BtrfsIoctlFsInfoArgs();
 		void do_ioctl(int fd);
 		uint16_t csum_type() const;
 		uint16_t csum_size() const;
+		uint64_t generation() const;
 	};

 	ostream & operator<<(ostream &os, const BtrfsIoctlFsInfoArgs &a);
--- a/include/crucible/namedptr.h
+++ b/include/crucible/namedptr.h
@@ -82,7 +82,7 @@ namespace crucible {
 		// "our" map entry if it exists and is expired.  The other
 		// thread would have done the same for us if the race had
 		// a different winner.
-		auto found = m_map_rep->m_map.find(m_ret_key);
+		const auto found = m_map_rep->m_map.find(m_ret_key);
 		if (found != m_map_rep->m_map.end() && found->second.expired()) {
 			m_map_rep->m_map.erase(found);
 		}
@@ -93,10 +93,10 @@ namespace crucible {
 	NamedPtr<Return, Arguments...>::lookup_item(const Key &k)
 	{
 		// Must be called with lock held
-		auto found = m_map_rep->m_map.find(k);
+		const auto found = m_map_rep->m_map.find(k);
 		if (found != m_map_rep->m_map.end()) {
 			// Get the strong pointer back
-			auto rv = found->second.lock();
+			const auto rv = found->second.lock();
 			if (rv) {
 				// Have strong pointer.  Return value that shares map entry.
 				return shared_ptr<Return>(rv, rv->m_ret_ptr.get());
@@ -116,34 +116,36 @@ namespace crucible {
 		Key k(args...);

 		// Is it already in the map?
-		unique_lock<mutex> lock(m_map_rep->m_mutex);
+		unique_lock<mutex> lock_lookup(m_map_rep->m_mutex);
 		auto rv = lookup_item(k);
 		if (rv) {
 			return rv;
 		}

 		// Release map lock and acquire key lock
-		lock.unlock();
-		auto key_lock = m_lockset.make_lock(k);
+		lock_lookup.unlock();
+		const auto key_lock = m_lockset.make_lock(k);

 		// Did item appear in map while we were waiting for key?
-		lock.lock();
+		lock_lookup.lock();
 		rv = lookup_item(k);
 		if (rv) {
 			return rv;
 		}

 		// We now hold key and index locks, but item not in map (or expired).
-		// Release map lock
-		lock.unlock();
+		// Release map lock so other threads can use the map
+		lock_lookup.unlock();
+
+		// Call the function and create a new Value outside of the map
+		const auto new_value_ptr = make_shared<Value>(fn(args...), k, m_map_rep);

-		// Call the function and create a new Value
-		auto new_value_ptr = make_shared<Value>(fn(args...), k, m_map_rep);
 		// Function must return a non-null pointer
 		THROW_CHECK0(runtime_error, new_value_ptr->m_ret_ptr);

-		// Reacquire index lock for map insertion
-		lock.lock();
+		// Reacquire index lock for map insertion.  We still hold the key lock.
+		// Use a different lock object to make exceptions unlock in the right order
+		unique_lock<mutex> lock_insert(m_map_rep->m_mutex);

 		// Insert return value in map or overwrite existing
 		// empty or expired weak_ptr value.
@@ -158,14 +160,13 @@ namespace crucible {
 		// to find and fix.
 		assert(new_item_ref.expired());

-		// Update the empty map slot
+		// Update the map slot we are sure is empty
 		new_item_ref = new_value_ptr;

-		// Drop lock so we don't deadlock in constructor exceptions
-		lock.unlock();
-
 		// Return shared_ptr to Return using strong pointer's reference counter
 		return shared_ptr<Return>(new_value_ptr, new_value_ptr->m_ret_ptr.get());
+
+		// On return, destructors release the map lock (lock_insert) first, then the key lock
 	}

 	template <class Return, class... Arguments>
@@ -188,7 +189,7 @@ namespace crucible {
 	NamedPtr<Return, Arguments...>::insert(const Ptr &r, Arguments... args)
 	{
 		THROW_CHECK0(invalid_argument, r);
-		return insert_item([&](Arguments...) -> Ptr { return r; }, args...);
+		return insert_item([&](Arguments...) { return r; }, args...);
 	}

 }
--- a/include/crucible/progress.h
+++ b/include/crucible/progress.h
@@ -20,8 +20,8 @@ namespace crucible {
 		using ProgressHolder = shared_ptr<ProgressHolderState>;

 		ProgressTracker(const value_type &v);
-		value_type begin();
-		value_type end();
+		value_type begin() const;
+		value_type end() const;

 		ProgressHolder hold(const value_type &v);

@@ -51,7 +51,7 @@ namespace crucible {

 	template <class T>
 	typename ProgressTracker<T>::value_type
-	ProgressTracker<T>::begin()
+	ProgressTracker<T>::begin() const
 	{
 		unique_lock<mutex> lock(m_state->m_mutex);
 		return m_state->m_begin;
@@ -59,7 +59,7 @@ namespace crucible {

 	template <class T>
 	typename ProgressTracker<T>::value_type
-	ProgressTracker<T>::end()
+	ProgressTracker<T>::end() const
 	{
 		unique_lock<mutex> lock(m_state->m_mutex);
 		return m_state->m_end;
--- a/include/crucible/spanner.h
+++ b/include/crucible/spanner.h
@@ -1,167 +0,0 @@
-#ifndef CRUCIBLE_SPANNER_H
-#define CRUCIBLE_SPANNER_H
-
-#include "crucible/error.h"
-
-#include <memory>
-
-namespace crucible {
-
-	using namespace std;
-
-	// C++20 is already using the name "span" for something similar.
-	template <class T, class Head = T*, class Iter = Head>
-	class Spanner {
-	public:
-		using iterator = Iter;
-		using head_pointer = Head;
-		using value_type = T;
-
-		template <class Container>
-		Spanner(Container& container);
-
-		Spanner(head_pointer begin, iterator end);
-		Spanner(size_t size, head_pointer begin);
-		Spanner() = default;
-		Spanner &operator=(const Spanner &that) = default;
-		iterator begin() const;
-		iterator end() const;
-		value_type *data() const;
-		value_type &at(size_t n) const;
-		size_t size() const;
-		bool empty() const;
-		void clear();
-		value_type &operator[](size_t n) const;
-		iterator erase(iterator first, iterator last);
-		iterator erase(iterator first);
-	private:
-		head_pointer	m_begin;
-		size_t		m_size;
-	};
-
-	template <class Container, class Head = typename Container::value_type *, class Iter = Head>
-	Spanner<typename Container::value_type, Head, Iter> make_spanner(Container &container)
-	{
-		return Spanner<typename Container::value_type, Head, Iter>(container);
-	}
-
-	// This template is an attempt to turn a shared_ptr to a container
-	// into a range view that can be cheaply passed around.
-	// It probably doesn't quite work in the general case.
-	template <class Container, class Head = shared_ptr<typename Container::value_type>, class Iter = typename Container::value_type *>
-	Spanner<typename Container::value_type, Head, Iter> make_spanner(shared_ptr<Container> &cont_ptr)
-	{
-		shared_ptr<typename Container::value_type> head(cont_ptr, cont_ptr->data());
-		size_t const size = cont_ptr->size();
-		return Spanner<typename Container::value_type, Head, Iter>(size, head);
-	}
-
-	template <class T, class Head, class Iter>
-	template <class Container>
-	Spanner<T, Head, Iter>::Spanner(Container &container) :
-		m_begin(container.data()),
-		m_size(container.size())
-	{
-	}
-
-	template <class T, class Head, class Iter>
-	Spanner<T, Head, Iter>::Spanner(head_pointer begin, iterator end) :
-		m_begin(begin),
-		m_size(end - begin)
-	{
-	}
-
-	template <class T, class Head, class Iter>
-	Spanner<T, Head, Iter>::Spanner(size_t size, head_pointer begin) :
-		m_begin(begin),
-		m_size(size)
-	{
-	}
-
-	template <class T, class Head, class Iter>
-	typename Spanner<T, Head, Iter>::iterator
-	Spanner<T, Head, Iter>::erase(iterator first, iterator last)
-	{
-		auto end = m_begin + m_size;
-		if (first == m_begin) {
-			THROW_CHECK0(invalid_argument, last <= end);
-			m_begin = last;
-			return last;
-		}
-		if (last == end) {
-			THROW_CHECK0(invalid_argument, m_begin <= first);
-			m_size = first - m_begin;
-			return first;
-		}
-		THROW_ERROR(invalid_argument, "first != begin() and last != end()");
-	}
-
-	template <class T, class Head, class Iter>
-	typename Spanner<T, Head, Iter>::iterator
-	Spanner<T, Head, Iter>::erase(iterator first)
-	{
-		return erase(first, first + 1);
-	}
-
-	template <class T, class Head, class Iter>
-	typename Spanner<T, Head, Iter>::value_type &
-	Spanner<T, Head, Iter>::operator[](size_t n) const
-	{
-		return at(n);
-	}
-
-	template <class T, class Head, class Iter>
-	void
-	Spanner<T, Head, Iter>::clear()
-	{
-		m_begin = head_pointer();
-		m_size = 0;
-	}
-
-	template <class T, class Head, class Iter>
-	bool
-	Spanner<T, Head, Iter>::empty() const
-	{
-		return m_size == 0;
-	}
-
-	template <class T, class Head, class Iter>
-	size_t
-	Spanner<T, Head, Iter>::size() const
-	{
-		return m_size;
-	}
-
-	template <class T, class Head, class Iter>
-	typename Spanner<T, Head, Iter>::value_type *
-	Spanner<T, Head, Iter>::data() const
-	{
-		return &(*m_begin);
-	}
-
-	template <class T, class Head, class Iter>
-	typename Spanner<T, Head, Iter>::iterator
-	Spanner<T, Head, Iter>::begin() const
-	{
-		return data();
-	}
-
-	template <class T, class Head, class Iter>
-	typename Spanner<T, Head, Iter>::iterator
-	Spanner<T, Head, Iter>::end() const
-	{
-		return data() + m_size;
-	}
-
-	template <class T, class Head, class Iter>
-	typename Spanner<T, Head, Iter>::value_type &
-	Spanner<T, Head, Iter>::at(size_t n) const
-	{
-		THROW_CHECK2(out_of_range, n, size(), n < size());
-		return *(data() + n);
-	}
-
-}
-
-
-#endif // CRUCIBLE_SPANNER_H
--- a/include/crucible/string.h
+++ b/include/crucible/string.h
@@ -11,23 +11,6 @@
 namespace crucible {
 	using namespace std;

-	// Zero-initialize a base class object (usually a C struct)
-	template <class Base>
-	void
-	memset_zero(Base *that)
-	{
-		memset(that, 0, sizeof(Base));
-	}
-
-	// Copy a base class object (usually a C struct) into a vector<uint8_t>
-	template <class Base>
-	vector<uint8_t>
-	vector_copy_struct(Base *that)
-	{
-		const uint8_t *begin_that = reinterpret_cast<const uint8_t *>(static_cast<const Base *>(that));
-		return vector<uint8_t>(begin_that, begin_that + sizeof(Base));
-	}
-
 	// int->hex conversion with sprintf
 	string to_hex(uint64_t i);

--- a/include/crucible/uname.h
+++ b/include/crucible/uname.h
@@ -0,0 +1,14 @@
+#ifndef CRUCIBLE_UNAME_H
+#define CRUCIBLE_UNAME_H
+
+#include <sys/utsname.h>
+
+namespace crucible {
+	using namespace std;
+
+	struct Uname : public utsname {
+		Uname();
+	};
+}
+
+#endif
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -4,6 +4,7 @@ default: libcrucible.a
 %.a: Makefile

 CRUCIBLE_OBJS = \
+	bytevector.o \
 	chatter.o \
 	city.o \
 	cleanup.o \
@@ -18,6 +19,7 @@ CRUCIBLE_OBJS = \
 	string.o \
 	task.o \
 	time.o \
+	uname.o \

 include ../makeflags
 -include ../localconf
--- a/lib/bytevector.cc
+++ b/lib/bytevector.cc
@@ -0,0 +1,147 @@
+#include "crucible/bytevector.h"
+
+#include "crucible/error.h"
+
+namespace crucible {
+	using namespace std;
+
+	ByteVector::iterator
+	ByteVector::begin() const
+	{
+		return m_ptr.get();
+	}
+
+	ByteVector::iterator
+	ByteVector::end() const
+	{
+		return m_ptr.get() + m_size;
+	}
+
+	size_t
+	ByteVector::size() const
+	{
+		return m_size;
+	}
+
+	bool
+	ByteVector::empty() const
+	{
+		return !m_ptr || !m_size;
+	}
+
+	void
+	ByteVector::clear()
+	{
+		m_ptr.reset();
+		m_size = 0;
+	}
+
+	ByteVector::value_type&
+	ByteVector::operator[](size_t size) const
+	{
+		return m_ptr.get()[size];
+	}
+
+	// Construct a view of `length` bytes beginning `start` bytes into `that`.
+	// No bytes are copied: m_ptr is built from that.m_ptr plus an offset
+	// instead of from a new allocation.
+	// Throws out_of_range if `that` has no buffer, or if the requested
+	// range [start, start + length) does not fit inside `that`.
+	ByteVector::ByteVector(const ByteVector &that, size_t start, size_t length)
+	{
+		THROW_CHECK0(out_of_range, that.m_ptr);
+		THROW_CHECK2(out_of_range, start, that.m_size, start <= that.m_size);
+		// Compare length against the space remaining after start.
+		// The subtraction cannot wrap because start <= that.m_size was
+		// verified above, and no sum is formed that could overflow size_t.
+		THROW_CHECK2(out_of_range, length, that.m_size - start, length <= that.m_size - start);
+		m_ptr = Pointer(that.m_ptr, that.m_ptr.get() + start);
+		m_size = length;
+	}
+
+	ByteVector
+	ByteVector::at(size_t start, size_t length) const
+	{
+		return ByteVector(*this, start, length);
+	}
+
+	ByteVector::value_type&
+	ByteVector::at(size_t size) const
+	{
+		THROW_CHECK0(out_of_range, m_ptr);
+		THROW_CHECK2(out_of_range, size, m_size, size < m_size);
+		return m_ptr.get()[size];
+	}
+
+	static
+	void *
+	bv_allocate(size_t size)
+	{
+#ifdef BEES_VALGRIND
+		// XXX: only do this to shut up valgrind
+		return calloc(1, size);
+#else
+		return malloc(size);
+#endif
+	}
+
+	ByteVector::ByteVector(size_t size)
+	{
+		m_ptr = Pointer(static_cast<value_type*>(bv_allocate(size)), free);
+		// bad_alloc doesn't fit THROW_CHECK's template
+		THROW_CHECK0(runtime_error, m_ptr);
+		m_size = size;
+	}
+
+	ByteVector::ByteVector(iterator begin, iterator end, size_t min_size)
+	{
+		const size_t size = end - begin;
+		const size_t alloc_size = max(size, min_size);
+		m_ptr = Pointer(static_cast<value_type*>(bv_allocate(alloc_size)), free);
+		THROW_CHECK0(runtime_error, m_ptr);
+		m_size = alloc_size;
+		memcpy(m_ptr.get(), begin, size);
+	}
+
+	bool
+	ByteVector::operator==(const ByteVector &that) const
+	{
+		if (!m_ptr) {
+			return !that.m_ptr;
+		}
+		if (!that.m_ptr) {
+			return false;
+		}
+		if (m_size != that.m_size) {
+			return false;
+		}
+		if (m_ptr.get() == that.m_ptr.get()) {
+			return true;
+		}
+		return !memcmp(m_ptr.get(), that.m_ptr.get(), m_size);
+	}
+
+	// Remove the bytes in [begin, end) from this view without moving data.
+	// The range must start at the first byte of the view or stop at its end;
+	// a range that touches neither end is rejected with out_of_range, as is
+	// a range longer than the view.  An empty range is a no-op.
+	void
+	ByteVector::erase(iterator begin, iterator end)
+	{
+		const size_t size = end - begin;
+		if (!size) return;
+		THROW_CHECK0(out_of_range, m_ptr);
+		const iterator my_begin = m_ptr.get();
+		const iterator my_end = my_begin + m_size;
+		THROW_CHECK4(out_of_range, my_begin, begin, my_end, end, my_begin == begin || my_end == end);
+		// One end of the range is anchored to the view, so the other end is
+		// inside the view exactly when the range is no longer than the view.
+		// This also rejects end < begin, which appears here as a very large
+		// unsigned size.  Without this check m_size below would wrap around.
+		THROW_CHECK2(out_of_range, size, m_size, size <= m_size);
+		if (begin == my_begin) {
+			if (end == my_end) {
+				// Whole view erased: drop the buffer reference entirely
+				m_size = 0;
+				m_ptr.reset();
+				return;
+			}
+			// Erasing a prefix: re-point at the first surviving byte
+			m_ptr = Pointer(m_ptr, end);
+		}
+		// Prefix or suffix removed: only the length shrinks
+		m_size -= size;
+	}
+
+	void
+	ByteVector::erase(iterator begin)
+	{
+		erase(begin, begin + 1);
+	}
+
+	ByteVector::value_type*
+	ByteVector::data() const
+	{
+		return m_ptr.get();
+	}
+}
--- a/lib/extentwalker.cc
+++ b/lib/extentwalker.cc
@@ -496,7 +496,7 @@ namespace crucible {
 	BtrfsExtentWalker::Vec
 	BtrfsExtentWalker::get_extent_map(off_t pos)
 	{
-		BtrfsIoctlSearchKey sk(65536);
+		BtrfsIoctlSearchKey sk;
 		if (!m_root_fd) {
 			m_root_fd = m_fd;
 		}
@@ -640,9 +640,7 @@ namespace crucible {
 	ExtentWalker::get_extent_map(off_t pos)
 	{
 		EWLOG("get_extent_map(" << to_hex(pos) << ")");
-		Fiemap fm;
-		fm.fm_start = ranged_cast<uint64_t>(pos);
-		fm.fm_length = ranged_cast<uint64_t>(numeric_limits<off_t>::max() - pos);
+		Fiemap fm(ranged_cast<uint64_t>(pos), ranged_cast<uint64_t>(numeric_limits<off_t>::max() - pos));
 		fm.m_max_count = fm.m_min_count = sc_extent_fetch_max;
 		fm.do_ioctl(m_fd);
 		Vec rv;
--- a/lib/fd.cc
+++ b/lib/fd.cc
@@ -362,7 +362,7 @@ namespace crucible {
                }
 		int rv = ::pwrite(fd, buf, size, offset);
 		if (rv != static_cast<int>(size)) {
-			THROW_ERROR(runtime_error, "pwrite: only " << rv << " of " << size << " bytes written at offset " << offset);
+			THROW_ERROR(runtime_error, "pwrite: only " << rv << " of " << size << " bytes written at fd " << name_fd(fd) << " offset " << offset);
 		}
 	}

@@ -442,7 +442,7 @@ namespace crucible {
 					THROW_ERRNO("pread: " << size << " bytes");
 				}
 				if (rv != static_cast<int>(size)) {
-					THROW_ERROR(runtime_error, "pread: " << size << " bytes at offset " << offset << " returned " << rv);
+					THROW_ERROR(runtime_error, "pread: " << size << " bytes at fd " << name_fd(fd) << " offset " << offset << " returned " << rv);
 				}
 				break;
 			}
@@ -458,28 +458,14 @@ namespace crucible {

 	template<>
 	void
-	pread_or_die<vector<char>>(int fd, vector<char> &text, off_t offset)
+	pread_or_die<ByteVector>(int fd, ByteVector &text, off_t offset)
 	{
 		return pread_or_die(fd, text.data(), text.size(), offset);
 	}

 	template<>
 	void
-	pread_or_die<vector<uint8_t>>(int fd, vector<uint8_t> &text, off_t offset)
-	{
-		return pread_or_die(fd, text.data(), text.size(), offset);
-	}
-
-	template<>
-	void
-	pwrite_or_die<vector<uint8_t>>(int fd, const vector<uint8_t> &text, off_t offset)
-	{
-		return pwrite_or_die(fd, text.data(), text.size(), offset);
-	}
-
-	template<>
-	void
-	pwrite_or_die<vector<char>>(int fd, const vector<char> &text, off_t offset)
+	pwrite_or_die<ByteVector>(int fd, const ByteVector &text, off_t offset)
 	{
 		return pwrite_or_die(fd, text.data(), text.size(), offset);
 	}
@@ -491,9 +477,9 @@ namespace crucible {
 		return pwrite_or_die(fd, text.data(), text.size(), offset);
 	}

-	Stat::Stat()
+	Stat::Stat() :
+		stat( (stat) { } )
 	{
-		memset_zero<stat>(this);
 	}

 	Stat &
@@ -512,15 +498,15 @@ namespace crucible {
 		return *this;
 	}

-	Stat::Stat(int fd)
+	Stat::Stat(int fd) :
+		stat( (stat) { } )
 	{
-		memset_zero<stat>(this);
 		fstat(fd);
 	}

-	Stat::Stat(const string &filename)
+	Stat::Stat(const string &filename) :
+		stat( (stat) { } )
 	{
-		memset_zero<stat>(this);
 		lstat(filename);
 	}

--- a/lib/fs.cc
+++ b/lib/fs.cc
@@ -32,19 +32,23 @@ namespace crucible {
 #endif
 	}

-	BtrfsExtentInfo::BtrfsExtentInfo(int dst_fd, off_t dst_offset)
+	BtrfsExtentInfo::BtrfsExtentInfo(int dst_fd, off_t dst_offset) :
+		btrfs_ioctl_same_extent_info( (btrfs_ioctl_same_extent_info) { } )
 	{
-		memset_zero<btrfs_ioctl_same_extent_info>(this);
+		assert(fd == 0);
+		assert(logical_offset == 0);
+		assert(bytes_deduped == 0);
+		assert(status == 0);
+		assert(reserved == 0);
 		fd = dst_fd;
 		logical_offset = dst_offset;
 	}

 	BtrfsExtentSame::BtrfsExtentSame(int src_fd, off_t src_offset, off_t src_length) :
+		m_logical_offset(src_offset),
+		m_length(src_length),
 		m_fd(src_fd)
 	{
-		memset_zero<btrfs_ioctl_same_args>(this);
-		logical_offset = src_offset;
-		length = src_length;
 	}

 	BtrfsExtentSame::~BtrfsExtentSame()
@@ -111,11 +115,8 @@ namespace crucible {
 				os << " '" << fd_name << "'";
 			});
 		}
-		os << ", .logical_offset = " << to_hex(bes.logical_offset);
-		os << ", .length = " << to_hex(bes.length);
-		os << ", .dest_count = " << bes.dest_count;
-		os << ", .reserved1 = " << bes.reserved1;
-		os << ", .reserved2 = " << bes.reserved2;
+		os << ", .logical_offset = " << to_hex(bes.m_logical_offset);
+		os << ", .length = " << to_hex(bes.m_length);
 		os << ", .info[] = {";
 		for (size_t i = 0; i < bes.m_info.size(); ++i) {
 			os << " [" << i << "] = " << &(bes.m_info[i]) << ",";
@@ -126,22 +127,25 @@ namespace crucible {
 	void
 	btrfs_clone_range(int src_fd, off_t src_offset, off_t src_length, int dst_fd, off_t dst_offset)
 	{
-		struct btrfs_ioctl_clone_range_args args;
-		memset_zero(&args);
-		args.src_fd = src_fd;
-		args.src_offset = src_offset;
-		args.src_length = src_length;
-		args.dest_offset = dst_offset;
+		btrfs_ioctl_clone_range_args args ( (btrfs_ioctl_clone_range_args) {
+			.src_fd = src_fd,
+			.src_offset = ranged_cast<uint64_t, off_t>(src_offset),
+			.src_length = ranged_cast<uint64_t, off_t>(src_length),
+			.dest_offset = ranged_cast<uint64_t, off_t>(dst_offset),
+		} );
 		DIE_IF_MINUS_ONE(ioctl(dst_fd, BTRFS_IOC_CLONE_RANGE, &args));
 	}

 	void
 	BtrfsExtentSame::do_ioctl()
 	{
-		dest_count = m_info.size();
-		vector<uint8_t> ioctl_arg = vector_copy_struct<btrfs_ioctl_same_args>(this);
-		ioctl_arg.resize(sizeof(btrfs_ioctl_same_args) + dest_count * sizeof(btrfs_ioctl_same_extent_info), 0);
-		btrfs_ioctl_same_args *ioctl_ptr = reinterpret_cast<btrfs_ioctl_same_args *>(ioctl_arg.data());
+		const size_t buf_size = sizeof(btrfs_ioctl_same_args) + m_info.size() * sizeof(btrfs_ioctl_same_extent_info);
+		ByteVector ioctl_arg( (btrfs_ioctl_same_args) {
+			.logical_offset = m_logical_offset,
+			.length = m_length,
+			.dest_count = ranged_cast<decltype(btrfs_ioctl_same_args::dest_count)>(m_info.size()),
+		}, buf_size);
+		btrfs_ioctl_same_args *const ioctl_ptr = ioctl_arg.get<btrfs_ioctl_same_args>();
 		size_t count = 0;
 		for (auto i = m_info.cbegin(); i != m_info.cend(); ++i) {
 			ioctl_ptr->info[count] = static_cast<const btrfs_ioctl_same_extent_info &>(m_info[count]);
@@ -194,18 +198,15 @@ namespace crucible {
 	void *
 	BtrfsDataContainer::prepare(size_t container_size)
 	{
-		if (m_data.size() < container_size) {
-			m_data.resize(container_size);
-		}
-		btrfs_data_container *p = reinterpret_cast<btrfs_data_container *>(m_data.data());
 		const size_t min_size = offsetof(btrfs_data_container, val);
 		if (container_size < min_size) {
 			THROW_ERROR(out_of_range, "container size " << container_size << " smaller than minimum " << min_size);
 		}
-		p->bytes_left = 0;
-		p->bytes_missing = 0;
-		p->elem_cnt = 0;
-		p->elem_missed = 0;
+		if (m_data.size() < container_size) {
+			m_data = ByteVector(container_size);
+		}
+		const auto p = m_data.get<btrfs_data_container>();
+		*p = (btrfs_data_container) { };
 		return p;
 	}

@@ -218,25 +219,29 @@ namespace crucible {
 	decltype(btrfs_data_container::bytes_left)
 	BtrfsDataContainer::get_bytes_left() const
 	{
-		return bytes_left;
+		const auto p = m_data.get<btrfs_data_container>();
+		return p->bytes_left;
 	}

 	decltype(btrfs_data_container::bytes_missing)
 	BtrfsDataContainer::get_bytes_missing() const
 	{
-		return bytes_missing;
+		const auto p = m_data.get<btrfs_data_container>();
+		return p->bytes_missing;
 	}

 	decltype(btrfs_data_container::elem_cnt)
 	BtrfsDataContainer::get_elem_cnt() const
 	{
-		return elem_cnt;
+		const auto p = m_data.get<btrfs_data_container>();
+		return p->elem_cnt;
 	}

 	decltype(btrfs_data_container::elem_missed)
 	BtrfsDataContainer::get_elem_missed() const
 	{
-		return elem_missed;
+		const auto p = m_data.get<btrfs_data_container>();
+		return p->elem_missed;
 	}

 	ostream &
@@ -257,10 +262,13 @@ namespace crucible {
 	}

 	BtrfsIoctlLogicalInoArgs::BtrfsIoctlLogicalInoArgs(uint64_t new_logical, size_t new_size) :
+		btrfs_ioctl_logical_ino_args( (btrfs_ioctl_logical_ino_args) { } ),
 		m_container_size(new_size),
 		m_container(new_size)
 	{
-		memset_zero<btrfs_ioctl_logical_ino_args>(this);
+		assert(logical == 0);
+		assert(size == 0);
+		assert(flags == 0);
 		logical = new_logical;
 	}

@@ -328,7 +336,7 @@ namespace crucible {
 	bool
 	BtrfsIoctlLogicalInoArgs::do_ioctl_nothrow(int fd)
 	{
-		btrfs_ioctl_logical_ino_args *p = static_cast<btrfs_ioctl_logical_ino_args *>(this);
+		btrfs_ioctl_logical_ino_args *const p = static_cast<btrfs_ioctl_logical_ino_args *>(this);
 		inodes = reinterpret_cast<uint64_t>(m_container.prepare(m_container_size));
 		size = m_container.get_size();

@@ -367,8 +375,8 @@ namespace crucible {
 			bili_version = BTRFS_IOC_LOGICAL_INO_V2;
 		}

-		btrfs_data_container *bdc = reinterpret_cast<btrfs_data_container *>(p->inodes);
-		BtrfsInodeOffsetRoot *input_iter = reinterpret_cast<BtrfsInodeOffsetRoot *>(bdc->val);
+		btrfs_data_container *const bdc = reinterpret_cast<btrfs_data_container *>(p->inodes);
+		BtrfsInodeOffsetRoot *const input_iter = reinterpret_cast<BtrfsInodeOffsetRoot *>(bdc->val);

 		// elem_cnt counts uint64_t, but BtrfsInodeOffsetRoot is 3x uint64_t
 		THROW_CHECK1(runtime_error, bdc->elem_cnt, bdc->elem_cnt % 3 == 0);
@@ -396,9 +404,10 @@ namespace crucible {
 	}

 	BtrfsIoctlInoPathArgs::BtrfsIoctlInoPathArgs(uint64_t inode, size_t new_size) :
+		btrfs_ioctl_ino_path_args( (btrfs_ioctl_ino_path_args) { } ),
 		m_container_size(new_size)
 	{
-		memset_zero<btrfs_ioctl_ino_path_args>(this);
+		assert(inum == 0);
 		inum = inode;
 	}

@@ -416,14 +425,14 @@ namespace crucible {
 			return false;
 		}

-		btrfs_data_container *bdc = reinterpret_cast<btrfs_data_container *>(p->fspath);
+		btrfs_data_container *const bdc = reinterpret_cast<btrfs_data_container *>(p->fspath);
 		m_paths.reserve(bdc->elem_cnt);

 		const uint64_t *up = reinterpret_cast<const uint64_t *>(bdc->val);
-		const char *cp = reinterpret_cast<const char *>(bdc->val);
+		const char *const cp = reinterpret_cast<const char *>(bdc->val);

 		for (auto count = bdc->elem_cnt; count > 0; --count) {
-			const char *path = cp + *up++;
+			const char *const path = cp + *up++;
 			if (static_cast<size_t>(path - cp) > container.get_size()) {
 				THROW_ERROR(out_of_range, "offset " << (path - cp) << " > size " << container.get_size() << " in " << __PRETTY_FUNCTION__);
 			}
@@ -458,9 +467,10 @@ namespace crucible {
 		return os;
 	}

-	BtrfsIoctlInoLookupArgs::BtrfsIoctlInoLookupArgs(uint64_t new_objectid)
+	BtrfsIoctlInoLookupArgs::BtrfsIoctlInoLookupArgs(uint64_t new_objectid) :
+		btrfs_ioctl_ino_lookup_args( (btrfs_ioctl_ino_lookup_args) { } )
 	{
-		memset_zero<btrfs_ioctl_ino_lookup_args>(this);
+		assert(objectid == 0);
 		objectid = new_objectid;
 	}

@@ -478,9 +488,9 @@ namespace crucible {
 		}
 	}

-	BtrfsIoctlDefragRangeArgs::BtrfsIoctlDefragRangeArgs()
+	BtrfsIoctlDefragRangeArgs::BtrfsIoctlDefragRangeArgs() :
+		btrfs_ioctl_defrag_range_args( (btrfs_ioctl_defrag_range_args) { } )
 	{
-		memset_zero<btrfs_ioctl_defrag_range_args>(this);
 	}

 	bool
@@ -537,9 +547,9 @@ namespace crucible {
 		return os;
 	}

-	FiemapExtent::FiemapExtent()
+	FiemapExtent::FiemapExtent() :
+		fiemap_extent( (fiemap_extent) { } )
 	{
-		memset_zero<fiemap_extent>(this);
 	}

 	FiemapExtent::FiemapExtent(const fiemap_extent &that)
@@ -646,13 +656,10 @@ namespace crucible {
 	operator<<(ostream &os, const Fiemap &args)
 	{
 		os << "Fiemap {";
-		os << " .fm_start = " << to_hex(args.fm_start) << ".." << to_hex(args.fm_start + args.fm_length);
-		os << ", .fm_length = " << to_hex(args.fm_length);
-		if (args.fm_flags) os << ", .fm_flags = " << fiemap_flags_ntoa(args.fm_flags);
-		os << ", .fm_mapped_extents = " << args.fm_mapped_extents;
-		os << ", .fm_extent_count = " << args.fm_extent_count;
-		if (args.fm_reserved) os << ", .fm_reserved = " << args.fm_reserved;
-		os << ", .fm_extents[] = {";
+		os << " .m_start = " << to_hex(args.m_start) << ".." << to_hex(args.m_start + args.m_length);
+		os << ", .m_length = " << to_hex(args.m_length);
+		os << ", .m_flags = " << fiemap_flags_ntoa(args.m_flags);
+		os << ", .fm_extents[" << args.m_extents.size() << "] = {";
 		size_t count = 0;
 		for (auto i = args.m_extents.cbegin(); i != args.m_extents.cend(); ++i) {
 			os << "\n\t[" << count++ << "] = " << &(*i) << ",";
@@ -660,41 +667,35 @@ namespace crucible {
 		return os << "\n}";
 	}

-	Fiemap::Fiemap(uint64_t start, uint64_t length)
+	Fiemap::Fiemap(uint64_t start, uint64_t length) :
+		m_start(start),
+		m_length(length)
 	{
-		memset_zero<fiemap>(this);
-		fm_start = start;
-		fm_length = length;
-		// FIEMAP is slow and full of lines.
-		// This makes FIEMAP even slower, but reduces the lies a little.
-		fm_flags = FIEMAP_FLAG_SYNC;
 	}

 	void
 	Fiemap::do_ioctl(int fd)
 	{
 		THROW_CHECK1(out_of_range, m_min_count, m_min_count <= m_max_count);
+		THROW_CHECK1(out_of_range, m_min_count, m_min_count > 0);

-		auto extent_count = m_min_count;
-		vector<uint8_t> ioctl_arg = vector_copy_struct<fiemap>(this);
+		const auto extent_count = m_min_count;
+		ByteVector ioctl_arg(sizeof(fiemap) + extent_count * sizeof(fiemap_extent));

-		ioctl_arg.resize(sizeof(fiemap) + extent_count * sizeof(fiemap_extent), 0);
+		fiemap *const ioctl_ptr = ioctl_arg.get<fiemap>();

-		fiemap *ioctl_ptr = reinterpret_cast<fiemap *>(ioctl_arg.data());
-
-		auto start = fm_start;
-		auto end = fm_start + fm_length;
-
-		auto orig_start = fm_start;
-		auto orig_length = fm_length;
+		auto start = m_start;
+		const auto end = m_start + m_length;

 		vector<FiemapExtent> extents;

 		while (start < end && extents.size() < m_max_count) {
-			ioctl_ptr->fm_start = start;
-			ioctl_ptr->fm_length = end - start;
-			ioctl_ptr->fm_extent_count = extent_count;
-			ioctl_ptr->fm_mapped_extents = 0;
+			*ioctl_ptr = (fiemap) {
+				.fm_start = start,
+				.fm_length = end - start,
+				.fm_flags = m_flags,
+				.fm_extent_count = extent_count,
+			};

 			// cerr << "Before (fd = " << fd << ") : " << ioctl_ptr << endl;
 			DIE_IF_MINUS_ONE(ioctl(fd, FS_IOC_FIEMAP, ioctl_ptr));
@@ -720,74 +721,89 @@ namespace crucible {
 			}
 		}

-		fiemap *this_ptr = static_cast<fiemap *>(this);
-		*this_ptr = *ioctl_ptr;
-		fm_start = orig_start;
-		fm_length = orig_length;
-		fm_extent_count = extents.size();
 		m_extents = extents;
 	}

 	BtrfsIoctlSearchKey::BtrfsIoctlSearchKey(size_t buf_size) :
+		btrfs_ioctl_search_key( (btrfs_ioctl_search_key) {
+			.max_objectid = numeric_limits<decltype(max_objectid)>::max(),
+			.max_offset = numeric_limits<decltype(max_offset)>::max(),
+			.max_transid = numeric_limits<decltype(max_transid)>::max(),
+			.max_type = numeric_limits<decltype(max_type)>::max(),
+			.nr_items = 1,
+		}),
 		m_buf_size(buf_size)
 	{
-		memset_zero<btrfs_ioctl_search_key>(this);
-		max_objectid = numeric_limits<decltype(max_objectid)>::max();
-		max_offset = numeric_limits<decltype(max_offset)>::max();
-		max_transid = numeric_limits<decltype(max_transid)>::max();
-		max_type = numeric_limits<decltype(max_type)>::max();
-		nr_items = numeric_limits<decltype(nr_items)>::max();
 	}

-	BtrfsIoctlSearchHeader::BtrfsIoctlSearchHeader()
+	BtrfsIoctlSearchHeader::BtrfsIoctlSearchHeader() :
+		btrfs_ioctl_search_header( (btrfs_ioctl_search_header) { } )
 	{
-		memset_zero<btrfs_ioctl_search_header>(this);
 	}

 	size_t
-	BtrfsIoctlSearchHeader::set_data(const vector<uint8_t> &v, size_t offset)
+	BtrfsIoctlSearchHeader::set_data(const ByteVector &v, size_t offset)
 	{
 		THROW_CHECK2(invalid_argument, offset, v.size(), offset + sizeof(btrfs_ioctl_search_header) <= v.size());
 		memcpy(static_cast<btrfs_ioctl_search_header *>(this), &v[offset], sizeof(btrfs_ioctl_search_header));
 		offset += sizeof(btrfs_ioctl_search_header);
 		THROW_CHECK2(invalid_argument, offset + len, v.size(), offset + len <= v.size());
-		m_data = Spanner<const uint8_t>(&v[offset], &v[offset + len]);
+		m_data = ByteVector(v, offset, len);
 		return offset + len;
 	}

 	bool
 	BtrfsIoctlSearchKey::do_ioctl_nothrow(int fd)
 	{
-		// Normally we like to be paranoid and fill empty bytes with zero,
-		// but these buffers can be huge.  80% of a 4GHz CPU huge.
-
-		// Keep the ioctl buffer from one run to the next to save on malloc costs
-		size_t target_buf_size = sizeof(btrfs_ioctl_search_args_v2) + m_buf_size;
-
-		m_ioctl_arg = vector_copy_struct<btrfs_ioctl_search_key>(this);
-		m_ioctl_arg.resize(target_buf_size);
+		// It would be really nice if the kernel told us whether our
+		// buffer overflowed or how big the overflowing object
+		// was; instead, we have to guess.

 		m_result.clear();
+		// Make sure there is space for at least the search key and one (empty) header
+		size_t buf_size = max(m_buf_size, sizeof(btrfs_ioctl_search_args_v2) + sizeof(btrfs_ioctl_search_header));
+		ByteVector ioctl_arg;
+		btrfs_ioctl_search_args_v2 *ioctl_ptr;
+		do {
+			// ioctl buffer size does not include search key header or buffer size
+			ioctl_arg = ByteVector(buf_size + sizeof(btrfs_ioctl_search_args_v2));
+			ioctl_ptr = ioctl_arg.get<btrfs_ioctl_search_args_v2>();
+			ioctl_ptr->key = static_cast<const btrfs_ioctl_search_key&>(*this);
+			ioctl_ptr->buf_size = buf_size;
+			// Don't bother supporting V1.  Kernels that old have other problems.
+			int rv = ioctl(fd, BTRFS_IOC_TREE_SEARCH_V2, ioctl_arg.data());
+			if (rv != 0 && errno != EOVERFLOW) {
+				return false;
+			}
+			if (rv == 0 && nr_items <= ioctl_ptr->key.nr_items) {
+				// got all the items we wanted, thanks
+				m_buf_size = max(m_buf_size, buf_size);
+				break;
+			}
+			// Didn't get all the items we wanted.  Increase the buf size and try again.
+			// These sizes are very common on default-formatted btrfs, so use these
+			// instead of naive doubling.
+			if (buf_size < 4096) {
+				buf_size = 4096;
+			} else if (buf_size < 16384) {
+				buf_size = 16384;
+			} else if (buf_size < 65536) {
+				buf_size = 65536;
+			} else {
+				buf_size *= 2;
+			}
+			// don't automatically raise the buf size higher than 64K, the largest possible btrfs item
+		} while (buf_size < 65536);

-		btrfs_ioctl_search_args_v2 *ioctl_ptr = reinterpret_cast<btrfs_ioctl_search_args_v2 *>(m_ioctl_arg.data());
-
-		ioctl_ptr->buf_size = m_buf_size;
-
-		// Don't bother supporting V1.  Kernels that old have other problems.
-		int rv = ioctl(fd, BTRFS_IOC_TREE_SEARCH_V2, ioctl_ptr);
-		if (rv != 0) {
-			return false;
-		}
-
+		// ioctl changes nr_items, this has to be copied back
 		static_cast<btrfs_ioctl_search_key&>(*this) = ioctl_ptr->key;

 		size_t offset = pointer_distance(ioctl_ptr->buf, ioctl_ptr);
 		for (decltype(nr_items) i = 0; i < nr_items; ++i) {
 			BtrfsIoctlSearchHeader item;
-			offset = item.set_data(m_ioctl_arg, offset);
+			offset = item.set_data(ioctl_arg, offset);
 			m_result.insert(item);
 		}
-
 		return true;
 	}

@@ -795,7 +811,7 @@ namespace crucible {
 	BtrfsIoctlSearchKey::do_ioctl(int fd)
 	{
 		if (!do_ioctl_nothrow(fd)) {
-			THROW_ERRNO("BTRFS_IOC_TREE_SEARCH_V2: " << name_fd(fd));
+			THROW_ERRNO("BTRFS_IOC_TREE_SEARCH_V2: " << name_fd(fd) << ": " << *this);
 		}
 	}

@@ -806,8 +822,46 @@ namespace crucible {
 		min_type = ref.type;
 		min_offset = ref.offset + 1;
 		if (min_offset < ref.offset) {
-			// We wrapped, try the next objectid
-			++min_objectid;
+			// We wrapped, try the next type
+			++min_type;
+			assert(min_offset == 0);
+			if (min_type < ref.type) {
+				assert(min_type == 0);
+				// We wrapped, try the next objectid
+				++min_objectid;
+				// no advancement possible at end
+				THROW_CHECK1(runtime_error, min_type, min_type == 0);
+			}
+		}
+	}
+
+	void
+	BtrfsIoctlSearchKey::next_min(const BtrfsIoctlSearchHeader &ref, const uint8_t type)
+	{
+		if (ref.type < type) {
+			// forward to type in same object with zero offset
+			min_objectid = ref.objectid;
+			min_type = type;
+			min_offset = 0;
+		} else if (ref.type > type) {
+			// skip directly to start of next objectid with target type
+			min_objectid = ref.objectid + 1;
+			// no advancement possible at end
+			THROW_CHECK2(out_of_range, min_objectid, ref.objectid, min_objectid > ref.objectid);
+			min_type = type;
+			min_offset = 0;
+		} else {
+			// advance within this type
+			min_objectid = ref.objectid;
+			min_type = ref.type;
+			min_offset = ref.offset + 1;
+			if (min_offset < ref.offset) {
+				// We wrapped, try the next objectid, same type
+				++min_objectid;
+				THROW_CHECK2(out_of_range, min_objectid, ref.objectid, min_objectid > ref.objectid);
+				min_type = type;
+				assert(min_offset == 0);
+			}
 		}
 	}

@@ -815,7 +869,7 @@ namespace crucible {
 	ostream &
 	hexdump(ostream &os, const V &v)
 	{
-		os << "vector<uint8_t> { size = " << v.size() << ", data:\n";
+		os << "V { size = " << v.size() << ", data:\n";
 		for (size_t i = 0; i < v.size(); i += 8) {
 			string hex, ascii;
 			for (size_t j = i; j < i + 8; ++j) {
@@ -1029,9 +1083,9 @@ namespace crucible {
 		return rv;
 	}

-	Statvfs::Statvfs()
+	Statvfs::Statvfs() :
+		statvfs( (statvfs) { } )
 	{
-		memset_zero<statvfs>(this);
 	}

 	Statvfs::Statvfs(int fd) :
@@ -1082,16 +1136,20 @@ namespace crucible {
 		return os << " }";
 	};

-	BtrfsIoctlFsInfoArgs::BtrfsIoctlFsInfoArgs()
+	BtrfsIoctlFsInfoArgs::BtrfsIoctlFsInfoArgs() :
+		btrfs_ioctl_fs_info_args_v3( (btrfs_ioctl_fs_info_args_v3) {
+			.flags = 0
+				| BTRFS_FS_INFO_FLAG_CSUM_INFO
+				| BTRFS_FS_INFO_FLAG_GENERATION
+			,
+		})
 	{
-		memset_zero<btrfs_ioctl_fs_info_args_v2>(this);
-		flags = BTRFS_FS_INFO_FLAG_CSUM_INFO;
 	}

 	void
 	BtrfsIoctlFsInfoArgs::do_ioctl(int fd)
 	{
-		btrfs_ioctl_fs_info_args_v2 *p = static_cast<btrfs_ioctl_fs_info_args_v2 *>(this);
+		btrfs_ioctl_fs_info_args_v3 *p = static_cast<btrfs_ioctl_fs_info_args_v3 *>(this);
 		if (ioctl(fd, BTRFS_IOC_FS_INFO, p)) {
 			THROW_ERRNO("BTRFS_IOC_FS_INFO: fd " << fd);
 		}
@@ -1100,13 +1158,19 @@ namespace crucible {
 	uint16_t
 	BtrfsIoctlFsInfoArgs::csum_type() const
 	{
-		return this->btrfs_ioctl_fs_info_args_v2::csum_type;
+		return this->btrfs_ioctl_fs_info_args_v3::csum_type;
 	}

 	uint16_t
 	BtrfsIoctlFsInfoArgs::csum_size() const
 	{
-		return this->btrfs_ioctl_fs_info_args_v2::csum_size;
+		return this->btrfs_ioctl_fs_info_args_v3::csum_size;
+	}
+
+	uint64_t
+	BtrfsIoctlFsInfoArgs::generation() const
+	{
+		return this->btrfs_ioctl_fs_info_args_v3::generation;
 	}

 };
--- a/lib/task.cc
+++ b/lib/task.cc
@@ -89,6 +89,7 @@ namespace crucible {

 		TaskState &operator=(const TaskState &) = delete;
 		TaskState(const TaskState &) = delete;
+		TaskState(TaskState &&) = delete;

 	public:
 		~TaskState();
@@ -199,13 +200,19 @@ namespace crucible {
 			tlcc->m_local_queue.splice(tlcc->m_local_queue.begin(), queue);
 		} else {
 			// We are not executing under a TaskConsumer.
-			// Create a new task to wrap our post-exec queue,
-			// then push it to the front of the global queue using normal locking methods.
-			TaskStatePtr rescue_task(make_shared<TaskState>("rescue_task", [](){}));
-			swap(rescue_task->m_post_exec_queue, queue);
-			TaskQueue tq_one { rescue_task };
-			TaskMasterState::push_front(tq_one);
+			// If there is only one task, then just insert it at the front of the queue.
+			if (queue.size() == 1) {
+				TaskMasterState::push_front(queue);
+			} else {
+				// If there are multiple tasks, create a new task to wrap our post-exec queue,
+				// then push it to the front of the global queue using normal locking methods.
+				TaskStatePtr rescue_task(make_shared<TaskState>("rescue_task", [](){}));
+				swap(rescue_task->m_post_exec_queue, queue);
+				TaskQueue tq_one { rescue_task };
+				TaskMasterState::push_front(tq_one);
+			}
 		}
+		assert(queue.empty());
 	}

 	TaskState::~TaskState()
@@ -286,23 +293,23 @@ namespace crucible {
 			--m_run_count;
 			m_is_running = true;
 		}
+
+		TaskStatePtr this_task = shared_from_this();
+		swap(this_task, tl_current_task);
 		lock.unlock();

 		char buf[24] = { 0 };
 		DIE_IF_MINUS_ERRNO(pthread_getname_np(pthread_self(), buf, sizeof(buf)));
 		DIE_IF_MINUS_ERRNO(pthread_setname_np(pthread_self(), m_title.c_str()));

-		TaskStatePtr this_task = shared_from_this();
-		swap(this_task, tl_current_task);
-
 		catch_all([&]() {
 			m_exec_fn();
 		});

-		swap(this_task, tl_current_task);
 		pthread_setname_np(pthread_self(), buf);

 		lock.lock();
+		swap(this_task, tl_current_task);
 		m_is_running = false;

 		// Splice task post_exec queue at front of local queue
@@ -749,6 +756,7 @@ namespace crucible {
 		// There is no longer a current consumer, but hold our own shared
 		// state so it's still there in the destructor
 		swap(this_consumer, tl_current_consumer);
+		assert(!tl_current_consumer);

 		// Release lock to rescue queue (may attempt to queue a new task at TaskMaster).
 		// rescue_queue normally sends tasks to the local queue of the current TaskConsumer thread,
--- a/lib/uname.cc
+++ b/lib/uname.cc
@@ -0,0 +1,11 @@
+#include "crucible/error.h"
+#include "crucible/uname.h"
+
+namespace crucible {
+	using namespace std;
+
+	Uname::Uname()
+	{
+		DIE_IF_NON_ZERO(uname(static_cast<utsname*>(this)));
+	}
+}
--- a/2
+++ b/2
@@ -10,4 +10,4 @@ CCFLAGS = -Wall -Wextra -Werror -O3
 CCFLAGS += -I../include -D_FILE_OFFSET_BITS=64

 BEES_CFLAGS   = $(CCFLAGS) -std=c99 $(CFLAGS)
-BEES_CXXFLAGS = $(CCFLAGS) -std=c++11 -Wold-style-cast $(CXXFLAGS)
+BEES_CXXFLAGS = $(CCFLAGS) -std=c++11 -Wold-style-cast -Wno-missing-field-initializers $(CXXFLAGS)
--- a/scripts/beesd.in
+++ b/scripts/beesd.in
@@ -31,20 +31,18 @@ help(){
    exec "$bees_bin" --help
 }

-get_bees_supp_opts(){
-    "$bees_bin" --help |& awk '/--../ { gsub( ",", "" ); print $1 " " $2}'
-}
-
-SUPPORTED_ARGS=(
-    $(get_bees_supp_opts)
-)
+for i in $("$bees_bin" --help 2>&1 | grep -E " --" | sed -e "s/^[^-]*-/-/" -e "s/,[^-]*--/ --/" -e "s/ [^-]*$//")
+do
+   TMP_ARGS="$TMP_ARGS $i"
+done
+IFS=" " read -r -a SUPPORTED_ARGS <<< $TMP_ARGS
 NOT_SUPPORTED_ARGS=()
 ARGUMENTS=()

 for arg in "${@}"; do
    supp=false
    for supp_arg in "${SUPPORTED_ARGS[@]}"; do
-        if [ "$arg" == "$supp_arg" ]; then
+        if [[ "$arg" == ${supp_arg}* ]]; then
            supp=true
            break
        fi
@@ -73,7 +71,7 @@ done
 [ -z "$UUID" ] && help


-FILE_CONFIG="$(egrep -l '^[^#]*UUID\s*=\s*"?'"$UUID" "$CONFIG_DIR"/*.conf | head -1)"
+FILE_CONFIG="$(grep -E -l '^[^#]*UUID\s*=\s*"?'"$UUID" "$CONFIG_DIR"/*.conf | head -1)"
 [ ! -f "$FILE_CONFIG" ] && ERRO "No config for $UUID"
 INFO "Find $UUID in $FILE_CONFIG, use as conf"
 source "$FILE_CONFIG"
--- a/scripts/beesd@.service.in
+++ b/scripts/beesd@.service.in
@@ -17,6 +17,7 @@ KillSignal=SIGTERM
 MemoryAccounting=true
 Nice=19
 Restart=on-abnormal
+RuntimeDirectory=bees
 StartupCPUWeight=25
 StartupIOWeight=25

--- a/src/bees-context.cc
+++ b/src/bees-context.cc
@@ -187,20 +187,20 @@ BeesContext::is_root_ro(uint64_t root)
 }

 bool
-BeesContext::dedup(const BeesRangePair &brp)
+BeesContext::dedup(const BeesRangePair &brp_in)
 {
 	// TOOLONG and NOTE can retroactively fill in the filename details, but LOG can't
-	BEESNOTE("dedup " << brp);
+	BEESNOTE("dedup " << brp_in);

-	brp.second.fd(shared_from_this());
-
-	if (is_root_ro(brp.second.fid().root())) {
-		// BEESLOGDEBUG("WORKAROUND: dst root is read-only in " << name_fd(brp.second.fd()));
+	if (is_root_ro(brp_in.second.fid().root())) {
+		// BEESLOGDEBUG("WORKAROUND: dst root " << (brp_in.second.fid().root()) << " is read-only");
 		BEESCOUNT(dedup_workaround_btrfs_send);
 		return false;
 	}

+	auto brp = brp_in;
 	brp.first.fd(shared_from_this());
+	brp.second.fd(shared_from_this());

 	BEESTOOLONG("dedup " << brp);

@@ -209,6 +209,8 @@ BeesContext::dedup(const BeesRangePair &brp)

 	BEESLOGINFO("dedup: src " << pretty(brp.first.size())  << " [" << to_hex(brp.first.begin())  << ".." << to_hex(brp.first.end())  << "] {" << first_addr  << "} " << name_fd(brp.first.fd()) << "\n"
 		 << "       dst " << pretty(brp.second.size()) << " [" << to_hex(brp.second.begin()) << ".." << to_hex(brp.second.end()) << "] {" << second_addr << "} " << name_fd(brp.second.fd()));
+	BEESNOTE("dedup: src " << pretty(brp.first.size())  << " [" << to_hex(brp.first.begin())  << ".." << to_hex(brp.first.end())  << "] {" << first_addr  << "} " << name_fd(brp.first.fd()) << "\n"
+		 << "       dst " << pretty(brp.second.size()) << " [" << to_hex(brp.second.begin()) << ".." << to_hex(brp.second.end()) << "] {" << second_addr << "} " << name_fd(brp.second.fd()));

 	if (first_addr.get_physical_or_zero() == second_addr.get_physical_or_zero()) {
 		BEESLOGTRACE("equal physical addresses in dedup");
@@ -292,6 +294,15 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
 	BEESTRACE("scan extent " << e);
 	BEESCOUNT(scan_extent);

+	// EXPERIMENT:  Don't bother with tiny extents unless they are the entire file.
+	// We'll take a tiny extent at BOF or EOF but not in between.
+	if (e.begin() && e.size() < 128 * 1024 && e.end() != Stat(bfr.fd()).st_size) {
+		BEESCOUNT(scan_extent_tiny);
+		// This doesn't work properly with the current architecture,
+		// so we don't do an early return here.
+		// return bfr;
+	}
+
 	// We keep moving this method around
 	auto m_ctx = shared_from_this();

@@ -708,27 +719,28 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
 		BEESLOGINFO("scan: " << pretty(e.size()) << " " << to_hex(e.begin()) << " [" << bar << "] " << to_hex(e.end()) << ' ' << name_fd(bfr.fd()));
 	}

+	// Costs 10% on benchmarks
+	// bees_unreadahead(bfr.fd(), bfr.begin(), bfr.size());
 	return bfr;
 }

 BeesFileRange
-BeesContext::scan_forward(const BeesFileRange &bfr)
+BeesContext::scan_forward(const BeesFileRange &bfr_in)
 {
-	// What are we doing here?
-	BEESTRACE("scan_forward " << bfr);
+	BEESTRACE("scan_forward " << bfr_in);
 	BEESCOUNT(scan_forward);

 	Timer scan_timer;

 	// Silently filter out blacklisted files
-	if (is_blacklisted(bfr.fid())) {
+	if (is_blacklisted(bfr_in.fid())) {
 		BEESCOUNT(scan_blacklisted);
-		return bfr;
+		return bfr_in;
 	}

-	BEESNOTE("scan open " << bfr);
-
 	// Reconstitute FD
+	BEESNOTE("scan open " << bfr_in);
+	auto bfr = bfr_in;
 	bfr.fd(shared_from_this());

 	BEESNOTE("scan extent " << bfr);
@@ -796,8 +808,7 @@ BeesContext::wait_for_balance()
 	Timer balance_timer;
 	BEESNOTE("WORKAROUND: waiting for balance to stop");
 	while (true) {
-		btrfs_ioctl_balance_args args;
-		memset_zero<btrfs_ioctl_balance_args>(&args);
+		btrfs_ioctl_balance_args args {};
 		const int ret = ioctl(root_fd(), BTRFS_IOC_BALANCE_PROGRESS, &args);
 		if (ret < 0) {
 			// Either can't get balance status or not running, exit either way
@@ -835,24 +846,6 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)
 	// transaction latency, competing threads, and freeze/SIGSTOP
 	// pausing the bees process.

-	// There can be only one of these running at a time, or some lingering
-	// backref bug will kill the whole system.  Also it looks like there
-	// are so many locks held while LOGICAL_INO runs that there is no
-	// point in trying to run two of them on the same filesystem.
-	// ...but it works most of the time, and the performance hit from
-	// not running resolve in multiple threads is significant.
-	// But "most of the time" really just means "between forced reboots",
-	// and with recent improvements in kernel uptime, this is now in the
-	// top 3 crash causes.
-	static mutex s_resolve_mutex;
-	unique_lock<mutex> lock(s_resolve_mutex, defer_lock);
-	if (BEES_SERIALIZE_RESOLVE) {
-		BEESNOTE("waiting to resolve addr " << addr);
-		lock.lock();
-	}
-
-	// Is there a bug where resolve and balance cause a crash (BUG_ON at fs/btrfs/ctree.c:1227)?
-	// Apparently yes, and more than one.
 	// Wait for the balance to finish before we run LOGICAL_INO
 	wait_for_balance();

@@ -880,15 +873,15 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)
 	struct rusage usage_after;
 	DIE_IF_MINUS_ONE(getrusage(RUSAGE_THREAD, &usage_after));

-	double sys_usage_delta =
+	const double sys_usage_delta =
 		(usage_after.ru_stime.tv_sec + usage_after.ru_stime.tv_usec / 1000000.0) -
 		(usage_before.ru_stime.tv_sec + usage_before.ru_stime.tv_usec / 1000000.0);

-	double user_usage_delta =
+	const double user_usage_delta =
 		(usage_after.ru_utime.tv_sec + usage_after.ru_utime.tv_usec / 1000000.0) -
 		(usage_before.ru_utime.tv_sec + usage_before.ru_utime.tv_usec / 1000000.0);

-	auto rt_age = resolve_timer.age();
+	const auto rt_age = resolve_timer.age();

 	BeesResolveAddrResult rv;

@@ -912,12 +905,13 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)

 	// Count how many times this happens so we can figure out how
 	// important this case is
-	static size_t most_refs_ever = 2730;
+	static const size_t max_logical_ino_v1_refs = 2730; // (65536 - header_len) / (sizeof(uint64_t) * 3)
+	static size_t most_refs_ever = max_logical_ino_v1_refs;
 	if (rv_count > most_refs_ever) {
 		BEESLOGINFO("addr " << addr << " refs " << rv_count << " beats previous record " << most_refs_ever);
 		most_refs_ever = rv_count;
 	}
-	if (rv_count > 2730) {
+	if (rv_count > max_logical_ino_v1_refs) {
 		BEESCOUNT(resolve_large);
 	}

@@ -1060,9 +1054,13 @@ BeesContext::stop()
 	BEESLOGDEBUG("Waiting for progress thread");
 	m_progress_thread->join();

-	// XXX: nobody can see this BEESNOTE because we are killing the
-	// thread that publishes it
-	BEESNOTE("waiting for status thread");
+	// Write status once with this message...
+	BEESNOTE("stopping status thread at " << stop_timer << " sec");
+	lock.lock();
+	m_stop_condvar.notify_all();
+	lock.unlock();
+
+	// then wake the thread up one more time to exit the while loop
 	BEESLOGDEBUG("Waiting for status thread");
 	lock.lock();
 	m_stop_status = true;
--- a/src/bees-hash.cc
+++ b/src/bees-hash.cc
@@ -3,9 +3,9 @@
 #include "crucible/city.h"
 #include "crucible/crc64.h"
 #include "crucible/string.h"
+#include "crucible/uname.h"

 #include <algorithm>
-#include <random>

 #include <sys/mman.h>

@@ -123,7 +123,7 @@ BeesHashTable::flush_dirty_extent(uint64_t extent_index)
 		THROW_CHECK2(out_of_range, dirty_extent_end, dirty_extent, dirty_extent_end - dirty_extent == BLOCK_SIZE_HASHTAB_EXTENT);
 		BEESTOOLONG("pwrite(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
 		// Copy the extent because we might be stuck writing for a while
-		vector<uint8_t> extent_copy(dirty_extent, dirty_extent_end);
+		ByteVector extent_copy(dirty_extent, dirty_extent_end);

 		// Mark extent non-dirty while we still hold the lock
 		m_extent_metadata.at(extent_index).m_dirty = false;
@@ -206,8 +206,11 @@ BeesHashTable::writeback_loop()
 	}
 	catch_all([&]() {
 		// trigger writeback on our way out
+#if 0
+		// seems to trigger huge latency spikes
 		BEESTOOLONG("unreadahead hash table size " << pretty(m_size));
 		bees_unreadahead(m_fd, 0, m_size);
+#endif
 	});
 	BEESLOGDEBUG("Exited hash table writeback_loop");
 }
@@ -226,6 +229,7 @@ percent(size_t num, size_t den)
 void
 BeesHashTable::prefetch_loop()
 {
+	Uname uname;
 	bool not_locked = true;
 	while (!m_stop_requested) {
 		size_t width = 64;
@@ -319,6 +323,7 @@ BeesHashTable::prefetch_loop()
 		graph_blob << "Now:     " << format_time(time(NULL)) << "\n";
 		graph_blob << "Uptime:  " << m_ctx->total_timer().age() << " seconds\n";
 		graph_blob << "Version: " << BEES_VERSION << "\n";
+		graph_blob << "Kernel:  " << uname.sysname << " " << uname.release << " " << uname.machine << " " << uname.version << "\n";

 		graph_blob
 			<< "\nHash table page occupancy histogram (" << occupied_count << "/" << total_count << " cells occupied, " << (occupied_count * 100 / total_count) << "%)\n"
@@ -538,6 +543,8 @@ BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
 	return found;
 }

+thread_local uniform_int_distribution<size_t> BeesHashTable::tl_distribution(0, c_cells_per_bucket - 1);
+
 /// Insert a hash entry at some unspecified point in the list.
 /// If entry is already present in list, returns true and does not
 /// modify list.  If entry is not present in list, returns false and
@@ -555,9 +562,7 @@ BeesHashTable::push_random_hash_addr(HashType hash, AddrType addr)
 	Cell *ip = find(er.first, er.second, mv);
 	bool found = (ip < er.second);

-	thread_local default_random_engine generator;
-	thread_local uniform_int_distribution<int> distribution(0, c_cells_per_bucket - 1);
-	auto pos = distribution(generator);
+	const auto pos = tl_distribution(bees_generator);

 	int case_cond = 0;
 #if 0
--- a/src/bees-resolve.cc
+++ b/src/bees-resolve.cc
@@ -385,14 +385,15 @@ BeesResolver::for_each_extent_ref(BeesBlockData bbd, function<bool(const BeesFil
 }

 BeesFileRange
-BeesResolver::replace_dst(const BeesFileRange &dst_bfr)
+BeesResolver::replace_dst(const BeesFileRange &dst_bfr_in)
 {
-	BEESTRACE("replace_dst dst_bfr " << dst_bfr);
+	BEESTRACE("replace_dst dst_bfr " << dst_bfr_in);
 	BEESCOUNT(replacedst_try);

 	// Open dst, reuse it for all src
-	BEESNOTE("Opening dst bfr " << dst_bfr);
-	BEESTRACE("Opening dst bfr " << dst_bfr);
+	BEESNOTE("Opening dst bfr " << dst_bfr_in);
+	BEESTRACE("Opening dst bfr " << dst_bfr_in);
+	auto dst_bfr = dst_bfr_in;
 	dst_bfr.fd(m_ctx);

 	BeesFileRange overlap_bfr;
@@ -400,10 +401,11 @@ BeesResolver::replace_dst(const BeesFileRange &dst_bfr)

 	BeesBlockData bbd(dst_bfr);

-	for_each_extent_ref(bbd, [&](const BeesFileRange &src_bfr) -> bool {
+	for_each_extent_ref(bbd, [&](const BeesFileRange &src_bfr_in) -> bool {
 		// Open src
-		BEESNOTE("Opening src bfr " << src_bfr);
-		BEESTRACE("Opening src bfr " << src_bfr);
+		BEESNOTE("Opening src bfr " << src_bfr_in);
+		BEESTRACE("Opening src bfr " << src_bfr_in);
+		auto src_bfr = src_bfr_in;
 		src_bfr.fd(m_ctx);

 		if (dst_bfr.overlaps(src_bfr)) {
@@ -418,7 +420,7 @@ BeesResolver::replace_dst(const BeesFileRange &dst_bfr)
 			BEESCOUNT(replacedst_same);
 			// stop looping here, all the other srcs will probably fail this test too
 			BeesTracer::set_silent();
-			throw runtime_error("FIXME: bailing out here, need to fix this further up the call stack");
+			throw runtime_error("FIXME: too many duplicate candidates, bailing out here");
 		}

 		// Make pair(src, dst)
--- a/src/bees-roots.cc
+++ b/src/bees-roots.cc
@@ -171,15 +171,23 @@ BeesRoots::crawl_state_erase(const BeesCrawlState &bcs)
 uint64_t
 BeesRoots::transid_min()
 {
-	BEESNOTE("Calculating transid_min");
+	uint64_t rv = numeric_limits<uint64_t>::max();
+	uint64_t last_root = 0;
+	BEESNOTE("Calculating transid_min (" << rv << " so far, last_root " << last_root << ")");
 	unique_lock<mutex> lock(m_mutex);
 	if (m_root_crawl_map.empty()) {
 		return 0;
 	}
-	uint64_t rv = numeric_limits<uint64_t>::max();
 	const uint64_t max_rv = rv;
 	for (auto i : m_root_crawl_map) {
-		rv = min(rv, i.second->get_state_end().m_min_transid);
+		// Do not count subvols that are isolated by btrfs send workaround.
+		// They will not advance until the workaround is removed or they are set read-write.
+		catch_all([&](){
+			if (!is_root_ro(i.first)) {
+				rv = min(rv, i.second->get_state_end().m_min_transid);
+			}
+		});
+		last_root = i.first;
 	}
 	// If we get through this loop without setting rv, we'll create broken crawlers due to integer overflow.
 	THROW_CHECK2(runtime_error, rv, max_rv, max_rv > rv);
@@ -201,7 +209,7 @@ BeesRoots::transid_max_nocache()
 	sk.min_objectid = sk.max_objectid = BTRFS_EXTENT_TREE_OBJECTID;

 	while (true) {
-		sk.nr_items = 1024;
+		sk.nr_items = 4;
 		BEESTRACE("transid_max search sk " << sk);
 		sk.do_ioctl(m_ctx->root_fd());

@@ -212,7 +220,7 @@ BeesRoots::transid_max_nocache()
 		// We are just looking for the highest transid on the filesystem.
 		// We don't care which object it comes from.
 		for (auto i : sk.m_result) {
-			sk.next_min(i);
+			sk.next_min(i, BTRFS_ROOT_ITEM_KEY);
 			if (i.transid > rv) {
 				rv = i.transid;
 			}
@@ -221,6 +229,8 @@ BeesRoots::transid_max_nocache()

 	// transid must be greater than zero, or we did something very wrong
 	THROW_CHECK1(runtime_error, rv, rv > 0);
+	// transid must be less than max, or we did something very wrong
+	THROW_CHECK1(runtime_error, rv, rv < numeric_limits<uint64_t>::max());
 	return rv;
 }

@@ -624,7 +634,6 @@ BeesRoots::open_root_nocache(uint64_t rootid)

 	BEESTRACE("sk " << sk);
 	while (sk.min_objectid <= rootid) {
-		sk.nr_items = 1024;
 		sk.do_ioctl(m_ctx->root_fd());

 		if (sk.m_result.empty()) {
@@ -632,16 +641,16 @@ BeesRoots::open_root_nocache(uint64_t rootid)
 		}

 		for (auto i : sk.m_result) {
-			sk.next_min(i);
+			sk.next_min(i, BTRFS_ROOT_BACKREF_KEY);
 			if (i.type == BTRFS_ROOT_BACKREF_KEY && i.objectid == rootid) {
-				auto dirid = btrfs_get_member(&btrfs_root_ref::dirid, i.m_data);
-				auto name_len = btrfs_get_member(&btrfs_root_ref::name_len, i.m_data);
-				auto name_start = sizeof(struct btrfs_root_ref);
-				auto name_end = name_len + name_start;
+				const auto dirid = btrfs_get_member(&btrfs_root_ref::dirid, i.m_data);
+				const auto name_len = btrfs_get_member(&btrfs_root_ref::name_len, i.m_data);
+				const auto name_start = sizeof(struct btrfs_root_ref);
+				const auto name_end = name_len + name_start;
 				THROW_CHECK2(runtime_error, i.m_data.size(), name_end, i.m_data.size() >= name_end);
-				string name(i.m_data.data() + name_start, i.m_data.data() + name_end);
+				const string name(i.m_data.data() + name_start, i.m_data.data() + name_end);

-				auto parent_rootid = i.offset;
+				const auto parent_rootid = i.offset;
 				// BEESLOG("parent_rootid " << parent_rootid << " dirid " << dirid << " name " << name);
 				BEESTRACE("parent_rootid " << parent_rootid << " dirid " << dirid << " name " << name);
 				BEESCOUNT(root_parent_open_try);
@@ -761,7 +770,6 @@ BeesRoots::next_root(uint64_t root)
 	sk.min_objectid = root + 1;

 	while (true) {
-		sk.nr_items = 1024;
 		sk.do_ioctl(m_ctx->root_fd());

 		if (sk.m_result.empty()) {
@@ -769,7 +777,7 @@ BeesRoots::next_root(uint64_t root)
 		}

 		for (auto i : sk.m_result) {
-			sk.next_min(i);
+			sk.next_min(i, BTRFS_ROOT_BACKREF_KEY);
 			if (i.type == BTRFS_ROOT_BACKREF_KEY) {
 				// BEESLOGDEBUG("Found root " << i.objectid << " parent " << i.offset << " transid " << i.transid);
 				return i.objectid;
@@ -947,8 +955,8 @@ BeesCrawl::BeesCrawl(shared_ptr<BeesContext> ctx, BeesCrawlState initial_state)
 bool
 BeesCrawl::next_transid()
 {
-	auto roots = m_ctx->roots();
-	auto next_transid = roots->transid_max();
+	const auto roots = m_ctx->roots();
+	const auto next_transid = roots->transid_max();
 	auto crawl_state = get_state_end();

 	// If we are already at transid_max then we are still finished
@@ -958,7 +966,7 @@ BeesCrawl::next_transid()
 		m_deferred = true;
 	} else {
 		// Log performance stats from the old crawl
-		auto current_time = time(NULL);
+		const auto current_time = time(NULL);

 		// Start new crawl
 		crawl_state.m_min_transid = crawl_state.m_max_transid;
@@ -993,25 +1001,11 @@ BeesCrawl::fetch_extents()
 		return next_transid();
 	}

-	// Check for btrfs send workaround: don't scan RO roots at all, pretend
-	// they are just empty.  We can't free any space there, and we
-	// don't have the necessary analysis logic to be able to use
-	// them as dedupe src extents (yet).
-	//
-	// This will keep the max_transid up to date so if the root
-	// is ever switched back to read-write, it won't trigger big
-	// expensive in-kernel searches for ancient transids.
-	if (m_ctx->is_root_ro(old_state.m_root)) {
-		BEESLOGDEBUG("WORKAROUND: skipping scan of RO root " << old_state.m_root);
-		BEESCOUNT(root_workaround_btrfs_send);
-		return next_transid();
-	}
-
 	BEESNOTE("crawling " << get_state_end());

 	Timer crawl_timer;

-	BtrfsIoctlSearchKey sk(BEES_MAX_CRAWL_BYTES);
+	BtrfsIoctlSearchKey sk;
 	sk.tree_id = old_state.m_root;
 	sk.min_objectid = old_state.m_objectid;
 	sk.min_type = sk.max_type = BTRFS_EXTENT_DATA_KEY;
@@ -1019,7 +1013,7 @@ BeesCrawl::fetch_extents()
 	sk.min_transid = old_state.m_min_transid;
 	// Don't set max_transid to m_max_transid here.	 See below.
 	sk.max_transid = numeric_limits<uint64_t>::max();
-	sk.nr_items = BEES_MAX_CRAWL_ITEMS;
+	sk.nr_items = 4;

 	// Lock in the old state
 	set_state(old_state);
@@ -1047,6 +1041,43 @@ BeesCrawl::fetch_extents()
 		return next_transid();
 	}

+	// Check for btrfs send workaround: don't scan RO roots at all, pretend
+	// they are just empty.  We can't free any space there, and we
+	// don't have the necessary analysis logic to be able to use
+	// them as dedupe src extents (yet).
+	bool ro_root = true;
+	catch_all([&](){
+		ro_root = m_ctx->is_root_ro(old_state.m_root);
+	});
+	if (ro_root) {
+		BEESLOGDEBUG("WORKAROUND: skipping scan of RO root " << old_state.m_root);
+		BEESCOUNT(root_workaround_btrfs_send);
+		// We would call next_transid() here, but we want to do a few things differently.
+		// We immediately defer further crawling on this subvol.
+		// We track max_transid if the subvol scan has never started.
+		// We postpone the started timestamp since we haven't started.
+		auto crawl_state = get_state_end();
+		if (crawl_state.m_objectid == 0) {
+			// This will keep the max_transid up to date so if the root
+			// is ever switched back to read-write, it won't trigger big
+			// expensive in-kernel searches for ancient transids.
+			// If the root is made RO while crawling is in progress, we will
+			// have the big expensive in-kernel searches (same as if we have
+			// been not running for a long time).
+			// Don't allow transid_max to ever move backwards.
+			const auto roots = m_ctx->roots();
+			const auto next_transid = roots->transid_max();
+			const auto current_time = time(NULL);
+			crawl_state.m_max_transid = max(next_transid, crawl_state.m_max_transid);
+			// Move the start time forward too, since we have not started crawling yet.
+			crawl_state.m_started = current_time;
+			set_state(crawl_state);
+		}
+		// Mark this root deferred so we won't see it until the next transid cycle
+		m_deferred = true;
+		return false;
+	}
+
 	// BEESLOGINFO("Crawling " << sk.m_result.size() << " results from " << get_state_end());
 	auto results_left = sk.m_result.size();
 	BEESNOTE("crawling " << results_left << " results from " << get_state_end());
@@ -1058,7 +1089,7 @@ BeesCrawl::fetch_extents()
 	size_t count_high = 0;
 	BeesFileRange last_bfr;
 	for (auto i : sk.m_result) {
-		sk.next_min(i);
+		sk.next_min(i, BTRFS_EXTENT_DATA_KEY);
 		--results_left;
 		BEESCOUNT(crawl_items);

--- a/src/bees-types.cc
+++ b/src/bees-types.cc
@@ -287,7 +287,7 @@ BeesFileRange::fd() const
 }

 Fd
-BeesFileRange::fd(const shared_ptr<BeesContext> &ctx) const
+BeesFileRange::fd(const shared_ptr<BeesContext> &ctx)
 {
 	// If we don't have a fid we can't do much here
 	if (m_fid) {
--- a/src/bees.cc
+++ b/src/bees.cc
@@ -231,17 +231,23 @@ bees_readahead(int const fd, off_t offset, size_t size)
 	Timer readahead_timer;
 	BEESNOTE("readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
 	BEESTOOLONG("readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
+#if 1
 	// In the kernel, readahead() is identical to posix_fadvise(..., POSIX_FADV_WILLNEED)
 	DIE_IF_NON_ZERO(readahead(fd, offset, size));
-#if 0
+#else
 	// Make sure this data is in page cache by brute force
-	// This isn't necessary and it might even be slower
+	// This isn't necessary and it might even be slower,
+	// but the btrfs kernel code does readahead with lower ioprio
+	// and might discard the readahead request entirely,
+	// so it's maybe, *maybe*, worth doing both.
 	BEESNOTE("emulating readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
 	while (size) {
+		// don't care about multithreaded writes to this buffer--it is garbage anyway
 		static uint8_t dummy[BEES_READAHEAD_SIZE];
 		size_t this_read_size = min(size, sizeof(dummy));
-		// Ignore errors and short reads.
-		// It turns out our size parameter isn't all that accurate.
+		// Ignore errors and short reads.  It turns out our size
+		// parameter isn't all that accurate, so we can't use
+		// the pread_or_die template.
 		(void)!pread(fd, dummy, this_read_size, offset);
 		BEESCOUNT(readahead_count);
 		BEESCOUNTADD(readahead_bytes, this_read_size);
@@ -262,6 +268,13 @@ bees_unreadahead(int const fd, off_t offset, size_t size)
 	BEESCOUNTADD(readahead_unread_ms, unreadahead_timer.age() * 1000);
 }

+thread_local random_device bees_random_device;	// per-thread random_device, used below to seed bees_generator
+thread_local uniform_int_distribution<default_random_engine::result_type> bees_random_seed_dist(	// seed values span the engine's whole result_type range
+	numeric_limits<default_random_engine::result_type>::min(),
+	numeric_limits<default_random_engine::result_type>::max()
+);
+thread_local default_random_engine bees_generator(bees_random_seed_dist(bees_random_device));	// per-thread engine, seeded with one draw from bees_random_device
+
 BeesStringFile::BeesStringFile(Fd dir_fd, string name, size_t limit) :
 	m_dir_fd(dir_fd),
 	m_name(name),
--- a/src/bees.h
+++ b/src/bees.h
@@ -13,15 +13,15 @@
 #include "crucible/time.h"
 #include "crucible/task.h"

-#include <atomic>
 #include <functional>
 #include <list>
 #include <mutex>
 #include <string>
+#include <random>
 #include <thread>

-#include <syslog.h>
 #include <endian.h>
+#include <syslog.h>

 using namespace crucible;
 using namespace std;
@@ -101,12 +101,6 @@ const double BEES_HASH_TABLE_ANALYZE_INTERVAL = BEES_STATS_INTERVAL;
 // Stop growing the work queue after we have this many tasks queued
 const size_t BEES_MAX_QUEUE_SIZE = 128;

-// Read this many items at a time in SEARCHv2
-const size_t BEES_MAX_CRAWL_ITEMS = 8;
-
-// Read this many bytes at a time in SEARCHv2 (one maximum-sized metadata page)
-const size_t BEES_MAX_CRAWL_BYTES = 64 * 1024;
-
 // Insert this many items before switching to a new subvol
 const size_t BEES_MAX_CRAWL_BATCH = 128;

@@ -116,9 +110,6 @@ const size_t BEES_TRANSID_FACTOR = 10;
 // Wait this long for a balance to stop
 const double BEES_BALANCE_POLL_INTERVAL = 60.0;

-// Workaround for backref bugs
-const bool BEES_SERIALIZE_RESOLVE = false;
-
 // Workaround for tree mod log bugs
 const bool BEES_SERIALIZE_BALANCE = false;

@@ -269,7 +260,7 @@ ostream& operator<<(ostream &os, const BeesFileId &bfi);

 class BeesFileRange {
 protected:
-	mutable Fd		m_fd;
+	Fd			m_fd;
 	mutable BeesFileId	m_fid;
 	off_t			m_begin = 0, m_end = 0;
 	mutable off_t		m_file_size = -1;
@@ -310,7 +301,7 @@ public:
 	Fd fd() const;

 	// Get the fd, opening it if necessary
-	Fd fd(const shared_ptr<BeesContext> &ctx) const;
+	Fd fd(const shared_ptr<BeesContext> &ctx);

 	BeesFileRange copy_closed() const;

@@ -345,6 +336,7 @@ public:
 	BeesAddress(Type addr = ZERO) : m_addr(addr) {}
 	BeesAddress(MagicValue addr) : m_addr(addr) {}
 	BeesAddress& operator=(const BeesAddress &that) = default;
+	BeesAddress(const BeesAddress &that) = default;
 	operator Type() const { return m_addr; }
 	bool operator==(const BeesAddress &that) const;
 	bool operator==(const MagicValue that) const { return *this == BeesAddress(that); }
@@ -405,6 +397,7 @@ public:
 		HashType	e_hash;
 		AddrType	e_addr;
 		Cell(const Cell &) = default;
+		Cell &operator=(const Cell &) = default;
 		Cell(HashType hash, AddrType addr) : e_hash(hash), e_addr(addr) { }
 		bool operator==(const Cell &e) const { return tie(e_hash, e_addr) == tie(e.e_hash, e.e_addr); }
 		bool operator!=(const Cell &e) const { return tie(e_hash, e_addr) != tie(e.e_hash, e.e_addr); }
@@ -468,7 +461,7 @@ private:
 	// Mutex/condvar for the writeback thread
 	mutex			m_dirty_mutex;
 	condition_variable	m_dirty_condvar;
-	bool			m_dirty;
+	bool			m_dirty = false;

 	// Mutex/condvar to stop
 	mutex			m_stop_mutex;
@@ -502,6 +495,8 @@ private:

 	BeesHashTable(const BeesHashTable &) = delete;
 	BeesHashTable &operator=(const BeesHashTable &) = delete;
+
+	static thread_local uniform_int_distribution<size_t> tl_distribution;
 };

 ostream &operator<<(ostream &os, const BeesHashTable::Cell &bhte);
@@ -639,7 +634,7 @@ private:
 ostream & operator<<(ostream &os, const BeesHash &bh);

 class BeesBlockData {
-	using Blob = vector<uint8_t>;
+	using Blob = ByteVector;

 	mutable Fd		m_fd;
 	off_t			m_offset;
@@ -812,7 +807,7 @@ class BeesResolver {
 	BeesAddress				m_addr;
 	vector<BtrfsInodeOffsetRoot>		m_biors;
 	set<BeesFileRange>			m_ranges;
-	unsigned				m_bior_count;
+	size_t					m_bior_count;

 	// We found matching data, so we can dedupe
 	bool					m_found_data = false;
@@ -887,6 +882,7 @@ public:
 extern int bees_log_level;
 extern const char *BEES_USAGE;
 extern const char *BEES_VERSION;
+extern thread_local default_random_engine bees_generator;
 string pretty(double d);
 void bees_sync(int fd);
 void bees_readahead(int fd, off_t offset, size_t size);
--- a/src/fiemap.cc
+++ b/src/fiemap.cc
@@ -22,19 +22,21 @@ main(int argc, char **argv)
 	
 		cout << "File: " << filename << endl;
 		Fd fd = open_or_die(filename, O_RDONLY);
-		Fiemap fm;
-		fm.fm_flags &= ~(FIEMAP_FLAG_SYNC);
+		uint64_t start = 0;
+		uint64_t length = Fiemap::s_fiemap_max_offset;
+		if (argc > 2) { start = stoull(argv[2], nullptr, 0); }
+		if (argc > 3) { length = stoull(argv[3], nullptr, 0); }
+		length = min(length, Fiemap::s_fiemap_max_offset - start);
+		Fiemap fm(start, length);
+		fm.m_flags &= ~(FIEMAP_FLAG_SYNC);
 		fm.m_max_count = 100;
-		if (argc > 2) { fm.fm_start = stoull(argv[2], nullptr, 0); }
-		if (argc > 3) { fm.fm_length = stoull(argv[3], nullptr, 0); }
-		if (argc > 4) { fm.fm_flags = stoull(argv[4], nullptr, 0); }
-		fm.fm_length = min(fm.fm_length, FIEMAP_MAX_OFFSET - fm.fm_start);
-		uint64_t stop_at = fm.fm_start + fm.fm_length;
-		uint64_t last_byte = fm.fm_start;
+		if (argc > 4) { fm.m_flags = stoull(argv[4], nullptr, 0); }
+		uint64_t stop_at = start + length;
+		uint64_t last_byte = start;
 		do {
 			fm.do_ioctl(fd);
 			// cerr << fm;
-			uint64_t last_logical = FIEMAP_MAX_OFFSET;
+			uint64_t last_logical = Fiemap::s_fiemap_max_offset;
 			for (auto &extent : fm.m_extents) {
 				if (extent.fe_logical > last_byte) {
 					cout << "Log " << to_hex(last_byte) << ".." << to_hex(extent.fe_logical) << " Hole" << endl;
@@ -45,8 +47,8 @@ main(int argc, char **argv)
 				last_logical = extent.fe_logical + extent.fe_length;
 				last_byte = last_logical;
 			}
-			fm.fm_start = last_logical;
-		} while (fm.fm_start < stop_at);
+			fm.m_start = last_logical;
+		} while (fm.m_start < stop_at);
 	});
 	exit(EXIT_SUCCESS);
 }