mirror of https://github.com/Zygo/bees.git (synced 2025-08-02 13:53:28 +02:00)

Compare commits: 188 commits
Commits in this range (SHA1):

b436f8483b
a353d8cc6e
97d70ef4c5
a9cd19a5fe
69c3d99552
ccec63104c
951b5ce360
f2c65f2f4b
c79eb1d704
522e52618e
4a3d3e7a43
13abf8aada
081a6af278
3d95460eb7
d9e3c0070b
955b8ae459
08899052ad
03532effed
6adaedeecd
54f03a0297
52279656cf
1fd26a03b2
b083003cf7
b2d4a07c6f
7008c74113
5f0f7a8319
ee86b585a5
cf4b5417c9
77ef6a0638
0f0da21198
8a70bca011
20b8f8ae0b
0afd2850f4
ffac407a9b
4f032ab85b
5f763f6d41
0928362aab
d5ff35eacf
b7f9ce3f08
592580369e
0bbaddd54c
06a46e2736
45afce72e3
e4c95d618a
7cffad5fc3
06062cfd96
fbd1091052
032c740678
5b72f35657
3bf6db0354
80c69f1ce4
db65031c2b
243480b515
f8a8704135
8a60850e32
9d21e6b456
bcf3e7de3e
6465a7c37c
177f393ed6
5f40f9edb0
7f660f50b8
6eb7afa65c
10af3f9763
636e69267e
c0149d72b7
333aea9822
9ca69bb7ff
f45e379802
180bb60cde
c80af1cb4f
ab5316a3da
d2ecb4c9ea
8a2fb75462
420c218c83
6ee5da7d77
b1bdd9e056
0c84302d9a
03627503ec
5248985da0
d1f1c386bc
6705cd9c26
3ce00a5ebe
1086900a9d
97c167d63a
b49d458792
1e7dbc6f97
459071597b
187d12fc25
de6282c6cd
d332616eff
e654e29f45
bbaf55b2b0
62a20ebf9c
7ec19d1eff
b7b18d9fa1
f263c8751e
17d8759011
e34237886d
29b051b131
8e9b53b3fd
1b9b437c11
217f5c781b
dceea2ebbc
bb8b6d6c50
6843846b97
d040bde2c9
15ab981d9e
05bd65444d
9514b89405
07e5e7bd1b
c4f0e4abee
44f446e17e
4363463342
7117cb40c5
b3a8fcb553
228747a8f8
87e8a21c41
e3747320cf
2c3d1822f7
b149528828
ce2521b407
17a75e61f8
e1476260e1
978c577412
7548d865a0
4021dd42ca
e1de933f93
f41fd73760
d583700962
be2c55119e
4a1971bce5
843f78c380
5f063dd752
7933ccb660
f17cf084e6
570b3f7de0
cbc6725f0f
0e42c75f5a
4e962172a7
389dd52cc1
f4464c6896
f051d96d51
ba5fda1605
6cf16c4849
5a80ce5cd6
012219bbfb
bf2a014607
cdca2bcdcd
e0c8df6809
34b04f4255
d9c788d30a
23f3e4ec42
6c68e81da7
e74122b512
0d5c018c3c
a676928ed5
e3247d3471
19859b0a0d
688d0dc014
c69a954d8f
f2dec480a6
d4535901a5
8cbd6fc67a
c2762740ef
8bec9624da
aa74a238b3
6e6b08ea0e
542371684c
9a97699dd9
0e8b591232
bf6ae80eee
1a51bb53bf
35b21687bc
63ddbb9a4f
373b9ef038
866a35c7fb
90132182fd
90f98250c2
0c714cd55c
924008603e
c01f129eee
5a49870fc9
14b35e3426
7bba096077
aa324de9ed
e8298570ed
32d2739b0d
faf11b1c0c
Makefile: 29 lines changed
@@ -7,7 +7,7 @@ LIBEXEC_PREFIX ?= $(LIB_PREFIX)/bees
 
 SYSTEMD_SYSTEM_UNIT_DIR ?= $(shell pkg-config systemd --variable=systemdsystemunitdir)
 
-MARKDOWN := $(firstword $(shell type -P markdown markdown2 markdown_py 2>/dev/null || echo markdown))
+BEES_VERSION ?= $(shell git describe --always --dirty || echo UNKNOWN)
 
 # allow local configuration to override above variables
 -include localconf
@@ -23,51 +23,46 @@ include Defines.mk
 default: $(DEFAULT_MAKE_TARGET)
 
 all: lib src scripts
-docs: README.html
-reallyall: all docs test
+reallyall: all doc test
 
 clean: ## Cleanup
 	git clean -dfx -e localconf
 
-.PHONY: lib src test
+.PHONY: lib src test doc
 
 lib: ## Build libs
-	$(MAKE) -C lib
+	+$(MAKE) TAG="$(BEES_VERSION)" -C lib
 
 src: ## Build bins
 src: lib
-	$(MAKE) -C src
+	+$(MAKE) BEES_VERSION="$(BEES_VERSION)" -C src
 
 test: ## Run tests
 test: lib src
-	$(MAKE) -C test
+	+$(MAKE) -C test
+
+doc: ## Build docs
+	+$(MAKE) -C docs
 
 scripts/%: scripts/%.in
 	$(TEMPLATE_COMPILER)
 
 scripts: scripts/beesd scripts/beesd@.service
 
-README.html: README.md
-	$(MARKDOWN) README.md > README.html.new
-	mv -f README.html.new README.html
-
 install_libs: lib
 	install -Dm644 lib/libcrucible.so $(DESTDIR)$(LIB_PREFIX)/libcrucible.so
 
 install_tools: ## Install support tools + libs
-install_tools: install_libs src
+install_tools: src
 	install -Dm755 bin/fiemap $(DESTDIR)$(PREFIX)/bin/fiemap
 	install -Dm755 bin/fiewalk $(DESTDIR)$(PREFIX)/sbin/fiewalk
 
 install_bees: ## Install bees + libs
-install_bees: install_libs src $(RUN_INSTALL_TESTS)
+install_bees: src $(RUN_INSTALL_TESTS)
 	install -Dm755 bin/bees $(DESTDIR)$(LIBEXEC_PREFIX)/bees
 
 install_scripts: ## Install scripts
 install_scripts: scripts
 	install -Dm755 scripts/beesd $(DESTDIR)$(PREFIX)/sbin/beesd
 	install -Dm644 scripts/beesd.conf.sample $(DESTDIR)/$(ETC_PREFIX)/bees/beesd.conf.sample
-ifneq (SYSTEMD_SYSTEM_UNIT_DIR,)
+ifneq ($(SYSTEMD_SYSTEM_UNIT_DIR),)
 	install -Dm644 scripts/beesd@.service $(DESTDIR)$(SYSTEMD_SYSTEM_UNIT_DIR)/beesd@.service
 endif
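The `-include localconf` hook above means a `localconf` file, if present, overrides the preceding variables. A hedged sketch of creating one (the variable names are taken from this diff and from the Gentoo ebuild further down, which generates exactly such a file; the values are illustrative):

    # localconf is optional; the Makefile silently skips it when absent
    cat >localconf <<EOF
    PREFIX=/usr
    LIBEXEC_PREFIX=/usr/libexec
    DEFAULT_MAKE_TARGET=all
    EOF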
README.md: 607 lines changed
@@ -1,579 +1,62 @@
 BEES
 ====
 
-Best-Effort Extent-Same, a btrfs dedup agent.
+Best-Effort Extent-Same, a btrfs deduplication agent.
 
-About Bees
+About bees
 ----------
 
-Bees is a block-oriented userspace dedup agent designed to avoid
-scalability problems on large filesystems.
+bees is a block-oriented userspace deduplication agent designed for large
+btrfs filesystems. It is an offline dedupe combined with an incremental
+data scan capability to minimize time data spends on disk from write
+to dedupe.
 
-Bees is designed to degrade gracefully when underprovisioned with RAM.
-Bees does not use more RAM or storage as filesystem data size increases.
-The dedup hash table size is fixed at creation time and does not change.
-The effective dedup block size is dynamic and adjusts automatically to
-fit the hash table into the configured RAM limit. Hash table overflow
-is not implemented, eliminating the IO overhead that overflow handling
-would add. Hash table entries are only 16 bytes per dedup block to keep
-the average dedup block size small.
-
-Bees does not require alignment between dedup blocks or extent boundaries
-(i.e. it can handle any multiple-of-4K offset between dup block pairs).
-Bees rearranges blocks into shared and unique extents if required to
-work within current btrfs kernel dedup limitations.
-
-Bees can dedup any combination of compressed and uncompressed extents.
-
-Bees operates in a single pass which removes duplicate extents immediately
-during scan. There are no separate scanning and dedup phases.
-
-Bees uses only data-safe btrfs kernel operations, so it can dedup live
-data (e.g. build servers, sqlite databases, VM disk images). It does
-not modify file attributes or timestamps.
-
-Bees does not store any information about filesystem structure, so it is
-not affected by the number or size of files (except to the extent that
-these cause performance problems for btrfs in general). It retrieves such
-information on demand through btrfs SEARCH_V2 and LOGICAL_INO ioctls.
-This eliminates the storage required to maintain the equivalents of
-these functions in userspace. It's also why bees has no XFS support.
-
-Bees is a daemon designed to run continuously and maintain its state
-across crashes and reboots. Bees uses checkpoints for persistence to
-eliminate the IO overhead of a transactional data store. On restart,
-bees will dedup any data that was added to the filesystem since the
-last checkpoint.
-
-Bees is used to dedup filesystems ranging in size from 16GB to 35TB, with
-hash tables ranging in size from 128MB to 11GB.
-
-How Bees Works
---------------
-
-Bees uses a fixed-size persistent dedup hash table with a variable dedup
-block size. Any size of hash table can be dedicated to dedup. Bees will
-scale the dedup block size to fit the filesystem's unique data size
-using a weighted sampling algorithm. This allows Bees to adapt itself
-to its filesystem size without forcing admins to do math at install time.
-At the same time, the duplicate block alignment constraint can be as low
-as 4K, allowing efficient deduplication of files with narrowly-aligned
-duplicate block offsets (e.g. compiled binaries and VM/disk images)
-even if the effective block size is much larger.
-
-The Bees hash table is loaded into RAM at startup (using hugepages if
-available), mlocked, and synced to persistent storage by trickle-writing
-over a period of several hours. This avoids issues related to seeking
-or fragmentation, and enables the hash table to be efficiently stored
-on Btrfs with compression (or an ext4 filesystem, or a raw disk, or
-on CIFS...).
-
-Once a duplicate block is identified, Bees examines the nearby blocks
-in the files where the block appears. This allows Bees to find long runs
-of adjacent duplicate block pairs if it has an entry for any one of
-the blocks in its hash table. The stored hash entry plus the block
-recently scanned from disk form a duplicate pair. On typical data sets,
-this means most of the blocks in the hash table are redundant and can
-be discarded without significant performance impact.
-
-Hash table entries are grouped together into LRU lists. As each block
-is scanned, its hash table entry is inserted into the LRU list at a
-random position. If the LRU list is full, the entry at the end of the
-list is deleted. If a hash table entry is used to discover duplicate
-blocks, the entry is moved to the beginning of the list. This makes Bees
-unable to detect a small number of duplicates (less than 1% on typical
-filesystems), but it dramatically improves efficiency on filesystems
-with many small files. Bees has found a net 13% more duplicate bytes
-than a naive fixed-block-size algorithm with a 64K block size using the
-same size of hash table, even after discarding 1% of the duplicate bytes.
-
-Hash Table Sizing
------------------
-
-Hash table entries are 16 bytes each (64-bit hash, 52-bit block number,
-and some metadata bits). Each entry represents a minimum of 4K on disk.
-
-    unique data size    hash table size    average dedup block size
-            1TB                4GB                     4K
-            1TB                1GB                    16K
-            1TB              256MB                    64K
-            1TB               16MB                  1024K
-           64TB                1GB                  1024K
-
-To change the size of the hash table, use `truncate` to change the hash
-table size, delete `beescrawl.dat` so that bees will start over with a
-fresh full-filesystem rescan, and restart `bees`.
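Since each 16-byte entry covers one average-sized dedup block, the table above follows from simple division, and a resize is just a `truncate` plus a state reset. A hedged sketch (assumes the `$BEESHOME` layout described in the Setup section further down):

    # sanity-check one row of the sizing table: 1TB unique data, 1GB table
    entries=$(( (1 << 30) / 16 ))      # 16 bytes per hash table entry
    echo $(( (1 << 40) / entries ))    # prints 16384, i.e. 16K average block

    # resize the table (size must remain a multiple of 16M) and
    # force a fresh full-filesystem scan, then restart the bees daemon
    truncate -s 2g "$BEESHOME/beeshash.dat"
    rm "$BEESHOME/beescrawl.dat"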
-Things You Might Expect That Bees Doesn't Have
-----------------------------------------------
-
-* There's no configuration file (patches welcome!). There are some tunables
-  hardcoded in the source that could eventually become configuration options.
-  There's also an incomplete option parser (patches welcome!).
-
-* There's no way to *stop* the Bees daemon. Use SIGKILL, SIGTERM, or
-  Ctrl-C for now. Some of the destructors are unreachable and have never
-  been tested. Bees will repeat some work when restarted.
-
-* The Bees process doesn't fork and writes its log to stdout/stderr.
-  A shell wrapper is required to make it behave more like a daemon.
-
-* There's no facility to exclude any part of a filesystem (patches
-  welcome).
-
-* PREALLOC extents and extents containing blocks filled with zeros will
-  be replaced by holes unconditionally.
-
-* Duplicate block groups that are less than 12K in length can take 30%
-  of the run time while saving only 3% of the disk space. There should
-  be an option to just not bother with those.
-
-* There is a lot of duplicate reading of blocks in snapshots. Bees will
-  scan all snapshots at close to the same time to try to get better
-  performance by caching, but really fixing this requires rewriting the
-  crawler to scan the btrfs extent tree directly instead of the subvol
-  FS trees.
-
-* Block reads are currently more allocation- and CPU-intensive than they
-  should be, especially for filesystems on SSD where the IO overhead is
-  much smaller. This is a problem for power-constrained environments
-  (e.g. laptops with slow CPU).
-
-* Bees can currently fragment extents when required to remove duplicate
-  blocks, but has no defragmentation capability yet. When possible, Bees
-  will attempt to work with existing extent boundaries, but it will not
-  aggregate blocks together from multiple extents to create larger ones.
-
-* It is possible to resize the hash table without starting over with
-  a new full-filesystem scan; however, this has not been implemented yet.
-
-Good Btrfs Feature Interactions
--------------------------------
-
-Bees has been tested in combination with the following:
-
-* btrfs compression (zlib, lzo, zstd), mixtures of compressed and uncompressed extents
-* PREALLOC extents (unconditionally replaced with holes)
-* HOLE extents and btrfs no-holes feature
-* Other deduplicators, reflink copies (though Bees may decide to redo their work)
-* btrfs snapshots and non-snapshot subvols (RW and RO)
-* Concurrent file modification (e.g. PostgreSQL and sqlite databases, build daemons)
-* all btrfs RAID profiles (people ask about this, but it's irrelevant to bees)
-* IO errors during dedup (read errors will throw exceptions, Bees will catch them and skip over the affected extent)
-* Filesystems mounted *with* the flushoncommit option
-* 4K filesystem data block size / clone alignment
-* 64-bit and 32-bit host CPUs (amd64, x86, arm)
-* Large (>16M) extents
-* Huge files (>1TB--although Btrfs performance on such files isn't great in general)
-* filesystems up to 25T bytes, 100M+ files
-* btrfs receive
-* btrfs nodatacow/nodatasum inode attribute or mount option (bees skips all nodatasum files)
-* open(O_DIRECT) (seems to work as well--or as poorly--with bees as with any other btrfs feature)
-
-Bad Btrfs Feature Interactions
-------------------------------
-
-Bees has been tested in combination with the following, and various problems are known:
-
-* bcache, lvmcache: *severe (filesystem-destroying) metadata corruption
-  issues* observed in testing and reported by users, apparently only when
-  used with bees. Plain SSD and HDD seem to be OK.
-* btrfs send: sometimes aborts with an I/O error when bees changes the
-  data layout during a send. The send can be restarted and will work
-  if bees has finished processing the snapshot being sent. No data
-  corruption observed other than the truncated send.
-* btrfs qgroups: very slow, sometimes hangs
-* btrfs autodefrag mount option: hangs and high CPU usage problems
-  reported by users. bees cannot distinguish autodefrag activity from
-  normal filesystem activity and will likely try to undo the autodefrag,
-  so it should probably be turned off for bees in any case.
-
-Untested Btrfs Feature Interactions
------------------------------------
-
-Bees has not been tested with the following, and undesirable interactions may occur:
-
-* Non-4K filesystem data block size (should work if recompiled)
-* Non-equal hash (SUM) and filesystem data block (CLONE) sizes (probably never will work)
-* btrfs seed filesystems (does anyone even use those?)
-* btrfs out-of-tree kernel patches (e.g. in-band dedup or encryption)
-* btrfs-convert from ext2/3/4 (never tested, might run out of space or ignore significant portions of the filesystem due to sanity checks)
-* btrfs mixed block groups (don't know a reason why it would *not* work, but never tested)
-* Filesystems mounted *without* the flushoncommit option (don't know the impact of crashes during dedup writes vs. ordinary writes)
-
-Other Caveats
--------------
-
-* btrfs balance will invalidate parts of the dedup hash table. Bees will
-  happily rebuild the table, but it will have to scan all the blocks
-  again.
-
-* btrfs defrag will cause Bees to rescan the defragmented file. If it
-  contained duplicate blocks and other references to the original
-  fragmented duplicates still exist, Bees will replace the defragmented
-  extents with the original fragmented ones.
-
-* Bees creates temporary files (with O_TMPFILE) and uses them to split
-  and combine extents elsewhere in btrfs. These will take up to 2GB
-  of disk space per thread during normal operation.
-
-* Like all deduplicators, Bees will replace data blocks with metadata
-  references. It is a good idea to ensure there is sufficient unallocated
-  space (see `btrfs fi usage`) on the filesystem to allow the metadata
-  to multiply in size by the number of snapshots before running Bees
-  for the first time. Use
-
-      btrfs balance start -dusage=100,limit=N /your/filesystem
-
-  where the `limit` parameter `N` should be calculated as follows:
-
-  * start with the current size of metadata usage (from `btrfs fi
-    df`) in GB, plus 1
-
-  * multiply by the proportion of disk space in subvols with
-    snapshots (i.e. if there are no snapshots, multiply by 0;
-    if all of the data is shared between at least one origin
-    and one snapshot subvol, multiply by 1)
-
-  * multiply by the number of snapshots (i.e. if there is only
-    one subvol, multiply by 0; if there are 3 snapshots and one
-    origin subvol, multiply by 3)
-
-  `limit = GB_metadata * (disk_space_in_snapshots / total_disk_space) * number_of_snapshots`
-  (a worked example follows at the end of this section)
-
-  Monitor unallocated space to ensure that the filesystem never runs out
-  of metadata space (whether Bees is running or not--this is a general
-  btrfs requirement).
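A worked example of the `limit` calculation above, with purely hypothetical numbers: 10 GB of current metadata usage (plus 1 gives 11), all data shared with at least one snapshot (proportion 1), and 3 snapshots:

    # limit = (GB_metadata + 1) * proportion_in_snapshots * number_of_snapshots
    gb_metadata=10 proportion=1 snapshots=3
    limit=$(( (gb_metadata + 1) * proportion * snapshots ))    # 33
    btrfs balance start -dusage=100,limit=$limit /your/filesystem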
-A Brief List Of Btrfs Kernel Bugs
----------------------------------
-
-Missing features (usually not available in older LTS kernels):
-
-* 3.13: `FILE_EXTENT_SAME` ioctl added. No way to reliably dedup with
-  concurrent modifications before this.
-* 3.16: `SEARCH_V2` ioctl added. Bees could use `SEARCH` instead.
-* 4.2: `FILE_EXTENT_SAME` no longer updates mtime, can be used at EOF.
-
-Future features (kernel features Bees does not yet use, but may rely on
-in the future):
-
-* 4.14: `LOGICAL_INO_V2` allows userspace to create forward and backward
-  reference maps to entire physical extents with a single ioctl call,
-  and raises the limit of 2730 references per extent. Bees has not yet
-  been rewritten to take full advantage of these features.
-
-Bug fixes (sometimes included in older LTS kernels):
-
-* Bugs fixed prior to 4.4.107 are not listed here.
-* 4.5: hang in the `INO_PATHS` ioctl used by Bees.
-* 4.5: use-after-free in the `FILE_EXTENT_SAME` ioctl used by Bees.
-* 4.6: lost inodes after a rename, crash, and log tree replay
-  (triggered by the fsync() while writing `beescrawl.dat`).
-* 4.7: *slow backref* bug no longer triggers a softlockup panic. It still
-  takes too long to resolve a block address to a root/inode/offset triple.
-* 4.10: reduced CPU time cost of the LOGICAL_INO ioctl and dedup
-  backref processing in general.
-* 4.11: yet another dedup deadlock case is fixed. Alas, it is not the
-  last one.
-* 4.14: backref performance improvements make LOGICAL_INO even faster
-  in the worst cases (but possibly slower in the best cases?).
-* 4.14.29: WARN_ON(ref->count < 0) in fs/btrfs/backref.c triggers
-  almost once per second. The WARN_ON is incorrect and can be removed.
-
-Unfixed kernel bugs (as of 4.14.34) with workarounds in Bees:
-
-* *Deadlocks* in the kernel dedup ioctl when files are modified
-  immediately before dedup. `BeesTempFile::make_copy` calls `fsync()`
-  immediately before dedup to work around this. If the `fsync()` is
-  removed, the filesystem hangs within a few hours, requiring a reboot
-  to recover. Even with the `fsync()`, it is possible to lose the
-  kernel race condition and encounter a deadlock within a machine-year.
-  VM image workloads may trigger this faster. Over the past years
-  several specific deadlock cases have been fixed, but at least one
-  remains.
-
-* *Bad interactions* with other Linux block layers: bcache and lvmcache
-  can fail spectacularly, and apparently only while running bees.
-  This is definitely a kernel bug, either in btrfs or the lower block
-  layers. Avoid using bees with these tools, or test very carefully
-  before deployment.
-
-* *slow backrefs* (aka toxic extents): If the number of references to a
-  single shared extent within a single file grows above a few thousand,
-  the kernel consumes CPU for minutes at a time while holding various
-  locks that block access to the filesystem. Bees avoids this bug by
-  measuring the time the kernel spends performing certain operations
-  and permanently blacklisting any extent or hash where the kernel
-  starts to get slow. Inside Bees, such blocks are marked as 'toxic'
-  hash/block addresses. Linux kernel v4.14 is better but can still
-  have problems.
-
-* `LOGICAL_INO` output is arbitrarily limited to 2730 references
-  even if more buffer space is provided for results. Once this number
-  has been reached, Bees can no longer replace the extent since it can't
-  find and remove all existing references. Bees refrains from adding
-  any more references after the first 2560. Offending blocks are
-  marked 'toxic' even if there is no corresponding performance problem.
-  This places an obvious limit on dedup efficiency for extremely common
-  blocks or filesystems with many snapshots (although this limit is
-  far greater than the effective limit imposed by the *slow backref* bug).
-  *Fixed in v4.14.*
-
-* `LOGICAL_INO` on compressed extents returns a list of root/inode/offset
-  tuples matching the extent bytenr of its argument. On uncompressed
-  extents, any r/i/o tuple whose extent offset does not match the
-  argument's extent offset is discarded, i.e. only the single 4K block
-  matching the argument is returned, so a complete map of the extent
-  references requires calling `LOGICAL_INO` for every single block of
-  the extent. This is undesirable behavior for Bees, which wants a
-  list of all extent refs referencing a data extent (i.e. Bees wants
-  the compressed-extent behavior in all cases). *Fixed in v4.14.*
-
-* `FILE_EXTENT_SAME` is arbitrarily limited to 16MB. This is less than
-  128MB, which is the maximum extent size that can be created by defrag
-  or prealloc. Bees avoids feedback loops this can generate while
-  attempting to replace extents over 16MB in length.
-
-Not really bugs, but gotchas nonetheless:
-
-* If a process holds a directory FD open, the subvol containing the
-  directory cannot be deleted (`btrfs sub del` will start the deletion
-  process, but it will not proceed past the first open directory FD).
-  `btrfs-cleaner` will simply skip over the directory *and all of its
-  children* until the FD is closed. Bees avoids this gotcha by closing
-  all of the FDs in its directory FD cache every 10 btrfs transactions.
-
-* If a file is deleted while Bees is caching an open FD to the file,
-  Bees continues to scan the file. For very large files (e.g. VM
-  images), the deletion of the file can be delayed indefinitely.
-  To limit this delay, Bees closes all FDs in its file FD cache every
-  10 btrfs transactions.
-
-* If a snapshot is deleted, bees will generate a burst of exceptions
-  for references to files in the snapshot that no longer exist. This
-  lasts until the FD caches are cleared.
-
-Installation
-============
-
-Bees can be installed by following one of these instructions:
-
-Arch package
-------------
-
-Bees is available in the Arch Linux AUR. Install with:
-
-    $ pacaur -S bees-git
-
-Gentoo ebuild
--------------
-
-Bees is available as a Gentoo ebuild. Just copy `bees-9999.ebuild` from
-`contrib/gentoo`, including the `files` subdirectory, to your local
-overlay category `sys-fs`.
-
-You can copy the ebuild to match a Bees version number, and it will
-build that tagged version. Versions from v0.5 onward are partially
-supported; previous versions won't work.
-
-Build from source
------------------
-
-Build with `make`. The build produces `bin/bees` and `lib/libcrucible.so`,
-which must be copied to somewhere in `$PATH` and `$LD_LIBRARY_PATH`
-on the target system respectively.
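Alternatively, the install targets from the Makefile shown earlier in this diff can place the binaries for you. A hedged sketch (destination paths are governed by `PREFIX`, `LIB_PREFIX`, and `LIBEXEC_PREFIX`, which a `localconf` file may override):

    make
    sudo make install_libs install_bees install_scripts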
-It will also generate `scripts/beesd@.service` for systemd users. This
-service makes use of a helper script `scripts/beesd` to boot the service.
-Both of the latter use the filesystem UUID to mount the root subvolume
-within a temporary runtime directory.
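A hedged example of using that unit, assuming the template is installed and instantiated with the filesystem UUID (as the `scripts/beesd` helper expects; the UUID is the placeholder reused in the Setup section below):

    systemctl enable --now beesd@3399e413-695a-4b0b-9384-1b0ef8f6c4cd.service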
-### Ubuntu 16.04 - 17.04:
-
-    $ apt -y install build-essential btrfs-tools uuid-dev markdown && make
-
-### Ubuntu 14.04:
-
-You can try to carry on the work done here: https://gist.github.com/dagelf/99ee07f5638b346adb8c058ab3d57492
-
-Packaging
+Strengths
 ---------
-
-See 'Dependencies' below. Package maintainers can pick ideas for building
-and configuring the source package from the Gentoo ebuild in `contrib/gentoo`.
-You can configure some build options by creating a file `localconf` and
-adjusting settings for your distribution environment there.
-
-Please also review the Makefile for additional hints.
 
+* Space-efficient hash table and matching algorithms - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
+* Daemon incrementally dedupes new data using btrfs tree search
+* Works with btrfs compression - dedupe any combination of compressed and uncompressed files
+* **NEW** [Works around `btrfs send` problems with dedupe and incremental parent snapshots](docs/options.md)
+* Works around btrfs filesystem structure to free more disk space
+* Persistent hash table for rapid restart after shutdown
+* Whole-filesystem dedupe - including snapshots
+* Constant hash table size - no increased RAM usage if data set becomes larger
+* Works on live data - no scheduled downtime required
+* Automatic self-throttling based on system load
 
-Dependencies
-------------
+Weaknesses
+----------
 
-* C++11 compiler (tested with GCC 4.9, 6.2.0, 8.1.0)
-
-  Sorry. I really like closures and shared_ptr, so support
-  for earlier compiler versions is unlikely.
+* Whole-filesystem dedupe - has no include/exclude filters, does not accept file lists
+* Requires root privilege (or `CAP_SYS_ADMIN`)
+* First run may require temporary disk space for extent reorganization
+* [First run may increase metadata space usage if many snapshots exist](docs/gotchas.md)
+* Constant hash table size - no decreased RAM usage if data set becomes smaller
+* btrfs only
 
-* btrfs-progs (tested with 4.1..4.15.1) or libbtrfs-dev
-  (tested with version 4.16.1)
-
-  Needed for btrfs.h and ctree.h during compile.
-  Also needed by the service wrapper script.
+Installation and Usage
+----------------------
 
-* libuuid-dev
-
-  This library is only required for a feature that was removed after v0.1.
-  The lingering support code can be removed.
+* [Installation](docs/install.md)
+* [Configuration](docs/config.md)
+* [Running](docs/running.md)
+* [Command Line Options](docs/options.md)
 
-* Linux kernel version: *minimum* 4.4.107, *4.14.29 or later recommended*
-
-  Don't bother trying to make Bees work with kernel versions older than
-  4.4.107. It may appear to work, but it won't end well: there are
-  too many missing features and bugs (including data corruption bugs)
-  to work around in older kernels.
-
-  Kernel versions between 4.4.107 and 4.14.29 are usable with bees,
-  but bees can trigger known performance bugs and hangs in dedup-related
-  functions.
+Recommended Reading
+-------------------
 
-* markdown
-
-* util-linux version that provides the `blkid` command, needed by the
-  helper script `scripts/beesd`
+* [bees Gotchas](docs/gotchas.md)
+* [btrfs kernel bugs](docs/btrfs-kernel.md) - especially DATA CORRUPTION WARNING
+* [bees vs. other btrfs features](docs/btrfs-other.md)
+* [What to do when something goes wrong](docs/wrong.md)
 
-Setup
------
-
-If you don't want to use the helper script `scripts/beesd` to set up and
-configure bees, here's how to set up bees manually.
-
-Create a directory for bees state files:
-
-    export BEESHOME=/some/path
-    mkdir -p "$BEESHOME"
-
-Create an empty hash table (your choice of size, but it must be a multiple
-of 16M). This example creates a 1GB hash table:
-
-    truncate -s 1g "$BEESHOME/beeshash.dat"
-    chmod 700 "$BEESHOME/beeshash.dat"
-
-bees can only process the root subvol of a btrfs (seriously--if the
-argument is not the root subvol directory, Bees will just throw an
-exception and stop).
-
-Use a bind mount, and let only bees access it:
-
-    UUID=3399e413-695a-4b0b-9384-1b0ef8f6c4cd
-    mkdir -p /var/lib/bees/$UUID
-    mount /dev/disk/by-uuid/$UUID /var/lib/bees/$UUID -osubvol=/
-
-If you don't set BEESHOME, the path ".beeshome" will be used relative
-to the root subvol of the filesystem. For example:
-
-    btrfs sub create /var/lib/bees/$UUID/.beeshome
-    truncate -s 1g /var/lib/bees/$UUID/.beeshome/beeshash.dat
-    chmod 700 /var/lib/bees/$UUID/.beeshome/beeshash.dat
-
-You can use any relative path in BEESHOME. The path will be taken
-relative to the root of the deduped filesystem (in other words it can
-be the name of a subvol):
-
-    export BEESHOME=@my-beeshome
-    btrfs sub create /var/lib/bees/$UUID/$BEESHOME
-    truncate -s 1g /var/lib/bees/$UUID/$BEESHOME/beeshash.dat
-    chmod 700 /var/lib/bees/$UUID/$BEESHOME/beeshash.dat
-
-Configuration
--------------
-
-There are some runtime configurable options using environment variables:
-
-* BEESHOME: Directory containing Bees state files:
-  * beeshash.dat  | persistent hash table. Must be a multiple of 16M.
-    This contains 16-byte records: 8 bytes for CRC64,
-    8 bytes for physical address and some metadata bits.
-  * beescrawl.dat | state of SEARCH_V2 crawlers. ASCII text.
-  * beesstats.txt | statistics and performance counters. ASCII text.
-* BEESSTATUS: File containing a snapshot of current Bees state: performance
-  counters and current status of each thread. The file is meant to be
-  human readable, but understanding it probably requires reading the source.
-  You can watch bees run in realtime with a command like:
-
-      watch -n1 cat $BEESSTATUS
-
-Other options (e.g. interval between filesystem crawls) can be configured
-in src/bees.h or on the cmdline (see 'Command Line Options' below).
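Putting the environment variables together, a minimal hedged launch (the state directory and mount point reuse the Setup example above; the BEESSTATUS path is an arbitrary choice):

    export BEESHOME=/some/path
    export BEESSTATUS=/run/bees.status
    bees /var/lib/bees/$UUID &
    watch -n1 cat $BEESSTATUS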
-Running
--------
-
-Reduce CPU and IO priority to be kinder to other applications sharing
-this host (or raise them for more aggressive disk space recovery). If you
-use cgroups, put `bees` in its own cgroup, then reduce the `blkio.weight`
-and `cpu.shares` parameters. You can also use `schedtool` and `ionice`
-in the shell script that launches `bees`:
-
-    schedtool -D -n20 $$
-    ionice -c3 -p $$
-
-Let the bees fly:
-
-    for fs in /var/lib/bees/*-*-*-*-*/; do
-        bees "$fs" >> "$fs/.beeshome/bees.log" 2>&1 &
-    done
-
-You'll probably want to arrange for the log files to be rotated
-periodically. You may also want to set umask to 077 to prevent disclosure
-of information about the contents of the filesystem through the log file.
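Following the umask advice above, a hedged variant of the launch loop that keeps the log files private:

    umask 077    # new log files are created unreadable by other users
    for fs in /var/lib/bees/*-*-*-*-*/; do
        bees "$fs" >> "$fs/.beeshome/bees.log" 2>&1 &
    done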
-There are also some shell wrappers in the `scripts/` directory.
-
-Command Line Options
---------------------
-
-* --thread-count (-c) COUNT
-  * Specify maximum number of worker threads for scanning. Overrides
-    --thread-factor (-C) and default/autodetected values.
-* --thread-factor (-C) FACTOR
-  * Specify ratio of worker threads to CPU cores. Overridden by --thread-count (-c).
-    Default is 1.0, i.e. 1 worker thread per detected CPU. Use values
-    below 1.0 to leave some cores idle, or above 1.0 if there are more
-    disks than CPUs in the filesystem.
-* --loadavg-target (-g) LOADAVG
-  * Specify load average target for dynamic worker threads.
-    Threads will be started or stopped subject to the upper limit imposed
-    by thread-factor, thread-min and thread-count until the load average
-    is within +/- 0.5 of LOADAVG.
-* --thread-min (-G) COUNT
-  * Specify minimum number of worker threads for scanning.
-    Ignored unless -g option is used to specify a target load.
-* --scan-mode (-m) MODE
-  * Specify extent scanning algorithm. Default mode is 0.
-    _EXPERIMENTAL_ feature that may go away.
-  * Mode 0: scan extents in ascending order of (inode, subvol, offset).
-    Keeps shared extents between snapshots together. Reads files sequentially.
-    Minimizes temporary space usage.
-  * Mode 1: scan extents from all subvols in parallel. Good performance
-    on non-spinning media when subvols are unrelated.
-  * Mode 2: scan all extents from one subvol at a time. Good sequential
-    read performance for spinning media. Maximizes temporary space usage.
-* --timestamps (-t)
-  * Enable timestamps in log output.
-* --no-timestamps (-T)
-  * Disable timestamps in log output.
-* --absolute-paths (-p)
-  * Paths in log output will be absolute.
-* --strip-paths (-P)
-  * Paths in log output will have the working directory at Bees startup
-    stripped.
-* --verbose (-v)
-  * Set log verbosity (0 = no output, 8 = all output, default 8).
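Combining the options documented above, a hedged example invocation (the values are purely illustrative, and the mount point is the one from the Setup section):

    bees --thread-factor 0.5 --loadavg-target 5.0 --scan-mode 2 \
         --no-timestamps --strip-paths /var/lib/bees/$UUID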
+More Information
+----------------
+
+* [How bees works](docs/how-it-works.md)
+* [Missing bees features](docs/missing.md)
+* [Event counter descriptions](docs/event-counters.md)
 
 Bug Reports and Contributions
 -----------------------------
@@ -584,11 +67,9 @@ You can also use Github:
 
     https://github.com/Zygo/bees
 
 Copyright & License
-===================
+-------------------
 
-Copyright 2015-2017 Zygo Blaxell <bees@furryterror.org>.
+Copyright 2015-2018 Zygo Blaxell <bees@furryterror.org>.
 
 GPL (version 3 or later).
Deleted Gentoo overlay files (from `contrib/gentoo`, per the README above):

metadata/layout.conf (deleted)

@@ -1,18 +0,0 @@
# manifest-hashes specify hashes used for new/updated entries
# the current set went live on 2017-11-21, per the 2017-11-12 Council meeting
# https://archives.gentoo.org/gentoo-dev/message/ba2e5d9666ebd7e1bff1143485a37856
manifest-hashes = BLAKE2B SHA512

# The following hashes are required on all Manifest entries. If any
# of them are missing, repoman will refetch and rehash old distfiles.
# Otherwise, old distfiles will keep using their current hash set.
manifest-required-hashes = BLAKE2B

# No more old ChangeLogs in Git
update-changelog = false

# Sign Git commits, and NOT Manifests
sign-commits = true
sign-manifests = false

masters = gentoo
profiles/repo_name (deleted)

@@ -1 +0,0 @@
bees
sys-fs/bees/Manifest (deleted)

@@ -1,2 +0,0 @@
EBUILD bees-9999.ebuild 2001 BLAKE2B 7fa1c9d043a4334579dfad3560d1593717e548c0d31695cf8ccf8ffe45f2347584c7da43b47cad873745f3c843207433c6b892a0469c5618f107c68f78fd5fe2 SHA512 d49266e007895c049e1c9f7e28ec2f649b386a6441eccba02ee411f14ad395925eecdaa8a747962ccc526f9e1d3aba9fd68f4452a1d276d4e5b7d48c80102cd8
MISC metadata.xml 479 BLAKE2B ef5e110ba8d88f0188dbc0d12bec2ad45c51abf707656f6fe4e0fa498d933fe9c32c5dc4c9b446402ec686084459f9f075e52f33402810962c1ac6b149fb70c8 SHA512 3fcc136ed4c55323cac4f8cf542210eb77f73e2a80f95fcce2d688bc645f6e5126404776536dedc938b18287b54abbc264610cc2f587a42a3a8e6d7bf8415aaa
sys-fs/bees/bees-9999.ebuild (deleted)

@@ -1,66 +0,0 @@
# Copyright 1999-2018 Gentoo Foundation
# Distributed under the terms of the GNU General Public License v2

EAPI=7

inherit linux-info

DESCRIPTION="Best-Effort Extent-Same, a btrfs dedup agent"
HOMEPAGE="https://github.com/Zygo/bees"

if [[ ${PV} == "9999" ]] ; then
	EGIT_REPO_URI="https://github.com/Zygo/bees.git"
	inherit git-r3
else
	SRC_URI="https://github.com/Zygo/bees/archive/v${PV}.tar.gz -> ${P}.tar.gz"
	KEYWORDS="~amd64"
fi

LICENSE="GPL-3"
SLOT="0"
IUSE="tools"

DEPEND="
	>=sys-apps/util-linux-2.30.2
	>=sys-fs/btrfs-progs-4.1
"
RDEPEND="${DEPEND}"

CONFIG_CHECK="~BTRFS_FS"
ERROR_BTRFS_FS="CONFIG_BTRFS_FS: bees currently only works with btrfs"

pkg_pretend() {
	if [[ ${MERGE_TYPE} != buildonly ]]; then
		if kernel_is -lt 4 4 3; then
			ewarn "Kernel versions below 4.4.3 lack critical features needed for bees to"
			ewarn "properly operate, so it won't work. It's recommended to run at least"
			ewarn "kernel version 4.11 for best performance and reliability."
			ewarn
		elif kernel_is -lt 4 11; then
			ewarn "With kernel versions below 4.11, bees may severely degrade system performance"
			ewarn "and responsiveness. Especially, the kernel may deadlock while bees is"
			ewarn "running; it's recommended to run at least kernel 4.11."
			ewarn
		elif kernel_is -lt 4 14 29; then
			ewarn "With kernel versions below 4.14.29, bees may generate a lot of bogus WARN_ON()"
			ewarn "messages in the kernel log. These messages can be ignored and this is fixed"
			ewarn "with more recent kernels:"
			ewarn "# WARNING: CPU: 3 PID: 18172 at fs/btrfs/backref.c:1391 find_parent_nodes+0xc41/0x14e0"
			ewarn
		fi
		elog "bees recommends running the latest current kernel for performance and"
		elog "reliability reasons, see README.md."
	fi
}

src_configure() {
	cat >localconf <<-EOF || die
		LIBEXEC_PREFIX=/usr/libexec
		PREFIX=/usr
		LIBDIR=$(get_libdir)
		DEFAULT_MAKE_TARGET=all
	EOF
	if use tools; then
		echo OPTIONAL_INSTALL_TARGETS=install_tools >>localconf || die
	fi
}
sys-fs/bees/metadata.xml (deleted)

@@ -1,15 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE pkgmetadata SYSTEM "http://www.gentoo.org/dtd/metadata.dtd">
<pkgmetadata>
	<maintainer type="person">
		<email>hurikhan77+bgo@gmail.com</email>
		<name>Kai Krakow</name>
	</maintainer>
	<use>
		<flag name="tools">Build extra tools useful for debugging (fiemap, fiewalk, beestop)</flag>
	</use>
	<upstream>
		<bugs-to>https://github.com/Zygo/bees/issues</bugs-to>
		<remote-id type="github">Zygo/bees</remote-id>
	</upstream>
</pkgmetadata>
docs/.gitignore (new file, vendored)

@@ -0,0 +1 @@
*.html
docs/Makefile (new file)

@@ -0,0 +1,18 @@
MARKDOWN := $(firstword $(shell command -v cmark-gfm redcarpet markdown2 markdown markdown_py 2>/dev/null || echo markdown))

# If you have cmark-gfm, you get Github-style tables; otherwise, you don't.
ifeq ($(notdir $(MARKDOWN)),cmark-gfm)
MARKDOWN += -e table
endif

.PHONY: docs

docs: $(subst .md,.html,$(wildcard *.md)) index.html ../README.html

%.html: %.md Makefile
	$(MARKDOWN) $< | sed -e 's/\.md/\.html/g' > $@.new
	mv -f $@.new $@

index.md: ../README.md
	sed -e 's:docs/::g' < ../README.md > index.md.new
	mv -f index.md.new index.md
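With this file in place, the top-level `doc` target from the Makefile diff above (`+$(MAKE) -C docs`) drives the HTML build, assuming one of the listed markdown converters is installed:

    make doc    # equivalent to: make -C docs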
docs/_config.yml (new file)

@@ -0,0 +1 @@
theme: jekyll-theme-cayman
docs/btrfs-kernel.md (new file)

@@ -0,0 +1,172 @@
Recommended Kernel Version for bees
===================================

First, a warning that is not specific to bees:

> **Kernels 5.1, 5.2, and 5.3 should not be used with btrfs due to a
> severe regression that can lead to fatal metadata corruption.**
> This issue is fixed in kernel 5.4.14 and later.

**Recommended kernel versions for bees are 4.19, 5.4, 5.10, 5.11, or 5.12,
with recent LTS and -stable updates.** The latest released kernel as
of this writing is 5.12.3.

4.14, 4.9, and 4.4 LTS kernels with recent updates are OK with
some issues. Older kernels will be slower (a little slower or a lot
slower depending on which issues are triggered). Not all fixes are
backported.

Obsolete non-LTS kernels have a variety of unfixed issues and should
not be used with btrfs. For details see the table below.

bees requires btrfs kernel API version 4.2 or higher, and does not work
on older kernels.

bees will detect and use btrfs kernel API up to version 4.15 if present.
In some future bees release, this API version may become mandatory.

Kernel Bug Tracking Table
-------------------------

These bugs are particularly popular among bees users:

| First bad kernel | Last bad kernel | Issue Description | Fixed Kernel Versions | Fix Commit
| :---: | :---: | --- | :---: | ---
| - | 4.10 | garbage inserted in read data when reading compressed inline extent followed by a hole | 3.18.89, 4.1.49, 4.4.107, 4.9.71, 4.11 and later | e1699d2d7bf6 btrfs: add missing memset while reading compressed inline extents
| - | 4.14 | spurious warnings from `fs/btrfs/backref.c` in `find_parent_nodes` | 3.16.57, 4.14.29, 4.15.12, 4.16 and later | c8195a7b1ad5 btrfs: remove spurious WARN_ON(ref->count < 0) in find_parent_nodes
| 4.15 | 4.18 | compression ratio and performance regression on bees test corpus | improved in 4.19 | 4.14 performance not fully restored yet
| - | 5.0 | silently corrupted data returned when reading compressed extents around a punched hole (bees dedupes all-zero data blocks with holes which can produce a similar effect to hole punching) | 3.16.70, 3.18.137, 4.4.177, 4.9.165, 4.14.108, 4.19.31, 5.0.4, 5.1 and later | 8e928218780e Btrfs: fix corruption reading shared and compressed extents after hole punching
| - | 5.0 | deadlock when dedupe and rename are used simultaneously on the same files | 5.0.4, 5.1 and later | 4ea748e1d2c9 Btrfs: fix deadlock between clone/dedupe and rename
| - | 5.1 | send failure or kernel crash while running send and dedupe on same snapshot at same time | 5.0.18, 5.1.4, 5.2 and later | 62d54f3a7fa2 Btrfs: fix race between send and deduplication that lead to failures and crashes
| - | 5.2 | alternating send and dedupe results in incremental send failure | 4.9.188, 4.14.137, 4.19.65, 5.2.7, 5.3 and later | b4f9a1a87a48 Btrfs: fix incremental send failure after deduplication
| 4.20 | 5.3 | balance convert to single rejected with error on 32-bit CPUs | 5.3.7, 5.4 and later | 7a54789074a5 btrfs: fix balance convert to single on 32-bit host CPUs
| - | 5.3 | kernel crash due to tree mod log issue #1 (often triggered by bees) | 3.16.79, 4.4.195, 4.9.195, 4.14.147, 4.19.77, 5.2.19, 5.3.4, 5.4 and later | efad8a853ad2 Btrfs: fix use-after-free when using the tree modification log
| - | 5.4 | kernel crash due to tree mod log issue #2 (often triggered by bees) | 3.16.83, 4.4.208, 4.9.208, 4.14.161, 4.19.92, 5.4.7, 5.5 and later | 6609fee8897a Btrfs: fix removal logic of the tree mod log that leads to use-after-free issues
| 5.1 | 5.4 | metadata corruption resulting in loss of filesystem when a write operation occurs while balance starts a new block group. **Do not use kernel 5.1 with btrfs.** Kernel 5.2 and 5.3 have workarounds that may detect corruption in progress and abort before it becomes permanent, but do not prevent corruption from occurring. Also kernel crash due to tree mod log issue #4. | 5.4.14, 5.5 and later | 6282675e6708 btrfs: relocation: fix reloc_root lifespan and access
| - | 5.4 | send performance failure when shared extents have too many references | 4.9.207, 4.14.159, 4.19.90, 5.3.17, 5.4.4, 5.5 and later | fd0ddbe25095 Btrfs: send, skip backreference walking for extents with many references
| 5.0 | 5.5 | dedupe fails to remove the last extent in a file if the file size is not a multiple of 4K | 5.4.19, 5.5.3, 5.6 and later | 831d2fa25ab8 Btrfs: make deduplication with range including the last block work
| 4.5, backported to 3.18.31, 4.1.22, 4.4.4 | 5.5 | `df` incorrectly reports 0 free space while data space is available. Triggered by changes in metadata size, including those typical of large-scale dedupe. Occurs more often starting in 5.3 and especially 5.4 | 4.4.213, 4.9.213, 4.14.170, 4.19.102, 5.4.18, 5.5.2, 5.6 and later | d55966c4279b btrfs: do not zero f_bavail if we have available space
| - | 5.5 | kernel crash due to tree mod log issue #3 (often triggered by bees) | 3.16.84, 4.4.214, 4.9.214, 4.14.171, 4.19.103, 5.4.19, 5.5.3, 5.6 and later | 7227ff4de55d Btrfs: fix race between adding and putting tree mod seq elements and nodes
| - | 5.6 | deadlock when enumerating file references to physical extent addresses while some references still exist in deleted subvols | 5.7 and later | 39dba8739c4e btrfs: do not resolve backrefs for roots that are being deleted
| - | 5.6 | deadlock when many extent reference updates are pending and available memory is low | 4.14.177, 4.19.116, 5.4.33, 5.5.18, 5.6.5, 5.7 and later | 351cbf6e4410 btrfs: use nofs allocations for running delayed items
| - | 5.6 | excessive CPU usage in `LOGICAL_INO` and `FIEMAP` ioctl and increased btrfs write latency in other processes when bees translates from extent physical address to list of referencing files and offsets. Also affects other tools like `duperemove` and `btrfs send` | 5.4.96, 5.7 and later | b25b0b871f20 btrfs: backref, use correct count to resolve normal data refs, plus 3 parent commits. Some improvements also in earlier kernels.
| - | 5.7 | filesystem becomes read-only if out of space while deleting snapshot | 4.9.238, 4.14.200, 4.19.149, 5.4.69, 5.8 and later | 7c09c03091ac btrfs: don't force read-only after error in drop snapshot
| 5.1 | 5.7 | balance, device delete, or filesystem shrink operations loop endlessly on a single block group without decreasing extent count | 5.4.54, 5.7.11, 5.8 and later | 1dae7e0e58b4 btrfs: reloc: clear DEAD\_RELOC\_TREE bit for orphan roots to prevent runaway balance
| - | 5.8 | deadlock in `TREE_SEARCH` ioctl (core component of bees filesystem scanner), followed by regression in deadlock fix | 4.4.237, 4.9.237, 4.14.199, 4.19.146, 5.4.66, 5.8.10 and later | a48b73eca4ce btrfs: fix potential deadlock in the search ioctl, 1c78544eaa46 btrfs: fix wrong address when faulting in pages in the search ioctl
| 5.7 | 5.10 | kernel crash if balance receives fatal signal e.g. Ctrl-C | 5.4.93, 5.10.11, 5.11 and later | 18d3bff411c8 btrfs: don't get an EINTR during drop_snapshot for reloc
| 5.10 | 5.10 | 20x write performance regression | 5.10.8, 5.11 and later | e076ab2a2ca7 btrfs: shrink delalloc pages instead of full inodes
| 5.4 | 5.11 | spurious tree checker failures on extent ref hash | 5.11.5, 5.12 and later | 1119a72e223f btrfs: tree-checker: do not error out if extent ref hash doesn't match
| - | 5.11 | tree mod log issue #5 | 4.4.263, 4.9.263, 4.14.227, 4.19.183, 5.4.108, 5.10.26, 5.11.9, 5.12 and later | dbcc7d57bffc btrfs: fix race when cloning extent buffer during rewind of an old root
| - | 5.12 | tree mod log issue #6 | 4.14.233, 4.19.191, 5.4.118, 5.10.36, 5.11.20, 5.12.3, 5.13 and later | f9690f426b21 btrfs: fix race when picking most recent mod log operation for an old root
| 4.15 | - | spurious warnings from `fs/fs-writeback.c` when `flushoncommit` is enabled | - | workaround: comment out the `WARN_ON`

"Last bad kernel" refers to that version's last stable update from
kernel.org. Distro kernels may backport additional fixes. Consult
your distro's kernel support for details.

When the same version appears in both "last bad kernel" and "fixed kernel
version" columns, it means the bug appears in the `.0` release and is
fixed in the stated `.y` release. e.g. a "last bad kernel" of 5.4 and
a "fixed kernel version" of 5.4.14 has the bug in kernel versions 5.4.0
through 5.4.13 inclusive.

A "-" for "first bad kernel" indicates the bug has been present since
the relevant feature first appeared in btrfs.

A "-" for "last bad kernel" indicates the bug has not yet been fixed as
of 5.12.3.

In cases where issues are fixed by commits spread out over multiple
kernel versions, "fixed kernel version" refers to the version that
contains all components of the fix.

Workarounds for known kernel bugs
---------------------------------

* **Tree mod log issues**: bees will detect that a btrfs balance is
  running, and pause bees activity until the balance is done. This avoids
  running both the `LOGICAL_INO` ioctl and btrfs balance at the same time,
  which avoids kernel crashes on old kernel versions.

  The numbers for "tree mod log issue #" in the above table are arbitrary.
  There are a lot of them, and they all behave fairly similarly.

  This workaround is less necessary for kernels 5.4.19 and later.

* **Slow backrefs** (aka toxic extents): Under certain conditions,
  if the number of references to a single shared extent grows too
  high, the kernel consumes more and more CPU while also holding locks
  that delay write access to the filesystem. bees avoids this bug
  by measuring the time the kernel spends performing `LOGICAL_INO`
  operations and permanently blacklisting any extent or hash involved
  where the kernel starts to get slow. In the bees log, such blocks
  are labelled as 'toxic' hash/block addresses. Toxic extents are
  rare (about 1 in 100,000 extents become toxic), but toxic extents can
  become 8 orders of magnitude more expensive to process than the fastest
  non-toxic extents. This seems to affect all dedupe agents on btrfs;
  at the time of writing only bees has a workaround for this bug.

  This workaround is less necessary for kernels 5.4.96, 5.7 and later,
  though it can still take 2 ms of CPU to resolve each extent ref on a
  fast machine on a large, heavily fragmented file.

* **dedupe breaks `btrfs send` in old kernels**. The bees option
  `--workaround-btrfs-send` prevents any modification of read-only subvols
  in order to avoid breaking `btrfs send`.

  This workaround is no longer necessary to avoid kernel crashes
  and send performance failure on kernel 4.9.207, 4.14.159, 4.19.90,
  5.3.17, 5.4.4, 5.5 and later; however, some conflict between send
  and dedupe still remains, so the workaround is still useful.

  `btrfs receive` is not and has never been affected by this issue.

Unfixed kernel bugs
-------------------

As of 5.12.3:

* **The kernel does not permit `btrfs send` and dedupe to run at the
  same time**. Recent kernels no longer crash, but now refuse one
  operation with an error if the other operation was already running.

  bees has not been updated to handle the new dedupe behavior optimally.
  Optimal behavior is to defer dedupe operations when send is detected,
  and resume after the send is finished. Current bees behavior is to
  complain loudly about each individual dedupe failure in log messages,
  and abandon duplicate data references in the snapshot that send is
  processing. A future bees version shall have better handling for
  this situation.

  Workaround: send `SIGSTOP` to bees, or terminate the bees process,
  before running `btrfs send` (a sketch follows at the end of this list).

  This workaround is not strictly required if the snapshot is deleted after
  sending. In that case, any duplicate data blocks that were not removed
  by dedupe will be removed by snapshot delete instead. The workaround
  still saves some IO.

  `btrfs receive` is not affected by this issue.
* **Spurious warnings in `fs/fs-writeback.c`** on kernel 4.15 and later
  when the filesystem is mounted with `flushoncommit`. These
  seem to be harmless (there are other locks which prevent
  concurrent umount of the filesystem), but the underlying
  problems that trigger the `WARN_ON` are [not trivial to
  fix](https://www.spinics.net/lists/linux-btrfs/msg87752.html).

  The warnings can be especially voluminous when bees is running.

  Workarounds:

  1. mount with `-o noflushoncommit`
  2. patch the kernel to remove the warning in `fs/fs-writeback.c`.

  Note that using kernels 4.14 and earlier is *not* a viable workaround
  for this issue, because kernels 4.14 and earlier will eventually
  deadlock when a filesystem is mounted with `-o flushoncommit` (a single
  commit fixes one bug and introduces the other).
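A hedged sketch of the `SIGSTOP` workaround mentioned above (assumes the daemon runs as a process named `bees`; adjust if you use the beesd wrapper, and the send/receive paths are placeholders):

    kill -STOP $(pidof bees)    # pause dedupe so send can proceed
    btrfs send /fs/snap | btrfs receive /backup
    kill -CONT $(pidof bees)    # resume dedupe when send is done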
62
docs/btrfs-other.md
Normal file
62
docs/btrfs-other.md
Normal file
@@ -0,0 +1,62 @@

Good Btrfs Feature Interactions
-------------------------------

bees has been tested in combination with the following:

* btrfs compression (zlib, lzo, zstd), mixtures of compressed and uncompressed extents
* PREALLOC extents (unconditionally replaced with holes)
* HOLE extents and btrfs no-holes feature
* Other deduplicators, reflink copies (though bees may decide to redo their work)
* btrfs snapshots and non-snapshot subvols (RW and RO)
* Concurrent file modification (e.g. PostgreSQL and sqlite databases, build daemons)
* all btrfs RAID profiles
* IO errors during dedupe (read errors will throw exceptions, bees will catch them and skip over the affected extent)
* Filesystems mounted *with* the flushoncommit option ([lots of harmless kernel log warnings on 4.15 and later](btrfs-kernel.md))
* Filesystems mounted *without* the flushoncommit option
* 4K filesystem data block size / clone alignment
* 64-bit and 32-bit LE host CPUs (amd64, x86, arm)
* Huge files (>1TB--although btrfs performance on such files isn't great in general)
* filesystems up to 30TB+ in size, 100M+ files
* btrfs receive
* btrfs nodatacow/nodatasum inode attribute or mount option (bees skips all nodatasum files)
* open(O_DIRECT) (seems to work as well--or as poorly--with bees as with any other btrfs feature)
* lvmcache: no problems observed in testing with recent kernels or reported by users in the last year.

Bad Btrfs Feature Interactions
------------------------------

bees has been tested in combination with the following, and various problems are known:

* bcache: no data-losing problems observed in testing with recent kernels
  or reported by users in the last year. Some issues observed with
  bcache interacting badly with some SSD models' firmware, but so far
  this only causes temporary loss of service, not filesystem damage.
  This behavior does not seem to be specific to bees (ordinary filesystem
  tests with rsync and snapshots will reproduce it), but it does prevent
  any significant testing of bees on bcache.

* btrfs send: there are bugs in `btrfs send` that can be triggered by bees.
  The [`--workaround-btrfs-send` option](options.md) works around this issue
  by preventing bees from modifying read-only snapshots.

* btrfs qgroups: very slow, sometimes hangs...and it's even worse when
  bees is running.

* btrfs autodefrag mount option: hangs and high CPU usage problems
  reported by users. bees cannot distinguish autodefrag activity from
  normal filesystem activity, and will likely try to undo the autodefrag
  if duplicate copies of the defragmented data exist.

Untested Btrfs Feature Interactions
-----------------------------------

bees has not been tested with the following, and undesirable interactions may occur:

* Non-4K filesystem data block size (should work if recompiled)
* Non-equal hash (SUM) and filesystem data block (CLONE) sizes (need to fix that eventually)
* btrfs seed filesystems (does anyone even use those?)
* btrfs out-of-tree kernel patches (e.g. in-kernel dedupe or encryption)
* btrfs-convert from ext2/3/4 (never tested, might run out of space or ignore significant portions of the filesystem due to sanity checks)
* btrfs mixed block groups (don't know a reason why it would *not* work, but never tested)
* flashcache: an out-of-tree cache-HDD-on-SSD block layer helper.
* Host CPUs with exotic page sizes, alignment requirements, or endianness (ppc, alpha, sparc, strongarm, s390, mips, m68k...)

151 docs/config.md Normal file
@@ -0,0 +1,151 @@

bees Configuration
==================

The only configuration parameter that *must* be provided is the hash
table size. Other parameters are optional or hardcoded, and the defaults
are reasonable in most cases.

Hash Table Sizing
-----------------

Hash table entries are 16 bytes per data block. The hash table stores
the most recently read unique hashes. Once the hash table is full,
each new entry in the table evicts an old entry.

Here are some numbers to estimate appropriate hash table sizes:

    unique data size | hash table size | average dedupe extent size
        1TB          |       4GB       |     4K
        1TB          |       1GB       |    16K
        1TB          |     256MB       |    64K
        1TB          |     128MB       |   128K  <- recommended
        1TB          |      16MB       |  1024K
       64TB          |       1GB       |  1024K

Notes:

* If the hash table is too large, no extra dedupe efficiency is
  obtained, and the extra space just wastes RAM. Extra space can also slow
  bees down by preventing old data from being evicted, so bees wastes time
  looking for matching data that is no longer present on the filesystem.

* If the hash table is too small, bees extrapolates from matching
  blocks to find matching adjacent blocks in the filesystem that have been
  evicted from the hash table. In other words, bees only needs to find
  one block in common between two extents in order to be able to dedupe
  the entire extents. This provides significantly more dedupe hit rate
  per hash table byte than other dedupe tools.

* When counting unique data in compressed data blocks to estimate
  optimum hash table size, count the *uncompressed* size of the data.

* Another way to approach the hash table size is to simply decide how much
  RAM can be spared without too much discomfort, give bees that amount of
  RAM, and accept whatever dedupe hit rate occurs as a result. bees will
  do the best job it can with the RAM it is given.
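
A short sketch of the rule of thumb from the table above: pick a size
of 128MB per TB of unique (uncompressed) data, then create the hash
table file before starting bees. The 3TB estimate is an example, and
the `truncate`-based creation of `$BEESHOME/beeshash.dat` is an
assumption to adjust to your own layout:

    # Size the hash table at 128MB per TB of unique data
    UNIQUE_TB=3                           # estimated unique data, in TB
    SIZE_MB=$((UNIQUE_TB * 128))          # -> 384MB hash table
    truncate -s "${SIZE_MB}M" "$BEESHOME/beeshash.dat"
    chmod 700 "$BEESHOME/beeshash.dat"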

Factors affecting optimal hash table size
-----------------------------------------

It is difficult to predict the net effect of data layout and access
patterns on dedupe effectiveness without performing deep inspection of
both the filesystem data and its structure--a task that is as expensive
as performing the deduplication.

* **Compression** on the filesystem reduces the average extent length
  compared to uncompressed filesystems. The maximum compressed extent
  length on btrfs is 128KB, while the maximum uncompressed extent length
  is 128MB. Longer extents decrease the optimum hash table size while
  shorter extents increase the optimum hash table size, because the
  probability of a hash table entry being present (i.e. unevicted) in
  each extent is proportional to the extent length.

  As a rule of thumb, the optimal hash table size for a compressed
  filesystem is 2-4x larger than the optimal hash table size for the same
  data on an uncompressed filesystem. Dedupe efficiency falls dramatically
  with hash tables smaller than 128MB/TB as the average dedupe extent size
  is larger than the largest possible compressed extent size (128KB).

* **Short writes** also shorten the average extent length and increase
  the optimum hash table size. If a database writes to files randomly using
  4K page writes, all of these extents will be 4K in length, and the hash
  table size must be increased to retain each one (or the user must accept
  a lower dedupe hit rate).

  Defragmenting files that have had many short writes increases the
  extent length and therefore reduces the optimum hash table size.

* **Time between duplicate writes** also affects the optimum hash table
  size. bees reads data blocks in logical order during its first pass,
  and after that new data blocks are read incrementally a few seconds or
  minutes after they are written. bees finds more matching blocks if there
  is a smaller amount of data between the matching reads, i.e. there are
  fewer blocks evicted from the hash table. If most identical writes to
  the filesystem occur near the same time, the optimum hash table size is
  smaller. If most identical writes occur over longer intervals of time,
  the optimum hash table size must be larger to avoid evicting hashes from
  the table before matches are found.

  For example, a build server normally writes out very similar source
  code files over and over, so it will need a smaller hash table than a
  backup server which has to refer to the oldest data on the filesystem
  every time a new client machine's data is added to the server.

Scanning modes for multiple subvols
-----------------------------------

The `--scan-mode` option affects how bees divides resources between
subvolumes. This is particularly relevant when there are snapshots,
as there are tradeoffs to be made depending on how snapshots are used
on the filesystem.

Note that if a filesystem has only one subvolume (i.e. the root,
subvol ID 5) then the `--scan-mode` option has no effect, as there is
only one subvolume to scan.

The default mode is mode 0, "lockstep". In this mode, each inode of each
subvol is scanned at the same time, before moving to the next inode in
each subvol. This maximizes the likelihood that all of the references to
a snapshot of a file are scanned at the same time, which takes advantage
of VFS caching in the Linux kernel. If snapshots are created very often,
bees will not make very good progress as it constantly restarts the
filesystem scan from the beginning each time a new snapshot is created.

Scan mode 1, "independent", simply scans every subvol independently
in parallel. Each subvol's scanner shares time equally with all other
subvol scanners. Whenever a new subvol appears, a new scanner is
created, and the new subvol scanner doesn't affect the behavior of any
existing subvol scanner.

Scan mode 2, "sequential", processes each subvol completely before
proceeding to the next subvol. This is a good mode when using bees for
the first time on a filesystem that already has many existing snapshots
and a high rate of new snapshot creation. Short-lived snapshots
(e.g. those used for `btrfs send`) are effectively ignored, and bees
directs its efforts toward older subvols that are more likely to be
origin subvols for snapshots. By deduping origin subvols first, bees
ensures that future snapshots will already be deduplicated and do not
need to be deduplicated again.

If you are using bees for the first time on a filesystem with many
existing snapshots, you should read about [snapshot gotchas](gotchas.md).
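
As an illustration, a first run on a filesystem that already has many
snapshots might select the sequential mode explicitly. The mount point
is a placeholder, and the option would normally be passed through
whatever wrapper or service starts bees:

    # Scan each subvol to completion before moving to the next
    bees --scan-mode 2 /mnt/btrfs-root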

Threads and load management
---------------------------

By default, bees creates one worker thread for each CPU detected.
These threads then perform scanning and dedupe operations. The number of
worker threads can be set with the [`--thread-count` and `--thread-factor`
options](options.md).

If desired, bees can automatically increase or decrease the number
of worker threads in response to system load. This reduces the impact on
the rest of the system by pausing bees when other CPU- and IO-intensive
loads are active on the system, and resuming bees when the other loads
are inactive. This is configured with the [`--loadavg-target` and
`--thread-min` options](options.md).
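
A sketch combining these options (all values are illustrative, not
recommendations):

    # Use at most half the CPUs, and fall back toward one worker thread
    # whenever the 1-minute load average rises above 5
    bees --thread-factor 0.5 --loadavg-target 5 --thread-min 1 /mnt/btrfs-root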

Log verbosity
-------------

bees can be made less chatty with the [`--verbose` option](options.md).

398 docs/event-counters.md Normal file
@@ -0,0 +1,398 @@

Event Counters
==============

General
-------

Event counters are used in bees to collect simple branch-coverage
statistics. Every time bees makes a decision, it increments an event
counter, so there are _many_ event counters.

Events are grouped by prefix in their event names, e.g. `block` is block
I/O, `dedup` is deduplication requests, `tmp` is temporary files, etc.

Events with the suffix `_ms` count total milliseconds spent performing
the operation. These are counted separately for each thread, so there
can be more than 1000 ms per second.

There is considerable overlap between some events, e.g. `example_try`
denotes an event that is counted when an action is attempted,
`example_hit` is counted when the attempt succeeds and has a desired
outcome, and `example_miss` is counted when the attempt succeeds but
the desired outcome is not achieved. In most cases `example_try =
example_hit + example_miss`, plus the number of attempts that failed
and threw an exception, but some event groups defy such simple equations.
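
As a sketch of that relationship, with made-up values for the `dedup`
group:

    # try = hit + miss + (failed with an exception); values are invented
    dedup_try=1000; dedup_hit=900; dedup_miss=80
    echo "failed with exception: $((dedup_try - dedup_hit - dedup_miss))"   # 20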

addr
----

The `addr` event group consists of operations related to translating `(root,
inode, offset)` tuples (i.e. logical position within a file) into btrfs
virtual block addresses (i.e. physical position on disk).

* `addr_block`: The address of a block was computed.
* `addr_compressed`: Obsolete implementation of `addr_compressed_offset`.
* `addr_compressed_offset`: The address of a compressed block was computed.
* `addr_delalloc`: The address of a block could not be computed due to
delayed allocation. Only possible when using the obsolete `FIEMAP` code.
* `addr_eof_e`: The address of a block at EOF that was not block-aligned was computed.
* `addr_from_fd`: The address of a block was computed using an `fd`
(open to the file in question) and `offset` pair.
* `addr_from_root_fd`: The address of a block was computed using
the filesystem root `fd` instead of the open file `fd` for the
`TREE_SEARCH_V2` ioctl. This is obsolete and should probably be removed
at some point.
* `addr_hole`: The address of a block in a hole was computed.
* `addr_magic`: The address of a block could not be determined in a way
that bees can use (unrecognized flags, or flags known to be incompatible
with bees).
* `addr_uncompressed`: The address of an uncompressed block was computed.
* `addr_unrecognized`: The address of a block with unrecognized flags
(i.e. kernel version newer than bees) was computed.
* `addr_unusable`: The address of a block with unusable flags (i.e. flags
that are known to be incompatible with bees) was computed.

adjust
------

The `adjust` event group consists of operations related to translating stored virtual block addresses (i.e. physical position on disk) to `(root, inode, offset)` tuples (i.e. logical positions within files). `BeesResolver::adjust_offset` determines if a single candidate reference from the `LOGICAL_INO` ioctl corresponds to the requested btrfs virtual block address.

* `adjust_compressed_offset_correct`: A block address corresponding to a compressed block was retrieved from the hash table and resolved to a physical block containing data that matches another block bees has already read.
* `adjust_compressed_offset_wrong`: A block address corresponding to a compressed block was retrieved from the hash table and resolved to a physical block containing data that matches the hash but not the data from another block bees has already read (i.e. there was a hash collision).
* `adjust_eof_fail`: A block address corresponding to a block at EOF that was not aligned to a block boundary matched another block bees already read, but the length of the unaligned data in both blocks was not equal. This is usually caused by stale entries in the hash table pointing to blocks that have been overwritten since the hash table entries were created. It can also be caused by hash collisions, but hashes are not yet computed at this point in the code, so this event does not correlate to the `hash_collision` counter.
* `adjust_eof_haystack`: A block address from the hash table corresponding to a block at EOF that was not aligned to a block boundary was processed.
* `adjust_eof_hit`: A block address corresponding to a block at EOF that was not aligned to a block boundary matched a similarly unaligned block that bees already read.
* `adjust_eof_miss`: A block address from the hash table corresponding to a block at EOF that was not aligned to a block boundary did not match a similarly unaligned block that bees already read.
* `adjust_eof_needle`: A block address from scanning the disk corresponding to a block at EOF that was not aligned to a block boundary was processed.
* `adjust_exact`: A block address from the hash table corresponding to an uncompressed data block was processed to find its `(root, inode, offset)` references.
* `adjust_exact_correct`: A block address corresponding to an uncompressed block was retrieved from the hash table and resolved to a physical block containing data that matches another block bees has already read.
* `adjust_exact_wrong`: A block address corresponding to an uncompressed block was retrieved from the hash table and resolved to a physical block containing data that matches the hash but not the data from another block bees has already read (i.e. there was a hash collision).
* `adjust_hit`: A block address was retrieved from the hash table and resolved to a physical block containing data that matches the data from another block bees has already read (i.e. a duplicate match was found).
* `adjust_miss`: A block address was retrieved from the hash table and resolved to a physical block containing a hash that does not match the hash from another block bees has already read (i.e. the hash table contained a stale entry and the data it referred to has since been overwritten in the filesystem).
* `adjust_needle_too_long`: A block address was retrieved from the hash table, but when the corresponding extent item was retrieved, its offset or length were out of range to be a match (i.e. the hash table contained a stale entry and the data it referred to has since been overwritten in the filesystem).
* `adjust_no_match`: A hash collision occurred (i.e. a block on disk was located with the same hash as the hash table entry but different data). Effectively an alias for `hash_collision` as it is not possible to have one event without the other.
* `adjust_offset_high`: The `LOGICAL_INO` ioctl gave an extent item that does not overlap with the desired block because the extent item ends before the desired block in the extent data.
* `adjust_offset_low`: The `LOGICAL_INO` ioctl gave an extent item that does not overlap with the desired block because the extent item begins after the desired block in the extent data.
* `adjust_try`: A block address and extent item candidate were passed to `BeesResolver::adjust_offset` for processing.

block
-----

The `block` event group consists of operations related to reading data blocks from the filesystem.

* `block_bytes`: Number of data bytes read.
* `block_hash`: Number of block hashes computed.
* `block_ms`: Total time reading data blocks.
* `block_read`: Number of data blocks read.
* `block_zero`: Number of data blocks read with zero contents (i.e. candidates for replacement with a hole).

bug
---

The `bug` event group consists of known bugs in bees.

* `bug_bad_max_transid`: A bad `max_transid` was found and removed in `beescrawl.dat`.
* `bug_bad_min_transid`: A bad `min_transid` was found and removed in `beescrawl.dat`.
* `bug_dedup_same_physical`: `BeesContext::dedup` detected that the physical extent was the same for `src` and `dst`. This has no effect on space usage so it is a waste of time, and also carries the risk of creating a toxic extent.
* `bug_grow_pair_overlaps`: Two identical blocks were found, and while searching matching adjacent extents, the potential `src` grew to overlap the potential `dst`. This would create a cycle where bees keeps trying to eliminate blocks but instead just moves them around.
* `bug_hash_duplicate_cell`: Two entries in the hash table were identical. This only happens due to data corruption or a bug.
* `bug_hash_magic_addr`: An entry in the hash table contains an address with magic. Magic addresses cannot be deduplicated so they should not be stored in the hash table.

chase
-----

The `chase` event group consists of operations connecting btrfs virtual block addresses with `(root, inode, offset)` tuples. `resolve` is the top level, `adjust` is the bottom level, and `chase` is the middle level. `BeesResolver::chase_extent_ref` iterates over `(root, inode, offset)` tuples from `LOGICAL_INO` and attempts to find a single matching block in the filesystem given a candidate block from an earlier `scan` operation.

* `chase_corrected`: A matching block was resolved to a `(root, inode, offset)` tuple, but the offset of a block matching data did not match the offset given by `LOGICAL_INO`.
* `chase_hit`: A block address was successfully and correctly translated to a `(root, inode, offset)` tuple.
* `chase_no_data`: A block address was not successfully translated to a `(root, inode, offset)` tuple.
* `chase_no_fd`: A `(root, inode)` tuple could not be opened (i.e. the file was deleted on the filesystem).
* `chase_try`: A block address translation attempt started.
* `chase_uncorrected`: A matching block was resolved to a `(root, inode, offset)` tuple, and the offset of a block matching data did match the offset given by `LOGICAL_INO`.
* `chase_wrong_addr`: The btrfs virtual address (i.e. physical block address) found at a candidate `(root, inode, offset)` tuple did not match the expected btrfs virtual address (i.e. the filesystem was modified during the resolve operation).
* `chase_wrong_magic`: The extent item at a candidate `(root, inode, offset)` tuple has magic bits and cannot match any btrfs virtual address in the hash table (i.e. the filesystem was modified during the resolve operation).

crawl
-----

The `crawl` event group consists of operations related to scanning btrfs trees to find new extent refs to scan for dedupe.

* `crawl_blacklisted`: An extent was not scanned because it belongs to a blacklisted file.
* `crawl_create`: A new subvol crawler was created.
* `crawl_done`: One pass over all subvols on the filesystem was completed.
* `crawl_empty`: A `TREE_SEARCH_V2` ioctl call failed or returned an empty set (usually because all data in the subvol was scanned).
* `crawl_fail`: A `TREE_SEARCH_V2` ioctl call failed.
* `crawl_gen_high`: An extent item in the search results refers to an extent that is newer than the current crawl's `max_transid` allows.
* `crawl_gen_low`: An extent item in the search results refers to an extent that is older than the current crawl's `min_transid` allows.
* `crawl_hole`: An extent item in the search results refers to a hole.
* `crawl_inline`: An extent item in the search results contains an inline extent.
* `crawl_items`: An item in the `TREE_SEARCH_V2` data was processed.
* `crawl_ms`: Time spent running the `TREE_SEARCH_V2` ioctl.
* `crawl_no_empty`: Attempted to delete the last crawler. Should never happen.
* `crawl_nondata`: An item in the search results is not data.
* `crawl_prealloc`: An extent item in the search results refers to a `PREALLOC` extent.
* `crawl_push`: An extent item in the search results is suitable for scanning and deduplication.
* `crawl_restart`: A subvol crawl was restarted with a new `min_transid..max_transid` range.
* `crawl_scan`: An extent item in the search results is submitted to `BeesContext::scan_forward` for scanning and deduplication.
* `crawl_search`: A `TREE_SEARCH_V2` ioctl call was successful.
* `crawl_unknown`: An extent item in the search results has an unrecognized type.

dedup
-----

The `dedup` (sic) event group consists of operations that deduplicate data.

* `dedup_bytes`: Total bytes in extent references deduplicated.
* `dedup_copy`: Total bytes copied to eliminate unique data in extents containing a mix of unique and duplicate data.
* `dedup_hit`: Total number of pairs of identical extent references.
* `dedup_miss`: Total number of pairs of non-identical extent references.
* `dedup_ms`: Total time spent running the `FILE_EXTENT_SAME` (aka `FIDEDUPERANGE` or `dedupe_file_range`) ioctl.
* `dedup_prealloc_bytes`: Total bytes in eliminated `PREALLOC` extent references.
* `dedup_prealloc_hit`: Total number of successfully eliminated `PREALLOC` extent references.
* `dedup_prealloc_miss`: Total number of unsuccessfully eliminated `PREALLOC` extent references (i.e. filesystem data changed between scan and dedupe).
* `dedup_try`: Total number of pairs of extent references submitted for deduplication.
* `dedup_workaround_btrfs_send`: Total number of extent reference pairs submitted for deduplication that were discarded to work around `btrfs send` bugs.

exception
---------

The `exception` event group consists of C++ exceptions. C++ exceptions are thrown due to IO errors and internal constraint check failures.

* `exception_caught`: Total number of C++ exceptions thrown and caught by a generic exception handler.
* `exception_caught_silent`: Total number of "silent" C++ exceptions thrown and caught by a generic exception handler. These are exceptions which are part of the correct and normal operation of bees. The exceptions are logged at a lower log level.

hash
----

The `hash` event group consists of operations related to the bees hash table.

* `hash_already`: A `(hash, address)` pair was already present in the hash table during a `BeesHashTable::push_random_hash_addr` operation.
* `hash_bump`: An existing `(hash, address)` pair was moved forward in the hash table by a `BeesHashTable::push_random_hash_addr` operation.
* `hash_collision`: A pair of data blocks was found with identical hashes but different data.
* `hash_erase`: A `(hash, address)` pair in the hash table was removed because a matching data block could not be found in the filesystem (i.e. the hash table entry is out of date).
* `hash_erase_miss`: A `(hash, address)` pair was reported missing from the filesystem but no such entry was found in the hash table (i.e. race between scanning threads or pair already evicted).
* `hash_evict`: A `(hash, address)` pair was evicted from the hash table to accommodate a new hash table entry.
* `hash_extent_in`: A hash table extent was read.
* `hash_extent_out`: A hash table extent was written.
* `hash_front`: A `(hash, address)` pair was pushed to the front of the list because it matched a duplicate block.
* `hash_front_already`: A `(hash, address)` pair was pushed to the front of the list because it matched a duplicate block, but the pair was already at the front of the list so no change occurred.
* `hash_insert`: A `(hash, address)` pair was inserted by `BeesHashTable::push_random_hash_addr`.
* `hash_lookup`: The hash table was searched for `(hash, address)` pairs matching a given `hash`.

inserted
--------

The `inserted` event group consists of operations related to storing hash and address data in the hash table (i.e. the hash table client).

* `inserted_block`: Total number of data block references scanned and inserted into the hash table.
* `inserted_clobbered`: Total number of data block references scanned and eliminated from the filesystem.

matched
-------

The `matched` event group consists of events related to matching incoming data blocks against existing hash table entries.

* `matched_0`: A data block was scanned, hash table entries found, but no matching data blocks on the filesystem located.
* `matched_1_or_more`: A data block was scanned, hash table entries found, and one or more matching data blocks on the filesystem located.
* `matched_2_or_more`: A data block was scanned, hash table entries found, and two or more matching data blocks on the filesystem located.
* `matched_3_or_more`: A data block was scanned, hash table entries found, and three or more matching data blocks on the filesystem located.

open
----

The `open` event group consists of operations related to translating `(root, inode)` tuples into open file descriptors (i.e. `open_by_handle` emulation for btrfs).

* `open_clear`: The open FD cache was cleared to avoid keeping file descriptors open too long.
* `open_fail_enoent`: A file could not be opened because it no longer exists (i.e. it was deleted or renamed during the lookup/resolve operations).
* `open_fail_error`: A file could not be opened for other reasons (e.g. IO error, permission denied, out of resources).
* `open_file`: A file was successfully opened. This counts only the `open()` system call, not other reasons why the opened FD might not be usable.
* `open_hit`: A file was successfully opened and the FD was acceptable.
* `open_ino_ms`: Total time spent executing the `open()` system call.
* `open_lookup_empty`: No paths were found for the inode in the `INO_PATHS` ioctl.
* `open_lookup_enoent`: The `INO_PATHS` ioctl returned ENOENT.
* `open_lookup_error`: The `INO_PATHS` ioctl returned a different error.
* `open_lookup_ok`: The `INO_PATHS` ioctl successfully returned a list of one or more filenames.
* `open_no_path`: All attempts to open a file by `(root, inode)` pair failed.
* `open_no_root`: An attempt to open a file by `(root, inode)` pair failed because the `root` could not be opened.
* `open_root_ms`: Total time spent opening subvol root FDs.
* `open_wrong_dev`: An FD returned by `open()` did not match the device belonging to the filesystem subvol.
* `open_wrong_flags`: An FD returned by `open()` had incompatible flags (`NODATASUM` / `NODATACOW`).
* `open_wrong_ino`: An FD returned by `open()` did not match the expected inode (i.e. the file was renamed or replaced during the lookup/resolve operations).
* `open_wrong_root`: An FD returned by `open()` did not match the expected subvol ID (i.e. `root`).

pairbackward
------------

The `pairbackward` event group consists of events related to extending matching block ranges backward starting from the initial block match found using the hash table.

* `pairbackward_bof_first`: A matching pair of block ranges could not be extended backward because the beginning of the first (src) file was reached.
* `pairbackward_bof_second`: A matching pair of block ranges could not be extended backward because the beginning of the second (dst) file was reached.
* `pairbackward_hit`: A pair of matching block ranges was extended backward by one block.
* `pairbackward_miss`: A pair of matching block ranges could not be extended backward by one block because the pair of blocks before the first block in the range did not contain identical data.
* `pairbackward_ms`: Total time spent extending matching block ranges backward from the first matching block found by hash table lookup.
* `pairbackward_overlap`: A pair of matching block ranges could not be extended backward by one block because this would cause the two block ranges to overlap.
* `pairbackward_same`: A pair of matching block ranges could not be extended backward by one block because this would cause the two block ranges to refer to the same btrfs data extent.
* `pairbackward_stop`: Stopped extending a pair of matching block ranges backward for any of the reasons listed here.
* `pairbackward_toxic_addr`: A pair of matching block ranges was abandoned because the extended range would include a data block with a toxic address.
* `pairbackward_toxic_hash`: A pair of matching block ranges was abandoned because the extended range would include a data block with a toxic hash.
* `pairbackward_try`: Started extending a pair of matching block ranges backward.
* `pairbackward_zero`: A pair of matching block ranges could not be extended backward by one block because the src block contained all zeros and was not compressed.

pairforward
-----------

The `pairforward` event group consists of events related to extending matching block ranges forward starting from the initial block match found using the hash table.

* `pairforward_eof_first`: A matching pair of block ranges could not be extended forward because the end of the first (src) file was reached.
* `pairforward_eof_malign`: A matching pair of block ranges could not be extended forward because the end of the second (dst) file was not aligned to a 4K boundary nor the end of the first (src) file.
* `pairforward_eof_second`: A matching pair of block ranges could not be extended forward because the end of the second (dst) file was reached.
* `pairforward_hit`: A pair of matching block ranges was extended forward by one block.
* `pairforward_hole`: A pair of matching block ranges was extended forward by one block, and the block was a hole in the second (dst) file.
* `pairforward_miss`: A pair of matching block ranges could not be extended forward by one block because the pair of blocks after the last block in the range did not contain identical data.
* `pairforward_ms`: Total time spent extending matching block ranges forward from the first matching block found by hash table lookup.
* `pairforward_overlap`: A pair of matching block ranges could not be extended forward by one block because this would cause the two block ranges to overlap.
* `pairforward_same`: A pair of matching block ranges could not be extended forward by one block because this would cause the two block ranges to refer to the same btrfs data extent.
* `pairforward_stop`: Stopped extending a pair of matching block ranges forward for any of the reasons listed here.
* `pairforward_toxic_addr`: A pair of matching block ranges was abandoned because the extended range would include a data block with a toxic address.
* `pairforward_toxic_hash`: A pair of matching block ranges was abandoned because the extended range would include a data block with a toxic hash.
* `pairforward_try`: Started extending a pair of matching block ranges forward.
* `pairforward_zero`: A pair of matching block ranges could not be extended forward by one block because the src block contained all zeros and was not compressed.

readahead
---------

The `readahead` event group consists of events related to calls to `posix_fadvise`.

* `readahead_ms`: Total time spent running `posix_fadvise(..., POSIX_FADV_WILLNEED)` aka `readahead()`.
* `readahead_unread_ms`: Total time spent running `posix_fadvise(..., POSIX_FADV_DONTNEED)`.

replacedst
----------

The `replacedst` event group consists of events related to replacing a single reference to a dst extent using any suitable src extent (i.e. eliminating a single duplicate extent ref during a crawl).

* `replacedst_dedup_hit`: A duplicate extent reference was identified and removed.
* `replacedst_dedup_miss`: A duplicate extent reference was identified, but src and dst extents did not match (i.e. the filesystem changed in the meantime).
* `replacedst_grown`: A duplicate block was identified, and adjacent blocks were duplicate as well.
* `replacedst_overlaps`: A pair of duplicate block ranges was identified, but the pair was not usable for dedupe because the two ranges overlap.
* `replacedst_same`: A pair of duplicate block ranges was identified, but the pair was not usable for dedupe because the physical block ranges were the same.
* `replacedst_try`: A duplicate block was identified and an attempt was made to remove it (i.e. this is the total number of replacedst calls).

replacesrc
----------

The `replacesrc` event group consists of events related to replacing every reference to a src extent using a temporary copy of the extent's data (i.e. eliminating leftover unique data in a partially duplicate extent during a crawl).

* `replacesrc_dedup_hit`: A duplicate extent reference was identified and removed.
* `replacesrc_dedup_miss`: A duplicate extent reference was identified, but src and dst extents did not match (i.e. the filesystem changed in the meantime).
* `replacesrc_grown`: A duplicate block was identified, and adjacent blocks were duplicate as well.
* `replacesrc_overlaps`: A pair of duplicate block ranges was identified, but the pair was not usable for dedupe because the two ranges overlap.
* `replacesrc_try`: A duplicate block was identified and an attempt was made to remove it (i.e. this is the total number of replacesrc calls).

resolve
-------

The `resolve` event group consists of operations related to translating a btrfs virtual block address (i.e. physical block address) to a `(root, inode, offset)` tuple (i.e. locating and opening the file containing a matching block). `resolve` is the top level, `chase` and `adjust` are the lower two levels.

* `resolve_fail`: The `LOGICAL_INO` ioctl returned an error.
* `resolve_large`: The `LOGICAL_INO` ioctl returned more than 2730 results (the limit of the v1 ioctl).
* `resolve_ms`: Total time spent in the `LOGICAL_INO` ioctl (i.e. wallclock time, not kernel CPU time).
* `resolve_ok`: The `LOGICAL_INO` ioctl returned success.
* `resolve_toxic`: The `LOGICAL_INO` ioctl took more than 0.1 seconds of kernel CPU time.

root
----

The `root` event group consists of operations related to translating a btrfs root ID (i.e. subvol ID) into an open file descriptor by navigating the btrfs root tree.

* `root_clear`: The root FD cache was cleared.
* `root_found`: A root FD was successfully opened.
* `root_notfound`: A root FD could not be opened because all candidate paths could not be opened, or there were no paths available.
* `root_ok`: A root FD was opened and its correctness verified.
* `root_open_fail`: A root FD `open()` attempt returned an error.
* `root_parent_open_fail`: A recursive call to open the parent of a subvol failed.
* `root_parent_open_ok`: A recursive call to open the parent of a subvol succeeded.
* `root_parent_open_try`: A recursive call to open the parent of a subvol was attempted.
* `root_parent_path_empty`: No path could be found to connect a parent root FD to its child.
* `root_parent_path_fail`: The `INO_PATH` ioctl failed to find a name for a child subvol relative to its parent.
* `root_parent_path_open_fail`: The `open()` call in a recursive call to open the parent of a subvol returned an error.
* `root_workaround_btrfs_send`: A subvol was determined to be read-only and disabled to implement the btrfs send workaround.

scan
----

The `scan` event group consists of operations related to scanning incoming data. This is where bees finds duplicate data and populates the hash table.

* `scan_blacklisted`: A blacklisted extent was passed to `scan_forward` and dropped.
* `scan_block`: A block of data was scanned.
* `scan_bump`: After deduping a block range, the scan pointer had to be moved past the end of the deduped byte range.
* `scan_dup_block`: Number of duplicate blocks deduped.
* `scan_dup_hit`: A pair of duplicate block ranges was found and removed.
* `scan_dup_miss`: A pair of duplicate blocks was found in the hash table but not in the filesystem.
* `scan_eof`: Scan past EOF was attempted.
* `scan_erase_redundant`: Blocks in the hash table were removed because they were removed from the filesystem by dedupe.
* `scan_extent`: An extent was scanned (`scan_one_extent`).
* `scan_forward`: A logical byte range was scanned (`scan_forward`).
* `scan_found`: An entry was found in the hash table matching a scanned block from the filesystem.
* `scan_hash_hit`: A block was found on the filesystem corresponding to a block found in the hash table.
* `scan_hash_miss`: A block was not found on the filesystem corresponding to a block found in the hash table.
* `scan_hash_preinsert`: A block was prepared for insertion into the hash table.
* `scan_hole`: A hole extent was found during scan and ignored.
* `scan_interesting`: An extent had flags that were not recognized by bees and was ignored.
* `scan_lookup`: A hash was looked up in the hash table.
* `scan_malign`: A block being scanned matched a hash at EOF in the hash table, but the EOF was not aligned to a block boundary and the two blocks did not have the same length.
* `scan_no_fd`: References to a block from the hash table were found, but an FD could not be opened.
* `scan_no_rewrite`: All blocks in an extent were removed by dedupe (i.e. no copies).
* `scan_push_front`: An entry in the hash table matched a duplicate block, so the entry was moved to the head of its LRU list.
* `scan_reinsert`: A copied block's hash and block address was inserted into the hash table.
* `scan_resolve_hit`: A block address in the hash table was successfully resolved to an open FD and offset pair.
* `scan_resolve_zero`: A block address in the hash table was not resolved to any subvol/inode pair, so the corresponding hash table entry was removed.
* `scan_rewrite`: A range of bytes in a file was copied, then the copy deduped over the original data.
* `scan_toxic_hash`: A scanned block has the same hash as a hash table entry that is marked toxic.
* `scan_toxic_match`: A hash table entry points to a block that is discovered to be toxic.
* `scan_twice`: Two references to the same block have been found in the hash table.
* `scan_zero_compressed`: An extent that was compressed and contained only zero bytes was found.
* `scan_zero_uncompressed`: A block that contained only zero bytes was found in an uncompressed extent.

scanf
-----

The `scanf` event group consists of operations related to `BeesContext::scan_forward`. This is the entry point where `crawl` schedules new data for scanning.

* `scanf_extent`: A btrfs extent item was scanned.
* `scanf_extent_ms`: Total thread-seconds spent scanning btrfs extent items.
* `scanf_total`: A logical byte range of a file was scanned.
* `scanf_total_ms`: Total thread-seconds spent scanning logical byte ranges.

Note that in current versions of bees, `scan_forward` is passed extents
that correspond exactly to btrfs extent items, so the `scanf_extent` and
`scanf_total` numbers can only be different if the filesystem changes
between crawl time and scan time.

sync
----

The `sync` event group consists of operations related to the `fsync` workarounds in bees.

* `sync_count`: `fsync()` was called on a temporary file.
* `sync_ms`: Total time spent executing `fsync()`.

tmp
---

The `tmp` event group consists of operations related to temporary files and the data within them.

* `tmp_aligned`: A temporary extent was allocated on a block boundary.
* `tmp_block`: Total number of temporary blocks copied.
* `tmp_block_zero`: Total number of temporary hole blocks copied.
* `tmp_bytes`: Total number of temporary bytes copied.
* `tmp_copy`: Total number of extents copied.
* `tmp_copy_ms`: Total time spent copying extents.
* `tmp_create`: Total number of temporary files created.
* `tmp_create_ms`: Total time spent creating temporary files.
* `tmp_hole`: Total number of hole extents created.
* `tmp_realign`: A temporary extent was not aligned to a block boundary.
* `tmp_resize`: A temporary file was resized with `ftruncate()`.
* `tmp_resize_ms`: Total time spent in `ftruncate()`.
* `tmp_trunc`: The temporary file size limit was exceeded, triggering a new temporary file creation.

268 docs/gotchas.md Normal file
@@ -0,0 +1,268 @@

bees Gotchas
============

C++ Exceptions
--------------

bees is very paranoid about the data it gets from btrfs, and if btrfs
does anything bees does not expect, bees will throw an exception and move
on without touching the offending data. This will trigger a stack trace
to the log containing data which is useful for developers to understand
what happened.

In all cases C++ exceptions in bees are harmless to data on the
filesystem. bees handles most exceptions by aborting processing of
the current extent and moving to the next extent. In some cases an
exception may occur in a critical bees thread, which will stop the bees
process from making any further progress; however, these cases are rare
and are typically caused by unusual filesystem conditions (e.g. [freshly
formatted filesystem with no
data](https://github.com/Zygo/bees/issues/93)) or lack of memory or
other resources.

The following are common cases that users may encounter:

* If a snapshot is deleted, bees will generate a burst of exceptions for
  references to files in the snapshot that no longer exist. This lasts
  until the FD caches are cleared, usually a few minutes with default
  btrfs mount options. These generally look like:

  `std::system_error: BTRFS_IOC_TREE_SEARCH_V2: [path] at fs.cc:844: No such file or directory`

* If data is modified at the same time it is being scanned, bees will get
  an inconsistent version of the data layout in the filesystem, causing
  the `ExtentWalker` class to throw various constraint-check exceptions.
  The exception causes bees to retry the extent in a later filesystem scan
  (hopefully when the file is no longer being modified). The exception
  text is similar to:

  `std::runtime_error: fm.rbegin()->flags() = 776 failed constraint check (fm.rbegin()->flags() & FIEMAP_EXTENT_LAST) at extentwalker.cc:229`

  but the line number or specific code fragment may vary.

* If there are too many possible matching blocks within a pair of extents,
  bees will loop billions of times considering all possibilities. This is
  a waste of time, so an exception is currently used to break out of such
  loops early. The exception text in this case is:

  `FIXME: bailing out here, need to fix this further up the call stack`

Terminating bees with SIGTERM
-----------------------------

bees is designed to survive host crashes, so it is safe to terminate
bees using SIGKILL; however, when bees next starts up, it will repeat
some work that was performed between the last bees crawl state save point
and the SIGKILL (up to 15 minutes). If bees is stopped and started less
than once per day, then this is not a problem as the proportional impact
is quite small; however, users who stop and start bees daily or even
more often may prefer to have a clean shutdown with SIGTERM so bees can
restart faster.

bees' handling of SIGTERM can take a long time on machines with some or
all of:

* Large RAM and `vm.dirty_ratio`
* Large number of active bees worker threads
* Large number of bees temporary files (proportional to thread count)
* Large hash table size
* Large filesystem size
* High IO latency, especially "low power" spinning disks
* High filesystem activity, especially duplicate data writes

Each of these factors individually increases the total time required
to perform a clean bees shutdown. When combined, the factors can
multiply with each other, dramatically increasing the time required to
flush bees state to disk.

On a large system with many of the above factors present, a "clean"
bees shutdown can take more than 20 minutes. Even a small machine
(16GB RAM, 1GB hash table, 1TB NVME disk) can take several seconds to
complete a SIGTERM shutdown.

The shutdown procedure performs potentially long-running tasks in
this order:

1. Worker threads finish executing their current Task and exit.
   Threads executing `LOGICAL_INO` ioctl calls usually finish quickly,
   but btrfs imposes no limit on the ioctl's running time, so it
   can take several minutes in rare bad cases. If there is a btrfs
   commit already in progress on the filesystem, then most worker
   threads will be blocked until the btrfs commit is finished.

2. Crawl state is saved to `$BEESHOME`. This normally completes
   relatively quickly (a few seconds at most). This is the most
   important bees state to save to disk as it directly impacts
   restart time, so it is done as early as possible (but no earlier).

3. The hash table is written to disk. Normally the hash table is
   trickled back to disk at a rate of about 2GB per hour;
   however, SIGTERM causes bees to attempt to flush the whole table
   immediately. If bees has recently been idle then the hash table is
   likely already flushed to disk, so this step will finish quickly;
   however, if bees has recently been active and the hash table is
   large relative to RAM size, the blast of rapidly written data
   can force the Linux VFS to block all writes to the filesystem
   for sufficient time to complete all pending btrfs metadata
   writes which accumulated during the btrfs commit before bees
   received SIGTERM...and _then_ let bees write out the hash table.
   The time spent here depends on the size of RAM, the speed of the
   disks, and the aggressiveness of competing filesystem workloads.

4. bees temporary files are closed, which implies deletion of their
   inodes. These are files which consist entirely of shared extent
   structures, and btrfs takes an unusually long time to delete such
   files (up to a few minutes for each on slow spinning disks).

If bees is terminated with SIGKILL, only steps #1 and #4 are performed (the
kernel performs these automatically if bees exits). This reduces the
shutdown time at the cost of increased startup time.
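
A sketch of a patient shutdown under these constraints (the 30-minute
timeout and the use of `pidof` are arbitrary placeholders):

    #!/bin/sh
    # Ask bees to shut down cleanly; fall back to SIGKILL after 30 minutes
    # (safe, but repeats up to ~15 minutes of work on the next startup).
    BEES_PID="$(pidof bees)"
    kill -TERM $BEES_PID
    i=0
    while kill -0 $BEES_PID 2>/dev/null; do
        i=$((i + 1))
        [ "$i" -ge 1800 ] && { kill -KILL $BEES_PID; break; }
        sleep 1
    done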

Balances
--------

First, read [`LOGICAL_INO` and btrfs balance WARNING](btrfs-kernel.md).
bees will suspend operations during a btrfs balance to work around
kernel bugs.

A btrfs balance relocates data on disk by making a new copy of the
data, replacing all references to the old data with references to the
new copy, and deleting the old copy. To bees, this is the same as any
other combination of new and deleted data (e.g. from defrag, or ordinary
file operations): some new data has appeared (to be scanned) and some
old data has disappeared (to be removed from the hash table when it is
detected).

As bees scans the newly balanced data, it will get hits on the hash
table pointing to the old data (it's identical data, so it would look
like a duplicate). These old hash table entries will no longer be valid,
so when bees tries to compare new data with old data, it will not
be able to find the old data at the old address, and bees will delete
the hash table entries. If no other duplicates are found, bees will
then insert new hash table entries pointing to the new data locations.
The erase is performed before the insert, so the new data simply replaces
the old and there is (little or) no impact on hash table entry lifetimes
(depending on how overcommitted the hash table is). Each block is
processed one at a time, which can be slow if there are many of them.

Routine btrfs maintenance balances rarely need to relocate more than 0.1%
of the total filesystem data, so the impact on bees is small even after
taking into account the extra work bees has to do.

If the filesystem must undergo a full balance (e.g. because disks were
added or removed, or to change RAID profiles), then every data block on
the filesystem will be relocated to a new address, which invalidates all
the data in the bees hash table at once. In such cases it is a good idea to:

1. Stop bees before the full balance starts,
2. Wipe the `$BEESHOME` directory (or delete and recreate `beeshash.dat`),
3. Restart bees after the full balance is finished.

bees will perform a full filesystem scan automatically after the balance
since all the data has "new" btrfs transids. bees won't waste any time
invalidating stale hash table data after the balance if the hash table
is empty. This can considerably improve the performance of both bees
(since it has no stale hash table entries to invalidate) and btrfs balance
(since it's not competing with bees for iops). A sketch of these steps
follows below.
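
In the sketch, the service name, hash table size, and mount point are
placeholders for your own setup:

    # Full-balance procedure with bees stopped and a fresh hash table
    systemctl stop beesd@UUID.service          # 1. stop bees first
    rm -f "$BEESHOME/beeshash.dat"             # 2. drop the soon-to-be-stale table
    btrfs balance start --full-balance /mnt/data
    truncate -s 1G "$BEESHOME/beeshash.dat"    #    recreate at the old size
    chmod 700 "$BEESHOME/beeshash.dat"
    systemctl start beesd@UUID.service         # 3. restart bees when done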
|
||||
|
||||
Snapshots
|
||||
---------
|
||||
|
||||
bees can dedupe filesystems with many snapshots, but bees only does
|
||||
well in this situation if bees was running on the filesystem from
|
||||
the beginning.
|
||||
|
||||
Each time bees dedupes an extent that is referenced by a snapshot,
|
||||
the entire metadata page in the snapshot subvol (16KB by default) must
|
||||
be CoWed in btrfs. This can result in a substantial increase in btrfs
|
||||
metadata size if there are many snapshots on a filesystem.
|
||||
|
||||
Normally, metadata is small (less than 1% of the filesystem) and dedupe
|
||||
hit rates are large (10-40% of the filesystem), so the increase in
|
||||
metadata size is offset by much larger reductions in data size and the
|
||||
total space used by the entire filesystem is reduced.
|
||||
|
||||
If a subvol is deduped _before_ a snapshot is created, the snapshot will
|
||||
have the same deduplication as the subvol. This does _not_ result in
|
||||
unusually large metadata sizes. If a snapshot is made after bees has
|
||||
fully scanned the origin subvol, bees can avoid scanning most of the
|
||||
data in the snapshot subvol, as it will be provably identical to the
|
||||
origin subvol that was already scanned.
|
||||
|
||||
If a subvol is deduped _after_ a snapshot is created, the origin and
|
||||
snapshot subvols must be deduplicated separately. In the worst case, this
|
||||
will double the amount of reading the bees scanner must perform, and will
|
||||
also double the amount of btrfs metadata used for the snapshot; however,
|
||||
the "worst case" is a dedupe hit rate of 1% or more, so a doubling of
|
||||
metadata size is certain for all but the most unique data sets. Also,
|
||||
bees will not be able to free any space until the last snapshot has been
|
||||
scanned and deduped, so payoff in data space savings is deferred until
|
||||
the metadata has almost finished expanding.

If a subvol is deduped after _many_ snapshots have been created, all
subvols must be deduplicated individually. In the worst case, this will
multiply the scanning work and metadata size by the number of snapshots.
For 100 snapshots this can mean a 100x growth in metadata size and
bees scanning time, which typically exceeds the possible savings from
reducing the data size by dedupe. In such cases using bees will result
in a net increase in disk space usage that persists until the snapshots
are deleted.

Snapshot case studies
---------------------

* bees running on an empty filesystem
  * filesystem is mkfsed
  * bees is installed and starts running
  * data is written to the filesystem
  * bees dedupes the data as it appears
  * a snapshot is made of the data
    * The snapshot will already be 99% deduped, so the metadata will
      not expand very much because only 1% of the data in the snapshot
      must be deduped.
  * more snapshots are made of the data
    * as long as dedupe has been completed on the origin subvol,
      bees will quickly scan each new snapshot because it can skip
      all the previously scanned data. Metadata usage remains low
      (it may even shrink because there are fewer csums).

* bees installed on a non-empty filesystem with snapshots
  * filesystem is mkfsed
  * data is written to the filesystem
  * multiple snapshots are made of the data
  * bees is installed and starts running
  * bees dedupes each snapshot individually
    * The snapshot metadata will no longer be shared, resulting in
      substantial growth of metadata usage.
    * Disk space savings do not occur until bees processes the
      last snapshot reference to data.

Other Gotchas
-------------

* bees avoids the [slow backrefs kernel bug](btrfs-kernel.md) by
  measuring the time required to perform `LOGICAL_INO` operations.
  If an extent requires over 0.1 kernel CPU seconds to perform a
  `LOGICAL_INO` ioctl, then bees blacklists the extent and avoids
  referencing it in future operations. In most cases, fewer than 0.1%
  of extents in a filesystem must be avoided this way. This results
  in short write latency spikes, as btrfs will not allow writes to the
  filesystem while `LOGICAL_INO` is running. Generally the CPU spends
  most of the runtime of the `LOGICAL_INO` ioctl in the kernel,
  so on a single-core CPU the entire system can freeze up for a second
  during operations on toxic extents.

* If a process holds a directory FD open, the subvol containing the
  directory cannot be deleted (`btrfs sub del` will start the deletion
  process, but it will not proceed past the first open directory FD).
  `btrfs-cleaner` will simply skip over the directory *and all of its
  children* until the FD is closed. bees avoids this gotcha by closing
  all of the FDs in its directory FD cache every 10 btrfs transactions.
  (A diagnostic one-liner for this situation follows this list.)

* If a file is deleted while bees is caching an open FD to the file,
  bees continues to scan the file. For very large files (e.g. VM
  images), the deletion of the file can be delayed indefinitely.
  To limit this delay, bees closes all FDs in its file FD cache every
  10 btrfs transactions.
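
For the open-directory case above, a quick way to see which processes
are holding FDs inside a subvol that won't finish deleting is `lsof`
(the path is a placeholder):

    # list processes with open files anywhere under the stuck subvol
    lsof +D /path/to/stuck/subvol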

docs/how-it-works.md (new file, 100 lines)
@@ -0,0 +1,100 @@

How bees Works
--------------

bees is a daemon designed to run continuously and maintain its state
across crashes and reboots.

bees uses checkpoints for persistence to eliminate the IO overhead of a
transactional data store. On restart, bees will dedupe any data that
was added to the filesystem since the last checkpoint. Checkpoints
occur every 15 minutes for scan progress, stored in `beescrawl.dat`.
The hash table trickle-writes to disk at 4GB/hour to `beeshash.dat`.
An hourly performance report is written to `beesstats.txt`. There are
no special requirements for bees hash table storage--`.beeshome` could
be stored on a different btrfs filesystem, ext4, or even CIFS.
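
So a populated `$BEESHOME` simply contains the three files named above,
e.g.:

    $ ls "$BEESHOME"
    beescrawl.dat  beeshash.dat  beesstats.txt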

bees uses a persistent dedupe hash table with a fixed size configured
by the user. Any size of hash table can be dedicated to dedupe. If a
fast dedupe with low hit rate is desired, bees can use a hash table as
small as 128KB.
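
Since the hash table is an ordinary preallocated file, choosing its size
is a single `truncate` call; the size must be a multiple of 128KB (both
sizes below are only examples):

    # smallest possible hash table: fast, low hit rate
    truncate -s 128k "$BEESHOME/beeshash.dat"

    # a larger table for better hit rates on big filesystems
    truncate -s 1g "$BEESHOME/beeshash.dat"
    chmod 700 "$BEESHOME/beeshash.dat"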

The bees hash table is loaded into RAM at startup and `mlock`ed so it
will not be swapped out by the kernel (if swap is permitted, performance
degrades to nearly zero).

bees scans the filesystem in a single pass which removes duplicate
extents immediately after they are detected. There are no distinct
scanning and dedupe phases, so bees can start recovering free space
immediately after startup.

Once a filesystem scan has been completed, bees uses the `min_transid`
parameter of the `TREE_SEARCH_V2` ioctl to avoid rescanning old data
on future scans and to quickly scan new data. An incremental data scan
can complete in less than a millisecond on an idle filesystem.

Once a duplicate data block is identified, bees examines the nearby
blocks in the files where the matched block appears. This allows bees
to find long runs of adjacent duplicate block pairs if it has an entry
for any one of the blocks in its hash table. On typical data sets,
this means most of the blocks in the hash table are redundant and can
be discarded without significant impact on the dedupe hit rate.

Hash table entries are grouped together into LRU lists. As each block
is scanned, its hash table entry is inserted into the LRU list at a
random position. If the LRU list is full, the entry at the end of the
list is deleted. If a hash table entry is used to discover duplicate
blocks, the entry is moved to the beginning of the list. This makes bees
unable to detect a small number of duplicates, but it dramatically
improves efficiency on filesystems with many small files.

Once the hash table fills up, old entries are evicted by new entries.
This means that the optimum hash table size is determined by the
distance between duplicate blocks on the filesystem rather than the
filesystem's unique data size. Even if the hash table is too small
to find all duplicates, it may still find _most_ of them, especially
during incremental scans, where the data in many workloads tends to be
more similar.

When a duplicate block pair is found in two btrfs extents, bees will
attempt to match all other blocks in the newer extent with blocks in
the older extent (i.e. the goal is to keep the extent referenced in the
hash table and remove the most recently scanned extent). If this is
possible, then the new extent will be replaced with a reference to the
old extent. If this is not possible, then bees will create a temporary
copy of the unmatched data in the new extent so that the entire new
extent can be removed by deduplication. This must be done because btrfs
cannot partially overwrite extents--the _entire_ extent must be replaced.
The temporary copy is then scanned during the next pass bees makes over
the filesystem for potential duplication of other extents.

When a block containing all-zero bytes is found, bees dedupes the extent
against a temporary file containing a hole, possibly creating temporary
copies of any non-zero data in the extent for later deduplication as
described above. If the extent is compressed, bees avoids splitting
the extent in the middle, as this generally has a negative impact on the
compression ratio (and also triggers a [kernel bug](btrfs-kernel.md)).

bees does not store any information about filesystem structure, so its
performance is linear in the number or size of files. The hash
table stores physical block numbers which are converted into paths
and FDs on demand through the btrfs `SEARCH_V2` and `LOGICAL_INO` ioctls.
This eliminates the storage required to maintain the equivalents
of these functions in userspace, at the expense of encountering [some
kernel bugs in `LOGICAL_INO` performance](btrfs-kernel.md).

bees uses only the data-safe `FILE_EXTENT_SAME` (aka `FIDEDUPERANGE`)
kernel operations to manipulate user data, so it can dedupe live data
(e.g. build servers, sqlite databases, VM disk images). It does not
modify file attributes or timestamps.

When bees has scanned all of the data, bees will pause until 10
transactions have been completed in the btrfs filesystem. bees tracks
the current btrfs transaction ID over time so that it polls less often
on quiescent filesystems and more often on busy filesystems.

Scanning and deduplication work is performed by worker threads. If the
[`--loadavg-target` option](options.md) is used, bees adjusts the number
of worker threads up or down as required to have a user-specified load
impact on the system. The maximum and minimum number of threads is
configurable. If the system load is too high then bees will stop until
the load falls to acceptable levels.

docs/index.md (new file, 75 lines)
@@ -0,0 +1,75 @@

BEES
====

Best-Effort Extent-Same, a btrfs deduplication agent.

About bees
----------

bees is a block-oriented userspace deduplication agent designed for large
btrfs filesystems. It is an offline dedupe combined with an incremental
data scan capability to minimize the time data spends on disk from write
to dedupe.

Strengths
---------

* Space-efficient hash table and matching algorithms - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB)
* Daemon incrementally dedupes new data using btrfs tree search
* Works with btrfs compression - dedupe any combination of compressed and uncompressed files
* **NEW** [Works around `btrfs send` problems with dedupe and incremental parent snapshots](options.md)
* Works around btrfs filesystem structure to free more disk space
* Persistent hash table for rapid restart after shutdown
* Whole-filesystem dedupe - including snapshots
* Constant hash table size - no increased RAM usage if the data set becomes larger
* Works on live data - no scheduled downtime required
* Automatic self-throttling based on system load

Weaknesses
----------

* Whole-filesystem dedupe - has no include/exclude filters, does not accept file lists
* Requires root privilege (or `CAP_SYS_ADMIN`)
* First run may require temporary disk space for extent reorganization
* [First run may increase metadata space usage if many snapshots exist](gotchas.md)
* Constant hash table size - no decreased RAM usage if the data set becomes smaller
* btrfs only

Installation and Usage
----------------------

* [Installation](install.md)
* [Configuration](config.md)
* [Running](running.md)
* [Command Line Options](options.md)

Recommended Reading
-------------------

* [bees Gotchas](gotchas.md)
* [btrfs kernel bugs](btrfs-kernel.md) - especially DATA CORRUPTION WARNING
* [bees vs. other btrfs features](btrfs-other.md)
* [What to do when something goes wrong](wrong.md)

More Information
----------------

* [How bees works](how-it-works.md)
* [Missing bees features](missing.md)
* [Event counter descriptions](event-counters.md)

Bug Reports and Contributions
-----------------------------

Email bug reports and patches to Zygo Blaxell <bees@furryterror.org>.

You can also use Github:

    https://github.com/Zygo/bees

Copyright & License
-------------------

Copyright 2015-2018 Zygo Blaxell <bees@furryterror.org>.

GPL (version 3 or later).

docs/install.md (new file, 91 lines)
@@ -0,0 +1,91 @@

Building bees
=============

Dependencies
------------

* C++11 compiler (tested with GCC 4.9, 6.3.0, 8.1.0)

  Sorry. I really like closures and shared_ptr, so support
  for earlier compiler versions is unlikely.

  Note that the C++ standard--and GCC's implementation of it--is evolving.
  There may be problems when building with newer compiler versions.
  Build failure reports welcome!

* btrfs-progs

  Needed at runtime by the service wrapper script.

* [Linux kernel version](btrfs-kernel.md) gets its own page.

* markdown for documentation

* a util-linux version that provides the `blkid` command, needed by the
  helper script `scripts/beesd`

Installation
============

bees can be installed by following one of these instructions:

Arch package
------------

bees is available for Arch Linux in the community repository. Install with:

`$ pacman -S bees`

or build a live version from git master using AUR:

`$ git clone https://aur.archlinux.org/bees-git.git && cd bees-git && makepkg -si`

Gentoo package
--------------

bees is officially available in Gentoo Portage. Just emerge a stable
version:

`$ emerge --ask bees`

or build a live version from git master:

`$ emerge --ask =bees-9999`

You can opt out of building the support tools with

`USE="-tools" emerge ...`

If you want to start hacking on bees and contribute changes, just emerge
the live version, which automatically pulls in all required development
packages.

Build from source
-----------------

Build with `make`. The build produces `bin/bees`, which must be copied
to somewhere in `$PATH` on the target system.

It will also generate `scripts/beesd@.service` for systemd users. This
service makes use of a helper script `scripts/beesd` to boot the service.
Both of the latter use the filesystem UUID to mount the root subvolume
within a temporary runtime directory.

### Ubuntu 16.04 - 17.04:
`$ apt -y install build-essential btrfs-tools markdown && make`

### Ubuntu 18.10:
`$ apt -y install build-essential btrfs-progs markdown && make`

Packaging
---------

See 'Dependencies' above. Package maintainers can pick ideas for building and
configuring the source package from the Gentoo ebuild:

<https://github.com/gentoo/gentoo/tree/master/sys-fs/bees>

You can configure some build options by creating a file `localconf` and
adjusting settings for your distribution environment there.
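
For instance, a `localconf` overriding the install locations might be
created like this (`LIBEXEC_PREFIX` is declared in the Makefile; the
value shown is only an example):

    # write a localconf with local build settings, then build
    cat > localconf <<'EOF'
    LIBEXEC_PREFIX=/usr/lib/bees
    EOF
    make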

Please also review the Makefile for additional hints.

docs/missing.md (new file, 45 lines)
@@ -0,0 +1,45 @@

Features You Might Expect That bees Doesn't Have
------------------------------------------------

* There's no configuration file (patches welcome!). There are
  some tunables hardcoded in the source that could eventually become
  configuration options. There's also an incomplete option parser
  (patches welcome!).

* The bees process doesn't fork and writes its log to stdout/stderr.
  A shell wrapper is required to make it behave more like a daemon.

* There's no facility to exclude any part of a filesystem or focus on
  specific files (patches welcome).

* PREALLOC extents and extents containing blocks filled with zeros will
  be replaced by holes. There is no way to turn this off.

* Consecutive runs of duplicate blocks that are less than 12K in length
  can take 30% of the processing time while saving only 3% of the disk
  space. There should be an option to just not bother with those, but it's
  complicated by the btrfs requirement to always dedupe complete extents.

* There is a lot of duplicate reading of blocks in snapshots. bees will
  scan all snapshots at close to the same time to try to get better
  performance by caching, but really fixing this requires rewriting the
  crawler to scan the btrfs extent tree directly instead of the subvol
  FS trees.

* Block reads are currently more allocation- and CPU-intensive than they
  should be, especially for filesystems on SSD where the IO overhead is
  much smaller. This is a problem for CPU-power-constrained environments
  (e.g. laptops running from battery, or ARM devices with slow CPUs).

* bees can currently fragment extents when required to remove duplicate
  blocks, but has no defragmentation capability yet. When possible, bees
  will attempt to work with existing extent boundaries, but it will not
  aggregate blocks together from multiple extents to create larger ones.

* When bees fragments an extent, the copied data is compressed. There
  is currently no way (other than by modifying the source) to select a
  compression method or not compress the data (patches welcome!).

* It is theoretically possible to resize the hash table without starting
  over with a new full-filesystem scan; however, this feature has not been
  implemented yet.

docs/options.md (new file, 92 lines)
@@ -0,0 +1,92 @@

# bees Command Line Options

## Load management options

* `--thread-count COUNT` or `-c`

  Specify the maximum number of worker threads. Overrides `--thread-factor`
  (`-C`), default/autodetected values, and the hardcoded thread limit.

* `--thread-factor FACTOR` or `-C`

  Specify the ratio of worker threads to detected CPU cores. Overridden by
  `--thread-count` (`-c`).

  Default is 1.0, i.e. 1 worker thread per detected CPU. Use values
  below 1.0 to leave some cores idle, or above 1.0 if there are more
  disks than CPUs in the filesystem.

* `--loadavg-target LOADAVG` or `-g`

  Specify the load average target for dynamic worker threads (see the
  combined example at the end of this section). Default is
  to run the maximum number of worker threads all the time.

  Worker threads will be started or stopped subject to the upper limit
  imposed by `--thread-factor`, `--thread-min` and `--thread-count`
  until the load average is within +/- 0.5 of `LOADAVG`.

* `--thread-min COUNT` or `-G`

  Specify the minimum number of dynamic worker threads. This can be used
  to force a minimum number of threads to continue running while using
  `--loadavg-target` to manage load.

  Default is 0, i.e. all bees worker threads will stop when the system
  load exceeds the target.

  Has no effect unless `--loadavg-target` is used to specify a target load.
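
As a combined example, the following hypothetical invocation keeps the
load average near 5 while never dropping below 2 worker threads (the
mount point is a placeholder):

    bees --loadavg-target 5 --thread-min 2 /var/lib/bees/$UUID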

## Filesystem tree traversal options

* `--scan-mode MODE` or `-m`

  Specify the extent scanning algorithm. Default `MODE` is 0.
  **EXPERIMENTAL** feature that may go away. An example invocation
  follows the mode list.

  * Mode 0: scan extents in ascending order of (inode, subvol, offset).
    Keeps shared extents between snapshots together. Reads files sequentially.
    Minimizes temporary space usage.
  * Mode 1: scan extents from all subvols in parallel. Good performance
    on non-spinning media when subvols are unrelated.
  * Mode 2: scan all extents from one subvol at a time. Good sequential
    read performance for spinning media. Maximizes temporary space usage.
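
For example, a hypothetical invocation selecting mode 2 for a filesystem
on a single spinning disk (the path is a placeholder):

    bees --scan-mode 2 /var/lib/bees/$UUID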

## Workarounds

* `--workaround-btrfs-send` or `-a`

  Pretend that read-only snapshots are empty and silently discard any
  request to dedupe files referenced through them. This is a workaround for
  [problems with the kernel implementation of `btrfs send` and `btrfs send
  -p`](btrfs-kernel.md) which make these btrfs features unusable with bees.

  This option should be used to avoid breaking `btrfs send` on the same
  filesystem.

  **Note:** There is a _significant_ space tradeoff when using this option:
  it is likely no space will be recovered--and possibly significant extra
  space used--until the read-only snapshots are deleted. On the other
  hand, if snapshots are rotated frequently then bees will spend less time
  scanning them.

## Logging options

* `--timestamps` or `-t`

  Enable timestamps in log output.

* `--no-timestamps` or `-T`

  Disable timestamps in log output.

* `--absolute-paths` or `-p`

  Paths in log output will be absolute.

* `--strip-paths` or `-P`

  Paths in log output will have the working directory at bees startup stripped.

* `--verbose` or `-v`

  Set log verbosity (0 = no output, 8 = all output, default 8).

docs/running.md (new file, 92 lines)
@@ -0,0 +1,92 @@

Running bees
============

Setup
-----

If you don't want to use the helper script `scripts/beesd` to set up and
configure bees, here's how to set up bees manually.

Create a directory for bees state files:

    export BEESHOME=/some/path
    mkdir -p "$BEESHOME"

Create an empty hash table ([your choice of size](config.md), but it
must be a multiple of 128KB). This example creates a 1GB hash table:

    truncate -s 1g "$BEESHOME/beeshash.dat"
    chmod 700 "$BEESHOME/beeshash.dat"

bees can _only_ process the root subvol of a btrfs filesystem with nothing
mounted over top. If the bees argument is not the root subvol directory,
bees will just throw an exception and stop.

Use a separate mount point, and let only bees access it:

    UUID=3399e413-695a-4b0b-9384-1b0ef8f6c4cd
    mkdir -p /var/lib/bees/$UUID
    mount /dev/disk/by-uuid/$UUID /var/lib/bees/$UUID -osubvol=/

If you don't set BEESHOME, the path "`.beeshome`" will be used relative
to the root subvol of the filesystem. For example:

    btrfs sub create /var/lib/bees/$UUID/.beeshome
    truncate -s 1g /var/lib/bees/$UUID/.beeshome/beeshash.dat
    chmod 700 /var/lib/bees/$UUID/.beeshome/beeshash.dat

You can use any relative path in `BEESHOME`. The path will be taken
relative to the root of the deduped filesystem (in other words it can
be the name of a subvol):

    export BEESHOME=@my-beeshome
    btrfs sub create /var/lib/bees/$UUID/$BEESHOME
    truncate -s 1g /var/lib/bees/$UUID/$BEESHOME/beeshash.dat
    chmod 700 /var/lib/bees/$UUID/$BEESHOME/beeshash.dat

Configuration
-------------

There are some runtime-configurable options using environment variables:

* BEESHOME: Directory containing bees state files:
  * beeshash.dat | persistent hash table. Must be a multiple of 128KB, and must be created before bees starts.
  * beescrawl.dat | state of SEARCH_V2 crawlers. ASCII text. bees will create this.
  * beesstats.txt | statistics and performance counters. ASCII text. bees will create this.
* BEESSTATUS: File containing a snapshot of current bees state: performance
  counters and current status of each thread. The file is meant to be
  human readable, but understanding it probably requires reading the source.
  You can watch bees run in realtime with a command like:

      watch -n1 cat $BEESSTATUS

Other options (e.g. the interval between filesystem crawls) can be configured
in `src/bees.h` or [on the command line](options.md).

Running
-------

Reduce CPU and IO priority to be kinder to other applications sharing
this host (or raise them for more aggressive disk space recovery). If you
use cgroups, put `bees` in its own cgroup, then reduce the `blkio.weight`
and `cpu.shares` parameters. You can also use `schedtool` and `ionice`
in the shell script that launches `bees`:

    schedtool -D -n20 $$
    ionice -c3 -p $$

You can also use the [`--loadavg-target` and `--thread-min`
options](options.md) to further control the impact of bees on the rest
of the system.

Let the bees fly:

    for fs in /var/lib/bees/*-*-*-*-*/; do
        bees "$fs" >> "$fs/.beeshome/bees.log" 2>&1 &
    done

You'll probably want to arrange for the log files to be rotated
periodically. You may also want to set umask to 077 to prevent disclosure
of information about the contents of the filesystem through the log file.

There are also some shell wrappers in the `scripts/` directory.

docs/wrong.md (new file, 168 lines)
@@ -0,0 +1,168 @@

What to do when something goes wrong with bees
==============================================

Hangs and excessive slowness
----------------------------

### Are you using qgroups or autodefrag?

Read about [bad btrfs feature interactions](btrfs-other.md).

### Use load-throttling options

If bees is just more aggressive than you would like, consider using the
[load throttling options](options.md). These are usually more effective
than `ionice`, `schedtool`, and the `blkio` cgroup (though you can
certainly use those too).

### Check `$BEESSTATUS`

If bees or the filesystem seems to be stuck, check the contents of
`$BEESSTATUS`. bees describes what it is doing (and how long it has
been trying to do it) through this file.

Sample:

<pre>
THREADS (work queue 68 tasks):
	tid 20939: crawl_5986: dedup BeesRangePair: 512K src[0x9933f000..0x993bf000] dst[0x9933f000..0x993bf000]
	    src = 147 /run/bees/ede84fbd-cb59-0c60-9ea7-376fa4984887/data/home/builder/linux/.git/objects/pack/pack-09f06f8759ac7fd163df320b7f7671f06ac2a747.pack
	    dst = 15 /run/bees/ede84fbd-cb59-0c60-9ea7-376fa4984887/data.new/home/builder/linux/.git/objects/pack/pack-09f06f8759ac7fd163df320b7f7671f06ac2a747.pack
	tid 20940: crawl_5986: dedup BeesRangePair: 512K src[0x992bf000..0x9933f000] dst[0x992bf000..0x9933f000]
	    src = 147 /run/bees/ede84fbd-cb59-0c60-9ea7-376fa4984887/data/home/builder/linux/.git/objects/pack/pack-09f06f8759ac7fd163df320b7f7671f06ac2a747.pack
	    dst = 15 /run/bees/ede84fbd-cb59-0c60-9ea7-376fa4984887/data.new/home/builder/linux/.git/objects/pack/pack-09f06f8759ac7fd163df320b7f7671f06ac2a747.pack
	tid 21177: crawl_5986: dedup BeesRangePair: 512K src[0x9923f000..0x992bf000] dst[0x9923f000..0x992bf000]
	    src = 147 /run/bees/ede84fbd-cb59-0c60-9ea7-376fa4984887/data/home/builder/linux/.git/objects/pack/pack-09f06f8759ac7fd163df320b7f7671f06ac2a747.pack
	    dst = 15 /run/bees/ede84fbd-cb59-0c60-9ea7-376fa4984887/data.new/home/builder/linux/.git/objects/pack/pack-09f06f8759ac7fd163df320b7f7671f06ac2a747.pack
	tid 21677: bees: [68493.1s] main
	tid 21689: crawl_transid: [236.508s] waiting 332.575s for next 10 transid RateEstimator { count = 87179, raw = 969.066 / 32229.2, ratio = 969.066 / 32465.7, rate = 0.0298489, duration(1) = 33.5021, seconds_for(1) = 1 }
	tid 21690: status: writing status to file '/run/bees.status'
	tid 21691: crawl_writeback: [203.456s] idle, dirty
	tid 21692: hash_writeback: [12.466s] flush rate limited after extent #17 of 64 extents
	tid 21693: hash_prefetch: [2896.61s] idle 3600s
</pre>

The time in square brackets indicates how long the thread has been
executing the current task (if this time is below 5 seconds then it
is omitted). We can see here that the main thread (and therefore the
bees process as a whole) has been running for 68493.1 seconds, the
last hash table write was 12.5 seconds ago, and the last transid poll
was 236.5 seconds ago. Three worker threads are currently performing
dedupe on extents.

Thread names of note:

* `crawl_12345`: scan/dedupe worker threads (the number is the subvol
  ID which the thread is currently working on). These threads appear
  and disappear from the status dynamically according to the requirements
  of the work queue and loadavg throttling.
* `bees`: main thread (doesn't do anything after startup, but its task execution time is that of the whole bees process)
* `crawl_master`: task that finds new extents in the filesystem and populates the work queue
* `crawl_transid`: btrfs transid (generation number) tracker and polling thread
* `status`: the thread that writes the status reports to `$BEESSTATUS`
* `crawl_writeback`: writes the scanner progress to `beescrawl.dat`
* `hash_writeback`: trickle-writes the hash table back to `beeshash.dat`
* `hash_prefetch`: prefetches the hash table at startup and updates `beesstats.txt` hourly

### Dump kernel stacks of hung processes

Check the kernel stacks of all blocked kernel processes:

    ps xar | while read -r x y; do ps "$x"; head -50 --verbose /proc/"$x"/task/*/stack; done | tee lockup-stacks.txt

Submit the above information in your bug report.

### Check dmesg for btrfs stack dumps

Sometimes these are relevant too.

bees Crashes
------------

* If you have a core dump, run these commands in gdb and include
  the output in your report (you may need to post it as a compressed
  attachment, as it can be quite large):

        (gdb) set pagination off
        (gdb) info shared
        (gdb) bt
        (gdb) thread apply all bt
        (gdb) thread apply all bt full

  The last line generates megabytes of output and will often crash gdb.
  This is OK; submit whatever output gdb can produce.

  **Note that this output may include filenames or data from your
  filesystem.**

* If you have `systemd-coredump` installed, you can use `coredumpctl`:

        (echo set pagination off;
         echo info shared;
         echo bt;
         echo thread apply all bt;
         echo thread apply all bt full) | coredumpctl gdb bees

* If the crash happens often (or you don't want to use `coredumpctl`),
  you can automate the gdb data collection with this wrapper script:

<pre>
#!/bin/sh
set -x

# Move aside old core files for analysis
for x in core*; do
	if [ -e "$x" ]; then
		mv -vf "$x" "old-$x.$(date +%Y-%m-%d-%H-%M-%S)"
	fi
done

# Delete old core files after a week
find old-core* -type f -mtime +7 -exec rm -vf {} + &

# Turn on the cores (FIXME: may need to change other system parameters
# that capture or redirect core files)
ulimit -c unlimited

# Run the command
"$@"
rv="$?"

# Don't clobber our core when gdb crashes
ulimit -c 0

# If there were core files, generate reports for them
for x in core*; do
	if [ -e "$x" ]; then
		gdb --core="$x" \
			--eval-command='set pagination off' \
			--eval-command='info shared' \
			--eval-command='bt' \
			--eval-command='thread apply all bt' \
			--eval-command='thread apply all bt full' \
			--eval-command='quit' \
			--args "$@" 2>&1 | tee -a "$x.txt"
	fi
done

# Return process exit status to caller
exit "$rv"
</pre>

To use the wrapper script, insert it just before the `bees` command,
as in:

    gdb-wrapper bees /path/to/fs/

Kernel crashes, corruption, and filesystem damage
-------------------------------------------------

bees doesn't do anything that _should_ cause corruption or data loss;
however, [btrfs has kernel bugs](btrfs-kernel.md) and [interacts poorly
with some Linux block device layers](btrfs-other.md), so corruption is
not impossible.

Issues with the btrfs filesystem kernel code or other block device layers
should be reported to their respective maintainers.

include/crucible/btrfs.h
@@ -13,18 +13,22 @@
// __u64 typedef and friends
#include <linux/types.h>

// try Linux headers first
#include <btrfs/ioctl.h>
// the btrfs headers
#include <linux/btrfs.h>
#include <linux/btrfs_tree.h>

// Supply any missing definitions
#define mutex not_mutex
#include <btrfs/ctree.h>
// Repair the damage
#undef min
#undef max
#undef mutex
// And now all the things that have been missing in some version of
// the headers.

#ifndef BTRFS_FIRST_FREE_OBJECTID
enum btrfs_compression_type {
	BTRFS_COMPRESS_NONE,
	BTRFS_COMPRESS_ZLIB,
	BTRFS_COMPRESS_LZO,
	BTRFS_COMPRESS_ZSTD,
};

// BTRFS_CSUM_ITEM_KEY is not defined in include/uapi
#ifndef BTRFS_CSUM_ITEM_KEY

#define BTRFS_ROOT_TREE_OBJECTID 1ULL
#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
@@ -158,7 +162,7 @@
	__u64 bytes_deduped;	/* out - total # of bytes we were able
				 * to dedupe from this file */
	/* status of this dedupe operation:
	 * 0 if dedup succeeds
	 * 0 if dedupe succeeds
	 * < 0 for error
	 * == BTRFS_SAME_DATA_DIFFERS if data differs
	 */
@@ -202,4 +206,28 @@
		struct btrfs_ioctl_search_args_v2)
#endif

#ifndef BTRFS_IOC_LOGICAL_INO_V2
#define BTRFS_IOC_LOGICAL_INO_V2 _IOWR(BTRFS_IOCTL_MAGIC, 59, struct btrfs_ioctl_logical_ino_args)
#define BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET (1ULL << 0)
#endif

#ifndef BTRFS_FS_INFO_FLAG_CSUM_INFO
/* Request information about checksum type and size */
#define BTRFS_FS_INFO_FLAG_CSUM_INFO (1 << 0)
#endif

struct btrfs_ioctl_fs_info_args_v2 {
	__u64 max_id;			/* out */
	__u64 num_devices;		/* out */
	__u8 fsid[BTRFS_FSID_SIZE];	/* out */
	__u32 nodesize;			/* out */
	__u32 sectorsize;		/* out */
	__u32 clone_alignment;		/* out */
	/* See BTRFS_FS_INFO_FLAG_* */
	__u16 csum_type;		/* out */
	__u16 csum_size;		/* out */
	__u64 flags;			/* in/out */
	__u8 reserved[968];		/* pad to 1k */
};

#endif // CRUCIBLE_BTRFS_H

include/crucible/cache.h
@@ -3,12 +3,11 @@

#include "crucible/lockset.h"

#include <algorithm>
#include <functional>
#include <list>
#include <map>
#include <mutex>
#include <tuple>
#include <vector>

namespace crucible {
	using namespace std;
@@ -20,25 +19,24 @@ namespace crucible {
		using Func = function<Return(Arguments...)>;
	private:
		struct Value {
			Value *fp = nullptr;
			Value *bp = nullptr;
			Key key;
			Return ret;
			Value(Key k, Return r) : key(k), ret(r) { }
			// Crash early!
			~Value() { fp = bp = nullptr; };
		};

		Func m_fn;
		map<Key, Value> m_map;
		LockSet<Key> m_lockset;
		size_t m_max_size;
		mutex m_mutex;
		Value *m_last = nullptr;
		using ListIter = typename list<Value>::iterator;

		Func m_fn;
		list<Value> m_list;
		map<Key, ListIter> m_map;
		LockSet<Key> m_lockset;
		size_t m_max_size;
		mutex m_mutex;

		void check_overflow();
		void move_to_front(Value *vp);
		void erase_one(Value *vp);
		void recent_use(ListIter vp);
		void erase_item(ListIter vp);
		void erase_key(const Key &k);
		Return insert_item(Func fn, Arguments... args);
	public:
		LRUCache(Func f = Func(), size_t max_size = 100);

@@ -48,7 +46,6 @@ namespace crucible {
		Return operator()(Arguments... args);
		Return refresh(Arguments... args);
		void expire(Arguments... args);
		void prune(function<bool(const Return &)> predicate);
		void insert(const Return &r, Arguments... args);
		void clear();
	};
@@ -61,30 +58,81 @@ namespace crucible {
	}

	template <class Return, class... Arguments>
	void
	LRUCache<Return, Arguments...>::erase_one(Value *vp)
	Return
	LRUCache<Return, Arguments...>::insert_item(Func fn, Arguments... args)
	{
		THROW_CHECK0(invalid_argument, vp);
		Value *vp_bp = vp->bp;
		THROW_CHECK0(runtime_error, vp_bp);
		Value *vp_fp = vp->fp;
		THROW_CHECK0(runtime_error, vp_fp);
		vp_fp->bp = vp_bp;
		vp_bp->fp = vp_fp;
		// If we delete the head of the list then advance the head by one
		if (vp == m_last) {
			// If the head of the list is also the tail of the list then clear m_last
			if (vp_fp == m_last) {
				m_last = nullptr;
			} else {
				m_last = vp_fp;
		Key k(args...);

		// Do we have it cached?
		unique_lock<mutex> lock(m_mutex);
		auto found = m_map.find(k);
		if (found == m_map.end()) {
			// No, release cache lock and acquire key lock
			lock.unlock();
			auto key_lock = m_lockset.make_lock(k);

			// Did item appear in cache while we were waiting for key?
			lock.lock();
			found = m_map.find(k);
			if (found == m_map.end()) {

				// No, we now hold key and cache locks, but item not in cache.
				// Release cache lock and call the function
				lock.unlock();

				// Create new value
				Value v {
					.key = k,
					.ret = fn(args...),
				};

				// Reacquire cache lock
				lock.lock();

				// Make room
				check_overflow();

				// Insert return value at back of LRU list (hot end)
				auto new_item = m_list.insert(m_list.end(), v);

				// Insert return value in map
				bool inserted = false;
				tie(found, inserted) = m_map.insert(make_pair(v.key, new_item));

				// We (should be) holding a lock on this key so we are the ones to insert it
				THROW_CHECK0(runtime_error, inserted);
			}
		}
		m_map.erase(vp->key);
		if (!m_last) {
			THROW_CHECK0(runtime_error, m_map.empty());

		// Item should be in cache now
		THROW_CHECK0(runtime_error, found != m_map.end());
		} else {
			THROW_CHECK0(runtime_error, !m_map.empty());
		// Move to end of LRU
		recent_use(found->second);
		}

		// Return cached object
		return found->second->ret;
	}

	template <class Return, class... Arguments>
	void
	LRUCache<Return, Arguments...>::erase_item(ListIter vp)
	{
		if (vp != m_list.end()) {
			m_map.erase(vp->key);
			m_list.erase(vp);
		}
	}

	template <class Return, class... Arguments>
	void
	LRUCache<Return, Arguments...>::erase_key(const Key &k)
	{
		auto map_item = m_map.find(k);
		if (map_item != m_map.end()) {
			auto list_item = map_item->second;
			m_map.erase(map_item);
			m_list.erase(list_item);
		}
	}
@@ -92,46 +140,20 @@ namespace crucible {
	void
	LRUCache<Return, Arguments...>::check_overflow()
	{
		while (m_map.size() >= m_max_size) {
			THROW_CHECK0(runtime_error, m_last);
			THROW_CHECK0(runtime_error, m_last->bp);
			erase_one(m_last->bp);
		// Erase items at front of LRU list (cold end) until max size reached or list empty
		while (m_map.size() >= m_max_size && !m_list.empty()) {
			erase_item(m_list.begin());
		}
	}

	template <class Return, class... Arguments>
	void
	LRUCache<Return, Arguments...>::move_to_front(Value *vp)
	LRUCache<Return, Arguments...>::recent_use(ListIter vp)
	{
		if (!m_last) {
			// Create new LRU list
			m_last = vp->fp = vp->bp = vp;
		} else if (m_last != vp) {
			Value *vp_fp = vp->fp;
			Value *vp_bp = vp->bp;
			if (vp_fp && vp_bp) {
				// There are at least two and we are removing one that isn't m_last
				// Connect adjacent nodes to each other (has no effect if vp is new), removing vp from list
				vp_fp->bp = vp_bp;
				vp_bp->fp = vp_fp;
			} else {
				// New insertion, both must be null
				THROW_CHECK0(runtime_error, !vp_fp);
				THROW_CHECK0(runtime_error, !vp_bp);
			}
			// Splice new node into list
			Value *last_bp = m_last->bp;
			THROW_CHECK0(runtime_error, last_bp);
			// New element points to both ends of list
			vp->fp = m_last;
			vp->bp = last_bp;
			// Insert vp as fp from the end of the list
			last_bp->fp = vp;
			// Insert vp as bp from the second from the start of the list
			m_last->bp = vp;
			// Update start of list
			m_last = vp;
		}
		// Splice existing items at back of LRU list (hot end)
		auto next_vp = vp;
		++next_vp;
		m_list.splice(m_list.end(), m_list, vp, next_vp);
	}

	template <class Return, class... Arguments>
@@ -158,93 +180,29 @@ namespace crucible {
	void
	LRUCache<Return, Arguments...>::clear()
	{
		// Move the map onto the stack, then destroy it after we've released the lock.
		// Move the map and list onto the stack, then destroy it after we've released the lock
		// so that we don't block other threads if the list's destructors are expensive
		decltype(m_list) new_list;
		decltype(m_map) new_map;
		unique_lock<mutex> lock(m_mutex);
		m_list.swap(new_list);
		m_map.swap(new_map);
		m_last = nullptr;
	}

	template <class Return, class... Arguments>
	void
	LRUCache<Return, Arguments...>::prune(function<bool(const Return &)> pred)
	{
		unique_lock<mutex> lock(m_mutex);
		for (auto it = m_map.begin(); it != m_map.end(); ) {
			auto next_it = ++it;
			if (pred(it.second.ret)) {
				erase_one(&it.second);
			}
			it = next_it;
		}
		lock.unlock();
	}

	template<class Return, class... Arguments>
	Return
	LRUCache<Return, Arguments...>::operator()(Arguments... args)
	{
		Key k(args...);
		bool inserted = false;

		// Do we have it cached?
		unique_lock<mutex> lock(m_mutex);
		auto found = m_map.find(k);
		if (found == m_map.end()) {
			// No, release cache lock and acquire key lock
			lock.unlock();
			auto key_lock = m_lockset.make_lock(k);

			// Did item appear in cache while we were waiting for key?
			lock.lock();
			found = m_map.find(k);
			if (found == m_map.end()) {

				// No, we hold key and cache locks, but item not in cache.
				// Release cache lock and call function
				lock.unlock();

				// Create new value
				Value v(k, m_fn(args...));

				// Reacquire cache lock
				lock.lock();

				// Make room
				check_overflow();

				// Reacquire cache lock and insert return value
				tie(found, inserted) = m_map.insert(make_pair(k, v));

				// We hold a lock on this key so we are the ones to insert it
				THROW_CHECK0(runtime_error, inserted);

				// Release key lock, keep the cache lock
				key_lock.unlock();

			}
		}

		// Item should be in cache now
		THROW_CHECK0(runtime_error, found != m_map.end());

		// (Re)insert at head of LRU
		move_to_front(&(found->second));

		// Make copy before releasing lock
		auto rv = found->second.ret;
		return rv;
		return insert_item(m_fn, args...);
	}

	template<class Return, class... Arguments>
	void
	LRUCache<Return, Arguments...>::expire(Arguments... args)
	{
		Key k(args...);
		unique_lock<mutex> lock(m_mutex);
		auto found = m_map.find(k);
		if (found != m_map.end()) {
			erase_one(&found->second);
		}
		erase_key(Key(args...));
	}

	template<class Return, class... Arguments>
@@ -259,40 +217,7 @@ namespace crucible {
	void
	LRUCache<Return, Arguments...>::insert(const Return &r, Arguments... args)
	{
		Key k(args...);
		bool inserted = false;

		// Do we have it cached?
		unique_lock<mutex> lock(m_mutex);
		auto found = m_map.find(k);
		if (found == m_map.end()) {
			// No, release cache lock and acquire key lock
			lock.unlock();
			auto key_lock = m_lockset.make_lock(k);

			// Did item appear in cache while we were waiting for key?
			lock.lock();
			found = m_map.find(k);
			if (found == m_map.end()) {

				// Make room
				check_overflow();

				// No, we hold key and cache locks, but item not in cache.
				// Insert the provided return value (no need to unlock here)
				Value v(k, r);
				tie(found, inserted) = m_map.insert(make_pair(k, v));

				// We hold a lock on this key so we are the ones to insert it
				THROW_CHECK0(runtime_error, inserted);
			}
		}

		// Item should be in cache now
		THROW_CHECK0(runtime_error, found != m_map.end());

		// (Re)insert at head of LRU
		move_to_front(&(found->second));
		insert_item([&](Arguments...) -> Return { return r; }, args...);
	}
}

include/crucible/chatter.h
@@ -50,6 +50,7 @@ namespace crucible {
		~Chatter();

		static void enable_timestamp(bool prefix_timestamp);
		static void enable_level(bool prefix_level);
	};

	template <class Argument>

include/crucible/city.h (new file, 113 lines)
@@ -0,0 +1,113 @@

// Copyright (c) 2011 Google, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// CityHash, by Geoff Pike and Jyrki Alakuijala
//
// http://code.google.com/p/cityhash/
//
// This file provides a few functions for hashing strings. All of them are
// high-quality functions in the sense that they pass standard tests such
// as Austin Appleby's SMHasher. They are also fast.
//
// For 64-bit x86 code, on short strings, we don't know of anything faster than
// CityHash64 that is of comparable quality. We believe our nearest competitor
// is Murmur3. For 64-bit x86 code, CityHash64 is an excellent choice for hash
// tables and most other hashing (excluding cryptography).
//
// For 64-bit x86 code, on long strings, the picture is more complicated.
// On many recent Intel CPUs, such as Nehalem, Westmere, Sandy Bridge, etc.,
// CityHashCrc128 appears to be faster than all competitors of comparable
// quality. CityHash128 is also good but not quite as fast. We believe our
// nearest competitor is Bob Jenkins' Spooky. We don't have great data for
// other 64-bit CPUs, but for long strings we know that Spooky is slightly
// faster than CityHash on some relatively recent AMD x86-64 CPUs, for example.
// Note that CityHashCrc128 is declared in citycrc.h [which has been removed
// for bees].
//
// For 32-bit x86 code, we don't know of anything faster than CityHash32 that
// is of comparable quality. We believe our nearest competitor is Murmur3A.
// (On 64-bit CPUs, it is typically faster to use the other CityHash variants.)
//
// Functions in the CityHash family are not suitable for cryptography.
//
// Please see CityHash's README file for more details on our performance
// measurements and so on.
//
// WARNING: This code has been only lightly tested on big-endian platforms!
// It is known to work well on little-endian platforms that have a small penalty
// for unaligned reads, such as current Intel and AMD moderate-to-high-end CPUs.
// It should work on all 32-bit and 64-bit platforms that allow unaligned reads;
// bug reports are welcome.
//
// By the way, for some hash functions, given strings a and b, the hash
// of a+b is easily derived from the hashes of a and b. This property
// doesn't hold for any hash functions in this file.

#ifndef CITY_HASH_H_
#define CITY_HASH_H_

#include <stdlib.h>  // for size_t.
#include <stdint.h>
#include <utility>

typedef uint8_t uint8;
typedef uint32_t uint32;
typedef uint64_t uint64;
typedef std::pair<uint64, uint64> uint128;

inline uint64 Uint128Low64(const uint128& x) { return x.first; }
inline uint64 Uint128High64(const uint128& x) { return x.second; }

// Hash function for a byte array.
uint64 CityHash64(const char *buf, size_t len);

// Hash function for a byte array. For convenience, a 64-bit seed is also
// hashed into the result.
uint64 CityHash64WithSeed(const char *buf, size_t len, uint64 seed);

// Hash function for a byte array. For convenience, two seeds are also
// hashed into the result.
uint64 CityHash64WithSeeds(const char *buf, size_t len,
                           uint64 seed0, uint64 seed1);

// Hash function for a byte array.
uint128 CityHash128(const char *s, size_t len);

// Hash function for a byte array. For convenience, a 128-bit seed is also
// hashed into the result.
uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed);

// Hash function for a byte array. Most useful in 32-bit binaries.
uint32 CityHash32(const char *buf, size_t len);

// Hash 128 input bits down to 64 bits of output.
// This is intended to be a reasonably good hash function.
inline uint64 Hash128to64(const uint128& x) {
  // Murmur-inspired hashing.
  const uint64 kMul = 0x9ddfea08eb382d69ULL;
  uint64 a = (Uint128Low64(x) ^ Uint128High64(x)) * kMul;
  a ^= (a >> 47);
  uint64 b = (Uint128High64(x) ^ a) * kMul;
  b ^= (b >> 47);
  b *= kMul;
  return b;
}

#endif  // CITY_HASH_H_


include/crucible/endian.h (new file, 58 lines)
@@ -0,0 +1,58 @@

#ifndef CRUCIBLE_ENDIAN_H
#define CRUCIBLE_ENDIAN_H

#include <cstdint>

#include <endian.h>

namespace crucible {

	template<class T>
	struct le_to_cpu_helper {
		T operator()(const T v);
	};

	template<> struct le_to_cpu_helper<uint64_t> {
		uint64_t operator()(const uint64_t v) { return le64toh(v); }
	};

#if __SIZEOF_LONG__ == 8
	// uint64_t is unsigned long on LP64 platforms, so provide a
	// separate specialization for unsigned long long
	template<> struct le_to_cpu_helper<unsigned long long> {
		unsigned long long operator()(const unsigned long long v) { return le64toh(v); }
	};
#endif

	template<> struct le_to_cpu_helper<uint32_t> {
		uint32_t operator()(const uint32_t v) { return le32toh(v); }
	};

	template<> struct le_to_cpu_helper<uint16_t> {
		// le16toh, not le64toh: a 64-bit byte swap would be wrong here
		uint16_t operator()(const uint16_t v) { return le16toh(v); }
	};

	template<> struct le_to_cpu_helper<uint8_t> {
		uint8_t operator()(const uint8_t v) { return v; }
	};

	template<class T>
	T
	le_to_cpu(const T v)
	{
		return le_to_cpu_helper<T>()(v);
	}

	template<class T>
	T
	get_unaligned(const void *const p)
	{
		struct not_aligned {
			T v;
		} __attribute__((packed));
		const not_aligned *const nap = reinterpret_cast<const not_aligned*>(p);
		return nap->v;
	}

}

#endif // CRUCIBLE_ENDIAN_H

include/crucible/extentwalker.h
@@ -58,10 +58,6 @@ namespace crucible {

		virtual Vec get_extent_map(off_t pos);

		static const unsigned sc_extent_fetch_max = 16;
		static const unsigned sc_extent_fetch_min = 4;
		static const off_t sc_step_size = 0x1000 * (sc_extent_fetch_max / 2);

	private:
		Vec m_extents;
		Itr m_current;
@@ -69,6 +65,10 @@ namespace crucible {
		Itr find_in_cache(off_t pos);
		void run_fiemap(off_t pos);

#ifdef EXTENTWALKER_DEBUG
		ostringstream m_log;
#endif

	public:
		ExtentWalker(Fd fd = Fd());
		ExtentWalker(Fd fd, off_t initial_pos);

include/crucible/fd.h
@@ -1,7 +1,7 @@
#ifndef CRUCIBLE_FD_H
#define CRUCIBLE_FD_H

#include "crucible/resource.h"
#include "crucible/namedptr.h"

#include <cstring>

@@ -34,30 +34,28 @@ namespace crucible {
		IOHandle(IOHandle &&) = delete;
		IOHandle& operator=(IOHandle &&) = delete;
		IOHandle& operator=(const IOHandle &) = delete;
	protected:
		int m_fd;
		IOHandle& operator=(int that) { m_fd = that; return *this; }
		void close();
	public:
		virtual ~IOHandle();
		IOHandle(int fd);
		IOHandle();

		void close();
		int get_fd() const { return m_fd; }
		int release_fd();
		IOHandle(int fd = -1);
		int get_fd() const;
	};

	template <>
	struct ResourceTraits<int, IOHandle> {
		int get_key(const IOHandle &res) const { return res.get_fd(); }
		shared_ptr<IOHandle> make_resource(int fd) const { return make_shared<IOHandle>(fd); }
		bool is_null_key(const int &key) const { return key < 0; }
		int get_null_key() const { return -1; }
	};
	class Fd {
		static NamedPtr<IOHandle, int> s_named_ptr;
		shared_ptr<IOHandle> m_handle;
	public:
		using resource_type = IOHandle;
		Fd();
		Fd(int fd);
		Fd &operator=(int fd);
		Fd &operator=(const shared_ptr<IOHandle> &);
		operator int() const;
		bool operator!() const;
		shared_ptr<IOHandle> operator->() const;
	};

	typedef ResourceHandle<int, IOHandle> Fd;

	static string __relative_path;
	void set_relative_path(string path);
	string relative_path();

@@ -1,7 +1,9 @@
#ifndef CRUCIBLE_FS_H
#define CRUCIBLE_FS_H

#include "crucible/endian.h"
#include "crucible/error.h"
#include "crucible/spanner.h"

// Terribly Linux-specific FS-wrangling functions

@@ -39,11 +41,6 @@ namespace crucible {
		vector<BtrfsExtentInfo> m_info;
	};

	struct BtrfsExtentSameByClone : public BtrfsExtentSame {
		using BtrfsExtentSame::BtrfsExtentSame;
		void do_ioctl() override;
	};

	ostream & operator<<(ostream &os, const btrfs_ioctl_same_extent_info *info);
	ostream & operator<<(ostream &os, const btrfs_ioctl_same_args *info);
	ostream & operator<<(ostream &os, const BtrfsExtentSame &bes);
@@ -58,7 +55,7 @@ namespace crucible {

	struct BtrfsDataContainer : public btrfs_data_container {
		BtrfsDataContainer(size_t size = 64 * 1024);
		void *prepare();
		void *prepare(size_t size);

		size_t get_size() const;
		decltype(bytes_left) get_bytes_left() const;
@@ -66,16 +63,36 @@ namespace crucible {
		decltype(elem_cnt) get_elem_cnt() const;
		decltype(elem_missed) get_elem_missed() const;

		vector<char> m_data;
		vector<uint8_t> m_data;
	};

	struct BtrfsIoctlLogicalInoArgs : public btrfs_ioctl_logical_ino_args {
		BtrfsIoctlLogicalInoArgs(uint64_t logical, size_t buf_size = 64 * 1024);
		BtrfsIoctlLogicalInoArgs(uint64_t logical, size_t buf_size = 16 * 1024 * 1024);

		uint64_t get_flags() const;
		void set_flags(uint64_t new_flags);

		virtual void do_ioctl(int fd);
		virtual bool do_ioctl_nothrow(int fd);

		size_t m_container_size;
		struct BtrfsInodeOffsetRootSpan {
			using iterator = BtrfsInodeOffsetRoot*;
			using const_iterator = const BtrfsInodeOffsetRoot*;
			size_t size() const;
			iterator begin() const;
			iterator end() const;
			const_iterator cbegin() const;
			const_iterator cend() const;
			iterator data() const;
			void clear();
			operator vector<BtrfsInodeOffsetRoot>() const;
		private:
			iterator m_begin = nullptr;
			iterator m_end = nullptr;
			friend struct BtrfsIoctlLogicalInoArgs;
		} m_iors;
		BtrfsDataContainer m_container;
		vector<BtrfsInodeOffsetRoot> m_iors;
	};

	ostream & operator<<(ostream &os, const BtrfsIoctlLogicalInoArgs &p);
@@ -85,7 +102,7 @@ namespace crucible {
		virtual void do_ioctl(int fd);
		virtual bool do_ioctl_nothrow(int fd);

		BtrfsDataContainer m_container;
		size_t m_container_size;
		vector<string> m_paths;
	};

@@ -149,11 +166,17 @@ namespace crucible {

	struct BtrfsIoctlSearchHeader : public btrfs_ioctl_search_header {
		BtrfsIoctlSearchHeader();
		vector<char> m_data;
		size_t set_data(const vector<char> &v, size_t offset);
		Spanner<const uint8_t> m_data;
		size_t set_data(const vector<uint8_t> &v, size_t offset);
		bool operator<(const BtrfsIoctlSearchHeader &that) const;
	};

	// Perf blames this function for a few percent overhead; move it here so it can be inline
	inline bool BtrfsIoctlSearchHeader::operator<(const BtrfsIoctlSearchHeader &that) const
	{
		return tie(objectid, type, offset, len, transid) < tie(that.objectid, that.type, that.offset, that.len, that.transid);
	}

	ostream & operator<<(ostream &os, const btrfs_ioctl_search_header &hdr);
	ostream & operator<<(ostream &os, const BtrfsIoctlSearchHeader &hdr);

@@ -166,6 +189,7 @@ namespace crucible {
		void next_min(const BtrfsIoctlSearchHeader& ref);

		size_t m_buf_size;
		vector<uint8_t> m_ioctl_arg;
		set<BtrfsIoctlSearchHeader> m_result;

	};
@@ -179,51 +203,25 @@ namespace crucible {
	uint64_t btrfs_get_root_id(int fd);
	uint64_t btrfs_get_root_transid(int fd);

	template<class T>
	template<class T, class V>
	const T*
	get_struct_ptr(vector<char> &v, size_t offset = 0)
	get_struct_ptr(const V &v, size_t offset = 0)
	{
		// OK so sometimes btrfs overshoots a little
		if (offset + sizeof(T) > v.size()) {
			v.resize(offset + sizeof(T), 0);
		}
		THROW_CHECK2(invalid_argument, v.size(), offset + sizeof(T), offset + sizeof(T) <= v.size());
		return reinterpret_cast<const T*>(v.data() + offset);
		THROW_CHECK2(out_of_range, v.size(), offset + sizeof(T), offset + sizeof(T) <= v.size());
		const uint8_t *const data_ptr = v.data();
		return reinterpret_cast<const T*>(data_ptr + offset);
	}

	template<class A, class R>
	R
	call_btrfs_get(R (*func)(const A*), vector<char> &v, size_t offset = 0)
	{
		return func(get_struct_ptr<A>(v, offset));
	}

	template <class T> struct btrfs_get_le;

	template<> struct btrfs_get_le<__le64> {
		uint64_t operator()(const void *p) { return get_unaligned_le64(p); }
	};

	template<> struct btrfs_get_le<__le32> {
		uint32_t operator()(const void *p) { return get_unaligned_le32(p); }
	};

	template<> struct btrfs_get_le<__le16> {
		uint16_t operator()(const void *p) { return get_unaligned_le16(p); }
	};

	template<> struct btrfs_get_le<__le8> {
		uint8_t operator()(const void *p) { return get_unaligned_le8(p); }
	};

	template<class S, class T>
	template<class S, class T, class V>
	T
	btrfs_get_member(T S::* member, vector<char> &v, size_t offset = 0)
	btrfs_get_member(T S::* member, V &v, size_t offset = 0)
	{
		const S *sp = reinterpret_cast<const S*>(NULL);
		const T *spm = &(sp->*member);
		auto member_offset = reinterpret_cast<const char *>(spm) - reinterpret_cast<const char *>(sp);
		return btrfs_get_le<T>()(get_struct_ptr<S>(v, offset + member_offset));
		const S *const sp = nullptr;
		const T *const spm = &(sp->*member);
		const auto member_offset = reinterpret_cast<const uint8_t *>(spm) - reinterpret_cast<const uint8_t *>(sp);
		const void *struct_ptr = get_struct_ptr<T>(v, offset + member_offset);
		const T unaligned_t = get_unaligned<T>(struct_ptr);
		return le_to_cpu(unaligned_t);
	}

	struct Statvfs : public statvfs {
@@ -235,12 +233,13 @@ namespace crucible {
		unsigned long available() const;
	};

	ostream &hexdump(ostream &os, const vector<char> &v);
	template<class V> ostream &hexdump(ostream &os, const V &v);

	struct BtrfsIoctlFsInfoArgs : public btrfs_ioctl_fs_info_args {
	struct BtrfsIoctlFsInfoArgs : public btrfs_ioctl_fs_info_args_v2 {
		BtrfsIoctlFsInfoArgs();
		void do_ioctl(int fd);
		string uuid() const;
		uint16_t csum_type() const;
		uint16_t csum_size() const;
	};

	ostream & operator<<(ostream &os, const BtrfsIoctlFsInfoArgs &a);
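For context, a sketch of how the reworked accessors fit together (the struct and buffer here are invented for illustration; real callers pass btrfs search-item bytes): btrfs_get_member() computes the member's offset, get_struct_ptr() bounds-checks it against the buffer, and get_unaligned()/le_to_cpu() perform the actual load.

	#include "crucible/fs.h"

	#include <cassert>
	#include <cstdint>
	#include <vector>

	// Invented stand-in for an on-disk btrfs structure (fields little-endian)
	struct fake_disk_item {
		uint64_t generation;
		uint32_t flags;
	} __attribute__((packed));

	int main()
	{
		std::vector<uint8_t> buf(sizeof(fake_disk_item), 0);
		buf[0] = 0x2a;	// generation = 42, stored little-endian
		const auto gen = crucible::btrfs_get_member(&fake_disk_item::generation, buf);
		assert(gen == 42);
		return 0;
	}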
@@ -1,8 +1,8 @@
#ifndef CRUCIBLE_LOCKSET_H
#define CRUCIBLE_LOCKSET_H

#include <crucible/error.h>
#include <crucible/process.h>
#include "crucible/error.h"
#include "crucible/process.h"

#include <cassert>

@@ -117,7 +117,7 @@ namespace crucible {
		while (full() || locked(name)) {
			m_condvar.wait(lock);
		}
		auto rv = m_set.insert(make_pair(name, gettid()));
		auto rv = m_set.insert(make_pair(name, crucible::gettid()));
		THROW_CHECK0(runtime_error, rv.second);
	}

@@ -129,7 +129,7 @@ namespace crucible {
		if (full() || locked(name)) {
			return false;
		}
		auto rv = m_set.insert(make_pair(name, gettid()));
		auto rv = m_set.insert(make_pair(name, crucible::gettid()));
		THROW_CHECK1(runtime_error, name, rv.second);
		return true;
	}
include/crucible/namedptr.h | 196 (new file)
@@ -0,0 +1,196 @@
#ifndef CRUCIBLE_NAMEDPTR_H
#define CRUCIBLE_NAMEDPTR_H

#include "crucible/lockset.h"

#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <tuple>

namespace crucible {
	using namespace std;

	/// Storage for objects with unique names

	template <class Return, class... Arguments>
	class NamedPtr {
	public:
		using Key = tuple<Arguments...>;
		using Ptr = shared_ptr<Return>;
		using Func = function<Ptr(Arguments...)>;
	private:
		struct Value;
		using WeakPtr = weak_ptr<Value>;
		using MapType = map<Key, WeakPtr>;
		struct MapRep {
			MapType	m_map;
			mutex	m_mutex;
		};
		using MapPtr = shared_ptr<MapRep>;
		struct Value {
			Ptr	m_ret_ptr;
			MapPtr	m_map_rep;
			Key	m_ret_key;
			~Value();
			Value(Ptr&& ret_ptr, const Key &key, const MapPtr &map_rep);
		};

		Func	m_fn;
		MapPtr	m_map_rep = make_shared<MapRep>();
		LockSet<Key> m_lockset;

		Ptr lookup_item(const Key &k);
		Ptr insert_item(Func fn, Arguments... args);

	public:
		NamedPtr(Func f = Func());

		void func(Func f);

		Ptr operator()(Arguments... args);
		Ptr insert(const Ptr &r, Arguments... args);
	};

	template <class Return, class... Arguments>
	NamedPtr<Return, Arguments...>::NamedPtr(Func f) :
		m_fn(f)
	{
	}

	template <class Return, class... Arguments>
	NamedPtr<Return, Arguments...>::Value::Value(Ptr&& ret_ptr, const Key &key, const MapPtr &map_rep) :
		m_ret_ptr(ret_ptr),
		m_map_rep(map_rep),
		m_ret_key(key)
	{
	}

	template <class Return, class... Arguments>
	NamedPtr<Return, Arguments...>::Value::~Value()
	{
		unique_lock<mutex> lock(m_map_rep->m_mutex);
		// We are called from the shared_ptr destructor, so we
		// know that the weak_ptr in the map has already expired;
		// however, if another thread already noticed that the
		// map entry expired while we were waiting for the lock,
		// the other thread will have already replaced the map
		// entry with a pointer to some other object, and that
		// object now owns the map entry.  So we do a key lookup
		// here instead of storing a map iterator, and only erase
		// "our" map entry if it exists and is expired.  The other
		// thread would have done the same for us if the race had
		// a different winner.
		auto found = m_map_rep->m_map.find(m_ret_key);
		if (found != m_map_rep->m_map.end() && found->second.expired()) {
			m_map_rep->m_map.erase(found);
		}
	}

	template <class Return, class... Arguments>
	typename NamedPtr<Return, Arguments...>::Ptr
	NamedPtr<Return, Arguments...>::lookup_item(const Key &k)
	{
		// Must be called with lock held
		auto found = m_map_rep->m_map.find(k);
		if (found != m_map_rep->m_map.end()) {
			// Get the strong pointer back
			auto rv = found->second.lock();
			if (rv) {
				// Have strong pointer.  Return value that shares map entry.
				return shared_ptr<Return>(rv, rv->m_ret_ptr.get());
			}
			// Have expired weak pointer.  Another thread is trying to delete it,
			// but we got the lock first.  Leave the map entry alone here.
			// The other thread will erase it, or we will put a different entry
			// in the same map entry.
		}
		return Ptr();
	}

	template <class Return, class... Arguments>
	typename NamedPtr<Return, Arguments...>::Ptr
	NamedPtr<Return, Arguments...>::insert_item(Func fn, Arguments... args)
	{
		Key k(args...);

		// Is it already in the map?
		unique_lock<mutex> lock(m_map_rep->m_mutex);
		auto rv = lookup_item(k);
		if (rv) {
			return rv;
		}

		// Release map lock and acquire key lock
		lock.unlock();
		auto key_lock = m_lockset.make_lock(k);

		// Did item appear in map while we were waiting for key?
		lock.lock();
		rv = lookup_item(k);
		if (rv) {
			return rv;
		}

		// We now hold key and index locks, but item not in map (or expired).
		// Release map lock
		lock.unlock();

		// Call the function and create a new Value
		auto new_value_ptr = make_shared<Value>(fn(args...), k, m_map_rep);
		// Function must return a non-null pointer
		THROW_CHECK0(runtime_error, new_value_ptr->m_ret_ptr);

		// Reacquire index lock for map insertion
		lock.lock();

		// Insert return value in map or overwrite existing
		// empty or expired weak_ptr value.
		WeakPtr &new_item_ref = m_map_rep->m_map[k];

		// We searched the map while holding both locks and
		// found no entry or an expired weak_ptr; therefore, no
		// other thread could have inserted a new non-expired
		// weak_ptr, and the weak_ptr in the map is expired
		// or was default-constructed as a nullptr.  So if the
		// new_item_ref is not expired, we have a bug we need
		// to find and fix.
		assert(new_item_ref.expired());

		// Update the empty map slot
		new_item_ref = new_value_ptr;

		// Drop lock so we don't deadlock in constructor exceptions
		lock.unlock();

		// Return shared_ptr to Return using strong pointer's reference counter
		return shared_ptr<Return>(new_value_ptr, new_value_ptr->m_ret_ptr.get());
	}

	template <class Return, class... Arguments>
	void
	NamedPtr<Return, Arguments...>::func(Func func)
	{
		unique_lock<mutex> lock(m_map_rep->m_mutex);
		m_fn = func;
	}

	template<class Return, class... Arguments>
	typename NamedPtr<Return, Arguments...>::Ptr
	NamedPtr<Return, Arguments...>::operator()(Arguments... args)
	{
		return insert_item(m_fn, args...);
	}

	template<class Return, class... Arguments>
	typename NamedPtr<Return, Arguments...>::Ptr
	NamedPtr<Return, Arguments...>::insert(const Ptr &r, Arguments... args)
	{
		THROW_CHECK0(invalid_argument, r);
		return insert_item([&](Arguments...) -> Ptr { return r; }, args...);
	}

}

#endif // CRUCIBLE_NAMEDPTR_H
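A usage sketch for NamedPtr (the Widget type and keys are invented for illustration): at most one live object exists per key, the builder function runs only on a cache miss, and the map entry disappears when the last user drops its pointer.

	#include "crucible/namedptr.h"

	#include <cassert>
	#include <string>

	using namespace crucible;
	using namespace std;

	struct Widget {
		string m_name;
	};

	int main()
	{
		// Builder runs once per distinct key while any pointer is live
		NamedPtr<Widget, string> widgets([](string name) {
			return make_shared<Widget>(Widget { name });
		});
		auto a = widgets("left");
		auto b = widgets("left");	// cache hit: same object as a
		auto c = widgets("right");	// different key: new object
		assert(a == b);
		assert(a != c);
		return 0;
	}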
@@ -22,7 +22,7 @@ namespace crucible {
// Enumerations (entire value matches all bits)
#define NTOA_TABLE_ENTRY_ENUM(x) { .n = (x), .mask = ~0UL, .a = (#x) }

// End of table (sorry, gcc doesn't implement this)
// End of table (sorry, C++ didn't get C99's compound literals, so we have to write out all the member names)
#define NTOA_TABLE_ENTRY_END() { .n = 0, .mask = 0, .a = nullptr }

#endif // CRUCIBLE_NTOA_H
include/crucible/pool.h | 185 (new file)
@@ -0,0 +1,185 @@
#ifndef CRUCIBLE_POOL_H
#define CRUCIBLE_POOL_H

#include "crucible/error.h"

#include <functional>
#include <list>
#include <memory>
#include <mutex>

namespace crucible {
	using namespace std;

	/// Storage for reusable anonymous objects that are too expensive to create and/or destroy frequently

	template <class T>
	class Pool {
	public:
		using Ptr = shared_ptr<T>;
		using Generator = function<Ptr()>;
		using Checker = function<void(Ptr)>;

		~Pool();
		Pool(Generator f = Generator(), Checker checkin = Checker(), Checker checkout = Checker());

		/// Function to create new objects when Pool is empty
		void generator(Generator f);

		/// Optional function called when objects exit the pool (user handle is created and returned to user)
		void checkout(Checker f);

		/// Optional function called when objects enter the pool (last user handle is destroyed)
		void checkin(Checker f);

		/// Pool() returns a handle to an object of type shared_ptr<T>
		Ptr operator()();

		/// Destroy all objects in Pool that are not in use
		void clear();

	private:
		struct PoolRep {
			list<Ptr>	m_list;
			mutex		m_mutex;
			Checker		m_checkin;
			PoolRep(Checker checkin);
		};
		struct Handle {
			weak_ptr<PoolRep>	m_list_rep;
			Ptr			m_ret_ptr;
			Handle(shared_ptr<PoolRep> list_rep, Ptr ret_ptr);
			~Handle();
		};

		Generator		m_fn;
		Checker			m_checkout;
		shared_ptr<PoolRep>	m_list_rep;
	};

	template <class T>
	Pool<T>::PoolRep::PoolRep(Checker checkin) :
		m_checkin(checkin)
	{
	}

	template <class T>
	Pool<T>::Pool(Generator f, Checker checkin, Checker checkout) :
		m_fn(f),
		m_checkout(checkout),
		m_list_rep(make_shared<PoolRep>(checkin))
	{
	}

	template <class T>
	Pool<T>::~Pool()
	{
		auto list_rep = m_list_rep;
		unique_lock<mutex> lock(list_rep->m_mutex);
		m_list_rep.reset();
	}

	template <class T>
	Pool<T>::Handle::Handle(shared_ptr<PoolRep> list_rep, Ptr ret_ptr) :
		m_list_rep(list_rep),
		m_ret_ptr(ret_ptr)
	{
	}

	template <class T>
	Pool<T>::Handle::~Handle()
	{
		// Checkin prepares the object for storage and reuse.
		// Neither of those will happen if there is no Pool.
		// If the Pool was destroyed, just let m_ret_ptr expire.
		auto list_rep = m_list_rep.lock();
		if (!list_rep) {
			return;
		}

		unique_lock<mutex> lock(list_rep->m_mutex);
		// If a checkin function is defined, call it
		auto checkin = list_rep->m_checkin;
		if (checkin) {
			lock.unlock();
			checkin(m_ret_ptr);
			lock.lock();
		}

		// Place object back in pool
		list_rep->m_list.push_front(m_ret_ptr);
	}

	template <class T>
	typename Pool<T>::Ptr
	Pool<T>::operator()()
	{
		Ptr rv;

		// Do we have an object in the pool we can return instead?
		unique_lock<mutex> lock(m_list_rep->m_mutex);
		if (m_list_rep->m_list.empty()) {
			// No, release cache lock and call the function
			lock.unlock();

			// Create new value
			rv = m_fn();
		} else {
			rv = m_list_rep->m_list.front();
			m_list_rep->m_list.pop_front();

			// Release lock so we don't deadlock with Handle destructor
			lock.unlock();
		}

		// rv now points to a T object that is not in the list.
		THROW_CHECK0(runtime_error, rv);

		// Construct a shared_ptr for Handle which will refcount the Handle objects
		// and reinsert the T into the Pool when the last Handle is destroyed.
		auto hv = make_shared<Handle>(m_list_rep, rv);

		// If a checkout function is defined, call it
		if (m_checkout) {
			m_checkout(rv);
		}

		// Return an aliased shared_ptr to the T using Handle's refcount.
		return Ptr(hv, rv.get());
	}

	template <class T>
	void
	Pool<T>::generator(Generator func)
	{
		unique_lock<mutex> lock(m_list_rep->m_mutex);
		m_fn = func;
	}

	template <class T>
	void
	Pool<T>::checkin(Checker func)
	{
		unique_lock<mutex> lock(m_list_rep->m_mutex);
		m_list_rep->m_checkin = func;
	}

	template <class T>
	void
	Pool<T>::checkout(Checker func)
	{
		unique_lock<mutex> lock(m_list_rep->m_mutex);
		m_checkout = func;
	}

	template <class T>
	void
	Pool<T>::clear()
	{
		unique_lock<mutex> lock(m_list_rep->m_mutex);
		m_list_rep->m_list.clear();
	}

}

#endif // CRUCIBLE_POOL_H
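A usage sketch for Pool (the Conn type and counter are invented for illustration): the generator runs only when the pool is empty, and dropping the last handle checks the object back in for reuse.

	#include "crucible/pool.h"

	#include <cassert>

	using namespace crucible;

	struct Conn { };		// stands in for something expensive to build

	int main()
	{
		size_t built = 0;
		Pool<Conn> pool([&]() {
			++built;
			return make_shared<Conn>();
		});
		{
			auto h = pool();	// pool empty: generator builds one
			assert(built == 1);
		}				// handle dropped: Conn checked back in
		auto h2 = pool();		// reused from the pool, no new build
		assert(built == 1);
		return 0;
	}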
@@ -77,5 +77,7 @@ namespace crucible {
	double getloadavg1();
	double getloadavg5();
	double getloadavg15();

	string signal_ntoa(int sig);
}
#endif // CRUCIBLE_PROCESS_H
@@ -13,7 +13,7 @@ namespace crucible {

	template <class T>
	class ProgressTracker {
		class ProgressTrackerState;
		struct ProgressTrackerState;
		class ProgressHolderState;
	public:
		using value_type = T;
include/crucible/spanner.h | 167 (new file)
@@ -0,0 +1,167 @@
#ifndef CRUCIBLE_SPANNER_H
#define CRUCIBLE_SPANNER_H

#include "crucible/error.h"

#include <memory>

namespace crucible {

	using namespace std;

	// C++20 is already using the name "span" for something similar.
	template <class T, class Head = T*, class Iter = Head>
	class Spanner {
	public:
		using iterator = Iter;
		using head_pointer = Head;
		using value_type = T;

		template <class Container>
		Spanner(Container& container);

		Spanner(head_pointer begin, iterator end);
		Spanner(size_t size, head_pointer begin);
		Spanner() = default;
		Spanner &operator=(const Spanner &that) = default;
		iterator begin() const;
		iterator end() const;
		value_type *data() const;
		value_type &at(size_t n) const;
		size_t size() const;
		bool empty() const;
		void clear();
		value_type &operator[](size_t n) const;
		iterator erase(iterator first, iterator last);
		iterator erase(iterator first);
	private:
		head_pointer	m_begin;
		size_t		m_size;
	};

	template <class Container, class Head = typename Container::value_type *, class Iter = Head>
	Spanner<typename Container::value_type, Head, Iter> make_spanner(Container &container)
	{
		return Spanner<typename Container::value_type, Head, Iter>(container);
	}

	// This template is an attempt to turn a shared_ptr to a container
	// into a range view that can be cheaply passed around.
	// It probably doesn't quite work in the general case.
	template <class Container, class Head = shared_ptr<typename Container::value_type>, class Iter = typename Container::value_type *>
	Spanner<typename Container::value_type, Head, Iter> make_spanner(shared_ptr<Container> &cont_ptr)
	{
		shared_ptr<typename Container::value_type> head(cont_ptr, cont_ptr->data());
		size_t const size = cont_ptr->size();
		return Spanner<typename Container::value_type, Head, Iter>(size, head);
	}

	template <class T, class Head, class Iter>
	template <class Container>
	Spanner<T, Head, Iter>::Spanner(Container &container) :
		m_begin(container.data()),
		m_size(container.size())
	{
	}

	template <class T, class Head, class Iter>
	Spanner<T, Head, Iter>::Spanner(head_pointer begin, iterator end) :
		m_begin(begin),
		m_size(end - begin)
	{
	}

	template <class T, class Head, class Iter>
	Spanner<T, Head, Iter>::Spanner(size_t size, head_pointer begin) :
		m_begin(begin),
		m_size(size)
	{
	}

	template <class T, class Head, class Iter>
	typename Spanner<T, Head, Iter>::iterator
	Spanner<T, Head, Iter>::erase(iterator first, iterator last)
	{
		auto end = m_begin + m_size;
		if (first == m_begin) {
			THROW_CHECK0(invalid_argument, last <= end);
			// Shrink the view to exclude the erased prefix
			m_size -= last - first;
			m_begin = last;
			return last;
		}
		if (last == end) {
			THROW_CHECK0(invalid_argument, m_begin <= first);
			m_size = first - m_begin;
			return first;
		}
		THROW_ERROR(invalid_argument, "first != begin() and last != end()");
	}

	template <class T, class Head, class Iter>
	typename Spanner<T, Head, Iter>::iterator
	Spanner<T, Head, Iter>::erase(iterator first)
	{
		return erase(first, first + 1);
	}

	template <class T, class Head, class Iter>
	typename Spanner<T, Head, Iter>::value_type &
	Spanner<T, Head, Iter>::operator[](size_t n) const
	{
		return at(n);
	}

	template <class T, class Head, class Iter>
	void
	Spanner<T, Head, Iter>::clear()
	{
		m_begin = head_pointer();
		m_size = 0;
	}

	template <class T, class Head, class Iter>
	bool
	Spanner<T, Head, Iter>::empty() const
	{
		return m_size == 0;
	}

	template <class T, class Head, class Iter>
	size_t
	Spanner<T, Head, Iter>::size() const
	{
		return m_size;
	}

	template <class T, class Head, class Iter>
	typename Spanner<T, Head, Iter>::value_type *
	Spanner<T, Head, Iter>::data() const
	{
		return &(*m_begin);
	}

	template <class T, class Head, class Iter>
	typename Spanner<T, Head, Iter>::iterator
	Spanner<T, Head, Iter>::begin() const
	{
		return data();
	}

	template <class T, class Head, class Iter>
	typename Spanner<T, Head, Iter>::iterator
	Spanner<T, Head, Iter>::end() const
	{
		return data() + m_size;
	}

	template <class T, class Head, class Iter>
	typename Spanner<T, Head, Iter>::value_type &
	Spanner<T, Head, Iter>::at(size_t n) const
	{
		THROW_CHECK2(out_of_range, n, size(), n < size());
		return *(data() + n);
	}

}

#endif // CRUCIBLE_SPANNER_H
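A usage sketch for Spanner (values invented for illustration; relies on erase() shrinking the view as fixed above): the view borrows the container's storage, and erase() only narrows the view without touching the underlying vector.

	#include "crucible/spanner.h"

	#include <cassert>
	#include <vector>

	using namespace crucible;

	int main()
	{
		std::vector<int> v { 10, 20, 30, 40 };
		auto s = make_spanner(v);	// Spanner<int> over v's storage
		assert(s.size() == 4 && s[0] == 10);
		s.erase(s.begin());		// narrow the view from the front
		assert(s.size() == 3 && s[0] == 20);
		assert(v.size() == 4);		// v itself is unchanged
		return 0;
	}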
@@ -19,13 +19,13 @@ namespace crucible {
		memset(that, 0, sizeof(Base));
	}

	// Copy a base class object (usually a C struct) into a vector<char>
	// Copy a base class object (usually a C struct) into a vector<uint8_t>
	template <class Base>
	vector<char>
	vector<uint8_t>
	vector_copy_struct(Base *that)
	{
		const char *begin_that = reinterpret_cast<const char *>(static_cast<const Base *>(that));
		return vector<char>(begin_that, begin_that + sizeof(Base));
		const uint8_t *begin_that = reinterpret_cast<const uint8_t *>(static_cast<const Base *>(that));
		return vector<uint8_t>(begin_that, begin_that + sizeof(Base));
	}

	// int->hex conversion with sprintf
@@ -60,7 +60,7 @@ namespace crucible {
	ptrdiff_t
	pointer_distance(const P1 *a, const P2 *b)
	{
		return reinterpret_cast<const char *>(a) - reinterpret_cast<const char *>(b);
		return reinterpret_cast<const uint8_t *>(a) - reinterpret_cast<const uint8_t *>(b);
	}
};
@@ -13,6 +13,7 @@ namespace crucible {

	using TaskId = uint64_t;

	/// A unit of work to be scheduled by TaskMaster.
	class Task {
		shared_ptr<TaskState> m_task_state;

@@ -20,34 +21,46 @@ namespace crucible {

	public:

		// create empty Task object
		/// Create empty Task object.
		Task() = default;

		// create Task object containing closure and description
		/// Create Task object containing closure and description.
		Task(string title, function<void()> exec_fn);

		// schedule Task at end of queue.
		// May run Task in current thread or in other thread.
		// May run Task before or after returning.
		/// Schedule Task for at most one future execution.
		/// May run Task in current thread or in other thread.
		/// May run Task before or after returning.
		/// Schedules Task at the end of the global execution queue.
		///
		/// Only one instance of a Task may execute at a time.
		/// If a Task is already scheduled, run() does nothing.
		/// If a Task is already running when a new instance reaches
		/// the front of the queue, the new instance will execute
		/// after the current instance exits.
		void run() const;

		// schedule Task before other queued tasks
		void run_earlier() const;
		/// Schedule Task to run after this Task has run or
		/// been destroyed.
		void append(const Task &task) const;

		// describe Task as text
		/// Describe Task as text.
		string title() const;

		// Returns currently executing task if called from exec_fn.
		// Usually used to reschedule the currently executing Task.
		/// Returns currently executing task if called from exec_fn.
		/// Usually used to reschedule the currently executing Task.
		static Task current_task();

		// Ordering for containers
		/// Returns number of currently existing Task objects.
		/// Good for spotting leaks.
		static size_t instance_count();

		/// Ordering operator for containers
		bool operator<(const Task &that) const;

		// Null test
		/// Null test
		operator bool() const;

		// Unique non-repeating(ish) ID for task
		/// Unique non-repeating(ish) ID for task
		TaskId id() const;
	};

@@ -55,27 +68,35 @@ namespace crucible {

	class TaskMaster {
	public:
		// Blocks until the running thread count reaches this number
		/// Blocks until the running thread count reaches this number
		static void set_thread_count(size_t threads);

		// Sets minimum thread count when load average tracking enabled
		/// Sets minimum thread count when load average tracking enabled
		static void set_thread_min_count(size_t min_threads);

		// Calls set_thread_count with default
		/// Calls set_thread_count with default
		static void set_thread_count();

		// Creates thread to track load average and adjust thread count dynamically
		/// Creates thread to track load average and adjust thread count dynamically
		static void set_loadavg_target(double target);

		// Writes the current non-executing Task queue
		/// Writes the current non-executing Task queue
		static ostream & print_queue(ostream &);

		// Writes the current executing Task for each worker
		/// Writes the current executing Task for each worker
		static ostream & print_workers(ostream &);

		// Gets the current number of queued Tasks
		/// Gets the current number of queued Tasks
		static size_t get_queue_count();

		/// Gets the current number of active workers
		static size_t get_thread_count();

		/// Drop the current queue and discard new Tasks without
		/// running them.  Currently executing tasks are not
		/// affected (use set_thread_count(0) to wait for those
		/// to complete).
		static void cancel();
	};

	// Barrier executes waiting Tasks once the last BarrierLock
@@ -139,7 +160,7 @@ namespace crucible {

		Exclusion(shared_ptr<ExclusionState> pes);
	public:
		Exclusion();
		Exclusion(const string &title);

		// Attempt to obtain a Lock.  If successful, current Task
		// owns the Lock until the ExclusionLock is released
@@ -151,10 +172,9 @@ namespace crucible {
		// objects it holds, and exit its Task function.
		ExclusionLock try_lock();

		// Execute Task when Exclusion is unlocked (possibly immediately).
		// First Task is scheduled with run_earlier(), all others are
		// scheduled with run().
		void insert_task(Task t);
		// Execute Task when Exclusion is unlocked (possibly
		// immediately).
		void insert_task(Task t = Task::current_task());
	};

@@ -42,6 +42,7 @@ namespace crucible {
		RateLimiter(double rate, double burst);
		RateLimiter(double rate);
		void sleep_for(double cost = 1.0);
		double sleep_time(double cost = 1.0);
		bool is_ready();
		void borrow(double cost = 1.0);
	};
@@ -1,14 +0,0 @@
#ifndef CRUCIBLE_UUID_H
#define CRUCIBLE_UUID_H

#include <string>

#include <uuid/uuid.h>

namespace crucible {
	using namespace std;

	string uuid_unparse(const unsigned char a[16]);
}

#endif // CRUCIBLE_UUID_H
lib/Makefile | 27
@@ -1,10 +1,11 @@
TAG := $(shell git describe --always --dirty || echo UNKNOWN)
TAG ?= $(shell git describe --always --dirty || echo UNKNOWN)

default: libcrucible.so
%.so: Makefile
default: libcrucible.a
%.a: Makefile

CRUCIBLE_OBJS = \
	chatter.o \
	city.o \
	cleanup.o \
	crc64.o \
	error.o \
@@ -17,17 +18,21 @@ CRUCIBLE_OBJS = \
	string.o \
	task.o \
	time.o \
	uuid.o \

include ../makeflags
-include ../localconf
include ../Defines.mk

BEES_LDFLAGS = $(LDFLAGS)

configure.h: configure.h.in
	$(TEMPLATE_COMPILER)

.depends/%.dep: %.cc configure.h Makefile
	@mkdir -p .depends
	$(CXX) $(CXXFLAGS) -M -MF $@ -MT $(<:.cc=.o) $<
.depends:
	mkdir -p $@

.depends/%.dep: %.cc configure.h Makefile | .depends
	$(CXX) $(BEES_CXXFLAGS) -M -MF $@ -MT $(<:.cc=.o) $<

depends.mk: $(CRUCIBLE_OBJS:%.o=.depends/%.dep)
	cat $^ > $@.new
@@ -35,12 +40,12 @@ depends.mk: $(CRUCIBLE_OBJS:%.o=.depends/%.dep)

.version.cc: configure.h Makefile ../makeflags $(CRUCIBLE_OBJS:.o=.cc) ../include/crucible/*.h
	echo "namespace crucible { const char *VERSION = \"$(TAG)\"; }" > $@.new
	mv -f $@.new $@
	if ! cmp "$@.new" "$@"; then mv -fv $@.new $@; fi

include depends.mk

%.o: %.cc ../makeflags
	$(CXX) $(CXXFLAGS) -fPIC -o $@ -c $<
	$(CXX) $(BEES_CXXFLAGS) -o $@ -c $<

libcrucible.so: $(CRUCIBLE_OBJS) .version.o
	$(CXX) $(LDFLAGS) -fPIC -shared -Wl,-soname,$@ -o $@ $^ -luuid
libcrucible.a: $(CRUCIBLE_OBJS) .version.o
	$(AR) rcs $@ $^
@@ -18,6 +18,7 @@ namespace crucible {
	static shared_ptr<set<string>> chatter_names;
	static const char *SPACETAB = " \t";
	static bool add_prefix_timestamp = true;
	static bool add_prefix_level = true;

	static
	void
@@ -55,6 +56,12 @@ namespace crucible {
		add_prefix_timestamp = prefix_timestamp;
	}

	void
	Chatter::enable_level(bool prefix_level)
	{
		add_prefix_level = prefix_level;
	}

	Chatter::~Chatter()
	{
		ostringstream header_stream;
@@ -69,14 +76,19 @@ namespace crucible {
			DIE_IF_ZERO(strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &ltm));

			header_stream << buf;
			header_stream << " " << getpid() << "." << gettid() << "<" << m_loglevel << ">";
			header_stream << " " << getpid() << "." << crucible::gettid();
			if (add_prefix_level) {
				header_stream << "<" << m_loglevel << ">";
			}
			if (!m_name.empty()) {
				header_stream << " " << m_name;
			}
		} else {
			header_stream << "<" << m_loglevel << ">";
			if (add_prefix_level) {
				header_stream << "<" << m_loglevel << ">";
			}
			header_stream << (m_name.empty() ? "thread" : m_name);
			header_stream << "[" << gettid() << "]";
			header_stream << "[" << crucible::gettid() << "]";
		}

		header_stream << ": ";
@@ -124,6 +136,7 @@ namespace crucible {
		} else if (!chatter_names->empty()) {
			cerr << "CRUCIBLE_CHATTER does not list '" << m_file << "' or '" << m_pretty_function << "'" << endl;
		}
		(void)m_line; // not implemented yet
		// cerr << "ChatterBox " << reinterpret_cast<void*>(this) << " constructed" << endl;
	}

@@ -146,7 +159,7 @@ namespace crucible {

	ChatterUnwinder::~ChatterUnwinder()
	{
		if (uncaught_exception()) {
		if (current_exception()) {
			m_func();
		}
	}
lib/city.cc | 513 (new file)
@@ -0,0 +1,513 @@
// Copyright (c) 2011 Google, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// CityHash, by Geoff Pike and Jyrki Alakuijala
//
// This file provides CityHash64() and related functions.
//
// It's probably possible to create even faster hash functions by
// writing a program that systematically explores some of the space of
// possible hash functions, by using SIMD instructions, or by
// compromising on hash quality.

#include "crucible/city.h"

#include <algorithm>
#include <string.h>  // for memcpy and memset

using namespace std;

static uint64 UNALIGNED_LOAD64(const char *p) {
  uint64 result;
  memcpy(&result, p, sizeof(result));
  return result;
}

static uint32 UNALIGNED_LOAD32(const char *p) {
  uint32 result;
  memcpy(&result, p, sizeof(result));
  return result;
}

#ifdef _MSC_VER

#include <stdlib.h>
#define bswap_32(x) _byteswap_ulong(x)
#define bswap_64(x) _byteswap_uint64(x)

#elif defined(__APPLE__)

// Mac OS X / Darwin features
#include <libkern/OSByteOrder.h>
#define bswap_32(x) OSSwapInt32(x)
#define bswap_64(x) OSSwapInt64(x)

#elif defined(__sun) || defined(sun)

#include <sys/byteorder.h>
#define bswap_32(x) BSWAP_32(x)
#define bswap_64(x) BSWAP_64(x)

#elif defined(__FreeBSD__)

#include <sys/endian.h>
#define bswap_32(x) bswap32(x)
#define bswap_64(x) bswap64(x)

#elif defined(__OpenBSD__)

#include <sys/types.h>
#define bswap_32(x) swap32(x)
#define bswap_64(x) swap64(x)

#elif defined(__NetBSD__)

#include <sys/types.h>
#include <machine/bswap.h>
#if defined(__BSWAP_RENAME) && !defined(__bswap_32)
#define bswap_32(x) bswap32(x)
#define bswap_64(x) bswap64(x)
#endif

#else

#include <byteswap.h>

#endif

#ifdef WORDS_BIGENDIAN
#define uint32_in_expected_order(x) (bswap_32(x))
#define uint64_in_expected_order(x) (bswap_64(x))
#else
#define uint32_in_expected_order(x) (x)
#define uint64_in_expected_order(x) (x)
#endif

#if !defined(LIKELY)
#if HAVE_BUILTIN_EXPECT
#define LIKELY(x) (__builtin_expect(!!(x), 1))
#else
#define LIKELY(x) (x)
#endif
#endif

static uint64 Fetch64(const char *p) {
  return uint64_in_expected_order(UNALIGNED_LOAD64(p));
}

static uint32 Fetch32(const char *p) {
  return uint32_in_expected_order(UNALIGNED_LOAD32(p));
}

// Some primes between 2^63 and 2^64 for various uses.
static const uint64 k0 = 0xc3a5c85c97cb3127ULL;
static const uint64 k1 = 0xb492b66fbe98f273ULL;
static const uint64 k2 = 0x9ae16a3b2f90404fULL;

// Magic numbers for 32-bit hashing.  Copied from Murmur3.
static const uint32 c1 = 0xcc9e2d51;
static const uint32 c2 = 0x1b873593;

// A 32-bit to 32-bit integer hash copied from Murmur3.
static uint32 fmix(uint32 h)
{
  h ^= h >> 16;
  h *= 0x85ebca6b;
  h ^= h >> 13;
  h *= 0xc2b2ae35;
  h ^= h >> 16;
  return h;
}

static uint32 Rotate32(uint32 val, int shift) {
  // Avoid shifting by 32: doing so yields an undefined result.
  return shift == 0 ? val : ((val >> shift) | (val << (32 - shift)));
}

#undef PERMUTE3
#define PERMUTE3(a, b, c) do { std::swap(a, b); std::swap(a, c); } while (0)

static uint32 Mur(uint32 a, uint32 h) {
  // Helper from Murmur3 for combining two 32-bit values.
  a *= c1;
  a = Rotate32(a, 17);
  a *= c2;
  h ^= a;
  h = Rotate32(h, 19);
  return h * 5 + 0xe6546b64;
}

static uint32 Hash32Len13to24(const char *s, size_t len) {
  uint32 a = Fetch32(s - 4 + (len >> 1));
  uint32 b = Fetch32(s + 4);
  uint32 c = Fetch32(s + len - 8);
  uint32 d = Fetch32(s + (len >> 1));
  uint32 e = Fetch32(s);
  uint32 f = Fetch32(s + len - 4);
  uint32 h = len;

  return fmix(Mur(f, Mur(e, Mur(d, Mur(c, Mur(b, Mur(a, h)))))));
}

static uint32 Hash32Len0to4(const char *s, size_t len) {
  uint32 b = 0;
  uint32 c = 9;
  for (size_t i = 0; i < len; i++) {
    signed char v = s[i];
    b = b * c1 + v;
    c ^= b;
  }
  return fmix(Mur(b, Mur(len, c)));
}

static uint32 Hash32Len5to12(const char *s, size_t len) {
  uint32 a = len, b = len * 5, c = 9, d = b;
  a += Fetch32(s);
  b += Fetch32(s + len - 4);
  c += Fetch32(s + ((len >> 1) & 4));
  return fmix(Mur(c, Mur(b, Mur(a, d))));
}

uint32 CityHash32(const char *s, size_t len) {
  if (len <= 24) {
    return len <= 12 ?
        (len <= 4 ? Hash32Len0to4(s, len) : Hash32Len5to12(s, len)) :
        Hash32Len13to24(s, len);
  }

  // len > 24
  uint32 h = len, g = c1 * len, f = g;
  uint32 a0 = Rotate32(Fetch32(s + len - 4) * c1, 17) * c2;
  uint32 a1 = Rotate32(Fetch32(s + len - 8) * c1, 17) * c2;
  uint32 a2 = Rotate32(Fetch32(s + len - 16) * c1, 17) * c2;
  uint32 a3 = Rotate32(Fetch32(s + len - 12) * c1, 17) * c2;
  uint32 a4 = Rotate32(Fetch32(s + len - 20) * c1, 17) * c2;
  h ^= a0;
  h = Rotate32(h, 19);
  h = h * 5 + 0xe6546b64;
  h ^= a2;
  h = Rotate32(h, 19);
  h = h * 5 + 0xe6546b64;
  g ^= a1;
  g = Rotate32(g, 19);
  g = g * 5 + 0xe6546b64;
  g ^= a3;
  g = Rotate32(g, 19);
  g = g * 5 + 0xe6546b64;
  f += a4;
  f = Rotate32(f, 19);
  f = f * 5 + 0xe6546b64;
  size_t iters = (len - 1) / 20;
  do {
    uint32 a0 = Rotate32(Fetch32(s) * c1, 17) * c2;
    uint32 a1 = Fetch32(s + 4);
    uint32 a2 = Rotate32(Fetch32(s + 8) * c1, 17) * c2;
    uint32 a3 = Rotate32(Fetch32(s + 12) * c1, 17) * c2;
    uint32 a4 = Fetch32(s + 16);
    h ^= a0;
    h = Rotate32(h, 18);
    h = h * 5 + 0xe6546b64;
    f += a1;
    f = Rotate32(f, 19);
    f = f * c1;
    g += a2;
    g = Rotate32(g, 18);
    g = g * 5 + 0xe6546b64;
    h ^= a3 + a1;
    h = Rotate32(h, 19);
    h = h * 5 + 0xe6546b64;
    g ^= a4;
    g = bswap_32(g) * 5;
    h += a4 * 5;
    h = bswap_32(h);
    f += a0;
    PERMUTE3(f, h, g);
    s += 20;
  } while (--iters != 0);
  g = Rotate32(g, 11) * c1;
  g = Rotate32(g, 17) * c1;
  f = Rotate32(f, 11) * c1;
  f = Rotate32(f, 17) * c1;
  h = Rotate32(h + g, 19);
  h = h * 5 + 0xe6546b64;
  h = Rotate32(h, 17) * c1;
  h = Rotate32(h + f, 19);
  h = h * 5 + 0xe6546b64;
  h = Rotate32(h, 17) * c1;
  return h;
}

// Bitwise right rotate.  Normally this will compile to a single
// instruction, especially if the shift is a manifest constant.
static uint64 Rotate(uint64 val, int shift) {
  // Avoid shifting by 64: doing so yields an undefined result.
  return shift == 0 ? val : ((val >> shift) | (val << (64 - shift)));
}

static uint64 ShiftMix(uint64 val) {
  return val ^ (val >> 47);
}

static uint64 HashLen16(uint64 u, uint64 v) {
  return Hash128to64(uint128(u, v));
}

static uint64 HashLen16(uint64 u, uint64 v, uint64 mul) {
  // Murmur-inspired hashing.
  uint64 a = (u ^ v) * mul;
  a ^= (a >> 47);
  uint64 b = (v ^ a) * mul;
  b ^= (b >> 47);
  b *= mul;
  return b;
}

static uint64 HashLen0to16(const char *s, size_t len) {
  if (len >= 8) {
    uint64 mul = k2 + len * 2;
    uint64 a = Fetch64(s) + k2;
    uint64 b = Fetch64(s + len - 8);
    uint64 c = Rotate(b, 37) * mul + a;
    uint64 d = (Rotate(a, 25) + b) * mul;
    return HashLen16(c, d, mul);
  }
  if (len >= 4) {
    uint64 mul = k2 + len * 2;
    uint64 a = Fetch32(s);
    return HashLen16(len + (a << 3), Fetch32(s + len - 4), mul);
  }
  if (len > 0) {
    uint8 a = s[0];
    uint8 b = s[len >> 1];
    uint8 c = s[len - 1];
    uint32 y = static_cast<uint32>(a) + (static_cast<uint32>(b) << 8);
    uint32 z = len + (static_cast<uint32>(c) << 2);
    return ShiftMix(y * k2 ^ z * k0) * k2;
  }
  return k2;
}

// This probably works well for 16-byte strings as well, but it may be overkill
// in that case.
static uint64 HashLen17to32(const char *s, size_t len) {
  uint64 mul = k2 + len * 2;
  uint64 a = Fetch64(s) * k1;
  uint64 b = Fetch64(s + 8);
  uint64 c = Fetch64(s + len - 8) * mul;
  uint64 d = Fetch64(s + len - 16) * k2;
  return HashLen16(Rotate(a + b, 43) + Rotate(c, 30) + d,
                   a + Rotate(b + k2, 18) + c, mul);
}

// Return a 16-byte hash for 48 bytes.  Quick and dirty.
// Callers do best to use "random-looking" values for a and b.
static pair<uint64, uint64> WeakHashLen32WithSeeds(
    uint64 w, uint64 x, uint64 y, uint64 z, uint64 a, uint64 b) {
  a += w;
  b = Rotate(b + a + z, 21);
  uint64 c = a;
  a += x;
  a += y;
  b += Rotate(a, 44);
  return make_pair(a + z, b + c);
}

// Return a 16-byte hash for s[0] ... s[31], a, and b.  Quick and dirty.
static pair<uint64, uint64> WeakHashLen32WithSeeds(
    const char* s, uint64 a, uint64 b) {
  return WeakHashLen32WithSeeds(Fetch64(s),
                                Fetch64(s + 8),
                                Fetch64(s + 16),
                                Fetch64(s + 24),
                                a,
                                b);
}

// Return an 8-byte hash for 33 to 64 bytes.
static uint64 HashLen33to64(const char *s, size_t len) {
  uint64 mul = k2 + len * 2;
  uint64 a = Fetch64(s) * k2;
  uint64 b = Fetch64(s + 8);
  uint64 c = Fetch64(s + len - 24);
  uint64 d = Fetch64(s + len - 32);
  uint64 e = Fetch64(s + 16) * k2;
  uint64 f = Fetch64(s + 24) * 9;
  uint64 g = Fetch64(s + len - 8);
  uint64 h = Fetch64(s + len - 16) * mul;
  uint64 u = Rotate(a + g, 43) + (Rotate(b, 30) + c) * 9;
  uint64 v = ((a + g) ^ d) + f + 1;
  uint64 w = bswap_64((u + v) * mul) + h;
  uint64 x = Rotate(e + f, 42) + c;
  uint64 y = (bswap_64((v + w) * mul) + g) * mul;
  uint64 z = e + f + c;
  a = bswap_64((x + z) * mul + y) + b;
  b = ShiftMix((z + a) * mul + d + h) * mul;
  return b + x;
}

uint64 CityHash64(const char *s, size_t len) {
  if (len <= 32) {
    if (len <= 16) {
      return HashLen0to16(s, len);
    } else {
      return HashLen17to32(s, len);
    }
  } else if (len <= 64) {
    return HashLen33to64(s, len);
  }

  // For strings over 64 bytes we hash the end first, and then as we
  // loop we keep 56 bytes of state: v, w, x, y, and z.
  uint64 x = Fetch64(s + len - 40);
  uint64 y = Fetch64(s + len - 16) + Fetch64(s + len - 56);
  uint64 z = HashLen16(Fetch64(s + len - 48) + len, Fetch64(s + len - 24));
  pair<uint64, uint64> v = WeakHashLen32WithSeeds(s + len - 64, len, z);
  pair<uint64, uint64> w = WeakHashLen32WithSeeds(s + len - 32, y + k1, x);
  x = x * k1 + Fetch64(s);

  // Decrease len to the nearest multiple of 64, and operate on 64-byte chunks.
  len = (len - 1) & ~static_cast<size_t>(63);
  do {
    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
    x ^= w.second;
    y += v.first + Fetch64(s + 40);
    z = Rotate(z + w.first, 33) * k1;
    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
    std::swap(z, x);
    s += 64;
    len -= 64;
  } while (len != 0);
  return HashLen16(HashLen16(v.first, w.first) + ShiftMix(y) * k1 + z,
                   HashLen16(v.second, w.second) + x);
}

uint64 CityHash64WithSeed(const char *s, size_t len, uint64 seed) {
  return CityHash64WithSeeds(s, len, k2, seed);
}

uint64 CityHash64WithSeeds(const char *s, size_t len,
                           uint64 seed0, uint64 seed1) {
  return HashLen16(CityHash64(s, len) - seed0, seed1);
}

// A subroutine for CityHash128().  Returns a decent 128-bit hash for strings
// of any length representable in signed long.  Based on City and Murmur.
static uint128 CityMurmur(const char *s, size_t len, uint128 seed) {
  uint64 a = Uint128Low64(seed);
  uint64 b = Uint128High64(seed);
  uint64 c = 0;
  uint64 d = 0;
  signed long l = len - 16;
  if (l <= 0) {  // len <= 16
    a = ShiftMix(a * k1) * k1;
    c = b * k1 + HashLen0to16(s, len);
    d = ShiftMix(a + (len >= 8 ? Fetch64(s) : c));
  } else {  // len > 16
    c = HashLen16(Fetch64(s + len - 8) + k1, a);
    d = HashLen16(b + len, c + Fetch64(s + len - 16));
    a += d;
    do {
      a ^= ShiftMix(Fetch64(s) * k1) * k1;
      a *= k1;
      b ^= a;
      c ^= ShiftMix(Fetch64(s + 8) * k1) * k1;
      c *= k1;
      d ^= c;
      s += 16;
      l -= 16;
    } while (l > 0);
  }
  a = HashLen16(a, c);
  b = HashLen16(d, b);
  return uint128(a ^ b, HashLen16(b, a));
}

uint128 CityHash128WithSeed(const char *s, size_t len, uint128 seed) {
  if (len < 128) {
    return CityMurmur(s, len, seed);
  }

  // We expect len >= 128 to be the common case.  Keep 56 bytes of state:
  // v, w, x, y, and z.
  pair<uint64, uint64> v, w;
  uint64 x = Uint128Low64(seed);
  uint64 y = Uint128High64(seed);
  uint64 z = len * k1;
  v.first = Rotate(y ^ k1, 49) * k1 + Fetch64(s);
  v.second = Rotate(v.first, 42) * k1 + Fetch64(s + 8);
  w.first = Rotate(y + z, 35) * k1 + x;
  w.second = Rotate(x + Fetch64(s + 88), 53) * k1;

  // This is the same inner loop as CityHash64(), manually unrolled.
  do {
    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
    x ^= w.second;
    y += v.first + Fetch64(s + 40);
    z = Rotate(z + w.first, 33) * k1;
    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
    std::swap(z, x);
    s += 64;
    x = Rotate(x + y + v.first + Fetch64(s + 8), 37) * k1;
    y = Rotate(y + v.second + Fetch64(s + 48), 42) * k1;
    x ^= w.second;
    y += v.first + Fetch64(s + 40);
    z = Rotate(z + w.first, 33) * k1;
    v = WeakHashLen32WithSeeds(s, v.second * k1, x + w.first);
    w = WeakHashLen32WithSeeds(s + 32, z + w.second, y + Fetch64(s + 16));
    std::swap(z, x);
    s += 64;
    len -= 128;
  } while (LIKELY(len >= 128));
  x += Rotate(v.first + z, 49) * k0;
  y = y * k0 + Rotate(w.second, 37);
  z = z * k0 + Rotate(w.first, 27);
  w.first *= 9;
  v.first *= k0;
  // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s.
  for (size_t tail_done = 0; tail_done < len; ) {
    tail_done += 32;
    y = Rotate(x + y, 42) * k0 + v.second;
    w.first += Fetch64(s + len - tail_done + 16);
    x = x * k0 + w.first;
    z += w.second + Fetch64(s + len - tail_done);
    w.second += v.first;
    v = WeakHashLen32WithSeeds(s + len - tail_done, v.first + z, v.second);
    v.first *= k0;
  }
  // At this point our 56 bytes of state should contain more than
  // enough information for a strong 128-bit hash.  We use two
  // different 56-byte-to-8-byte hashes to get a 16-byte final result.
  x = HashLen16(x, v.first);
  y = HashLen16(y + z, w.first);
  return uint128(HashLen16(x + v.second, w.second) + y,
                 HashLen16(x + w.second, y + v.second));
}

uint128 CityHash128(const char *s, size_t len) {
  return len >= 16 ?
      CityHash128WithSeed(s + 16, len - 16,
                          uint128(Fetch64(s), Fetch64(s + 8) + k0)) :
      CityHash128WithSeed(s, len, uint128(k0, k1));
}
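A usage sketch for the new hash (input invented; assumes crucible/city.h declares the uint64 typedef this file uses): CityHash64() is the main entry point, taking a byte pointer and a length.

	#include "crucible/city.h"

	#include <cstring>
	#include <iostream>

	int main()
	{
		const char *msg = "hello world";
		const uint64 h = CityHash64(msg, strlen(msg));
		std::cout << std::hex << h << std::endl;
		return 0;
	}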
@@ -1,4 +1,4 @@
|
||||
#include <crucible/cleanup.h>
|
||||
#include "crucible/cleanup.h"
|
||||
|
||||
namespace crucible {
|
||||
|
||||
|
@@ -32,7 +32,7 @@ namespace crucible {
|
||||
|
||||
// FIXME: could probably avoid some of these levels of indirection
|
||||
static
|
||||
function<void(string s)> current_catch_explainer = [&](string s) {
|
||||
function<void(string s)> current_catch_explainer = [](string s) {
|
||||
cerr << s << endl;
|
||||
};
|
||||
|
||||
|
@@ -9,22 +9,37 @@
namespace crucible {
using namespace std;

const off_t ExtentWalker::sc_step_size;

// fm_start, fm_length, fm_flags, m_extents
// fe_logical, fe_physical, fe_length, fe_flags

static const off_t MAX_OFFSET = numeric_limits<off_t>::max();
static const off_t FIEMAP_BLOCK_SIZE = 4096;

static bool __ew_do_log = getenv("EXTENTWALKER_DEBUG");
// Maximum number of extents from TREE_SEARCH.
static const unsigned sc_extent_fetch_max = 16;

// Minimum number of extents from TREE_SEARCH.
// If we don't get this number, we'll binary search backward
// until we reach the beginning of the file or find at least this
// number of extents.
static const unsigned sc_extent_fetch_min = 4;

// This is a guess that tries to land at least one extent
// before the target extent, so we don't have to search backward as often.
static const off_t sc_back_step_size = 64 * 1024;

#ifdef EXTENTWALKER_DEBUG
#define EWLOG(x) do { \
if (__ew_do_log) { \
CHATTER(x); \
} \
m_log << x << endl; \
} while (0)

#define EWTRACE(x) do { \
CHATTER_UNWIND(x); \
} while (0)
#else
#define EWLOG(x) do {} while (0)
#define EWTRACE(x) do {} while (0)
#endif

ostream &
operator<<(ostream &os, const Extent &e)
{
@@ -42,9 +57,7 @@ namespace crucible {
if (e.m_flags & Extent::OBSCURED) {
os << "Extent::OBSCURED|";
}
if (e.m_flags & ~(Extent::HOLE | Extent::PREALLOC | Extent::OBSCURED)) {
os << fiemap_extent_flags_ntoa(e.m_flags & ~(Extent::HOLE | Extent::PREALLOC | Extent::OBSCURED));
}
os << fiemap_extent_flags_ntoa(e.m_flags & ~(Extent::HOLE | Extent::PREALLOC | Extent::OBSCURED));
if (e.m_physical_len) {
os << ", physical_len = " << to_hex(e.m_physical_len);
}
@@ -70,12 +83,16 @@ namespace crucible {
ostream &
operator<<(ostream &os, const ExtentWalker &ew)
{
return os << "ExtentWalker {"
os << "ExtentWalker {"
<< " fd = " << name_fd(ew.m_fd)
<< ", stat.st_size = " << to_hex(ew.m_stat.st_size)
<< ", extents = " << ew.m_extents
<< ", current = [" << ew.m_current - ew.m_extents.begin()
<< "] }";
<< "] ";
#ifdef EXTENTWALKER_DEBUG
os << "\nLog:\n" << ew.m_log.str() << "\nEnd log";
#endif
return os << "}";
}

Extent::operator bool() const
@@ -161,8 +178,7 @@ namespace crucible {
void
ExtentWalker::run_fiemap(off_t pos)
{
ostringstream log;
CHATTER_UNWIND("Log of run_fiemap: " << log.str());
EWTRACE("Log of run_fiemap: " << m_log.str());

EWLOG("pos = " << to_hex(pos));

@@ -170,18 +186,24 @@ namespace crucible {

Vec fm;

off_t step_size = pos;
off_t begin = pos - min(pos, sc_step_size);
// Start backward search by dropping lowest bit
off_t step_size = (pos > 0) ? (pos ^ (pos & (pos - 1))) * 2 : 0;

// Start first pass through loop just a little before the target extent,
// because the first iteration will be wasted if we have an exact match.
off_t begin = pos - min(pos, sc_back_step_size);

// This loop should not run forever
int loop_count = 0;
int loop_limit = 99;
const int loop_limit = 99;
while (true) {
if (loop_count == 90) {
EWLOG(log.str());
#ifdef EXTENTWALKER_DEBUG
if (loop_count >= loop_limit) {
cerr << "Too many loops!" << endl << m_log.str() << endl;
abort();
}

THROW_CHECK1(runtime_error, loop_count, loop_count < loop_limit);
#endif
THROW_CHECK2(runtime_error, *this, loop_count, loop_count < loop_limit);
++loop_count;

// Get file size every time in case it changes under us
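The new step-size expression is a standard bit trick: pos & (pos - 1) clears the lowest set bit of pos, so XORing the result back against pos leaves exactly that bit, which the code then doubles for the first backward step. A self-contained check:

// Verify the lowest-set-bit trick used above.
#include <cassert>
#include <cstdint>

int main() {
    const uint64_t pos = 0x12345000;
    const uint64_t cleared = pos & (pos - 1);   // clears the lowest set bit
    const uint64_t lowest = pos ^ cleared;      // leaves only that bit
    assert(cleared == 0x12344000);
    assert(lowest == 0x1000);
    return 0;
}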
@@ -189,7 +211,16 @@ namespace crucible {

// Get fiemap begin..EOF
fm = get_extent_map(begin);
EWLOG("fiemap result loop count #" << loop_count << ":" << fm);
EWLOG("fiemap result loop count #" << loop_count << " begin " << to_hex(begin) << " pos "
<< to_hex(pos) << " step_size " << to_hex(step_size) << ":\n" << fm);

// Sanity check on the data: in order, not overlapping, not empty, not before pos
off_t sanity_pos = begin;
for (auto const &i : fm) {
THROW_CHECK1(runtime_error, fm, i.begin() >= sanity_pos);
THROW_CHECK1(runtime_error, fm, i.end() > i.begin());
sanity_pos = i.end();
}

// This algorithm seeks at least three extents: one before,
// one after, and one containing pos. Files which contain
@@ -197,15 +228,15 @@ namespace crucible {
// so handle those cases separately.

// FIEMAP lies, and we catch it in a lie about the size of the
// second extent. To work around this, try getting more than 3.
// second extent. To work around this, sc_extent_fetch_min is at least 4.

// 0..2(ish) extents
if (fm.size() < sc_extent_fetch_min) {
// If we are not at beginning of file, move backward
// If we are not at beginning of file, move backward by zeroing the lowest bit
if (begin > 0) {
step_size /= 2;
step_size = (begin > 0) ? (begin ^ (begin & (begin - 1))) : 0;
auto next_begin = (begin - min(step_size, begin)) & ~(FIEMAP_BLOCK_SIZE - 1);
EWLOG("step backward " << to_hex(begin) << " -> " << to_hex(next_begin) << " extents size " << fm.size());
EWLOG("step backward " << to_hex(begin) << " -> " << to_hex(next_begin));
if (begin == next_begin) {
EWLOG("step backward stopped");
break;
@@ -233,18 +264,18 @@ namespace crucible {
// We have at least three extents, so there is now a first and last.
// We want pos to be between first and last. There doesn't have
// to be an extent between these (it could be a hole).
auto &first_extent = fm.at(sc_extent_fetch_min - 2);
auto &first_extent = *fm.begin();
auto &last_extent = *fm.rbegin();
EWLOG("first_extent = " << first_extent);
EWLOG("last_extent = " << last_extent);

// First extent must end on or before pos
// First extent must end on or before pos; otherwise, go further back
if (first_extent.end() > pos) {
// Can we move backward?
if (begin > 0) {
step_size /= 2;
step_size = (begin > 0) ? (begin ^ (begin & (begin - 1))) : 0;
auto next_begin = (begin - min(step_size, begin)) & ~(FIEMAP_BLOCK_SIZE - 1);
EWLOG("step backward " << to_hex(begin) << " -> " << to_hex(next_begin) << " extents size " << fm.size());
EWLOG("step backward " << to_hex(begin) << " -> " << to_hex(next_begin));
if (begin == next_begin) {
EWLOG("step backward stopped");
break;
@@ -254,38 +285,29 @@ namespace crucible {
}

// We are as far back as we can go, so there must be no
// extent before pos (i.e. file starts with a hole).
// extent before pos (i.e. file starts with a hole
// or first extent starts at pos 0).
EWLOG("no extent before pos");
break;
}

// First extent ends on or before pos.

// If last extent is EOF then we have the entire file in the buffer.
// If last extent is EOF then we cannot move any further forward.
// pos could be in last extent, so skip the later checks that
// insist pos be located prior to the last extent.
if (last_extent.flags() & FIEMAP_EXTENT_LAST) {
break;
}

// Don't have EOF, must have an extent after pos.
// Don't have EOF, must have an extent after pos; otherwise, go forward
if (last_extent.begin() <= pos) {
// Set the bit just below the one we last cleared
step_size /= 2;
auto new_begin = (begin + step_size) & ~(FIEMAP_BLOCK_SIZE - 1);
auto new_begin = (begin + max(FIEMAP_BLOCK_SIZE, step_size)) & ~(FIEMAP_BLOCK_SIZE - 1);
EWLOG("step forward " << to_hex(begin) << " -> " << to_hex(new_begin));
if (begin == new_begin) {
EWLOG("step forward stopped");
break;
}
begin = new_begin;
continue;
}

// Last extent begins after pos, first extent ends on or before pos.
// All other cases should have been handled before here.
THROW_CHECK2(runtime_error, pos, first_extent, first_extent.end() <= pos);
THROW_CHECK2(runtime_error, pos, last_extent, last_extent.begin() > pos);

// We should probably stop now
break;
}
@@ -300,6 +322,11 @@ namespace crucible {
while (fmi != fm.end()) {
Extent new_extent(*fmi);
THROW_CHECK2(runtime_error, ipos, new_extent.m_begin, ipos <= new_extent.m_begin);
// Don't map extents past EOF, we can't read them
if (new_extent.m_begin >= m_stat.st_size) {
last_extent_is_last = true;
break;
}
if (new_extent.m_begin > ipos) {
Extent hole_extent;
hole_extent.m_begin = ipos;
@@ -327,13 +354,13 @@ namespace crucible {
hole_extent.m_flags |= FIEMAP_EXTENT_LAST;
}
new_vec.push_back(hole_extent);
ipos += new_vec.size();
ipos += hole_extent.size();
}
// Extent list must now be non-empty, at least a hole
THROW_CHECK1(runtime_error, new_vec.size(), !new_vec.empty());

// Allow last extent to extend beyond desired range (e.g. at EOF)
// ...but that's not what this does
// THROW_CHECK3(runtime_error, ipos, new_vec.rbegin()->m_end, m_stat.st_size, ipos <= new_vec.rbegin()->m_end);
// ipos must match end of last extent
THROW_CHECK3(runtime_error, ipos, new_vec.rbegin()->m_end, m_stat.st_size, ipos == new_vec.rbegin()->m_end);

// If we have the last extent in the file, truncate it to the file size.
if (ipos >= m_stat.st_size) {
@@ -342,9 +369,10 @@ namespace crucible {
new_vec.rbegin()->m_end = m_stat.st_size;
}

// Verify contiguous, ascending order, at least one Extent
// Verify at least one Extent
THROW_CHECK1(runtime_error, new_vec, !new_vec.empty());

// Verify contiguous, ascending order, only extent with FIEMAP_EXTENT_LAST flag is the last extent
ipos = new_vec.begin()->m_begin;
bool last_flag_last = false;
for (auto e : new_vec) {
@@ -354,7 +382,6 @@ namespace crucible {
ipos += e.size();
last_flag_last = e.m_flags & FIEMAP_EXTENT_LAST;
}
THROW_CHECK1(runtime_error, new_vec, !last_extent_is_last || new_vec.rbegin()->m_end == ipos);

m_extents = new_vec;
m_current = m_extents.begin();
@@ -370,7 +397,7 @@ namespace crucible {
void
ExtentWalker::seek(off_t pos)
{
CHATTER_UNWIND("seek " << to_hex(pos));
EWTRACE("seek " << to_hex(pos));
THROW_CHECK1(out_of_range, pos, pos >= 0);
Itr rv = find_in_cache(pos);
if (rv != m_extents.end()) {
@@ -379,29 +406,28 @@ namespace crucible {
}
run_fiemap(pos);
m_current = find_in_cache(pos);
THROW_CHECK2(runtime_error, *this, to_hex(pos), m_current != m_extents.end());
}

Extent
ExtentWalker::current()
{
THROW_CHECK2(invalid_argument, *this, m_extents.size(), m_current != m_extents.end());
CHATTER_UNWIND("current " << *m_current);
return *m_current;
}

bool
ExtentWalker::next()
{
CHATTER_UNWIND("next");
EWTRACE("next");
THROW_CHECK1(invalid_argument, (m_current != m_extents.end()), m_current != m_extents.end());
if (current().m_end >= m_stat.st_size) {
CHATTER_UNWIND("next EOF");
EWTRACE("next EOF");
return false;
}
auto next_pos = current().m_end;
if (next_pos >= m_stat.st_size) {
CHATTER_UNWIND("next next_pos = " << next_pos << " m_stat.st_size = " << m_stat.st_size);
EWTRACE("next next_pos = " << next_pos << " m_stat.st_size = " << m_stat.st_size);
return false;
}
seek(next_pos);
@@ -419,16 +445,16 @@ namespace crucible {
bool
ExtentWalker::prev()
{
CHATTER_UNWIND("prev");
EWTRACE("prev");
THROW_CHECK1(invalid_argument, (m_current != m_extents.end()), m_current != m_extents.end());
auto prev_iter = m_current;
if (prev_iter->m_begin == 0) {
CHATTER_UNWIND("prev BOF");
EWTRACE("prev BOF");
return false;
}
THROW_CHECK1(invalid_argument, (prev_iter != m_extents.begin()), prev_iter != m_extents.begin());
--prev_iter;
CHATTER_UNWIND("prev seeking to " << *prev_iter << "->m_begin");
EWTRACE("prev seeking to " << *prev_iter << "->m_begin");
auto prev_end = current().m_begin;
seek(prev_iter->m_begin);
THROW_CHECK1(runtime_error, (m_current != m_extents.end()), m_current != m_extents.end());
@@ -470,7 +496,7 @@ namespace crucible {
BtrfsExtentWalker::Vec
BtrfsExtentWalker::get_extent_map(off_t pos)
{
BtrfsIoctlSearchKey sk(sc_extent_fetch_max * (sizeof(btrfs_file_extent_item) + sizeof(btrfs_ioctl_search_header)));
BtrfsIoctlSearchKey sk(65536);
if (!m_root_fd) {
m_root_fd = m_fd;
}
@@ -487,7 +513,7 @@ namespace crucible {
sk.min_type = sk.max_type = BTRFS_EXTENT_DATA_KEY;
sk.nr_items = sc_extent_fetch_max;

CHATTER_UNWIND("sk " << sk << " root_fd " << name_fd(m_root_fd));
EWTRACE("sk " << sk << " root_fd " << name_fd(m_root_fd));
sk.do_ioctl(m_root_fd);

Vec rv;
@@ -513,20 +539,20 @@ namespace crucible {

Extent e;
e.m_begin = i.offset;
auto compressed = call_btrfs_get(btrfs_stack_file_extent_compression, i.m_data);
auto compressed = btrfs_get_member(&btrfs_file_extent_item::compression, i.m_data);
// FIEMAP told us about compressed extents and we can too
if (compressed) {
e.m_flags |= FIEMAP_EXTENT_ENCODED;
}

auto type = call_btrfs_get(btrfs_stack_file_extent_type, i.m_data);
auto type = btrfs_get_member(&btrfs_file_extent_item::type, i.m_data);
off_t len = -1;
switch (type) {
default:
cerr << "Unhandled file extent type " << type << " in root " << m_tree_id << " ino " << m_stat.st_ino << endl;
break;
case BTRFS_FILE_EXTENT_INLINE:
len = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data));
len = ranged_cast<off_t>(btrfs_get_member(&btrfs_file_extent_item::ram_bytes, i.m_data));
e.m_flags |= FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED;
// Inline extents are never obscured, so don't bother filling in m_physical_len, etc.
break;
@@ -534,17 +560,17 @@ namespace crucible {
e.m_flags |= Extent::PREALLOC;
// fallthrough
case BTRFS_FILE_EXTENT_REG: {
e.m_physical = call_btrfs_get(btrfs_stack_file_extent_disk_bytenr, i.m_data);
e.m_physical = btrfs_get_member(&btrfs_file_extent_item::disk_bytenr, i.m_data);

// This is the length of the full extent (decompressed)
off_t ram = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data));
off_t ram = ranged_cast<off_t>(btrfs_get_member(&btrfs_file_extent_item::ram_bytes, i.m_data));

// This is the length of the part of the extent appearing in the file (decompressed)
len = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_num_bytes, i.m_data));
len = ranged_cast<off_t>(btrfs_get_member(&btrfs_file_extent_item::num_bytes, i.m_data));

// This is the offset from start of on-disk extent to the part we see in the file (decompressed)
// May be negative due to the kind of bug we're stuck with forever, so no cast range check
off_t offset = call_btrfs_get(btrfs_stack_file_extent_offset, i.m_data);
off_t offset = btrfs_get_member(&btrfs_file_extent_item::offset, i.m_data);

// If there is a physical address there must be size too
if (e.m_physical) {
@@ -597,7 +623,7 @@ namespace crucible {
e.m_flags |= FIEMAP_EXTENT_LAST;
}
// FIXME: no FIEMAP_EXTENT_SHARED
// WONTFIX: non-trivial to replicate LOGIAL_INO
// WONTFIX: non-trivial to replicate LOGICAL_INO
rv.push_back(e);
}
}
@@ -613,6 +639,7 @@ namespace crucible {
ExtentWalker::Vec
ExtentWalker::get_extent_map(off_t pos)
{
EWLOG("get_extent_map(" << to_hex(pos) << ")");
Fiemap fm;
fm.fm_start = ranged_cast<uint64_t>(pos);
fm.fm_length = ranged_cast<uint64_t>(numeric_limits<off_t>::max() - pos);
@@ -626,7 +653,9 @@ namespace crucible {
e.m_physical = i.fe_physical;
e.m_flags = i.fe_flags;
rv.push_back(e);
EWLOG("push_back(" << e << ")");
}
EWLOG("get_extent_map(" << to_hex(pos) << ") returning " << rv.size() << " extents");
return rv;
}

58 lib/fd.cc
@@ -107,12 +107,6 @@ namespace crucible {
}
}

IOHandle::IOHandle() :
m_fd(-1)
{
CHATTER_TRACE("open fd " << m_fd << " in " << this);
}

IOHandle::IOHandle(int fd) :
m_fd(fd)
{
@@ -120,12 +114,52 @@ namespace crucible {
}

int
IOHandle::release_fd()
IOHandle::get_fd() const
{
CHATTER_TRACE("release fd " << m_fd << " in " << this);
int rv = m_fd;
m_fd = -1;
return rv;
return m_fd;
}

NamedPtr<IOHandle, int> Fd::s_named_ptr([](int fd) { return make_shared<IOHandle>(fd); });

Fd::Fd() :
m_handle(s_named_ptr(-1))
{
}

Fd::Fd(int fd) :
m_handle(s_named_ptr(fd < 0 ? -1 : fd))
{
}

Fd &
Fd::operator=(int const fd)
{
m_handle = s_named_ptr(fd < 0 ? -1 : fd);
return *this;
}

Fd &
Fd::operator=(const shared_ptr<IOHandle> &handle)
{
m_handle = s_named_ptr.insert(handle, handle->get_fd());
return *this;
}

Fd::operator int() const
{
return m_handle->get_fd();
}

bool
Fd::operator!() const
{
return m_handle->get_fd() < 0;
}

shared_ptr<IOHandle>
Fd::operator->() const
{
return m_handle;
}

// XXX: necessary? useful?
@@ -529,6 +563,8 @@ namespace crucible {
THROW_ERROR(runtime_error, "readlink: maximum buffer size exceeded");
}

static string __relative_path;

string
relative_path()
{
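A sketch of how the reworked Fd behaves from the caller's side, assuming crucible/fd.h as refactored above; the shared IOHandle owned through NamedPtr is what ultimately closes the descriptor:

// Fd is a value type: all copies of the same descriptor share one IOHandle.
#include "crucible/fd.h"
#include <fcntl.h>

using namespace crucible;

void fd_example() {
    Fd fd(open("/etc/hostname", O_RDONLY));
    if (!fd) {                 // operator! tests for an invalid descriptor
        return;
    }
    const int raw = fd;        // operator int() yields the raw fd number
    (void)raw;
    Fd copy = fd;              // copies share the same IOHandle
}                              // descriptor closes when the last owner is gone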
234 lib/fs.cc
@@ -5,7 +5,6 @@
#include "crucible/limits.h"
#include "crucible/ntoa.h"
#include "crucible/string.h"
#include "crucible/uuid.h"

// FS_IOC_FIEMAP
#include <linux/fs.h>
@@ -136,56 +135,11 @@ namespace crucible {
DIE_IF_MINUS_ONE(ioctl(dst_fd, BTRFS_IOC_CLONE_RANGE, &args));
}

// Userspace emulation of extent-same ioctl to work around kernel bugs
// (a memory leak, a deadlock, inability to cope with unaligned EOF, and a length limit)
// The emulation is incomplete: no locking, and we always change ctime
void
BtrfsExtentSameByClone::do_ioctl()
{
if (length <= 0) {
throw out_of_range(string("length = 0 in ") + __PRETTY_FUNCTION__);
}
vector<char> cmp_buf_common(length);
vector<char> cmp_buf_iter(length);
pread_or_die(m_fd, cmp_buf_common.data(), length, logical_offset);
for (auto i = m_info.begin(); i != m_info.end(); ++i) {
i->status = -EIO;
i->bytes_deduped = 0;

// save atime/ctime for later
Stat target_stat(i->fd);

pread_or_die(i->fd, cmp_buf_iter.data(), length, i->logical_offset);
if (cmp_buf_common == cmp_buf_iter) {

// This never happens, so stop checking.
// assert(!memcmp(cmp_buf_common.data(), cmp_buf_iter.data(), length));

btrfs_clone_range(m_fd, logical_offset, length, i->fd, i->logical_offset);
i->status = 0;
i->bytes_deduped = length;

// The extent-same ioctl does not change mtime (as of patch v4)
struct timespec restore_ts[2] = {
target_stat.st_atim,
target_stat.st_mtim
};

// Ignore futimens failure as the real extent-same ioctl would never raise it
futimens(i->fd, restore_ts);

} else {
assert(memcmp(cmp_buf_common.data(), cmp_buf_iter.data(), length));
i->status = BTRFS_SAME_DATA_DIFFERS;
}
}
}

void
BtrfsExtentSame::do_ioctl()
{
dest_count = m_info.size();
vector<char> ioctl_arg = vector_copy_struct<btrfs_ioctl_same_args>(this);
vector<uint8_t> ioctl_arg = vector_copy_struct<btrfs_ioctl_same_args>(this);
ioctl_arg.resize(sizeof(btrfs_ioctl_same_args) + dest_count * sizeof(btrfs_ioctl_same_extent_info), 0);
btrfs_ioctl_same_args *ioctl_ptr = reinterpret_cast<btrfs_ioctl_same_args *>(ioctl_arg.data());
size_t count = 0;
@@ -233,16 +187,18 @@ namespace crucible {
}

BtrfsDataContainer::BtrfsDataContainer(size_t buf_size) :
m_data(buf_size, 0)
m_data(buf_size)
{
}

void *
BtrfsDataContainer::prepare()
BtrfsDataContainer::prepare(size_t container_size)
{
if (m_data.size() < container_size) {
m_data.resize(container_size);
}
btrfs_data_container *p = reinterpret_cast<btrfs_data_container *>(m_data.data());
size_t min_size = offsetof(btrfs_data_container, val);
size_t container_size = m_data.size();
const size_t min_size = offsetof(btrfs_data_container, val);
if (container_size < min_size) {
THROW_ERROR(out_of_range, "container size " << container_size << " smaller than minimum " << min_size);
}
@@ -301,33 +257,123 @@ namespace crucible {
}

BtrfsIoctlLogicalInoArgs::BtrfsIoctlLogicalInoArgs(uint64_t new_logical, size_t new_size) :
m_container_size(new_size),
m_container(new_size)
{
memset_zero<btrfs_ioctl_logical_ino_args>(this);
logical = new_logical;
}

size_t
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::size() const
{
return m_end - m_begin;
}

BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::const_iterator
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::cbegin() const
{
return m_begin;
}

BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::const_iterator
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::cend() const
{
return m_end;
}

BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::iterator
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::begin() const
{
return m_begin;
}

BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::iterator
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::end() const
{
return m_end;
}

BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::iterator
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::data() const
{
return m_begin;
}

BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::operator vector<BtrfsInodeOffsetRoot>() const
{
return vector<BtrfsInodeOffsetRoot>(m_begin, m_end);
}

void
BtrfsIoctlLogicalInoArgs::BtrfsInodeOffsetRootSpan::clear()
{
m_end = m_begin = nullptr;
}

void
BtrfsIoctlLogicalInoArgs::set_flags(uint64_t new_flags)
{
// We are still supporting building with old headers that don't have .flags yet
*(&reserved[0] + 3) = new_flags;
}

uint64_t
BtrfsIoctlLogicalInoArgs::get_flags() const
{
// We are still supporting building with old headers that don't have .flags yet
return *(&reserved[0] + 3);
}

bool
BtrfsIoctlLogicalInoArgs::do_ioctl_nothrow(int fd)
{
btrfs_ioctl_logical_ino_args *p = static_cast<btrfs_ioctl_logical_ino_args *>(this);
inodes = reinterpret_cast<uint64_t>(m_container.prepare());
inodes = reinterpret_cast<uint64_t>(m_container.prepare(m_container_size));
size = m_container.get_size();

m_iors.clear();

if (ioctl(fd, BTRFS_IOC_LOGICAL_INO, p)) {
return false;
static unsigned long bili_version = 0;

if (get_flags() == 0) {
// Could use either V1 or V2
if (bili_version) {
// We tested both versions and came to a decision
if (ioctl(fd, bili_version, p)) {
return false;
}
} else {
// Try V2
if (ioctl(fd, BTRFS_IOC_LOGICAL_INO_V2, p)) {
// V2 failed, try again with V1
if (ioctl(fd, BTRFS_IOC_LOGICAL_INO, p)) {
// both V1 and V2 failed, doesn't tell us which one to choose
return false;
}
// V1 and V2 both tested with same arguments, V1 OK, and V2 failed
bili_version = BTRFS_IOC_LOGICAL_INO;
} else {
// V2 succeeded, don't use V1 any more
bili_version = BTRFS_IOC_LOGICAL_INO_V2;
}
}
} else {
// Flags/size require a V2 feature, no fallback to V1 possible
if (ioctl(fd, BTRFS_IOC_LOGICAL_INO_V2, p)) {
return false;
}
// V2 succeeded so we don't need to probe any more
bili_version = BTRFS_IOC_LOGICAL_INO_V2;
}

btrfs_data_container *bdc = reinterpret_cast<btrfs_data_container *>(p->inodes);
BtrfsInodeOffsetRoot *input_iter = reinterpret_cast<BtrfsInodeOffsetRoot *>(bdc->val);
m_iors.reserve(bdc->elem_cnt);

for (auto count = bdc->elem_cnt; count > 2; count -= 3) {
m_iors.push_back(*input_iter++);
}

// elem_cnt counts uint64_t, but BtrfsInodeOffsetRoot is 3x uint64_t
THROW_CHECK1(runtime_error, bdc->elem_cnt, bdc->elem_cnt % 3 == 0);
m_iors.m_begin = input_iter;
m_iors.m_end = input_iter + bdc->elem_cnt / 3;
return true;
}

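The probe logic above generalizes to a small pattern: try the newer ioctl once, fall back to the older one, and cache whichever worked in a function-local static so later calls skip the probe. A generic sketch with hypothetical request codes (not the real btrfs constants):

// Probe-once-then-cache version fallback, generalized.
#include <sys/ioctl.h>

template <class Arg>
bool ioctl_v2_with_fallback(int fd, unsigned long v2_req, unsigned long v1_req, Arg *arg) {
    static unsigned long cached = 0;           // 0 = not probed yet
    if (cached) {
        return ioctl(fd, cached, arg) == 0;    // reuse the known-good version
    }
    if (ioctl(fd, v2_req, arg) == 0) {
        cached = v2_req;                       // V2 works, stop probing
        return true;
    }
    if (ioctl(fd, v1_req, arg) == 0) {
        cached = v1_req;                       // V1 works and V2 does not
        return true;
    }
    return false;                              // neither worked; probe again next call
}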
@@ -350,7 +396,7 @@ namespace crucible {
}

BtrfsIoctlInoPathArgs::BtrfsIoctlInoPathArgs(uint64_t inode, size_t new_size) :
m_container(new_size)
m_container_size(new_size)
{
memset_zero<btrfs_ioctl_ino_path_args>(this);
inum = inode;
@@ -360,8 +406,9 @@ namespace crucible {
BtrfsIoctlInoPathArgs::do_ioctl_nothrow(int fd)
{
btrfs_ioctl_ino_path_args *p = static_cast<btrfs_ioctl_ino_path_args *>(this);
fspath = reinterpret_cast<uint64_t>(m_container.prepare());
size = m_container.get_size();
BtrfsDataContainer container(m_container_size);
fspath = reinterpret_cast<uint64_t>(container.prepare(m_container_size));
size = container.get_size();

m_paths.clear();

@@ -377,8 +424,8 @@ namespace crucible {

for (auto count = bdc->elem_cnt; count > 0; --count) {
const char *path = cp + *up++;
if (static_cast<size_t>(path - cp) > m_container.get_size()) {
THROW_ERROR(out_of_range, "offset " << (path - cp) << " > size " << m_container.get_size() << " in " << __PRETTY_FUNCTION__);
if (static_cast<size_t>(path - cp) > container.get_size()) {
THROW_ERROR(out_of_range, "offset " << (path - cp) << " > size " << container.get_size() << " in " << __PRETTY_FUNCTION__);
}
m_paths.push_back(string(path));
}
@@ -629,7 +676,7 @@ namespace crucible {
THROW_CHECK1(out_of_range, m_min_count, m_min_count <= m_max_count);

auto extent_count = m_min_count;
vector<char> ioctl_arg = vector_copy_struct<fiemap>(this);
vector<uint8_t> ioctl_arg = vector_copy_struct<fiemap>(this);

ioctl_arg.resize(sizeof(fiemap) + extent_count * sizeof(fiemap_extent), 0);

@@ -698,22 +745,16 @@ namespace crucible {
}

size_t
BtrfsIoctlSearchHeader::set_data(const vector<char> &v, size_t offset)
BtrfsIoctlSearchHeader::set_data(const vector<uint8_t> &v, size_t offset)
{
THROW_CHECK2(invalid_argument, offset, v.size(), offset + sizeof(btrfs_ioctl_search_header) <= v.size());
*static_cast<btrfs_ioctl_search_header *>(this) = *reinterpret_cast<const btrfs_ioctl_search_header *>(&v[offset]);
memcpy(static_cast<btrfs_ioctl_search_header *>(this), &v[offset], sizeof(btrfs_ioctl_search_header));
offset += sizeof(btrfs_ioctl_search_header);
THROW_CHECK2(invalid_argument, offset + len, v.size(), offset + len <= v.size());
m_data = vector<char>(&v[offset], &v[offset + len]);
m_data = Spanner<const uint8_t>(&v[offset], &v[offset + len]);
return offset + len;
}

bool
BtrfsIoctlSearchHeader::operator<(const BtrfsIoctlSearchHeader &that) const
{
return tie(objectid, type, offset, len, transid) < tie(that.objectid, that.type, that.offset, that.len, that.transid);
}

bool
BtrfsIoctlSearchKey::do_ioctl_nothrow(int fd)
{
@@ -723,15 +764,12 @@ namespace crucible {
// Keep the ioctl buffer from one run to the next to save on malloc costs
size_t target_buf_size = sizeof(btrfs_ioctl_search_args_v2) + m_buf_size;

thread_local vector<char> ioctl_arg;
if (ioctl_arg.size() < m_buf_size) {
ioctl_arg = vector_copy_struct<btrfs_ioctl_search_key>(this);
ioctl_arg.resize(target_buf_size);
} else {
memcpy(ioctl_arg.data(), static_cast<btrfs_ioctl_search_key*>(this), sizeof(btrfs_ioctl_search_key));
}
m_ioctl_arg = vector_copy_struct<btrfs_ioctl_search_key>(this);
m_ioctl_arg.resize(target_buf_size);

btrfs_ioctl_search_args_v2 *ioctl_ptr = reinterpret_cast<btrfs_ioctl_search_args_v2 *>(ioctl_arg.data());
m_result.clear();

btrfs_ioctl_search_args_v2 *ioctl_ptr = reinterpret_cast<btrfs_ioctl_search_args_v2 *>(m_ioctl_arg.data());

ioctl_ptr->buf_size = m_buf_size;

@@ -743,12 +781,10 @@ namespace crucible {

static_cast<btrfs_ioctl_search_key&>(*this) = ioctl_ptr->key;

m_result.clear();

size_t offset = pointer_distance(ioctl_ptr->buf, ioctl_ptr);
for (decltype(nr_items) i = 0; i < nr_items; ++i) {
BtrfsIoctlSearchHeader item;
offset = item.set_data(ioctl_arg, offset);
offset = item.set_data(m_ioctl_arg, offset);
m_result.insert(item);
}

@@ -775,14 +811,16 @@ namespace crucible {
}
}

ostream &hexdump(ostream &os, const vector<char> &v)
template <class V>
ostream &
hexdump(ostream &os, const V &v)
{
os << "vector<char> { size = " << v.size() << ", data:\n";
os << "vector<uint8_t> { size = " << v.size() << ", data:\n";
for (size_t i = 0; i < v.size(); i += 8) {
string hex, ascii;
for (size_t j = i; j < i + 8; ++j) {
if (j < v.size()) {
unsigned char c = v[j];
uint8_t c = v[j];
char buf[8];
sprintf(buf, "%02x ", c);
hex += buf;
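A sketch of calling the now-templated hexdump — assuming its declaration lives in a crucible header; any container with size() and operator[] qualifies after this change:

// Dump a byte vector, 8 bytes per row, hex plus ASCII.
#include <cstdint>
#include <iostream>
#include <vector>

void dump_example() {
    std::vector<uint8_t> v { 0x62, 0x65, 0x65, 0x73, 0x00, 0xff };
    crucible::hexdump(std::cerr, v);
}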
@@ -979,7 +1017,7 @@ namespace crucible {
}

if (i.objectid == root_id && i.type == BTRFS_ROOT_ITEM_KEY) {
rv = max(rv, uint64_t(call_btrfs_get(btrfs_root_generation, i.m_data)));
rv = max(rv, uint64_t(btrfs_get_member(&btrfs_root_item::generation, i.m_data)));
}
}
if (sk.min_offset < numeric_limits<decltype(sk.min_offset)>::max()) {
@@ -1032,7 +1070,6 @@ namespace crucible {
os << "BtrfsIoctlFsInfoArgs {"
<< " max_id = " << a.max_id << ","
<< " num_devices = " << a.num_devices << ","
<< " fsid = " << a.uuid() << ","
#if 0
<< " nodesize = " << a.nodesize << ","
<< " sectorsize = " << a.sectorsize << ","
@@ -1047,22 +1084,29 @@ namespace crucible {

BtrfsIoctlFsInfoArgs::BtrfsIoctlFsInfoArgs()
{
memset_zero<btrfs_ioctl_fs_info_args>(this);
memset_zero<btrfs_ioctl_fs_info_args_v2>(this);
flags = BTRFS_FS_INFO_FLAG_CSUM_INFO;
}

void
BtrfsIoctlFsInfoArgs::do_ioctl(int fd)
{
btrfs_ioctl_fs_info_args *p = static_cast<btrfs_ioctl_fs_info_args *>(this);
btrfs_ioctl_fs_info_args_v2 *p = static_cast<btrfs_ioctl_fs_info_args_v2 *>(this);
if (ioctl(fd, BTRFS_IOC_FS_INFO, p)) {
THROW_ERRNO("BTRFS_IOC_FS_INFO: fd " << fd);
}
}

string
BtrfsIoctlFsInfoArgs::uuid() const
uint16_t
BtrfsIoctlFsInfoArgs::csum_type() const
{
return uuid_unparse(fsid);
return this->btrfs_ioctl_fs_info_args_v2::csum_type;
}

uint16_t
BtrfsIoctlFsInfoArgs::csum_size() const
{
return this->btrfs_ioctl_fs_info_args_v2::csum_size;
}

};
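A sketch of reading the new checksum fields through these accessors — it assumes a kernel that honors BTRFS_FS_INFO_FLAG_CSUM_INFO and an open fd on a btrfs filesystem:

// Query checksum type and size via the v2 fs-info ioctl wrapper above.
#include "crucible/fs.h"
#include <iostream>

void print_csum_info(int fd) {
    crucible::BtrfsIoctlFsInfoArgs args;
    args.do_ioctl(fd);                      // throws on failure
    std::cout << "csum type " << args.csum_type()
              << " size " << args.csum_size() << std::endl;
}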
11 lib/ntoa.cc
@@ -1,8 +1,7 @@
#include "crucible/ntoa.h"

#include <cassert>
#include <sstream>
#include <string>
#include "crucible/error.h"
#include "crucible/string.h"

namespace crucible {
using namespace std;
@@ -12,7 +11,7 @@ namespace crucible {
string out;
while (n && table->a) {
// No bits in n outside of mask
assert( ((~table->mask) & table->n) == 0);
THROW_CHECK2(invalid_argument, table->mask, table->n, ((~table->mask) & table->n) == 0);
if ( (n & table->mask) == table->n) {
if (!out.empty()) {
out += "|";
@@ -23,12 +22,10 @@ namespace crucible {
++table;
}
if (n) {
ostringstream oss;
oss << "0x" << hex << n;
if (!out.empty()) {
out += "|";
}
out += oss.str();
out += to_hex(n);
}
if (out.empty()) {
out = "0";
@@ -2,6 +2,7 @@

#include "crucible/chatter.h"
#include "crucible/error.h"
#include "crucible/ntoa.h"

#include <cstdlib>
#include <utility>
@@ -110,9 +111,6 @@ namespace crucible {
}
}

template<>
struct ResourceHandle<Process::id, Process>;

pid_t
gettid()
{
@@ -152,4 +150,69 @@ namespace crucible {
return loadavg[2];
}

static const struct bits_ntoa_table signals_table[] = {

// POSIX.1-1990
NTOA_TABLE_ENTRY_ENUM(SIGHUP),
NTOA_TABLE_ENTRY_ENUM(SIGINT),
NTOA_TABLE_ENTRY_ENUM(SIGQUIT),
NTOA_TABLE_ENTRY_ENUM(SIGILL),
NTOA_TABLE_ENTRY_ENUM(SIGABRT),
NTOA_TABLE_ENTRY_ENUM(SIGFPE),
NTOA_TABLE_ENTRY_ENUM(SIGKILL),
NTOA_TABLE_ENTRY_ENUM(SIGSEGV),
NTOA_TABLE_ENTRY_ENUM(SIGPIPE),
NTOA_TABLE_ENTRY_ENUM(SIGALRM),
NTOA_TABLE_ENTRY_ENUM(SIGTERM),
NTOA_TABLE_ENTRY_ENUM(SIGUSR1),
NTOA_TABLE_ENTRY_ENUM(SIGUSR2),
NTOA_TABLE_ENTRY_ENUM(SIGCHLD),
NTOA_TABLE_ENTRY_ENUM(SIGCONT),
NTOA_TABLE_ENTRY_ENUM(SIGSTOP),
NTOA_TABLE_ENTRY_ENUM(SIGTSTP),
NTOA_TABLE_ENTRY_ENUM(SIGTTIN),
NTOA_TABLE_ENTRY_ENUM(SIGTTOU),

// SUSv2 and POSIX.1-2001
NTOA_TABLE_ENTRY_ENUM(SIGBUS),
NTOA_TABLE_ENTRY_ENUM(SIGPOLL),
NTOA_TABLE_ENTRY_ENUM(SIGPROF),
NTOA_TABLE_ENTRY_ENUM(SIGSYS),
NTOA_TABLE_ENTRY_ENUM(SIGTRAP),
NTOA_TABLE_ENTRY_ENUM(SIGURG),
NTOA_TABLE_ENTRY_ENUM(SIGVTALRM),
NTOA_TABLE_ENTRY_ENUM(SIGXCPU),
NTOA_TABLE_ENTRY_ENUM(SIGXFSZ),

// Other
NTOA_TABLE_ENTRY_ENUM(SIGIOT),
#ifdef SIGEMT
NTOA_TABLE_ENTRY_ENUM(SIGEMT),
#endif
NTOA_TABLE_ENTRY_ENUM(SIGSTKFLT),
NTOA_TABLE_ENTRY_ENUM(SIGIO),
#ifdef SIGCLD
NTOA_TABLE_ENTRY_ENUM(SIGCLD),
#endif
NTOA_TABLE_ENTRY_ENUM(SIGPWR),
#ifdef SIGINFO
NTOA_TABLE_ENTRY_ENUM(SIGINFO),
#endif
#ifdef SIGLOST
NTOA_TABLE_ENTRY_ENUM(SIGLOST),
#endif
NTOA_TABLE_ENTRY_ENUM(SIGWINCH),
#ifdef SIGUNUSED
NTOA_TABLE_ENTRY_ENUM(SIGUNUSED),
#endif

NTOA_TABLE_ENTRY_END(),
};

string
signal_ntoa(int sig)
{
return bits_ntoa(sig, signals_table);
}

}
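A sketch of using the new signal_ntoa helper, assuming it is declared in crucible/process.h; bits_ntoa renders matching table entries by name and falls back to hex for any leftover bits:

#include "crucible/process.h"
#include <csignal>
#include <iostream>

void report(int sig) {
    // e.g. report(SIGTERM) would print "got signal SIGTERM"
    std::cerr << "got signal " << crucible::signal_ntoa(sig) << std::endl;
}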
@@ -16,7 +16,7 @@ namespace crucible {
uint64_t
from_hex(const string &s)
{
return stoull(s, 0, 0);
return stoull(s, nullptr, 0);
}

vector<string>
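The nullptr change is behavior-neutral: stoull with base 0 still auto-detects the radix from the prefix. A quick check of those semantics:

// Decimal, 0x-hex, and leading-zero octal all parse with base 0.
#include <cassert>
#include <cstdint>
#include <string>

static uint64_t from_hex(const std::string &s) { return std::stoull(s, nullptr, 0); }

int main() {
    assert(from_hex("0x1000") == 4096);
    assert(from_hex("4096") == 4096);
    assert(from_hex("010") == 8);    // leading zero selects octal
    return 0;
}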
440 lib/task.cc
@@ -1,12 +1,10 @@
#include "crucible/task.h"

#include "crucible/cleanup.h"
#include "crucible/error.h"
#include "crucible/process.h"
#include "crucible/time.h"

#include <atomic>
#include <cmath>
#include <condition_variable>
#include <list>
#include <map>
@@ -14,42 +12,130 @@
#include <set>
#include <thread>

#include <cassert>
#include <cmath>

namespace crucible {
using namespace std;

static thread_local weak_ptr<TaskState> tl_current_task_wp;
class TaskState;
using TaskStatePtr = shared_ptr<TaskState>;
using TaskStateWeak = weak_ptr<TaskState>;

class TaskConsumer;
using TaskConsumerPtr = shared_ptr<TaskConsumer>;
using TaskConsumerWeak = weak_ptr<TaskConsumer>;

using TaskQueue = list<TaskStatePtr>;

static thread_local TaskStatePtr tl_current_task;

/// because we don't want to bump to -std=c++17 just to get scoped_lock
class PairLock {
unique_lock<mutex> m_lock1, m_lock2;
public:
PairLock(mutex &m1, mutex &m2) :
m_lock1(m1, defer_lock),
m_lock2(m2, defer_lock)
{
if (&m1 == &m2) {
m_lock1.lock();
} else {
lock(m_lock1, m_lock2);
}
}
};
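PairLock is a C++11 stand-in for what C++17 spells in one line, with the extra tolerance of being handed the same mutex twice (std::scoped_lock would deadlock in that case). The C++17 equivalent, for comparison only — the build stays on -std=c++11:

// Deadlock-free two-mutex acquisition, C++17 style.
#include <mutex>

void with_both(std::mutex &a, std::mutex &b) {
    std::scoped_lock lock(a, b);    // std::lock ordering plus RAII release
    // ... critical section ...
}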

class TaskState : public enable_shared_from_this<TaskState> {
mutex m_mutex;
const function<void()> m_exec_fn;
const string m_title;

/// Tasks to be executed after the current task is executed
list<TaskStatePtr> m_post_exec_queue;

/// Incremented by run() and append(). Decremented by exec().
size_t m_run_count = 0;

/// Set when task starts execution by exec().
/// Cleared when exec() ends.
bool m_is_running = false;

/// Sequential identifier for next task
static atomic<TaskId> s_next_id;

/// Number of TaskState instances in existence
static atomic<size_t> s_instance_count;

/// Identifier for this task
TaskId m_id;

static atomic<TaskId> s_next_id;
/// Backend for append()
void append_nolock(const TaskStatePtr &task);

/// Clear the post-execution queue. Recursively destroys post-exec
/// queues of all tasks in post-exec queue. Useful only when
/// cancelling the entire task queue.
void clear();

friend class TaskMasterState;
friend class TaskConsumer;

/// Clear any TaskQueue, not just this one.
static void clear_queue(TaskQueue &tq);

/// Rescue any TaskQueue, not just this one.
static void rescue_queue(TaskQueue &tq);

TaskState &operator=(const TaskState &) = delete;
TaskState(const TaskState &) = delete;

public:
~TaskState();
TaskState(string title, function<void()> exec_fn);

/// Run the task at most one more time. If task has
/// already started running, a new instance is scheduled.
/// If an instance is already scheduled by run() or
/// append(), does nothing. Otherwise, schedules a new
/// instance at the end of TaskMaster's global queue.
void run();

/// Execute task immediately in current thread if it is not already
/// executing in another thread; otherwise, append the current task
/// to itself to be executed immediately in the other thread.
void exec();

/// Return title of task.
string title() const;

/// Return ID of task.
TaskId id() const;

/// Queue task to execute after current task finishes executing
/// or is destroyed.
void append(const TaskStatePtr &task);

/// How many Tasks are there? Good for catching leaks
|
||||
static size_t instance_count();
|
||||
};
|
||||
|
||||
atomic<TaskId> TaskState::s_next_id;
|
||||
|
||||
class TaskConsumer;
|
||||
class TaskMasterState;
|
||||
atomic<size_t> TaskState::s_instance_count;
|
||||
|
||||
class TaskMasterState : public enable_shared_from_this<TaskMasterState> {
|
||||
mutex m_mutex;
|
||||
condition_variable m_condvar;
|
||||
list<shared_ptr<TaskState>> m_queue;
|
||||
TaskQueue m_queue;
|
||||
size_t m_thread_max;
|
||||
size_t m_thread_min = 0;
|
||||
set<shared_ptr<TaskConsumer>> m_threads;
|
||||
set<TaskConsumerPtr> m_threads;
|
||||
shared_ptr<thread> m_load_tracking_thread;
|
||||
double m_load_target = 0;
|
||||
double m_prev_loadavg;
|
||||
size_t m_configured_thread_max;
|
||||
double m_thread_target;
|
||||
bool m_cancelled = false;
|
||||
|
||||
friend class TaskConsumer;
|
||||
friend class TaskMaster;
|
||||
@@ -62,31 +148,70 @@ namespace crucible {
|
||||
size_t calculate_thread_count_nolock();
|
||||
void set_loadavg_target(double target);
|
||||
void loadavg_thread_fn();
|
||||
void cancel();
|
||||
|
||||
TaskMasterState &operator=(const TaskMasterState &) = delete;
|
||||
TaskMasterState(const TaskMasterState &) = delete;
|
||||
|
||||
public:
|
||||
~TaskMasterState();
|
||||
TaskMasterState(size_t thread_max = thread::hardware_concurrency());
|
||||
|
||||
static void push_back(shared_ptr<TaskState> task);
|
||||
static void push_front(shared_ptr<TaskState> task);
|
||||
static void push_back(const TaskStatePtr &task);
|
||||
static void push_front(TaskQueue &queue);
|
||||
size_t get_queue_count();
|
||||
size_t get_thread_count();
|
||||
};
|
||||
|
||||
class TaskConsumer : public enable_shared_from_this<TaskConsumer> {
|
||||
weak_ptr<TaskMasterState> m_master;
|
||||
thread m_thread;
|
||||
shared_ptr<TaskState> m_current_task;
|
||||
shared_ptr<TaskMasterState> m_master;
|
||||
TaskStatePtr m_current_task;
|
||||
|
||||
friend class TaskState;
|
||||
TaskQueue m_local_queue;
|
||||
|
||||
void consumer_thread();
|
||||
shared_ptr<TaskState> current_task_locked();
|
||||
public:
|
||||
TaskConsumer(weak_ptr<TaskMasterState> tms);
|
||||
shared_ptr<TaskState> current_task();
|
||||
friend class TaskMaster;
|
||||
friend class TaskMasterState;
|
||||
public:
|
||||
TaskConsumer(const shared_ptr<TaskMasterState> &tms);
|
||||
shared_ptr<TaskState> current_task();
|
||||
private:
|
||||
// Make sure this gets constructed _last_
|
||||
shared_ptr<thread> m_thread;
|
||||
};
|
||||
|
||||
static shared_ptr<TaskMasterState> s_tms = make_shared<TaskMasterState>();
|
||||
static thread_local TaskConsumerPtr tl_current_consumer;
|
||||
|
||||
static auto s_tms = make_shared<TaskMasterState>();
|
||||
|
||||
void
|
||||
TaskState::rescue_queue(TaskQueue &queue)
|
||||
{
|
||||
if (queue.empty()) {
|
||||
return;
|
||||
}
|
||||
auto tlcc = tl_current_consumer;
|
||||
if (tlcc) {
|
||||
// We are executing under a TaskConsumer, splice our post-exec queue at front.
|
||||
// No locks needed because we are using only thread-local objects.
|
||||
tlcc->m_local_queue.splice(tlcc->m_local_queue.begin(), queue);
|
||||
} else {
|
||||
// We are not executing under a TaskConsumer.
|
||||
// Create a new task to wrap our post-exec queue,
|
||||
// then push it to the front of the global queue using normal locking methods.
|
||||
TaskStatePtr rescue_task(make_shared<TaskState>("rescue_task", [](){}));
|
||||
swap(rescue_task->m_post_exec_queue, queue);
|
||||
TaskQueue tq_one { rescue_task };
|
||||
TaskMasterState::push_front(tq_one);
|
||||
}
|
||||
}
|
||||
|
||||
TaskState::~TaskState()
|
||||
{
|
||||
--s_instance_count;
|
||||
}
|
||||
|
||||
TaskState::TaskState(string title, function<void()> exec_fn) :
|
||||
m_exec_fn(exec_fn),
|
||||
@@ -94,6 +219,57 @@ namespace crucible {
|
||||
m_id(++s_next_id)
|
||||
{
|
||||
THROW_CHECK0(invalid_argument, !m_title.empty());
|
||||
++s_instance_count;
|
||||
}
|
||||
|
||||
size_t
|
||||
TaskState::instance_count()
|
||||
{
|
||||
return s_instance_count;
|
||||
}
|
||||
|
||||
size_t
|
||||
Task::instance_count()
|
||||
{
|
||||
return TaskState::instance_count();
|
||||
}
|
||||
|
||||
void
|
||||
TaskState::clear()
|
||||
{
|
||||
TaskQueue post_exec_queue;
|
||||
unique_lock<mutex> lock(m_mutex);
|
||||
swap(post_exec_queue, m_post_exec_queue);
|
||||
lock.unlock();
|
||||
clear_queue(post_exec_queue);
|
||||
}
|
||||
|
||||
void
|
||||
TaskState::clear_queue(TaskQueue &tq)
|
||||
{
|
||||
while (!tq.empty()) {
|
||||
auto i = *tq.begin();
|
||||
tq.pop_front();
|
||||
i->clear();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
TaskState::append_nolock(const TaskStatePtr &task)
|
||||
{
|
||||
THROW_CHECK0(invalid_argument, task);
|
||||
m_post_exec_queue.push_back(task);
|
||||
}
|
||||
|
||||
void
|
||||
TaskState::append(const TaskStatePtr &task)
|
||||
{
|
||||
THROW_CHECK0(invalid_argument, task);
|
||||
PairLock lock(m_mutex, task->m_mutex);
|
||||
if (!task->m_run_count) {
|
||||
++task->m_run_count;
|
||||
append_nolock(task);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
@@ -102,21 +278,35 @@ namespace crucible {
|
||||
THROW_CHECK0(invalid_argument, m_exec_fn);
|
||||
THROW_CHECK0(invalid_argument, !m_title.empty());
|
||||
|
||||
char buf[24];
|
||||
memset(buf, '\0', sizeof(buf));
|
||||
unique_lock<mutex> lock(m_mutex);
|
||||
if (m_is_running) {
|
||||
append_nolock(shared_from_this());
|
||||
return;
|
||||
} else {
|
||||
--m_run_count;
|
||||
m_is_running = true;
|
||||
}
|
||||
lock.unlock();
|
||||
|
||||
char buf[24] = { 0 };
|
||||
DIE_IF_MINUS_ERRNO(pthread_getname_np(pthread_self(), buf, sizeof(buf)));
|
||||
Cleanup pthread_name_cleaner([&]() {
|
||||
pthread_setname_np(pthread_self(), buf);
|
||||
});
|
||||
DIE_IF_MINUS_ERRNO(pthread_setname_np(pthread_self(), m_title.c_str()));
|
||||
|
||||
weak_ptr<TaskState> this_task_wp = shared_from_this();
|
||||
Cleanup current_task_cleaner([&]() {
|
||||
swap(this_task_wp, tl_current_task_wp);
|
||||
});
|
||||
swap(this_task_wp, tl_current_task_wp);
|
||||
TaskStatePtr this_task = shared_from_this();
|
||||
swap(this_task, tl_current_task);
|
||||
|
||||
m_exec_fn();
|
||||
catch_all([&]() {
|
||||
m_exec_fn();
|
||||
});
|
||||
|
||||
swap(this_task, tl_current_task);
|
||||
pthread_setname_np(pthread_self(), buf);
|
||||
|
||||
lock.lock();
|
||||
m_is_running = false;
|
||||
|
||||
// Splice task post_exec queue at front of local queue
|
||||
TaskState::rescue_queue(m_post_exec_queue);
|
||||
}
|
||||
|
||||
string
|
||||
@@ -132,6 +322,17 @@ namespace crucible {
|
||||
return m_id;
|
||||
}
|
||||
|
||||
void
|
||||
TaskState::run()
|
||||
{
|
||||
unique_lock<mutex> lock(m_mutex);
|
||||
if (m_run_count) {
|
||||
return;
|
||||
}
|
||||
++m_run_count;
|
||||
TaskMasterState::push_back(shared_from_this());
|
||||
}
|
||||
|
||||
TaskMasterState::TaskMasterState(size_t thread_max) :
|
||||
m_thread_max(thread_max),
|
||||
m_configured_thread_max(thread_max),
|
||||
@@ -161,21 +362,31 @@ namespace crucible {
|
||||
}
|
||||
|
||||
void
|
||||
TaskMasterState::push_back(shared_ptr<TaskState> task)
|
||||
TaskMasterState::push_back(const TaskStatePtr &task)
|
||||
{
|
||||
THROW_CHECK0(runtime_error, task);
|
||||
unique_lock<mutex> lock(s_tms->m_mutex);
|
||||
if (s_tms->m_cancelled) {
|
||||
task->clear();
|
||||
return;
|
||||
}
|
||||
s_tms->m_queue.push_back(task);
|
||||
s_tms->m_condvar.notify_all();
|
||||
s_tms->start_threads_nolock();
|
||||
}
|
||||
|
||||
void
|
||||
TaskMasterState::push_front(shared_ptr<TaskState> task)
|
||||
TaskMasterState::push_front(TaskQueue &queue)
|
||||
{
|
||||
THROW_CHECK0(runtime_error, task);
|
||||
if (queue.empty()) {
|
||||
return;
|
||||
}
|
||||
unique_lock<mutex> lock(s_tms->m_mutex);
|
||||
s_tms->m_queue.push_front(task);
|
||||
if (s_tms->m_cancelled) {
|
||||
TaskState::clear_queue(queue);
|
||||
return;
|
||||
}
|
||||
s_tms->m_queue.splice(s_tms->m_queue.begin(), queue);
|
||||
s_tms->m_condvar.notify_all();
|
||||
s_tms->start_threads_nolock();
|
||||
}
|
||||
@@ -192,6 +403,13 @@ namespace crucible {
|
||||
return s_tms->m_queue.size();
|
||||
}
|
||||
|
||||
size_t
|
||||
TaskMaster::get_thread_count()
|
||||
{
|
||||
unique_lock<mutex> lock(s_tms->m_mutex);
|
||||
return s_tms->m_threads.size();
|
||||
}
|
||||
|
||||
ostream &
|
||||
TaskMaster::print_queue(ostream &os)
|
||||
{
|
||||
@@ -226,6 +444,11 @@ namespace crucible {
|
||||
size_t
|
||||
TaskMasterState::calculate_thread_count_nolock()
|
||||
{
|
||||
if (m_cancelled) {
|
||||
// No threads running while cancelled
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (m_load_target == 0) {
|
||||
// No limits, no stats, use configured thread count
|
||||
return m_configured_thread_max;
|
||||
@@ -296,6 +519,12 @@ namespace crucible {
|
||||
TaskMasterState::set_thread_count(size_t thread_max)
|
||||
{
|
||||
unique_lock<mutex> lock(m_mutex);
|
||||
// XXX: someday we might want to uncancel, and this would be the place to do it;
|
||||
// however, when we cancel we destroy the entire Task queue, and that might be
|
||||
// non-trivial to recover from
|
||||
if (m_cancelled) {
|
||||
return;
|
||||
}
|
||||
m_configured_thread_max = thread_max;
|
||||
lock.unlock();
|
||||
adjust_thread_count();
|
||||
@@ -308,10 +537,32 @@ namespace crucible {
|
||||
s_tms->set_thread_count(thread_max);
|
||||
}
|
||||
|
||||
void
|
||||
TaskMasterState::cancel()
|
||||
{
|
||||
unique_lock<mutex> lock(m_mutex);
|
||||
m_cancelled = true;
|
||||
decltype(m_queue) empty_queue;
|
||||
m_queue.swap(empty_queue);
|
||||
m_condvar.notify_all();
|
||||
lock.unlock();
|
||||
TaskState::clear_queue(empty_queue);
|
||||
}
|
||||
|
||||
void
|
||||
TaskMaster::cancel()
|
||||
{
|
||||
s_tms->cancel();
|
||||
}
|
||||
|
||||
void
|
||||
TaskMasterState::set_thread_min_count(size_t thread_min)
|
||||
{
|
||||
unique_lock<mutex> lock(m_mutex);
|
||||
// XXX: someday we might want to uncancel, and this would be the place to do it
|
||||
if (m_cancelled) {
|
||||
return;
|
||||
}
|
||||
m_thread_min = thread_min;
|
||||
lock.unlock();
|
||||
adjust_thread_count();
|
||||
@@ -328,7 +579,7 @@ namespace crucible {
|
||||
TaskMasterState::loadavg_thread_fn()
|
||||
{
|
||||
pthread_setname_np(pthread_self(), "load_tracker");
|
||||
while (true) {
|
||||
while (!m_cancelled) {
|
||||
adjust_thread_count();
|
||||
nanosleep(5.0);
|
||||
}
|
||||
@@ -340,6 +591,9 @@ namespace crucible {
|
||||
THROW_CHECK1(out_of_range, target, target >= 0);
|
||||
|
||||
unique_lock<mutex> lock(m_mutex);
|
||||
if (m_cancelled) {
|
||||
return;
|
||||
}
|
||||
m_load_target = target;
|
||||
m_prev_loadavg = getloadavg1();
|
||||
|
||||
@@ -375,20 +629,21 @@ namespace crucible {
|
||||
Task::run() const
|
||||
{
|
||||
THROW_CHECK0(runtime_error, m_task_state);
|
||||
TaskMasterState::push_back(m_task_state);
|
||||
m_task_state->run();
|
||||
}
|
||||
|
||||
void
|
||||
Task::run_earlier() const
|
||||
Task::append(const Task &that) const
|
||||
{
|
||||
THROW_CHECK0(runtime_error, m_task_state);
|
||||
TaskMasterState::push_front(m_task_state);
|
||||
THROW_CHECK0(runtime_error, that);
|
||||
m_task_state->append(that.m_task_state);
|
||||
}
|
||||
|
||||
Task
|
||||
Task::current_task()
|
||||
{
|
||||
return Task(tl_current_task_wp.lock());
|
||||
return Task(tl_current_task);
|
||||
}
|
||||
|
||||
string
|
||||
@@ -431,46 +686,88 @@ namespace crucible {
|
||||
shared_ptr<TaskState>
|
||||
TaskConsumer::current_task()
|
||||
{
|
||||
auto master_locked = m_master.lock();
|
||||
unique_lock<mutex> lock(master_locked->m_mutex);
|
||||
unique_lock<mutex> lock(m_master->m_mutex);
|
||||
return current_task_locked();
|
||||
}
|
||||
|
||||
void
|
||||
TaskConsumer::consumer_thread()
|
||||
{
|
||||
auto master_locked = m_master.lock();
|
||||
while (true) {
|
||||
unique_lock<mutex> lock(master_locked->m_mutex);
|
||||
if (master_locked->m_thread_max < master_locked->m_threads.size()) {
|
||||
// Keep a copy because we will be destroying *this later
|
||||
auto master_copy = m_master;
|
||||
|
||||
// Constructor is running with master locked.
|
||||
// Wait until that is done before trying to do anything.
|
||||
unique_lock<mutex> lock(master_copy->m_mutex);
|
||||
|
||||
// Detach thread so destructor doesn't call terminate
|
||||
m_thread->detach();
|
||||
|
||||
// Set thread name so it isn't empty or the name of some other thread
|
||||
DIE_IF_MINUS_ERRNO(pthread_setname_np(pthread_self(), "task_consumer"));
|
||||
|
||||
// It is now safe to access our own shared_ptr
|
||||
TaskConsumerPtr this_consumer = shared_from_this();
|
||||
swap(this_consumer, tl_current_consumer);
|
||||
|
||||
while (!master_copy->m_cancelled) {
|
||||
if (master_copy->m_thread_max < master_copy->m_threads.size()) {
|
||||
// We are one of too many threads, exit now
|
||||
break;
|
||||
}
|
||||
|
||||
if (master_locked->m_queue.empty()) {
|
||||
master_locked->m_condvar.wait(lock);
|
||||
if (!m_local_queue.empty()) {
|
||||
m_current_task = *m_local_queue.begin();
|
||||
m_local_queue.pop_front();
|
||||
} else if (!master_copy->m_queue.empty()) {
|
||||
m_current_task = *master_copy->m_queue.begin();
|
||||
master_copy->m_queue.pop_front();
|
||||
} else {
|
||||
master_copy->m_condvar.wait(lock);
|
||||
continue;
|
||||
}
|
||||
|
||||
m_current_task = *master_locked->m_queue.begin();
|
||||
master_locked->m_queue.pop_front();
|
||||
// Execute task without lock
|
||||
lock.unlock();
|
||||
catch_all([&]() {
|
||||
m_current_task->exec();
|
||||
});
|
||||
|
||||
// Update m_current_task with lock
|
||||
TaskStatePtr hold_task;
|
||||
lock.lock();
|
||||
swap(hold_task, m_current_task);
|
||||
|
||||
// Destroy hold_task without lock
|
||||
lock.unlock();
|
||||
hold_task.reset();
|
||||
|
||||
// Invariant: lock held
|
||||
lock.lock();
|
||||
m_current_task.reset();
|
||||
}
|
||||
|
||||
unique_lock<mutex> lock(master_locked->m_mutex);
|
||||
m_thread.detach();
|
||||
master_locked->m_threads.erase(shared_from_this());
|
||||
master_locked->m_condvar.notify_all();
|
||||
// There is no longer a current consumer, but hold our own shared
|
||||
// state so it's still there in the destructor
|
||||
swap(this_consumer, tl_current_consumer);
|
||||
|
||||
// Release lock to rescue queue (may attempt to queue a new task at TaskMaster).
|
||||
// rescue_queue normally sends tasks to the local queue of the current TaskConsumer thread,
|
||||
// but we just disconnected ourselves from that.
|
||||
lock.unlock();
|
||||
TaskState::rescue_queue(m_local_queue);
|
||||
|
||||
// Hold lock so we can erase ourselves
|
||||
lock.lock();
|
||||
|
||||
// Fun fact: shared_from_this() isn't usable until the constructor returns...
|
||||
master_copy->m_threads.erase(shared_from_this());
|
||||
master_copy->m_condvar.notify_all();
|
||||
}
|
||||
|
||||
TaskConsumer::TaskConsumer(weak_ptr<TaskMasterState> tms) :
|
||||
m_master(tms),
|
||||
m_thread([=](){ consumer_thread(); })
|
||||
TaskConsumer::TaskConsumer(const shared_ptr<TaskMasterState> &tms) :
|
||||
m_master(tms)
|
||||
{
|
||||
m_thread = make_shared<thread>([=](){ consumer_thread(); });
|
||||
}
|
||||
|
||||
class BarrierState {
@@ -541,9 +838,10 @@ namespace crucible {
class ExclusionState {
mutex m_mutex;
bool m_locked = false;
set<Task> m_tasks;
Task m_task;

public:
ExclusionState(const string &title);
~ExclusionState();
void release();
bool try_lock();
@@ -555,8 +853,13 @@ namespace crucible {
{
}

Exclusion::Exclusion() :
m_exclusion_state(make_shared<ExclusionState>())
Exclusion::Exclusion(const string &title) :
m_exclusion_state(make_shared<ExclusionState>(title))
{
}

ExclusionState::ExclusionState(const string &title) :
m_task(title, [](){})
{
}

@@ -565,16 +868,7 @@ namespace crucible {
{
unique_lock<mutex> lock(m_mutex);
m_locked = false;
bool first = true;
for (auto i : m_tasks) {
if (first) {
i.run_earlier();
first = false;
} else {
i.run();
}
}
m_tasks.clear();
m_task.run();
}

ExclusionState::~ExclusionState()
@@ -605,7 +899,13 @@ namespace crucible {
ExclusionState::insert_task(Task task)
{
unique_lock<mutex> lock(m_mutex);
m_tasks.insert(task);
if (m_locked) {
// If Exclusion is locked then queue task for release;
m_task.append(task);
} else {
// otherwise, run the inserted task immediately
task.run();
}
}

bool
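The insert_task() logic above either defers a task while the exclusion is held or runs it immediately, and release() hands the deferred work off in one shot. A reduced stand-in that shows the same release-time handoff (MiniExclusion and its members are invented for illustration; the real class integrates with Task, append() and run_earlier()):

	#include <functional>
	#include <mutex>
	#include <utility>
	#include <vector>

	class MiniExclusion {
		std::mutex m_mutex;
		bool m_locked = false;
		std::vector<std::function<void()>> m_deferred;
	public:
		bool try_lock() {
			std::unique_lock<std::mutex> lock(m_mutex);
			if (m_locked) return false;
			return m_locked = true;
		}
		void insert_task(std::function<void()> task) {
			std::unique_lock<std::mutex> lock(m_mutex);
			if (m_locked) {
				m_deferred.push_back(std::move(task));  // queue for release
			} else {
				lock.unlock();
				task();  // run the inserted task immediately
			}
		}
		void release() {
			std::vector<std::function<void()>> ready;
			std::unique_lock<std::mutex> lock(m_mutex);
			m_locked = false;
			ready.swap(m_deferred);
			lock.unlock();
			for (auto &t : ready) t();  // run deferred tasks outside the lock
		}
	};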
lib/time.cc (31 changes)
@@ -116,23 +116,26 @@ namespace crucible {
}
}

double
RateLimiter::sleep_time(double cost)
{
borrow(cost);
unique_lock<mutex> lock(m_mutex);
update_tokens();
if (m_tokens >= 0) {
return 0;
}
return -m_tokens / m_rate;
}

void
RateLimiter::sleep_for(double cost)
{
borrow(cost);
while (1) {
unique_lock<mutex> lock(m_mutex);
update_tokens();
if (m_tokens >= 0) {
return;
}
double sleep_time(-m_tokens / m_rate);
lock.unlock();
if (sleep_time > 0.0) {
nanosleep(sleep_time);
} else {
return;
}
double time_to_sleep = sleep_time(cost);
if (time_to_sleep > 0.0) {
nanosleep(time_to_sleep);
} else {
return;
}
}
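sleep_time() above borrows tokens first and then reports how long the caller must wait for the balance to become non-negative; sleep_for() is just that plus the nap. A self-contained token-bucket sketch of the same split (MiniRateLimiter is an invented stand-in for the crucible class; capping the burst at one second of tokens is an assumption):

	#include <chrono>
	#include <mutex>
	#include <thread>

	class MiniRateLimiter {
		std::mutex m_mutex;
		double m_rate;        // tokens refilled per second
		double m_tokens = 0;  // may go negative after borrowing
		std::chrono::steady_clock::time_point m_last = std::chrono::steady_clock::now();
		void update_tokens() {
			auto now = std::chrono::steady_clock::now();
			m_tokens += m_rate * std::chrono::duration<double>(now - m_last).count();
			if (m_tokens > m_rate) m_tokens = m_rate;  // assumed burst cap
			m_last = now;
		}
	public:
		explicit MiniRateLimiter(double rate) : m_rate(rate) {}
		double sleep_time(double cost) {
			std::unique_lock<std::mutex> lock(m_mutex);
			update_tokens();
			m_tokens -= cost;  // borrow
			return m_tokens >= 0 ? 0 : -m_tokens / m_rate;
		}
		void sleep_for(double cost) {
			double t = sleep_time(cost);
			if (t > 0) std::this_thread::sleep_for(std::chrono::duration<double>(t));
		}
	};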
lib/uuid.cc (16 changes)
@@ -1,16 +0,0 @@
#include "crucible/uuid.h"

namespace crucible {
using namespace std;

const size_t uuid_unparsed_size = 37; // "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx\0"

string
uuid_unparse(const unsigned char in[16])
{
char out[uuid_unparsed_size];
::uuid_unparse(in, out);
return string(out);
}

}
makeflags (12 changes)
@@ -1,11 +1,13 @@
# Default:
CCFLAGS = -Wall -Wextra -Werror -I../include -fpic -D_FILE_OFFSET_BITS=64
CCFLAGS = -Wall -Wextra -Werror -O3

# Optimized:
# CCFLAGS = -Wall -Wextra -Werror -O3 -march=native -I../include -fpic -D_FILE_OFFSET_BITS=64
# CCFLAGS = -Wall -Wextra -Werror -O3 -march=native

# Debug:
# CCFLAGS = -Wall -Wextra -Werror -O0 -I../include -ggdb -fpic -D_FILE_OFFSET_BITS=64
# CCFLAGS = -Wall -Wextra -Werror -O0 -ggdb

CFLAGS += $(CCFLAGS) -std=c99
CXXFLAGS += $(CCFLAGS) -std=c++11 -Wold-style-cast
CCFLAGS += -I../include -D_FILE_OFFSET_BITS=64

BEES_CFLAGS = $(CCFLAGS) -std=c99 $(CFLAGS)
BEES_CXXFLAGS = $(CCFLAGS) -std=c++11 -Wold-style-cast $(CXXFLAGS)
@@ -23,12 +23,12 @@ UUID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
# Hash table entries are 16 bytes each
# (64-bit hash, 52-bit block number, and some metadata bits)
# Each entry represents a minimum of 4K on disk.
# unique data size    hash table size    average dedup block size
# unique data size    hash table size    average dedupe block size
#     1TB                  4GB                  4K
#     1TB                  1GB                  16K
#     1TB                  256MB                64K
#     1TB                  16MB                 1024K
#    64TB                  1GB                  1024K
#
# Size MUST be power of 16M
# DB_SIZE=$((64*$AL16M)) # 1G in bytes
# Size MUST be multiple of 128KB
# DB_SIZE=$((1024*1024*1024)) # 1G in bytes
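The sizing table above follows directly from 16-byte entries that each cover one 4K block; this small check program reproduces its rows:

	#include <cstdint>
	#include <initializer_list>
	#include <iostream>

	// average dedupe block size = unique_data_size / (hash_table_size / 16)
	int main() {
		const uint64_t tib = 1ULL << 40;      // 1 TiB of unique data
		const uint64_t entry_bytes = 16;      // one hash table entry
		for (uint64_t table : {4ULL << 30, 1ULL << 30, 256ULL << 20, 16ULL << 20}) {
			uint64_t entries = table / entry_bytes;
			std::cout << "table " << (table >> 20) << " MiB -> avg block "
			          << ((tib / entries) >> 10) << " KiB per 1 TiB unique data\n";
		}
	}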
@@ -9,8 +9,9 @@ YN(){ [[ "$1" =~ (1|Y|y) ]]; }
export BEESHOME BEESSTATUS
export WORK_DIR CONFIG_DIR
export CONFIG_FILE
export UUID AL16M
export UUID AL16M AL128K

readonly AL128K="$((128*1024))"
readonly AL16M="$((16*1024*1024))"
readonly CONFIG_DIR=@ETC_PREFIX@/bees/

@@ -89,7 +90,7 @@ WORK_DIR="${WORK_DIR:-/run/bees/}"
MNT_DIR="${MNT_DIR:-$WORK_DIR/mnt/$UUID}"
BEESHOME="${BEESHOME:-$MNT_DIR/.beeshome}"
BEESSTATUS="${BEESSTATUS:-$WORK_DIR/$UUID.status}"
DB_SIZE="${DB_SIZE:-$((64*AL16M))}"
DB_SIZE="${DB_SIZE:-$((8192*AL128K))}"

INFO "Check: Disk exists"
if [ ! -b "/dev/disk/by-uuid/$UUID" ]; then
@@ -109,11 +110,7 @@ mkdir -p "$WORK_DIR" || exit 1
INFO "MOUNT DIR: $MNT_DIR"
mkdir -p "$MNT_DIR" || exit 1

umount_w(){ mountpoint -q "$1" && umount -l "$1"; }
force_umount(){ umount_w "$MNT_DIR"; }
trap force_umount SIGINT SIGTERM EXIT

mount -osubvolid=5 /dev/disk/by-uuid/$UUID "$MNT_DIR" || exit 1
mount --make-private -osubvolid=5 /dev/disk/by-uuid/$UUID "$MNT_DIR" || exit 1

if [ ! -d "$BEESHOME" ]; then
INFO "Create subvol $BEESHOME to store bees data"
@@ -128,8 +125,8 @@ fi
touch "$DB_PATH"
OLD_SIZE="$(du -b "$DB_PATH" | sed 's/\t/ /g' | cut -d' ' -f1)"
NEW_SIZE="$DB_SIZE"
if (( "$NEW_SIZE"%AL16M > 0 )); then
ERRO "DB_SIZE must be a multiple of 16M"
if (( "$NEW_SIZE"%AL128K > 0 )); then
ERRO "DB_SIZE must be a multiple of 128K"
fi
if (( "$OLD_SIZE" != "$NEW_SIZE" )); then
INFO "Resize db: $OLD_SIZE -> $NEW_SIZE"
@@ -142,4 +139,4 @@ fi
MNT_DIR="$(realpath $MNT_DIR)"

cd "$MNT_DIR"
"$bees_bin" "${ARGUMENTS[@]}" $OPTIONS "$MNT_DIR"
exec "$bees_bin" "${ARGUMENTS[@]}" $OPTIONS "$MNT_DIR"
@@ -6,20 +6,53 @@ After=sysinit.target
[Service]
Type=simple
ExecStart=@PREFIX@/sbin/beesd --no-timestamps %i
Nice=19
KillMode=control-group
KillSignal=SIGTERM
CPUShares=128
StartupCPUShares=256
BlockIOWeight=100
StartupBlockIOWeight=250
CPUAccounting=true
CPUSchedulingPolicy=batch
CPUWeight=12
IOSchedulingClass=idle
IOSchedulingPriority=7
CPUSchedulingPolicy=batch
IOWeight=10
KillMode=control-group
KillSignal=SIGTERM
MemoryAccounting=true
Nice=19
Restart=on-abnormal
CPUAccounting=true
MemoryAccounting=true
StartupCPUWeight=25
StartupIOWeight=25

# Hide other users' processes in /proc/
ProtectProc=invisible

# Mount / as read-only
ProtectSystem=strict

# Forbid access to /home, /root and /run/user
ProtectHome=true

# Mount tmpfs on /tmp/ and /var/tmp/.
# Cannot mount at /run/ or /var/run/ because they are used by systemd.
PrivateTmp=true

# Disable network access
PrivateNetwork=true

# Use private IPC namespace and UTS namespace
PrivateIPC=true
ProtectHostname=true

# Disable write access to kernel variables through /proc
ProtectKernelTunables=true

# Disable access to control groups
ProtectControlGroups=true

# Set capabilities of the new program
# The first three are required for accessing any file on the mounted filesystem.
# The last one is required for mounting the filesystem.
AmbientCapabilities=CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH CAP_FOWNER CAP_SYS_ADMIN

# With NoNewPrivileges, running sudo cannot gain any new privileges
NoNewPrivileges=true

[Install]
WantedBy=basic.target
src/.gitignore (2 changes)
@@ -1 +1,3 @@
*.new.c
bees-usage.c
bees-version.[ch]
src/Makefile (46 changes)
@@ -1,14 +1,17 @@
BEES = ../bin/bees
PROGRAMS = \
../bin/bees \
../bin/fiemap \
../bin/fiewalk \

all: $(PROGRAMS)
PROGRAM_OBJS = $(foreach b,$(PROGRAMS),$(patsubst ../bin/%,%.o,$(b)))

all: $(BEES) $(PROGRAMS)

include ../makeflags
-include ../localconf

LIBS = -lcrucible -lpthread
LDFLAGS = -L../lib
BEES_LDFLAGS = -L../lib $(LDFLAGS)

BEES_OBJS = \
bees.o \
@@ -17,31 +20,44 @@ BEES_OBJS = \
bees-resolve.o \
bees-roots.o \
bees-thread.o \
bees-trace.o \
bees-types.o \

ALL_OBJS = $(BEES_OBJS) $(PROGRAM_OBJS)

bees-version.c: bees.h $(BEES_OBJS:.o=.cc) Makefile
echo "const char *BEES_VERSION = \"$(shell git describe --always --dirty || echo UNKNOWN)\";" > bees-version.new.c
echo "const char *BEES_VERSION = \"$(BEES_VERSION)\";" > bees-version.new.c
mv -f bees-version.new.c bees-version.c

.depends/%.dep: %.cc Makefile
@mkdir -p .depends
$(CXX) $(CXXFLAGS) -M -MF $@ -MT $(<:.cc=.o) $<
bees-usage.c: bees-usage.txt Makefile
(echo 'const char *BEES_USAGE = '; sed -r 's/^(.*)$$/"\1\\n"/' < bees-usage.txt; echo ';') > bees-usage.new.c
mv -f bees-usage.new.c bees-usage.c

depends.mk: $(BEES_OBJS:%.o=.depends/%.dep)
.depends:
mkdir -p $@

.depends/%.dep: %.cc Makefile | .depends
$(CXX) $(BEES_CXXFLAGS) -M -MF $@ -MT $(<:.cc=.o) $<

depends.mk: $(ALL_OBJS:%.o=.depends/%.dep)
cat $^ > $@.new
mv -f $@.new $@

include depends.mk

%.o: %.cc %.h
$(CXX) $(CXXFLAGS) -o $@ -c $<
%.o: %.c ../makeflags
$(CC) $(BEES_CFLAGS) -o $@ -c $<

../bin/%: %.o
@echo Implicit bin rule "$<" '->' "$@"
$(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $< $(LIBS)
%.o: %.cc ../makeflags
$(CXX) $(BEES_CXXFLAGS) -o $@ -c $<

../bin/bees: $(BEES_OBJS) bees-version.o
$(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS)
$(PROGRAMS): ../bin/%: %.o
$(CXX) $(BEES_CXXFLAGS) $(BEES_LDFLAGS) -o $@ $< $(LIBS)

$(PROGRAMS): ../lib/libcrucible.a

$(BEES): $(BEES_OBJS) bees-version.o bees-usage.o ../lib/libcrucible.a
$(CXX) $(BEES_CXXFLAGS) $(BEES_LDFLAGS) -o $@ $^ $(LIBS)

clean:
rm -fv *.o bees-version.c
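For reference, the bees-usage.c rule above wraps every line of bees-usage.txt in a quoted C string; given the sed command shown, the generated file comes out shaped roughly like this (the usage text itself is invented here for illustration):

	const char *BEES_USAGE =
	"Usage: bees [options] fs-root-path\n"
	"(one quoted line per line of bees-usage.txt)\n"
	;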
src/bees-context.cc

@@ -1,5 +1,6 @@
#include "bees.h"

#include "crucible/cleanup.h"
#include "crucible/limits.h"
#include "crucible/string.h"
#include "crucible/task.h"
@@ -8,32 +9,31 @@
#include <iostream>
#include <vector>

// round
#include <cmath>
// struct rusage
#include <sys/resource.h>

// struct sigset
#include <signal.h>

using namespace crucible;
using namespace std;

static inline
const char *
getenv_or_die(const char *name)
{
const char *rv = getenv(name);
if (!rv) {
THROW_ERROR(runtime_error, "Environment variable " << name << " not defined");
}
return rv;
}

BeesFdCache::BeesFdCache()
BeesFdCache::BeesFdCache(shared_ptr<BeesContext> ctx) :
m_ctx(ctx)
{
m_root_cache.func([&](shared_ptr<BeesContext> ctx, uint64_t root) -> Fd {
m_root_cache.func([&](uint64_t root) -> Fd {
Timer open_timer;
auto rv = ctx->roots()->open_root_nocache(root);
auto rv = m_ctx->roots()->open_root_nocache(root);
BEESCOUNTADD(open_root_ms, open_timer.age() * 1000);
return rv;
});
m_root_cache.max_size(BEES_ROOT_FD_CACHE_SIZE);
m_file_cache.func([&](shared_ptr<BeesContext> ctx, uint64_t root, uint64_t ino) -> Fd {
m_file_cache.func([&](uint64_t root, uint64_t ino) -> Fd {
Timer open_timer;
auto rv = ctx->roots()->open_root_ino_nocache(root, ino);
auto rv = m_ctx->roots()->open_root_ino_nocache(root, ino);
BEESCOUNTADD(open_ino_ms, open_timer.age() * 1000);
return rv;
});
@@ -44,30 +44,25 @@ void
BeesFdCache::clear()
{
BEESNOTE("Clearing root FD cache to enable subvol delete");
BEESLOGDEBUG("Clearing root FD cache to enable subvol delete");
m_root_cache.clear();
BEESCOUNT(root_clear);
BEESLOGDEBUG("Clearing open FD cache to enable file delete");
BEESNOTE("Clearing open FD cache to enable file delete");
m_file_cache.clear();
BEESCOUNT(open_clear);
}

Fd
BeesFdCache::open_root(shared_ptr<BeesContext> ctx, uint64_t root)
BeesFdCache::open_root(uint64_t root)
{
return m_root_cache(ctx, root);
return m_root_cache(root);
}

Fd
BeesFdCache::open_root_ino(shared_ptr<BeesContext> ctx, uint64_t root, uint64_t ino)
BeesFdCache::open_root_ino(uint64_t root, uint64_t ino)
{
return m_file_cache(ctx, root, ino);
}

void
BeesFdCache::insert_root_ino(shared_ptr<BeesContext> ctx, Fd fd)
{
BeesFileId fid(fd);
return m_file_cache.insert(fd, ctx, fid.root(), fid.ino());
return m_file_cache(root, ino);
}

void
@@ -77,21 +72,19 @@ BeesContext::dump_status()
if (!status_charp) return;
string status_file(status_charp);
BEESLOGINFO("Writing status to file '" << status_file << "' every " << BEES_STATUS_INTERVAL << " sec");
while (1) {
BEESNOTE("waiting " << BEES_STATUS_INTERVAL);
sleep(BEES_STATUS_INTERVAL);

Timer total_timer;
while (!m_stop_status) {
BEESNOTE("writing status to file '" << status_file << "'");
ofstream ofs(status_file + ".tmp");

auto thisStats = BeesStats::s_global;
ofs << "TOTAL:\n";
ofs << "\t" << thisStats << "\n";
auto avg_rates = thisStats / m_total_timer.age();
auto avg_rates = thisStats / total_timer.age();
ofs << "RATES:\n";
ofs << "\t" << avg_rates << "\n";

ofs << "THREADS (work queue " << TaskMaster::get_queue_count() << " tasks):\n";
ofs << "THREADS (work queue " << TaskMaster::get_queue_count() << " of " << Task::instance_count() << " tasks, " << TaskMaster::get_thread_count() << " workers):\n";
for (auto t : BeesNote::get_status()) {
ofs << "\ttid " << t.first << ": " << t.second << "\n";
}
@@ -108,48 +101,74 @@ BeesContext::dump_status()

BEESNOTE("renaming status file '" << status_file << "'");
rename((status_file + ".tmp").c_str(), status_file.c_str());

BEESNOTE("idle " << BEES_STATUS_INTERVAL);
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_status) {
return;
}
m_stop_condvar.wait_for(lock, chrono::duration<double>(BEES_STATUS_INTERVAL));
}
}

void
BeesContext::show_progress()
{
auto lastProgressStats = BeesStats::s_global;
auto lastStats = lastProgressStats;
auto lastStats = BeesStats::s_global;
Timer stats_timer;
while (1) {
sleep(BEES_PROGRESS_INTERVAL);
Timer all_timer;
while (!stop_requested()) {
BEESNOTE("idle " << BEES_PROGRESS_INTERVAL);

if (stats_timer.age() > BEES_STATS_INTERVAL) {
stats_timer.lap();

auto thisStats = BeesStats::s_global;
auto avg_rates = lastStats / BEES_STATS_INTERVAL;
BEESLOGINFO("TOTAL: " << thisStats);
BEESLOGINFO("RATES: " << avg_rates);
lastStats = thisStats;
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_requested) {
return;
}
m_stop_condvar.wait_for(lock, chrono::duration<double>(BEES_PROGRESS_INTERVAL));

BEESLOGINFO("ACTIVITY:");

// Snapshot stats and timer state
auto thisStats = BeesStats::s_global;
auto deltaStats = thisStats - lastProgressStats;
if (deltaStats) {
BEESLOGINFO("\t" << deltaStats / BEES_PROGRESS_INTERVAL);
};
lastProgressStats = thisStats;
auto stats_age = stats_timer.age();
auto all_age = all_timer.age();
stats_timer.lap();

BEESNOTE("logging event counter totals for last " << all_timer);
BEESLOGINFO("TOTAL COUNTS (" << all_age << "s):\n\t" << thisStats);

BEESNOTE("logging event counter rates for last " << all_timer);
auto avg_rates = thisStats / all_age;
BEESLOGINFO("TOTAL RATES (" << all_age << "s):\n\t" << avg_rates);

BEESNOTE("logging event counter delta counts for last " << stats_age);
BEESLOGINFO("DELTA COUNTS (" << stats_age << "s):");

auto deltaStats = thisStats - lastStats;
BEESLOGINFO("\t" << deltaStats / stats_age);

BEESNOTE("logging event counter delta rates for last " << stats_age);
BEESLOGINFO("DELTA RATES (" << stats_age << "s):");

auto deltaRates = deltaStats / stats_age;
BEESLOGINFO("\t" << deltaRates);

BEESNOTE("logging current thread status");
BEESLOGINFO("THREADS:");

for (auto t : BeesNote::get_status()) {
BEESLOGINFO("\ttid " << t.first << ": " << t.second);
}

lastStats = thisStats;
}
}

Fd
BeesContext::home_fd()
{
if (!!m_home_fd) {
return m_home_fd;
}

const char *base_dir = getenv("BEESHOME");
if (!base_dir) {
base_dir = ".beeshome";
@@ -161,12 +180,10 @@ BeesContext::home_fd()
return m_home_fd;
}

BeesContext::BeesContext(shared_ptr<BeesContext> parent) :
m_parent_ctx(parent)
bool
BeesContext::is_root_ro(uint64_t root)
{
if (m_parent_ctx) {
m_fd_cache = m_parent_ctx->fd_cache();
}
return roots()->is_root_ro(root);
}

bool
@@ -175,9 +192,16 @@ BeesContext::dedup(const BeesRangePair &brp)
// TOOLONG and NOTE can retroactively fill in the filename details, but LOG can't
BEESNOTE("dedup " << brp);

brp.first.fd(shared_from_this());
brp.second.fd(shared_from_this());

if (is_root_ro(brp.second.fid().root())) {
// BEESLOGDEBUG("WORKAROUND: dst root is read-only in " << name_fd(brp.second.fd()));
BEESCOUNT(dedup_workaround_btrfs_send);
return false;
}

brp.first.fd(shared_from_this());

BEESTOOLONG("dedup " << brp);

BeesAddress first_addr(brp.first.fd(), brp.first.begin());
@@ -202,11 +226,6 @@ BeesContext::dedup(const BeesRangePair &brp)
if (rv) {
BEESCOUNT(dedup_hit);
BEESCOUNTADD(dedup_bytes, brp.first.size());
thread_local BeesFileRange last_src_bfr;
if (!last_src_bfr.overlaps(brp.first)) {
BEESCOUNTADD(dedup_unique_bytes, brp.first.size());
last_src_bfr = brp.first;
}
} else {
BEESCOUNT(dedup_miss);
BEESLOGWARN("NO Dedup! " << brp);
@@ -216,11 +235,11 @@ BeesContext::dedup(const BeesRangePair &brp)
}

BeesRangePair
BeesContext::dup_extent(const BeesFileRange &src)
BeesContext::dup_extent(const BeesFileRange &src, const shared_ptr<BeesTempFile> &tmpfile)
{
BEESTRACE("dup_extent " << src);
BEESCOUNTADD(dedup_copy, src.size());
return BeesRangePair(tmpfile()->make_copy(src), src);
return BeesRangePair(tmpfile->make_copy(src), src);
}

void
@@ -228,7 +247,8 @@ BeesContext::rewrite_file_range(const BeesFileRange &bfr)
{
auto m_ctx = shared_from_this();
BEESNOTE("Rewriting bfr " << bfr);
BeesRangePair dup_brp(dup_extent(BeesFileRange(bfr.fd(), bfr.begin(), min(bfr.file_size(), bfr.end()))));
auto rewrite_tmpfile = tmpfile();
BeesRangePair dup_brp(dup_extent(BeesFileRange(bfr.fd(), bfr.begin(), min(bfr.file_size(), bfr.end())), rewrite_tmpfile));
// BEESLOG("\tdup_brp " << dup_brp);
BeesBlockData orig_bbd(bfr.fd(), bfr.begin(), min(BLOCK_SIZE_SUMS, bfr.size()));
// BEESLOG("\torig_bbd " << orig_bbd);
@@ -297,23 +317,33 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
if (e.flags() & Extent::PREALLOC) {
// Prealloc is all zero and we replace it with a hole.
// No special handling is required here. Nuke it and move on.
BEESLOGINFO("prealloc extent " << e);
// Must not extend past EOF
auto extent_size = min(e.end(), bfr.file_size()) - e.begin();
BeesFileRange prealloc_bfr(m_ctx->tmpfile()->make_hole(extent_size));
BeesRangePair brp(prealloc_bfr, bfr);
// Raw dedup here - nothing else to do with this extent, nothing to merge with
if (m_ctx->dedup(brp)) {
BEESCOUNT(dedup_prealloc_hit);
BEESCOUNTADD(dedup_prealloc_bytes, e.size());
return bfr;
} else {
BEESCOUNT(dedup_prealloc_miss);
}
Task(
"dedup_prealloc",
[m_ctx, bfr, e]() {
BEESLOGINFO("prealloc extent " << e);
// Must not extend past EOF
auto extent_size = min(e.end(), bfr.file_size()) - e.begin();
// Must hold tmpfile until dedupe is done
auto tmpfile = m_ctx->tmpfile();
BeesFileRange prealloc_bfr(tmpfile->make_hole(extent_size));
// Apparently they can both extend past EOF
BeesFileRange copy_bfr(bfr.fd(), e.begin(), e.begin() + extent_size);
BeesRangePair brp(prealloc_bfr, copy_bfr);
// Raw dedupe here - nothing else to do with this extent, nothing to merge with
if (m_ctx->dedup(brp)) {
BEESCOUNT(dedup_prealloc_hit);
BEESCOUNTADD(dedup_prealloc_bytes, e.size());
// return bfr;
} else {
BEESCOUNT(dedup_prealloc_miss);
}
}
).run();
return bfr; // if dedupe success, which we now blindly assume
}

// OK we need to read extent now
readahead(bfr.fd(), bfr.begin(), bfr.size());
bees_readahead(bfr.fd(), bfr.begin(), bfr.size());

map<off_t, pair<BeesHash, BeesAddress>> insert_map;
set<off_t> noinsert_set;
@@ -567,7 +597,7 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
// If the extent contains obscured blocks, and we can find no
// other refs to the extent that reveal those blocks, nuke the incoming extent.
// Don't rewrite extents that are bigger than the maximum FILE_EXTENT_SAME size
// because we can't make extents that large with dedup.
// because we can't make extents that large with dedupe.
// Don't rewrite small extents because it is a waste of time without being
// able to combine them into bigger extents.
if (!rewrite_extent && (e.flags() & Extent::OBSCURED) && (e.physical_len() > BLOCK_SIZE_MAX_COMPRESSED_EXTENT) && (e.physical_len() < BLOCK_SIZE_MAX_EXTENT_SAME)) {
@@ -723,7 +753,7 @@ BeesContext::scan_forward(const BeesFileRange &bfr)

Extent e;
catch_all([&]() {
while (true) {
while (!stop_requested()) {
e = ew.current();

catch_all([&]() {
@@ -756,18 +786,88 @@ BeesResolveAddrResult::BeesResolveAddrResult()
{
}

void
BeesContext::wait_for_balance()
{
if (!BEES_SERIALIZE_BALANCE) {
return;
}

Timer balance_timer;
BEESNOTE("WORKAROUND: waiting for balance to stop");
while (true) {
btrfs_ioctl_balance_args args;
memset_zero<btrfs_ioctl_balance_args>(&args);
const int ret = ioctl(root_fd(), BTRFS_IOC_BALANCE_PROGRESS, &args);
if (ret < 0) {
// Either can't get balance status or not running, exit either way
break;
}

if (!(args.state & BTRFS_BALANCE_STATE_RUNNING)) {
// Balance not running, doesn't matter if paused or cancelled
break;
}

BEESLOGDEBUG("WORKAROUND: Waiting " << balance_timer << "s for balance to stop");
unique_lock<mutex> lock(m_abort_mutex);
if (m_abort_requested) {
// Force the calling function to stop. We cannot
// proceed to LOGICAL_INO while balance is running
// until the bugs are fixed, and it's probably
// not going to be particularly fast to have
// both bees and balance banging the disk anyway.
BeesTracer::set_silent();
throw std::runtime_error("Stop requested while balance running");
}
m_abort_condvar.wait_for(lock, chrono::duration<double>(BEES_BALANCE_POLL_INTERVAL));
}
}

BeesResolveAddrResult
BeesContext::resolve_addr_uncached(BeesAddress addr)
{
THROW_CHECK1(invalid_argument, addr, !addr.is_magic());
THROW_CHECK0(invalid_argument, !!root_fd());

// If we look at per-thread CPU usage we get a better estimate of
// how badly btrfs is performing without confounding factors like
// transaction latency, competing threads, and freeze/SIGSTOP
// pausing the bees process.

// There can be only one of these running at a time, or some lingering
// backref bug will kill the whole system. Also it looks like there
// are so many locks held while LOGICAL_INO runs that there is no
// point in trying to run two of them on the same filesystem.
// ...but it works most of the time, and the performance hit from
// not running resolve in multiple threads is significant.
// But "most of the time" really just means "between forced reboots",
// and with recent improvements in kernel uptime, this is now in the
// top 3 crash causes.
static mutex s_resolve_mutex;
unique_lock<mutex> lock(s_resolve_mutex, defer_lock);
if (BEES_SERIALIZE_RESOLVE) {
BEESNOTE("waiting to resolve addr " << addr);
lock.lock();
}

// Is there a bug where resolve and balance cause a crash (BUG_ON at fs/btrfs/ctree.c:1227)?
// Apparently yes, and more than one.
// Wait for the balance to finish before we run LOGICAL_INO
wait_for_balance();

// Time how long this takes
Timer resolve_timer;

// There is no performance benefit if we restrict the buffer size.
BtrfsIoctlLogicalInoArgs log_ino(addr.get_physical_or_zero());

// Get this thread's system CPU usage
struct rusage usage_before;
DIE_IF_MINUS_ONE(getrusage(RUSAGE_THREAD, &usage_before));

{
BEESTOOLONG("Resolving addr " << addr << " in " << root_path() << " refs " << log_ino.m_iors.size());
BEESNOTE("resolving addr " << addr << " with LOGICAL_INO");
if (log_ino.do_ioctl_nothrow(root_fd())) {
BEESCOUNT(resolve_ok);
} else {
@@ -776,20 +876,51 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)
BEESCOUNTADD(resolve_ms, resolve_timer.age() * 1000);
}

// Prevent unavoidable performance bug from crippling the rest of the system
// Again!
struct rusage usage_after;
DIE_IF_MINUS_ONE(getrusage(RUSAGE_THREAD, &usage_after));

double sys_usage_delta =
(usage_after.ru_stime.tv_sec + usage_after.ru_stime.tv_usec / 1000000.0) -
(usage_before.ru_stime.tv_sec + usage_before.ru_stime.tv_usec / 1000000.0);

double user_usage_delta =
(usage_after.ru_utime.tv_sec + usage_after.ru_utime.tv_usec / 1000000.0) -
(usage_before.ru_utime.tv_sec + usage_before.ru_utime.tv_usec / 1000000.0);

auto rt_age = resolve_timer.age();

// Avoid performance bug
BeesResolveAddrResult rv;
rv.m_biors = log_ino.m_iors;
if (rt_age < BEES_TOXIC_DURATION && log_ino.m_iors.size() < BEES_MAX_EXTENT_REF_COUNT) {

// Avoid performance problems - pretend resolve failed if there are too many refs
const size_t rv_count = log_ino.m_iors.size();
if (rv_count < BEES_MAX_EXTENT_REF_COUNT) {
rv.m_biors = log_ino.m_iors;
} else {
BEESLOGINFO("addr " << addr << " refs " << rv_count << " overflows configured ref limit " << BEES_MAX_EXTENT_REF_COUNT);
BEESCOUNT(resolve_overflow);
}

// Avoid crippling performance bug
if (sys_usage_delta < BEES_TOXIC_SYS_DURATION) {
rv.m_is_toxic = false;
} else {
BEESLOGWARN("WORKAROUND: toxic address " << addr << " in " << root_path() << " with " << log_ino.m_iors.size() << " refs took " << rt_age << "s in LOGICAL_INO");
BEESLOGNOTICE("WORKAROUND: toxic address: addr = " << addr << ", sys_usage_delta = " << round(sys_usage_delta * 1000.0) / 1000.0 << ", user_usage_delta = " << round(user_usage_delta * 1000.0) / 1000.0 << ", rt_age = " << rt_age << ", refs " << rv_count);
BEESCOUNT(resolve_toxic);
rv.m_is_toxic = true;
}

// Count how many times this happens so we can figure out how
// important this case is
static size_t most_refs_ever = 2730;
if (rv_count > most_refs_ever) {
BEESLOGINFO("addr " << addr << " refs " << rv_count << " beats previous record " << most_refs_ever);
most_refs_ever = rv_count;
}
if (rv_count > 2730) {
BEESCOUNT(resolve_large);
}

return rv;
}
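The toxicity test above keys off per-thread system CPU time rather than wall-clock time, so scheduling delays and other threads do not pollute the measurement. A sketch of that measurement in isolation (Linux-specific; thread_sys_seconds() is an invented helper, and RUSAGE_THREAD needs _GNU_SOURCE on glibc):

	#include <stdexcept>
	#include <sys/resource.h>

	// System CPU seconds consumed by the calling thread only.
	static double thread_sys_seconds() {
		struct rusage ru;
		if (getrusage(RUSAGE_THREAD, &ru)) {
			throw std::runtime_error("getrusage failed");
		}
		return ru.ru_stime.tv_sec + ru.ru_stime.tv_usec / 1000000.0;
	}

	// Usage sketch:
	//   double before = thread_sys_seconds();
	//   run_logical_ino_ioctl();                       // hypothetical call
	//   double sys_delta = thread_sys_seconds() - before;
	//   if (sys_delta >= toxic_threshold) mark_address_toxic();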
@@ -816,10 +947,6 @@ BeesContext::set_root_fd(Fd fd)
Stat st(fd);
THROW_CHECK1(invalid_argument, st.st_ino, st.st_ino == BTRFS_FIRST_FREE_OBJECTID);
m_root_fd = fd;
BtrfsIoctlFsInfoArgs fsinfo;
fsinfo.do_ioctl(fd);
m_root_uuid = fsinfo.uuid();
BEESLOGINFO("Filesystem UUID is " << m_root_uuid);

// 65536 is big enough for two max-sized extents.
// Need enough total space in the cache for the maximum number of active threads.
@@ -827,80 +954,210 @@ BeesContext::set_root_fd(Fd fd)
m_resolve_cache.func([&](BeesAddress addr) -> BeesResolveAddrResult {
return resolve_addr_uncached(addr);
});
}

// Start queue producers
roots();

BEESLOGINFO("returning from set_root_fd in " << name_fd(fd));
const char *
BeesHalt::what() const noexcept
{
return "bees stop requested";
}

void
BeesContext::blacklist_add(const BeesFileId &fid)
BeesContext::start()
{
BEESLOGNOTICE("Starting bees main loop...");
BEESNOTE("starting BeesContext");

m_progress_thread = make_shared<BeesThread>("progress_report");
m_status_thread = make_shared<BeesThread>("status_report");
m_progress_thread->exec([=]() {
show_progress();
});
m_status_thread->exec([=]() {
dump_status();
});

// Set up temporary file pool
m_tmpfile_pool.generator([=]() -> shared_ptr<BeesTempFile> {
return make_shared<BeesTempFile>(shared_from_this());
});
m_tmpfile_pool.checkin([](const shared_ptr<BeesTempFile> &btf) {
catch_all([&](){
btf->reset();
});
});

// Force these to exist now so we don't have recursive locking
// operations trying to access them
fd_cache();
hash_table();

// Kick off the crawlers
roots()->start();
}

void
BeesContext::stop()
{
Timer stop_timer;
BEESLOGNOTICE("Stopping bees...");

BEESNOTE("aborting blocked tasks");
BEESLOGDEBUG("Aborting blocked tasks");
unique_lock<mutex> abort_lock(m_abort_mutex);
m_abort_requested = true;
m_abort_condvar.notify_all();
abort_lock.unlock();

BEESNOTE("pausing work queue");
BEESLOGDEBUG("Pausing work queue");
TaskMaster::set_thread_count(0);

BEESNOTE("setting stop_request flag");
BEESLOGDEBUG("Setting stop_request flag");
unique_lock<mutex> lock(m_stop_mutex);
m_stop_requested = true;
m_stop_condvar.notify_all();
lock.unlock();

// Stop crawlers first so we get good progress persisted on disk
BEESNOTE("stopping crawlers");
BEESLOGDEBUG("Stopping crawlers");
if (m_roots) {
m_roots->stop();
m_roots.reset();
} else {
BEESLOGDEBUG("Crawlers not running");
}

BEESNOTE("cancelling work queue");
BEESLOGDEBUG("Cancelling work queue");
TaskMaster::cancel();

BEESNOTE("stopping hash table");
BEESLOGDEBUG("Stopping hash table");
if (m_hash_table) {
m_hash_table->stop();
m_hash_table.reset();
} else {
BEESLOGDEBUG("Hash table not running");
}

BEESNOTE("closing tmpfiles");
BEESLOGDEBUG("Closing tmpfiles");
m_tmpfile_pool.clear();

BEESNOTE("closing FD caches");
BEESLOGDEBUG("Closing FD caches");
if (m_fd_cache) {
m_fd_cache->clear();
BEESNOTE("destroying FD caches");
BEESLOGDEBUG("Destroying FD caches");
m_fd_cache.reset();
}

BEESNOTE("waiting for progress thread");
BEESLOGDEBUG("Waiting for progress thread");
m_progress_thread->join();

// XXX: nobody can see this BEESNOTE because we are killing the
// thread that publishes it
BEESNOTE("waiting for status thread");
BEESLOGDEBUG("Waiting for status thread");
lock.lock();
m_stop_status = true;
m_stop_condvar.notify_all();
lock.unlock();
m_status_thread->join();

BEESLOGNOTICE("bees stopped in " << stop_timer << " sec");
}

bool
BeesContext::stop_requested() const
{
unique_lock<mutex> lock(m_stop_mutex);
return m_stop_requested;
}

void
BeesContext::blacklist_insert(const BeesFileId &fid)
{
BEESLOGDEBUG("Adding " << fid << " to blacklist");
unique_lock<mutex> lock(m_blacklist_mutex);
m_blacklist.insert(fid);
}

void
BeesContext::blacklist_erase(const BeesFileId &fid)
{
BEESLOGDEBUG("Removing " << fid << " from blacklist");
unique_lock<mutex> lock(m_blacklist_mutex);
m_blacklist.erase(fid);
}

bool
BeesContext::is_blacklisted(const BeesFileId &fid) const
{
// Everything on root 1 is blacklisted, no locks necessary.
// Everything on root 1 is blacklisted (it is mostly free space cache), no locks necessary.
if (fid.root() == 1) {
return true;
}
unique_lock<mutex> lock(m_blacklist_mutex);
return m_blacklist.count(fid);
return m_blacklist.find(fid) != m_blacklist.end();
}

shared_ptr<BeesTempFile>
BeesContext::tmpfile()
{
// There need be only one, this is not a high-contention path
static mutex s_mutex;
unique_lock<mutex> lock(s_mutex);
unique_lock<mutex> lock(m_stop_mutex);

if (!m_tmpfiles[this_thread::get_id()]) {
m_tmpfiles[this_thread::get_id()] = make_shared<BeesTempFile>(shared_from_this());
if (m_stop_requested) {
throw BeesHalt();
}
auto rv = m_tmpfiles[this_thread::get_id()];
return rv;

lock.unlock();

return m_tmpfile_pool();
}

shared_ptr<BeesFdCache>
BeesContext::fd_cache()
{
static mutex s_mutex;
unique_lock<mutex> lock(s_mutex);
if (!m_fd_cache) {
m_fd_cache = make_shared<BeesFdCache>();
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_requested) {
throw BeesHalt();
}
auto rv = m_fd_cache;
return rv;
if (!m_fd_cache) {
m_fd_cache = make_shared<BeesFdCache>(shared_from_this());
}
return m_fd_cache;
}

shared_ptr<BeesRoots>
BeesContext::roots()
{
static mutex s_mutex;
unique_lock<mutex> lock(s_mutex);
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_requested) {
throw BeesHalt();
}
if (!m_roots) {
m_roots = make_shared<BeesRoots>(shared_from_this());
}
auto rv = m_roots;
return rv;
return m_roots;
}

shared_ptr<BeesHashTable>
BeesContext::hash_table()
{
static mutex s_mutex;
unique_lock<mutex> lock(s_mutex);
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_requested) {
throw BeesHalt();
}
if (!m_hash_table) {
m_hash_table = make_shared<BeesHashTable>(shared_from_this(), "beeshash.dat");
}
auto rv = m_hash_table;
return rv;
return m_hash_table;
}

void
@@ -910,9 +1167,3 @@ BeesContext::set_root_path(string path)
m_root_path = path;
set_root_fd(open_or_die(m_root_path, FLAGS_OPEN_DIR));
}

void
BeesContext::insert_root_ino(Fd fd)
{
fd_cache()->insert_root_ino(shared_from_this(), fd);
}
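The tmpfile(), fd_cache(), roots() and hash_table() accessors above all share one shape: take the stop mutex, refuse with BeesHalt once shutdown has begun, then lazily construct the component under the same lock. A generic sketch of that shape (lazy_component() and MiniHalt are invented names; it assumes T is default-constructible):

	#include <memory>
	#include <mutex>
	#include <stdexcept>

	struct MiniHalt : std::runtime_error {
		MiniHalt() : std::runtime_error("stop requested") {}
	};

	// Hand out a lazily built component, but never after shutdown has begun.
	template <class T>
	std::shared_ptr<T> lazy_component(std::mutex &stop_mutex, bool &stop_requested,
	                                  std::shared_ptr<T> &slot) {
		std::unique_lock<std::mutex> lock(stop_mutex);
		if (stop_requested) throw MiniHalt();
		if (!slot) slot = std::make_shared<T>();
		return slot;
	}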
src/bees-hash.cc (223 changes)
@@ -1,5 +1,6 @@
#include "bees.h"

#include "crucible/city.h"
#include "crucible/crc64.h"
#include "crucible/string.h"

@@ -11,6 +12,12 @@
using namespace crucible;
using namespace std;

BeesHash::BeesHash(const uint8_t *ptr, size_t len) :
// m_hash(CityHash64(reinterpret_cast<const char *>(ptr), len))
m_hash(Digest::CRC::crc64(ptr, len))
{
}

ostream &
operator<<(ostream &os, const BeesHash &bh)
{
@@ -35,7 +42,7 @@ dump_bucket_locked(BeesHashTable::Cell *p, BeesHashTable::Cell *q)
}
#endif

const bool VERIFY_CLEARS_BUGS = false;
static const bool VERIFY_CLEARS_BUGS = false;

bool
verify_cell_range(BeesHashTable::Cell *p, BeesHashTable::Cell *q, bool clear_bugs = VERIFY_CLEARS_BUGS)
@@ -108,8 +115,9 @@ BeesHashTable::flush_dirty_extent(uint64_t extent_index)
bool wrote_extent = false;

catch_all([&]() {
uint8_t *dirty_extent = m_extent_ptr[extent_index].p_byte;
uint8_t *dirty_extent_end = m_extent_ptr[extent_index + 1].p_byte;
uint8_t *const dirty_extent = m_extent_ptr[extent_index].p_byte;
uint8_t *const dirty_extent_end = m_extent_ptr[extent_index + 1].p_byte;
const size_t dirty_extent_offset = dirty_extent - m_byte_ptr;
THROW_CHECK1(out_of_range, dirty_extent, dirty_extent >= m_byte_ptr);
THROW_CHECK1(out_of_range, dirty_extent_end, dirty_extent_end <= m_byte_ptr_end);
THROW_CHECK2(out_of_range, dirty_extent_end, dirty_extent, dirty_extent_end - dirty_extent == BLOCK_SIZE_HASHTAB_EXTENT);
@@ -124,19 +132,21 @@ BeesHashTable::flush_dirty_extent(uint64_t extent_index)
lock.unlock();

// Write the extent (or not)
pwrite_or_die(m_fd, extent_copy, dirty_extent - m_byte_ptr);
pwrite_or_die(m_fd, extent_copy, dirty_extent_offset);
BEESCOUNT(hash_extent_out);

// Nope, this causes a _dramatic_ loss of performance.
// const size_t dirty_extent_size = dirty_extent_end - dirty_extent;
// bees_unreadahead(m_fd, dirty_extent_offset, dirty_extent_size);

wrote_extent = true;
});

BEESNOTE("flush rate limited after extent #" << extent_index << " of " << m_extents << " extents");
m_flush_rate_limit.sleep_for(BLOCK_SIZE_HASHTAB_EXTENT);
return wrote_extent;
}

void
BeesHashTable::flush_dirty_extents()
size_t
BeesHashTable::flush_dirty_extents(bool slowly)
{
THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);

@@ -144,12 +154,24 @@ BeesHashTable::flush_dirty_extents()
for (size_t extent_index = 0; extent_index < m_extents; ++extent_index) {
if (flush_dirty_extent(extent_index)) {
++wrote_extents;
if (slowly) {
BEESNOTE("flush rate limited after extent #" << extent_index << " of " << m_extents << " extents");
chrono::duration<double> sleep_time(m_flush_rate_limit.sleep_time(BLOCK_SIZE_HASHTAB_EXTENT));
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_requested) {
BEESLOGDEBUG("Stop requested in hash table flush_dirty_extents");
// This function is called by another thread with !slowly,
// so we just get out of the way here.
break;
}
m_stop_condvar.wait_for(lock, sleep_time);
}
}
}

BEESNOTE("idle after writing " << wrote_extents << " of " << m_extents << " extents");
unique_lock<mutex> lock(m_dirty_mutex);
m_dirty_condvar.wait(lock);
if (!slowly) {
BEESLOGINFO("Flushed " << wrote_extents << " of " << m_extents << " extents");
}
return wrote_extents;
}

void
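The slowly path above trades an uninterruptible sleep for a condition-variable wait, so stop() can cut the inter-extent pause short. The idiom, reduced to its core (interruptible_sleep() is an invented helper name):

	#include <chrono>
	#include <condition_variable>
	#include <mutex>

	// Sleep up to 'seconds', returning early (true) if stop is requested.
	bool interruptible_sleep(std::mutex &m, std::condition_variable &cv,
	                         bool &stop_requested, double seconds) {
		std::unique_lock<std::mutex> lock(m);
		cv.wait_for(lock, std::chrono::duration<double>(seconds),
		            [&] { return stop_requested; });
		return stop_requested;
	}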
@@ -160,15 +182,34 @@ BeesHashTable::set_extent_dirty_locked(uint64_t extent_index)

// Signal writeback thread
unique_lock<mutex> dirty_lock(m_dirty_mutex);
m_dirty = true;
m_dirty_condvar.notify_one();
}

void
BeesHashTable::writeback_loop()
{
while (true) {
flush_dirty_extents();
while (!m_stop_requested) {
auto wrote_extents = flush_dirty_extents(true);

BEESNOTE("idle after writing " << wrote_extents << " of " << m_extents << " extents");

unique_lock<mutex> lock(m_dirty_mutex);
if (m_stop_requested) {
break;
}
if (m_dirty) {
m_dirty = false;
} else {
m_dirty_condvar.wait(lock);
}
}
catch_all([&]() {
// trigger writeback on our way out
BEESTOOLONG("unreadahead hash table size " << pretty(m_size));
bees_unreadahead(m_fd, 0, m_size);
});
BEESLOGDEBUG("Exited hash table writeback_loop");
}

static
@@ -186,7 +227,7 @@
void
BeesHashTable::prefetch_loop()
{
bool not_locked = true;
while (true) {
while (!m_stop_requested) {
size_t width = 64;
vector<size_t> occupancy(width, 0);
size_t occupied_count = 0;
@@ -196,7 +237,8 @@ BeesHashTable::prefetch_loop()
size_t toxic_count = 0;
size_t unaligned_eof_count = 0;

for (uint64_t ext = 0; ext < m_extents; ++ext) {
m_prefetch_running = true;
for (uint64_t ext = 0; ext < m_extents && !m_stop_requested; ++ext) {
BEESNOTE("prefetching hash table extent #" << ext << " of " << m_extents);
catch_all([&]() {
fetch_missing_extent_by_index(ext);
@@ -237,6 +279,7 @@ BeesHashTable::prefetch_loop()
}
});
}
m_prefetch_running = false;

BEESNOTE("calculating hash table statistics");

@@ -300,7 +343,7 @@ BeesHashTable::prefetch_loop()
m_stats_file.write(graph_blob.str());
});

if (not_locked) {
if (not_locked && !m_stop_requested) {
// Always do the mlock, whether shared or not
THROW_CHECK1(runtime_error, m_size, m_size > 0);
BEESLOGINFO("mlock(" << pretty(m_size) << ")...");
@@ -314,7 +357,12 @@
}

BEESNOTE("idle " << BEES_HASH_TABLE_ANALYZE_INTERVAL << "s");
nanosleep(BEES_HASH_TABLE_ANALYZE_INTERVAL);
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_requested) {
BEESLOGDEBUG("Stop requested in hash table prefetch");
return;
}
m_stop_condvar.wait_for(lock, chrono::duration<double>(BEES_HASH_TABLE_ANALYZE_INTERVAL));
}
}
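Distilled, the writeback loop above is a classic dirty-flag consumer: flush, then either consume a dirty flag that was raised while flushing or sleep until set_extent_dirty_locked() raises it, re-checking the stop flag at each step. A compact sketch with invented parameter names:

	#include <condition_variable>
	#include <functional>
	#include <mutex>

	void writeback_loop_sketch(bool &stop, bool &dirty,
	                           std::mutex &m, std::condition_variable &cv,
	                           const std::function<void()> &flush) {
		while (!stop) {
			flush();
			std::unique_lock<std::mutex> lock(m);
			if (stop) break;
			if (dirty) dirty = false;  // more work arrived during the flush
			else cv.wait(lock);        // sleep until the producer flags us
		}
	}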
@@ -360,19 +408,30 @@ BeesHashTable::fetch_missing_extent_by_index(uint64_t extent_index)
BEESTRACE("Fetching hash extent #" << extent_index << " of " << m_extents << " extents");
BEESTOOLONG("Fetching hash extent #" << extent_index << " of " << m_extents << " extents");

uint8_t *dirty_extent = m_extent_ptr[extent_index].p_byte;
uint8_t *dirty_extent_end = m_extent_ptr[extent_index + 1].p_byte;
uint8_t *const dirty_extent = m_extent_ptr[extent_index].p_byte;
uint8_t *const dirty_extent_end = m_extent_ptr[extent_index + 1].p_byte;
const size_t dirty_extent_size = dirty_extent_end - dirty_extent;
const size_t dirty_extent_offset = dirty_extent - m_byte_ptr;

// If the read fails don't retry, just go with whatever data we have
m_extent_metadata.at(extent_index).m_missing = false;

catch_all([&]() {
BEESTOOLONG("pread(fd " << m_fd << " '" << name_fd(m_fd) << "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
pread_or_die(m_fd, dirty_extent, dirty_extent_end - dirty_extent, dirty_extent - m_byte_ptr);
});
pread_or_die(m_fd, dirty_extent, dirty_extent_size, dirty_extent_offset);

// Only count extents successfully read
BEESCOUNT(hash_extent_in);
// Only count extents successfully read
BEESCOUNT(hash_extent_in);

// Won't need that again
bees_unreadahead(m_fd, dirty_extent_offset, dirty_extent_size);

// If we are in prefetch, give the kernel a hint about the next extent
if (m_prefetch_running) {
// XXX: don't call this if bees_readahead is implemented by pread()
bees_readahead(m_fd, dirty_extent_offset + dirty_extent_size, dirty_extent_size);
}
});
}

void
@@ -384,25 +443,9 @@ BeesHashTable::fetch_missing_extent_by_hash(HashType hash)
fetch_missing_extent_by_index(extent_index);
}

bool
BeesHashTable::is_toxic_hash(BeesHashTable::HashType hash) const
{
return m_toxic_hashes.find(hash) != m_toxic_hashes.end();
}

vector<BeesHashTable::Cell>
BeesHashTable::find_cell(HashType hash)
{
// This saves a lot of time prefilling the hash table, and there's no risk of eviction
if (is_toxic_hash(hash)) {
BEESCOUNT(hash_toxic);
BeesAddress toxic_addr(0x1000);
toxic_addr.set_toxic();
Cell toxic_cell(hash, toxic_addr);
vector<Cell> rv;
rv.push_back(toxic_cell);
return rv;
}
fetch_missing_extent_by_hash(hash);
BEESTOOLONG("find_cell hash " << BeesHash(hash));
vector<Cell> rv;
@@ -414,11 +457,9 @@ BeesHashTable::find_cell(HashType hash)
return rv;
}

// Move an entry to the end of the list. Used after an attempt to resolve
// an address in the hash table fails. Probably more correctly called
// push_back_hash_addr, except it never inserts. Shared hash tables
// never erase anything, since there is no way to tell if an entry is
// out of date or just belonging to the wrong filesystem.
/// Remove a hash from the table, leaving an empty space on the list
/// where the hash used to be. Used when an invalid address is found
/// because lookups on invalid addresses really hurt.
void
BeesHashTable::erase_hash_addr(HashType hash, AddrType addr)
{
@@ -430,7 +471,6 @@ BeesHashTable::erase_hash_addr(HashType hash, AddrType addr)
Cell *ip = find(er.first, er.second, mv);
bool found = (ip < er.second);
if (found) {
// Lookups on invalid addresses really hurt us. Kill it with fire!
*ip = Cell(0, 0);
set_extent_dirty_locked(hash_to_extent_index(hash));
BEESCOUNT(hash_erase);
@@ -439,14 +479,17 @@ BeesHashTable::erase_hash_addr(HashType hash, AddrType addr)
BEESLOGDEBUG("while erasing hash " << hash << " addr " << addr);
}
#endif
} else {
BEESCOUNT(hash_erase_miss);
}
}

// If entry is already present in list, move it to the front of the
// list without dropping any entries, and return true. If entry is not
// present in list, insert it at the front of the list, possibly dropping
// the last entry in the list, and return false. Used to move duplicate
// hash blocks to the front of the list.
/// Insert a hash entry at the head of the list. If entry is already
/// present in list, move it to the front of the list without dropping
/// any entries, and return true. If entry is not present in list,
/// insert it at the front of the list, possibly dropping the last entry
/// in the list, and return false. Used to move duplicate hash blocks
/// to the front of the list.
bool
BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
{
@@ -470,7 +513,7 @@ BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
auto dp = ip;
--sp;
// If we are deleting the last entry then don't copy it
if (ip == er.second) {
if (dp == er.second) {
--sp;
--dp;
BEESCOUNT(hash_evict);
@@ -484,6 +527,8 @@ BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
er.first[0] = mv;
set_extent_dirty_locked(hash_to_extent_index(hash));
BEESCOUNT(hash_front);
} else {
BEESCOUNT(hash_front_already);
}
#if 0
if (verify_cell_range(er.first, er.second)) {
@@ -493,11 +538,12 @@ BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
return found;
}

// If entry is already present in list, returns true and does not
// modify list. If entry is not present in list, returns false and
// inserts at a random position in the list, possibly evicting the entry
// at the end of the list. Used to insert new unique (not-yet-duplicate)
// blocks in random order.
/// Insert a hash entry at some unspecified point in the list.
/// If entry is already present in list, returns true and does not
/// modify list. If entry is not present in list, returns false and
/// inserts at a random position in the list, possibly evicting the entry
/// at the end of the list. Used to insert new unique (not-yet-duplicate)
/// blocks in random order.
bool
BeesHashTable::push_random_hash_addr(HashType hash, AddrType addr)
{
@@ -514,7 +560,9 @@ BeesHashTable::push_random_hash_addr(HashType hash, AddrType addr)
auto pos = distribution(generator);

int case_cond = 0;
#if 0
vector<Cell> saved(er.first, er.second);
#endif

if (found) {
// If hash already exists after pos, swap with pos
@@ -560,7 +608,12 @@ BeesHashTable::push_random_hash_addr(HashType hash, AddrType addr)
}

// Evict something and insert at pos
move_backward(er.first + pos, er.second - 1, er.second);
// move_backward(er.first + pos, er.second - 1, er.second);
ip = er.second - 1;
while (ip > er.first + pos) {
auto dp = ip;
*dp = *--ip;
}
er.first[pos] = mv;
BEESCOUNT(hash_evict);
case_cond = 5;
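push_front_hash_addr() above is an LRU-style promotion inside a fixed-size bucket: an existing entry is surfaced to the front, a new entry shifts everything down and drops the tail. A reduced model of that list surgery (push_front_cell() and Cell are invented simplifications; it assumes a non-empty bucket):

	#include <algorithm>
	#include <cstdint>
	#include <utility>
	#include <vector>

	using Cell = std::pair<uint64_t, uint64_t>;  // (hash, addr)

	// Returns true if the entry was already present (promoted),
	// false if it was newly inserted (tail entry evicted).
	bool push_front_cell(std::vector<Cell> &bucket, Cell mv) {
		auto ip = std::find(bucket.begin(), bucket.end(), mv);
		bool found = (ip != bucket.end());
		if (!found) ip = bucket.end() - 1;           // evict the last entry
		std::move_backward(bucket.begin(), ip, ip + 1);
		bucket.front() = mv;                         // new or promoted entry in front
		return found;
	}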
@@ -712,26 +765,58 @@ BeesHashTable::BeesHashTable(shared_ptr<BeesContext> ctx, string filename, off_t

// Blacklist might fail if the hash table is not stored on a btrfs
catch_all([&]() {
m_ctx->blacklist_add(BeesFileId(m_fd));
m_ctx->blacklist_insert(BeesFileId(m_fd));
});

// Skip zero because we already weed that out before it gets near a hash function
for (unsigned i = 1; i < 256; ++i) {
vector<uint8_t> v(BLOCK_SIZE_SUMS, i);
HashType hash = Digest::CRC::crc64(v.data(), v.size());
m_toxic_hashes.insert(hash);
}
}

BeesHashTable::~BeesHashTable()
{
BEESLOGDEBUG("Destroy BeesHashTable");
if (m_cell_ptr && m_size) {
flush_dirty_extents();
// Dirty extents should have been flushed before now,
// e.g. in stop(). If that didn't happen, don't fall
// into the same trap (and maybe throw an exception) here.
// flush_dirty_extents(false);
catch_all([&]() {
// drop the memory mapping
BEESTOOLONG("unmap handle table size " << pretty(m_size));
DIE_IF_NON_ZERO(munmap(m_cell_ptr, m_size));
m_cell_ptr = nullptr;
m_size = 0;
});
m_cell_ptr = nullptr;
m_size = 0;
}
BEESLOGDEBUG("BeesHashTable destroyed");
}

void
BeesHashTable::stop()
{
BEESNOTE("stopping BeesHashTable threads");
BEESLOGDEBUG("Stopping BeesHashTable threads");

unique_lock<mutex> lock(m_stop_mutex);
m_stop_requested = true;
m_stop_condvar.notify_all();
lock.unlock();

// Wake up hash writeback too
unique_lock<mutex> dirty_lock(m_dirty_mutex);
m_dirty_condvar.notify_all();
dirty_lock.unlock();

BEESNOTE("waiting for hash_prefetch thread");
BEESLOGDEBUG("Waiting for hash_prefetch thread");
m_prefetch_thread.join();

BEESNOTE("waiting for hash_writeback thread");
BEESLOGDEBUG("Waiting for hash_writeback thread");
m_writeback_thread.join();

if (m_cell_ptr && m_size) {
BEESLOGDEBUG("Flushing hash table");
BEESNOTE("flushing hash table");
flush_dirty_extents(false);
}

BEESLOGDEBUG("BeesHashTable stopped");
}
src/bees-resolve.cc

@@ -161,9 +161,11 @@ BeesResolver::adjust_offset(const BeesFileRange &haystack, const BeesBlockData &

// Found the hash but not the data. Yay!
m_found_hash = true;
#if 0
BEESLOGINFO("HASH COLLISION\n"
<< "\tneedle " << needle << "\n"
<< "\tstraw " << straw);
#endif
BEESCOUNT(hash_collision);

// Ran out of offsets to try
@@ -227,6 +229,8 @@ BeesResolver::chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &
// Search near the resolved address for a matching data block.
// ...even if it's not compressed, we should do this sanity
// check before considering the block as a duplicate candidate.
// FIXME: this is mostly obsolete now and we shouldn't do it here.
// Don't bother fixing it because it will all go away with (extent, offset) reads.
auto new_bbd = adjust_offset(haystack_bbd, needle_bbd);
if (new_bbd.empty()) {
// matching offset search failed
@@ -413,6 +417,7 @@ BeesResolver::replace_dst(const BeesFileRange &dst_bfr)
if (bbd.addr().get_physical_or_zero() == src_bbd.addr().get_physical_or_zero()) {
BEESCOUNT(replacedst_same);
// stop looping here, all the other srcs will probably fail this test too
BeesTracer::set_silent();
throw runtime_error("FIXME: bailing out here, need to fix this further up the call stack");
}

@@ -435,7 +440,7 @@ BeesResolver::replace_dst(const BeesFileRange &dst_bfr)
BEESCOUNT(replacedst_dedup_hit);
m_found_dup = true;
overlap_bfr = brp.second;
// FIXME: find best range first, then dedup that
// FIXME: find best range first, then dedupe that
return true; // i.e. break
} else {
BEESCOUNT(replacedst_dedup_miss);
@@ -5,14 +5,13 @@
|
||||
#include "crucible/string.h"
|
||||
#include "crucible/task.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <tuple>
|
||||
|
||||
using namespace crucible;
|
||||
using namespace std;
|
||||
|
||||
BeesRoots::ScanMode BeesRoots::s_scan_mode = BeesRoots::SCAN_MODE_ZERO;
|
||||
|
||||
string
|
||||
format_time(time_t t)
|
||||
{
|
||||
@@ -46,8 +45,8 @@ BeesCrawlState::BeesCrawlState() :
|
||||
bool
|
||||
BeesCrawlState::operator<(const BeesCrawlState &that) const
|
||||
{
|
||||
return tie(m_min_transid, m_objectid, m_offset, m_root, m_max_transid)
|
||||
< tie(that.m_min_transid, that.m_objectid, that.m_offset, that.m_root, that.m_max_transid);
|
||||
return tie(m_min_transid, m_max_transid, m_objectid, m_offset, m_root)
|
||||
< tie(that.m_min_transid, that.m_max_transid, that.m_objectid, that.m_offset, that.m_root);
|
||||
}
|
||||
|
||||
string
|
||||
@@ -67,27 +66,26 @@ void
BeesRoots::set_scan_mode(ScanMode mode)
{
THROW_CHECK1(invalid_argument, mode, mode < SCAN_MODE_COUNT);
s_scan_mode = mode;
m_scan_mode = mode;
BEESLOGINFO("Scan mode set to " << mode << " (" << scan_mode_ntoa(mode) << ")");
}

void
BeesRoots::set_workaround_btrfs_send(bool do_avoid)
{
m_workaround_btrfs_send = do_avoid;
if (m_workaround_btrfs_send) {
BEESLOGINFO("WORKAROUND: btrfs send workaround enabled");
} else {
BEESLOGINFO("btrfs send workaround disabled");
}
}

string
BeesRoots::crawl_state_filename() const
{
string rv;

// Legacy filename included UUID
rv += "beescrawl.";
rv += m_ctx->root_uuid();
rv += ".dat";

struct stat buf;
if (fstatat(m_ctx->home_fd(), rv.c_str(), &buf, AT_SYMLINK_NOFOLLOW)) {
// Use new filename
rv = "beescrawl.dat";
}

return rv;
// Legacy filename included UUID. That feature was removed in 2016.
return "beescrawl.dat";
}

ostream &
@@ -139,12 +137,6 @@ BeesRoots::state_save()

m_crawl_state_file.write(ofs.str());

// Renaming things is hard after release
if (m_crawl_state_file.name() != "beescrawl.dat") {
renameat(m_ctx->home_fd(), m_crawl_state_file.name().c_str(), m_ctx->home_fd(), "beescrawl.dat");
m_crawl_state_file.name("beescrawl.dat");
}

BEESNOTE("relocking crawl state");
lock.lock();
// Not really correct but probably close enough
@@ -185,9 +177,12 @@ BeesRoots::transid_min()
return 0;
}
uint64_t rv = numeric_limits<uint64_t>::max();
const uint64_t max_rv = rv;
for (auto i : m_root_crawl_map) {
rv = min(rv, i.second->get_state_end().m_min_transid);
}
// If we get through this loop without setting rv, we'll create broken crawlers due to integer overflow.
THROW_CHECK2(runtime_error, rv, max_rv, max_rv > rv);
return rv;
}

@@ -195,43 +190,37 @@ uint64_t
BeesRoots::transid_max_nocache()
{
uint64_t rv = 0;
uint64_t root = BTRFS_FS_TREE_OBJECTID;
BEESNOTE("Calculating transid_max (" << rv << " as of root " << root << ")");
BEESTRACE("Calculating transid_max...");

rv = btrfs_get_root_transid(root);

// XXX: Do we need any of this? Or is
// m_transid_re.update(btrfs_get_root_transid(BTRFS_FS_TREE_OBJECTID)) good enough?
BEESNOTE("Calculating transid_max");
BEESTRACE("Calculating transid_max");

// We look for the root of the extent tree and read its transid.
// Should run in O(1) time and be fairly reliable.
BtrfsIoctlSearchKey sk;
sk.tree_id = BTRFS_ROOT_TREE_OBJECTID;
sk.min_type = sk.max_type = BTRFS_ROOT_BACKREF_KEY;
sk.min_objectid = root;
sk.min_type = sk.max_type = BTRFS_ROOT_ITEM_KEY;
sk.min_objectid = sk.max_objectid = BTRFS_EXTENT_TREE_OBJECTID;

while (true) {
sk.nr_items = 1024;
BEESTRACE("transid_max search sk " << sk);
sk.do_ioctl(m_ctx->root_fd());

if (sk.m_result.empty()) {
break;
}

// We are just looking for the highest transid on the filesystem.
// We don't care which object it comes from.
for (auto i : sk.m_result) {
sk.next_min(i);
if (i.type == BTRFS_ROOT_BACKREF_KEY) {
if (i.transid > rv) {
BEESLOGDEBUG("transid_max root " << i.objectid << " parent " << i.offset << " transid " << i.transid);
BEESCOUNT(transid_max_miss);
}
root = i.objectid;
}
if (i.transid > rv) {
rv = i.transid;
}
}
}
m_transid_re.update(rv);

// transid must be greater than zero, or we did something very wrong
THROW_CHECK1(runtime_error, rv, rv > 0);
return rv;
}
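
The new transid_max_nocache() narrows the TREE_SEARCH from "walk every ROOT_BACKREF" to "read one ROOT_ITEM": every committed transaction touches the extent tree, so that single item's transid tracks the filesystem's latest commit. A minimal sketch of the same lookup, reusing the crucible helpers that appear in the diff (the header path and exact BtrfsIoctlSearchKey API are assumptions based on the code above):

	#include "crucible/fs.h"   // assumed home of BtrfsIoctlSearchKey

	#include <cstdint>
	#include <stdexcept>

	// Read the current transid of the extent tree's root item.
	// Sketch only: error handling reduced to a single throw.
	uint64_t extent_tree_transid(int root_fd)
	{
		BtrfsIoctlSearchKey sk;
		sk.tree_id = BTRFS_ROOT_TREE_OBJECTID;            // tree of tree roots
		sk.min_type = sk.max_type = BTRFS_ROOT_ITEM_KEY;  // only ROOT_ITEM entries
		sk.min_objectid = sk.max_objectid = BTRFS_EXTENT_TREE_OBJECTID;
		sk.nr_items = 1;                                  // exactly one item expected
		sk.do_ioctl(root_fd);
		if (sk.m_result.empty()) {
			throw std::runtime_error("extent tree root item not found");
		}
		return sk.m_result.begin()->transid;
	}
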
@@ -245,6 +234,7 @@ size_t
BeesRoots::crawl_batch(shared_ptr<BeesCrawl> this_crawl)
{
BEESNOTE("Crawling batch " << this_crawl->get_state_begin());
BEESTRACE("Crawling batch " << this_crawl->get_state_begin());
auto ctx_copy = m_ctx;
size_t batch_count = 0;
auto subvol = this_crawl->get_state_begin().m_root;
@@ -285,7 +275,7 @@ BeesRoots::crawl_roots()
BEESLOGINFO("idle: crawl map is empty!");
}

switch (s_scan_mode) {
switch (m_scan_mode) {

case SCAN_MODE_ZERO: {
// Scan the same inode/offset tuple in each subvol (good for snapshots)
@@ -370,6 +360,13 @@ BeesRoots::crawl_roots()
return false;
}

void
BeesRoots::clear_caches()
{
m_ctx->fd_cache()->clear();
m_root_ro_cache.clear();
}

void
BeesRoots::crawl_thread()
{
@@ -387,22 +384,22 @@ BeesRoots::crawl_thread()
}
if (run_again) {
shared_this->m_crawl_task.run();
} else {
shared_this->m_task_running = false;
}
});

// Monitor transid_max and wake up roots when it changes
BEESNOTE("tracking transid");
auto last_count = m_transid_re.count();
while (true) {
// Measure current transid
while (!m_stop_requested) {
BEESTRACE("Measure current transid");
catch_all([&]() {
BEESTRACE("calling transid_max_nocache");
m_transid_re.update(transid_max_nocache());
});

// Make sure we have a full complement of crawlers
BEESTRACE("Make sure we have a full complement of crawlers");
catch_all([&]() {
BEESTRACE("calling insert_new_crawl");
insert_new_crawl();
});

@@ -412,22 +409,22 @@ BeesRoots::crawl_thread()
// Even open files are a problem if they're big enough.
auto new_count = m_transid_re.count();
if (new_count != last_count) {
m_ctx->fd_cache()->clear();
clear_caches();
}
last_count = new_count;

// If no crawl task is running, start a new one
bool already_running = m_task_running.exchange(true);
if (!already_running) {
auto resumed_after_time = m_crawl_timer.lap();
BEESLOGINFO("Crawl master resumed after " << resumed_after_time << "s at transid " << new_count);
m_crawl_task.run();
}
m_crawl_task.run();

auto poll_time = m_transid_re.seconds_for(m_transid_factor);
BEESLOGDEBUG("Polling " << poll_time << "s for next " << m_transid_factor << " transid " << m_transid_re);
BEESNOTE("waiting " << poll_time << "s for next " << m_transid_factor << " transid " << m_transid_re);
nanosleep(poll_time);
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_requested) {
BEESLOGDEBUG("Stop requested in crawl thread");
break;
}
m_stop_condvar.wait_for(lock, chrono::duration<double>(poll_time));
}
}
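
The loop above swaps an uninterruptible nanosleep() for a condition-variable wait, so stop() can wake the thread immediately instead of waiting out the poll interval. The pattern in isolation (a self-contained sketch, not the bees classes themselves):

	#include <chrono>
	#include <condition_variable>
	#include <iostream>
	#include <mutex>
	#include <thread>

	std::mutex stop_mutex;
	std::condition_variable stop_condvar;
	bool stop_requested = false;

	void worker(double poll_seconds)
	{
		while (true) {
			// ... do one unit of work here ...
			std::unique_lock<std::mutex> lock(stop_mutex);
			if (stop_requested) break;   // check the flag under the lock
			// Sleeps up to poll_seconds, but returns early when notified.
			stop_condvar.wait_for(lock, std::chrono::duration<double>(poll_seconds));
		}
		std::cout << "worker stopped\n";
	}

	int main()
	{
		std::thread t(worker, 60.0);
		{
			std::lock_guard<std::mutex> lock(stop_mutex);
			stop_requested = true;
		}
		stop_condvar.notify_all();   // wakes the sleeper at once
		t.join();
	}
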
@@ -442,7 +439,16 @@ BeesRoots::writeback_thread()
state_save();
});

nanosleep(BEES_WRITEBACK_INTERVAL);
unique_lock<mutex> lock(m_stop_mutex);
if (m_stop_requested) {
BEESLOGDEBUG("Stop requested in writeback thread");
catch_all([&]() {
BEESNOTE("flushing crawler state");
state_save();
});
return;
}
m_stop_condvar.wait_for(lock, chrono::duration<double>(BEES_WRITEBACK_INTERVAL));
}
}

@@ -475,19 +481,24 @@ BeesRoots::insert_new_crawl()
unique_lock<mutex> lock(m_mutex);
set<uint64_t> excess_roots;
for (auto i : m_root_crawl_map) {
BEESTRACE("excess_roots.insert(" << i.first << ")");
excess_roots.insert(i.first);
}
lock.unlock();

while (new_bcs.m_root) {
BEESTRACE("excess_roots.erase(" << new_bcs.m_root << ")");
excess_roots.erase(new_bcs.m_root);
BEESTRACE("insert_root(" << new_bcs << ")");
insert_root(new_bcs);
BEESCOUNT(crawl_create);
BEESTRACE("next_root(" << new_bcs.m_root << ")");
new_bcs.m_root = next_root(new_bcs.m_root);
}

for (auto i : excess_roots) {
new_bcs.m_root = i;
BEESTRACE("crawl_state_erase(" << new_bcs << ")");
crawl_state_erase(new_bcs);
}
}
@@ -524,6 +535,16 @@ BeesRoots::state_load()
loaded_state.m_started = d.at("started");
}
BEESLOGDEBUG("loaded_state " << loaded_state);
if (loaded_state.m_min_transid == numeric_limits<uint64_t>::max()) {
BEESLOGWARN("WARNING: root " << loaded_state.m_root << ": bad min_transid " << loaded_state.m_min_transid << ", resetting to 0");
loaded_state.m_min_transid = 0;
BEESCOUNT(bug_bad_min_transid);
}
if (loaded_state.m_max_transid == numeric_limits<uint64_t>::max()) {
BEESLOGWARN("WARNING: root " << loaded_state.m_root << ": bad max_transid " << loaded_state.m_max_transid << ", resetting to " << loaded_state.m_min_transid);
loaded_state.m_max_transid = loaded_state.m_min_transid;
BEESCOUNT(bug_bad_max_transid);
}
insert_root(loaded_state);
}
}
@@ -532,8 +553,16 @@ BeesRoots::BeesRoots(shared_ptr<BeesContext> ctx) :
m_ctx(ctx),
m_crawl_state_file(ctx->home_fd(), crawl_state_filename()),
m_crawl_thread("crawl_transid"),
m_writeback_thread("crawl_writeback"),
m_task_running(false)
m_writeback_thread("crawl_writeback")
{
m_root_ro_cache.func([&](uint64_t root) -> bool {
return is_root_ro_nocache(root);
});
m_root_ro_cache.max_size(BEES_ROOT_FD_CACHE_SIZE);
}

void
BeesRoots::start()
{
m_crawl_thread.exec([&]() {
// Measure current transid before creating any crawlers
@@ -545,6 +574,7 @@ BeesRoots::BeesRoots(shared_ptr<BeesContext> ctx) :
catch_all([&]() {
state_load();
});

m_writeback_thread.exec([&]() {
writeback_thread();
});
@@ -552,6 +582,29 @@ BeesRoots::BeesRoots(shared_ptr<BeesContext> ctx) :
});
}

void
BeesRoots::stop()
{
BEESLOGDEBUG("BeesRoots stop requested");
BEESNOTE("stopping BeesRoots");
unique_lock<mutex> lock(m_stop_mutex);
m_stop_requested = true;
m_stop_condvar.notify_all();
lock.unlock();

// Stop crawl writeback first because we will break progress
// state tracking when we cancel the TaskMaster queue
BEESLOGDEBUG("Waiting for crawl writeback");
BEESNOTE("waiting for crawl_writeback thread");
m_writeback_thread.join();

BEESLOGDEBUG("Waiting for crawl thread");
BEESNOTE("waiting for crawl_thread thread");
m_crawl_thread.join();

BEESLOGDEBUG("BeesRoots stopped");
}
Fd
BeesRoots::open_root_nocache(uint64_t rootid)
{
@@ -581,8 +634,8 @@ BeesRoots::open_root_nocache(uint64_t rootid)
for (auto i : sk.m_result) {
sk.next_min(i);
if (i.type == BTRFS_ROOT_BACKREF_KEY && i.objectid == rootid) {
auto dirid = call_btrfs_get(btrfs_stack_root_ref_dirid, i.m_data);
auto name_len = call_btrfs_get(btrfs_stack_root_ref_name_len, i.m_data);
auto dirid = btrfs_get_member(&btrfs_root_ref::dirid, i.m_data);
auto name_len = btrfs_get_member(&btrfs_root_ref::name_len, i.m_data);
auto name_start = sizeof(struct btrfs_root_ref);
auto name_end = name_len + name_start;
THROW_CHECK2(runtime_error, i.m_data.size(), name_end, i.m_data.size() >= name_end);
@@ -640,6 +693,7 @@ BeesRoots::open_root_nocache(uint64_t rootid)
Stat st(rv);
THROW_CHECK1(runtime_error, st.st_ino, st.st_ino == BTRFS_FIRST_FREE_OBJECTID);
// BEESLOGDEBUG("open_root_nocache " << rootid << ": " << name_fd(rv));

BEESCOUNT(root_ok);
return rv;
}
@@ -658,9 +712,36 @@ BeesRoots::open_root(uint64_t rootid)
return Fd();
}

return m_ctx->fd_cache()->open_root(m_ctx, rootid);
return m_ctx->fd_cache()->open_root(rootid);
}

bool
BeesRoots::is_root_ro_nocache(uint64_t root)
{
BEESTRACE("checking subvol flags on root " << root);
Fd root_fd = open_root(root);
BEESTRACE("checking subvol flags on root " << root << " path " << name_fd(root_fd));

uint64_t flags = 0;
DIE_IF_NON_ZERO(ioctl(root_fd, BTRFS_IOC_SUBVOL_GETFLAGS, &flags));
if (flags & BTRFS_SUBVOL_RDONLY) {
BEESLOGDEBUG("WORKAROUND: Avoiding RO root " << root);
BEESCOUNT(root_workaround_btrfs_send);
return true;
}
return false;
}

bool
BeesRoots::is_root_ro(uint64_t root)
{
// If we are not implementing the workaround there is no need for cache
if (!m_workaround_btrfs_send) {
return false;
}

return m_root_ro_cache(root);
}

uint64_t
BeesRoots::next_root(uint64_t root)
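
The new is_root_ro_nocache() above is a thin wrapper around BTRFS_IOC_SUBVOL_GETFLAGS. The same check works in a standalone program (a sketch; the subvolume path is a placeholder):

	#include <cstdint>
	#include <cstdio>
	#include <fcntl.h>
	#include <linux/btrfs.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	int main()
	{
		// Hypothetical snapshot path, for illustration only.
		int fd = open("/mnt/fs/snapshot", O_RDONLY | O_DIRECTORY);
		if (fd < 0) { perror("open"); return 1; }

		std::uint64_t flags = 0;
		if (ioctl(fd, BTRFS_IOC_SUBVOL_GETFLAGS, &flags)) {
			perror("BTRFS_IOC_SUBVOL_GETFLAGS");
			close(fd);
			return 1;
		}
		// BTRFS_SUBVOL_RDONLY is set on read-only snapshots,
		// e.g. those created for btrfs send.
		std::printf("read-only: %s\n", (flags & BTRFS_SUBVOL_RDONLY) ? "yes" : "no");
		close(fd);
		return 0;
	}
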
@@ -702,6 +783,16 @@ BeesRoots::open_root_ino_nocache(uint64_t root, uint64_t ino)
{
BEESTRACE("opening root " << root << " ino " << ino);

// Check the tmpfiles map first
{
unique_lock<mutex> lock(m_tmpfiles_mutex);
auto found = m_tmpfiles.find(BeesFileId(root, ino));
if (found != m_tmpfiles.end()) {
BEESCOUNT(open_tmpfile);
return found->second;
}
}

Fd root_fd = open_root(root);
if (!root_fd) {
BEESCOUNT(open_no_root);
@@ -733,7 +824,7 @@ BeesRoots::open_root_ino_nocache(uint64_t root, uint64_t ino)
for (auto file_path : ipa.m_paths) {
BEESTRACE("Looking up root " << root << " ino " << ino << " in dir " << name_fd(root_fd) << " path " << file_path);
BEESCOUNT(open_file);
// Just open file RO. root can do the dedup ioctl without
// Just open file RO. root can do the dedupe ioctl without
// opening in write mode, and if we do open in write mode,
// we can't exec the file while we have it open.
const char *fp_cstr = file_path.c_str();
@@ -777,19 +868,19 @@ BeesRoots::open_root_ino_nocache(uint64_t root, uint64_t ino)
break;
}

// The kernel rejects dedup requests with
// The kernel rejects dedupe requests with
// src and dst that have different datasum flags
// (datasum is a flag in the inode).
//
// We can detect the common case where a file is
// marked with nodatacow (which implies nodatasum).
// nodatacow files are arguably out of scope for dedup,
// since dedup would just make them datacow again.
// nodatacow files are arguably out of scope for dedupe,
// since dedupe would just make them datacow again.
// To handle these we pretend we couldn't open them.
//
// A less common case is nodatasum + datacow files.
// Those are available for dedup but we have to solve
// some other problems before we can dedup them. They
// Those are available for dedupe but we have to solve
// some other problems before we can dedupe them. They
// require a separate hash table namespace from datasum
// + datacow files, and we have to create nodatasum
// temporary files when we rewrite extents.
@@ -819,7 +910,7 @@ BeesRoots::open_root_ino_nocache(uint64_t root, uint64_t ino)
Fd
BeesRoots::open_root_ino(uint64_t root, uint64_t ino)
{
return m_ctx->fd_cache()->open_root_ino(m_ctx, root, ino);
return m_ctx->fd_cache()->open_root_ino(root, ino);
}

RateEstimator &
@@ -828,6 +919,25 @@ BeesRoots::transid_re()
return m_transid_re;
}

void
BeesRoots::insert_tmpfile(Fd fd)
{
BeesFileId fid(fd);
unique_lock<mutex> lock(m_tmpfiles_mutex);
auto rv = m_tmpfiles.insert(make_pair(fid, fd));
THROW_CHECK1(runtime_error, fd, rv.second);
}

void
BeesRoots::erase_tmpfile(Fd fd)
{
BeesFileId fid(fd);
unique_lock<mutex> lock(m_tmpfiles_mutex);
auto found = m_tmpfiles.find(fid);
THROW_CHECK1(runtime_error, fd, found != m_tmpfiles.end());
m_tmpfiles.erase(found);
}

BeesCrawl::BeesCrawl(shared_ptr<BeesContext> ctx, BeesCrawlState initial_state) :
m_ctx(ctx),
m_state(initial_state)
@@ -883,27 +993,38 @@ BeesCrawl::fetch_extents()
return next_transid();
}

// Check for btrfs send workaround: don't scan RO roots at all, pretend
// they are just empty. We can't free any space there, and we
// don't have the necessary analysis logic to be able to use
// them as dedupe src extents (yet).
//
// This will keep the max_transid up to date so if the root
// is ever switched back to read-write, it won't trigger big
// expensive in-kernel searches for ancient transids.
if (m_ctx->is_root_ro(old_state.m_root)) {
BEESLOGDEBUG("WORKAROUND: skipping scan of RO root " << old_state.m_root);
BEESCOUNT(root_workaround_btrfs_send);
return next_transid();
}

BEESNOTE("crawling " << get_state_end());

Timer crawl_timer;

BtrfsIoctlSearchKey sk(BEES_MAX_CRAWL_SIZE * (sizeof(btrfs_file_extent_item) + sizeof(btrfs_ioctl_search_header)));
BtrfsIoctlSearchKey sk(BEES_MAX_CRAWL_BYTES);
sk.tree_id = old_state.m_root;
sk.min_objectid = old_state.m_objectid;
sk.min_type = sk.max_type = BTRFS_EXTENT_DATA_KEY;
sk.min_offset = old_state.m_offset;
sk.min_transid = old_state.m_min_transid;
// Don't set max_transid here. We want to see old extents with
// new references, and max_transid filtering in the kernel locks
// the filesystem while slowing us down.
// sk.max_transid = old_state.m_max_transid;
// Don't set max_transid to m_max_transid here. See below.
sk.max_transid = numeric_limits<uint64_t>::max();
sk.nr_items = BEES_MAX_CRAWL_SIZE;
sk.nr_items = BEES_MAX_CRAWL_ITEMS;

// Lock in the old state
set_state(old_state);

BEESTRACE("Searching crawl sk " << static_cast<btrfs_ioctl_search_key&>(sk));
BEESTRACE("Searching crawl sk " << sk);
bool ioctl_ok = false;
{
BEESNOTE("searching crawl sk " << static_cast<btrfs_ioctl_search_key&>(sk));
@@ -916,7 +1037,7 @@ BeesCrawl::fetch_extents()
if (ioctl_ok) {
BEESCOUNT(crawl_search);
} else {
BEESLOGWARN("Search ioctl failed: " << strerror(errno));
BEESLOGWARN("Search ioctl(" << sk << ") failed: " << strerror(errno));
BEESCOUNT(crawl_fail);
}

@@ -962,11 +1083,10 @@ BeesCrawl::fetch_extents()
continue;
}

auto gen = call_btrfs_get(btrfs_stack_file_extent_generation, i.m_data);
auto gen = btrfs_get_member(&btrfs_file_extent_item::generation, i.m_data);
if (gen < get_state_end().m_min_transid) {
BEESCOUNT(crawl_gen_low);
++count_low;
// We want (need?) to scan these anyway?
// The header generation refers to the transid
// of the metadata page holding the current ref.
// This includes anything else in that page that
@@ -974,21 +1094,26 @@ BeesCrawl::fetch_extents()
// old it is.
// The file_extent_generation refers to the
// transid of the extent item's page, which is
// a different approximation of what we want.
// Combine both of these filters to minimize
// the number of times we unnecessarily re-read
// an extent.
// what we really want when we are slicing up
// the extent data by transid.
continue;
}
if (gen > get_state_end().m_max_transid) {
BEESCOUNT(crawl_gen_high);
++count_high;
// We have to filter these here because we can't
// do it in the kernel.
// We want to see old extents with references in
// new pages, which means we have to get extent
// refs from every page older than min_transid,
// not every page between min_transid and
// max_transid. This means that we will get
// refs to new extent data that we don't want to
// process yet, because we'll process it again
// on the next crawl cycle. We filter out refs
// to new extents here.
continue;
}

auto type = call_btrfs_get(btrfs_stack_file_extent_type, i.m_data);
auto type = btrfs_get_member(&btrfs_file_extent_item::type, i.m_data);
switch (type) {
default:
BEESLOGDEBUG("Unhandled file extent type " << type << " in root " << get_state_end().m_root << " ino " << i.objectid << " offset " << to_hex(i.offset));
@@ -1006,10 +1131,10 @@ BeesCrawl::fetch_extents()
BEESCOUNT(crawl_prealloc);
// fallthrough
case BTRFS_FILE_EXTENT_REG: {
auto physical = call_btrfs_get(btrfs_stack_file_extent_disk_bytenr, i.m_data);
auto ram = call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data);
auto len = call_btrfs_get(btrfs_stack_file_extent_num_bytes, i.m_data);
auto offset = call_btrfs_get(btrfs_stack_file_extent_offset, i.m_data);
auto physical = btrfs_get_member(&btrfs_file_extent_item::disk_bytenr, i.m_data);
auto ram = btrfs_get_member(&btrfs_file_extent_item::ram_bytes, i.m_data);
auto len = btrfs_get_member(&btrfs_file_extent_item::num_bytes, i.m_data);
auto offset = btrfs_get_member(&btrfs_file_extent_item::offset, i.m_data);
BEESTRACE("Root " << get_state_end().m_root << " ino " << i.objectid << " physical " << to_hex(physical)
<< " logical " << to_hex(i.offset) << ".." << to_hex(i.offset + len)
<< " gen " << gen);
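
This diff repeatedly swaps call_btrfs_get(btrfs_stack_*, ...) for btrfs_get_member(&struct::member, ...), i.e. from per-field accessor functions to a single helper driven by a pointer-to-member. One plausible shape for such a helper, sketched from scratch (crucible's real implementation may differ, and it would also byte-swap on big-endian hosts):

	#include <cstdint>
	#include <cstring>
	#include <stdexcept>
	#include <vector>

	// Sketch: extract member M of on-disk struct S from a raw item buffer.
	template <class S, class M>
	M btrfs_get_member_sketch(M S::*member, const std::vector<uint8_t> &data)
	{
		// Compute the member's byte offset from a dummy object;
		// this avoids dereferencing anything inside the raw buffer.
		S tmp{};
		const size_t off = reinterpret_cast<const char *>(&(tmp.*member))
		                 - reinterpret_cast<const char *>(&tmp);
		if (off + sizeof(M) > data.size()) {
			throw std::runtime_error("item data too short for member");
		}
		M rv;
		std::memcpy(&rv, data.data() + off, sizeof(M));  // unaligned-safe copy
		return rv;
	}

	// Hypothetical usage, mirroring the diff:
	// auto gen = btrfs_get_member_sketch(&btrfs_file_extent_item::generation, i.m_data);

The win over per-field accessors is that the compiler checks the struct/member pair, so a typo in a field name fails to compile instead of reading the wrong offset.
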
@@ -70,11 +70,6 @@ BeesThread::~BeesThread()

BEESLOGDEBUG("BeesThread destructor " << m_name);
if (m_thread_ptr->joinable()) {
BEESLOGDEBUG("Cancelling thread " << m_name);
int rv = pthread_cancel(m_thread_ptr->native_handle());
if (rv) {
BEESLOGDEBUG("pthread_cancel returned " << strerror(-rv));
}
BEESLOGDEBUG("Waiting for thread " << m_name);
Timer thread_time;
m_thread_ptr->join();

170
src/bees-trace.cc
Normal file
@@ -0,0 +1,170 @@
#include "bees.h"

// tracing ----------------------------------------

int bees_log_level = 8;

thread_local BeesTracer *BeesTracer::tl_next_tracer = nullptr;
thread_local bool BeesTracer::tl_first = true;
thread_local bool BeesTracer::tl_silent = false;

#if __cplusplus >= 201703
static
bool
exception_check()
{
return uncaught_exceptions();
}
#else
static
bool
exception_check()
{
return uncaught_exception();
}
#endif

BeesTracer::~BeesTracer()
{
if (!tl_silent && exception_check()) {
if (tl_first) {
BEESLOGNOTICE("--- BEGIN TRACE --- exception ---");
tl_first = false;
}
try {
m_func();
} catch (exception &e) {
BEESLOGNOTICE("Nested exception: " << e.what());
} catch (...) {
BEESLOGNOTICE("Nested exception ...");
}
if (!m_next_tracer) {
BEESLOGNOTICE("--- END TRACE --- exception ---");
}
}
tl_next_tracer = m_next_tracer;
if (!m_next_tracer) {
tl_silent = false;
tl_first = true;
}
}

BeesTracer::BeesTracer(function<void()> f, bool silent) :
m_func(f)
{
m_next_tracer = tl_next_tracer;
tl_next_tracer = this;
tl_silent = silent;
}

void
BeesTracer::trace_now()
{
BeesTracer *tp = tl_next_tracer;
BEESLOGNOTICE("--- BEGIN TRACE ---");
while (tp) {
tp->m_func();
tp = tp->m_next_tracer;
}
BEESLOGNOTICE("--- END TRACE ---");
}

bool
BeesTracer::get_silent()
{
return tl_silent;
}

void
BeesTracer::set_silent()
{
tl_silent = true;
}
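
BeesTracer is an RAII breadcrumb: each constructor pushes a closure onto a thread-local chain, and the destructor replays its closure only while an exception is unwinding through that scope. A stripped-down sketch of the same idea (not the bees classes; logging reduced to cerr, requires -std=c++17 for std::uncaught_exceptions):

	#include <exception>
	#include <functional>
	#include <iostream>
	#include <stdexcept>

	struct ScopeTracer {
		std::function<void()> func;
		ScopeTracer *next;
		static thread_local ScopeTracer *head;

		ScopeTracer(std::function<void()> f) : func(f), next(head) { head = this; }
		~ScopeTracer() {
			// Only print while an exception is propagating through this scope.
			if (std::uncaught_exceptions()) func();
			head = next;   // pop this frame off the thread-local chain
		}
	};
	thread_local ScopeTracer *ScopeTracer::head = nullptr;

	void inner() {
		ScopeTracer t([]{ std::cerr << "trace: in inner()\n"; });
		throw std::runtime_error("boom");
	}

	int main() {
		try {
			ScopeTracer t([]{ std::cerr << "trace: in main()\n"; });
			inner();
		} catch (const std::exception &e) {
			std::cerr << "caught: " << e.what() << "\n";
		}
	}

The new tl_silent flag in the real code lets replace_dst() throw its "bailing out" error without spamming a full trace for an expected condition.
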
thread_local BeesNote *BeesNote::tl_next = nullptr;
mutex BeesNote::s_mutex;
map<pid_t, BeesNote*> BeesNote::s_status;
thread_local string BeesNote::tl_name;

BeesNote::~BeesNote()
{
tl_next = m_prev;
unique_lock<mutex> lock(s_mutex);
if (tl_next) {
s_status[crucible::gettid()] = tl_next;
} else {
s_status.erase(crucible::gettid());
}
}

BeesNote::BeesNote(function<void(ostream &os)> f) :
m_func(f)
{
m_name = get_name();
m_prev = tl_next;
tl_next = this;
unique_lock<mutex> lock(s_mutex);
s_status[crucible::gettid()] = tl_next;
}

void
BeesNote::set_name(const string &name)
{
tl_name = name;
catch_all([&]() {
DIE_IF_MINUS_ERRNO(pthread_setname_np(pthread_self(), name.c_str()));
});
}

string
BeesNote::get_name()
{
// Use explicit name if given
if (!tl_name.empty()) {
return tl_name;
}

// Try a Task name. If there is one, return it, but do not
// remember it. Each output message may be a different Task.
// The current task is thread_local so we don't need to worry
// about it being destroyed under us.
auto current_task = Task::current_task();
if (current_task) {
return current_task.title();
}

// OK try the pthread name next.
char buf[24];
memset(buf, '\0', sizeof(buf));
int err = pthread_getname_np(pthread_self(), buf, sizeof(buf));
if (err) {
return string("pthread_getname_np: ") + strerror(err);
}
buf[sizeof(buf) - 1] = '\0';

// thread_getname_np returns process name
// ...by default? ...for the main thread?
// ...except during exception handling?
// ...randomly?
return buf;
}

BeesNote::ThreadStatusMap
BeesNote::get_status()
{
unique_lock<mutex> lock(s_mutex);
ThreadStatusMap rv;
for (auto t : s_status) {
ostringstream oss;
if (!t.second->m_name.empty()) {
oss << t.second->m_name << ": ";
}
if (t.second->m_timer.age() > BEES_TOO_LONG) {
oss << "[" << t.second->m_timer << "s] ";
}
t.second->m_func(oss);
rv[t.first] = oss.str();
}
return rv;
}
@@ -1,6 +1,5 @@
#include "bees.h"

#include "crucible/crc64.h"
#include "crucible/limits.h"
#include "crucible/ntoa.h"
#include "crucible/string.h"
@@ -386,8 +385,8 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
BEESTRACE("e_second " << e_second);

// Preread entire extent
readahead(second.fd(), e_second.begin(), e_second.size());
readahead(first.fd(), e_second.begin() + first.begin() - second.begin(), e_second.size());
bees_readahead(second.fd(), e_second.begin(), e_second.size());
bees_readahead(first.fd(), e_second.begin() + first.begin() - second.begin(), e_second.size());

auto hash_table = ctx->hash_table();

@@ -406,7 +405,7 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
BEESCOUNT(pairbackward_hole);
break;
}
readahead(second.fd(), e_second.begin(), e_second.size());
bees_readahead(second.fd(), e_second.begin(), e_second.size());
#else
// This tends to repeatedly process extents that were recently processed.
// We tend to catch duplicate blocks early since we scan them forwards.
@@ -515,7 +514,7 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
BEESCOUNT(pairforward_hole);
break;
}
readahead(second.fd(), e_second.begin(), e_second.size());
bees_readahead(second.fd(), e_second.begin(), e_second.size());
}
BEESCOUNT(pairforward_try);

@@ -961,14 +960,10 @@ BeesHash
BeesBlockData::hash() const
{
if (!m_hash_done) {
// We can only dedup unaligned EOF blocks against other unaligned EOF blocks,
// We can only dedupe unaligned EOF blocks against other unaligned EOF blocks,
// so we do NOT round up to a full sum block size.
const Blob &blob = data();
// TODO: It turns out that file formats with 4K block
// alignment and embedded CRC64 do exist, and every block
// of such files has the same hash. Could use a subset
// of SHA1 here instead.
m_hash = Digest::CRC::crc64(blob.data(), blob.size());
m_hash = BeesHash(blob.data(), blob.size());
m_hash_done = true;
BEESCOUNT(block_hash);
}
@@ -980,9 +975,8 @@ bool
BeesBlockData::is_data_zero() const
{
// The CRC64 of zero is zero, so skip some work if we already know the CRC
if (m_hash_done && m_hash != 0) {
return false;
}
// ...but that doesn't work for any other hash function, and it
// saves us next to nothing.

// OK read block (maybe) and check every byte
for (auto c : data()) {
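
The deleted shortcut in is_data_zero() relied on a CRC64-specific property: a CRC with zero initial value and no final xor maps an all-zero buffer to zero, so a known nonzero hash proved the block held data. With the move to the opaque BeesHash that guarantee is gone. The retired idiom in isolation (a sketch with a stand-in hash, not bees' BeesHash):

	#include <algorithm>
	#include <cstdint>
	#include <vector>

	// Only valid for hashes where hash(all-zero block) == 0, as CRC64 was here.
	static bool is_data_zero(const std::vector<uint8_t> &block,
	                         bool hash_done, uint64_t hash)
	{
		// Retired shortcut: a known nonzero hash cannot be an all-zero
		// block, so no read is required.
		if (hash_done && hash != 0) {
			return false;
		}
		// General path: scan every byte, which is what bees now does
		// unconditionally.
		return std::all_of(block.begin(), block.end(),
		                   [](uint8_t c) { return c == 0; });
	}
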
35
src/bees-usage.txt
Normal file
@@ -0,0 +1,35 @@
Usage: %s [options] fs-root-path
Performs best-effort extent-same deduplication on btrfs.

fs-root-path MUST be the root of a btrfs filesystem tree (subvol id 5).
Other directories will be rejected.

Options:
-h, --help Show this help

Load management options:
-c, --thread-count Worker thread count (default CPU count * factor)
-C, --thread-factor Worker thread factor (default 1)
-G, --thread-min Minimum worker thread count (default 0)
-g, --loadavg-target Target load average for worker threads (default none)

Filesystem tree traversal options:
-m, --scan-mode Scanning mode (0..2, default 0)

Workarounds:
-a, --workaround-btrfs-send Workaround for btrfs send
(ignore RO snapshots)

Logging options:
-t, --timestamps Show timestamps in log output (default)
-T, --no-timestamps Omit timestamps in log output
-p, --absolute-paths Show absolute paths (default)
-P, --strip-paths Strip $CWD from beginning of all paths in the log
-v, --verbose Set maximum log level (0..8, default 8)

Optional environment variables:
BEESHOME Path to hash table and configuration files
(default is .beeshome/ in the root of the filesystem).

BEESSTATUS File to write status to (tmpfs recommended, e.g. /run).
No status is written if this variable is unset.

516
src/bees.cc
@@ -7,6 +7,7 @@

#include <cctype>
#include <cmath>
#include <cstdio>

#include <iostream>
#include <memory>
@@ -30,184 +31,14 @@
using namespace crucible;
using namespace std;

int bees_log_level = 8;

int
void
do_cmd_help(char *argv[])
{
cerr << "Usage: " << argv[0] << " [options] fs-root-path [fs-root-path-2...]\n"
"Performs best-effort extent-same deduplication on btrfs.\n"
"\n"
"fs-root-path MUST be the root of a btrfs filesystem tree (id 5).\n"
"Other directories will be rejected.\n"
"\n"
"Options:\n"
"\t-h, --help\t\tShow this help\n"
"\t-c, --thread-count\tWorker thread count (default CPU count * factor)\n"
"\t-C, --thread-factor\tWorker thread factor (default " << BEES_DEFAULT_THREAD_FACTOR << ")\n"
"\t-G, --thread-min\t\tMinimum worker thread count with load average target (default 0)\n"
"\t-g, --loadavg-target\t\tTarget load average for worker threads (default is no target)\n"
"\t-m, --scan-mode\t\tScanning mode (0..2, default 0)\n"
"\t-t, --timestamps\tShow timestamps in log output (default)\n"
"\t-T, --no-timestamps\tOmit timestamps in log output\n"
"\t-p, --absolute-paths\tShow absolute paths (default)\n"
"\t-P, --strip-paths\tStrip $CWD from beginning of all paths in the log\n"
"\t-v, --verbose\tSet maximum log level (0..8, default 8)\n"
"\n"
"Optional environment variables:\n"
"\tBEESHOME\tPath to hash table and configuration files\n"
"\t\t\t(default is .beeshome/ in the root of each filesystem).\n"
"\n"
"\tBEESSTATUS\tFile to write status to (tmpfs recommended, e.g. /run).\n"
"\t\t\tNo status is written if this variable is unset.\n"
"\n"
<< endl;
return 0;
}

// tracing ----------------------------------------

thread_local BeesTracer *BeesTracer::tl_next_tracer = nullptr;

BeesTracer::~BeesTracer()
{
if (uncaught_exception()) {
try {
m_func();
} catch (exception &e) {
BEESLOGERR("Nested exception: " << e.what());
} catch (...) {
BEESLOGERR("Nested exception ...");
}
if (!m_next_tracer) {
BEESLOGERR("--- END TRACE --- exception ---");
}
}
tl_next_tracer = m_next_tracer;
}

BeesTracer::BeesTracer(function<void()> f) :
m_func(f)
{
m_next_tracer = tl_next_tracer;
tl_next_tracer = this;
}

void
BeesTracer::trace_now()
{
BeesTracer *tp = tl_next_tracer;
BEESLOGERR("--- BEGIN TRACE ---");
while (tp) {
tp->m_func();
tp = tp->m_next_tracer;
}
BEESLOGERR("--- END TRACE ---");
}

thread_local BeesNote *BeesNote::tl_next = nullptr;
mutex BeesNote::s_mutex;
map<pid_t, BeesNote*> BeesNote::s_status;
thread_local string BeesNote::tl_name;

BeesNote::~BeesNote()
{
tl_next = m_prev;
unique_lock<mutex> lock(s_mutex);
if (tl_next) {
s_status[gettid()] = tl_next;
} else {
s_status.erase(gettid());
}
}

BeesNote::BeesNote(function<void(ostream &os)> f) :
m_func(f)
{
m_name = get_name();
m_prev = tl_next;
tl_next = this;
unique_lock<mutex> lock(s_mutex);
s_status[gettid()] = tl_next;
}

void
BeesNote::set_name(const string &name)
{
tl_name = name;
catch_all([&]() {
DIE_IF_MINUS_ERRNO(pthread_setname_np(pthread_self(), name.c_str()));
});
}

string
BeesNote::get_name()
{
// Use explicit name if given
if (!tl_name.empty()) {
return tl_name;
}

// Try a Task name. If there is one, return it, but do not
// remember it. Each output message may be a different Task.
// The current task is thread_local so we don't need to worry
// about it being destroyed under us.
auto current_task = Task::current_task();
if (current_task) {
return current_task.title();
}

// OK try the pthread name next.
char buf[24];
memset(buf, '\0', sizeof(buf));
int err = pthread_getname_np(pthread_self(), buf, sizeof(buf));
if (err) {
return string("pthread_getname_np: ") + strerror(err);
}
buf[sizeof(buf) - 1] = '\0';

// thread_getname_np returns process name
// ...by default? ...for the main thread?
// ...except during exception handling?
// ...randomly?
return buf;
}

BeesNote::ThreadStatusMap
BeesNote::get_status()
{
unique_lock<mutex> lock(s_mutex);
ThreadStatusMap rv;
for (auto t : s_status) {
ostringstream oss;
if (!t.second->m_name.empty()) {
oss << t.second->m_name << ": ";
}
if (t.second->m_timer.age() > BEES_TOO_LONG) {
oss << "[" << t.second->m_timer << "s] ";
}
t.second->m_func(oss);
rv[t.first] = oss.str();
}
return rv;
fprintf(stderr, BEES_USAGE, argv[0]);
}

// static inline helpers ----------------------------------------

static inline
bool
bees_addr_check(uint64_t v)
{
return !(v & (1ULL << 63));
}

static inline
bool
bees_addr_check(int64_t v)
{
return !(v & (1ULL << 63));
}

string
pretty(double d)
{
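
The new do_cmd_help() is one line: fprintf(stderr, BEES_USAGE, argv[0]). The text itself now lives in src/bees-usage.txt (shown above), whose "Usage: %s" line supplies the printf slot for the program name. How BEES_USAGE gets defined is not visible in this diff; a minimal stand-in, assuming it is a printf-style format string generated from that file at build time:

	#include <cstdio>

	// Stand-in for the generated constant; the real string would come
	// from src/bees-usage.txt. Only the first two lines are shown here.
	static const char *BEES_USAGE =
		"Usage: %s [options] fs-root-path\n"
		"Performs best-effort extent-same deduplication on btrfs.\n";

	static void do_cmd_help(char *argv[])
	{
		fprintf(stderr, BEES_USAGE, argv[0]);
	}

	int main(int argc, char *argv[])
	{
		(void)argc;
		do_cmd_help(argv);
		return 1;
	}

Keeping the usage text in one data file means the help output and the documentation cannot drift apart the way the old hand-maintained cerr block did.
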
@@ -276,9 +107,10 @@ BeesStatTmpl<T>::add_count(string idx, size_t amount)
{
unique_lock<mutex> lock(m_mutex);
if (!m_stats_map.count(idx)) {
m_stats_map[idx] = 0;
m_stats_map[idx] = amount;
} else {
m_stats_map[idx] += amount;
}
m_stats_map.at(idx) += amount;
}

template <class T>
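
The old add_count body did a count(), an insert, and then an at() lookup; the new one branches once and touches the map once per path. Both could arguably collapse further, because std::map::operator[] value-initializes a missing entry to zero before += runs. A minimal demonstration (standalone, not the bees template):

	#include <cassert>
	#include <cstddef>
	#include <map>
	#include <string>

	int main()
	{
		std::map<std::string, std::size_t> stats;
		// operator[] creates a zero-valued entry on first use, so one
		// line covers both the "new key" and "existing key" cases.
		stats["block_hash"] += 5;
		stats["block_hash"] += 3;
		assert(stats["block_hash"] == 8);
		return 0;
	}
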
@@ -393,6 +225,43 @@ bees_sync(int fd)
BEESCOUNTADD(sync_ms, sync_timer.age() * 1000);
}

void
bees_readahead(int const fd, off_t offset, size_t size)
{
Timer readahead_timer;
BEESNOTE("readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
BEESTOOLONG("readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
// In the kernel, readahead() is identical to posix_fadvise(..., POSIX_FADV_DONTNEED)
DIE_IF_NON_ZERO(readahead(fd, offset, size));
#if 0
// Make sure this data is in page cache by brute force
// This isn't necessary and it might even be slower
BEESNOTE("emulating readahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
while (size) {
static uint8_t dummy[BEES_READAHEAD_SIZE];
size_t this_read_size = min(size, sizeof(dummy));
// Ignore errors and short reads.
// It turns out our size parameter isn't all that accurate.
(void)!pread(fd, dummy, this_read_size, offset);
BEESCOUNT(readahead_count);
BEESCOUNTADD(readahead_bytes, this_read_size);
offset += this_read_size;
size -= this_read_size;
}
#endif
BEESCOUNTADD(readahead_ms, readahead_timer.age() * 1000);
}

void
bees_unreadahead(int const fd, off_t offset, size_t size)
{
Timer unreadahead_timer;
BEESNOTE("unreadahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
BEESTOOLONG("unreadahead " << name_fd(fd) << " offset " << to_hex(offset) << " len " << pretty(size));
DIE_IF_NON_ZERO(posix_fadvise(fd, offset, size, POSIX_FADV_DONTNEED));
BEESCOUNTADD(readahead_unread_ms, unreadahead_timer.age() * 1000);
}

BeesStringFile::BeesStringFile(Fd dir_fd, string name, size_t limit) :
m_dir_fd(dir_fd),
m_name(name),
@@ -448,7 +317,7 @@ BeesStringFile::write(string contents)
write_or_die(ofd, contents);
#if 0
// This triggers too many btrfs bugs. I wish I was kidding.
// Forget snapshots, balance, compression, and dedup:
// Forget snapshots, balance, compression, and dedupe:
// the system call you have to fear on btrfs is fsync().
// Also note that when bees renames a temporary over an
// existing file, it flushes the temporary, so we get
@@ -465,39 +334,6 @@ BeesStringFile::write(string contents)
renameat_or_die(m_dir_fd, tmpname, m_dir_fd, m_name);
}

void
BeesTempFile::create()
{
// BEESLOG("creating temporary file in " << m_ctx->root_path());
BEESNOTE("creating temporary file in " << m_ctx->root_path());
BEESTOOLONG("creating temporary file in " << m_ctx->root_path());

Timer create_timer;
DIE_IF_MINUS_ONE(m_fd = openat(m_ctx->root_fd(), ".", FLAGS_OPEN_TMPFILE, S_IRUSR | S_IWUSR));
BEESCOUNT(tmp_create);

// Can't reopen this file, so don't allow any resolves there
// Resolves won't work there anyway. There are lots of tempfiles
// and they're short-lived, so this ends up being just a memory leak
// m_ctx->blacklist_add(BeesFileId(m_fd));

// Put this inode in the cache so we can resolve it later
m_ctx->insert_root_ino(m_fd);

// Set compression attribute
BEESTRACE("Getting FS_COMPR_FL on m_fd " << name_fd(m_fd));
int flags = ioctl_iflags_get(m_fd);
flags |= FS_COMPR_FL;
BEESTRACE("Setting FS_COMPR_FL on m_fd " << name_fd(m_fd) << " flags " << to_hex(flags));
ioctl_iflags_set(m_fd, flags);

// Always leave first block empty to avoid creating a file with an inline extent
m_end_offset = BLOCK_SIZE_CLONE;

// Count time spent here
BEESCOUNTADD(tmp_create_ms, create_timer.age() * 1000);
}

void
BeesTempFile::resize(off_t offset)
{
@@ -505,9 +341,6 @@ BeesTempFile::resize(off_t offset)
BEESNOTE("Resizing temporary file " << name_fd(m_fd) << " to " << to_hex(offset));
BEESTRACE("Resizing temporary file " << name_fd(m_fd) << " to " << to_hex(offset));

// Ensure that file covers m_end_offset..offset
THROW_CHECK2(invalid_argument, m_end_offset, offset, m_end_offset < offset);

// Truncate
Timer resize_timer;
DIE_IF_NON_ZERO(ftruncate(m_fd, offset));
@@ -520,25 +353,66 @@ BeesTempFile::resize(off_t offset)
BEESCOUNTADD(tmp_resize_ms, resize_timer.age() * 1000);
}

void
BeesTempFile::reset()
{
// Always leave first block empty to avoid creating a file with an inline extent
resize(BLOCK_SIZE_CLONE);
}


BeesTempFile::~BeesTempFile()
{
BEESLOGDEBUG("destroying temporary file " << this << " in " << m_ctx->root_path() << " fd " << name_fd(m_fd));

// Remove this file from open_root_ino lookup table
m_roots->erase_tmpfile(m_fd);

// Remove from blacklist
m_ctx->blacklist_erase(BeesFileId(m_fd));
}

BeesTempFile::BeesTempFile(shared_ptr<BeesContext> ctx) :
m_ctx(ctx),
m_roots(ctx->roots()),
m_end_offset(0)
{
create();
BEESLOGDEBUG("creating temporary file " << this << " in " << m_ctx->root_path());
BEESNOTE("creating temporary file in " << m_ctx->root_path());
BEESTOOLONG("creating temporary file in " << m_ctx->root_path());

Timer create_timer;
DIE_IF_MINUS_ONE(m_fd = openat(m_ctx->root_fd(), ".", FLAGS_OPEN_TMPFILE, S_IRUSR | S_IWUSR));
BEESCOUNT(tmp_create);

// Don't include this file in new extent scans
m_ctx->blacklist_insert(BeesFileId(m_fd));

// Add this file to open_root_ino lookup table
m_roots->insert_tmpfile(m_fd);

// Set compression attribute
BEESTRACE("Getting FS_COMPR_FL on m_fd " << name_fd(m_fd));
int flags = ioctl_iflags_get(m_fd);
flags |= FS_COMPR_FL;
BEESTRACE("Setting FS_COMPR_FL on m_fd " << name_fd(m_fd) << " flags " << to_hex(flags));
ioctl_iflags_set(m_fd, flags);

// Count time spent here
BEESCOUNTADD(tmp_create_ms, create_timer.age() * 1000);

// Set initial size
reset();
}

void
BeesTempFile::realign()
{
if (m_end_offset > BLOCK_SIZE_MAX_TEMP_FILE) {
BEESLOGINFO("temporary file size " << to_hex(m_end_offset) << " > max " << BLOCK_SIZE_MAX_TEMP_FILE);
BEESCOUNT(tmp_trunc);
return create();
}
if (m_end_offset & BLOCK_MASK_CLONE) {
// BEESTRACE("temporary file size " << to_hex(m_end_offset) << " not aligned");
BEESCOUNT(tmp_realign);
return create();
reset();
return;
}
// OK as is
BEESCOUNT(tmp_aligned);
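
The constructor above creates its scratch file with openat(root_fd, ".", FLAGS_OPEN_TMPFILE, ...), i.e. Linux's O_TMPFILE: an unnamed inode on the given directory's filesystem that vanishes when the last descriptor closes, which is why reset() can simply truncate and reuse it instead of recreating it. A minimal standalone sketch (the mount point is a placeholder; O_TMPFILE needs Linux 3.11+ and glibc's _GNU_SOURCE, which g++ defines by default):

	#include <cstdio>
	#include <fcntl.h>
	#include <sys/stat.h>
	#include <unistd.h>

	int main()
	{
		// Unnamed temporary file on the same filesystem as /mnt/fs.
		// No pathname is ever created, so nothing needs cleanup on crash.
		int fd = open("/mnt/fs", O_TMPFILE | O_RDWR, S_IRUSR | S_IWUSR);
		if (fd < 0) { perror("open(O_TMPFILE)"); return 1; }

		// Use it like any other file descriptor.
		if (ftruncate(fd, 4096)) { perror("ftruncate"); close(fd); return 1; }
		std::printf("temporary inode ready, fd=%d\n", fd);

		close(fd);   // the inode is freed here
		return 0;
	}
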
@@ -614,13 +488,12 @@ BeesTempFile::make_copy(const BeesFileRange &src)
}
BEESCOUNTADD(tmp_copy_ms, copy_timer.age() * 1000);

// We seem to get lockups without this!
if (did_block_write) {
#if 1
// Is this fixed by "Btrfs: fix deadlock between dedup on same file and starting writeback"?
// No.
// Is this fixed in kernel 4.14.34?
// No.
#if 0
// There were a lot of kernel bugs leading to lockups.
// Most of them are fixed now.
// Unnecessary sync makes us slow, but maybe it has some robustness utility.
// TODO: make this configurable.
bees_sync(m_fd);
#endif
}
@@ -629,12 +502,88 @@ BeesTempFile::make_copy(const BeesFileRange &src)
return rv;
}
static
ostream &
operator<<(ostream &os, const siginfo_t &si)
{
return os << "siginfo_t { "
<< "signo = " << si.si_signo << " (" << signal_ntoa(si.si_signo) << "), "
<< "errno = " << si.si_errno << ", "
<< "code = " << si.si_code << ", "
// << "trapno = " << si.si_trapno << ", "
<< "pid = " << si.si_pid << ", "
<< "uid = " << si.si_uid << ", "
<< "status = " << si.si_status << ", "
<< "utime = " << si.si_utime << ", "
<< "stime = " << si.si_stime << ", "
// << "value = " << si.si_value << ", "
<< "int = " << si.si_int << ", "
<< "ptr = " << si.si_ptr << ", "
<< "overrun = " << si.si_overrun << ", "
<< "timerid = " << si.si_timerid << ", "
<< "addr = " << si.si_addr << ", "
<< "band = " << si.si_band << ", "
<< "fd = " << si.si_fd << ", "
// << "addr_lsb = " << si.si_addr_lsb << ", "
// << "lower = " << si.si_lower << ", "
// << "upper = " << si.si_upper << ", "
// << "pkey = " << si.si_pkey << ", "
<< "call_addr = " << si.si_call_addr << ", "
<< "syscall = " << si.si_syscall << ", "
<< "arch = " << si.si_arch
<< " }";
}

static sigset_t new_sigset, old_sigset;

void
block_term_signal()
{
BEESLOGDEBUG("Masking signals");

DIE_IF_NON_ZERO(sigemptyset(&new_sigset));
DIE_IF_NON_ZERO(sigaddset(&new_sigset, SIGTERM));
DIE_IF_NON_ZERO(sigaddset(&new_sigset, SIGINT));
DIE_IF_NON_ZERO(sigprocmask(SIG_BLOCK, &new_sigset, &old_sigset));
}

void
wait_for_term_signal()
{
BEESNOTE("waiting for signals");
BEESLOGDEBUG("Waiting for signals...");
siginfo_t info;

// Ironically, sigwaitinfo can be interrupted by a signal.
while (true) {
const int rv = sigwaitinfo(&new_sigset, &info);
if (rv == -1) {
if (errno == EINTR) {
BEESLOGDEBUG("Restarting sigwaitinfo");
continue;
}
THROW_ERRNO("sigwaitinfo errno = " << errno);
} else {
BEESLOGNOTICE("Received signal " << rv << " info " << info);
// Unblock so we die immediately if signalled again
DIE_IF_NON_ZERO(sigprocmask(SIG_BLOCK, &old_sigset, &new_sigset));
break;
}
}
BEESLOGDEBUG("Signal catcher exiting");
}
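
The shutdown path depends on an ordering rule: SIGTERM/SIGINT must be blocked in the main thread before any worker threads are spawned, because new threads inherit the signal mask. The main thread then collects signals synchronously with sigwaitinfo(), so no async handler (with its reentrancy restrictions) is needed. The skeleton in isolation (compile with -pthread):

	#include <cerrno>
	#include <csignal>
	#include <cstdio>
	#include <thread>

	int main()
	{
		sigset_t set;
		sigemptyset(&set);
		sigaddset(&set, SIGTERM);
		sigaddset(&set, SIGINT);
		// Block BEFORE creating threads; children inherit this mask.
		pthread_sigmask(SIG_BLOCK, &set, nullptr);

		std::thread worker([]{ /* signals stay blocked here too */ });

		siginfo_t info;
		int sig;
		// Synchronous delivery; retry if sigwaitinfo is itself interrupted.
		while ((sig = sigwaitinfo(&set, &info)) == -1 && errno == EINTR) {
		}
		std::printf("got signal %d from pid %d\n", sig, (int)info.si_pid);

		worker.join();
		return 0;
	}
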
int
bees_main(int argc, char *argv[])
{
set_catch_explainer([&](string s) {
BEESLOGERR("\n\n*** EXCEPTION ***\n\t" << s << "\n***\n");
BEESCOUNT(exception_caught);
if (BeesTracer::get_silent()) {
BEESLOGDEBUG("exception (ignored): " << s);
BEESCOUNT(exception_caught_silent);
} else {
BEESLOGNOTICE("\n\n*** EXCEPTION ***\n\t" << s << "\n***\n");
BEESCOUNT(exception_caught);
}
});

// The thread name for the main function is also what the kernel
@@ -643,11 +592,16 @@ bees_main(int argc, char *argv[])
BeesNote::set_name("bees");
BEESNOTE("main");

list<shared_ptr<BeesContext>> all_contexts;
shared_ptr<BeesContext> bc;

THROW_CHECK1(invalid_argument, argc, argc >= 0);

// Have to block signals now before we create a bunch of threads
// so the threads will also have the signals blocked.
block_term_signal();

// Create a context so we can apply configuration to it
shared_ptr<BeesContext> bc = make_shared<BeesContext>();
BEESLOGDEBUG("context constructed");

string cwd(readlink_or_die("/proc/self/cwd"));

// Defaults
@@ -656,30 +610,54 @@ bees_main(int argc, char *argv[])
unsigned thread_count = 0;
unsigned thread_min = 0;
double load_target = 0;
bool workaround_btrfs_send = false;
BeesRoots::ScanMode root_scan_mode = BeesRoots::SCAN_MODE_ZERO;

// Configure getopt_long
static const struct option long_options[] = {
{ "thread-factor", required_argument, NULL, 'C' },
{ "thread-min", required_argument, NULL, 'G' },
{ "strip-paths", no_argument, NULL, 'P' },
{ "no-timestamps", no_argument, NULL, 'T' },
{ "workaround-btrfs-send", no_argument, NULL, 'a' },
{ "thread-count", required_argument, NULL, 'c' },
{ "loadavg-target", required_argument, NULL, 'g' },
{ "help", no_argument, NULL, 'h' },
{ "scan-mode", required_argument, NULL, 'm' },
{ "absolute-paths", no_argument, NULL, 'p' },
{ "timestamps", no_argument, NULL, 't' },
{ "verbose", required_argument, NULL, 'v' },
{ 0, 0, 0, 0 },
};

// Build getopt_long's short option list from the long_options table.
// While we're at it, make sure we didn't duplicate any options.
string getopt_list;
set<decltype(option::val)> option_vals;
for (const struct option *op = long_options; op->val; ++op) {
THROW_CHECK1(runtime_error, op->val, !option_vals.count(op->val));
option_vals.insert(op->val);
if ((op->val & 0xff) != op->val) {
continue;
}
getopt_list += op->val;
if (op->has_arg == required_argument) {
getopt_list += ':';
}
}

// Parse options
int c;
while (1) {
while (true) {
int option_index = 0;
static const struct option long_options[] = {
{ "thread-factor", required_argument, NULL, 'C' },
{ "thread-min", required_argument, NULL, 'G' },
{ "strip-paths", no_argument, NULL, 'P' },
{ "no-timestamps", no_argument, NULL, 'T' },
{ "thread-count", required_argument, NULL, 'c' },
{ "loadavg-target", required_argument, NULL, 'g' },
{ "help", no_argument, NULL, 'h' },
{ "scan-mode", required_argument, NULL, 'm' },
{ "absolute-paths", no_argument, NULL, 'p' },
{ "timestamps", no_argument, NULL, 't' },
{ "verbose", required_argument, NULL, 'v' },
};

c = getopt_long(argc, argv, "C:G:PTc:hg:m:ptv:", long_options, &option_index);
c = getopt_long(argc, argv, getopt_list.c_str(), long_options, &option_index);
if (-1 == c) {
break;
}

BEESLOGDEBUG("Parsing option '" << static_cast<char>(c) << "'");

switch (c) {

case 'C':
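
For the long_options table above, the generated short-option string is "C:G:PTac:g:hm:ptv:" -- each val in table order, with a ':' appended when the entry takes an argument. That replaces the old hand-maintained "C:G:PTc:hg:m:ptv:", which had already drifted out of sync (it has no 'a' for the new --workaround-btrfs-send). A compressed self-check of the same loop (the 0xff filter for long-only options is omitted for brevity):

	#include <cassert>
	#include <getopt.h>
	#include <string>

	int main()
	{
		static const struct option long_options[] = {
			{ "thread-factor", required_argument, NULL, 'C' },
			{ "thread-min", required_argument, NULL, 'G' },
			{ "strip-paths", no_argument, NULL, 'P' },
			{ "no-timestamps", no_argument, NULL, 'T' },
			{ "workaround-btrfs-send", no_argument, NULL, 'a' },
			{ "thread-count", required_argument, NULL, 'c' },
			{ "loadavg-target", required_argument, NULL, 'g' },
			{ "help", no_argument, NULL, 'h' },
			{ "scan-mode", required_argument, NULL, 'm' },
			{ "absolute-paths", no_argument, NULL, 'p' },
			{ "timestamps", no_argument, NULL, 't' },
			{ "verbose", required_argument, NULL, 'v' },
			{ 0, 0, 0, 0 },
		};
		std::string getopt_list;
		for (const struct option *op = long_options; op->val; ++op) {
			getopt_list += static_cast<char>(op->val);
			if (op->has_arg == required_argument) getopt_list += ':';
		}
		// One character per table row, ':' marking options with arguments.
		assert(getopt_list == "C:G:PTac:g:hm:ptv:");
		return 0;
	}
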
@@ -694,6 +672,9 @@ bees_main(int argc, char *argv[])
case 'T':
chatter_prefix_timestamp = false;
break;
case 'a':
workaround_btrfs_send = true;
break;
case 'c':
thread_count = stoul(optarg);
break;
@@ -701,7 +682,7 @@ bees_main(int argc, char *argv[])
load_target = stod(optarg);
break;
case 'm':
BeesRoots::set_scan_mode(static_cast<BeesRoots::ScanMode>(stoul(optarg)));
root_scan_mode = static_cast<BeesRoots::ScanMode>(stoul(optarg));
break;
case 'p':
crucible::set_relative_path("");
@@ -720,12 +701,17 @@ bees_main(int argc, char *argv[])
break;

case 'h':
do_cmd_help(argv); // fallthrough
default:
return 2;
do_cmd_help(argv);
return EXIT_FAILURE;
}
}

if (optind + 1 != argc) {
BEESLOGERR("Only one filesystem path per bees process");
return EXIT_FAILURE;
}

Chatter::enable_timestamp(chatter_prefix_timestamp);

if (!relative_path().empty()) {
@@ -762,29 +748,28 @@ bees_main(int argc, char *argv[])
BEESLOGNOTICE("setting worker thread pool maximum size to " << thread_count);
TaskMaster::set_thread_count(thread_count);

// Create a context and start crawlers
bool did_subscription = false;
while (optind < argc) {
catch_all([&]() {
bc = make_shared<BeesContext>(bc);
bc->set_root_path(argv[optind++]);
did_subscription = true;
});
}
// Set root path
string root_path = argv[optind++];
BEESLOGNOTICE("setting root path to '" << root_path << "'");
bc->set_root_path(root_path);

if (!did_subscription) {
BEESLOGWARN("WARNING: no filesystems added");
}
// Workaround for btrfs send
bc->roots()->set_workaround_btrfs_send(workaround_btrfs_send);

BeesThread status_thread("status", [&]() {
bc->dump_status();
});
// Set root scan mode
bc->roots()->set_scan_mode(root_scan_mode);

// Start crawlers
bc->start();

// Now we just wait forever
bc->show_progress();
wait_for_term_signal();

// Shut it down
bc->stop();

// That is all.
return 0;
return EXIT_SUCCESS;
}

int
@@ -794,13 +779,14 @@ main(int argc, char *argv[])

if (argc < 2) {
do_cmd_help(argv);
return 2;
return EXIT_FAILURE;
}

int rv = 1;
catch_and_explain([&]() {
rv = bees_main(argc, argv);
});
BEESLOGNOTICE("Exiting with status " << rv << " " << (rv ? "(failure)" : "(success)"));
return rv;
}
|
||||
|
156
src/bees.h
156
src/bees.h
@@ -8,6 +8,7 @@
 #include "crucible/fd.h"
 #include "crucible/fs.h"
 #include "crucible/lockset.h"
+#include "crucible/pool.h"
 #include "crucible/progress.h"
 #include "crucible/time.h"
 #include "crucible/task.h"
@@ -28,7 +29,7 @@ using namespace std;
 // Block size for clone alignment (FIXME: should read this from /sys/fs/btrfs/<FS-UUID>/clone_alignment)
 const off_t BLOCK_SIZE_CLONE = 4096;

-// Block size for dedup checksums (arbitrary, but must be a multiple of clone alignment)
+// Block size for dedupe checksums (arbitrary, but must be a multiple of clone alignment)
 const off_t BLOCK_SIZE_SUMS = 4096;

 // Block size for memory allocations and file mappings (FIXME: should be CPU page size)
@@ -49,14 +50,14 @@ const off_t BLOCK_SIZE_MAX_EXTENT = 128 * 1024 * 1024;
 const off_t BLOCK_MASK_CLONE = BLOCK_SIZE_CLONE - 1;
 const off_t BLOCK_MASK_SUMS = BLOCK_SIZE_SUMS - 1;

-// Maximum temporary file size
+// Maximum temporary file size (maximum extent size for temporary copy)
 const off_t BLOCK_SIZE_MAX_TEMP_FILE = 1024 * 1024 * 1024;

 // Bucket size for hash table (size of one hash bucket)
 const off_t BLOCK_SIZE_HASHTAB_BUCKET = BLOCK_SIZE_MMAP;

 // Extent size for hash table (since the nocow file attribute does not seem to be working today)
-const off_t BLOCK_SIZE_HASHTAB_EXTENT = 16 * 1024 * 1024;
+const off_t BLOCK_SIZE_HASHTAB_EXTENT = BLOCK_SIZE_MAX_COMPRESSED_EXTENT;

 // Bytes per second we want to flush (8GB every two hours)
 const double BEES_FLUSH_RATE = 8.0 * 1024 * 1024 * 1024 / 7200.0;
@@ -88,11 +89,11 @@ const double BEES_DEFAULT_THREAD_FACTOR = 1.0;
 // Log warnings when an operation takes too long
 const double BEES_TOO_LONG = 5.0;

-// Avoid any extent where LOGICAL_INO takes this long
-const double BEES_TOXIC_DURATION = 9.9;
-// EXPERIMENT: Kernel v4.14+ may let us ignore toxicity
-// NOPE: kernel 4.14 has the same toxicity problems as any previous kernel
-// const double BEES_TOXIC_DURATION = 99.9;
+// Avoid any extent where LOGICAL_INO takes this much kernel CPU time
+const double BEES_TOXIC_SYS_DURATION = 0.1;

+// Maximum number of refs to a single extent
+const size_t BEES_MAX_EXTENT_REF_COUNT = (16 * 1024 * 1024 / 24) - 1;

 // How long between hash table histograms
 const double BEES_HASH_TABLE_ANALYZE_INTERVAL = BEES_STATS_INTERVAL;
@@ -101,7 +102,10 @@ const double BEES_HASH_TABLE_ANALYZE_INTERVAL = BEES_STATS_INTERVAL;
 const size_t BEES_MAX_QUEUE_SIZE = 128;

 // Read this many items at a time in SEARCHv2
-const size_t BEES_MAX_CRAWL_SIZE = 1024;
+const size_t BEES_MAX_CRAWL_ITEMS = 8;

+// Read this many bytes at a time in SEARCHv2 (one maximum-sized metadata page)
+const size_t BEES_MAX_CRAWL_BYTES = 64 * 1024;

 // Insert this many items before switching to a new subvol
 const size_t BEES_MAX_CRAWL_BATCH = 128;
@@ -109,10 +113,17 @@ const size_t BEES_MAX_CRAWL_BATCH = 128;
 // Wait this many transids between crawls
 const size_t BEES_TRANSID_FACTOR = 10;

-// If an extent has this many refs, pretend it does not exist
-// to avoid a crippling btrfs performance bug
-// The actual limit in LOGICAL_INO seems to be 2730, but let's leave a little headroom
-const size_t BEES_MAX_EXTENT_REF_COUNT = 2560;
+// Wait this long for a balance to stop
+const double BEES_BALANCE_POLL_INTERVAL = 60.0;

+// Workaround for backref bugs
+const bool BEES_SERIALIZE_RESOLVE = false;

+// Workaround for tree mod log bugs
+const bool BEES_SERIALIZE_BALANCE = false;

+// Workaround for silly dedupe / ineffective readahead behavior
+const size_t BEES_READAHEAD_SIZE = 1024 * 1024;

 // Flags
 const int FLAGS_OPEN_COMMON = O_NOFOLLOW | O_NONBLOCK | O_CLOEXEC | O_NOATIME | O_LARGEFILE | O_NOCTTY;
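BLOCK_MASK_CLONE and BLOCK_MASK_SUMS follow the usual power-of-two mask idiom (size minus one). A sketch of how such masks round offsets to block boundaries; the helper names are illustrative, not from bees:

	#include <sys/types.h>

	const off_t BLOCK_SIZE_CLONE_SKETCH = 4096;
	const off_t BLOCK_MASK_CLONE_SKETCH = BLOCK_SIZE_CLONE_SKETCH - 1;

	// Round down / up to a clone-aligned boundary.
	static inline off_t align_down(off_t o) { return o & ~BLOCK_MASK_CLONE_SKETCH; }
	static inline off_t align_up(off_t o)   { return (o + BLOCK_MASK_CLONE_SKETCH) & ~BLOCK_MASK_CLONE_SKETCH; }
	// e.g. align_down(5000) == 4096, align_up(5000) == 8192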
@@ -127,7 +138,7 @@ const int FLAGS_OPEN_FANOTIFY = O_RDWR | O_NOATIME | O_CLOEXEC | O_LARGEFILE;

 // macros ----------------------------------------

-#define BEESLOG(lv,x) do { if (lv < bees_log_level) { Chatter c(lv, BeesNote::get_name()); c << x; } } while (0)
+#define BEESLOG(lv,x) do { if (lv < bees_log_level) { Chatter __chatter(lv, BeesNote::get_name()); __chatter << x; } } while (0)
 #define BEESLOGTRACE(x) do { BEESLOG(LOG_DEBUG, x); BeesTracer::trace_now(); } while (0)

 #define BEESTRACE(x) BeesTracer SRSLY_WTF_C(beesTracer_, __LINE__) ([&]() { BEESLOG(LOG_ERR, x); })
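The only change to BEESLOG is the name of the temporary Chatter object, from c to the collision-proof __chatter. The old single-letter name could shadow a variable in the calling scope, because the message expression x is pasted into the same block as the macro's local. An illustration (not from the bees source):

	// With the old macro, the expansion below contained its own local named
	// 'c', so the 'c' inside the message text bound to the Chatter object
	// rather than the loop counter -- typically a confusing compile error.
	for (int c = 0; c < 10; ++c) {
		BEESLOG(LOG_DEBUG, "pass " << c);   // old macro: wrong 'c'; new macro: fine
	}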
@@ -167,7 +178,7 @@ public:
 	T at(string idx) const;

 friend ostream& operator<< <>(ostream &os, const BeesStatTmpl<T> &bs);
-friend class BeesStats;
+friend struct BeesStats;
 };

 using BeesRates = BeesStatTmpl<double>;
@@ -188,10 +199,14 @@ class BeesTracer {
 	BeesTracer *m_next_tracer = 0;

 	thread_local static BeesTracer *tl_next_tracer;
+	thread_local static bool tl_silent;
+	thread_local static bool tl_first;
 public:
-	BeesTracer(function<void()> f);
+	BeesTracer(function<void()> f, bool silent = false);
 	~BeesTracer();
 	static void trace_now();
+	static bool get_silent();
+	static void set_silent();
 };

 class BeesNote {
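New tl_silent/tl_first flags aside, the declarations show the shape of BeesTracer: a per-thread intrusive stack of cleanup callbacks, pushed by the constructor, popped by the destructor, and walked by trace_now() to print context when something goes wrong. A self-contained sketch of that pattern, inferred from the declarations above rather than copied from bees.cc:

	#include <functional>

	struct TracerSketch {
		TracerSketch *m_next_tracer;
		std::function<void()> m_func;
		thread_local static TracerSketch *tl_next_tracer;

		TracerSketch(std::function<void()> f) : m_next_tracer(tl_next_tracer), m_func(f) {
			tl_next_tracer = this;            // push onto this thread's stack
		}
		~TracerSketch() {
			tl_next_tracer = m_next_tracer;   // pop on scope exit
		}
		static void trace_now() {
			// Walk the stack: each frame logs its context, innermost first.
			for (TracerSketch *t = tl_next_tracer; t; t = t->m_next_tracer) {
				t->m_func();
			}
		}
	};
	thread_local TracerSketch *TracerSketch::tl_next_tracer = nullptr;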
@@ -319,7 +334,6 @@ public:

 	// Blocks with no physical address (not yet allocated, hole, or "other").
 	// PREALLOC blocks have a physical address so they're not magic enough to be handled here.
-	// Compressed blocks have a physical address but it's two-dimensional.
 	enum MagicValue {
 		ZERO,       // BeesAddress uninitialized
 		DELALLOC,   // delayed allocation
@@ -374,7 +388,7 @@ class BeesStringFile {
 	size_t m_limit;

 public:
-	BeesStringFile(Fd dir_fd, string name, size_t limit = 1024 * 1024);
+	BeesStringFile(Fd dir_fd, string name, size_t limit = 16 * 1024 * 1024);
 	string read();
 	void write(string contents);
 	void name(const string &new_name);
@@ -415,6 +429,8 @@ public:
 	BeesHashTable(shared_ptr<BeesContext> ctx, string filename, off_t size = BLOCK_SIZE_HASHTAB_EXTENT);
 	~BeesHashTable();

+	void stop();

 	vector<Cell> find_cell(HashType hash);
 	bool push_random_hash_addr(HashType hash, AddrType addr);
 	void erase_hash_addr(HashType hash, AddrType addr);
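Those three calls are the whole lookup surface of the hash table: find_cell() returns candidate cells for a hash, push_random_hash_addr() records a new (hash, addr) pair (evicting a random victim when the bucket is full, per the name), and erase_hash_addr() drops entries that turn out to be stale. A hypothetical caller, with the loop body reduced to comments because the real scan logic lives in BeesContext:

	// Hypothetical flow only; the types and calls are from the header above.
	void scan_block_sketch(BeesHashTable &ht,
	                       BeesHashTable::HashType hash,
	                       BeesHashTable::AddrType addr)
	{
		for (const auto &cell : ht.find_cell(hash)) {
			// cell holds a previously seen (hash, addr) pair: try to dedupe
			// against it, and if the stored address proves stale, remove it
			// with ht.erase_hash_addr(hash, <stale addr from cell>).
			(void)cell;
		}
		// Remember this block for future duplicates.
		ht.push_random_hash_addr(hash, addr);
	}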
@@ -444,12 +460,20 @@ private:
 	BeesThread m_writeback_thread;
 	BeesThread m_prefetch_thread;
 	RateLimiter m_flush_rate_limit;
-	set<HashType> m_toxic_hashes;
 	BeesStringFile m_stats_file;

+	// Prefetch readahead hint
+	bool m_prefetch_running = false;

 	// Mutex/condvar for the writeback thread
 	mutex m_dirty_mutex;
 	condition_variable m_dirty_condvar;
 	bool m_dirty;

+	// Mutex/condvar to stop
+	mutex m_stop_mutex;
+	condition_variable m_stop_condvar;
+	bool m_stop_requested = false;

 	// Per-extent structures
 	struct ExtentMetaData {
@@ -469,9 +493,8 @@ private:
 	void fetch_missing_extent_by_hash(HashType hash);
 	void fetch_missing_extent_by_index(uint64_t extent_index);
 	void set_extent_dirty_locked(uint64_t extent_index);
-	void flush_dirty_extents();
+	size_t flush_dirty_extents(bool slowly);
+	bool flush_dirty_extent(uint64_t extent_index);
-	bool is_toxic_hash(HashType h) const;

 	size_t hash_to_extent_index(HashType ht);
 	unique_lock<mutex> lock_extent_by_hash(HashType ht);
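The new m_stop_* members are the standard cooperative-shutdown trio, and the flush_dirty_extents(bool slowly) split suggests the writeback thread now paces itself between stop checks. A generic sketch of how stop() pairs with such a loop, assuming only the usual condition-variable idiom (not the literal bees implementation):

	#include <chrono>
	#include <condition_variable>
	#include <mutex>

	struct StopSketch {
		std::mutex m_stop_mutex;
		std::condition_variable m_stop_condvar;
		bool m_stop_requested = false;

		void stop() {
			std::unique_lock<std::mutex> lock(m_stop_mutex);
			m_stop_requested = true;
			m_stop_condvar.notify_all();   // wake the writeback thread immediately
		}

		void writeback_loop() {
			std::unique_lock<std::mutex> lock(m_stop_mutex);
			while (!m_stop_requested) {
				// ...flush some dirty extents here...
				m_stop_condvar.wait_for(lock, std::chrono::seconds(1));
			}
		}
	};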
@@ -532,13 +555,22 @@ class BeesRoots : public enable_shared_from_this<BeesRoots> {
 	BeesThread m_writeback_thread;
 	RateEstimator m_transid_re;
 	size_t m_transid_factor = BEES_TRANSID_FACTOR;
-	atomic<bool> m_task_running;
+	Task m_crawl_task;
+	bool m_workaround_btrfs_send = false;
+	LRUCache<bool, uint64_t> m_root_ro_cache;

+	mutex m_tmpfiles_mutex;
+	map<BeesFileId, Fd> m_tmpfiles;

+	mutex m_stop_mutex;
+	condition_variable m_stop_condvar;
+	bool m_stop_requested = false;

 	void insert_new_crawl();
 	void insert_root(const BeesCrawlState &bcs);
 	Fd open_root_nocache(uint64_t root);
 	Fd open_root_ino_nocache(uint64_t root, uint64_t ino);
+	bool is_root_ro_nocache(uint64_t root);
 	uint64_t transid_min();
 	uint64_t transid_max();
 	uint64_t transid_max_nocache();
@@ -555,15 +587,23 @@ class BeesRoots : public enable_shared_from_this<BeesRoots> {
 	void current_state_set(const BeesCrawlState &bcs);
 	RateEstimator& transid_re();
 	size_t crawl_batch(shared_ptr<BeesCrawl> crawl);
 	void clear_caches();
+	void insert_tmpfile(Fd fd);
+	void erase_tmpfile(Fd fd);

 friend class BeesFdCache;
 friend class BeesCrawl;
+friend class BeesTempFile;

 public:
 	BeesRoots(shared_ptr<BeesContext> ctx);
+	void start();
+	void stop();

 	Fd open_root(uint64_t root);
 	Fd open_root_ino(uint64_t root, uint64_t ino);
 	Fd open_root_ino(const BeesFileId &bfi) { return open_root_ino(bfi.root(), bfi.ino()); }
+	bool is_root_ro(uint64_t root);

 	// TODO: think of better names for these.
 	// or TODO: do extent-tree scans instead
@@ -574,10 +614,11 @@ public:
 		SCAN_MODE_COUNT, // must be last
 	};

-	static void set_scan_mode(ScanMode new_mode);
+	void set_scan_mode(ScanMode new_mode);
+	void set_workaround_btrfs_send(bool do_avoid);

 private:
-	static ScanMode s_scan_mode;
+	ScanMode m_scan_mode = SCAN_MODE_ZERO;
 	static string scan_mode_ntoa(ScanMode new_mode);
 };
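set_scan_mode() losing its static marks the scan mode moving from a process-wide global (s_scan_mode) to per-instance state (m_scan_mode). SCAN_MODE_COUNT stays as the range sentinel, which is what makes validating a parsed option a single comparison. A sketch of the caller side; optarg_value is a hypothetical name, while THROW_CHECK1 is the crucible macro used elsewhere in this codebase:

	// Hypothetical option handling for a parsed --scan-mode integer.
	int optarg_value = 2;
	THROW_CHECK1(out_of_range, optarg_value,
		optarg_value >= 0 && optarg_value < BeesRoots::SCAN_MODE_COUNT);
	bc->roots()->set_scan_mode(static_cast<BeesRoots::ScanMode>(optarg_value));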
@@ -589,6 +630,7 @@ struct BeesHash {
 	BeesHash(Type that) : m_hash(that) { }
 	operator Type() const { return m_hash; }
 	BeesHash& operator=(const Type that) { m_hash = that; return *this; }
+	BeesHash(const uint8_t *ptr, size_t len);
 private:
 	Type m_hash;

@@ -645,30 +687,32 @@ friend ostream & operator<<(ostream &os, const BeesRangePair &brp);

 class BeesTempFile {
 	shared_ptr<BeesContext> m_ctx;
+	shared_ptr<BeesRoots> m_roots;
 	Fd m_fd;
 	off_t m_end_offset;

 	void create();
 	void realign();
 	void resize(off_t new_end_offset);

 public:
+	~BeesTempFile();
 	BeesTempFile(shared_ptr<BeesContext> ctx);
 	BeesFileRange make_hole(off_t count);
 	BeesFileRange make_copy(const BeesFileRange &src);
+	void reset();
 };

 class BeesFdCache {
-	LRUCache<Fd, shared_ptr<BeesContext>, uint64_t> m_root_cache;
-	LRUCache<Fd, shared_ptr<BeesContext>, uint64_t, uint64_t> m_file_cache;
-	Timer m_root_cache_timer;
-	Timer m_file_cache_timer;
+	shared_ptr<BeesContext> m_ctx;
+	LRUCache<Fd, uint64_t> m_root_cache;
+	LRUCache<Fd, uint64_t, uint64_t> m_file_cache;
+	Timer m_root_cache_timer;
+	Timer m_file_cache_timer;

 public:
-	BeesFdCache();
-	Fd open_root(shared_ptr<BeesContext> ctx, uint64_t root);
-	Fd open_root_ino(shared_ptr<BeesContext> ctx, uint64_t root, uint64_t ino);
-	void insert_root_ino(shared_ptr<BeesContext> ctx, Fd fd);
+	BeesFdCache(shared_ptr<BeesContext> ctx);
+	Fd open_root(uint64_t root);
+	Fd open_root_ino(uint64_t root, uint64_t ino);
 	void clear();
 };
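Two small API notes from this hunk: BeesHash gains a constructor that hashes a raw byte range directly, and BeesFdCache now binds its BeesContext once at construction instead of threading a shared_ptr through every call. Assumed usage of the new hash constructor, based only on its declaration (BLOCK_SIZE_SUMS is declared earlier in this header):

	// Hash one dedupe-sized block in a single step.
	static BeesHash hash_block_sketch(const uint8_t (&block)[BLOCK_SIZE_SUMS]) {
		return BeesHash(block, sizeof(block));
	}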
@@ -679,6 +723,10 @@ struct BeesResolveAddrResult {
 	bool is_toxic() const { return m_is_toxic; }
 };

+struct BeesHalt : exception {
+	const char *what() const noexcept override;
+};

 class BeesContext : public enable_shared_from_this<BeesContext> {
 	shared_ptr<BeesContext> m_parent_ctx;

@@ -687,47 +735,57 @@ class BeesContext : public enable_shared_from_this<BeesContext> {
 	shared_ptr<BeesFdCache> m_fd_cache;
 	shared_ptr<BeesHashTable> m_hash_table;
 	shared_ptr<BeesRoots> m_roots;

-	map<thread::id, shared_ptr<BeesTempFile>> m_tmpfiles;
+	Pool<BeesTempFile> m_tmpfile_pool;

 	LRUCache<BeesResolveAddrResult, BeesAddress> m_resolve_cache;

 	string m_root_path;
 	Fd m_root_fd;
-	string m_root_uuid;

+	mutable mutex m_blacklist_mutex;
 	set<BeesFileId> m_blacklist;

-	string m_uuid;

 	Timer m_total_timer;

 	LockSet<uint64_t> m_extent_lock_set;

+	mutable mutex m_stop_mutex;
+	condition_variable m_stop_condvar;
+	bool m_stop_requested = false;
+	bool m_stop_status = false;

+	mutable mutex m_abort_mutex;
+	condition_variable m_abort_condvar;
+	bool m_abort_requested = false;

 	shared_ptr<BeesThread> m_progress_thread;
 	shared_ptr<BeesThread> m_status_thread;

 	void set_root_fd(Fd fd);

 	BeesResolveAddrResult resolve_addr_uncached(BeesAddress addr);
+	void wait_for_balance();

 	BeesFileRange scan_one_extent(const BeesFileRange &bfr, const Extent &e);
 	void rewrite_file_range(const BeesFileRange &bfr);

 public:
-	BeesContext(shared_ptr<BeesContext> parent_ctx = nullptr);
+	BeesContext() = default;

 	void set_root_path(string path);

 	Fd root_fd() const { return m_root_fd; }
 	Fd home_fd();
 	string root_path() const { return m_root_path; }
-	string root_uuid() const { return m_root_uuid; }

 	BeesFileRange scan_forward(const BeesFileRange &bfr);

-	BeesRangePair dup_extent(const BeesFileRange &src);
+	bool is_root_ro(uint64_t root);
+	BeesRangePair dup_extent(const BeesFileRange &src, const shared_ptr<BeesTempFile> &tmpfile);
 	bool dedup(const BeesRangePair &brp);

-	void blacklist_add(const BeesFileId &fid);
+	void blacklist_insert(const BeesFileId &fid);
+	void blacklist_erase(const BeesFileId &fid);
 	bool is_blacklisted(const BeesFileId &fid) const;

 	BeesResolveAddrResult resolve_addr(BeesAddress addr);
@@ -736,6 +794,10 @@ public:
 	void dump_status();
 	void show_progress();

+	void start();
+	void stop();
+	bool stop_requested() const;

 	shared_ptr<BeesFdCache> fd_cache();
 	shared_ptr<BeesHashTable> hash_table();
 	shared_ptr<BeesRoots> roots();
@@ -743,9 +805,6 @@ public:

 	const Timer &total_timer() const { return m_total_timer; }
 	LockSet<uint64_t> &extent_lock_set() { return m_extent_lock_set; }

-	// TODO: move the rest of the FD cache methods here
-	void insert_root_ino(Fd fd);
 };
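The per-thread map of temporary files is gone; BeesContext now draws BeesTempFile objects from a crucible Pool, which matches the new dup_extent() overload taking the tempfile explicitly. The exact Pool interface is not shown in this diff, so the checkout below is an assumption kept in comment form; only dup_extent() and dedup() signatures come from the header:

	// Assumed shape of the new flow (Pool's call operator is a guess based
	// on typical handle-pool designs):
	//
	//   auto tmpfile = m_tmpfile_pool();              // borrow a BeesTempFile
	//   BeesRangePair brp = dup_extent(src, tmpfile); // copy src through it
	//   bool deduped = dedup(brp);                    // then dedupe the pair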
 class BeesResolver {
@@ -755,10 +814,10 @@ class BeesResolver {
 	set<BeesFileRange> m_ranges;
 	unsigned m_bior_count;

-	// We found matching data, so we can dedup
+	// We found matching data, so we can dedupe
 	bool m_found_data = false;

-	// We found matching data, so we *did* dedup
+	// We found matching data, so we *did* dedupe
 	bool m_found_dup = false;

 	// We found matching hash, so the hash table is still correct
@@ -826,9 +885,12 @@ public:

 // And now, a giant pile of extern declarations
 extern int bees_log_level;
+extern const char *BEES_USAGE;
 extern const char *BEES_VERSION;
 string pretty(double d);
-void bees_sync(int fd);
+void bees_readahead(int fd, off_t offset, size_t size);
+void bees_unreadahead(int fd, off_t offset, size_t size);
+string format_time(time_t t);

 #endif
@@ -23,6 +23,7 @@ main(int argc, char **argv)
 		cout << "File: " << filename << endl;
 		Fd fd = open_or_die(filename, O_RDONLY);
 		Fiemap fm;
+		fm.fm_flags &= ~(FIEMAP_FLAG_SYNC);
 		fm.m_max_count = 100;
 		if (argc > 2) { fm.fm_start = stoull(argv[2], nullptr, 0); }
 		if (argc > 3) { fm.fm_length = stoull(argv[3], nullptr, 0); }
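The one functional change here clears FIEMAP_FLAG_SYNC, which otherwise makes the kernel flush the file to disk before reporting its extent map; without it the tool can show the in-flight (delalloc) layout. For reference, the same call against the raw kernel interface, which the Fiemap class presumably wraps:

	#include <linux/fiemap.h>
	#include <linux/fs.h>
	#include <sys/ioctl.h>

	// Query the extent count without forcing a sync first (sketch).
	static int count_extents_sketch(int fd, struct fiemap *fm) {
		fm->fm_start = 0;
		fm->fm_length = FIEMAP_MAX_OFFSET;
		fm->fm_flags = 0;           // notably *not* FIEMAP_FLAG_SYNC
		fm->fm_extent_count = 0;    // 0 = just count, don't return extents
		return ioctl(fd, FS_IOC_FIEMAP, fm);
	}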
@@ -3,6 +3,7 @@ PROGRAMS = \
 	crc64 \
 	fd \
 	limits \
+	namedptr \
 	path \
 	process \
 	progress \
@@ -14,13 +15,16 @@ test: $(PROGRAMS:%=%.txt) Makefile
 FORCE:

 include ../makeflags
+-include ../localconf

 LIBS = -lcrucible -lpthread
-LDFLAGS = -L../lib -Wl,-rpath=$(shell realpath ../lib)
+BEES_LDFLAGS = -L../lib $(LDFLAGS)

-.depends/%.dep: %.cc tests.h Makefile
-	@mkdir -p .depends
-	$(CXX) $(CXXFLAGS) -M -MF $@ -MT $(<:.cc=.o) $<
+.depends:
+	mkdir -p $@

+.depends/%.dep: %.cc tests.h Makefile | .depends
+	$(CXX) $(BEES_CXXFLAGS) -M -MF $@ -MT $(<:.cc=.o) $<

 depends.mk: $(PROGRAMS:%=.depends/%.dep)
 	cat $^ > $@.new
@@ -28,13 +32,11 @@ depends.mk: $(PROGRAMS:%=.depends/%.dep)

 include depends.mk

-%.o: %.cc %.h ../makeflags Makefile
-	@echo "Implicit rule %.o: %.cc"
-	$(CXX) $(CXXFLAGS) -o $@ -c $<
+$(PROGRAMS:%=%.o): %.o: %.cc ../makeflags Makefile
+	$(CXX) $(BEES_CXXFLAGS) -o $@ -c $<

-$(PROGRAMS): %: %.o ../makeflags Makefile
-	@echo "Implicit rule %: %.o"
-	$(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $< $(LIBS)
+$(PROGRAMS): %: %.o ../makeflags Makefile ../lib/libcrucible.a
+	$(CXX) $(BEES_CXXFLAGS) $(BEES_LDFLAGS) -o $@ $< $(LIBS)

 %.txt: % Makefile FORCE
 	./$< >$@ 2>&1 || (RC=$$?; cat $@; exit $$RC)
test/fd.cc (56 lines changed)
@@ -37,7 +37,7 @@ test_basic_read()
 	char read_buf[test_string_len];
 	read_or_die(f, read_buf);
 	assert(!strncmp(read_buf, test_string, test_string_len));
-	f->close();
+	f = Fd();
 }

 static
@@ -207,8 +207,8 @@ static void test_assign_int_close()
 		assert(j == -1);
 		// Bonus conversion operator tests
 		assert(fd == -1);
-		// Chasing a closed ref now triggers an exception
-		assert(catch_all([&]() { return fd->get_fd() == -1; }));
+		// Chasing a closed ref no longer triggers an exception
+		assert(fd->get_fd() == -1);
 	}
 	assert_is_closed(i, true);
 }
@@ -228,8 +228,8 @@ static void test_assign_int_close_2()
 		assert(j == -1);
 		// Bonus conversion operator tests
 		assert(fd == -1);
-		// Chasing a closed ref now triggers an exception
-		assert(catch_all([&]() { return fd->get_fd() == -1; }));
+		// Chasing a closed ref no longer triggers an exception
+		assert(fd->get_fd() == -1);
 	}
 	assert_is_closed(i, true);
 }
@@ -262,36 +262,52 @@ static void test_map()
 	assert_is_closed(c, false);
 }

-static void test_close_method()
+static void test_close()
 {
 	Fd fd = open("fd.cc", O_RDONLY);
 	int i = fd;
 	assert_is_closed(i, false);
-	fd->close();
+	fd = Fd();
 	assert_is_closed(i, true);
 }

-static void test_shared_close_method()
+static void test_shared_close()
 {
 	Fd fd = open("fd.cc", O_RDONLY);
 	int i = fd;
 	Fd fd2 = fd;
 	assert_is_closed(i, false);
 	assert_is_closed(fd2, false);
-	fd->close();
+	fd2 = Fd();
 	assert_is_closed(i, false);
+	assert_is_closed(fd, false);
 	assert_is_closed(fd2, true);
+	fd = Fd();
 	assert_is_closed(i, true);
+	assert_is_closed(fd, true);
 	assert_is_closed(fd2, true);
 }
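The rewritten tests replace the explicit fd->close() with handle reset (fd = Fd()), and test_shared_close() pins down the resulting semantics: the underlying descriptor stays open until the last copy of the handle is reset. The same behavior in miniature:

	static void shared_close_in_miniature() {
		Fd a = open("fd.cc", O_RDONLY);
		Fd b = a;    // both handles share one descriptor
		a = Fd();    // drop one reference: descriptor still open via b
		b = Fd();    // drop the last reference: descriptor actually closes
	}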
 struct DerivedFdResource : public Fd::resource_type {
 	string m_name;
-	DerivedFdResource(string name) : m_name(name) {
-		Fd::resource_type::operator=(open(name.c_str(), O_RDONLY));
+	DerivedFdResource(string name) : Fd::resource_type(open(name.c_str(), O_RDONLY)), m_name(name) {
 		assert_is_closed(this->get_fd(), false);
 	}
 	const string &name() const { return m_name; }
 };

+template<class T>
+shared_ptr<T>
+cast(const Fd &fd)
+{
+	auto dp = dynamic_pointer_cast<T>(fd.operator->());
+	if (!dp) {
+		cerr << "expect bad cast exception: " << flush;
+		throw bad_cast();
+	}
+	return dp;
+}
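cast<T>() moved from a member template to the free function above, but the mechanism is unchanged: a dynamic_pointer_cast over the Fd's shared resource object, which fails (and here throws bad_cast) when the resource is not actually a T. The standard-library behavior it leans on, in isolation:

	#include <iostream>
	#include <memory>

	struct Base { virtual ~Base() = default; };
	struct Derived : Base {};

	int main() {
		std::shared_ptr<Base> plain = std::make_shared<Base>();
		std::shared_ptr<Base> fancy = std::make_shared<Derived>();
		// Succeeds only when the pointee really is a Derived:
		std::cout << !!std::dynamic_pointer_cast<Derived>(fancy) << std::endl;  // 1
		std::cout << !!std::dynamic_pointer_cast<Derived>(plain) << std::endl;  // 0
	}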
 struct DerivedFd : public Fd {
 	using resource_type = DerivedFdResource;
 	DerivedFd(string name) {
@@ -299,7 +315,7 @@ struct DerivedFd : public Fd {
 		Fd::operator=(static_pointer_cast<Fd::resource_type>(ptr));
 	}
 	shared_ptr<DerivedFdResource> operator->() const {
-		shared_ptr<DerivedFdResource> rv = cast<DerivedFdResource>();
+		shared_ptr<DerivedFdResource> rv = cast<DerivedFdResource>(*this);
 		THROW_CHECK1(out_of_range, rv, rv);
 		return rv;
 	}
@@ -328,12 +344,12 @@ static void test_derived_cast()
 	Fd fd2(fd);
 	Fd fd3 = open("fd.cc", O_RDONLY);
 	assert(fd->name() == "fd.cc");
-	assert(fd.cast<Fd::resource_type>());
-	assert(fd.cast<DerivedFd::resource_type>());
-	assert(fd2.cast<Fd::resource_type>());
-	assert(fd2.cast<DerivedFd::resource_type>());
-	assert(fd3.cast<Fd::resource_type>());
-	assert(catch_all([&](){ assert(!fd3.cast<DerivedFd::resource_type>()); } ));
+	assert(cast<Fd::resource_type>(fd));
+	assert(cast<DerivedFd::resource_type>(fd));
+	assert(cast<Fd::resource_type>(fd2));
+	assert(cast<DerivedFd::resource_type>(fd2));
+	assert(cast<Fd::resource_type>(fd3));
+	assert(catch_all([&](){ assert(!cast<DerivedFd::resource_type>(fd3)); } ));
 }

 static void test_derived_map()
@@ -381,8 +397,8 @@ int main(int, const char **)
 	RUN_A_TEST(test_assign_int_close());
 	RUN_A_TEST(test_assign_int_close_2());
 	RUN_A_TEST(test_map());
-	RUN_A_TEST(test_close_method());
-	RUN_A_TEST(test_shared_close_method());
+	RUN_A_TEST(test_close());
+	RUN_A_TEST(test_shared_close());
 	RUN_A_TEST(test_derived_resource_type());
 	RUN_A_TEST(test_derived_map());
 	RUN_A_TEST(test_derived_cast());
test/namedptr.cc (84 lines, new file)
@@ -0,0 +1,84 @@
+#include "tests.h"
+#include "crucible/error.h"
+#include "crucible/namedptr.h"
+
+#include <cassert>
+#include <set>
+
+using namespace crucible;
+
+struct named_thing {
+	static set<named_thing*> s_set;
+	int m_a, m_b;
+	named_thing() = delete;
+	named_thing(const named_thing &that) :
+		m_a(that.m_a),
+		m_b(that.m_b)
+	{
+		cerr << "named_thing(" << m_a << ", " << m_b << ") " << this << " copied from " << &that << "." << endl;
+		auto rv = s_set.insert(this);
+		THROW_CHECK1(runtime_error, *rv.first, rv.second);
+	}
+	named_thing(int a, int b) :
+		m_a(a), m_b(b)
+	{
+		cerr << "named_thing(" << a << ", " << b << ") " << this << " constructed." << endl;
+		auto rv = s_set.insert(this);
+		THROW_CHECK1(runtime_error, *rv.first, rv.second);
+	}
+	~named_thing() {
+		auto rv = s_set.erase(this);
+		assert(rv == 1);
+		cerr << "named_thing(" << m_a << ", " << m_b << ") " << this << " destroyed." << endl;
+		m_a = ~m_a;
+		m_b = ~m_b;
+	}
+	void check(int a, int b) {
+		THROW_CHECK2(runtime_error, m_a, a, m_a == a);
+		THROW_CHECK2(runtime_error, m_b, b, m_b == b);
+	}
+	static void check_empty() {
+		THROW_CHECK1(runtime_error, s_set.size(), s_set.empty());
+	}
+};
+
+set<named_thing*> named_thing::s_set;
+
+static
+void
+test_namedptr()
+{
+	NamedPtr<named_thing, int, int> names;
+	names.func([](int a, int b) -> shared_ptr<named_thing> { return make_shared<named_thing>(a, b); });
+
+	auto a_3_5 = names(3, 5);
+	auto b_3_5 = names(3, 5);
+	{
+		auto c_2_7 = names(2, 7);
+		b_3_5 = a_3_5;
+		a_3_5->check(3, 5);
+		b_3_5->check(3, 5);
+		c_2_7->check(2, 7);
+	}
+	auto d_2_7 = names(2, 7);
+	a_3_5->check(3, 5);
+	a_3_5.reset();
+	b_3_5->check(3, 5);
+	d_2_7->check(2, 7);
+}
+
+static
+void
+test_leak()
+{
+	named_thing::check_empty();
+}
+
+int
+main(int, char**)
+{
+	RUN_A_TEST(test_namedptr());
+	RUN_A_TEST(test_leak());
+
+	exit(EXIT_SUCCESS);
+}
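What the new test pins down: NamedPtr<T, Args...> behaves as a keyed cache of shared objects. names(3, 5) builds a named_thing through the installed factory on first use, hands out the same instance while any holder keeps it alive, and lets the destructor run once the last holder drops it, which is what test_leak() verifies through s_set. Usage distilled from the test; the identity assertion states the assumed NamedPtr contract rather than an assert the test itself makes:

	NamedPtr<named_thing, int, int> names;
	names.func([](int a, int b) { return make_shared<named_thing>(a, b); });

	auto p = names(3, 5);   // factory runs: one live named_thing(3, 5)
	auto q = names(3, 5);   // same instance returned while p is alive
	assert(p == q);         // assumed NamedPtr semantics
	p.reset();
	q.reset();              // last holder gone: named_thing destroyed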
test/task.cc (75 lines changed)
@@ -3,6 +3,8 @@
 #include "crucible/task.h"
 #include "crucible/time.h"

+#include <atomic>
+#include <chrono>
 #include <cassert>
 #include <condition_variable>
 #include <mutex>
@@ -70,13 +72,14 @@ test_finish()
 	TaskMaster::print_queue(oss);
 	TaskMaster::print_workers(oss);
 	TaskMaster::set_thread_count(0);
-	// cerr << "finish done" << endl;
+	cerr << "finish done...";
 }

 void
 test_unfinish()
 {
 	TaskMaster::set_thread_count();
+	cerr << "unfinish done...";
 }
@@ -99,7 +102,7 @@ test_barrier(size_t count)
 		oss << "task #" << c;
 		Task t(
 			oss.str(),
-			[c, &task_done, &mtx, &cv, bl]() mutable {
+			[c, &task_done, &mtx, bl]() mutable {
 				// cerr << "Task #" << c << endl;
 				unique_lock<mutex> lock(mtx);
 				task_done.at(c) = true;
@@ -150,60 +153,78 @@ void
 test_exclusion(size_t count)
 {
 	mutex only_one;
-	Exclusion excl;
+	auto excl = make_shared<Exclusion>("test_excl");

 	mutex mtx;
 	condition_variable cv;

-	unique_lock<mutex> lock(mtx);
+	size_t tasks_running(0);
+	atomic<size_t> lock_success_count(0);
+	atomic<size_t> lock_failure_count(0);

-	auto b = make_shared<Barrier>();
+	vector<size_t> pings;
+	pings.resize(count);

 	// Run several tasks in parallel
 	for (size_t c = 0; c < count; ++c) {
-		auto bl = b->lock();
 		ostringstream oss;
 		oss << "task #" << c;
 		Task t(
 			oss.str(),
-			[c, &only_one, &mtx, &excl, bl]() mutable {
+			[c, &only_one, excl, &lock_success_count, &lock_failure_count, &pings, &tasks_running, &cv, &mtx]() mutable {
 				// cerr << "Task #" << c << endl;
-				auto lock = excl.try_lock();
+				(void)c;
+				auto lock = excl->try_lock();
 				if (!lock) {
-					excl.insert_task(Task::current_task());
+					excl->insert_task(Task::current_task());
+					++lock_failure_count;
 					return;
 				}
+				++lock_success_count;
 				bool locked = only_one.try_lock();
 				assert(locked);
 				nanosleep(0.0001);
 				only_one.unlock();
-				bl.release();
+				unique_lock<mutex> mtx_lock(mtx);
+				--tasks_running;
+				++pings[c];
+				cv.notify_all();
 			}
 		);
+		unique_lock<mutex> mtx_lock(mtx);
+		++tasks_running;
 		t.run();
 	}

-	bool done_flag = false;
-
-	Task completed(
-		"Waiting for Barrier",
-		[&mtx, &cv, &done_flag]() {
-			unique_lock<mutex> lock(mtx);
-			// cerr << "Running cv notify" << endl;
-			done_flag = true;
-			cv.notify_all();
-		}
-	);
-	b->insert_task(completed);
-	b.reset();
-
-	while (true) {
-		if (done_flag) {
-			break;
-		}
-		cv.wait(lock);
-	}
+	// excl.reset();
+
+	unique_lock<mutex> lock(mtx);
+	while (tasks_running) {
+		auto cv_rv = cv.wait_for(lock, chrono::duration<double>(1));
+		if (cv_rv == cv_status::timeout) {
+			// TaskMaster::print_tasks(cerr);
+			for (auto i : pings) {
+				cerr << i << " ";
+			}
+			cerr << endl << "tasks_running = " << tasks_running << endl;
+			cerr << "lock_success_count " << lock_success_count << endl;
+			cerr << "lock_failure_count " << lock_failure_count << endl;
+		}
+	}
+	cerr << "lock_success_count " << lock_success_count << endl;
+	cerr << "lock_failure_count " << lock_failure_count << endl;
+
+	bool oops = false;
+	for (size_t c = 0; c < pings.size(); ++c) {
+		if (pings[c] != 1) {
+			cerr << "pings[" << c << "] = " << pings[c] << endl;
+			oops = true;
+		}
+	}
+	if (oops) {
+		assert(!"Pings not OK");
+	} else {
+		cerr << "Pings OK" << endl;
+	}
 }
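The rewritten test exercises the non-blocking lock protocol: a task that fails try_lock() does not park a worker thread; it registers itself on the Exclusion and returns, to be re-run when the holder releases. The pings[c] == 1 check then proves every task eventually executed its critical section exactly once. The protocol from a task's point of view, extracted from the lambda above (semantics of try_lock()/insert_task() as exhibited by the test):

	#include "crucible/task.h"

	using namespace crucible;

	static void guarded_task_body(const shared_ptr<Exclusion> &excl) {
		auto lock = excl->try_lock();
		if (!lock) {
			// Lock is busy: queue this task for re-execution on release,
			// then give the worker thread back to the pool.
			excl->insert_task(Task::current_task());
			return;
		}
		// ...exclusive work here; the lock releases at end of scope...
	}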