build: include localconf everywhere

Overriding makeflags did not work from localconf in the src, lib, or test directories. Signed-off-by: Zygo Blaxell <bees@furryterror.org>
roots: make it build with clang
2025-08-02 13:53:28 +02:00 · 2021-02-08 12:52:45 -05:00 · 2021-02-08 12:49:48 -05:00 · 2021-02-08 12:49:42 -05:00 · 2021-02-08 12:49:40 -05:00 · 2021-02-08 12:49:38 -05:00
70 changed files with 4482 additions and 2460 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 *.[ao]
 *.bak
 *.new
+*.dep
 *.so*
 Doxyfile
 README.html
@@ -10,3 +11,7 @@ html/
 latex/
 make.log
 make.log.new
+localconf
+lib/configure.h
+scripts/beesd
+scripts/beesd@.service
--- a/Defines.mk
+++ b/Defines.mk
@@ -0,0 +1,8 @@
+MAKE += PREFIX=$(PREFIX) LIBEXEC_PREFIX=$(LIBEXEC_PREFIX) ETC_PREFIX=$(ETC_PREFIX)
+
+define TEMPLATE_COMPILER =
+sed $< >$@ \
+		-e's#@PREFIX@#$(PREFIX)#' \
+		-e's#@ETC_PREFIX@#$(ETC_PREFIX)#' \
+		-e's#@LIBEXEC_PREFIX@#$(LIBEXEC_PREFIX)#'
+endef
--- a/80
+++ b/80
@@ -1,19 +1,83 @@
-default install all: lib src test README.html
+PREFIX ?= /usr
+ETC_PREFIX ?= /etc
+LIBDIR ?= lib

-clean:
-	git clean -dfx
+LIB_PREFIX ?= $(PREFIX)/$(LIBDIR)
+LIBEXEC_PREFIX ?= $(LIB_PREFIX)/bees

-.PHONY: lib src
+SYSTEMD_SYSTEM_UNIT_DIR ?= $(shell pkg-config systemd --variable=systemdsystemunitdir)

-lib:
-	$(MAKE) -C lib
+MARKDOWN := $(firstword $(shell type -P markdown markdown2 markdown_py 2>/dev/null || echo markdown))

+BEES_VERSION ?= $(shell git describe --always --dirty || echo UNKNOWN)
+
+# allow local configuration to override above variables
+-include localconf
+
+DEFAULT_MAKE_TARGET ?= reallyall
+
+ifeq ($(DEFAULT_MAKE_TARGET),reallyall)
+	RUN_INSTALL_TESTS = test
+endif
+
+include Defines.mk
+
+default: $(DEFAULT_MAKE_TARGET)
+
+all: lib src scripts
+docs: README.html
+reallyall: all docs test
+
+clean: ## Cleanup
+	git clean -dfx -e localconf
+
+.PHONY: lib src test
+
+lib: ## Build libs
+	$(MAKE) TAG="$(BEES_VERSION)" -C lib
+
+src: ## Build bins
 src: lib
-	$(MAKE) -C src
+	$(MAKE) BEES_VERSION="$(BEES_VERSION)" -C src

+test: ## Run tests
 test: lib src
 	$(MAKE) -C test

+scripts/%: scripts/%.in
+	$(TEMPLATE_COMPILER)
+
+scripts: scripts/beesd scripts/beesd@.service
+
 README.html: README.md
-	markdown README.md > README.html.new
+	$(MARKDOWN) README.md > README.html.new
 	mv -f README.html.new README.html
+
+install_libs: lib
+	install -Dm644 lib/libcrucible.so $(DESTDIR)$(LIB_PREFIX)/libcrucible.so
+
+install_tools: ## Install support tools + libs
+install_tools: install_libs src
+	install -Dm755 bin/fiemap $(DESTDIR)$(PREFIX)/bin/fiemap
+	install -Dm755 bin/fiewalk $(DESTDIR)$(PREFIX)/sbin/fiewalk
+
+install_bees: ## Install bees + libs
+install_bees: install_libs src $(RUN_INSTALL_TESTS)
+	install -Dm755 bin/bees	$(DESTDIR)$(LIBEXEC_PREFIX)/bees
+
+install_scripts: ## Install scipts
+install_scripts: scripts
+	install -Dm755 scripts/beesd $(DESTDIR)$(PREFIX)/sbin/beesd
+	install -Dm644 scripts/beesd.conf.sample $(DESTDIR)/$(ETC_PREFIX)/bees/beesd.conf.sample
+ifneq (SYSTEMD_SYSTEM_UNIT_DIR,)
+	install -Dm644 scripts/beesd@.service $(DESTDIR)$(SYSTEMD_SYSTEM_UNIT_DIR)/beesd@.service
+endif
+
+install: ## Install distribution
+install: install_bees install_scripts $(OPTIONAL_INSTALL_TARGETS)
+
+help: ## Show help
+	@fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/\t/'
+
+bees: reallyall
+fly: install
--- a/README.md
+++ b/README.md
@@ -1,30 +1,52 @@
 BEES
 ====

-Best-Effort Extent-Same, a btrfs deduplication daemon.
+Best-Effort Extent-Same, a btrfs dedup agent.

 About Bees
 ----------

-Bees is a daemon designed to run continuously on live file servers.
-Bees scans and deduplicates whole filesystems in a single pass instead
-of separate scan and dedup phases.  RAM usage does _not_ depend on
-unique data size or the number of input files.  Hash tables and scan
-progress are stored persistently so the daemon can resume after a reboot.
-Bees uses the Linux kernel's `dedupe_file_range` feature to ensure data
-is handled safely even if other applications concurrently modify it.
+Bees is a block-oriented userspace dedup agent designed to avoid
+scalability problems on large filesystems.

-Bees is intentionally btrfs-specific for performance and capability.
-Bees uses the btrfs `SEARCH_V2` ioctl to scan for new data without the
-overhead of repeatedly walking filesystem trees with the POSIX API.
-Bees uses `LOGICAL_INO` and `INO_PATHS` to leverage btrfs's existing
-metadata instead of building its own redundant data structures.
-Bees can cope with Btrfs filesystem compression.  Bees can reassemble
-Btrfs extents to deduplicate extents that contain a mix of duplicate
-and unique data blocks.
+Bees is designed to degrade gracefully when underprovisioned with RAM.
+Bees does not use more RAM or storage as filesystem data size increases.
+The dedup hash table size is fixed at creation time and does not change.
+The effective dedup block size is dynamic and adjusts automatically to
+fit the hash table into the configured RAM limit.  Hash table overflow
+is not implemented to eliminate the IO overhead of hash table overflow.
+Hash table entries are only 16 bytes per dedup block to keep the average
+dedup block size small.

-Bees includes a number of workarounds for Btrfs kernel bugs to (try to)
-avoid ruining your day.  You're welcome.
+Bees does not require alignment between dedup blocks or extent boundaries
+(i.e. it can handle any multiple-of-4K offset between dup block pairs).
+Bees rearranges blocks into shared and unique extents if required to
+work within current btrfs kernel dedup limitations.
+
+Bees can dedup any combination of compressed and uncompressed extents.
+
+Bees operates in a single pass which removes duplicate extents immediately
+during scan.  There are no separate scanning and dedup phases.
+
+Bees uses only data-safe btrfs kernel operations, so it can dedup live
+data (e.g. build servers, sqlite databases, VM disk images).  It does
+not modify file attributes or timestamps.
+
+Bees does not store any information about filesystem structure, so it is
+not affected by the number or size of files (except to the extent that
+these cause performance problems for btrfs in general).  It retrieves such
+information on demand through btrfs SEARCH_V2 and LOGICAL_INO ioctls.
+This eliminates the storage required to maintain the equivalents of
+these functions in userspace.  It's also why bees has no XFS support.
+
+Bees is a daemon designed to run continuously and maintain its state
+across crahes and reboots.  Bees uses checkpoints for persistence to
+eliminate the IO overhead of a transactional data store.  On restart,
+bees will dedup any data that was added to the filesystem since the
+last checkpoint.
+
+Bees is used to dedup filesystems ranging in size from 16GB to 35TB, with
+hash tables ranging in size from 128MB to 11GB.

 How Bees Works
 --------------
@@ -78,18 +100,16 @@ and some metadata bits).  Each entry represents a minimum of 4K on disk.
        1TB                16MB               1024K
       64TB                 1GB               1024K

-It is possible to resize the hash table by changing the size of
-`beeshash.dat` (e.g. with `truncate`) and restarting `bees`.  This
-does not preserve all the existing hash table entries, but it does
-preserve more than zero of them--especially if the old and new sizes
-are a power-of-two multiple of each other.
+To change the size of the hash table, use 'truncate' to change the hash
+table size, delete `beescrawl.dat` so that bees will start over with a
+fresh full-filesystem rescan, and restart `bees`.

 Things You Might Expect That Bees Doesn't Have
 ----------------------------------------------

-* There's no configuration file or getopt command line option processing
-(patches welcome!).  There are some tunables hardcoded in the source
-that could eventually become configuration options.
+* There's no configuration file (patches welcome!).  There are some tunables
+hardcoded in the source that could eventually become configuration options.
+There's also an incomplete option parser (patches welcome!).

 * There's no way to *stop* the Bees daemon.  Use SIGKILL, SIGTERM, or
 Ctrl-C for now.  Some of the destructors are unreachable and have never
@@ -114,11 +134,6 @@ performance by caching, but really fixing this requires rewriting the
 crawler to scan the btrfs extent tree directly instead of the subvol
 FS trees.

-* Bees had support for multiple worker threads in the past; however,
-this was removed because it made Bees too aggressive to coexist with
-other applications on the same machine.  It also hit the *slow backrefs*
-on N CPU cores instead of just one.
-
 * Block reads are currently more allocation- and CPU-intensive than they
 should be, especially for filesystems on SSD where the IO overhead is
 much smaller.  This is a problem for power-constrained environments
@@ -129,51 +144,67 @@ blocks, but has no defragmentation capability yet.  When possible, Bees
 will attempt to work with existing extent boundaries, but it will not
 aggregate blocks together from multiple extents to create larger ones.

+* It is possible to resize the hash table without starting over with
+a new full-filesystem scan; however, this has not been implemented yet.
+
 Good Btrfs Feature Interactions
 -------------------------------

 Bees has been tested in combination with the following:

-* btrfs compression (either method), mixtures of compressed and uncompressed extents
+* btrfs compression (zlib, lzo, zstd), mixtures of compressed and uncompressed extents
 * PREALLOC extents (unconditionally replaced with holes)
 * HOLE extents and btrfs no-holes feature
 * Other deduplicators, reflink copies (though Bees may decide to redo their work)
-* btrfs snapshots and non-snapshot subvols (RW only)
+* btrfs snapshots and non-snapshot subvols (RW and RO)
 * Concurrent file modification (e.g. PostgreSQL and sqlite databases, build daemons)
-* all btrfs RAID profiles (people ask about this, but it's irrelevant)
+* all btrfs RAID profiles (people ask about this, but it's irrelevant to bees)
 * IO errors during dedup (read errors will throw exceptions, Bees will catch them and skip over the affected extent)
 * Filesystems mounted *with* the flushoncommit option
 * 4K filesystem data block size / clone alignment
-* 64-bit CPUs (amd64)
+* 64-bit and 32-bit host CPUs (amd64, x86, arm)
 * Large (>16M) extents
 * Huge files (>1TB--although Btrfs performance on such files isn't great in general)
 * filesystems up to 25T bytes, 100M+ files
-
+* btrfs receive
+* btrfs nodatacow/nodatasum inode attribute or mount option (bees skips all nodatasum files)
+* open(O_DIRECT) (seems to work as well--or as poorly--with bees as with any other btrfs feature)

 Bad Btrfs Feature Interactions
 ------------------------------

+Bees has been tested in combination with the following, and various problems are known:
+
+* bcache, lvmcache:  *severe (filesystem-destroying) metadata corruption
+  issues* observed in testing and reported by users, apparently only when
+  used with bees.  Plain SSD and HDD seem to be OK.
+* btrfs send:  sometimes aborts with an I/O error when bees changes the
+  data layout during a send.  The send can be restarted and will work
+  if bees has finished processing the snapshot being sent.  No data
+  corruption observed other than the truncated send.
+* btrfs qgroups:  very slow, sometimes hangs
+* btrfs autodefrag mount option:  hangs and high CPU usage problems
+  reported by users.  bees cannot distinguish autodefrag activity from
+  normal filesystem activity and will likely try to undo the autodefrag,
+  so it should probably be turned off for bees in any case.
+
+Untested Btrfs Feature Interactions
+-----------------------------------
+
 Bees has not been tested with the following, and undesirable interactions may occur:

 * Non-4K filesystem data block size (should work if recompiled)
-* 32-bit CPUs (x86, arm)
 * Non-equal hash (SUM) and filesystem data block (CLONE) sizes (probably never will work)
-* btrfs read-only snapshots (never tested, probably wouldn't work well)
-* btrfs send/receive (receive is probably OK, but send requires RO snapshots.  See above)
-* btrfs qgroups (never tested, no idea what might happen)
 * btrfs seed filesystems (does anyone even use those?)
-* btrfs autodefrag mount option (never tested, could fight with Bees)
-* btrfs nodatacow mount option or inode attribute (*could* work, but might not)
 * btrfs out-of-tree kernel patches (e.g. in-band dedup or encryption)
-* btrfs-convert from ext2/3/4 (never tested)
+* btrfs-convert from ext2/3/4 (never tested, might run out of space or ignore significant portions of the filesystem due to sanity checks)
 * btrfs mixed block groups (don't know a reason why it would *not* work, but never tested)
-* open(O_DIRECT)
-* Filesystems mounted *without* the flushoncommit option
+* Filesystems mounted *without* the flushoncommit option (don't know the impact of crashes during dedup writes vs. ordinary writes)

 Other Caveats
 -------------

-* btrfs balance will invalidate parts of the dedup table.  Bees will
+* btrfs balance will invalidate parts of the dedup hash table.  Bees will
  happily rebuild the table, but it will have to scan all the blocks
  again.

@@ -184,41 +215,100 @@ Other Caveats

 * Bees creates temporary files (with O_TMPFILE) and uses them to split
  and combine extents elsewhere in btrfs.  These will take up to 2GB
-  during normal operation.
+  of disk space per thread during normal operation.

 * Like all deduplicators, Bees will replace data blocks with metadata
-  references.  It is a good idea to ensure there are several GB of
-  unallocated space (see `btrfs fi df`) on the filesystem before running
-  Bees for the first time.  Use
+  references.  It is a good idea to ensure there is sufficient unallocated
+  space (see `btrfs fi usage`) on the filesystem to allow the metadata
+  to multiply in size by the number of snapshots before running Bees
+  for the first time.  Use

-        btrfs balance start -dusage=100,limit=1 /your/filesystem
+        btrfs balance start -dusage=100,limit=N /your/filesystem

-  If possible, raise the `limit` parameter to the current size of metadata
-  usage (from `btrfs fi df`) plus 1.
+  where the `limit` parameter 'N' should be calculated as follows:
+
+	* start with the current size of metadata usage (from `btrfs fi
+	  df`) in GB, plus 1
+
+	* multiply by the proportion of disk space in subvols with
+	  snapshots (i.e. if there are no snapshots, multiply by 0;
+	  if all of the data is shared between at least one origin
+	  and one snapshot subvol, multiply by 1)
+
+	* multiply by the number of snapshots (i.e. if there is only
+	  one subvol, multiply by 0; if there are 3 snapshots and one
+	  origin subvol, multiply by 3)
+
+  `limit = GB_metadata * (disk_space_in_snapshots / total_disk_space) * number_of_snapshots`
+
+  Monitor unallocated space to ensure that the filesystem never runs out
+  of metadata space (whether Bees is running or not--this is a general
+  btrfs requirement).


 A Brief List Of Btrfs Kernel Bugs
 ---------------------------------

-Fixed bugs:
+Missing features (usually not available in older LTS kernels):

 * 3.13: `FILE_EXTENT_SAME` ioctl added.  No way to reliably dedup with
  concurrent modifications before this.
 * 3.16: `SEARCH_V2` ioctl added.  Bees could use `SEARCH` instead.
 * 4.2: `FILE_EXTENT_SAME` no longer updates mtime, can be used at EOF.
-  Kernel deadlock bugs fixed.
+
+Future features (kernel features Bees does not yet use, but may rely on
+in the future):
+
+* 4.14: `LOGICAL_INO_V2` allows userspace to create forward and backward
+  reference maps to entire physical extents with a single ioctl call,
+  and raises the limit of 2730 references per extent.  Bees has not yet
+  been rewritten to take full advantage of these features.
+
+Bug fixes (sometimes included in older LTS kernels):
+
+* Bugs fixed prior to 4.4.107 are not listed here.
+* 4.5: hang in the `INO_PATHS` ioctl used by Bees.
+* 4.5: use-after-free in the `FILE_EXTENT_SAME` ioctl used by Bees.
+* 4.6: lost inodes after a rename, crash, and log tree replay
+  (triggered by the fsync() while writing `beescrawl.dat`).
 * 4.7: *slow backref* bug no longer triggers a softlockup panic.  It still
-  too long to resolve a block address to a root/inode/offset triple.
+  takes too long to resolve a block address to a root/inode/offset triple.
+* 4.10: reduced CPU time cost of the LOGICAL_INO ioctl and dedup
+  backref processing in general.
+* 4.11: yet another dedup deadlock case is fixed.  Alas, it is not the
+  last one.
+* 4.14: backref performance improvements make LOGICAL_INO even faster
+  in the worst cases (but possibly slower in the best cases?).
+* 4.14.29: WARN_ON(ref->count < 0) in fs/btrfs/backref.c triggers
+  almost once per second.  The WARN_ON is incorrect and can be removed.

-Unfixed kernel bugs (as of 4.5.7) with workarounds in Bees:
+Unfixed kernel bugs (as of 4.14.34) with workarounds in Bees:

-* *slow backref*: If the number of references to a single shared extent
-  within a single file grows above a few thousand, the kernel consumes CPU
-  for up to 40 uninterruptible minutes while holding various locks that
-  block access to the filesystem.  Bees avoids this bug by measuring the
-  time the kernel spends performing certain operations and permanently
-  blacklisting any extent or hash where the kernel starts to get slow.
-  Inside Bees, such blocks are marked as 'toxic' hash/block addresses.
+* *Deadlocks* in the kernel dedup ioctl when files are modified
+  immediately before dedup.  `BeesTempFile::make_copy` calls `fsync()`
+  immediately before dedup to work around this.  If the `fsync()` is
+  removed, the filesystem hangs within a few hours, requiring a reboot
+  to recover.  Even with the `fsync()`, it is possible to lose the
+  kernel race condition and encounter a deadlock within a machine-year.
+  VM image workloads may trigger this faster.  Over the past years
+  several specific deadlock cases have been fixed, but at least one
+  remains.
+
+* *Bad interactions* with other Linux block layers:  bcache and lvmcache
+  can fail spectacularly, and apparently only while running bees.
+  This is definitely a kernel bug, either in btrfs or the lower block
+  layers.  Avoid using bees with these tools, or test very carefully
+  before deployment.
+
+* *slow backrefs* (aka toxic extents): If the number of references to a
+  single shared extent within a single file grows above a few thousand,
+  the kernel consumes CPU for minutes at a time while holding various
+  locks that block access to the filesystem.  Bees avoids this bug by
+  measuring the time the kernel spends performing certain operations
+  and permanently blacklisting any extent or hash where the kernel
+  starts to get slow.  Inside Bees, such blocks are marked as 'toxic'
+  hash/block addresses.  Linux kernel v4.14 is better but can still
+  have problems.

 * `LOGICAL_INO` output is arbitrarily limited to 2730 references
  even if more buffer space is provided for results.  Once this number
@@ -229,86 +319,140 @@ Unfixed kernel bugs (as of 4.5.7) with workarounds in Bees:
  This places an obvious limit on dedup efficiency for extremely common
  blocks or filesystems with many snapshots (although this limit is
  far greater than the effective limit imposed by the *slow backref* bug).
+  *Fixed in v4.14.*
+
+* `LOGICAL_INO` on compressed extents returns a list of root/inode/offset
+  tuples matching the extent bytenr of its argument.  On uncompressed
+  extents, any r/i/o tuple whose extent offset does not match the
+  argument's extent offset is discarded, i.e. only the single 4K block
+  matching the argument is returned, so a complete map of the extent
+  references requires calling `LOGICAL_INO` for every single block of
+  the extent.  This is undesirable behavior for Bees, which wants a
+  list of all extent refs referencing a data extent (i.e. Bees wants
+  the compressed-extent behavior in all cases).  *Fixed in v4.14.*

 * `FILE_EXTENT_SAME` is arbitrarily limited to 16MB.  This is less than
  128MB which is the maximum extent size that can be created by defrag
  or prealloc.  Bees avoids feedback loops this can generate while
  attempting to replace extents over 16MB in length.

-* `DEFRAG_RANGE` is useless.  The ioctl attempts to implement `btrfs
-  fi defrag` in the kernel, and will arbitrarily defragment more or
-  less than the range requested to match the behavior expected from the
-  userspace tool.  Bees implements its own defrag instead, copying data
-  to a temporary file and using the `FILE_EXTENT_SAME` ioctl to replace
-  precisely the specified range of offending fragmented blocks.
+* **Systems with many CPU cores** may [lock up when bees runs with one
+  worker thread for every core](https://github.com/Zygo/bees/issues/91).
+  bees limits the number of threads it will try to create based on
+  detected CPU core count.  Users may override this limit with the
+  [`--thread-count` option](options.md).

-* When writing BeesStringFile, a crash can cause the directory entry
-  `beescrawl.UUID.dat.tmp` to exist without a corresponding inode.
-  This directory entry cannot be renamed or removed; however, it does
-  not prevent the creation of a second directory entry with the same
-  name that functions normally, so it doesn't prevent Bees operation.
-
-  The orphan directory entry can be removed by deleting its subvol,
-  so place BEESHOME on a separate subvol so you can delete these orphan
-  directory entries when they occur (or use btrfs zero-log before mounting
-  the filesystem after a crash).
-
-* If the fsync() BeesTempFile::make_copy is removed, the filesystem
-  hangs within a few hours, requiring a reboot to recover.
-
-Not really a bug, but a gotcha nonetheless:
+Not really bugs, but gotchas nonetheless:

 * If a process holds a directory FD open, the subvol containing the
  directory cannot be deleted (`btrfs sub del` will start the deletion
  process, but it will not proceed past the first open directory FD).
  `btrfs-cleaner` will simply skip over the directory *and all of its
  children* until the FD is closed.  Bees avoids this gotcha by closing
-  all of the FDs in its directory FD cache every 15 minutes.
+  all of the FDs in its directory FD cache every 10 btrfs transactions.

+* If a file is deleted while Bees is caching an open FD to the file,
+  Bees continues to scan the file.  For very large files (e.g. VM
+  images), the deletion of the file can be delayed indefinitely.
+  To limit this delay, Bees closes all FDs in its file FD cache every
+  10 btrfs transactions.

+* If a snapshot is deleted, bees will generate a burst of exceptions
+  for references to files in the snapshot that no longer exist.  This
+  lasts until the FD caches are cleared.

-Requirements
+Installation
+============
+
+Bees can be installed by following one these instructions:
+
+Arch package
 ------------

-* C++11 compiler (tested with GCC 4.9)
+Bees is available in Arch Linux AUR. Install with:

-  Sorry.  I really like closures.
+`$ pacaur -S bees-git`

-* btrfs-progs (tested with 4.1..4.7)
+Gentoo ebuild
+-------------
+
+Bees is available as a Gentoo ebuild. Just copy `bees-9999.ebuild` from
+`contrib/gentoo` including the `files` subdirectory to your local
+overlay category `sys-fs`.
+
+You can copy the ebuild to match a Bees version number, and it will
+build that tagged version. It is partly supported since v0.5,
+previous versions won't work.
+
+Build from source
+-----------------
+
+Build with `make`. The build produces `bin/bees` and `lib/libcrucible.so`,
+which must be copied to somewhere in `$PATH` and `$LD_LIBRARY_PATH`
+on the target system respectively.
+
+It will also generate `scripts/beesd@.service` for systemd users. This
+service makes use of a helper script `scripts/beesd` to boot the service.
+Both of the latter use the filesystem UUID to mount the root subvolume
+within a temporary runtime directory.
+
+### Ubuntu 16.04 - 17.04:
+`$ apt -y install build-essential btrfs-tools uuid-dev markdown && make`
+
+### Ubuntu 14.04:
+You can try to carry on the work done here: https://gist.github.com/dagelf/99ee07f5638b346adb8c058ab3d57492
+
+Packaging
+---------
+
+See 'Dependencies' below. Package maintainers can pick ideas for building and
+configuring the source package from the Gentoo ebuild in `contrib/gentoo`.
+You can configure some build options by creating a file `localconf` and
+adjust settings for your distribution environment there.
+
+Please also review the Makefile for additional hints.
+
+Dependencies
+------------
+
+* C++11 compiler (tested with GCC 4.9, 6.2.0, 8.1.0)
+
+  Sorry.  I really like closures and shared_ptr, so support
+  for earlier compiler versions is unlikely.
+
+* btrfs-progs (tested with 4.1..4.15.1) or libbtrfs-dev
+  (tested with version 4.16.1)

  Needed for btrfs.h and ctree.h during compile.
-  Not needed at runtime.
+  Also needed by the service wrapper script.

 * libuuid-dev

-  TODO: remove the one function used from this library.
-  It supports a feature Bees no longer implements.
+  This library is only required for a feature that was removed after v0.1.
+  The lingering support code can be removed.

-* Linux kernel 4.2 or later
+* Linux kernel version: *minimum* 4.4.107, *4.14.29 or later recommended*

-  Don't bother trying to make Bees work with older kernels.
-  It won't end well.
+  Don't bother trying to make Bees work with kernel versions older than
+  4.4.107.  It may appear to work, but it won't end well:  there are
+  too many missing features and bugs (including data corruption bugs)
+  to work around in older kernels.

-* 64-bit host and target CPU
+  Kernel versions between 4.4.107 and 4.14.29 are usable with bees,
+  but bees can trigger known performance bugs and hangs in dedup-related
+  functions.

-  This code has never been tested on a 32-bit target CPU.
+* markdown

-  A 64-bit host CPU may be required for the self-tests.
-  Some of the ioctls don't work properly with a 64-bit
-  kernel and 32-bit userspace.
-
-Build
-----
-
-Build with `make`.
-
-The build produces `bin/bees` and `lib/libcrucible.so`, which must be
-copied to somewhere in `$PATH` and `$LD_LIBRARY_PATH` on the target
-system respectively.
+* util-linux version that provides `blkid` command for the helper
+  script `scripts/beesd` to work

 Setup
 -----

+If you don't want to use the helper script `scripts/beesd` to setup and
+configure bees, here's how you manually setup bees.
+
 Create a directory for bees state files:

        export BEESHOME=/some/path
@@ -320,57 +464,128 @@ of 16M).  This example creates a 1GB hash table:
        truncate -s 1g "$BEESHOME/beeshash.dat"
        chmod 700 "$BEESHOME/beeshash.dat"

-Configuration
-------------
-
-The only runtime configurable options are environment variables:
-
-* BEESHOME: Directory containing Bees state files:
- * beeshash.dat         | persistent hash table (must be a multiple of 16M)
- * beescrawl.`UUID`.dat | state of SEARCH_V2 crawlers
- * beesstats.txt        | statistics and performance counters
-* BEESSTATS: File containing a snapshot of current Bees state (performance
-  counters and current status of each thread).
-
-Other options (e.g. interval between filesystem crawls) can be configured
-in src/bees.h.
-
-Running
-------
-
-We created this directory in the previous section:
-
-        export BEESHOME=/some/path
-
-Use a tmpfs for BEESSTATUS, it updates once per second:
-
-        export BEESSTATUS=/run/bees.status
-
 bees can only process the root subvol of a btrfs (seriously--if the
 argument is not the root subvol directory, Bees will just throw an
 exception and stop).

 Use a bind mount, and let only bees access it:

-        mount -osubvol=/ /dev/<your-filesystem> /var/lib/bees/root
+	UUID=3399e413-695a-4b0b-9384-1b0ef8f6c4cd
+	mkdir -p /var/lib/bees/$UUID
+	mount /dev/disk/by-uuid/$UUID /var/lib/bees/$UUID -osubvol=/

-Reduce CPU and IO priority to be kinder to other applications
-sharing this host (or raise them for more aggressive disk space
-recovery).  If you use cgroups, put `bees` in its own cgroup, then reduce
-the `blkio.weight` and `cpu.shares` parameters.  You can also use
-`schedtool` and `ionice` in the shell script that launches `bees`:
+If you don't set BEESHOME, the path ".beeshome" will be used relative
+to the root subvol of the filesystem.  For example:
+
+	btrfs sub create /var/lib/bees/$UUID/.beeshome
+	truncate -s 1g /var/lib/bees/$UUID/.beeshome/beeshash.dat
+	chmod 700 /var/lib/bees/$UUID/.beeshome/beeshash.dat
+
+You can use any relative path in BEESHOME.  The path will be taken
+relative to the root of the deduped filesystem (in other words it can
+be the name of a subvol):
+
+	export BEESHOME=@my-beeshome
+	btrfs sub create /var/lib/bees/$UUID/$BEESHOME
+	truncate -s 1g /var/lib/bees/$UUID/$BEESHOME/beeshash.dat
+	chmod 700 /var/lib/bees/$UUID/$BEESHOME/beeshash.dat
+
+Configuration
+-------------
+
+There are some runtime configurable options using environment variables:
+
+* BEESHOME: Directory containing Bees state files:
+ * beeshash.dat  | persistent hash table.  Must be a multiple of 16M.
+                   This contains 16-byte records:  8 bytes for CRC64,
+                   8 bytes for physical address and some metadata bits.
+ * beescrawl.dat | state of SEARCH_V2 crawlers.  ASCII text.
+ * beesstats.txt | statistics and performance counters.  ASCII text.
+* BEESSTATUS: File containing a snapshot of current Bees state:  performance
+  counters and current status of each thread.  The file is meant to be
+  human readable, but understanding it probably requires reading the source.
+  You can watch bees run in realtime with a command like:
+
+	watch -n1 cat $BEESSTATUS
+
+Other options (e.g. interval between filesystem crawls) can be configured
+in src/bees.h or on the cmdline (see 'Command Line Options' below).
+
+Running
+-------
+
+Reduce CPU and IO priority to be kinder to other applications sharing
+this host (or raise them for more aggressive disk space recovery).  If you
+use cgroups, put `bees` in its own cgroup, then reduce the `blkio.weight`
+and `cpu.shares` parameters.  You can also use `schedtool` and `ionice`
+in the shell script that launches `bees`:

        schedtool -D -n20 $$
        ionice -c3 -p $$

 Let the bees fly:

-        bees /var/lib/bees/root >> /var/log/bees.log 2>&1
+	for fs in /var/lib/bees/*-*-*-*-*/; do
+		bees "$fs" >> "$fs/.beeshome/bees.log" 2>&1 &
+	done

 You'll probably want to arrange for /var/log/bees.log to be rotated
 periodically.  You may also want to set umask to 077 to prevent disclosure
 of information about the contents of the filesystem through the log file.

+There are also some shell wrappers in the `scripts/` directory.
+
+
+
+Command Line Options
+--------------------
+
+* --thread-count (-c) COUNT
+  * Specify maximum number of worker threads for scanning.  Overrides
+    --thread-factor (-C) and default/autodetected values,
+    and the hardcoded thread limit.
+* --thread-factor (-C) FACTOR
+  * Specify ratio of worker threads to CPU cores.  Overridden by --thread-count (-c).
+    Default is 1.0, i.e. 1 worker thread per detected CPU.  Use values
+    below 1.0 to leave some cores idle, or above 1.0 if there are more
+    disks than CPUs in the filesystem.
+    If the computed thread count is higher than `BEES_DEFAULT_THREAD_LIMIT`
+    (currently 8), then only that number of threads will be created.
+    This limit can be overridden by the `--thread-count` option; however,
+    be aware that there are kernel issues with systems that have many CPU
+    cores when users try to run bees on all of them.
+* --loadavg-target (-g) LOADAVG
+  * Specify load average target for dynamic worker threads.
+    Threads will be started or stopped subject to the upper limit imposed
+    by thread-factor, thread-min and thread-count until the load average
+    is within +/- 0.5 of LOADAVG.
+* --thread-min (-G) COUNT
+  * Specify minimum number of worker threads for scanning.
+    Ignored unless -g option is used to specify a target load.
+
+* --scan-mode (-m) MODE
+  * Specify extent scanning algorithm.  Default mode is 0.
+    _EXPERIMENTAL_ feature that may go away.
+    * Mode 0: scan extents in ascending order of (inode, subvol, offset).
+      Keeps shared extents between snapshots together.  Reads files sequentially.
+      Minimizes temporary space usage.
+    * Mode 1: scan extents from all subvols in parallel.  Good performance
+      on non-spinning media when subvols are unrelated.
+    * Mode 2: scan all extents from one subvol at a time.  Good sequential
+      read performance for spinning media.  Maximizes temporary space usage.
+
+* --timestamps (-t)
+  * Enable timestamps in log output.
+* --no-timestamps (-T)
+  * Disable timestamps in log output.
+* --absolute-paths (-p)
+  * Paths in log output will be absolute.
+* --strip-paths (-P)
+  * Paths in log output will have the working directory at Bees startup
+    stripped.
+* --verbose (-v)
+  * Set log verbosity (0 = no output, 8 = all output, default 8).
+

 Bug Reports and Contributions
 -----------------------------
@@ -386,6 +601,6 @@ You can also use Github:
 Copyright & License
 ===================

-Copyright 2015-2016 Zygo Blaxell <bees@furryterror.org>.
+Copyright 2015-2017 Zygo Blaxell <bees@furryterror.org>.

 GPL (version 3 or later).
--- a/contrib/gentoo-bees/metadata/layout.conf
+++ b/contrib/gentoo-bees/metadata/layout.conf
@@ -0,0 +1,18 @@
+# manifest-hashes specify hashes used for new/updated entries
+# the current set went live on 2017-11-21, per 2017-11-12 Council meeting
+# https://archives.gentoo.org/gentoo-dev/message/ba2e5d9666ebd7e1bff1143485a37856
+manifest-hashes = BLAKE2B SHA512
+
+# The following hashes are required on all Manifest entries. If any
+# of them are missing, repoman will refetch and rehash old distfiles.
+# Otherwise, old distfiles will keep using their current hash set.
+manifest-required-hashes = BLAKE2B
+
+# No more old ChangeLogs in Git
+update-changelog = false
+
+# Sign Git commits, and NOT Manifests
+sign-commits = true
+sign-manifests = false
+
+masters = gentoo
--- a/contrib/gentoo-bees/profiles/repo_name
+++ b/contrib/gentoo-bees/profiles/repo_name
@@ -0,0 +1 @@
+bees
--- a/contrib/gentoo-bees/sys-fs/bees/Manifest
+++ b/contrib/gentoo-bees/sys-fs/bees/Manifest
@@ -0,0 +1,2 @@
+EBUILD bees-9999.ebuild 2001 BLAKE2B 7fa1c9d043a4334579dfad3560d1593717e548c0d31695cf8ccf8ffe45f2347584c7da43b47cad873745f3c843207433c6b892a0469c5618f107c68f78fd5fe2 SHA512 d49266e007895c049e1c9f7e28ec2f649b386a6441eccba02ee411f14ad395925eecdaa8a747962ccc526f9e1d3aba9fd68f4452a1d276d4e5b7d48c80102cd8
+MISC metadata.xml 479 BLAKE2B ef5e110ba8d88f0188dbc0d12bec2ad45c51abf707656f6fe4e0fa498d933fe9c32c5dc4c9b446402ec686084459f9f075e52f33402810962c1ac6b149fb70c8 SHA512 3fcc136ed4c55323cac4f8cf542210eb77f73e2a80f95fcce2d688bc645f6e5126404776536dedc938b18287b54abbc264610cc2f587a42a3a8e6d7bf8415aaa
--- a/contrib/gentoo-bees/sys-fs/bees/bees-9999.ebuild
+++ b/contrib/gentoo-bees/sys-fs/bees/bees-9999.ebuild
@@ -0,0 +1,66 @@
+# Copyright 1999-2018 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+EAPI=7
+
+inherit linux-info
+
+DESCRIPTION="Best-Effort Extent-Same, a btrfs dedup agent"
+HOMEPAGE="https://github.com/Zygo/bees"
+
+if [[ ${PV} == "9999" ]] ; then
+	EGIT_REPO_URI="https://github.com/Zygo/bees.git"
+	inherit git-r3
+else
+	SRC_URI="https://github.com/Zygo/bees/archive/v${PV}.tar.gz -> ${P}.tar.gz"
+	KEYWORDS="~amd64"
+fi
+
+LICENSE="GPL-3"
+SLOT="0"
+IUSE="tools"
+
+DEPEND="
+	>=sys-apps/util-linux-2.30.2
+	>=sys-fs/btrfs-progs-4.1
+"
+RDEPEND="${DEPEND}"
+
+CONFIG_CHECK="~BTRFS_FS"
+ERROR_BTRFS_FS="CONFIG_BTRFS_FS: bees does currently only work with btrfs"
+
+pkg_pretend() {
+	if [[ ${MERGE_TYPE} != buildonly ]]; then
+		if kernel_is -lt 4 4 3; then
+			ewarn "Kernel versions below 4.4.3 lack critical features needed for bees to"
+			ewarn "properly operate, so it won't work. It's recommended to run at least"
+			ewarn "kernel version 4.11 for best performance and reliability."
+			ewarn
+		elif kernel_is -lt 4 11; then
+			ewarn "With kernel versions below 4.11, bees may severely degrade system performance"
+			ewarn "and responsiveness. Especially, the kernel may deadlock while bees is"
+			ewarn "running, it's recommended to run at least kernel 4.11."
+			ewarn
+		elif kernel_is -lt 4 14 29; then
+			ewarn "With kernel versions below 4.14.29, bees may generate a lot of bogus WARN_ON()"
+			ewarn "messages in the kernel log. These messages can be ignored and this is fixed"
+			ewarn "with more recent kernels:"
+			ewarn "# WARNING: CPU: 3 PID: 18172 at fs/btrfs/backref.c:1391 find_parent_nodes+0xc41/0x14e0"
+			ewarn
+		fi
+		elog "Bees recommends to run the latest current kernel for performance and"
+		elog "reliability reasons, see README.md."
+	fi
+}
+
+src_configure() {
+	cat >localconf <<-EOF || die
+		LIBEXEC_PREFIX=/usr/libexec
+		PREFIX=/usr
+		LIBDIR=$(get_libdir)
+		DEFAULT_MAKE_TARGET=all
+	EOF
+	if use tools; then
+		echo OPTIONAL_INSTALL_TARGETS=install_tools >>localconf || die
+	fi
+}
--- a/contrib/gentoo-bees/sys-fs/bees/metadata.xml
+++ b/contrib/gentoo-bees/sys-fs/bees/metadata.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE pkgmetadata SYSTEM "http://www.gentoo.org/dtd/metadata.dtd">
+<pkgmetadata>
+<maintainer type="person">
+	<email>hurikhan77+bgo@gmail.com</email>
+	<name>Kai Krakow</name>
+</maintainer>
+<use>
+	<flag name="tools">Build extra tools useful for debugging (fiemap, feiwalk, beestop)</flag>
+</use>
+<upstream>
+	<bugs-to>https://github.com/Zygo/bees/issues</bugs-to>
+	<remote-id type="github">Zygo/bees</remote-id>
+</upstream>
+</pkgmetadata>
--- a/include/crucible/bool.h
+++ b/include/crucible/bool.h
@@ -1,13 +0,0 @@
-#ifndef CRUCIBLE_BOOL_H
-#define CRUCIBLE_BOOL_H
-
-namespace crucible {
-	struct DefaultBool {
-		bool m_b;
-		DefaultBool(bool init = false) : m_b(init) {}
-		operator bool() const { return m_b; }
-		bool &operator=(const bool &that) { return m_b = that; }
-	};
-}
-
-#endif // CRUCIBLE_BOOL_H
--- a/include/crucible/btrfs.h
+++ b/include/crucible/btrfs.h
@@ -23,6 +23,7 @@
 #undef min
 #undef max
 #undef mutex
+#undef swap

 #ifndef BTRFS_FIRST_FREE_OBJECTID

@@ -130,7 +131,7 @@
 	};

 #endif
-	 
+
 #ifndef BTRFS_IOC_CLONE_RANGE

 	struct btrfs_ioctl_clone_range_args {
--- a/include/crucible/cache.h
+++ b/include/crucible/cache.h
@@ -8,6 +8,7 @@
 #include <map>
 #include <mutex>
 #include <tuple>
+#include <vector>

 namespace crucible {
 	using namespace std;
@@ -17,17 +18,27 @@ namespace crucible {
 	public:
 		using Key = tuple<Arguments...>;
 		using Func = function<Return(Arguments...)>;
-		using Time = unsigned;
-		using Value = pair<Time, Return>;
 	private:
+		struct Value {
+			Value *fp = nullptr;
+			Value *bp = nullptr;
+			Key key;
+			Return ret;
+			Value(Key k, Return r) : key(k), ret(r) { }
+			// Crash early!
+			~Value() { fp = bp = nullptr; };
+		};
+
 		Func		m_fn;
-		Time		m_ctr;
 		map<Key, Value>	m_map;
 		LockSet<Key>	m_lockset;
 		size_t		m_max_size;
 		mutex		m_mutex;
+		Value		*m_last = nullptr;

 		void check_overflow();
+		void move_to_front(Value *vp);
+		void erase_one(Value *vp);
 	public:
 		LRUCache(Func f = Func(), size_t max_size = 100);

@@ -45,26 +56,81 @@ namespace crucible {
 	template <class Return, class... Arguments>
 	LRUCache<Return, Arguments...>::LRUCache(Func f, size_t max_size) :
 		m_fn(f),
-		m_ctr(0),
 		m_max_size(max_size)
 	{
 	}

+	template <class Return, class... Arguments>
+	void
+	LRUCache<Return, Arguments...>::erase_one(Value *vp)
+	{
+		THROW_CHECK0(invalid_argument, vp);
+		Value *vp_bp = vp->bp;
+		THROW_CHECK0(runtime_error, vp_bp);
+		Value *vp_fp = vp->fp;
+		THROW_CHECK0(runtime_error, vp_fp);
+		vp_fp->bp = vp_bp;
+		vp_bp->fp = vp_fp;
+		// If we delete the head of the list then advance the head by one
+		if (vp == m_last) {
+			// If the head of the list is also the tail of the list then clear m_last
+			if (vp_fp == m_last) {
+				m_last = nullptr;
+			} else {
+				m_last = vp_fp;
+			}
+		}
+		m_map.erase(vp->key);
+		if (!m_last) {
+			THROW_CHECK0(runtime_error, m_map.empty());
+		} else {
+			THROW_CHECK0(runtime_error, !m_map.empty());
+		}
+	}
+
 	template <class Return, class... Arguments>
 	void
 	LRUCache<Return, Arguments...>::check_overflow()
 	{
-		if (m_map.size() <= m_max_size) return;
-		vector<pair<Key, Time>> map_contents;
-		map_contents.reserve(m_map.size());
-		for (auto i : m_map) {
-			map_contents.push_back(make_pair(i.first, i.second.first));
+		while (m_map.size() >= m_max_size) {
+			THROW_CHECK0(runtime_error, m_last);
+			THROW_CHECK0(runtime_error, m_last->bp);
+			erase_one(m_last->bp);
 		}
-		sort(map_contents.begin(), map_contents.end(), [](const pair<Key, Time> &a, const pair<Key, Time> &b) {
-			return a.second < b.second;
-		});
-		for (size_t i = 0; i < map_contents.size() / 2; ++i) {
-			m_map.erase(map_contents[i].first);
+	}
+
+	template <class Return, class... Arguments>
+	void
+	LRUCache<Return, Arguments...>::move_to_front(Value *vp)
+	{
+		if (!m_last) {
+			// Create new LRU list
+			m_last = vp->fp = vp->bp = vp;
+		} else if (m_last != vp) {
+			Value *vp_fp = vp->fp;
+			Value *vp_bp = vp->bp;
+			if (vp_fp && vp_bp) {
+				// There are at least two and we are removing one that isn't m_last
+				// Connect adjacent nodes to each other (has no effect if vp is new), removing vp from list
+				vp_fp->bp = vp_bp;
+				vp_bp->fp = vp_fp;
+			} else {
+				// New insertion, both must be null
+				THROW_CHECK0(runtime_error, !vp_fp);
+				THROW_CHECK0(runtime_error, !vp_bp);
+			}
+			// Splice new node into list
+			Value *last_bp = m_last->bp;
+			THROW_CHECK0(runtime_error, last_bp);
+			// New element points to both ends of list
+			vp->fp = m_last;
+			vp->bp = last_bp;
+			// Insert vp as fp from the end of the list
+			last_bp->fp = vp;
+			// Insert vp as bp from the second from the start of the list
+			m_last->bp = vp;
+			// Update start of list
+			m_last = vp;
 		}
 	}

@@ -74,6 +140,9 @@ namespace crucible {
 	{
 		unique_lock<mutex> lock(m_mutex);
 		m_max_size = new_max_size;
+		// FIXME:  this really reduces the cache size to new_max_size - 1
+		// because every other time we call this method, it is immediately
+		// followed by insert.
 		check_overflow();
 	}

@@ -89,8 +158,11 @@ namespace crucible {
 	void
 	LRUCache<Return, Arguments...>::clear()
 	{
+		// Move the map onto the stack, then destroy it after we've released the lock.
+		decltype(m_map) new_map;
 		unique_lock<mutex> lock(m_mutex);
-		m_map.clear();
+		m_map.swap(new_map);
+		m_last = nullptr;
 	}

 	template <class Return, class... Arguments>
@@ -100,8 +172,8 @@ namespace crucible {
 		unique_lock<mutex> lock(m_mutex);
 		for (auto it = m_map.begin(); it != m_map.end(); ) {
 			auto next_it = ++it;
-			if (pred(it.second.second)) {
-				m_map.erase(it);
+			if (pred(it.second.ret)) {
+				erase_one(&it.second);
 			}
 			it = next_it;
 		}
@@ -120,7 +192,7 @@ namespace crucible {
 		if (found == m_map.end()) {
 			// No, release cache lock and acquire key lock
 			lock.unlock();
-			typename LockSet<Key>::Lock key_lock(m_lockset, k);
+			auto key_lock = m_lockset.make_lock(k);

 			// Did item appear in cache while we were waiting for key?
 			lock.lock();
@@ -129,31 +201,38 @@ namespace crucible {

 				// No, we hold key and cache locks, but item not in cache.
 				// Release cache lock and call function
-				auto ctr_copy = m_ctr++;
 				lock.unlock();
-				Value v(ctr_copy, m_fn(args...));
+
+				// Create new value
+				Value v(k, m_fn(args...));
+
+				// Reacquire cache lock
+				lock.lock();
+
+				// Make room
+				check_overflow();

 				// Reacquire cache lock and insert return value
-				lock.lock();
 				tie(found, inserted) = m_map.insert(make_pair(k, v));

 				// We hold a lock on this key so we are the ones to insert it
 				THROW_CHECK0(runtime_error, inserted);

-				// Release key lock and clean out overflow
+				// Release key lock, keep the cache lock
 				key_lock.unlock();
-				check_overflow();
+
 			}
 		}

 		// Item should be in cache now
 		THROW_CHECK0(runtime_error, found != m_map.end());

-		// We are using this object so update the timestamp
-		if (!inserted) {
-			found->second.first = m_ctr++;
-		}
-		return found->second.second;
+		// (Re)insert at head of LRU
+		move_to_front(&(found->second));
+
+		// Make copy before releasing lock
+		auto rv = found->second.ret;
+		return rv;
 	}

 	template<class Return, class... Arguments>
@@ -162,7 +241,10 @@ namespace crucible {
 	{
 		Key k(args...);
 		unique_lock<mutex> lock(m_mutex);
-		m_map.erase(k);
+		auto found = m_map.find(k);
+		if (found != m_map.end()) {
+			erase_one(&found->second);
+		}
 	}

 	template<class Return, class... Arguments>
@@ -186,35 +268,31 @@ namespace crucible {
 		if (found == m_map.end()) {
 			// No, release cache lock and acquire key lock
 			lock.unlock();
-			typename LockSet<Key>::Lock key_lock(m_lockset, k);
+			auto key_lock = m_lockset.make_lock(k);

 			// Did item appear in cache while we were waiting for key?
 			lock.lock();
 			found = m_map.find(k);
 			if (found == m_map.end()) {

+				// Make room
+				check_overflow();
+
 				// No, we hold key and cache locks, but item not in cache.
-				// Release cache lock and insert the provided return value
-				auto ctr_copy = m_ctr++;
-				Value v(ctr_copy, r);
+				// Insert the provided return value (no need to unlock here)
+				Value v(k, r);
 				tie(found, inserted) = m_map.insert(make_pair(k, v));

 				// We hold a lock on this key so we are the ones to insert it
 				THROW_CHECK0(runtime_error, inserted);
-
-				// Release key lock and clean out overflow
-				key_lock.unlock();
-				check_overflow();
 			}
 		}

 		// Item should be in cache now
 		THROW_CHECK0(runtime_error, found != m_map.end());

-		// We are using this object so update the timestamp
-		if (!inserted) {
-			found->second.first = m_ctr++;
-		}
+		// (Re)insert at head of LRU
+		move_to_front(&(found->second));
 	}
 }

--- a/include/crucible/chatter.h
+++ b/include/crucible/chatter.h
@@ -8,6 +8,8 @@
 #include <string>
 #include <typeinfo>

+#include <syslog.h>
+
  /** \brief Chatter wraps a std::ostream reference with a destructor that
      writes a newline, and inserts timestamp, pid, and tid prefixes on output.

@@ -33,18 +35,21 @@ namespace crucible {
 	using namespace std;

 	class Chatter {
+		int m_loglevel;
 		string m_name;
 		ostream &m_os;
 		ostringstream m_oss;

 	public:
-		Chatter(string name, ostream &os = cerr);
+		Chatter(int loglevel, string name, ostream &os = cerr);
 		Chatter(Chatter &&c);
 		ostream &get_os() { return m_oss; }

 		template <class T> Chatter &operator<<(const T& arg);

 		~Chatter();
+
+		static void enable_timestamp(bool prefix_timestamp);
 	};

 	template <class Argument>
@@ -86,16 +91,6 @@ namespace crucible {
 		}
 	};

-	template <>
-	struct ChatterTraits<ostream &> {
-		Chatter &
-		operator()(Chatter &c, ostream & arg)
-		{
-			c.get_os() << arg;
-			return c;
-		}
-	};
-
 	class ChatterBox {
 		string m_file;
 		int m_line;
@@ -111,7 +106,7 @@ namespace crucible {

 		template <class T> Chatter operator<<(const T &t)
 		{
-			Chatter c(m_pretty_function, m_os);
+			Chatter c(LOG_NOTICE, m_pretty_function, m_os);
 			c << t;
 			return c;
 		}
--- a/include/crucible/cleanup.h
+++ b/include/crucible/cleanup.h
@@ -0,0 +1,18 @@
+#ifndef CRUCIBLE_CLEANUP_H
+#define CRUCIBLE_CLEANUP_H
+
+#include <functional>
+
+namespace crucible {
+	using namespace std;
+
+	class Cleanup {
+		function<void()> m_cleaner;
+	public:
+		Cleanup(function<void()> func);
+		~Cleanup();
+	};
+
+}
+
+#endif // CRUCIBLE_CLEANUP_H
--- a/include/crucible/crc64.h
+++ b/include/crucible/crc64.h
@@ -3,11 +3,11 @@

 #include <cstdint>
 #include <cstdlib>
+#include <cstring>

 namespace crucible {
 	namespace Digest {
 		namespace CRC {
-			uint64_t crc64(const char *s);
 			uint64_t crc64(const void *p, size_t len);
 		};
 	};
--- a/include/crucible/error.h
+++ b/include/crucible/error.h
@@ -81,31 +81,25 @@ namespace crucible {
 // macro for throwing an error
 #define THROW_ERROR(type, expr) do { \
 	std::ostringstream _te_oss; \
-	_te_oss << expr; \
+	_te_oss << expr << " at " << __FILE__ << ":" << __LINE__; \
 	throw type(_te_oss.str()); \
 } while (0)

 // macro for throwing a system_error with errno
 #define THROW_ERRNO(expr) do { \
 	std::ostringstream _te_oss; \
-	_te_oss << expr; \
+	_te_oss << expr << " at " << __FILE__ << ":" << __LINE__; \
 	throw std::system_error(std::error_code(errno, std::system_category()), _te_oss.str()); \
 } while (0)

 // macro for throwing a system_error with some other variable
 #define THROW_ERRNO_VALUE(value, expr) do { \
 	std::ostringstream _te_oss; \
-	_te_oss << expr; \
+	_te_oss << expr << " at " << __FILE__ << ":" << __LINE__; \
 	throw std::system_error(std::error_code((value), std::system_category()), _te_oss.str()); \
 } while (0)

 // macros for checking a constraint
-#define CHECK_CONSTRAINT(value, expr) do { \
-	if (!(expr)) { \
-		THROW_ERROR(out_of_range, #value << " = " << value << " failed constraint check (" << #expr << ")"); \
-	} \
-} while(0)
-
 #define THROW_CHECK0(type, expr) do { \
 	if (!(expr)) { \
 		THROW_ERROR(type, "failed constraint check (" << #expr << ")"); \
--- a/include/crucible/execpipe.h
+++ b/include/crucible/execpipe.h
@@ -1,28 +0,0 @@
-#ifndef CRUCIBLE_EXECPIPE_H
-#define CRUCIBLE_EXECPIPE_H
-
-#include "crucible/fd.h"
-
-#include <functional>
-#include <limits>
-#include <string>
-
-namespace crucible {
-	using namespace std;
-
-	void redirect_stdin(const Fd &child_fd);
-	void redirect_stdin_stdout(const Fd &child_fd);
-	void redirect_stdin_stdout_stderr(const Fd &child_fd);
-	void redirect_stdout(const Fd &child_fd);
-	void redirect_stdout_stderr(const Fd &child_fd);
-
-	// Open a pipe (actually socketpair) to child process, then execute code in that process.
-	// e.g. popen([] () { system("echo Hello, World!"); });
-	// Forked process will exit when function returns.
-	Fd popen(function<int()> f, function<void(const Fd &child_fd)> import_fd_fn = redirect_stdin_stdout);
-
-	// Read all the data from fd into a string
-        string read_all(Fd fd, size_t max_bytes = numeric_limits<size_t>::max(), size_t chunk_bytes = 4096);
-};
-
-#endif // CRUCIBLE_EXECPIPE_H
--- a/include/crucible/extentwalker.h
+++ b/include/crucible/extentwalker.h
@@ -8,15 +8,15 @@ namespace crucible {

 	// FIXME:  ExtentCursor is probably a better name
 	struct Extent {
-		off_t		m_begin;
-		off_t		m_end;
-		uint64_t	m_physical;
-		uint64_t	m_flags;
+		off_t		m_begin = 0;
+		off_t		m_end = 0;
+		uint64_t	m_physical = 0;
+		uint64_t	m_flags = 0;

 		// Btrfs extent reference details
-		off_t		m_physical_len;
-		off_t		m_logical_len;
-		off_t		m_offset;
+		off_t		m_physical_len = 0;
+		off_t		m_logical_len = 0;
+		off_t		m_offset = 0;

 		// fiemap flags are uint32_t, so bits 32..63 are OK for us

@@ -38,10 +38,12 @@ namespace crucible {
 		off_t physical_len() const { return m_physical_len; }
 		off_t logical_len() const { return m_logical_len; }
 		off_t offset() const { return m_offset; }
+		bool compressed() const;
+		uint64_t bytenr() const;
 		bool operator==(const Extent &that) const;
 		bool operator!=(const Extent &that) const { return !(*this == that); }

-		Extent();
+		Extent() = default;
 		Extent(const Extent &e) = default;
 	};

@@ -56,7 +58,7 @@ namespace crucible {

 		virtual Vec get_extent_map(off_t pos);

-		static const unsigned sc_extent_fetch_max = 64;
+		static const unsigned sc_extent_fetch_max = 16;
 		static const unsigned sc_extent_fetch_min = 4;
 		static const off_t sc_step_size = 0x1000 * (sc_extent_fetch_max / 2);

--- a/include/crucible/fd.h
+++ b/include/crucible/fd.h
@@ -13,6 +13,10 @@
 #include <sys/stat.h>
 #include <fcntl.h>

+// ioctl
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+
 // socket
 #include <sys/socket.h>

@@ -53,6 +57,10 @@ namespace crucible {

        typedef ResourceHandle<int, IOHandle> Fd;

+	static string __relative_path;
+	void set_relative_path(string path);
+	string relative_path();
+
 	// Functions named "foo_or_die" throw exceptions on failure.

 	// Attempt to open the file with the given mode
@@ -70,10 +78,11 @@ namespace crucible {
 	string mmap_flags_ntoa(int flags);

 	// Unlink, rename
-	void unlink_or_die(const string &file);
 	void rename_or_die(const string &from, const string &to);
 	void renameat_or_die(int fromfd, const string &frompath, int tofd, const string &topath);

+	void ftruncate_or_die(int fd, off_t size);
+
 	// Read or write structs:
 	// There is a template specialization to read or write strings
 	// Three-arg version of read_or_die/write_or_die throws an error on incomplete read/writes
@@ -120,6 +129,9 @@ namespace crucible {
 	template<> void pread_or_die<string>(int fd, string& str, off_t offset);
 	template<> void pread_or_die<vector<char>>(int fd, vector<char>& str, off_t offset);
 	template<> void pread_or_die<vector<uint8_t>>(int fd, vector<uint8_t>& str, off_t offset);
+	template<> void pwrite_or_die<string>(int fd, const string& str, off_t offset);
+	template<> void pwrite_or_die<vector<char>>(int fd, const vector<char>& str, off_t offset);
+	template<> void pwrite_or_die<vector<uint8_t>>(int fd, const vector<uint8_t>& str, off_t offset);

 	// A different approach to reading a simple string
 	string read_string(int fd, size_t size);
@@ -137,6 +149,9 @@ namespace crucible {
 		Stat &lstat(const string &filename);
 	};

+	int ioctl_iflags_get(int fd);
+	void ioctl_iflags_set(int fd, int attr);
+
 	string st_mode_ntoa(mode_t mode);

 	// Because it's not trivial to do correctly
--- a/include/crucible/fs.h
+++ b/include/crucible/fs.h
@@ -13,6 +13,7 @@

 #include <cstdint>
 #include <iosfwd>
+#include <set>
 #include <vector>

 #include <fcntl.h>
@@ -111,8 +112,8 @@ namespace crucible {
 		BTRFS_COMPRESS_NONE  = 0,
 		BTRFS_COMPRESS_ZLIB  = 1,
 		BTRFS_COMPRESS_LZO   = 2,
-		BTRFS_COMPRESS_TYPES = 2,
-		BTRFS_COMPRESS_LAST  = 3,
+		BTRFS_COMPRESS_ZSTD  = 3,
+		BTRFS_COMPRESS_TYPES = 3
 	} btrfs_compression_type;

 	struct FiemapExtent : public fiemap_extent {
@@ -150,13 +151,14 @@ namespace crucible {
 		BtrfsIoctlSearchHeader();
 		vector<char> m_data;
 		size_t set_data(const vector<char> &v, size_t offset);
+		bool operator<(const BtrfsIoctlSearchHeader &that) const;
 	};

 	ostream & operator<<(ostream &os, const btrfs_ioctl_search_header &hdr);
 	ostream & operator<<(ostream &os, const BtrfsIoctlSearchHeader &hdr);

 	struct BtrfsIoctlSearchKey : public btrfs_ioctl_search_key {
-		BtrfsIoctlSearchKey(size_t buf_size = 1024 * 1024);
+		BtrfsIoctlSearchKey(size_t buf_size = 4096);
 		virtual bool do_ioctl_nothrow(int fd);
 		virtual void do_ioctl(int fd);

@@ -164,14 +166,15 @@ namespace crucible {
 		void next_min(const BtrfsIoctlSearchHeader& ref);

 		size_t m_buf_size;
-		vector<BtrfsIoctlSearchHeader> m_result;
+		set<BtrfsIoctlSearchHeader> m_result;
+
 	};

 	ostream & operator<<(ostream &os, const btrfs_ioctl_search_key &key);
 	ostream & operator<<(ostream &os, const BtrfsIoctlSearchKey &key);

 	string btrfs_search_type_ntoa(unsigned type);
-	string btrfs_search_objectid_ntoa(unsigned objectid);
+	string btrfs_search_objectid_ntoa(uint64_t objectid);

 	uint64_t btrfs_get_root_id(int fd);
 	uint64_t btrfs_get_root_transid(int fd);
--- a/include/crucible/interp.h
+++ b/include/crucible/interp.h
@@ -1,106 +0,0 @@
-#ifndef CRUCIBLE_INTERP_H
-#define CRUCIBLE_INTERP_H
-
-#include "crucible/error.h"
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace crucible {
-	using namespace std;
-
-	struct ArgList : public vector<string> {
-		ArgList(const char **argv);
-		// using vector<string>::vector ... doesn't work:
-		// error: ‘std::vector<std::basic_string<char> >::vector’ names constructor
-		// Still doesn't work in 4.9 because it can't manage a conversion
-		ArgList(const vector<string> &&that);
-	};
-
-	struct ArgActor {
-		struct ArgActorBase {
-			virtual void predicate(void *obj, string arg);
-		};
-
-		template <class T>
-		struct ArgActorDerived {
-			function<void(T, string)> m_func;
-
-			ArgActorDerived(decltype(m_func) func) :
-				m_func(func)
-			{
-			}
-
-			void predicate(void *obj, string arg) override
-			{
-				T &op = *(reinterpret_cast<T*>(obj));
-				m_func(op, obj);
-			}
-		};
-
-		template <class T>
-		ArgActor(T, function<void(T, string)> func) :
-			m_actor(make_shared(ArgActorDerived<T>(func)))
-		{
-		}
-
-		ArgActor() = default;
-
-		void predicate(void *t, string arg)
-		{
-			if (m_actor) {
-				m_actor->predicate(t, arg);
-			} else {
-				THROW_ERROR(invalid_argument, "null m_actor for predicate arg '" << arg << "'");
-			}
-		}
-
-	private:
-		shared_ptr<ArgActorBase> m_actor;
-	};
-
-	struct ArgParser {
-		~ArgParser();
-		ArgParser();
-
-		void add_opt(string opt, ArgActor actor);
-
-		template <class T>
-		void
-		parse(T t, const ArgList &args)
-		{
-			void *vt = &t;
-			parse_backend(vt, args);
-		}
-		
-	private:
-		void parse_backend(void *t, const ArgList &args);
-		map<string, ArgActor>	m_string_opts;
-	};
-
-	struct Command {
-		virtual ~Command();
-		virtual int exec(const ArgList &args) = 0;
-	};
-
-	struct Proc : public Command {
-		int exec(const ArgList &args) override;
-		Proc(const function<int(const ArgList &)> &f);
-	private:
-		function<int(const ArgList &)> m_cmd;
-	};
-
-	struct Interp {
-		virtual ~Interp();
-		Interp(const map<string, shared_ptr<Command> > &cmdlist);
-		void add_command(const string &name, const shared_ptr<Command> &command);
-		int exec(const ArgList &args);
-	private:
-		Interp(const Interp &) = delete;
-		map<string, shared_ptr<Command> > m_commands;
-	};
-
-};
-#endif // CRUCIBLE_INTERP_H
--- a/include/crucible/lockset.h
+++ b/include/crucible/lockset.h
@@ -2,13 +2,16 @@
 #define CRUCIBLE_LOCKSET_H

 #include <crucible/error.h>
+#include <crucible/process.h>

 #include <cassert>

 #include <condition_variable>
 #include <iostream>
+#include <limits>
+#include <map>
+#include <memory>
 #include <mutex>
-#include <set>

 namespace crucible {
 	using namespace std;
@@ -17,14 +20,36 @@ namespace crucible {
 	class LockSet {

 	public:
-		using key_type = T;
-		using set_type = set<T>;
+		using set_type = map<T, pid_t>;
+		using key_type = typename set_type::key_type;

 	private:

 		set_type			m_set;
 		mutex				m_mutex;
 		condition_variable		m_condvar;
+		size_t				m_max_size = numeric_limits<size_t>::max();
+
+		bool full();
+		bool locked(const key_type &name);
+
+		class Lock {
+			LockSet		&m_lockset;
+			key_type	m_name;
+			bool		m_locked;
+
+			Lock() = delete;
+			Lock(const Lock &) = delete;
+			Lock& operator=(const Lock &) = delete;
+			Lock(Lock &&that) = delete;
+			Lock& operator=(Lock &&that) = delete;
+		public:
+			~Lock();
+			Lock(LockSet &lockset, const key_type &name, bool start_locked = true);
+			void lock();
+			void unlock();
+			bool try_lock();
+		};

 	public:
 		~LockSet();
@@ -36,26 +61,21 @@ namespace crucible {
 		size_t size();
 		bool empty();
 		set_type copy();
-		void wait_unlock(double interval);

-		class Lock {
-			LockSet		&m_lockset;
-			key_type	m_name;
-			bool		m_locked;
+		void max_size(size_t max);
+
+		class LockHandle {
+			shared_ptr<Lock> m_lock;

-			Lock() = delete;
-			Lock(const Lock &) = delete;
-			Lock& operator=(const Lock &) = delete;
 		public:
-			~Lock();
-			Lock(LockSet &lockset, const key_type &m_name, bool start_locked = true);
-			Lock(Lock &&that);
-			Lock& operator=(Lock &&that);
-			void lock();
-			void unlock();
-			bool try_lock();
+			LockHandle(LockSet &lockset, const key_type &name, bool start_locked = true) :
+				m_lock(make_shared<Lock>(lockset, name, start_locked)) {}
+			void lock() { m_lock->lock(); }
+			void unlock() { m_lock->unlock(); }
+			bool try_lock() { return m_lock->try_lock(); }
 		};

+		LockHandle make_lock(const key_type &name, bool start_locked = true);
 	};

 	template <class T>
@@ -68,15 +88,36 @@ namespace crucible {
 		assert(m_set.empty());
 	}

+	template <class T>
+	bool
+	LockSet<T>::full()
+	{
+		return m_set.size() >= m_max_size;
+	}
+
+	template <class T>
+	bool
+	LockSet<T>::locked(const key_type &name)
+	{
+		return m_set.count(name);
+	}
+
+	template <class T>
+	void
+	LockSet<T>::max_size(size_t s)
+	{
+		m_max_size = s;
+	}
+
 	template <class T>
 	void
 	LockSet<T>::lock(const key_type &name)
 	{
 		unique_lock<mutex> lock(m_mutex);
-		while (m_set.count(name)) {
+		while (full() || locked(name)) {
 			m_condvar.wait(lock);
 		}
-		auto rv = m_set.insert(name);
+		auto rv = m_set.insert(make_pair(name, crucible::gettid()));
 		THROW_CHECK0(runtime_error, rv.second);
 	}

@@ -85,10 +126,10 @@ namespace crucible {
 	LockSet<T>::try_lock(const key_type &name)
 	{
 		unique_lock<mutex> lock(m_mutex);
-		if (m_set.count(name)) {
+		if (full() || locked(name)) {
 			return false;
 		}
-		auto rv = m_set.insert(name);
+		auto rv = m_set.insert(make_pair(name, crucible::gettid()));
 		THROW_CHECK1(runtime_error, name, rv.second);
 		return true;
 	}
@@ -98,20 +139,11 @@ namespace crucible {
 	LockSet<T>::unlock(const key_type &name)
 	{
 		unique_lock<mutex> lock(m_mutex);
-		m_condvar.notify_all();
 		auto erase_count = m_set.erase(name);
+		m_condvar.notify_all();
 		THROW_CHECK1(invalid_argument, erase_count, erase_count == 1);
 	}

-	template <class T>
-	void
-	LockSet<T>::wait_unlock(double interval)
-	{
-		unique_lock<mutex> lock(m_mutex);
-		if (m_set.empty()) return;
-		m_condvar.wait_for(lock, chrono::duration<double>(interval));
-	}
-
 	template <class T>
 	size_t
 	LockSet<T>::size()
@@ -133,7 +165,10 @@ namespace crucible {
 	LockSet<T>::copy()
 	{
 		unique_lock<mutex> lock(m_mutex);
-		return m_set;
+		// Make temporary copy of set while protected by mutex
+		auto rv = m_set;
+		// Return temporary copy after releasing lock
+		return rv;
 	}

 	template <class T>
@@ -183,26 +218,10 @@ namespace crucible {
 	}

 	template <class T>
-	LockSet<T>::Lock::Lock(Lock &&that) :
-		m_lockset(that.lockset),
-		m_name(that.m_name),
-		m_locked(that.m_locked)
+	typename LockSet<T>::LockHandle
+	LockSet<T>::make_lock(const key_type &name, bool start_locked)
 	{
-		that.m_locked = false;
-	}
-
-	template <class T>
-	typename LockSet<T>::Lock &
-	LockSet<T>::Lock::operator=(Lock &&that)
-	{
-		THROW_CHECK2(invalid_argument, &m_lockset, &that.m_lockset, &m_lockset == &that.m_lockset);
-		if (m_locked && that.m_name != m_name) {
-			unlock();
-		}
-		m_name = that.m_name;
-		m_locked = that.m_locked;
-		that.m_locked = false;
-		return *this;
+		return LockHandle(*this, name, start_locked);
 	}

 }
--- a/include/crucible/ntoa.h
+++ b/include/crucible/ntoa.h
@@ -7,12 +7,12 @@ namespace crucible {
 	using namespace std;

 	struct bits_ntoa_table {
-		unsigned long n;
-		unsigned long mask;
+		unsigned long long n;
+		unsigned long long mask;
 		const char *a;
 	};

-	string bits_ntoa(unsigned long n, const bits_ntoa_table *a);
+	string bits_ntoa(unsigned long long n, const bits_ntoa_table *a);

 };

--- a/include/crucible/process.h
+++ b/include/crucible/process.h
@@ -74,5 +74,8 @@ namespace crucible {
 	typedef ResourceHandle<Process::id, Process> Pid;

 	pid_t gettid();
+	double getloadavg1();
+	double getloadavg5();
+	double getloadavg15();
 }
 #endif // CRUCIBLE_PROCESS_H
--- a/include/crucible/progress.h
+++ b/include/crucible/progress.h
@@ -0,0 +1,122 @@
+#ifndef CRUCIBLE_PROGRESS_H
+#define CRUCIBLE_PROGRESS_H
+
+#include "crucible/error.h"
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <mutex>
+
+namespace crucible {
+	using namespace std;
+
+	template <class T>
+	class ProgressTracker {
+		struct ProgressTrackerState;
+		class ProgressHolderState;
+	public:
+		using value_type = T;
+		using ProgressHolder = shared_ptr<ProgressHolderState>;
+
+		ProgressTracker(const value_type &v);
+		value_type begin();
+		value_type end();
+
+		ProgressHolder hold(const value_type &v);
+
+	friend class ProgressHolderState;
+
+	private:
+		struct ProgressTrackerState {
+			using key_type = pair<value_type, ProgressHolderState *>;
+			mutex			m_mutex;
+			map<key_type, bool>	m_in_progress;
+			value_type		m_begin;
+			value_type		m_end;
+		};
+
+		class ProgressHolderState {
+			shared_ptr<ProgressTrackerState>	m_state;
+			const value_type			m_value;
+		public:
+			ProgressHolderState(shared_ptr<ProgressTrackerState> state, const value_type &v);
+			~ProgressHolderState();
+			value_type get() const;
+		};
+
+
+		shared_ptr<ProgressTrackerState>	m_state;
+	};
+
+	template <class T>
+	typename ProgressTracker<T>::value_type
+	ProgressTracker<T>::begin()
+	{
+		unique_lock<mutex> lock(m_state->m_mutex);
+		return m_state->m_begin;
+	}
+
+	template <class T>
+	typename ProgressTracker<T>::value_type
+	ProgressTracker<T>::end()
+	{
+		unique_lock<mutex> lock(m_state->m_mutex);
+		return m_state->m_end;
+	}
+
+	template <class T>
+	typename ProgressTracker<T>::value_type
+	ProgressTracker<T>::ProgressHolderState::get() const
+	{
+		return m_value;
+	}
+
+	template <class T>
+	ProgressTracker<T>::ProgressTracker(const ProgressTracker::value_type &t) :
+		m_state(make_shared<ProgressTrackerState>())
+	{
+		m_state->m_begin = t;
+		m_state->m_end = t;
+	}
+
+	template <class T>
+	ProgressTracker<T>::ProgressHolderState::ProgressHolderState(shared_ptr<ProgressTrackerState> state, const value_type &v) :
+		m_state(state),
+		m_value(v)
+	{
+		unique_lock<mutex> lock(m_state->m_mutex);
+		m_state->m_in_progress[make_pair(m_value, this)] = true;
+		if (m_state->m_end < m_value) {
+			m_state->m_end = m_value;
+		}
+	}
+
+	template <class T>
+	ProgressTracker<T>::ProgressHolderState::~ProgressHolderState()
+	{
+		unique_lock<mutex> lock(m_state->m_mutex);
+		m_state->m_in_progress[make_pair(m_value, this)] = false;
+		auto p = m_state->m_in_progress.begin();
+		while (p != m_state->m_in_progress.end()) {
+			if (p->second) {
+				break;
+			}
+			if (m_state->m_begin < p->first.first) {
+				m_state->m_begin = p->first.first;
+			}
+			m_state->m_in_progress.erase(p);
+			p = m_state->m_in_progress.begin();
+		}
+	}
+
+	template <class T>
+	shared_ptr<typename ProgressTracker<T>::ProgressHolderState>
+	ProgressTracker<T>::hold(const value_type &v)
+	{
+		return make_shared<ProgressHolderState>(m_state, v);
+	}
+
+}
+
+#endif // CRUCIBLE_PROGRESS_H
--- a/include/crucible/resource.h
+++ b/include/crucible/resource.h
@@ -8,6 +8,7 @@
 #include <memory>
 #include <mutex>
 #include <iostream>
+#include <stdexcept>

 namespace crucible {
 	using namespace std;
@@ -44,36 +45,29 @@ namespace crucible {

 	private:
 		using traits_type = ResourceTraits<Key, Resource>;
-
-		class ResourceHolder {
-			resource_ptr_type m_ptr;
-		public:
-			~ResourceHolder();
-			ResourceHolder(resource_ptr_type that);
-			ResourceHolder(const ResourceHolder &that) = default;
-			ResourceHolder(ResourceHolder &&that) = default;
-			ResourceHolder& operator=(ResourceHolder &&that) = default;
-			ResourceHolder& operator=(const ResourceHolder &that) = default;
-			resource_ptr_type get_resource_ptr() const;
-		};
-
-		using holder_ptr_type = shared_ptr<ResourceHolder>;
-		using weak_holder_ptr_type = weak_ptr<ResourceHolder>;
-		using map_type = map<key_type, weak_holder_ptr_type>;
+		using weak_ptr_type = weak_ptr<Resource>;
+		using map_type = map<key_type, weak_ptr_type>;

 		// The only instance variable
-		holder_ptr_type m_ptr;
+		resource_ptr_type m_ptr;

 		// A bunch of static variables and functions
-		static mutex &s_mutex();
-		static shared_ptr<map_type> s_map();
-		static holder_ptr_type insert(const key_type &key);
-		static holder_ptr_type insert(const resource_ptr_type &res);
-		static void erase(const key_type &key);
+		static mutex s_map_mutex;
+		static map_type s_map;
+		static resource_ptr_type insert(const key_type &key);
+		static resource_ptr_type insert(const resource_ptr_type &res);
+		static void clean_locked();
 		static ResourceTraits<Key, Resource> s_traits;

 	public:

+		// Exceptions
+		struct duplicate_resource : public invalid_argument {
+			key_type m_key;
+			key_type get_key() const;
+			duplicate_resource(const key_type &key);
+		};
+
 		// test for resource.  A separate operator because key_type could be confused with bool.
 		bool operator!() const;

@@ -89,8 +83,15 @@ namespace crucible {
 		ResourceHandle(const resource_ptr_type &res);
 		ResourceHandle& operator=(const resource_ptr_type &res);

-		// default constructor is public
+		// default construct/assign/move is public and mostly harmless
 		ResourceHandle() = default;
+		ResourceHandle(const ResourceHandle &that) = default;
+		ResourceHandle(ResourceHandle &&that) = default;
+		ResourceHandle& operator=(const ResourceHandle &that) = default;
+		ResourceHandle& operator=(ResourceHandle &&that) = default;
+
+		// Nontrivial destructor
+		~ResourceHandle();

 		// forward anything else to the Resource constructor
 		// if we can do so unambiguously
@@ -109,7 +110,7 @@ namespace crucible {

 		// get pointer to Resource object (nothrow, result may be null)
 		resource_ptr_type get_resource_ptr() const;
-		// this version throws and is probably not thread safe
+		// this version throws
 		resource_ptr_type operator->() const;

 		// dynamic casting of the resource (throws if cast fails)
@@ -145,139 +146,94 @@ namespace crucible {
 	}

 	template <class Key, class Resource>
-	ResourceHandle<Key, Resource>::ResourceHolder::ResourceHolder(resource_ptr_type that) :
-		m_ptr(that)
+	ResourceHandle<Key, Resource>::duplicate_resource::duplicate_resource(const key_type &key) :
+		invalid_argument("duplicate resource"),
+		m_key(key)
 	{
-		// Cannot insert ourselves here since our shared_ptr does not exist yet.
 	}

 	template <class Key, class Resource>
-	mutex &
-	ResourceHandle<Key, Resource>::s_mutex()
+	auto
+	ResourceHandle<Key, Resource>::duplicate_resource::get_key() const -> key_type
 	{
-		static mutex gcc_won_t_instantiate_this_either;
-		return gcc_won_t_instantiate_this_either;
-	}
-
-	template <class Key, class Resource>
-	shared_ptr<typename ResourceHandle<Key, Resource>::map_type>
-	ResourceHandle<Key, Resource>::s_map()
-	{
-		static shared_ptr<map_type> gcc_won_t_instantiate_the_damn_static_vars;
-		if (!gcc_won_t_instantiate_the_damn_static_vars) {
-			gcc_won_t_instantiate_the_damn_static_vars = make_shared<map_type>();
-		}
-		return gcc_won_t_instantiate_the_damn_static_vars;
+		return m_key;
 	}

 	template <class Key, class Resource>
 	void
-	ResourceHandle<Key, Resource>::erase(const key_type &key)
+	ResourceHandle<Key, Resource>::clean_locked()
 	{
-		unique_lock<mutex> lock(s_mutex());
-		// Resources are allowed to set their Keys to null.
-		if (s_traits.is_null_key(key)) {
-			// Clean out any dead weak_ptr objects.
-			for (auto i = s_map()->begin(); i != s_map()->end(); ) {
-				if (! (*i).second.lock()) {
-					i = s_map()->erase(i);
-				} else {
-					++i;
-				}
+		// Must be called with lock held
+		for (auto i = s_map.begin(); i != s_map.end(); ) {
+			auto this_i = i;
+			++i;
+			if (this_i->second.expired()) {
+				s_map.erase(this_i);
 			}
-			return;
-		}
-		auto erased = s_map()->erase(key);
-		if (erased != 1) {
-			cerr << __PRETTY_FUNCTION__ << ": WARNING: s_map()->erase(" << key << ") returned " << erased << " != 1" << endl;
 		}
 	}

 	template <class Key, class Resource>
-	ResourceHandle<Key, Resource>::ResourceHolder::~ResourceHolder()
-	{
-		if (!m_ptr) {
-			// Probably something harmless like a failed constructor.
-			cerr << __PRETTY_FUNCTION__ << ": WARNING: destroying null m_ptr" << endl;
-			return;
-		}
-		Key key = s_traits.get_key(*m_ptr);
-		ResourceHandle::erase(key);
-	}
-
-	template <class Key, class Resource>
-	typename ResourceHandle<Key, Resource>::holder_ptr_type
+	typename ResourceHandle<Key, Resource>::resource_ptr_type
 	ResourceHandle<Key, Resource>::insert(const key_type &key)
 	{
 		// no Resources for null keys
 		if (s_traits.is_null_key(key)) {
-			return holder_ptr_type();
+			return resource_ptr_type();
 		}
-		unique_lock<mutex> lock(s_mutex());
-		// find ResourceHolder for non-null key
-		auto found = s_map()->find(key);
-		if (found != s_map()->end()) {
-			holder_ptr_type rv = (*found).second.lock();
-			// a weak_ptr may have expired
+		unique_lock<mutex> lock(s_map_mutex);
+		auto found = s_map.find(key);
+		if (found != s_map.end()) {
+			resource_ptr_type rv = found->second.lock();
 			if (rv) {
+				// Use existing Resource
 				return rv;
+			} else {
+				// It's OK for the map to temporarily contain an expired weak_ptr to some dead Resource
+				clean_locked();
 			}
 		}
 		// not found or expired, throw any existing ref away and make a new one
 		resource_ptr_type rpt = s_traits.make_resource(key);
-		holder_ptr_type hpt = make_shared<ResourceHolder>(rpt);
 		// store weak_ptr in map
-		(*s_map())[key] = hpt;
+		s_map[key] = rpt;
 		// return shared_ptr
-		return hpt;
+		return rpt;
 	};

 	template <class Key, class Resource>
-	typename ResourceHandle<Key, Resource>::holder_ptr_type
+	typename ResourceHandle<Key, Resource>::resource_ptr_type
 	ResourceHandle<Key, Resource>::insert(const resource_ptr_type &res)
 	{
-		// no Resource, no ResourceHolder.
+		// no Resources for null keys
 		if (!res) {
-			return holder_ptr_type();
+			return resource_ptr_type();
 		}
-		// no ResourceHolders for null keys either.
 		key_type key = s_traits.get_key(*res);
 		if (s_traits.is_null_key(key)) {
-			return holder_ptr_type();
+			return resource_ptr_type();
 		}
-		unique_lock<mutex> lock(s_mutex());
-		// find ResourceHolder for non-null key
-		auto found = s_map()->find(key);
-		if (found != s_map()->end()) {
-			holder_ptr_type rv = (*found).second.lock();
-			// The map doesn't own the ResourceHolders, the ResourceHandles do.
-			// It's OK for the map to contain an expired weak_ptr to some dead ResourceHolder...
+		unique_lock<mutex> lock(s_map_mutex);
+		// find Resource for non-null key
+		auto found = s_map.find(key);
+		if (found != s_map.end()) {
+			resource_ptr_type rv = found->second.lock();
+			// It's OK for the map to temporarily contain an expired weak_ptr to some dead Resource...
 			if (rv) {
-				// found ResourceHolder, look at pointer
-				resource_ptr_type rp = rv->get_resource_ptr();
-				// We do not store references to null Resources.
-				assert(rp);
-				// Key retrieved for an existing object must match key searched or be null.
-				key_type found_key = s_traits.get_key(*rp);
-				bool found_key_is_null = s_traits.is_null_key(found_key);
-				assert(found_key_is_null || found_key == key);
-				if (!found_key_is_null) {
-					// We do not store references to duplicate resources.
-					if (rp.owner_before(res) || res.owner_before(rp)) {
-						cerr << "inserting new Resource with existing Key " << key << " not allowed at " << __PRETTY_FUNCTION__ << endl;;
-						abort();
-						// THROW_ERROR(out_of_range, "inserting new Resource with existing Key " << key << " not allowed at " << __PRETTY_FUNCTION__);
-					}
-					// rv is good, return it
-					return rv;
+				// ...but not a duplicate Resource.
+				if (rv.owner_before(res) || res.owner_before(rv)) {
+					throw duplicate_resource(key);
 				}
+				// Use the existing Resource (discard the caller's).
+				return rv;
+			} else {
+				// Clean out expired weak_ptrs
+				clean_locked();
 			}
 		}
-		// not found or expired, make a new one
-		holder_ptr_type rv = make_shared<ResourceHolder>(res);
-		s_map()->insert(make_pair(key, weak_holder_ptr_type(rv)));
-		// no need to check s_map result, we are either replacing a dead weak_ptr or adding a new one
-		return rv;
+		// not found or expired, make a new one or replace old one
+		s_map[key] = res;
+		return res;
 	};

 	template <class Key, class Resource>
@@ -309,31 +265,47 @@ namespace crucible {
 	}

 	template <class Key, class Resource>
-	typename ResourceHandle<Key, Resource>::resource_ptr_type
-	ResourceHandle<Key, Resource>::ResourceHolder::get_resource_ptr() const
+	ResourceHandle<Key, Resource>::~ResourceHandle()
 	{
-		return m_ptr;
+		// No pointer, nothing to do
+		if (!m_ptr) {
+			return;
+		}
+		// Save key so we can clean the map
+		auto key = s_traits.get_key(*m_ptr);
+		// Save a weak_ptr so we can tell if we need to clean the map
+		weak_ptr_type wp = m_ptr;
+		// Drop shared_ptr
+		m_ptr.reset();
+		// If there are still other references to the shared_ptr, we can stop now
+		if (!wp.expired()) {
+			return;
+		}
+		// Remove weak_ptr from map if it has expired
+		// (and not been replaced in the meantime)
+		unique_lock<mutex> lock_map(s_map_mutex);
+		auto found = s_map.find(key);
+		// Map entry may have been replaced, so check for expiry again
+		if (found != s_map.end() && found->second.expired()) {
+			s_map.erase(key);
+		}
 	}

 	template <class Key, class Resource>
 	typename ResourceHandle<Key, Resource>::resource_ptr_type
 	ResourceHandle<Key, Resource>::get_resource_ptr() const
 	{
-		if (!m_ptr) {
-			return resource_ptr_type();
-		}
-		return m_ptr->get_resource_ptr();
+		return m_ptr;
 	}

 	template <class Key, class Resource>
 	typename ResourceHandle<Key, Resource>::resource_ptr_type
 	ResourceHandle<Key, Resource>::operator->() const
 	{
-		resource_ptr_type rp = get_resource_ptr();
-		if (!rp) {
+		if (!m_ptr) {
 			THROW_ERROR(out_of_range, __PRETTY_FUNCTION__ << " called on null Resource");
 		}
-		return rp;
+		return m_ptr;
 	}

 	template <class Key, class Resource>
@@ -342,11 +314,10 @@ namespace crucible {
 	ResourceHandle<Key, Resource>::cast() const
 	{
 		shared_ptr<T> dp;
-		resource_ptr_type rp = get_resource_ptr();
-		if (!rp) {
+		if (!m_ptr) {
 			return dp;
 		}
-		dp = dynamic_pointer_cast<T>(rp);
+		dp = dynamic_pointer_cast<T>(m_ptr);
 		if (!dp) {
 			throw bad_cast();
 		}
@@ -357,11 +328,10 @@ namespace crucible {
 	typename ResourceHandle<Key, Resource>::key_type
 	ResourceHandle<Key, Resource>::get_key() const
 	{
-		resource_ptr_type rp = get_resource_ptr();
-		if (!rp) {
+		if (!m_ptr) {
 			return s_traits.get_null_key();
 		} else {
-			return s_traits.get_key(*rp);
+			return s_traits.get_key(*m_ptr);
 		}
 	}

@@ -378,9 +348,15 @@ namespace crucible {
 		return s_traits.is_null_key(operator key_type());
 	}

+	// Apparently GCC wants these to be used before they are defined.
 	template <class Key, class Resource>
 	ResourceTraits<Key, Resource> ResourceHandle<Key, Resource>::s_traits;

+	template <class Key, class Resource>
+	mutex ResourceHandle<Key, Resource>::s_map_mutex;
+
+	template <class Key, class Resource>
+	typename ResourceHandle<Key, Resource>::map_type ResourceHandle<Key, Resource>::s_map;

 }

--- a/include/crucible/task.h
+++ b/include/crucible/task.h
@@ -0,0 +1,163 @@
+#ifndef CRUCIBLE_TASK_H
+#define CRUCIBLE_TASK_H
+
+#include <functional>
+#include <memory>
+#include <ostream>
+#include <string>
+
+namespace crucible {
+	using namespace std;
+
+	class TaskState;
+
+	using TaskId = uint64_t;
+
+	class Task {
+		shared_ptr<TaskState> m_task_state;
+
+		Task(shared_ptr<TaskState> pts);
+
+	public:
+
+		// create empty Task object
+		Task() = default;
+
+		// create Task object containing closure and description
+		Task(string title, function<void()> exec_fn);
+
+		// schedule Task at end of queue.
+		// May run Task in current thread or in other thread.
+		// May run Task before or after returning.
+		void run() const;
+
+		// schedule Task before other queued tasks
+		void run_earlier() const;
+
+		// describe Task as text
+		string title() const;
+
+		// Returns currently executing task if called from exec_fn.
+		// Usually used to reschedule the currently executing Task.
+		static Task current_task();
+
+		// Ordering for containers
+		bool operator<(const Task &that) const;
+
+		// Null test
+		operator bool() const;
+
+		// Unique non-repeating(ish) ID for task
+		TaskId id() const;
+	};
+
+	ostream &operator<<(ostream &os, const Task &task);
+
+	class TaskMaster {
+	public:
+		// Blocks until the running thread count reaches this number
+		static void set_thread_count(size_t threads);
+
+		// Sets minimum thread count when load average tracking enabled
+		static void set_thread_min_count(size_t min_threads);
+
+		// Calls set_thread_count with default
+		static void set_thread_count();
+
+		// Creates thread to track load average and adjust thread count dynamically
+		static void set_loadavg_target(double target);
+
+		// Writes the current non-executing Task queue
+		static ostream & print_queue(ostream &);
+
+		// Writes the current executing Task for each worker
+		static ostream & print_workers(ostream &);
+
+		// Gets the current number of queued Tasks
+		static size_t get_queue_count();
+
+	};
+
+	// Barrier executes waiting Tasks once the last BarrierLock
+	// is released.  Multiple unique Tasks may be scheduled while
+	// BarrierLocks exist and all will be run() at once upon
+	// release.  If no BarrierLocks exist, Tasks are executed
+	// immediately upon insertion.
+
+	class BarrierState;
+
+	class BarrierLock {
+		shared_ptr<BarrierState> m_barrier_state;
+		BarrierLock(shared_ptr<BarrierState> pbs);
+	friend class Barrier;
+	public:
+		// Release this Lock immediately and permanently
+		void release();
+	};
+
+	class Barrier {
+		shared_ptr<BarrierState> m_barrier_state;
+
+		Barrier(shared_ptr<BarrierState> pbs);
+	public:
+		Barrier();
+
+		// Prevent execution of tasks behind barrier until
+		// BarrierLock destructor or release() method is called.
+		BarrierLock lock();
+
+		// Schedule a task for execution when no Locks exist
+		void insert_task(Task t);
+	};
+
+	// Exclusion provides exclusive access to a ExclusionLock.
+	// One Task will be able to obtain the ExclusionLock; other Tasks
+	// may schedule themselves for re-execution after the ExclusionLock
+	// is released.
+
+	class ExclusionState;
+	class Exclusion;
+
+	class ExclusionLock {
+		shared_ptr<ExclusionState> m_exclusion_state;
+		ExclusionLock(shared_ptr<ExclusionState> pes);
+		ExclusionLock() = default;
+	friend class Exclusion;
+	public:
+		// Calls release()
+		~ExclusionLock();
+
+		// Release this Lock immediately and permanently
+		void release();
+
+		// Test for locked state
+		operator bool() const;
+	};
+
+	class Exclusion {
+		shared_ptr<ExclusionState> m_exclusion_state;
+
+		Exclusion(shared_ptr<ExclusionState> pes);
+	public:
+		Exclusion();
+
+		// Attempt to obtain a Lock.  If successful, current Task
+		// owns the Lock until the ExclusionLock is released
+		// (it is the ExclusionLock that owns the lock, so it can
+		// be passed to other Tasks or threads, but this is not
+		// recommended practice).
+		// If not successful, current Task is expected to call
+		// insert_task(current_task()), release any ExclusionLock
+		// objects it holds, and exit its Task function.
+		ExclusionLock try_lock();
+
+		// Execute Task when Exclusion is unlocked (possibly immediately).
+		// First Task is scheduled with run_earlier(), all others are
+		// scheduled with run().
+		void insert_task(Task t);
+	};
+
+
+}
+
+#endif // CRUCIBLE_TASK_H
--- a/include/crucible/time.h
+++ b/include/crucible/time.h
@@ -4,6 +4,8 @@
 #include "crucible/error.h"

 #include <chrono>
+#include <condition_variable>
+#include <limits>
 #include <mutex>
 #include <ostream>

@@ -17,10 +19,9 @@ namespace crucible {
 	public:
 		Timer();
 		double age() const;
+		chrono::high_resolution_clock::time_point get() const;
 		double report(int precision = 1000) const;
 		void reset();
-		void set(const chrono::high_resolution_clock::time_point &start);
-		void set(double delta);
 		double lap();
 		bool operator<(double d) const;
 		bool operator>(double d) const;
@@ -32,10 +33,11 @@ namespace crucible {
 		Timer	m_timer;
 		double	m_rate;
 		double	m_burst;
-		double  m_tokens;
+		double  m_tokens = 0.0;
 		mutex	m_mutex;

 		void update_tokens();
+		RateLimiter() = delete;
 	public:
 		RateLimiter(double rate, double burst);
 		RateLimiter(double rate);
@@ -44,6 +46,59 @@ namespace crucible {
 		void borrow(double cost = 1.0);
 	};

+	class RateEstimator {
+		mutable mutex m_mutex;
+		mutable condition_variable m_condvar;
+		Timer m_timer;
+		double m_num = 0.0;
+		double m_den = 0.0;
+		uint64_t m_last_count = numeric_limits<uint64_t>::max();
+		Timer m_last_update;
+		const double m_decay = 0.99;
+		Timer m_last_decay;
+		double m_min_delay;
+		double m_max_delay;
+
+		chrono::duration<double> duration_unlocked(uint64_t relative_count) const;
+		chrono::high_resolution_clock::time_point time_point_unlocked(uint64_t absolute_count) const;
+		double rate_unlocked() const;
+		pair<double, double> ratio_unlocked() const;
+		void update_unlocked(uint64_t new_count);
+	public:
+		RateEstimator(double min_delay = 1, double max_delay = 3600);
+
+		// Block until count reached
+		void wait_for(uint64_t new_count_relative) const;
+		void wait_until(uint64_t new_count_absolute) const;
+
+		// Computed rates and ratios
+		double rate() const;
+		pair<double, double> ratio() const;
+
+		// Inspect raw num/den
+		pair<double, double> raw() const;
+
+		// Write count
+		void update(uint64_t new_count);
+
+		// Ignore counts that go backwards
+		void update_monotonic(uint64_t new_count);
+
+		// Read count
+		uint64_t count() const;
+
+		// Convert counts to chrono types
+		chrono::high_resolution_clock::time_point time_point(uint64_t absolute_count) const;
+		chrono::duration<double> duration(uint64_t relative_count) const;
+
+		// Polling delay until count reached (limited by min/max delay)
+		double seconds_for(uint64_t new_count_relative) const;
+		double seconds_until(uint64_t new_count_absolute) const;
+	};
+
+	ostream &
+	operator<<(ostream &os, const RateEstimator &re);
+
 }

 #endif // CRUCIBLE_TIME_H
--- a/include/crucible/timequeue.h
+++ b/include/crucible/timequeue.h
@@ -1,188 +0,0 @@
-#ifndef CRUCIBLE_TIMEQUEUE_H
-#define CRUCIBLE_TIMEQUEUE_H
-
-#include <crucible/error.h>
-#include <crucible/time.h>
-
-#include <condition_variable>
-#include <limits>
-#include <list>
-#include <memory>
-#include <mutex>
-#include <set>
-
-namespace crucible {
-	using namespace std;
-
-	template <class Task>
-	class TimeQueue {
-
-	public:
-		using Timestamp = chrono::high_resolution_clock::time_point;
-
-	private:
-		struct Item {
-			Timestamp m_time;
-			unsigned m_id;
-			Task m_task;
-
-			bool operator<(const Item &that) const {
-				if (m_time < that.m_time) return true;
-				if (that.m_time < m_time) return false;
-				return m_id < that.m_id;
-			}
-			static unsigned s_id;
-
-			Item(const Timestamp &time, const Task& task) :
-				m_time(time),
-				m_id(++s_id),
-				m_task(task)
-			{
-			}
-
-		};
-
-		set<Item>		m_set;
-		mutable mutex		m_mutex;
-		condition_variable	m_cond_full, m_cond_empty;
-		size_t			m_max_queue_depth;
-
-	public:
-		~TimeQueue();
-		TimeQueue(size_t max_queue_depth = numeric_limits<size_t>::max());
-
-		void push(const Task &task, double delay = 0);
-		void push_nowait(const Task &task, double delay = 0);
-		Task pop();
-		bool pop_nowait(Task &t);
-		double when() const;
-
-		size_t size() const;
-		bool empty() const;
-
-		list<Task> peek(size_t count) const;
-	};
-
-	template <class Task> unsigned TimeQueue<Task>::Item::s_id = 0;
-
-	template <class Task>
-	TimeQueue<Task>::~TimeQueue()
-	{
-		if (!m_set.empty()) {
-			cerr << "ERROR: " << m_set.size() << " locked items still in TimeQueue at destruction" << endl;
-		}
-	}
-
-	template <class Task>
-	void
-	TimeQueue<Task>::push(const Task &task, double delay)
-	{
-		Timestamp time = chrono::high_resolution_clock::now() + 
-			chrono::duration_cast<chrono::high_resolution_clock::duration>(chrono::duration<double>(delay));
-		unique_lock<mutex> lock(m_mutex);
-		while (m_set.size() > m_max_queue_depth) {
-			m_cond_full.wait(lock);
-		}
-		m_set.insert(Item(time, task));
-		m_cond_empty.notify_all();
-	}
-
-	template <class Task>
-	void
-	TimeQueue<Task>::push_nowait(const Task &task, double delay)
-	{
-		Timestamp time = chrono::high_resolution_clock::now() + 
-			chrono::duration_cast<chrono::high_resolution_clock::duration>(chrono::duration<double>(delay));
-		unique_lock<mutex> lock(m_mutex);
-		m_set.insert(Item(time, task));
-		m_cond_empty.notify_all();
-	}
-
-	template <class Task>
-	Task
-	TimeQueue<Task>::pop()
-	{
-		unique_lock<mutex> lock(m_mutex);
-		while (1) {
-			while (m_set.empty()) {
-				m_cond_empty.wait(lock);
-			}
-			Timestamp now = chrono::high_resolution_clock::now();
-			if (now > m_set.begin()->m_time) {
-				Task rv = m_set.begin()->m_task;
-				m_set.erase(m_set.begin());
-				m_cond_full.notify_all();
-				return rv;
-			}
-			m_cond_empty.wait_until(lock, m_set.begin()->m_time);
-		}
-	}
-
-	template <class Task>
-	bool
-	TimeQueue<Task>::pop_nowait(Task &t)
-	{
-		unique_lock<mutex> lock(m_mutex);
-		if (m_set.empty()) {
-			return false;
-		}
-		Timestamp now = chrono::high_resolution_clock::now();
-		if (now <= m_set.begin()->m_time) {
-			return false;
-		}
-		t = m_set.begin()->m_task;
-		m_set.erase(m_set.begin());
-		m_cond_full.notify_all();
-		return true;
-	}
-
-	template <class Task>
-	double
-	TimeQueue<Task>::when() const
-	{
-		unique_lock<mutex> lock(m_mutex);
-		if (m_set.empty()) {
-			return numeric_limits<double>::infinity();
-		}
-		return chrono::duration<double>(m_set.begin()->m_time - chrono::high_resolution_clock::now()).count();
-	}
-
-	template <class Task>
-	size_t
-	TimeQueue<Task>::size() const
-	{
-		unique_lock<mutex> lock(m_mutex);
-		return m_set.size();
-	}
-
-	template <class Task>
-	bool
-	TimeQueue<Task>::empty() const
-	{
-		unique_lock<mutex> lock(m_mutex);
-		return m_set.empty();
-	}
-
-	template <class Task>
-	list<Task>
-	TimeQueue<Task>::peek(size_t count) const
-	{
-		unique_lock<mutex> lock(m_mutex);
-		list<Task> rv;
-		auto it = m_set.begin();
-		while (count-- && it != m_set.end()) {
-			rv.push_back(it->m_task);
-			++it;
-		}
-		return rv;
-	}
-
-	template <class Task>
-	TimeQueue<Task>::TimeQueue(size_t max_depth) :
-		m_max_queue_depth(max_depth)
-	{
-	}
-
-}
-
-#endif // CRUCIBLE_TIMEQUEUE_H
--- a/include/crucible/version.h
+++ b/include/crucible/version.h
@@ -0,0 +1,8 @@
+#ifndef CRUCIBLE_VERSION_H
+#define CRUCIBLE_VERSION_H
+
+namespace crucible {
+	extern const char *VERSION;
+}
+
+#endif CRUCIBLE_VERSION_H
--- a/include/crucible/workqueue.h
+++ b/include/crucible/workqueue.h
@@ -1,189 +0,0 @@
-#ifndef CRUCIBLE_WORKQUEUE_H
-#define CRUCIBLE_WORKQUEUE_H
-
-#include <crucible/error.h>
-
-#include <condition_variable>
-#include <limits>
-#include <list>
-#include <memory>
-#include <mutex>
-#include <set>
-
-namespace crucible {
-	using namespace std;
-
-	template <class Task>
-	class WorkQueue {
-
-	public:
-		using set_type = set<Task>;
-		using key_type = Task;
-
-	private:
-
-		set_type		m_set;
-		mutable mutex			m_mutex;
-		condition_variable	m_cond_full, m_cond_empty;
-		size_t			m_max_queue_depth;
-
-	public:
-		~WorkQueue();
-		template <class... Args> WorkQueue(size_t max_queue_depth, Args... args);
-		template <class... Args> WorkQueue(Args... args);
-
-		void push(const key_type &name);
-		void push_wait(const key_type &name, size_t limit);
-		void push_nowait(const key_type &name);
-
-		key_type pop();
-		bool pop_nowait(key_type &rv);
-		key_type peek();
-
-		size_t size() const;
-		bool empty();
-		set_type copy();
-		list<Task> peek(size_t count) const;
-
-	};
-
-	template <class Task>
-	WorkQueue<Task>::~WorkQueue()
-	{
-		if (!m_set.empty()) {
-			cerr << "ERROR: " << m_set.size() << " locked items still in WorkQueue " << this << " at destruction" << endl;
-		}
-	}
-
-	template <class Task>
-	void
-	WorkQueue<Task>::push(const key_type &name)
-	{
-		unique_lock<mutex> lock(m_mutex);
-		while (!m_set.count(name) && m_set.size() > m_max_queue_depth) {
-			m_cond_full.wait(lock);
-		}
-		m_set.insert(name);
-		m_cond_empty.notify_all();
-	}
-
-	template <class Task>
-	void
-	WorkQueue<Task>::push_wait(const key_type &name, size_t limit)
-	{
-		unique_lock<mutex> lock(m_mutex);
-		while (!m_set.count(name) && m_set.size() >= limit) {
-			m_cond_full.wait(lock);
-		}
-		m_set.insert(name);
-		m_cond_empty.notify_all();
-	}
-
-	template <class Task>
-	void
-	WorkQueue<Task>::push_nowait(const key_type &name)
-	{
-		unique_lock<mutex> lock(m_mutex);
-		m_set.insert(name);
-		m_cond_empty.notify_all();
-	}
-
-	template <class Task>
-	typename WorkQueue<Task>::key_type
-	WorkQueue<Task>::pop()
-	{
-		unique_lock<mutex> lock(m_mutex);
-		while (m_set.empty()) {
-			m_cond_empty.wait(lock);
-		}
-		key_type rv = *m_set.begin();
-		m_set.erase(m_set.begin());
-		m_cond_full.notify_all();
-		return rv;
-	}
-
-	template <class Task>
-	bool
-	WorkQueue<Task>::pop_nowait(key_type &rv)
-	{
-		unique_lock<mutex> lock(m_mutex);
-		if (m_set.empty()) {
-			return false;
-		}
-		rv = *m_set.begin();
-		m_set.erase(m_set.begin());
-		m_cond_full.notify_all();
-		return true;
-	}
-
-	template <class Task>
-	typename WorkQueue<Task>::key_type
-	WorkQueue<Task>::peek()
-	{
-		unique_lock<mutex> lock(m_mutex);
-		if (m_set.empty()) {
-			return key_type();
-		} else {
-			return *m_set.begin();
-		}
-	}
-
-	template <class Task>
-	size_t
-	WorkQueue<Task>::size() const
-	{
-		unique_lock<mutex> lock(m_mutex);
-		return m_set.size();
-	}
-
-	template <class Task>
-	bool
-	WorkQueue<Task>::empty()
-	{
-		unique_lock<mutex> lock(m_mutex);
-		return m_set.empty();
-	}
-
-	template <class Task>
-	typename WorkQueue<Task>::set_type
-	WorkQueue<Task>::copy()
-	{
-		unique_lock<mutex> lock(m_mutex);
-		return m_set;
-	}
-
-	template <class Task>
-	list<Task>
-	WorkQueue<Task>::peek(size_t count) const
-	{
-		unique_lock<mutex> lock(m_mutex);
-		list<Task> rv;
-		for (auto i : m_set) {
-			if (count--) {
-				rv.push_back(i);
-			} else {
-				break;
-			}
-		}
-		return rv;
-	}
-
-	template <class Task>
-	template <class... Args>
-	WorkQueue<Task>::WorkQueue(Args... args) :
-		m_set(args...),
-		m_max_queue_depth(numeric_limits<size_t>::max())
-	{
-	}
-
-	template <class Task>
-	template <class... Args>
-	WorkQueue<Task>::WorkQueue(size_t max_depth, Args... args) :
-		m_set(args...),
-		m_max_queue_depth(max_depth)
-	{
-	}
-
-}
-
-#endif // CRUCIBLE_WORKQUEUE_H
--- a/lib/.gitignore
+++ b/lib/.gitignore
@@ -0,0 +1 @@
+.version.*
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -1,37 +1,47 @@
-default: libcrucible.so
+TAG ?= $(shell git describe --always --dirty || echo UNKNOWN)

-OBJS = \
-	crc64.o \
+default: libcrucible.so
+%.so: Makefile
+
+CRUCIBLE_OBJS = \
 	chatter.o \
+	cleanup.o \
+	crc64.o \
 	error.o \
-	execpipe.o \
 	extentwalker.o \
 	fd.o \
 	fs.o \
-	interp.o \
 	ntoa.o \
 	path.o \
 	process.o \
 	string.o \
+	task.o \
 	time.o \
 	uuid.o \

 include ../makeflags
+-include ../localconf
+include ../Defines.mk

-LDFLAGS = -shared -luuid
+configure.h: configure.h.in
+	$(TEMPLATE_COMPILER)

-depends.mk: *.c *.cc
-	for x in *.c; do $(CC) $(CFLAGS) -M "$$x"; done > depends.mk.new
-	for x in *.cc; do $(CXX) $(CXXFLAGS) -M "$$x"; done >> depends.mk.new
-	mv -fv depends.mk.new depends.mk
+.depends/%.dep: %.cc configure.h Makefile
+	@mkdir -p .depends
+	$(CXX) $(CXXFLAGS) -M -MF $@ -MT $(<:.cc=.o) $<

-include depends.mk
+depends.mk: $(CRUCIBLE_OBJS:%.o=.depends/%.dep)
+	cat $^ > $@.new
+	mv -f $@.new $@

-%.o: %.c
-	$(CC) $(CFLAGS) -o $@ -c $<
+.version.cc: configure.h Makefile ../makeflags $(CRUCIBLE_OBJS:.o=.cc) ../include/crucible/*.h
+	echo "namespace crucible { const char *VERSION = \"$(TAG)\"; }" > $@.new
+	mv -f $@.new $@

-%.o: %.cc ../include/crucible/%.h
-	$(CXX) $(CXXFLAGS) -o $@ -c $<
+include depends.mk

-libcrucible.so: $(OBJS) Makefile
-	$(CXX) $(LDFLAGS) -o $@ $(OBJS)
+%.o: %.cc ../makeflags
+	$(CXX) $(CXXFLAGS) -fPIC -o $@ -c $<
+
+libcrucible.so: $(CRUCIBLE_OBJS) .version.o
+	$(CXX) $(LDFLAGS) -fPIC -shared -Wl,-soname,$@ -o $@ $^ -luuid
--- a/lib/chatter.cc
+++ b/lib/chatter.cc
@@ -15,8 +15,9 @@
 namespace crucible {
 	using namespace std;

-	static auto_ptr<set<string>> chatter_names;
+	static shared_ptr<set<string>> chatter_names;
 	static const char *SPACETAB = " \t";
+	static bool add_prefix_timestamp = true;

 	static
 	void
@@ -43,28 +44,41 @@ namespace crucible {
 		}
 	}

-	Chatter::Chatter(string name, ostream &os)
-		: m_name(name), m_os(os)
+	Chatter::Chatter(int loglevel, string name, ostream &os)
+		: m_loglevel(loglevel), m_name(name), m_os(os)
 	{
 	}

+	void
+	Chatter::enable_timestamp(bool prefix_timestamp)
+	{
+		add_prefix_timestamp = prefix_timestamp;
+	}
+
 	Chatter::~Chatter()
 	{
 		ostringstream header_stream;

-		time_t ltime;
-		DIE_IF_MINUS_ONE(time(&ltime));
-		struct tm ltm;
-		DIE_IF_ZERO(localtime_r(&ltime, &ltm));
+		if (add_prefix_timestamp) {
+			time_t ltime;
+			DIE_IF_MINUS_ONE(time(&ltime));
+			struct tm ltm;
+			DIE_IF_ZERO(localtime_r(&ltime, &ltm));

-		char buf[1024];
-		DIE_IF_ZERO(strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &ltm));
+			char buf[1024];
+			DIE_IF_ZERO(strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &ltm));

-		header_stream << buf;
-		header_stream << " " << getpid() << "." << gettid();
-		if (!m_name.empty()) {
-			header_stream << " " << m_name;
+			header_stream << buf;
+			header_stream << " " << getpid() << "." << crucible::gettid() << "<" << m_loglevel << ">";
+			if (!m_name.empty()) {
+				header_stream << " " << m_name;
+			}
+		} else {
+			header_stream << "<" << m_loglevel << ">";
+			header_stream << (m_name.empty() ? "thread" : m_name);
+			header_stream << "[" << crucible::gettid() << "]";
 		}
+
 		header_stream << ": ";

 		string out = m_oss.str();
@@ -86,7 +100,7 @@ namespace crucible {
 	}

 	Chatter::Chatter(Chatter &&c)
-		: m_name(c.m_name), m_os(c.m_os), m_oss(c.m_oss.str())
+		: m_loglevel(c.m_loglevel), m_name(c.m_name), m_os(c.m_os), m_oss(c.m_oss.str())
 	{
 		c.m_oss.str("");
 	}
@@ -110,6 +124,7 @@ namespace crucible {
 		} else if (!chatter_names->empty()) {
 			cerr << "CRUCIBLE_CHATTER does not list '" << m_file << "' or '" << m_pretty_function << "'" << endl;
 		}
+		(void)m_line; // not implemented yet
 		// cerr << "ChatterBox " << reinterpret_cast<void*>(this) << " constructed" << endl;
 	}

--- a/lib/cleanup.cc
+++ b/lib/cleanup.cc
@@ -0,0 +1,17 @@
+#include <crucible/cleanup.h>
+
+namespace crucible {
+
+	Cleanup::Cleanup(function<void()> func) :
+		m_cleaner(func)
+	{
+	}
+
+	Cleanup::~Cleanup()
+	{
+		if (m_cleaner) {
+			m_cleaner();
+		}
+	}
+
+}
--- a/lib/configure.h.in
+++ b/lib/configure.h.in
@@ -0,0 +1,6 @@
+#ifndef _CONFIGURE_H
+
+#define ETC_PREFIX "@ETC_PREFIX@"
+
+#define _CONFIGURE_H
+#endif
--- a/lib/crc64.cc
+++ b/lib/crc64.cc
@@ -1,3 +1,31 @@
+/* crc64.c -- compute CRC-64
+ * Copyright (C) 2013 Mark Adler
+ * Version 1.4  16 Dec 2013  Mark Adler
+ */
+
+/*
+ This software is provided 'as-is', without any express or implied
+ warranty.  In no event will the author be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ Mark Adler
+ madler@alumni.caltech.edu
+ */
+
+/* Substantially modified by Paul Jones for usage in bees */
+
 #include "crucible/crc64.h"

 #define POLY64REV 0xd800000000000000ULL
@@ -5,13 +33,16 @@
 namespace crucible {

 	static bool init = false;
-	static uint64_t CRCTable[256];
+	static uint64_t CRCTable[8][256];

 	static void init_crc64_table()
 	{
 		if (!init) {
-			for (int i = 0; i <= 255; i++) {
-				uint64_t part = i;
+			uint64_t crc;
+
+			// Generate CRCs for all single byte sequences
+			for (int n = 0; n < 256; n++) {
+				uint64_t part = n;
 				for (int j = 0; j < 8; j++) {
 					if (part & 1) {
 						part = (part >> 1) ^ POLY64REV;
@@ -19,37 +50,53 @@ namespace crucible {
 						part >>= 1;
 					}
 				}
-				CRCTable[i] = part;
+				CRCTable[0][n] = part;
+			}
+
+			// Generate nested CRC table for slice-by-8 lookup
+			for (int n = 0; n < 256; n++) {
+				crc = CRCTable[0][n];
+				for (int k = 1; k < 8; k++) {
+					crc = CRCTable[0][crc & 0xff] ^ (crc >> 8);
+					CRCTable[k][n] = crc;
+				}
 			}
 			init = true;
 		}
 	}

-	uint64_t
-	Digest::CRC::crc64(const char *s)
-	{
-		init_crc64_table();
-
-		uint64_t crc = 0;
-		for (; *s; s++) {
-			uint64_t temp1 = crc >> 8;
-			uint64_t temp2 = CRCTable[(crc ^ static_cast<uint64_t>(*s)) & 0xff];
-			crc = temp1 ^ temp2;
-		}
-
-		return crc;
-	}
-
 	uint64_t
 	Digest::CRC::crc64(const void *p, size_t len)
 	{
 		init_crc64_table();
-
+		const unsigned char *next = static_cast<const unsigned char *>(p);
 		uint64_t crc = 0;
-		for (const unsigned char *s = static_cast<const unsigned char *>(p); len; --len) {
-			uint64_t temp1 = crc >> 8;
-			uint64_t temp2 = CRCTable[(crc ^ *s++) & 0xff];
-			crc = temp1 ^ temp2;
+
+		// Process individual bytes until we reach an 8-byte aligned pointer
+		while (len && (reinterpret_cast<uintptr_t>(next) & 7) != 0) {
+			crc = CRCTable[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+			len--;
+		}
+
+		// Fast middle processing, 8 bytes (aligned!) per loop
+		while (len >= 8) {
+			crc ^= *(reinterpret_cast< const uint64_t *>(next));
+			crc = CRCTable[7][crc & 0xff] ^
+				  CRCTable[6][(crc >> 8) & 0xff] ^
+				  CRCTable[5][(crc >> 16) & 0xff] ^
+				  CRCTable[4][(crc >> 24) & 0xff] ^
+				  CRCTable[3][(crc >> 32) & 0xff] ^
+				  CRCTable[2][(crc >> 40) & 0xff] ^
+				  CRCTable[1][(crc >> 48) & 0xff] ^
+				  CRCTable[0][crc >> 56];
+			next += 8;
+			len -= 8;
+		}
+
+		// Process remaining bytes (can't be larger than 8)
+		while (len) {
+			crc = CRCTable[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+			len--;
 		}

 		return crc;
--- a/lib/error.cc
+++ b/lib/error.cc
@@ -32,7 +32,7 @@ namespace crucible {

 	// FIXME:  could probably avoid some of these levels of indirection
 	static
-	function<void(string s)> current_catch_explainer = [&](string s) {
+	function<void(string s)> current_catch_explainer = [](string s) {
 		cerr << s << endl;
 	};

--- a/lib/execpipe.cc
+++ b/lib/execpipe.cc
@@ -1,104 +0,0 @@
-#include "crucible/execpipe.h"
-
-#include "crucible/chatter.h"
-#include "crucible/error.h"
-#include "crucible/process.h"
-
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <sys/wait.h>
-#include <unistd.h>
-
-namespace crucible {
-	using namespace std;
-
-	void
-	redirect_stdin(const Fd &child_fd)
-	{
-		dup2_or_die(child_fd, STDIN_FILENO);
-	}
-
-	void
-	redirect_stdin_stdout(const Fd &child_fd)
-	{
-		dup2_or_die(child_fd, STDOUT_FILENO);
-		dup2_or_die(child_fd, STDIN_FILENO);
-	}
-
-	void
-	redirect_stdin_stdout_stderr(const Fd &child_fd)
-	{
-		dup2_or_die(child_fd, STDERR_FILENO);
-		dup2_or_die(child_fd, STDOUT_FILENO);
-		dup2_or_die(child_fd, STDIN_FILENO);
-	}
-
-	void
-	redirect_stdout_stderr(const Fd &child_fd)
-	{
-		dup2_or_die(child_fd, STDERR_FILENO);
-		dup2_or_die(child_fd, STDOUT_FILENO);
-	}
-
-	void
-	redirect_stdout(const Fd &child_fd)
-	{
-		dup2_or_die(child_fd, STDOUT_FILENO);
-	}
-
-	void
-	redirect_stderr(const Fd &child_fd)
-	{
-		dup2_or_die(child_fd, STDERR_FILENO);
-	}
-
-	Fd popen(function<int()> f, function<void(const Fd &child_fd)> import_fd_fn)
-	{
-		Fd parent_fd, child_fd;
-		{
-			pair<Fd, Fd> fd_pair = socketpair_or_die();
-			parent_fd = fd_pair.first;
-			child_fd = fd_pair.second;
-		}
-
-		pid_t fv;
-		DIE_IF_MINUS_ONE(fv = fork());
-
-		if (fv) {
-			child_fd->close();
-			return parent_fd;
-		} else {
-			int rv = EXIT_FAILURE;
-			catch_all([&]() {
-				parent_fd->close();
-				import_fd_fn(child_fd);
-				// system("ls -l /proc/$$/fd/ >&2");
-
-				rv = f();
-			});
-			_exit(rv);
-			cerr << "PID " << getpid() << " TID " << gettid() << "STILL ALIVE" << endl;
-			system("ls -l /proc/$$/task/ >&2");
-			exit(EXIT_FAILURE);
-		}
-	}
-
-	string
-	read_all(Fd fd, size_t max_bytes, size_t chunk_bytes)
-	{
-		char buf[chunk_bytes];
-		string str;
-		size_t rv;
-		while (1) {
-			read_partial_or_die(fd, static_cast<void *>(buf), chunk_bytes, rv);
-			if (rv == 0) {
-				break;
-			}
-			if (max_bytes - str.size() < rv) {
-				THROW_ERROR(out_of_range, "Output size limit " << max_bytes << " exceeded by appending " << rv << " bytes read to " << str.size() << " already in string");
-			}
-			str.append(buf, rv);
-		}
-		return str;
-	}
-}
--- a/lib/extentwalker.cc
+++ b/lib/extentwalker.cc
@@ -6,7 +6,6 @@
 #include "crucible/limits.h"
 #include "crucible/string.h"

-
 namespace crucible {
 	using namespace std;

@@ -15,7 +14,6 @@ namespace crucible {
 	// fm_start, fm_length, fm_flags, m_extents
 	// fe_logical, fe_physical, fe_length, fe_flags

-	static const off_t MAX_OFFSET = numeric_limits<off_t>::max();
 	static const off_t FIEMAP_BLOCK_SIZE = 4096;

 	static bool __ew_do_log = getenv("EXTENTWALKER_DEBUG");
@@ -79,17 +77,6 @@ namespace crucible {
 			<< "] }";
 	}

-	Extent::Extent() :
-		m_begin(0),
-		m_end(0),
-		m_physical(0),
-		m_flags(0),
-		m_physical_len(0),
-		m_logical_len(0),
-		m_offset(0)
-	{
-	}
-
 	Extent::operator bool() const
 	{
 		THROW_CHECK2(invalid_argument, m_begin, m_end, m_end >= m_begin);
@@ -109,6 +96,18 @@ namespace crucible {
 		return m_begin == that.m_begin && m_end == that.m_end && m_physical == that.m_physical && m_flags == that.m_flags;
 	}

+	bool
+	Extent::compressed() const
+	{
+		return m_flags & FIEMAP_EXTENT_ENCODED;
+	}
+
+	uint64_t
+	Extent::bytenr() const
+	{
+		return compressed() ? m_physical : m_physical - m_offset;
+	}
+
 	ExtentWalker::ExtentWalker(Fd fd) :
 		m_fd(fd),
 		m_current(m_extents.begin())
@@ -332,7 +331,9 @@ namespace crucible {
 		THROW_CHECK1(runtime_error, new_vec.size(), !new_vec.empty());

 		// Allow last extent to extend beyond desired range (e.g. at EOF)
-		THROW_CHECK2(runtime_error, ipos, new_vec.rbegin()->m_end, ipos <= new_vec.rbegin()->m_end);
+		// ...but that's not what this does
+		// THROW_CHECK3(runtime_error, ipos, new_vec.rbegin()->m_end, m_stat.st_size, ipos <= new_vec.rbegin()->m_end);
+
 		// If we have the last extent in the file, truncate it to the file size.
 		if (ipos >= m_stat.st_size) {
 			THROW_CHECK2(runtime_error, new_vec.rbegin()->m_begin, m_stat.st_size, m_stat.st_size > new_vec.rbegin()->m_begin);
@@ -468,7 +469,7 @@ namespace crucible {
 	BtrfsExtentWalker::Vec
 	BtrfsExtentWalker::get_extent_map(off_t pos)
 	{
-		BtrfsIoctlSearchKey sk;
+		BtrfsIoctlSearchKey sk(sc_extent_fetch_max * (sizeof(btrfs_file_extent_item) + sizeof(btrfs_ioctl_search_header)));
 		if (!m_root_fd) {
 			m_root_fd = m_fd;
 		}
@@ -519,25 +520,26 @@ namespace crucible {

 			auto type = call_btrfs_get(btrfs_stack_file_extent_type, i.m_data);
 			off_t len = -1;
-                        switch (type) {
-                                default:
+			switch (type) {
+				default:
 					cerr << "Unhandled file extent type " << type << " in root " << m_tree_id << " ino " << m_stat.st_ino << endl;
 					break;
-                                case BTRFS_FILE_EXTENT_INLINE:
+				case BTRFS_FILE_EXTENT_INLINE:
 					len = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data));
 					e.m_flags |= FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED;
 					// Inline extents are never obscured, so don't bother filling in m_physical_len, etc.
-                                        break;
-                                case BTRFS_FILE_EXTENT_PREALLOC:
+					break;
+				case BTRFS_FILE_EXTENT_PREALLOC:
 					e.m_flags |= Extent::PREALLOC;
-                                case BTRFS_FILE_EXTENT_REG: {
+					// fallthrough
+				case BTRFS_FILE_EXTENT_REG: {
 					e.m_physical = call_btrfs_get(btrfs_stack_file_extent_disk_bytenr, i.m_data);

 					// This is the length of the full extent (decompressed)
-                                        off_t ram = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data));
+					off_t ram = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data));

 					// This is the length of the part of the extent appearing in the file (decompressed)
-                                        len = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_num_bytes, i.m_data));
+					len = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_num_bytes, i.m_data));

 					// This is the offset from start of on-disk extent to the part we see in the file (decompressed)
 					// May be negative due to the kind of bug we're stuck with forever, so no cast range check
--- a/lib/fd.cc
+++ b/lib/fd.cc
@@ -174,11 +174,13 @@ namespace crucible {
 	static const struct bits_ntoa_table mmap_flags_table[] = {
 		NTOA_TABLE_ENTRY_BITS(MAP_SHARED),
 		NTOA_TABLE_ENTRY_BITS(MAP_PRIVATE),
+#ifdef MAP_32BIT
 		NTOA_TABLE_ENTRY_BITS(MAP_32BIT),
+#endif
 		NTOA_TABLE_ENTRY_BITS(MAP_ANONYMOUS),
 		NTOA_TABLE_ENTRY_BITS(MAP_DENYWRITE),
 		NTOA_TABLE_ENTRY_BITS(MAP_EXECUTABLE),
-#if MAP_FILE
+#ifdef MAP_FILE
 		NTOA_TABLE_ENTRY_BITS(MAP_FILE),
 #endif
 		NTOA_TABLE_ENTRY_BITS(MAP_FIXED),
@@ -230,6 +232,14 @@ namespace crucible {
 		}
 	}

+	void
+	ftruncate_or_die(int fd, off_t size)
+	{
+		if (::ftruncate(fd, size)) {
+			THROW_ERRNO("ftruncate: " << name_fd(fd) << " size " << size);
+		}
+	}
+
 	string
 	socket_domain_ntoa(int domain)
 	{
@@ -426,6 +436,27 @@ namespace crucible {
 		return pread_or_die(fd, text.data(), text.size(), offset);
 	}

+	template<>
+	void
+	pwrite_or_die<vector<uint8_t>>(int fd, const vector<uint8_t> &text, off_t offset)
+	{
+		return pwrite_or_die(fd, text.data(), text.size(), offset);
+	}
+
+	template<>
+	void
+	pwrite_or_die<vector<char>>(int fd, const vector<char> &text, off_t offset)
+	{
+		return pwrite_or_die(fd, text.data(), text.size(), offset);
+	}
+
+	template<>
+	void
+	pwrite_or_die<string>(int fd, const string &text, off_t offset)
+	{
+		return pwrite_or_die(fd, text.data(), text.size(), offset);
+	}
+
 	Stat::Stat()
 	{
 		memset_zero<stat>(this);
@@ -459,6 +490,20 @@ namespace crucible {
 		lstat(filename);
 	}

+	int
+	ioctl_iflags_get(int fd)
+	{
+		int attr = 0;
+		DIE_IF_MINUS_ONE(ioctl(fd, FS_IOC_GETFLAGS, &attr));
+		return attr;
+	}
+
+	void
+	ioctl_iflags_set(int fd, int attr)
+	{
+		DIE_IF_MINUS_ONE(ioctl(fd, FS_IOC_SETFLAGS, &attr));
+	}
+
 	string
 	readlink_or_die(const string &path)
 	{
@@ -484,6 +529,22 @@ namespace crucible {
 		THROW_ERROR(runtime_error, "readlink: maximum buffer size exceeded");
 	}

+	string
+	relative_path()
+	{
+		return __relative_path;
+	}
+
+	void
+	set_relative_path(string path)
+	{
+		path = path + "/";
+		for (string::size_type i = path.find("//"); i != string::npos; i = path.find("//")) {
+			path.erase(i, 1);
+		}
+		__relative_path = path;
+	}
+
 	// Turn a FD into a human-recognizable filename OR an error message.
 	string
 	name_fd(int fd)
@@ -491,7 +552,12 @@ namespace crucible {
 		try {
 			ostringstream oss;
 			oss << "/proc/self/fd/" << fd;
-			return readlink_or_die(oss.str());
+			string path = readlink_or_die(oss.str());
+			if (!__relative_path.empty() && 0 == path.find(__relative_path))
+			{
+				path.erase(0, __relative_path.length());
+			}
+			return path;
 		} catch (exception &e) {
 			return string(e.what());
 		}
--- a/lib/fs.cc
+++ b/lib/fs.cc
@@ -468,6 +468,7 @@ namespace crucible {
 		static const bits_ntoa_table table[] = {
 			NTOA_TABLE_ENTRY_ENUM(BTRFS_COMPRESS_ZLIB),
 			NTOA_TABLE_ENTRY_ENUM(BTRFS_COMPRESS_LZO),
+			NTOA_TABLE_ENTRY_ENUM(BTRFS_COMPRESS_ZSTD),
 			NTOA_TABLE_ENTRY_END()
 		};
 		return bits_ntoa(compress_type, table);
@@ -625,7 +626,7 @@ namespace crucible {
 	void
 	Fiemap::do_ioctl(int fd)
 	{
-		CHECK_CONSTRAINT(m_min_count, m_min_count <= m_max_count);
+		THROW_CHECK1(out_of_range, m_min_count, m_min_count <= m_max_count);

 		auto extent_count = m_min_count;
 		vector<char> ioctl_arg = vector_copy_struct<fiemap>(this);
@@ -700,18 +701,36 @@ namespace crucible {
 	BtrfsIoctlSearchHeader::set_data(const vector<char> &v, size_t offset)
 	{
 		THROW_CHECK2(invalid_argument, offset, v.size(), offset + sizeof(btrfs_ioctl_search_header) <= v.size());
-		memcpy(this, &v[offset], sizeof(btrfs_ioctl_search_header));
+		*static_cast<btrfs_ioctl_search_header *>(this) = *reinterpret_cast<const btrfs_ioctl_search_header *>(&v[offset]);
 		offset += sizeof(btrfs_ioctl_search_header);
 		THROW_CHECK2(invalid_argument, offset + len, v.size(), offset + len <= v.size());
 		m_data = vector<char>(&v[offset], &v[offset + len]);
 		return offset + len;
 	}

+	bool
+	BtrfsIoctlSearchHeader::operator<(const BtrfsIoctlSearchHeader &that) const
+	{
+		return tie(objectid, type, offset, len, transid) < tie(that.objectid, that.type, that.offset, that.len, that.transid);
+	}
+
 	bool
 	BtrfsIoctlSearchKey::do_ioctl_nothrow(int fd)
 	{
-		vector<char> ioctl_arg = vector_copy_struct<btrfs_ioctl_search_key>(this);
-		ioctl_arg.resize(sizeof(btrfs_ioctl_search_args_v2) + m_buf_size, 0);
+		// Normally we like to be paranoid and fill empty bytes with zero,
+		// but these buffers can be huge.  80% of a 4GHz CPU huge.
+
+		// Keep the ioctl buffer from one run to the next to save on malloc costs
+		size_t target_buf_size = sizeof(btrfs_ioctl_search_args_v2) + m_buf_size;
+
+		thread_local vector<char> ioctl_arg;
+		if (ioctl_arg.size() < m_buf_size) {
+			ioctl_arg = vector_copy_struct<btrfs_ioctl_search_key>(this);
+			ioctl_arg.resize(target_buf_size);
+		} else {
+			memcpy(ioctl_arg.data(), static_cast<btrfs_ioctl_search_key*>(this), sizeof(btrfs_ioctl_search_key));
+		}
+
 		btrfs_ioctl_search_args_v2 *ioctl_ptr = reinterpret_cast<btrfs_ioctl_search_args_v2 *>(ioctl_arg.data());

 		ioctl_ptr->buf_size = m_buf_size;
@@ -725,13 +744,12 @@ namespace crucible {
 		static_cast<btrfs_ioctl_search_key&>(*this) = ioctl_ptr->key;

 		m_result.clear();
-		m_result.reserve(nr_items);

 		size_t offset = pointer_distance(ioctl_ptr->buf, ioctl_ptr);
 		for (decltype(nr_items) i = 0; i < nr_items; ++i) {
 			BtrfsIoctlSearchHeader item;
 			offset = item.set_data(ioctl_arg, offset);
-			m_result.push_back(item);
+			m_result.insert(item);
 		}

 		return true;
@@ -834,7 +852,7 @@ namespace crucible {
 	}

 	string
-	btrfs_search_objectid_ntoa(unsigned objectid)
+	btrfs_search_objectid_ntoa(uint64_t objectid)
 	{
 		static const bits_ntoa_table table[] = {
 			NTOA_TABLE_ENTRY_ENUM(BTRFS_ROOT_TREE_OBJECTID),
@@ -906,7 +924,7 @@ namespace crucible {
 	ostream &
 	operator<<(ostream &os, const BtrfsIoctlSearchHeader &hdr)
 	{
-		os << "BtrfsIoctlSearchHeader { " 
+		os << "BtrfsIoctlSearchHeader { "
 			<< static_cast<const btrfs_ioctl_search_header &>(hdr)
 			<< ", data = ";
 		hexdump(os, hdr.m_data);
@@ -916,7 +934,7 @@ namespace crucible {
 	ostream &
 	operator<<(ostream &os, const BtrfsIoctlSearchKey &key)
 	{
-		os << "BtrfsIoctlSearchKey { " 
+		os << "BtrfsIoctlSearchKey { "
 			<< static_cast<const btrfs_ioctl_search_key &>(key)
 			<< ", buf_size = " << key.m_buf_size
 			<< ", buf[" << key.m_result.size() << "] = {";
--- a/lib/interp.cc
+++ b/lib/interp.cc
@@ -1,96 +0,0 @@
-#include "crucible/interp.h"
-
-#include "crucible/chatter.h"
-
-namespace crucible {
-	using namespace std;
-
-	int
-	Proc::exec(const ArgList &args)
-	{
-		return m_cmd(args);
-	}
-
-	Proc::Proc(const function<int(const ArgList &)> &f) :
-		m_cmd(f)
-	{
-	}
-
-	Command::~Command()
-	{
-	}
-
-	ArgList::ArgList(const char **argv)
-	{
-		while (argv && *argv) {
-			push_back(*argv++);
-		}
-	}
-
-	ArgList::ArgList(const vector<string> &&that) :
-		vector<string>(that)
-	{
-	}
-
-	Interp::~Interp()
-	{
-	}
-
-	Interp::Interp(const map<string, shared_ptr<Command> > &cmdlist) :
-		m_commands(cmdlist)
-	{
-	}
-
-	void
-	Interp::add_command(const string &name, const shared_ptr<Command> &command)
-	{
-		m_commands[name] = command;
-	}
-
-	int
-	Interp::exec(const ArgList &args)
-	{
-		auto next_arg = args.begin();
-		++next_arg;
-		return m_commands.at(args[0])->exec(vector<string>(next_arg, args.end()));
-	}
-
-	ArgParser::~ArgParser()
-	{
-	}
-
-	ArgParser::ArgParser()
-	{
-	}
-
-	void
-	ArgParser::add_opt(string opt, ArgActor actor)
-	{
-		m_string_opts[opt] = actor;
-	}
-
-	void
-	ArgParser::parse_backend(void *t, const ArgList &args)
-	{
-		bool quote_args = false;
-		for (string arg : args) {
-			if (quote_args) {
-				cerr << "arg: '" << arg << "'" << endl;
-				continue;
-			}
-			if (arg == "--") {
-				quote_args = true;
-				continue;
-			}
-			if (arg.compare(0, 2, "--") == 0) {
-				auto found = m_string_opts.find(arg.substr(2, string::npos));
-				if (found != m_string_opts.end()) {
-					found->second.predicate(t, "foo");
-				}
-				(void)t;
-			}
-		}
-	}
-
-
-};
--- a/lib/ntoa.cc
+++ b/lib/ntoa.cc
@@ -7,7 +7,7 @@
 namespace crucible {
 	using namespace std;

-	string bits_ntoa(unsigned long n, const bits_ntoa_table *table)
+	string bits_ntoa(unsigned long long n, const bits_ntoa_table *table)
 	{
 		string out;
 		while (n && table->a) {
--- a/lib/process.cc
+++ b/lib/process.cc
@@ -3,6 +3,7 @@
 #include "crucible/chatter.h"
 #include "crucible/error.h"

+#include <cstdlib>
 #include <utility>

 // for gettid()
@@ -109,13 +110,43 @@ namespace crucible {
 		}
 	}

-	template<>
-	struct ResourceHandle<Process::id, Process>;
-
 	pid_t
 	gettid()
 	{
 		return syscall(SYS_gettid);
 	}

+	double
+	getloadavg1()
+	{
+		double loadavg[1];
+		const int rv = ::getloadavg(loadavg, 1);
+		if (rv != 1) {
+			THROW_ERRNO("getloadavg(..., 1)");
+		}
+		return loadavg[0];
+	}
+
+	double
+	getloadavg5()
+	{
+		double loadavg[2];
+		const int rv = ::getloadavg(loadavg, 2);
+		if (rv != 2) {
+			THROW_ERRNO("getloadavg(..., 2)");
+		}
+		return loadavg[1];
+	}
+
+	double
+	getloadavg15()
+	{
+		double loadavg[3];
+		const int rv = ::getloadavg(loadavg, 3);
+		if (rv != 3) {
+			THROW_ERRNO("getloadavg(..., 3)");
+		}
+		return loadavg[2];
+	}
+
 }
--- a/lib/task.cc
+++ b/lib/task.cc
@@ -0,0 +1,644 @@
+#include "crucible/task.h"
+
+#include "crucible/cleanup.h"
+#include "crucible/error.h"
+#include "crucible/process.h"
+#include "crucible/time.h"
+
+#include <atomic>
+#include <cmath>
+#include <condition_variable>
+#include <list>
+#include <map>
+#include <mutex>
+#include <set>
+#include <thread>
+
+namespace crucible {
+	using namespace std;
+
+	static thread_local weak_ptr<TaskState> tl_current_task_wp;
+
+	class TaskState : public enable_shared_from_this<TaskState> {
+		const function<void()> 			m_exec_fn;
+		const string				m_title;
+		TaskId					m_id;
+
+		static atomic<TaskId>			s_next_id;
+	public:
+		TaskState(string title, function<void()> exec_fn);
+
+		void exec();
+		string title() const;
+		TaskId id() const;
+	};
+
+	atomic<TaskId> TaskState::s_next_id;
+
+	class TaskConsumer;
+	class TaskMasterState;
+
+	class TaskMasterState : public enable_shared_from_this<TaskMasterState> {
+		mutex 					m_mutex;
+		condition_variable 			m_condvar;
+		list<shared_ptr<TaskState>>		m_queue;
+		size_t					m_thread_max;
+		size_t					m_thread_min = 0;
+		set<shared_ptr<TaskConsumer>>		m_threads;
+		shared_ptr<thread>			m_load_tracking_thread;
+		double					m_load_target = 0;
+		double					m_prev_loadavg;
+		size_t					m_configured_thread_max;
+		double					m_thread_target;
+
+	friend class TaskConsumer;
+	friend class TaskMaster;
+
+		void start_threads_nolock();
+		void start_stop_threads();
+		void set_thread_count(size_t thread_max);
+		void set_thread_min_count(size_t thread_min);
+		void adjust_thread_count();
+		size_t calculate_thread_count_nolock();
+		void set_loadavg_target(double target);
+		void loadavg_thread_fn();
+
+	public:
+		~TaskMasterState();
+		TaskMasterState(size_t thread_max = thread::hardware_concurrency());
+
+		static void push_back(shared_ptr<TaskState> task);
+		static void push_front(shared_ptr<TaskState> task);
+		size_t get_queue_count();
+	};
+
+	class TaskConsumer : public enable_shared_from_this<TaskConsumer> {
+		weak_ptr<TaskMasterState>	m_master;
+		thread				m_thread;
+		shared_ptr<TaskState>		m_current_task;
+
+		void consumer_thread();
+		shared_ptr<TaskState> current_task_locked();
+	public:
+		TaskConsumer(weak_ptr<TaskMasterState> tms);
+		shared_ptr<TaskState> current_task();
+	friend class TaskMaster;
+	friend class TaskMasterState;
+	};
+
+	static shared_ptr<TaskMasterState> s_tms = make_shared<TaskMasterState>();
+
+	TaskState::TaskState(string title, function<void()> exec_fn) :
+		m_exec_fn(exec_fn),
+		m_title(title),
+		m_id(++s_next_id)
+	{
+		THROW_CHECK0(invalid_argument, !m_title.empty());
+	}
+
+	void
+	TaskState::exec()
+	{
+		THROW_CHECK0(invalid_argument, m_exec_fn);
+		THROW_CHECK0(invalid_argument, !m_title.empty());
+
+		char buf[24];
+		memset(buf, '\0', sizeof(buf));
+		DIE_IF_MINUS_ERRNO(pthread_getname_np(pthread_self(), buf, sizeof(buf)));
+		Cleanup pthread_name_cleaner([&]() {
+			pthread_setname_np(pthread_self(), buf);
+		});
+		DIE_IF_MINUS_ERRNO(pthread_setname_np(pthread_self(), m_title.c_str()));
+
+		weak_ptr<TaskState> this_task_wp = shared_from_this();
+		Cleanup current_task_cleaner([&]() {
+			swap(this_task_wp, tl_current_task_wp);
+		});
+		swap(this_task_wp, tl_current_task_wp);
+
+		m_exec_fn();
+	}
+
+	string
+	TaskState::title() const
+	{
+		THROW_CHECK0(runtime_error, !m_title.empty());
+		return m_title;
+	}
+
+	TaskId
+	TaskState::id() const
+	{
+		return m_id;
+	}
+
+	TaskMasterState::TaskMasterState(size_t thread_max) :
+		m_thread_max(thread_max),
+		m_configured_thread_max(thread_max),
+		m_thread_target(thread_max)
+	{
+	}
+
+	void
+	TaskMasterState::start_threads_nolock()
+	{
+		while (m_threads.size() < m_thread_max) {
+			m_threads.insert(make_shared<TaskConsumer>(shared_from_this()));
+		}
+	}
+
+	void
+	TaskMasterState::start_stop_threads()
+	{
+		unique_lock<mutex> lock(m_mutex);
+		while (m_threads.size() != m_thread_max) {
+			if (m_threads.size() < m_thread_max) {
+				m_threads.insert(make_shared<TaskConsumer>(shared_from_this()));
+			} else if (m_threads.size() > m_thread_max) {
+				m_condvar.wait(lock);
+			}
+		}
+	}
+
+	void
+	TaskMasterState::push_back(shared_ptr<TaskState> task)
+	{
+		THROW_CHECK0(runtime_error, task);
+		unique_lock<mutex> lock(s_tms->m_mutex);
+		s_tms->m_queue.push_back(task);
+		s_tms->m_condvar.notify_all();
+		s_tms->start_threads_nolock();
+	}
+
+	void
+	TaskMasterState::push_front(shared_ptr<TaskState> task)
+	{
+		THROW_CHECK0(runtime_error, task);
+		unique_lock<mutex> lock(s_tms->m_mutex);
+		s_tms->m_queue.push_front(task);
+		s_tms->m_condvar.notify_all();
+		s_tms->start_threads_nolock();
+	}
+
+	TaskMasterState::~TaskMasterState()
+	{
+		set_thread_count(0);
+	}
+
+	size_t
+	TaskMaster::get_queue_count()
+	{
+		unique_lock<mutex> lock(s_tms->m_mutex);
+		return s_tms->m_queue.size();
+	}
+
+	ostream &
+	TaskMaster::print_queue(ostream &os)
+	{
+		unique_lock<mutex> lock(s_tms->m_mutex);
+		os << "Queue (size " << s_tms->m_queue.size() << "):" << endl;
+		size_t counter = 0;
+		for (auto i : s_tms->m_queue) {
+			os << "Queue #" << ++counter << " Task ID " << i->id() << " " << i->title() << endl;
+		}
+		return os << "Queue End" << endl;
+	}
+
+	ostream &
+	TaskMaster::print_workers(ostream &os)
+	{
+		unique_lock<mutex> lock(s_tms->m_mutex);
+		os << "Workers (size " << s_tms->m_threads.size() << "):" << endl;
+		size_t counter = 0;
+		for (auto i : s_tms->m_threads) {
+			os << "Worker #" << ++counter << " ";
+			auto task = i->current_task_locked();
+			if (task) {
+				os << "Task ID " << task->id() << " " << task->title();
+			} else {
+				os << "(idle)";
+			}
+			os << endl;
+		}
+		return os << "Workers End" << endl;
+	}
+
+	size_t
+	TaskMasterState::calculate_thread_count_nolock()
+	{
+		if (m_load_target == 0) {
+			// No limits, no stats, use configured thread count
+			return m_configured_thread_max;
+		}
+
+		if (m_configured_thread_max == 0) {
+			// Not a lot of choice here, and zeros break the algorithm
+			return 0;
+		}
+
+		const double loadavg = getloadavg1();
+
+		static const double load_exp = exp(-5.0 / 60.0);
+
+		// Averages are fun, but want to know the load from the last 5 seconds.
+		// Invert the load average function:
+		// LA = LA * load_exp + N * (1 - load_exp)
+		// LA2 - LA1 = LA1 * load_exp + N * (1 - load_exp) - LA1
+		// LA2 - LA1 + LA1 = LA1 * load_exp + N * (1 - load_exp)
+		// LA2 - LA1 + LA1 - LA1 * load_exp = N * (1 - load_exp)
+		// LA2 - LA1 * load_exp = N * (1 - load_exp)
+		// LA2 / (1 - load_exp) - (LA1 * load_exp / 1 - load_exp) = N
+		// (LA2 - LA1 * load_exp) / (1 - load_exp) = N
+		// except for rounding error which might make this just a bit below zero.
+		const double current_load = max(0.0, (loadavg - m_prev_loadavg * load_exp) / (1 - load_exp));
+
+		m_prev_loadavg = loadavg;
+
+		// Change the thread target based on the
+		// difference between current and desired load
+		// but don't get too close all at once due to rounding and sample error.
+		// If m_load_target < 1.0 then we are just doing PWM with one thread.
+
+		if (m_load_target <= 1.0) {
+			m_thread_target = 1.0;
+		} else if (m_load_target - current_load >= 1.0) {
+			m_thread_target += (m_load_target - current_load - 1.0) / 2.0;
+		} else if (m_load_target < current_load) {
+			m_thread_target += m_load_target - current_load;
+		}
+
+		// Cannot exceed configured maximum thread count or less than zero
+		m_thread_target = min(max(0.0, m_thread_target), double(m_configured_thread_max));
+
+		// Convert to integer but keep within range
+		const size_t rv = max(m_thread_min, min(size_t(ceil(m_thread_target)), m_configured_thread_max));
+
+		return rv;
+	}
+
+	void
+	TaskMasterState::adjust_thread_count()
+	{
+		unique_lock<mutex> lock(m_mutex);
+		size_t new_thread_max = calculate_thread_count_nolock();
+		size_t old_thread_max = m_thread_max;
+		m_thread_max = new_thread_max;
+
+		// If we are reducing the number of threads we have to wake them up so they can exit their loops
+		// If we are increasing the number of threads we have to notify start_stop_threads it can stop waiting for threads to stop
+		if (new_thread_max != old_thread_max) {
+			m_condvar.notify_all();
+			start_threads_nolock();
+		}
+	}
+
+	void
+	TaskMasterState::set_thread_count(size_t thread_max)
+	{
+		unique_lock<mutex> lock(m_mutex);
+		m_configured_thread_max = thread_max;
+		lock.unlock();
+		adjust_thread_count();
+		start_stop_threads();
+	}
+
+	void
+	TaskMaster::set_thread_count(size_t thread_max)
+	{
+		s_tms->set_thread_count(thread_max);
+	}
+
+	void
+	TaskMasterState::set_thread_min_count(size_t thread_min)
+	{
+		unique_lock<mutex> lock(m_mutex);
+		m_thread_min = thread_min;
+		lock.unlock();
+		adjust_thread_count();
+		start_stop_threads();
+	}
+
+	void
+	TaskMaster::set_thread_min_count(size_t thread_min)
+	{
+		s_tms->set_thread_min_count(thread_min);
+	}
+
+	void
+	TaskMasterState::loadavg_thread_fn()
+	{
+		pthread_setname_np(pthread_self(), "load_tracker");
+		while (true) {
+			adjust_thread_count();
+			nanosleep(5.0);
+		}
+	}
+
+	void
+	TaskMasterState::set_loadavg_target(double target)
+	{
+		THROW_CHECK1(out_of_range, target, target >= 0);
+
+		unique_lock<mutex> lock(m_mutex);
+		m_load_target = target;
+		m_prev_loadavg = getloadavg1();
+
+		if (target && !m_load_tracking_thread) {
+			m_load_tracking_thread = make_shared<thread>([=] () { loadavg_thread_fn(); });
+			m_load_tracking_thread->detach();
+		}
+	}
+
+	void
+	TaskMaster::set_loadavg_target(double target)
+	{
+		s_tms->set_loadavg_target(target);
+	}
+
+	void
+	TaskMaster::set_thread_count()
+	{
+		set_thread_count(thread::hardware_concurrency());
+	}
+
+	Task::Task(shared_ptr<TaskState> pts) :
+		m_task_state(pts)
+	{
+	}
+
+	Task::Task(string title, function<void()> exec_fn) :
+		m_task_state(make_shared<TaskState>(title, exec_fn))
+	{
+	}
+
+	void
+	Task::run() const
+	{
+		THROW_CHECK0(runtime_error, m_task_state);
+		TaskMasterState::push_back(m_task_state);
+	}
+
+	void
+	Task::run_earlier() const
+	{
+		THROW_CHECK0(runtime_error, m_task_state);
+		TaskMasterState::push_front(m_task_state);
+	}
+
+	Task
+	Task::current_task()
+	{
+		return Task(tl_current_task_wp.lock());
+	}
+
+	string
+	Task::title() const
+	{
+		THROW_CHECK0(runtime_error, m_task_state);
+		return m_task_state->title();
+	}
+
+	ostream &
+	operator<<(ostream &os, const Task &task)
+	{
+		return os << task.title();
+	};
+
+	TaskId
+	Task::id() const
+	{
+		THROW_CHECK0(runtime_error, m_task_state);
+		return m_task_state->id();
+	}
+
+	bool
+	Task::operator<(const Task &that) const
+	{
+		return id() < that.id();
+	}
+
+	Task::operator bool() const
+	{
+		return !!m_task_state;
+	}
+
+	shared_ptr<TaskState>
+	TaskConsumer::current_task_locked()
+	{
+		return m_current_task;
+	}
+
+	shared_ptr<TaskState>
+	TaskConsumer::current_task()
+	{
+		auto master_locked = m_master.lock();
+		unique_lock<mutex> lock(master_locked->m_mutex);
+		return current_task_locked();
+	}
+
+	void
+	TaskConsumer::consumer_thread()
+	{
+		auto master_locked = m_master.lock();
+		while (true) {
+			unique_lock<mutex> lock(master_locked->m_mutex);
+			if (master_locked->m_thread_max < master_locked->m_threads.size()) {
+				break;
+			}
+
+			if (master_locked->m_queue.empty()) {
+				master_locked->m_condvar.wait(lock);
+				continue;
+			}
+
+			m_current_task = *master_locked->m_queue.begin();
+			master_locked->m_queue.pop_front();
+			lock.unlock();
+			catch_all([&]() {
+				m_current_task->exec();
+			});
+			lock.lock();
+			m_current_task.reset();
+		}
+
+		unique_lock<mutex> lock(master_locked->m_mutex);
+		m_thread.detach();
+		master_locked->m_threads.erase(shared_from_this());
+		master_locked->m_condvar.notify_all();
+	}
+
+	TaskConsumer::TaskConsumer(weak_ptr<TaskMasterState> tms) :
+		m_master(tms),
+		m_thread([=](){ consumer_thread(); })
+	{
+	}
+
+	class BarrierState {
+		mutex		m_mutex;
+		set<Task>	m_tasks;
+
+		void release();
+	public:
+		~BarrierState();
+		void insert_task(Task t);
+	};
+
+	Barrier::Barrier(shared_ptr<BarrierState> pbs) :
+		m_barrier_state(pbs)
+	{
+	}
+
+	Barrier::Barrier() :
+		m_barrier_state(make_shared<BarrierState>())
+	{
+	}
+
+	void
+	BarrierState::release()
+	{
+		unique_lock<mutex> lock(m_mutex);
+		for (auto i : m_tasks) {
+			i.run();
+		}
+		m_tasks.clear();
+	}
+
+	BarrierState::~BarrierState()
+	{
+		release();
+	}
+
+	BarrierLock::BarrierLock(shared_ptr<BarrierState> pbs) :
+		m_barrier_state(pbs)
+	{
+	}
+
+	void
+	BarrierLock::release()
+	{
+		m_barrier_state.reset();
+	}
+
+	void
+	BarrierState::insert_task(Task t)
+	{
+		unique_lock<mutex> lock(m_mutex);
+		m_tasks.insert(t);
+	}
+
+	void
+	Barrier::insert_task(Task t)
+	{
+		m_barrier_state->insert_task(t);
+	}
+
+	BarrierLock
+	Barrier::lock()
+	{
+		return BarrierLock(m_barrier_state);
+	}
+
+	class ExclusionState {
+		mutex		m_mutex;
+		bool		m_locked = false;
+		set<Task>	m_tasks;
+
+	public:
+		~ExclusionState();
+		void release();
+		bool try_lock();
+		void insert_task(Task t);
+	};
+
+	Exclusion::Exclusion(shared_ptr<ExclusionState> pbs) :
+		m_exclusion_state(pbs)
+	{
+	}
+
+	Exclusion::Exclusion() :
+		m_exclusion_state(make_shared<ExclusionState>())
+	{
+	}
+
+	void
+	ExclusionState::release()
+	{
+		unique_lock<mutex> lock(m_mutex);
+		m_locked = false;
+		bool first = true;
+		for (auto i : m_tasks) {
+			if (first) {
+				i.run_earlier();
+				first = false;
+			} else {
+				i.run();
+			}
+		}
+		m_tasks.clear();
+	}
+
+	ExclusionState::~ExclusionState()
+	{
+		release();
+	}
+
+	ExclusionLock::ExclusionLock(shared_ptr<ExclusionState> pbs) :
+		m_exclusion_state(pbs)
+	{
+	}
+
+	void
+	ExclusionLock::release()
+	{
+		if (m_exclusion_state) {
+			m_exclusion_state->release();
+			m_exclusion_state.reset();
+		}
+	}
+
+	ExclusionLock::~ExclusionLock()
+	{
+		release();
+	}
+
+	void
+	ExclusionState::insert_task(Task task)
+	{
+		unique_lock<mutex> lock(m_mutex);
+		m_tasks.insert(task);
+	}
+
+	bool
+	ExclusionState::try_lock()
+	{
+		unique_lock<mutex> lock(m_mutex);
+		if (m_locked) {
+			return false;
+		} else {
+			m_locked = true;
+			return true;
+		}
+	}
+
+	void
+	Exclusion::insert_task(Task t)
+	{
+		m_exclusion_state->insert_task(t);
+	}
+
+	ExclusionLock::operator bool() const
+	{
+		return !!m_exclusion_state;
+	}
+
+	ExclusionLock
+	Exclusion::try_lock()
+	{
+		THROW_CHECK0(runtime_error, m_exclusion_state);
+		if (m_exclusion_state->try_lock()) {
+			return ExclusionLock(m_exclusion_state);
+		} else {
+			return ExclusionLock();
+		}
+	}
+}
--- a/lib/time.cc
+++ b/lib/time.cc
@@ -1,11 +1,13 @@
 #include "crucible/time.h"

 #include "crucible/error.h"
+#include "crucible/process.h"

 #include <algorithm>
+#include <thread>
+
 #include <cmath>
 #include <ctime>
-#include <thread>

 namespace crucible {

@@ -59,16 +61,10 @@ namespace crucible {
 		m_start = chrono::high_resolution_clock::now();
 	}

-	void
-	Timer::set(const chrono::high_resolution_clock::time_point &start)
+	chrono::high_resolution_clock::time_point
+	Timer::get() const
 	{
-		m_start = start;
-	}
-
-	void
-	Timer::set(double delta)
-	{
-		m_start += chrono::duration_cast<chrono::high_resolution_clock::duration>(chrono::duration<double>(delta));
+		return m_start;
 	}

 	double
@@ -155,4 +151,189 @@ namespace crucible {
 		m_tokens -= cost;
 	}

+	RateEstimator::RateEstimator(double min_delay, double max_delay) :
+		m_min_delay(min_delay),
+		m_max_delay(max_delay)
+	{
+		THROW_CHECK1(invalid_argument, min_delay, min_delay > 0);
+		THROW_CHECK1(invalid_argument, max_delay, max_delay > 0);
+		THROW_CHECK2(invalid_argument, min_delay, max_delay, max_delay > min_delay);
+	}
+
+	void
+	RateEstimator::update_unlocked(uint64_t new_count)
+	{
+		// Gradually reduce the effect of previous updates
+		if (m_last_decay.age() > 1) {
+			m_num *= m_decay;
+			m_den *= m_decay;
+			m_last_decay.reset();
+		}
+		// Add units over time to running totals
+		auto increment = new_count - min(new_count, m_last_count);
+		auto delta = max(0.0, m_last_update.lap());
+		m_num += increment;
+		m_den += delta;
+		m_last_count = new_count;
+		// If count increased, wake up any waiters
+		if (delta > 0) {
+			m_condvar.notify_all();
+		}
+	}
+
+	void
+	RateEstimator::update(uint64_t new_count)
+	{
+		unique_lock<mutex> lock(m_mutex);
+		return update_unlocked(new_count);
+	}
+
+	void
+	RateEstimator::update_monotonic(uint64_t new_count)
+	{
+		unique_lock<mutex> lock(m_mutex);
+		if (m_last_count == numeric_limits<uint64_t>::max() || new_count > m_last_count) {
+			return update_unlocked(new_count);
+		} else {
+			return update_unlocked(m_last_count);
+		}
+	}
+
+	uint64_t
+	RateEstimator::count() const
+	{
+		unique_lock<mutex> lock(m_mutex);
+		return m_last_count;
+	}
+
+	pair<double, double>
+	RateEstimator::ratio_unlocked() const
+	{
+		auto num = max(m_num, 1.0);
+		// auto den = max(m_den, 1.0);
+		// Rate estimation slows down if there are no new units to count
+		auto den = max(m_den + m_last_update.age(), 1.0);
+		auto sec_per_count = den / num;
+		if (sec_per_count < m_min_delay) {
+			return make_pair(1.0, m_min_delay);
+		}
+		if (sec_per_count > m_max_delay) {
+			return make_pair(1.0, m_max_delay);
+		}
+		return make_pair(num, den);
+	}
+
+	pair<double, double>
+	RateEstimator::ratio() const
+	{
+		unique_lock<mutex> lock(m_mutex);
+		return ratio_unlocked();
+	}
+
+	pair<double, double>
+	RateEstimator::raw() const
+	{
+		unique_lock<mutex> lock(m_mutex);
+		return make_pair(m_num, m_den);
+	}
+
+	double
+	RateEstimator::rate_unlocked() const
+	{
+		auto r = ratio_unlocked();
+		return r.first / r.second;
+	}
+
+	double
+	RateEstimator::rate() const
+	{
+		unique_lock<mutex> lock(m_mutex);
+		return rate_unlocked();
+	}
+
+	ostream &
+	operator<<(ostream &os, const RateEstimator &re)
+	{
+		os << "RateEstimator { ";
+		auto ratio = re.ratio();
+		auto raw = re.raw();
+		os << "count = " << re.count() << ", raw = " << raw.first << " / " << raw.second << ", ratio = " << ratio.first << " / " << ratio.second << ", rate = " << re.rate() << ", duration(1) = " << re.duration(1).count() << ", seconds_for(1) = " << re.seconds_for(1) << " }";
+		return os;
+	}
+
+	chrono::duration<double>
+	RateEstimator::duration_unlocked(uint64_t relative_count) const
+	{
+		auto dur = relative_count / rate_unlocked();
+		dur = min(m_max_delay, dur);
+		dur = max(m_min_delay, dur);
+		return chrono::duration<double>(dur);
+	}
+
+	chrono::duration<double>
+	RateEstimator::duration(uint64_t relative_count) const
+	{
+		unique_lock<mutex> lock(m_mutex);
+		return duration_unlocked(relative_count);
+	}
+
+	chrono::high_resolution_clock::time_point
+	RateEstimator::time_point_unlocked(uint64_t absolute_count) const
+	{
+		auto relative_count = absolute_count - min(m_last_count, absolute_count);
+		auto relative_duration = duration_unlocked(relative_count);
+		return m_last_update.get() + chrono::duration_cast<chrono::high_resolution_clock::duration>(relative_duration);
+		// return chrono::high_resolution_clock::now() + chrono::duration_cast<chrono::high_resolution_clock::duration>(relative_duration);
+	}
+
+	chrono::high_resolution_clock::time_point
+	RateEstimator::time_point(uint64_t absolute_count) const
+	{
+		unique_lock<mutex> lock(m_mutex);
+		return time_point_unlocked(absolute_count);
+	}
+
+	void
+	RateEstimator::wait_until(uint64_t new_count_absolute) const
+	{
+		unique_lock<mutex> lock(m_mutex);
+		auto saved_count = m_last_count;
+		while (saved_count <= m_last_count && m_last_count < new_count_absolute) {
+			// Stop waiting if clock runs backwards
+			saved_count = m_last_count;
+			m_condvar.wait(lock);
+		}
+	}
+
+	void
+	RateEstimator::wait_for(uint64_t new_count_relative) const
+	{
+		unique_lock<mutex> lock(m_mutex);
+		auto saved_count = m_last_count;
+		auto new_count_absolute = m_last_count + new_count_relative;
+		while (saved_count <= m_last_count && m_last_count < new_count_absolute) {
+			// Stop waiting if clock runs backwards
+			saved_count = m_last_count;
+			m_condvar.wait(lock);
+		}
+	}
+
+	double
+	RateEstimator::seconds_for(uint64_t new_count_relative) const
+	{
+		unique_lock<mutex> lock(m_mutex);
+		auto ts = time_point_unlocked(new_count_relative + m_last_count);
+		auto delta_dur = ts - chrono::high_resolution_clock::now();
+		return max(min(chrono::duration<double>(delta_dur).count(), m_max_delay), m_min_delay);
+	}
+
+	double
+	RateEstimator::seconds_until(uint64_t new_count_absolute) const
+	{
+		unique_lock<mutex> lock(m_mutex);
+		auto ts = time_point_unlocked(new_count_absolute);
+		auto delta_dur = ts - chrono::high_resolution_clock::now();
+		return max(min(chrono::duration<double>(delta_dur).count(), m_max_delay), m_min_delay);
+	}
+
 }
--- a/15
+++ b/15
@@ -1,4 +1,11 @@
-CCFLAGS  = -Wall -Wextra -Werror -O3 -I../include -ggdb -fpic
-# CCFLAGS  = -Wall -Wextra -Werror -O0 -I../include -ggdb -fpic
-CFLAGS   = $(CCFLAGS) -std=c99
-CXXFLAGS = $(CCFLAGS) -std=c++11 -Wold-style-cast
+# Default:
+CCFLAGS  = -Wall -Wextra -Werror -I../include -fpic -D_FILE_OFFSET_BITS=64
+
+# Optimized:
+# CCFLAGS  = -Wall -Wextra -Werror -O3 -march=native -I../include -fpic -D_FILE_OFFSET_BITS=64
+
+# Debug:
+# CCFLAGS  = -Wall -Wextra -Werror -O0 -I../include -ggdb -fpic -D_FILE_OFFSET_BITS=64
+
+CFLAGS   += $(CCFLAGS) -std=c99
+CXXFLAGS += $(CCFLAGS) -std=c++11 -Wold-style-cast
--- a/scripts/beesd.conf.sample
+++ b/scripts/beesd.conf.sample
@@ -0,0 +1,34 @@
+## Config for Bees: /etc/bees/beesd.conf.sample
+## https://github.com/Zygo/bees
+## It's a default values, change it, if needed
+
+# How to use?
+# Copy this file to a new file name and adjust the UUID below
+
+# Which FS will be used
+UUID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+
+## System Vars
+# Change carefully
+# WORK_DIR=/run/bees/
+# MNT_DIR="$WORK_DIR/mnt/$UUID"
+# BEESHOME="$MNT_DIR/.beeshome"
+# BEESSTATUS="$WORK_DIR/$UUID.status"
+
+## Options to apply, see `beesd --help` for details
+# OPTIONS="--strip-paths --no-timestamps"
+
+## Bees DB size
+# Hash Table Sizing
+# sHash table entries are 16 bytes each
+# (64-bit hash, 52-bit block number, and some metadata bits)
+# Each entry represents a minimum of 4K on disk.
+# unique data size    hash table size    average dedup block size
+#     1TB                 4GB                  4K
+#     1TB                 1GB                 16K
+#     1TB               256MB                 64K
+#     1TB                16MB               1024K
+#    64TB                 1GB               1024K
+#
+# Size MUST be power of 16M
+# DB_SIZE=$((64*$AL16M)) # 1G in bytes
--- a/scripts/beesd.in
+++ b/scripts/beesd.in
@@ -0,0 +1,145 @@
+#!/bin/bash
+
+## Helpful functions
+INFO(){ echo "INFO:" "$@"; }
+ERRO(){ echo "ERROR:" "$@"; exit 1; }
+YN(){ [[ "$1" =~ (1|Y|y) ]]; }
+
+## Global vars
+export BEESHOME BEESSTATUS
+export WORK_DIR CONFIG_DIR
+export CONFIG_FILE
+export UUID AL16M
+
+readonly AL16M="$((16*1024*1024))"
+readonly CONFIG_DIR=@ETC_PREFIX@/bees/
+
+readonly bees_bin=$(realpath @LIBEXEC_PREFIX@/bees)
+
+command -v "$bees_bin" &> /dev/null || ERRO "Missing 'bees' agent"
+
+uuid_valid(){
+    if uuidparse -n -o VARIANT $1 | grep -i -q invalid; then
+        false
+    fi
+}
+
+help(){
+    echo "Usage: beesd [options] <btrfs_uuid>"
+    echo "- - -"
+    exec "$bees_bin" --help
+}
+
+get_bees_supp_opts(){
+    "$bees_bin" --help |& awk '/--../ { gsub( ",", "" ); print $1 " " $2}'
+}
+
+SUPPORTED_ARGS=(
+    $(get_bees_supp_opts)
+)
+NOT_SUPPORTED_ARGS=()
+ARGUMENTS=()
+
+for arg in "${@}"; do
+    supp=false
+    for supp_arg in "${SUPPORTED_ARGS[@]}"; do
+        if [ "$arg" == "$supp_arg" ]; then
+            supp=true
+            break
+        fi
+    done
+    if $supp; then
+        ARGUMENTS+=($arg)
+    else
+        NOT_SUPPORTED_ARGS+=($arg)
+    fi
+done
+
+for arg in "${ARGUMENTS[@]}"; do
+    case $arg in
+        -h) help;;
+        --help) help;;
+    esac
+done
+
+for arg in "${NOT_SUPPORTED_ARGS[@]}"; do
+    if uuid_valid $arg; then
+        [ ! -z "$UUID" ] && help
+        UUID=$arg
+    fi
+done
+
+[ -z "$UUID" ] && help
+
+
+FILE_CONFIG="$(egrep -l '^[^#]*UUID\s*=\s*"?'"$UUID" "$CONFIG_DIR"/*.conf | head -1)"
+[ ! -f "$FILE_CONFIG" ] && ERRO "No config for $UUID"
+INFO "Find $UUID in $FILE_CONFIG, use as conf"
+source "$FILE_CONFIG"
+
+
+## Pre checks
+{
+    [ ! -d "$CONFIG_DIR" ] && ERRO "Missing: $CONFIG_DIR"
+    [ "$UID" == "0" ] || ERRO "Must be run as root"
+}
+
+
+WORK_DIR="${WORK_DIR:-/run/bees/}"
+MNT_DIR="${MNT_DIR:-$WORK_DIR/mnt/$UUID}"
+BEESHOME="${BEESHOME:-$MNT_DIR/.beeshome}"
+BEESSTATUS="${BEESSTATUS:-$WORK_DIR/$UUID.status}"
+DB_SIZE="${DB_SIZE:-$((64*AL16M))}"
+
+INFO "Check: Disk exists"
+if [ ! -b "/dev/disk/by-uuid/$UUID" ]; then
+    ERRO "Missing disk: /dev/disk/by-uuid/$UUID"
+fi
+
+is_btrfs(){ [ "$(blkid -s TYPE -o value "$1")" == "btrfs" ]; }
+
+INFO "Check: Disk with btrfs"
+if ! is_btrfs "/dev/disk/by-uuid/$UUID"; then
+    ERRO "Disk not contain btrfs: /dev/disk/by-uuid/$UUID"
+fi
+
+INFO "WORK DIR: $WORK_DIR"
+mkdir -p "$WORK_DIR" || exit 1
+
+INFO "MOUNT DIR: $MNT_DIR"
+mkdir -p "$MNT_DIR" || exit 1
+
+umount_w(){ mountpoint -q "$1" && umount -l "$1"; }
+force_umount(){ umount_w "$MNT_DIR"; }
+trap force_umount SIGINT SIGTERM EXIT
+
+mount -osubvolid=5 /dev/disk/by-uuid/$UUID "$MNT_DIR" || exit 1
+
+if [ ! -d "$BEESHOME" ]; then
+    INFO "Create subvol $BEESHOME for store bees data"
+    btrfs sub cre "$BEESHOME"
+else
+    btrfs sub show "$BEESHOME" &> /dev/null || ERRO "$BEESHOME MUST BE A SUBVOL!"
+fi
+
+# Check DB size
+{
+    DB_PATH="$BEESHOME/beeshash.dat"
+    touch "$DB_PATH"
+    OLD_SIZE="$(du -b "$DB_PATH" | sed 's/\t/ /g' | cut -d' ' -f1)"
+    NEW_SIZE="$DB_SIZE"
+    if (( "$NEW_SIZE"%AL16M > 0 )); then
+        ERRO "DB_SIZE Must be multiple of 16M"
+    fi
+    if (( "$OLD_SIZE" != "$NEW_SIZE" )); then
+        INFO "Resize db: $OLD_SIZE -> $NEW_SIZE"
+        [ -f "$BEESHOME/beescrawl.$UUID.dat" ] && rm "$BEESHOME/beescrawl.$UUID.dat"
+        truncate -s $NEW_SIZE $DB_PATH
+    fi
+    chmod 700 "$DB_PATH"
+}
+
+MNT_DIR="$(realpath $MNT_DIR)"
+
+cd "$MNT_DIR"
+"$bees_bin" "${ARGUMENTS[@]}" $OPTIONS "$MNT_DIR"
--- a/scripts/beesd@.service.in
+++ b/scripts/beesd@.service.in
@@ -0,0 +1,24 @@
+[Unit]
+Description=Bees (%i)
+Documentation=https://github.com/Zygo/bees
+After=sysinit.target
+
+[Service]
+Type=simple
+ExecStart=@PREFIX@/sbin/beesd --no-timestamps %i
+CPUAccounting=true
+CPUSchedulingPolicy=batch
+CPUWeight=12
+IOSchedulingClass=idle
+IOSchedulingPriority=7
+IOWeight=10
+KillMode=control-group
+KillSignal=SIGTERM
+MemoryAccounting=true
+Nice=19
+Restart=on-abnormal
+StartupCPUWeight=25
+StartupIOWeight=25
+
+[Install]
+WantedBy=basic.target
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -0,0 +1,2 @@
+bees-version.[ch]
+bees-version.new.c
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,28 +1,15 @@
 PROGRAMS = \
 	../bin/bees \
-	../bin/fanotify-watch \
 	../bin/fiemap \
 	../bin/fiewalk \

-all: $(PROGRAMS) depends.mk
+all: $(PROGRAMS)

 include ../makeflags
+-include ../localconf

 LIBS = -lcrucible -lpthread
-LDFLAGS = -L../lib -Wl,-rpath=$(shell realpath ../lib)
-
-depends.mk: Makefile *.cc
-	for x in *.cc; do $(CXX) $(CXXFLAGS) -M "$$x"; done > depends.mk.new
-	mv -fv depends.mk.new depends.mk
-
-include depends.mk
-
-%.o: %.cc %.h
-	$(CXX) $(CXXFLAGS) -o "$@" -c "$<"
-
-../bin/%: %.o
-	@echo Implicit bin rule "$<" '->' "$@"
-	$(CXX) $(CXXFLAGS) -o "$@" "$<" $(LDFLAGS) $(LIBS)
+LDFLAGS = -L../lib

 BEES_OBJS = \
 	bees.o \
@@ -33,8 +20,29 @@ BEES_OBJS = \
 	bees-thread.o \
 	bees-types.o \

-../bin/bees: $(BEES_OBJS)
-	$(CXX) $(CXXFLAGS) -o "$@" $(BEES_OBJS) $(LDFLAGS) $(LIBS)
+bees-version.c: bees.h $(BEES_OBJS:.o=.cc) Makefile
+	echo "const char *BEES_VERSION = \"$(BEES_VERSION)\";" > bees-version.new.c
+	mv -f bees-version.new.c bees-version.c
+
+.depends/%.dep: %.cc Makefile
+	@mkdir -p .depends
+	$(CXX) $(CXXFLAGS) -M -MF $@ -MT $(<:.cc=.o) $<
+
+depends.mk: $(BEES_OBJS:%.o=.depends/%.dep)
+	cat $^ > $@.new
+	mv -f $@.new $@
+
+include depends.mk
+
+%.o: %.cc %.h
+	$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+../bin/%: %.o
+	@echo Implicit bin rule "$<" '->' "$@"
+	$(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $< $(LIBS)
+
+../bin/bees: $(BEES_OBJS) bees-version.o
+	$(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $^ $(LIBS)

 clean:
-	-rm -fv *.o
+	rm -fv *.o bees-version.c
--- a/src/bees-context.cc
+++ b/src/bees-context.cc
@@ -2,46 +2,47 @@

 #include "crucible/limits.h"
 #include "crucible/string.h"
+#include "crucible/task.h"

 #include <fstream>
 #include <iostream>
+#include <vector>

 using namespace crucible;
 using namespace std;

-static inline
-const char *
-getenv_or_die(const char *name)
-{
-	const char *rv = getenv(name);
-	if (!rv) {
-		THROW_ERROR(runtime_error, "Environment variable " << name << " not defined");
-	}
-	return rv;
-}
-
 BeesFdCache::BeesFdCache()
 {
 	m_root_cache.func([&](shared_ptr<BeesContext> ctx, uint64_t root) -> Fd {
-		return ctx->roots()->open_root_nocache(root);
+		Timer open_timer;
+		auto rv = ctx->roots()->open_root_nocache(root);
+		BEESCOUNTADD(open_root_ms, open_timer.age() * 1000);
+		return rv;
 	});
+	m_root_cache.max_size(BEES_ROOT_FD_CACHE_SIZE);
 	m_file_cache.func([&](shared_ptr<BeesContext> ctx, uint64_t root, uint64_t ino) -> Fd {
-		return ctx->roots()->open_root_ino_nocache(root, ino);
+		Timer open_timer;
+		auto rv = ctx->roots()->open_root_ino_nocache(root, ino);
+		BEESCOUNTADD(open_ino_ms, open_timer.age() * 1000);
+		return rv;
 	});
+	m_file_cache.max_size(BEES_FILE_FD_CACHE_SIZE);
+}
+
+void
+BeesFdCache::clear()
+{
+	BEESNOTE("Clearing root FD cache to enable subvol delete");
+	m_root_cache.clear();
+	BEESCOUNT(root_clear);
+	BEESNOTE("Clearing open FD cache to enable file delete");
+	m_file_cache.clear();
+	BEESCOUNT(open_clear);
 }

 Fd
 BeesFdCache::open_root(shared_ptr<BeesContext> ctx, uint64_t root)
 {
-	// Don't hold root FDs open too long.
-	// The open FDs prevent snapshots from being deleted.
-	// cleaner_kthread just keeps skipping over the open dir and all its children.
-	if (m_root_cache_timer.age() > BEES_COMMIT_INTERVAL) {
-		BEESINFO("Clearing root FD cache to enable subvol delete");
-		m_root_cache.clear();
-		m_root_cache_timer.reset();
-		BEESCOUNT(root_clear);
-	}
 	return m_root_cache(ctx, root);
 }

@@ -58,104 +59,13 @@ BeesFdCache::insert_root_ino(shared_ptr<BeesContext> ctx, Fd fd)
 	return m_file_cache.insert(fd, ctx, fid.root(), fid.ino());
 }

-mutex BeesWorkQueueBase::s_mutex;
-set<BeesWorkQueueBase*> BeesWorkQueueBase::s_all_workers;
-
-BeesWorkQueueBase::BeesWorkQueueBase(const string &name) :
-	m_name(name)
-{
-}
-
-BeesWorkQueueBase::~BeesWorkQueueBase()
-{
-	unique_lock<mutex> lock(s_mutex);
-	s_all_workers.erase(this);
-}
-
-void
-BeesWorkQueueBase::for_each_work_queue(std::function<void (BeesWorkQueueBase*)> f)
-{
-	unique_lock<mutex> lock(s_mutex);
-	for (auto i : s_all_workers) {
-		f(i);
-	}
-}
-
-string
-BeesWorkQueueBase::name() const
-{
-	return m_name;
-}
-
-void
-BeesWorkQueueBase::name(const string &new_name)
-{
-	m_name = new_name;
-}
-
-template <class Task>
-BeesWorkQueue<Task>::~BeesWorkQueue()
-{
-}
-
-template <class Task>
-BeesWorkQueue<Task>::BeesWorkQueue(const string &name) :
-	BeesWorkQueueBase(name)
-{
-	unique_lock<mutex> lock(s_mutex);
-	s_all_workers.insert(this);
-}
-
-template <class Task>
-void
-BeesWorkQueue<Task>::push_active(const Task &t)
-{
-	BEESNOTE("pushing task " << t);
-	m_active_queue.push(t);
-}
-
-template <class Task>
-void
-BeesWorkQueue<Task>::push_active(const Task &t, size_t limit)
-{
-	// BEESNOTE("pushing limit " << limit << " task " << t);
-	m_active_queue.push_wait(t, limit);
-}
-
-template <class Task>
-size_t
-BeesWorkQueue<Task>::active_size() const
-{
-	return m_active_queue.size();
-}
-
-template <class Task>
-list<string>
-BeesWorkQueue<Task>::peek_active(size_t count) const
-{
-	list<string> rv;
-	for (auto i : m_active_queue.peek(count)) {
-		ostringstream oss;
-		oss << i;
-		rv.push_back(oss.str());
-	}
-	return rv;
-}
-
-template <class Task>
-Task
-BeesWorkQueue<Task>::pop()
-{
-	return m_active_queue.pop();
-}
-
 void
 BeesContext::dump_status()
 {
 	auto status_charp = getenv("BEESSTATUS");
 	if (!status_charp) return;
 	string status_file(status_charp);
-	BEESLOG("Writing status to file '" << status_file << "' every " << BEES_STATUS_INTERVAL << " sec");
+	BEESLOGINFO("Writing status to file '" << status_file << "' every " << BEES_STATUS_INTERVAL << " sec");
 	while (1) {
 		BEESNOTE("waiting " << BEES_STATUS_INTERVAL);
 		sleep(BEES_STATUS_INTERVAL);
@@ -170,17 +80,19 @@ BeesContext::dump_status()
 		ofs << "RATES:\n";
 		ofs << "\t" << avg_rates << "\n";

-		ofs << "THREADS:\n";
-		for (auto t : BeesNote::get_status()) {	
+		ofs << "THREADS (work queue " << TaskMaster::get_queue_count() << " tasks):\n";
+		for (auto t : BeesNote::get_status()) {
 			ofs << "\ttid " << t.first << ": " << t.second << "\n";
 		}

-		BeesWorkQueueBase::for_each_work_queue([&](BeesWorkQueueBase *worker) {
-			ofs << "QUEUE: " << worker->name() << " active: " << worker->active_size() << "\n";
-			for (auto t : worker->peek_active(10)) {
-				ofs << "\t" << t << "\n";
-			}
-		});
+#if 0
+		// Huge amount of data, not a lot of information (yet)
+		ofs << "WORKERS:\n";
+		TaskMaster::print_workers(ofs);
+		ofs << "QUEUE:\n";
+		TaskMaster::print_queue(ofs);
+#endif
+
 		ofs.close();

 		BEESNOTE("renaming status file '" << status_file << "'");
@@ -202,86 +114,88 @@ BeesContext::show_progress()

 			auto thisStats = BeesStats::s_global;
 			auto avg_rates = lastStats / BEES_STATS_INTERVAL;
-			BEESLOG("TOTAL: " << thisStats);
-			BEESLOG("RATES: " << avg_rates);
+			BEESLOGINFO("TOTAL: " << thisStats);
+			BEESLOGINFO("RATES: " << avg_rates);
 			lastStats = thisStats;
 		}

-		BEESLOG("ACTIVITY:");
+		BEESLOGINFO("ACTIVITY:");

 		auto thisStats = BeesStats::s_global;
 		auto deltaStats = thisStats - lastProgressStats;
 		if (deltaStats) {
-			BEESLOG("\t" << deltaStats / BEES_PROGRESS_INTERVAL);
+			BEESLOGINFO("\t" << deltaStats / BEES_PROGRESS_INTERVAL);
 		};
 		lastProgressStats = thisStats;

-		BeesWorkQueueBase::for_each_work_queue([&](BeesWorkQueueBase *worker) {
-			BEESLOG("QUEUE: " << worker->name() << " active: " << worker->active_size());
-		});
+		BEESLOGINFO("THREADS:");

-		BEESLOG("THREADS:");
-
-		for (auto t : BeesNote::get_status()) {	
-			BEESLOG("\ttid " << t.first << ": " << t.second);
+		for (auto t : BeesNote::get_status()) {
+			BEESLOGINFO("\ttid " << t.first << ": " << t.second);
 		}
 	}
 }

+Fd
+BeesContext::home_fd()
+{
+	if (!!m_home_fd) {
+		return m_home_fd;
+	}
+
+	const char *base_dir = getenv("BEESHOME");
+	if (!base_dir) {
+		base_dir = ".beeshome";
+	}
+	m_home_fd = openat(root_fd(), base_dir, FLAGS_OPEN_DIR);
+	if (!m_home_fd) {
+		THROW_ERRNO("openat: " << name_fd(root_fd()) << " / " << base_dir);
+	}
+	return m_home_fd;
+}
+
 BeesContext::BeesContext(shared_ptr<BeesContext> parent) :
 	m_parent_ctx(parent)
 {
-	auto base_dir = getenv_or_die("BEESHOME");
-	BEESLOG("BEESHOME = " << base_dir);
-	m_home_fd = open_or_die(base_dir, FLAGS_OPEN_DIR);
 	if (m_parent_ctx) {
-		m_hash_table = m_parent_ctx->hash_table();
-		m_hash_table->set_shared(true);
 		m_fd_cache = m_parent_ctx->fd_cache();
 	}
 }

+bool
+BeesContext::is_root_ro(uint64_t root)
+{
+	return roots()->is_root_ro(root);
+}
+
 bool
 BeesContext::dedup(const BeesRangePair &brp)
 {
 	// TOOLONG and NOTE can retroactively fill in the filename details, but LOG can't
 	BEESNOTE("dedup " << brp);

-	brp.first.fd(shared_from_this());
 	brp.second.fd(shared_from_this());

-#if 0
-	// This avoids some sort of kernel race condition;
-	// however, it also doubles our dedup times.
-	// Is avoiding a crash every few weeks worth it?
-	bees_sync(brp.first.fd());
-#endif
+	if (is_root_ro(brp.second.fid().root())) {
+		// BEESLOGDEBUG("WORKAROUND: dst subvol is read-only in " << name_fd(brp.second.fd()));
+		BEESCOUNT(dedup_workaround_btrfs_send);
+		return false;
+	}
+
+	brp.first.fd(shared_from_this());

 	BEESTOOLONG("dedup " << brp);

-	thread_local BeesFileId tl_first_fid, tl_second_fid;
-	if (tl_first_fid != brp.first.fid()) {
-		BEESLOG("dedup: src " << name_fd(brp.first.fd()));
-		tl_first_fid = brp.first.fid();
-		tl_second_fid = BeesFileId();
-	}
-	ostringstream dst_line;
-	dst_line << "       dst " << pretty(brp.first.size()) << " [" << to_hex(brp.first.begin()) << ".." << to_hex(brp.first.end()) << "]";
-	if (brp.first.begin() != brp.second.begin()) {
-		dst_line << " [" << to_hex(brp.second.begin()) << ".." << to_hex(brp.second.end()) << "]";
-	}
 	BeesAddress first_addr(brp.first.fd(), brp.first.begin());
 	BeesAddress second_addr(brp.second.fd(), brp.second.begin());
-	dst_line << " (" << first_addr << "->" << second_addr << ")";
+
+	BEESLOGINFO("dedup: src " << pretty(brp.first.size())  << " [" << to_hex(brp.first.begin())  << ".." << to_hex(brp.first.end())  << "] {" << first_addr  << "} " << name_fd(brp.first.fd()) << "\n"
+		 << "       dst " << pretty(brp.second.size()) << " [" << to_hex(brp.second.begin()) << ".." << to_hex(brp.second.end()) << "] {" << second_addr << "} " << name_fd(brp.second.fd()));
+
 	if (first_addr.get_physical_or_zero() == second_addr.get_physical_or_zero()) {
 		BEESLOGTRACE("equal physical addresses in dedup");
 		BEESCOUNT(bug_dedup_same_physical);
 	}
-	if (tl_second_fid != brp.second.fid()) {
-		dst_line << " " << name_fd(brp.second.fd());
-		tl_second_fid = brp.second.fid();
-	}
-	BEESLOG(dst_line.str());

 	THROW_CHECK1(invalid_argument, brp, !brp.first.overlaps(brp.second));
 	THROW_CHECK1(invalid_argument, brp, brp.first.size() == brp.second.size());
@@ -301,7 +215,7 @@ BeesContext::dedup(const BeesRangePair &brp)
 		}
 	} else {
 		BEESCOUNT(dedup_miss);
-		BEESLOG("NO Dedup! " << brp);
+		BEESLOGWARN("NO Dedup! " << brp);
 	}

 	return rv;
@@ -326,6 +240,7 @@ BeesContext::rewrite_file_range(const BeesFileRange &bfr)
 	// BEESLOG("\torig_bbd " << orig_bbd);
 	BeesBlockData dup_bbd(dup_brp.first.fd(), dup_brp.first.begin(), min(BLOCK_SIZE_SUMS, dup_brp.first.size()));
 	// BEESLOG("BeesResolver br(..., " << bfr << ")");
+	BEESTRACE("BeesContext::rewrite_file_range calling BeesResolver " << bfr);
 	BeesResolver br(m_ctx, BeesAddress(bfr.fd(), bfr.begin()));
 	// BEESLOG("\treplace_src " << dup_bbd);
 	br.replace_src(dup_bbd);
@@ -376,7 +291,7 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
 		Extent::OBSCURED | Extent::PREALLOC
 	)) {
 		BEESCOUNT(scan_interesting);
-		BEESLOG("Interesting extent flags " << e << " from fd " << name_fd(bfr.fd()));
+		BEESLOGWARN("Interesting extent flags " << e << " from fd " << name_fd(bfr.fd()));
 	}

 	if (e.flags() & Extent::HOLE) {
@@ -388,8 +303,10 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
 	if (e.flags() & Extent::PREALLOC) {
 		// Prealloc is all zero and we replace it with a hole.
 		// No special handling is required here.  Nuke it and move on.
-		BEESLOG("prealloc extent " << e);
-		BeesFileRange prealloc_bfr(m_ctx->tmpfile()->make_hole(e.size()));
+		BEESLOGINFO("prealloc extent " << e);
+		// Must not extend past EOF
+		auto extent_size = min(e.end(), bfr.file_size()) - e.begin();
+		BeesFileRange prealloc_bfr(m_ctx->tmpfile()->make_hole(extent_size));
 		BeesRangePair brp(prealloc_bfr, bfr);
 		// Raw dedup here - nothing else to do with this extent, nothing to merge with
 		if (m_ctx->dedup(brp)) {
@@ -402,7 +319,7 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
 	}

 	// OK we need to read extent now
-	posix_fadvise(bfr.fd(), bfr.begin(), bfr.size(), POSIX_FADV_WILLNEED);
+	readahead(bfr.fd(), bfr.begin(), bfr.size());

 	map<off_t, pair<BeesHash, BeesAddress>> insert_map;
 	set<off_t> noinsert_set;
@@ -462,7 +379,7 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
 				// Do not attempt to lookup hash of zero block
 				continue;
 			} else {
-				BEESLOG("zero bbd " << bbd << "\n\tin extent " << e);
+				BEESLOGINFO("zero bbd " << bbd << "\n\tin extent " << e);
 				BEESCOUNT(scan_zero_uncompressed);
 				rewrite_extent = true;
 				break;
@@ -519,8 +436,9 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)

 			// Hash is toxic
 			if (found_addr.is_toxic()) {
-				BEESINFO("WORKAROUND: abandoned toxic match for hash " << hash << " addr " << found_addr);
+				BEESLOGWARN("WORKAROUND: abandoned toxic match for hash " << hash << " addr " << found_addr << " matching bbd " << bbd);
 				// Don't push these back in because we'll never delete them.
+				// Extents may become non-toxic so give them a chance to expire.
 				// hash_table->push_front_hash_addr(hash, found_addr);
 				BEESCOUNT(scan_toxic_hash);
 				return bfr;
@@ -531,17 +449,16 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
 			catch_all([&]() {
 				BEESNOTE("resolving " << found_addr << " matched " << bbd);
 				BEESTRACE("resolving " << found_addr << " matched " << bbd);
+				BEESTRACE("BeesContext::scan_one_extent calling BeesResolver " << found_addr);
 				BeesResolver resolved(m_ctx, found_addr);
 				// Toxic extents are really toxic
 				if (resolved.is_toxic()) {
-					BEESINFO("WORKAROUND: abandoned toxic match at found_addr " << found_addr << " matching bbd " << bbd);
+					BEESLOGWARN("WORKAROUND: discovered toxic match at found_addr " << found_addr << " matching bbd " << bbd);
 					BEESCOUNT(scan_toxic_match);
-#if 0
-					// Don't push these back in because we'll never delete them.
-					// Make sure we never see this hash again
+					// Make sure we never see this hash again.
+					// It has become toxic since it was inserted into the hash table.
 					found_addr.set_toxic();
 					hash_table->push_front_hash_addr(hash, found_addr);
-#endif
 					abandon_extent = true;
 				} else if (!resolved.count()) {
 					BEESCOUNT(scan_resolve_zero);
@@ -578,7 +495,8 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)

 			BeesAddress last_replaced_addr;
 			for (auto it = resolved_addrs.begin(); it != resolved_addrs.end(); ++it) {
-				catch_all([&]() {
+				// FIXME:  Need to terminate this loop on replace_dst exception condition
+				// catch_all([&]() {
 					auto it_copy = *it;
 					BEESNOTE("finding one match (out of " << it_copy.count() << ") at " << it_copy.addr() << " for " << bbd);
 					BEESTRACE("finding one match (out of " << it_copy.count() << ") at " << it_copy.addr() << " for " << bbd);
@@ -590,7 +508,7 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
 					if (it_copy.found_hash()) {
 						BEESCOUNT(scan_hash_hit);
 					} else {
-						// BEESINFO("erase src hash " << hash << " addr " << it_copy.addr());
+						// BEESLOGDEBUG("erase src hash " << hash << " addr " << it_copy.addr());
 						BEESCOUNT(scan_hash_miss);
 						hash_table->erase_hash_addr(hash, it_copy.addr());
 					}
@@ -601,7 +519,7 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
 						// FIXME:  we will thrash if we let multiple references to identical blocks
 						// exist in the hash table.  Erase all but the last one.
 						if (last_replaced_addr) {
-							BEESLOG("Erasing redundant hash " << hash << " addr " << last_replaced_addr);
+							BEESLOGINFO("Erasing redundant hash " << hash << " addr " << last_replaced_addr);
 							hash_table->erase_hash_addr(hash, last_replaced_addr);
 							BEESCOUNT(scan_erase_redundant);
 						}
@@ -630,7 +548,7 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
 					} else {
 						BEESCOUNT(scan_dup_miss);
 					}
-				});
+				// });
 			}
 			if (last_replaced_addr) {
 				// If we replaced extents containing the incoming addr,
@@ -763,13 +681,7 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)

 	// Visualize
 	if (bar != string(block_count, '.')) {
-		thread_local BeesFileId last_fid;
-		string file_name;
-		if (bfr.fid() != last_fid) {
-			last_fid = bfr.fid();
-			file_name = " " + name_fd(bfr.fd());
-		}
-		BEESLOG("scan: " << pretty(e.size()) << " " << to_hex(e.begin()) << " [" << bar << "] " << to_hex(e.end()) << file_name);
+		BEESLOGINFO("scan: " << pretty(e.size()) << " " << to_hex(e.begin()) << " [" << bar << "] " << to_hex(e.end()) << ' ' << name_fd(bfr.fd()));
 	}

 	return bfr;
@@ -799,14 +711,14 @@ BeesContext::scan_forward(const BeesFileRange &bfr)

 	// No FD?  Well, that was quick.
 	if (!bfr.fd()) {
-		BEESINFO("No FD in " << root_path() << " for " << bfr);
+		// BEESLOGINFO("No FD in " << root_path() << " for " << bfr);
 		BEESCOUNT(scan_no_fd);
 		return bfr;
 	}

 	// Sanity check
 	if (bfr.begin() >= bfr.file_size()) {
-		BEESLOG("past EOF: " << bfr);
+		BEESLOGWARN("past EOF: " << bfr);
 		BEESCOUNT(scan_eof);
 		return bfr;
 	}
@@ -821,6 +733,9 @@ BeesContext::scan_forward(const BeesFileRange &bfr)
 			e = ew.current();

 			catch_all([&]() {
+				uint64_t extent_bytenr = e.bytenr();
+				BEESNOTE("waiting for extent bytenr " << to_hex(extent_bytenr));
+				auto extent_lock = m_extent_lock_set.make_lock(extent_bytenr);
 				Timer one_extent_timer;
 				return_bfr = scan_one_extent(bfr, e);
 				BEESCOUNTADD(scanf_extent_ms, one_extent_timer.age() * 1000);
@@ -847,11 +762,42 @@ BeesResolveAddrResult::BeesResolveAddrResult()
 {
 }

+void
+BeesContext::wait_for_balance()
+{
+	Timer balance_timer;
+	BEESNOTE("WORKAROUND: waiting for balance to stop");
+	while (true) {
+		btrfs_ioctl_balance_args args;
+		memset_zero<btrfs_ioctl_balance_args>(&args);
+		const int ret = ioctl(root_fd(), BTRFS_IOC_BALANCE_PROGRESS, &args);
+		if (ret < 0) {
+			// Either can't get balance status or not running, exit either way
+			break;
+		}
+
+		if (!(args.state & BTRFS_BALANCE_STATE_RUNNING)) {
+			// Balance not running, doesn't matter if paused or cancelled
+			break;
+		}
+
+		BEESLOGDEBUG("WORKAROUND: Waiting " << balance_timer << "s for balance to stop");
+		sleep(BEES_BALANCE_POLL_INTERVAL);
+	}
+}
+
 BeesResolveAddrResult
 BeesContext::resolve_addr_uncached(BeesAddress addr)
 {
 	THROW_CHECK1(invalid_argument, addr, !addr.is_magic());
 	THROW_CHECK0(invalid_argument, !!root_fd());
+
+	// Is there a bug where resolve and balance cause a crash (BUG_ON at fs/btrfs/ctree.c:1227)?
+	// Apparently yes, and more than one.
+	// Wait for the balance to finish before we run LOGICAL_INO
+	wait_for_balance();
+
+	// Time how long this takes
 	Timer resolve_timer;

 	// There is no performance benefit if we restrict the buffer size.
@@ -876,7 +822,7 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)
 	if (rt_age < BEES_TOXIC_DURATION && log_ino.m_iors.size() < BEES_MAX_EXTENT_REF_COUNT) {
 		rv.m_is_toxic = false;
 	} else {
-		BEESLOG("WORKAROUND: toxic address " << addr << " in " << root_path() << " with " << log_ino.m_iors.size() << " refs took " << rt_age << "s in LOGICAL_INO");
+		BEESLOGWARN("WORKAROUND: toxic address " << addr << " in " << root_path() << " with " << log_ino.m_iors.size() << " refs took " << rt_age << "s in LOGICAL_INO");
 		BEESCOUNT(resolve_toxic);
 		rv.m_is_toxic = true;
 	}
@@ -901,7 +847,7 @@ void
 BeesContext::set_root_fd(Fd fd)
 {
 	uint64_t root_fd_treeid = btrfs_get_root_id(fd);
-	BEESLOG("set_root_fd " << name_fd(fd));
+	BEESLOGINFO("set_root_fd " << name_fd(fd));
 	BEESTRACE("set_root_fd " << name_fd(fd));
 	THROW_CHECK1(invalid_argument, root_fd_treeid, root_fd_treeid == BTRFS_FS_TREE_OBJECTID);
 	Stat st(fd);
@@ -910,9 +856,10 @@ BeesContext::set_root_fd(Fd fd)
 	BtrfsIoctlFsInfoArgs fsinfo;
 	fsinfo.do_ioctl(fd);
 	m_root_uuid = fsinfo.uuid();
-	BEESLOG("Filesystem UUID is " << m_root_uuid);
+	BEESLOGINFO("Filesystem UUID is " << m_root_uuid);

-	// 65536 is big enough for two max-sized extents
+	// 65536 is big enough for two max-sized extents.
+	// Need enough total space in the cache for the maximum number of active threads.
 	m_resolve_cache.max_size(65536);
 	m_resolve_cache.func([&](BeesAddress addr) -> BeesResolveAddrResult {
 		return resolve_addr_uncached(addr);
@@ -921,13 +868,13 @@ BeesContext::set_root_fd(Fd fd)
 	// Start queue producers
 	roots();

-	BEESLOG("returning from set_root_fd in " << name_fd(fd));
+	BEESLOGINFO("returning from set_root_fd in " << name_fd(fd));
 }

 void
 BeesContext::blacklist_add(const BeesFileId &fid)
 {
-	BEESLOG("Adding " << fid << " to blacklist");
+	BEESLOGDEBUG("Adding " << fid << " to blacklist");
 	unique_lock<mutex> lock(m_blacklist_mutex);
 	m_blacklist.insert(fid);
 }
@@ -953,7 +900,8 @@ BeesContext::tmpfile()
 	if (!m_tmpfiles[this_thread::get_id()]) {
 		m_tmpfiles[this_thread::get_id()] = make_shared<BeesTempFile>(shared_from_this());
 	}
-	return m_tmpfiles[this_thread::get_id()];
+	auto rv = m_tmpfiles[this_thread::get_id()];
+	return rv;
 }

 shared_ptr<BeesFdCache>
@@ -964,7 +912,8 @@ BeesContext::fd_cache()
 	if (!m_fd_cache) {
 		m_fd_cache = make_shared<BeesFdCache>();
 	}
-	return m_fd_cache;
+	auto rv = m_fd_cache;
+	return rv;
 }

 shared_ptr<BeesRoots>
@@ -975,7 +924,8 @@ BeesContext::roots()
 	if (!m_roots) {
 		m_roots = make_shared<BeesRoots>(shared_from_this());
 	}
-	return m_roots;
+	auto rv = m_roots;
+	return rv;
 }

 shared_ptr<BeesHashTable>
@@ -986,13 +936,14 @@ BeesContext::hash_table()
 	if (!m_hash_table) {
 		m_hash_table = make_shared<BeesHashTable>(shared_from_this(), "beeshash.dat");
 	}
-	return m_hash_table;
+	auto rv = m_hash_table;
+	return rv;
 }

 void
 BeesContext::set_root_path(string path)
 {
-	BEESLOG("set_root_path " << path);
+	BEESLOGINFO("set_root_path " << path);
 	m_root_path = path;
 	set_root_fd(open_or_die(m_root_path, FLAGS_OPEN_DIR));
 }
@@ -1002,8 +953,3 @@ BeesContext::insert_root_ino(Fd fd)
 {
 	fd_cache()->insert_root_ino(shared_from_this(), fd);
 }
-
-// instantiate templates for linkage ----------------------------------------
-
-template class BeesWorkQueue<BeesFileRange>;
-template class BeesWorkQueue<BeesRangePair>;
--- a/src/bees-hash.cc
+++ b/src/bees-hash.cc
@@ -11,13 +11,6 @@
 using namespace crucible;
 using namespace std;

-static inline
-bool
-using_any_madvise()
-{
-	return true;
-}
-
 ostream &
 operator<<(ostream &os, const BeesHash &bh)
 {
@@ -31,14 +24,16 @@ operator<<(ostream &os, const BeesHashTable::Cell &bhte)
 		  << BeesAddress(bhte.e_addr) << " }";
 }

+#if 0
+static
 void
-dump_bucket(BeesHashTable::Cell *p, BeesHashTable::Cell *q)
+dump_bucket_locked(BeesHashTable::Cell *p, BeesHashTable::Cell *q)
 {
-	// Must be called while holding m_bucket_mutex
 	for (auto i = p; i < q; ++i) {
 		BEESLOG("Entry " << i - p << " " << *i);
 	}
 }
+#endif

 const bool VERIFY_CLEARS_BUGS = false;

@@ -51,7 +46,7 @@ verify_cell_range(BeesHashTable::Cell *p, BeesHashTable::Cell *q, bool clear_bug
 	for (BeesHashTable::Cell *cell = p; cell < q; ++cell) {
 		if (cell->e_addr && cell->e_addr < 0x1000) {
 			BEESCOUNT(bug_hash_magic_addr);
-			BEESINFO("Bad hash table address hash " << to_hex(cell->e_hash) << " addr " << to_hex(cell->e_addr));
+			BEESLOGDEBUG("Bad hash table address hash " << to_hex(cell->e_hash) << " addr " << to_hex(cell->e_addr));
 			if (clear_bugs) {
 				cell->e_addr = 0;
 				cell->e_hash = 0;
@@ -60,8 +55,8 @@ verify_cell_range(BeesHashTable::Cell *p, BeesHashTable::Cell *q, bool clear_bug
 		}
 		if (cell->e_addr && !seen_it.insert(*cell).second) {
 			BEESCOUNT(bug_hash_duplicate_cell);
-			// BEESLOG("Duplicate hash table entry:\nthis = " << *cell << "\nold = " << *seen_it.find(*cell));
-			BEESINFO("Duplicate hash table entry: " << *cell);
+			// BEESLOGDEBUG("Duplicate hash table entry:\nthis = " << *cell << "\nold = " << *seen_it.find(*cell));
+			BEESLOGDEBUG("Duplicate hash table entry: " << *cell);
 			if (clear_bugs) {
 				cell->e_addr = 0;
 				cell->e_hash = 0;
@@ -98,68 +93,81 @@ BeesHashTable::get_extent_range(HashType hash)
 	return make_pair(bp, ep);
 }

-void
-BeesHashTable::flush_dirty_extents()
+bool
+BeesHashTable::flush_dirty_extent(uint64_t extent_index)
 {
-	if (using_shared_map()) return;
+	BEESNOTE("flushing extent #" << extent_index << " of " << m_extents << " extents");

-	THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);
+	auto lock = lock_extent_by_index(extent_index);

-	unique_lock<mutex> lock(m_extent_mutex);
-	auto dirty_extent_copy = m_buckets_dirty;
-	m_buckets_dirty.clear();
-	if (dirty_extent_copy.empty()) {
-		BEESNOTE("idle");
-		m_condvar.wait(lock);
-		return; // please call later, i.e. immediately
+	// Not dirty, nothing to do
+	if (!m_extent_metadata.at(extent_index).m_dirty) {
+		return false;
 	}
-	lock.unlock();

-	size_t extent_counter = 0;
-	for (auto extent_number : dirty_extent_copy) {
-		++extent_counter;
-		BEESNOTE("flush extent #" << extent_number << " (" << extent_counter << " of " << dirty_extent_copy.size() << ")");
-		catch_all([&]() {
-			uint8_t *dirty_extent     = m_extent_ptr[extent_number].p_byte;
-			uint8_t *dirty_extent_end = m_extent_ptr[extent_number + 1].p_byte;
-			THROW_CHECK1(out_of_range, dirty_extent,     dirty_extent     >= m_byte_ptr);
-			THROW_CHECK1(out_of_range, dirty_extent_end, dirty_extent_end <= m_byte_ptr_end);
-			if (using_shared_map()) {
-				BEESTOOLONG("flush extent " << extent_number);
-				copy(dirty_extent, dirty_extent_end, dirty_extent);
-			} else {
-				BEESTOOLONG("pwrite(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
-				// Page locks slow us down more than copying the data does
-				vector<uint8_t> extent_copy(dirty_extent, dirty_extent_end);
-				pwrite_or_die(m_fd, extent_copy, dirty_extent - m_byte_ptr);
-				BEESCOUNT(hash_extent_out);
-			}
-		});
-		BEESNOTE("flush rate limited at extent #" << extent_number << " (" << extent_counter << " of " << dirty_extent_copy.size() << ")");
-		m_flush_rate_limit.sleep_for(BLOCK_SIZE_HASHTAB_EXTENT);
-	}
+	bool wrote_extent = false;
+
+	catch_all([&]() {
+		uint8_t *dirty_extent     = m_extent_ptr[extent_index].p_byte;
+		uint8_t *dirty_extent_end = m_extent_ptr[extent_index + 1].p_byte;
+		THROW_CHECK1(out_of_range, dirty_extent,     dirty_extent     >= m_byte_ptr);
+		THROW_CHECK1(out_of_range, dirty_extent_end, dirty_extent_end <= m_byte_ptr_end);
+		THROW_CHECK2(out_of_range, dirty_extent_end, dirty_extent, dirty_extent_end - dirty_extent == BLOCK_SIZE_HASHTAB_EXTENT);
+		BEESTOOLONG("pwrite(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
+		// Copy the extent because we might be stuck writing for a while
+		vector<uint8_t> extent_copy(dirty_extent, dirty_extent_end);
+
+		// Mark extent non-dirty while we still hold the lock
+		m_extent_metadata.at(extent_index).m_dirty = false;
+
+		// Release the lock
+		lock.unlock();
+
+		// Write the extent (or not)
+		pwrite_or_die(m_fd, extent_copy, dirty_extent - m_byte_ptr);
+		BEESCOUNT(hash_extent_out);
+
+		wrote_extent = true;
+	});
+
+	BEESNOTE("flush rate limited after extent #" << extent_index << " of " << m_extents << " extents");
+	m_flush_rate_limit.sleep_for(BLOCK_SIZE_HASHTAB_EXTENT);
+	return wrote_extent;
 }

 void
-BeesHashTable::set_extent_dirty(HashType hash)
+BeesHashTable::flush_dirty_extents()
 {
-	if (using_shared_map()) return;
 	THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);
-	auto pr = get_extent_range(hash);
-	uint64_t extent_number = reinterpret_cast<Extent *>(pr.first) - m_extent_ptr;
-	THROW_CHECK1(runtime_error, extent_number, extent_number < m_extents);
-	unique_lock<mutex> lock(m_extent_mutex);
-	m_buckets_dirty.insert(extent_number);
-	m_condvar.notify_one();
+
+	uint64_t wrote_extents = 0;
+	for (size_t extent_index = 0; extent_index < m_extents; ++extent_index) {
+		if (flush_dirty_extent(extent_index)) {
+			++wrote_extents;
+		}
+	}
+
+	BEESNOTE("idle after writing " << wrote_extents << " of " << m_extents << " extents");
+	unique_lock<mutex> lock(m_dirty_mutex);
+	m_dirty_condvar.wait(lock);
+}
+
+void
+BeesHashTable::set_extent_dirty_locked(uint64_t extent_index)
+{
+	// Must already be locked
+	m_extent_metadata.at(extent_index).m_dirty = true;
+
+	// Signal writeback thread
+	unique_lock<mutex> dirty_lock(m_dirty_mutex);
+	m_dirty_condvar.notify_one();
 }

 void
 BeesHashTable::writeback_loop()
 {
-	if (!using_shared_map()) {
-		while (1) {
-			flush_dirty_extents();
-		}
+	while (true) {
+		flush_dirty_extents();
 	}
 }

@@ -177,14 +185,8 @@ percent(size_t num, size_t den)
 void
 BeesHashTable::prefetch_loop()
 {
-	// Always do the mlock, whether shared or not
-	THROW_CHECK1(runtime_error, m_size, m_size > 0);
-	catch_all([&]() {
-		BEESNOTE("mlock " << pretty(m_size));
-		DIE_IF_NON_ZERO(mlock(m_byte_ptr, m_size));
-	});
-
-	while (1) {
+	bool not_locked = true;
+	while (true) {
 		size_t width = 64;
 		vector<size_t> occupancy(width, 0);
 		size_t occupied_count = 0;
@@ -195,13 +197,13 @@ BeesHashTable::prefetch_loop()
 		size_t unaligned_eof_count = 0;

 		for (uint64_t ext = 0; ext < m_extents; ++ext) {
-			BEESNOTE("prefetching hash table extent " << ext << " of " << m_extent_ptr_end - m_extent_ptr);
+			BEESNOTE("prefetching hash table extent #" << ext << " of " << m_extents);
 			catch_all([&]() {
-				fetch_missing_extent(ext * c_buckets_per_extent);
+				fetch_missing_extent_by_index(ext);

-				BEESNOTE("analyzing hash table extent " << ext << " of " << m_extent_ptr_end - m_extent_ptr);
+				BEESNOTE("analyzing hash table extent #" << ext << " of " << m_extents);
 				bool duplicate_bugs_found = false;
-				unique_lock<mutex> lock(m_bucket_mutex);
+				auto lock = lock_extent_by_index(ext);
 				for (Bucket *bucket = m_extent_ptr[ext].p_buckets; bucket < m_extent_ptr[ext + 1].p_buckets; ++bucket) {
 					if (verify_cell_range(bucket[0].p_cells, bucket[1].p_cells)) {
 						duplicate_bugs_found = true;
@@ -230,9 +232,8 @@ BeesHashTable::prefetch_loop()
 					// Count these instead of calculating the number so we get better stats in case of exceptions
 					occupied_count += this_bucket_occupied_count;
 				}
-				lock.unlock();
 				if (duplicate_bugs_found) {
-					set_extent_dirty(ext);
+					set_extent_dirty_locked(ext);
 				}
 			});
 		}
@@ -268,20 +269,18 @@ BeesHashTable::prefetch_loop()
 			out << "\n";
 		}

-		size_t uncompressed_count = occupied_count - compressed_count;
-		size_t legacy_count = compressed_count - compressed_offset_count;
+		size_t uncompressed_count = occupied_count - compressed_offset_count;

 		ostringstream graph_blob;

 		graph_blob << "Now:     " << format_time(time(NULL)) << "\n";
 		graph_blob << "Uptime:  " << m_ctx->total_timer().age() << " seconds\n";
+		graph_blob << "Version: " << BEES_VERSION << "\n";

-		graph_blob 
-			<< "\nHash table page occupancy histogram (" << occupied_count << "/" << total_count << " cells occupied, " << (occupied_count * 100 / total_count) << "%)\n" 
+		graph_blob
+			<< "\nHash table page occupancy histogram (" << occupied_count << "/" << total_count << " cells occupied, " << (occupied_count * 100 / total_count) << "%)\n"
 			<< out.str() << "0%      |      25%      |      50%      |      75%      |   100% page fill\n"
-			<< "compressed " << compressed_count << " (" << percent(compressed_count, occupied_count) << ")"
-			<< " new-style " << compressed_offset_count << " (" << percent(compressed_offset_count, occupied_count) << ")"
-			<< " old-style " << legacy_count << " (" << percent(legacy_count, occupied_count) << ")\n"
+			<< "compressed " << compressed_count << " (" << percent(compressed_count, occupied_count) << ")\n"
 			<< "uncompressed " << uncompressed_count << " (" << percent(uncompressed_count, occupied_count) << ")"
 			<< " unaligned_eof " << unaligned_eof_count << " (" << percent(unaligned_eof_count, occupied_count) << ")"
 			<< " toxic " << toxic_count << " (" << percent(toxic_count, occupied_count) << ")";
@@ -296,66 +295,93 @@ BeesHashTable::prefetch_loop()
 		auto avg_rates = thisStats / m_ctx->total_timer().age();
 		graph_blob << "\t" << avg_rates << "\n";

-		BEESLOG(graph_blob.str());
+		BEESLOGINFO(graph_blob.str());
 		catch_all([&]() {
 			m_stats_file.write(graph_blob.str());
 		});

+		if (not_locked) {
+			// Always do the mlock, whether shared or not
+			THROW_CHECK1(runtime_error, m_size, m_size > 0);
+			BEESLOGINFO("mlock(" << pretty(m_size) << ")...");
+			Timer lock_time;
+			catch_all([&]() {
+				BEESNOTE("mlock " << pretty(m_size));
+				DIE_IF_NON_ZERO(mlock(m_byte_ptr, m_size));
+			});
+			BEESLOGINFO("mlock(" << pretty(m_size) << ") done in " << lock_time << " sec");
+			not_locked = false;
+		}
+
 		BEESNOTE("idle " << BEES_HASH_TABLE_ANALYZE_INTERVAL << "s");
 		nanosleep(BEES_HASH_TABLE_ANALYZE_INTERVAL);
 	}
 }

-void
-BeesHashTable::fetch_missing_extent(HashType hash)
+size_t
+BeesHashTable::hash_to_extent_index(HashType hash)
+{
+	auto pr = get_extent_range(hash);
+	uint64_t extent_index = reinterpret_cast<const Extent *>(pr.first) - m_extent_ptr;
+	THROW_CHECK2(runtime_error, extent_index, m_extents, extent_index < m_extents);
+	return extent_index;
+}
+
+BeesHashTable::ExtentMetaData::ExtentMetaData() :
+	m_mutex_ptr(make_shared<mutex>())
+{
+}
+
+unique_lock<mutex>
+BeesHashTable::lock_extent_by_index(uint64_t extent_index)
+{
+	THROW_CHECK2(out_of_range, extent_index, m_extents, extent_index < m_extents);
+	return unique_lock<mutex>(*m_extent_metadata.at(extent_index).m_mutex_ptr);
+}
+
+unique_lock<mutex>
+BeesHashTable::lock_extent_by_hash(HashType hash)
 {
 	BEESTOOLONG("fetch_missing_extent for hash " << to_hex(hash));
-	if (using_shared_map()) return;
-	THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);
-	auto pr = get_extent_range(hash);
-	uint64_t extent_number = reinterpret_cast<Extent *>(pr.first) - m_extent_ptr;
-	THROW_CHECK1(runtime_error, extent_number, extent_number < m_extents);
+	return lock_extent_by_index(hash_to_extent_index(hash));
+}

-	unique_lock<mutex> lock(m_extent_mutex);
-	if (!m_buckets_missing.count(extent_number)) {
+void
+BeesHashTable::fetch_missing_extent_by_index(uint64_t extent_index)
+{
+	BEESNOTE("checking hash extent #" << extent_index << " of " << m_extents << " extents");
+	auto lock = lock_extent_by_index(extent_index);
+	if (!m_extent_metadata.at(extent_index).m_missing) {
 		return;
 	}

-	size_t missing_buckets = m_buckets_missing.size();
-	lock.unlock();
-
-	BEESNOTE("fetch waiting for hash extent #" << extent_number << ", " << missing_buckets << " left to fetch");
-
-	// Acquire blocking lock on this extent only
-	LockSet<uint64_t>::Lock extent_lock(m_extent_lock_set, extent_number);
-
-	// Check missing again because someone else might have fetched this
-	// extent for us while we didn't hold any locks
-	lock.lock();
-	if (!m_buckets_missing.count(extent_number)) {
-		BEESCOUNT(hash_extent_in_twice);
-		return;
-	}
-	lock.unlock();
-
 	// OK we have to read this extent
-	BEESNOTE("fetching hash extent #" << extent_number << ", " << missing_buckets << " left to fetch");
+	BEESNOTE("fetching hash extent #" << extent_index << " of " << m_extents << " extents");
+	BEESTRACE("Fetching hash extent #" << extent_index << " of " << m_extents << " extents");
+	BEESTOOLONG("Fetching hash extent #" << extent_index << " of " << m_extents << " extents");

-	BEESTRACE("Fetching missing hash extent " << extent_number);
-	uint8_t *dirty_extent     = m_extent_ptr[extent_number].p_byte;
-	uint8_t *dirty_extent_end = m_extent_ptr[extent_number + 1].p_byte;
+	uint8_t *dirty_extent     = m_extent_ptr[extent_index].p_byte;
+	uint8_t *dirty_extent_end = m_extent_ptr[extent_index + 1].p_byte;

-	{
+	// If the read fails don't retry, just go with whatever data we have
+	m_extent_metadata.at(extent_index).m_missing = false;
+
+	catch_all([&]() {
 		BEESTOOLONG("pread(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
 		pread_or_die(m_fd, dirty_extent, dirty_extent_end - dirty_extent, dirty_extent - m_byte_ptr);
-	}
+	});

+	// Only count extents successfully read
 	BEESCOUNT(hash_extent_in);
-	// We don't block when fetching an extent but we do slow down the
-	// prefetch thread.
-	m_prefetch_rate_limit.borrow(BLOCK_SIZE_HASHTAB_EXTENT);
-	lock.lock();
-	m_buckets_missing.erase(extent_number);
+}
+
+void
+BeesHashTable::fetch_missing_extent_by_hash(HashType hash)
+{
+	uint64_t extent_index = hash_to_extent_index(hash);
+	BEESNOTE("waiting to fetch hash extent #" << extent_index << " of " << m_extents << " extents");
+
+	fetch_missing_extent_by_index(extent_index);
 }

 bool
@@ -377,10 +403,10 @@ BeesHashTable::find_cell(HashType hash)
 		rv.push_back(toxic_cell);
 		return rv;
 	}
-	fetch_missing_extent(hash);
+	fetch_missing_extent_by_hash(hash);
 	BEESTOOLONG("find_cell hash " << BeesHash(hash));
 	vector<Cell> rv;
-	unique_lock<mutex> lock(m_bucket_mutex);
+	auto lock = lock_extent_by_hash(hash);
 	auto er = get_cell_range(hash);
 	// FIXME:  Weed out zero addresses in the table due to earlier bugs
 	copy_if(er.first, er.second, back_inserter(rv), [=](const Cell &ip) { return ip.e_hash == hash && ip.e_addr >= 0x1000; });
@@ -396,10 +422,9 @@ BeesHashTable::find_cell(HashType hash)
 void
 BeesHashTable::erase_hash_addr(HashType hash, AddrType addr)
 {
-	// if (m_shared) return;
-	fetch_missing_extent(hash);
+	fetch_missing_extent_by_hash(hash);
 	BEESTOOLONG("erase hash " << to_hex(hash) << " addr " << addr);
-	unique_lock<mutex> lock(m_bucket_mutex);
+	auto lock = lock_extent_by_hash(hash);
 	auto er = get_cell_range(hash);
 	Cell mv(hash, addr);
 	Cell *ip = find(er.first, er.second, mv);
@@ -407,11 +432,11 @@ BeesHashTable::erase_hash_addr(HashType hash, AddrType addr)
 	if (found) {
 		// Lookups on invalid addresses really hurt us.  Kill it with fire!
 		*ip = Cell(0, 0);
-		set_extent_dirty(hash);
+		set_extent_dirty_locked(hash_to_extent_index(hash));
 		BEESCOUNT(hash_erase);
 #if 0
 		if (verify_cell_range(er.first, er.second)) {
-			BEESINFO("while erasing hash " << hash << " addr " << addr);
+			BEESLOGDEBUG("while erasing hash " << hash << " addr " << addr);
 		}
 #endif
 	}
@@ -425,9 +450,9 @@ BeesHashTable::erase_hash_addr(HashType hash, AddrType addr)
 bool
 BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
 {
-	fetch_missing_extent(hash);
+	fetch_missing_extent_by_hash(hash);
 	BEESTOOLONG("push_front_hash_addr hash " << BeesHash(hash) <<" addr " << BeesAddress(addr));
-	unique_lock<mutex> lock(m_bucket_mutex);
+	auto lock = lock_extent_by_hash(hash);
 	auto er = get_cell_range(hash);
 	Cell mv(hash, addr);
 	Cell *ip = find(er.first, er.second, mv);
@@ -457,12 +482,12 @@ BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
 	// There is now a space at the front, insert there if different
 	if (er.first[0] != mv) {
 		er.first[0] = mv;
-		set_extent_dirty(hash);
+		set_extent_dirty_locked(hash_to_extent_index(hash));
 		BEESCOUNT(hash_front);
 	}
 #if 0
 	if (verify_cell_range(er.first, er.second)) {
-		BEESINFO("while push_fronting hash " << hash << " addr " << addr);
+		BEESLOGDEBUG("while push_fronting hash " << hash << " addr " << addr);
 	}
 #endif
 	return found;
@@ -476,9 +501,9 @@ BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
 bool
 BeesHashTable::push_random_hash_addr(HashType hash, AddrType addr)
 {
-	fetch_missing_extent(hash);
+	fetch_missing_extent_by_hash(hash);
 	BEESTOOLONG("push_random_hash_addr hash " << BeesHash(hash) << " addr " << BeesAddress(addr));
-	unique_lock<mutex> lock(m_bucket_mutex);
+	auto lock = lock_extent_by_hash(hash);
 	auto er = get_cell_range(hash);
 	Cell mv(hash, addr);
 	Cell *ip = find(er.first, er.second, mv);
@@ -541,14 +566,14 @@ BeesHashTable::push_random_hash_addr(HashType hash, AddrType addr)
 	case_cond = 5;
 ret_dirty:
 	BEESCOUNT(hash_insert);
-	set_extent_dirty(hash);
+	set_extent_dirty_locked(hash_to_extent_index(hash));
 ret:
 #if 0
 	if (verify_cell_range(er.first, er.second, false)) {
 		BEESLOG("while push_randoming (case " << case_cond << ") pos " << pos
 			<< " ip " << (ip - er.first) << " " << mv);
-		// dump_bucket(saved.data(), saved.data() + saved.size());
-		// dump_bucket(er.first, er.second);
+		// dump_bucket_locked(saved.data(), saved.data() + saved.size());
+		// dump_bucket_locked(er.first, er.second);
 	}
 #else
 	(void)case_cond;
@@ -563,9 +588,9 @@ BeesHashTable::try_mmap_flags(int flags)
 		THROW_CHECK1(out_of_range, m_size, m_size > 0);
 		Timer map_time;
 		catch_all([&]() {
-			BEESLOG("mapping hash table size " << m_size << " with flags " << mmap_flags_ntoa(flags));
+			BEESLOGINFO("mapping hash table size " << m_size << " with flags " << mmap_flags_ntoa(flags));
 			void *ptr = mmap_or_die(nullptr, m_size, PROT_READ | PROT_WRITE, flags, flags & MAP_ANONYMOUS ? -1 : int(m_fd), 0);
-			BEESLOG("mmap done in " << map_time << " sec");
+			BEESLOGINFO("mmap done in " << map_time << " sec");
 			m_cell_ptr = static_cast<Cell *>(ptr);
 			void *ptr_end = static_cast<uint8_t *>(ptr) + m_size;
 			m_cell_ptr_end = static_cast<Cell *>(ptr_end);
@@ -574,12 +599,39 @@ BeesHashTable::try_mmap_flags(int flags)
 }

 void
-BeesHashTable::set_shared(bool shared)
+BeesHashTable::open_file()
 {
-	m_shared = shared;
+	// OK open hash table
+	BEESNOTE("opening hash table '" << m_filename << "' target size " << m_size << " (" << pretty(m_size) << ")");
+
+	// Try to open existing hash table
+	Fd new_fd = openat(m_ctx->home_fd(), m_filename.c_str(), FLAGS_OPEN_FILE_RW, 0700);
+
+	// If that doesn't work, try to make a new one
+	if (!new_fd) {
+		string tmp_filename = m_filename + ".tmp";
+		BEESNOTE("creating new hash table '" << tmp_filename << "'");
+		BEESLOGINFO("Creating new hash table '" << tmp_filename << "'");
+		unlinkat(m_ctx->home_fd(), tmp_filename.c_str(), 0);
+		new_fd = openat_or_die(m_ctx->home_fd(), tmp_filename, FLAGS_CREATE_FILE, 0700);
+		BEESNOTE("truncating new hash table '" << tmp_filename << "' size " << m_size << " (" << pretty(m_size) << ")");
+		BEESLOGINFO("Truncating new hash table '" << tmp_filename << "' size " << m_size << " (" << pretty(m_size) << ")");
+		ftruncate_or_die(new_fd, m_size);
+		BEESNOTE("truncating new hash table '" << tmp_filename << "' -> '" << m_filename << "'");
+		BEESLOGINFO("Truncating new hash table '" << tmp_filename << "' -> '" << m_filename << "'");
+		renameat_or_die(m_ctx->home_fd(), tmp_filename, m_ctx->home_fd(), m_filename);
+	}
+
+	Stat st(new_fd);
+	off_t new_size = st.st_size;
+
+	THROW_CHECK1(invalid_argument, new_size, new_size > 0);
+	THROW_CHECK1(invalid_argument, new_size, (new_size % BLOCK_SIZE_HASHTAB_EXTENT) == 0);
+	m_size = new_size;
+	m_fd = new_fd;
 }

-BeesHashTable::BeesHashTable(shared_ptr<BeesContext> ctx, string filename) :
+BeesHashTable::BeesHashTable(shared_ptr<BeesContext> ctx, string filename, off_t size) :
 	m_ctx(ctx),
 	m_size(0),
 	m_void_ptr(nullptr),
@@ -587,66 +639,69 @@ BeesHashTable::BeesHashTable(shared_ptr<BeesContext> ctx, string filename) :
 	m_buckets(0),
 	m_cells(0),
 	m_writeback_thread("hash_writeback"),
-	m_prefetch_thread("hash_prefetch " + m_ctx->root_path()),
+	m_prefetch_thread("hash_prefetch"),
 	m_flush_rate_limit(BEES_FLUSH_RATE),
-	m_prefetch_rate_limit(BEES_FLUSH_RATE),
 	m_stats_file(m_ctx->home_fd(), "beesstats.txt")
 {
-	BEESNOTE("opening hash table " << filename);
-
-	m_fd = openat_or_die(m_ctx->home_fd(), filename, FLAGS_OPEN_FILE_RW, 0700);
-	Stat st(m_fd);
-	m_size = st.st_size;
-
-	BEESTRACE("hash table size " << m_size);
-	BEESTRACE("hash table bucket size " << BLOCK_SIZE_HASHTAB_BUCKET);
-	BEESTRACE("hash table extent size " << BLOCK_SIZE_HASHTAB_EXTENT);
-
+	// Sanity checks to protect the implementation from its weaknesses
 	THROW_CHECK2(invalid_argument, BLOCK_SIZE_HASHTAB_BUCKET, BLOCK_SIZE_HASHTAB_EXTENT, (BLOCK_SIZE_HASHTAB_EXTENT % BLOCK_SIZE_HASHTAB_BUCKET) == 0);

-	// Does the union work?
-	THROW_CHECK2(runtime_error, m_void_ptr, m_cell_ptr, m_void_ptr == m_cell_ptr);
-	THROW_CHECK2(runtime_error, m_void_ptr, m_byte_ptr, m_void_ptr == m_byte_ptr);
-	THROW_CHECK2(runtime_error, m_void_ptr, m_bucket_ptr, m_void_ptr == m_bucket_ptr);
-	THROW_CHECK2(runtime_error, m_void_ptr, m_extent_ptr, m_void_ptr == m_extent_ptr);
-
 	// There's more than one union
 	THROW_CHECK2(runtime_error, sizeof(Bucket), BLOCK_SIZE_HASHTAB_BUCKET, BLOCK_SIZE_HASHTAB_BUCKET == sizeof(Bucket));
 	THROW_CHECK2(runtime_error, sizeof(Bucket::p_byte), BLOCK_SIZE_HASHTAB_BUCKET, BLOCK_SIZE_HASHTAB_BUCKET == sizeof(Bucket::p_byte));
 	THROW_CHECK2(runtime_error, sizeof(Extent), BLOCK_SIZE_HASHTAB_EXTENT, BLOCK_SIZE_HASHTAB_EXTENT == sizeof(Extent));
 	THROW_CHECK2(runtime_error, sizeof(Extent::p_byte), BLOCK_SIZE_HASHTAB_EXTENT, BLOCK_SIZE_HASHTAB_EXTENT == sizeof(Extent::p_byte));

-	BEESLOG("opened hash table filename '" << filename << "' length " << m_size);
+	m_filename = filename;
+	m_size = size;
+	open_file();
+
+	// Now we know size we can compute stuff
+
+	BEESTRACE("hash table size " << m_size);
+	BEESTRACE("hash table bucket size " << BLOCK_SIZE_HASHTAB_BUCKET);
+	BEESTRACE("hash table extent size " << BLOCK_SIZE_HASHTAB_EXTENT);
+
+	BEESLOGINFO("opened hash table filename '" << filename << "' length " << m_size);
 	m_buckets = m_size / BLOCK_SIZE_HASHTAB_BUCKET;
 	m_cells = m_buckets * c_cells_per_bucket;
 	m_extents = (m_size + BLOCK_SIZE_HASHTAB_EXTENT - 1) / BLOCK_SIZE_HASHTAB_EXTENT;
-	BEESLOG("\tcells " << m_cells << ", buckets " << m_buckets << ", extents " << m_extents);
+	BEESLOGINFO("\tcells " << m_cells << ", buckets " << m_buckets << ", extents " << m_extents);

-	BEESLOG("\tflush rate limit " << BEES_FLUSH_RATE);
+	BEESLOGINFO("\tflush rate limit " << BEES_FLUSH_RATE);

-	if (using_shared_map()) {
-		try_mmap_flags(MAP_SHARED);
-	} else {
-		try_mmap_flags(MAP_PRIVATE | MAP_ANONYMOUS);
-	}
+	// Try to mmap that much memory
+	try_mmap_flags(MAP_PRIVATE | MAP_ANONYMOUS);

 	if (!m_cell_ptr) {
-		THROW_ERROR(runtime_error, "unable to mmap " << filename);
+		THROW_ERRNO("unable to mmap " << filename);
 	}

-	if (!using_shared_map()) {
-		// madvise fails if MAP_SHARED
-		if (using_any_madvise()) {
-			// DONTFORK because we sometimes do fork,
-			// but the child doesn't touch any of the many, many pages
-			BEESTOOLONG("madvise(MADV_HUGEPAGE | MADV_DONTFORK)");
-			DIE_IF_NON_ZERO(madvise(m_byte_ptr, m_size, MADV_HUGEPAGE | MADV_DONTFORK));
-		}
-		for (uint64_t i = 0; i < m_size / sizeof(Extent); ++i) {
-			m_buckets_missing.insert(i);
+	// Do unions work the way we think (and rely on)?
+	THROW_CHECK2(runtime_error, m_void_ptr, m_cell_ptr, m_void_ptr == m_cell_ptr);
+	THROW_CHECK2(runtime_error, m_void_ptr, m_byte_ptr, m_void_ptr == m_byte_ptr);
+	THROW_CHECK2(runtime_error, m_void_ptr, m_bucket_ptr, m_void_ptr == m_bucket_ptr);
+	THROW_CHECK2(runtime_error, m_void_ptr, m_extent_ptr, m_void_ptr == m_extent_ptr);
+
+	// Give all the madvise hints that the kernel understands
+	const struct madv_flag {
+		const char *name;
+		int value;
+	} madv_flags[] = {
+		{ .name = "MADV_HUGEPAGE", .value = MADV_HUGEPAGE },
+		{ .name = "MADV_DONTFORK", .value = MADV_DONTFORK },
+		{ .name = "MADV_DONTDUMP", .value = MADV_DONTDUMP },
+		{ .name = "", .value = 0 },
+	};
+	for (auto fp = madv_flags; fp->value; ++fp) {
+		BEESTOOLONG("madvise(" << fp->name << ")");
+		if (madvise(m_byte_ptr, m_size, fp->value)) {
+			BEESLOGWARN("madvise(..., " << fp->name << "): " << strerror(errno) << " (ignored)");
 		}
 	}

+	m_extent_metadata.resize(m_extents);
+
 	m_writeback_thread.exec([&]() {
 		writeback_loop();
        });
--- a/src/bees-resolve.cc
+++ b/src/bees-resolve.cc
@@ -98,90 +98,77 @@ BeesResolver::adjust_offset(const BeesFileRange &haystack, const BeesBlockData &
 		return BeesBlockData();
 	}

-	off_t lower_offset = haystack.begin();
-	off_t upper_offset = haystack.end();
+	off_t haystack_offset = haystack.begin();
 	bool is_compressed_offset = false;
 	bool is_exact = false;
-	bool is_legacy = false;
 	if (m_addr.is_compressed()) {
 		BtrfsExtentWalker ew(haystack.fd(), haystack.begin(), m_ctx->root_fd());
-		BEESTRACE("haystack extent data " << ew); 
+		BEESTRACE("haystack extent data " << ew);
 		Extent e = ew.current();
-		if (m_addr.has_compressed_offset()) {
-			off_t coff = m_addr.get_compressed_offset();
-			if (e.offset() > coff) {
-				// this extent begins after the target block
-				BEESCOUNT(adjust_offset_low);
-				return BeesBlockData();
-			}
-			coff -= e.offset();
-			if (e.size() <= coff) {
-				// this extent ends before the target block
-				BEESCOUNT(adjust_offset_high);
-				return BeesBlockData();
-			}
-			lower_offset = e.begin() + coff;
-			upper_offset = lower_offset + BLOCK_SIZE_CLONE;
-			BEESCOUNT(adjust_offset_hit);
-			is_compressed_offset = true;
-		} else {
-			lower_offset = e.begin();
-			upper_offset = e.end();
-			BEESCOUNT(adjust_legacy);
-			is_legacy = true;
+		THROW_CHECK1(runtime_error, m_addr, m_addr.has_compressed_offset());
+		off_t coff = m_addr.get_compressed_offset();
+		if (e.offset() > coff) {
+			// this extent begins after the target block
+			BEESCOUNT(adjust_offset_low);
+			return BeesBlockData();
 		}
+		coff -= e.offset();
+		if (e.size() <= coff) {
+			// this extent ends before the target block
+			BEESCOUNT(adjust_offset_high);
+			return BeesBlockData();
+		}
+		haystack_offset = e.begin() + coff;
+		BEESCOUNT(adjust_offset_hit);
+		is_compressed_offset = true;
 	} else {
 		BEESCOUNT(adjust_exact);
 		is_exact = true;
 	}

-	BEESTRACE("Checking haystack " << haystack << " offsets " << to_hex(lower_offset) << ".." << to_hex(upper_offset));
+	BEESTRACE("Checking haystack " << haystack << " offset " << to_hex(haystack_offset));

 	// Check all the blocks in the list
-	for (off_t haystack_offset = lower_offset; haystack_offset < upper_offset; haystack_offset += BLOCK_SIZE_CLONE) {
-		THROW_CHECK1(out_of_range, haystack_offset, (haystack_offset & BLOCK_MASK_CLONE) == 0);
+	THROW_CHECK1(out_of_range, haystack_offset, (haystack_offset & BLOCK_MASK_CLONE) == 0);

-		// Straw cannot extend beyond end of haystack
-		if (haystack_offset + needle.size() > haystack_size) {
-			BEESCOUNT(adjust_needle_too_long);
-			break;
-		}
-
-		// Read the haystack
-		BEESTRACE("straw " << name_fd(haystack.fd()) << ", offset " << to_hex(haystack_offset) << ", length " << needle.size());
-		BeesBlockData straw(haystack.fd(), haystack_offset, needle.size());
-
-		BEESTRACE("straw = " << straw);
-
-		// Stop if we find a match
-		if (straw.is_data_equal(needle)) {
-			BEESCOUNT(adjust_hit);
-			m_found_data = true;
-			m_found_hash = true;
-			if (is_compressed_offset) BEESCOUNT(adjust_compressed_offset_correct);
-			if (is_legacy) BEESCOUNT(adjust_legacy_correct);
-			if (is_exact) BEESCOUNT(adjust_exact_correct);
-			return straw;
-		}
-
-		if (straw.hash() != needle.hash()) {
-			// Not the same hash or data, try next block
-			BEESCOUNT(adjust_miss);
-			continue;
-		}
-
-		// Found the hash but not the data.  Yay!
-		m_found_hash = true;
-		BEESLOG("HASH COLLISION\n"
-			<< "\tneedle " << needle << "\n"
-			<< "\tstraw " << straw);
-		BEESCOUNT(hash_collision);
+	// Straw cannot extend beyond end of haystack
+	if (haystack_offset + needle.size() > haystack_size) {
+		BEESCOUNT(adjust_needle_too_long);
+		return BeesBlockData();
 	}

+	// Read the haystack
+	BEESTRACE("straw " << name_fd(haystack.fd()) << ", offset " << to_hex(haystack_offset) << ", length " << needle.size());
+	BeesBlockData straw(haystack.fd(), haystack_offset, needle.size());
+
+	BEESTRACE("straw = " << straw);
+
+	// Stop if we find a match
+	if (straw.is_data_equal(needle)) {
+		BEESCOUNT(adjust_hit);
+		m_found_data = true;
+		m_found_hash = true;
+		if (is_compressed_offset) BEESCOUNT(adjust_compressed_offset_correct);
+		if (is_exact) BEESCOUNT(adjust_exact_correct);
+		return straw;
+	}
+
+	if (straw.hash() != needle.hash()) {
+		// Not the same hash or data, try next block
+		BEESCOUNT(adjust_miss);
+		return BeesBlockData();
+	}
+
+	// Found the hash but not the data.  Yay!
+	m_found_hash = true;
+	BEESLOGINFO("HASH COLLISION\n"
+		<< "\tneedle " << needle << "\n"
+		<< "\tstraw " << straw);
+	BEESCOUNT(hash_collision);
+
 	// Ran out of offsets to try
 	BEESCOUNT(adjust_no_match);
 	if (is_compressed_offset) BEESCOUNT(adjust_compressed_offset_wrong);
-	if (is_legacy) BEESCOUNT(adjust_legacy_wrong);
 	if (is_exact) BEESCOUNT(adjust_exact_wrong);
 	m_wrong_data = true;
 	return BeesBlockData();
@@ -196,8 +183,8 @@ BeesResolver::chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &

 	Fd file_fd = m_ctx->roots()->open_root_ino(bior.m_root, bior.m_inum);
 	if (!file_fd) {
-		// Delete snapshots generate craptons of these
-		// BEESINFO("No FD in chase_extent_ref " << bior);
+		// Deleted snapshots generate craptons of these
+		// BEESLOGDEBUG("No FD in chase_extent_ref " << bior);
 		BEESCOUNT(chase_no_fd);
 		return BeesFileRange();
 	}
@@ -211,7 +198,7 @@ BeesResolver::chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &

 	// ...or are we?
 	if (file_addr.is_magic()) {
-		BEESINFO("file_addr is magic: file_addr = " << file_addr << " bior = " << bior << " needle_bbd = " << needle_bbd);
+		BEESLOGDEBUG("file_addr is magic: file_addr = " << file_addr << " bior = " << bior << " needle_bbd = " << needle_bbd);
 		BEESCOUNT(chase_wrong_magic);
 		return BeesFileRange();
 	}
@@ -220,7 +207,7 @@ BeesResolver::chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &
 	// Did we get the physical block we asked for?  The magic bits have to match too,
 	// but the compressed offset bits do not.
 	if (file_addr.get_physical_or_zero() != m_addr.get_physical_or_zero()) {
-		// BEESINFO("found addr " << file_addr << " at " << name_fd(file_fd) << " offset " << to_hex(bior.m_offset) << " but looking for " << m_addr);
+		// BEESLOGDEBUG("found addr " << file_addr << " at " << name_fd(file_fd) << " offset " << to_hex(bior.m_offset) << " but looking for " << m_addr);
 		// FIEMAP/resolve are working, but the data is old.
 		BEESCOUNT(chase_wrong_addr);
 		return BeesFileRange();
@@ -243,7 +230,7 @@ BeesResolver::chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &
 	auto new_bbd = adjust_offset(haystack_bbd, needle_bbd);
 	if (new_bbd.empty()) {
 		// matching offset search failed
-		BEESCOUNT(chase_wrong_data);
+		BEESCOUNT(chase_no_data);
 		return BeesFileRange();
 	}
 	if (new_bbd.begin() == haystack_bbd.begin()) {
@@ -368,7 +355,8 @@ BeesResolver::for_each_extent_ref(BeesBlockData bbd, function<bool(const BeesFil
 		}

 		// Look at the old data
-		catch_all([&]() {
+		// FIXME:  propagate exceptions for now.  Proper fix requires a rewrite.
+		// catch_all([&]() {
 			BEESTRACE("chase_extent_ref ino " << ino_off_root << " bbd " << bbd);
 			auto new_range = chase_extent_ref(ino_off_root, bbd);
 			// XXX: should we catch visitor's exceptions here?
@@ -378,9 +366,12 @@ BeesResolver::for_each_extent_ref(BeesBlockData bbd, function<bool(const BeesFil
 				// We have reliable block addresses now, so we guarantee we can hit the desired block.
 				// Failure in chase_extent_ref means we are done, and don't need to look up all the
 				// other references.
-				stop_now = true;
+				// Or...not?  If we have a compressed extent, some refs will not match
+				// if there is are two references to the same extent with a reference
+				// to a different extent between them.
+				// stop_now = true;
 			}
-		});
+		// });

 		if (stop_now) {
 			break;
@@ -421,7 +412,8 @@ BeesResolver::replace_dst(const BeesFileRange &dst_bfr)
 		BeesBlockData src_bbd(src_bfr.fd(), src_bfr.begin(), min(BLOCK_SIZE_SUMS, src_bfr.size()));
 		if (bbd.addr().get_physical_or_zero() == src_bbd.addr().get_physical_or_zero()) {
 			BEESCOUNT(replacedst_same);
-			return false; // i.e. continue
+			// stop looping here, all the other srcs will probably fail this test too
+			throw runtime_error("FIXME: bailing out here, need to fix this further up the call stack");
 		}

 		// Make pair(src, dst)
@@ -477,11 +469,6 @@ BeesResolver::find_all_matches(BeesBlockData &bbd)
 bool
 BeesResolver::operator<(const BeesResolver &that) const
 {
-	if (that.m_bior_count < m_bior_count) {
-		return true;
-	} else if (m_bior_count < that.m_bior_count) {
-		return false;
-	}
-	return m_addr < that.m_addr;
+	// Lowest count, highest address
+	return tie(that.m_bior_count, m_addr) < tie(m_bior_count, that.m_addr);
 }
-
--- a/src/bees-roots.cc
+++ b/src/bees-roots.cc
--- a/src/bees-thread.cc
+++ b/src/bees-thread.cc
@@ -13,19 +13,16 @@ void
 BeesThread::exec(function<void()> func)
 {
 	m_timer.reset();
-	BEESLOG("BeesThread exec " << m_name);
+	BEESLOGDEBUG("BeesThread exec " << m_name);
 	m_thread_ptr = make_shared<thread>([=]() {
-		BEESLOG("Starting thread " << m_name);
 		BeesNote::set_name(m_name);
+		BEESLOGDEBUG("Starting thread " << m_name);
 		BEESNOTE("thread function");
 		Timer thread_time;
-		catch_all([&]() {
-			DIE_IF_MINUS_ERRNO(pthread_setname_np(pthread_self(), m_name.c_str()));
-		});
 		catch_all([&]() {
 			func();
 		});
-		BEESLOG("Exiting thread " << m_name << ", " << thread_time << " sec");
+		BEESLOGDEBUG("Exiting thread " << m_name << ", " << thread_time << " sec");
 	});
 }

@@ -33,7 +30,7 @@ BeesThread::BeesThread(string name, function<void()> func) :
 	m_name(name)
 {
 	THROW_CHECK1(invalid_argument, name, !name.empty());
-	BEESLOG("BeesThread construct " << m_name);
+	BEESLOGDEBUG("BeesThread construct " << m_name);
 	exec(func);
 }

@@ -41,20 +38,20 @@ void
 BeesThread::join()
 {
 	if (!m_thread_ptr) {
-		BEESLOG("Thread " << m_name << " no thread ptr");
+		BEESLOGDEBUG("Thread " << m_name << " no thread ptr");
 		return;
 	}

-	BEESLOG("BeesThread::join " << m_name);
+	BEESLOGDEBUG("BeesThread::join " << m_name);
 	if (m_thread_ptr->joinable()) {
-		BEESLOG("Joining thread " << m_name);
+		BEESLOGDEBUG("Joining thread " << m_name);
 		Timer thread_time;
 		m_thread_ptr->join();
-		BEESLOG("Waited for " << m_name << ", " << thread_time << " sec");
+		BEESLOGDEBUG("Waited for " << m_name << ", " << thread_time << " sec");
 	} else if (!m_name.empty()) {
-		BEESLOG("BeesThread " << m_name << " not joinable");
+		BEESLOGDEBUG("BeesThread " << m_name << " not joinable");
 	} else {
-		BEESLOG("BeesThread else " << m_name);
+		BEESLOGDEBUG("BeesThread else " << m_name);
 	}
 }

@@ -67,25 +64,25 @@ BeesThread::set_name(const string &name)
 BeesThread::~BeesThread()
 {
 	if (!m_thread_ptr) {
-		BEESLOG("Thread " << m_name << " no thread ptr");
+		BEESLOGDEBUG("Thread " << m_name << " no thread ptr");
 		return;
 	}

-	BEESLOG("BeesThread destructor " << m_name);
+	BEESLOGDEBUG("BeesThread destructor " << m_name);
 	if (m_thread_ptr->joinable()) {
-		BEESLOG("Cancelling thread " << m_name);
+		BEESLOGDEBUG("Cancelling thread " << m_name);
 		int rv = pthread_cancel(m_thread_ptr->native_handle());
 		if (rv) {
-			BEESLOG("pthread_cancel returned " << strerror(-rv));
+			BEESLOGDEBUG("pthread_cancel returned " << strerror(-rv));
 		}
-		BEESLOG("Waiting for thread " << m_name);
+		BEESLOGDEBUG("Waiting for thread " << m_name);
 		Timer thread_time;
 		m_thread_ptr->join();
-		BEESLOG("Waited for " << m_name << ", " << thread_time << " sec");
+		BEESLOGDEBUG("Waited for " << m_name << ", " << thread_time << " sec");
 	} else if (!m_name.empty()) {
-		BEESLOG("Thread " << m_name << " not joinable");
+		BEESLOGDEBUG("Thread " << m_name << " not joinable");
 	} else {
-		BEESLOG("Thread destroy else " << m_name);
+		BEESLOGDEBUG("Thread destroy else " << m_name);
 	}
 }

--- a/src/bees-types.cc
+++ b/src/bees-types.cc
@@ -71,7 +71,18 @@ operator<<(ostream &os, const BeesFileRange &bfr)
 	if (bfr.end() == numeric_limits<off_t>::max()) {
 		os << "- [" << to_hex(bfr.begin()) << "..eof]";
 	} else {
-		os << pretty(bfr.size()) << " [" << to_hex(bfr.begin()) << ".." << to_hex(bfr.end()) << "]";
+		os << pretty(bfr.size()) << " ";
+		if (bfr.begin() != 0) {
+			os << "[" << to_hex(bfr.begin());
+		} else {
+			os << "(";
+		}
+		os << ".." << to_hex(bfr.end());
+		if (!!bfr.m_fd && bfr.end() >= bfr.file_size()) {
+			os << ")";
+		} else {
+			os << "]";
+		}
 	}
 	if (bfr.m_fid) {
 		os << " fid = " << bfr.m_fid;
@@ -92,8 +103,6 @@ operator<<(ostream &os, const BeesRangePair &brp)
 		<< "\ndst = " << brp.second.fd() << " " << name_fd(brp.second.fd());
 }

-mutex BeesFileRange::s_mutex;
-
 bool
 BeesFileRange::operator<(const BeesFileRange &that) const
 {
@@ -145,14 +154,14 @@ off_t
 BeesFileRange::file_size() const
 {
 	if (m_file_size <= 0) {
-		// Use method fd() not member m_fd() so we hold lock
 		Stat st(fd());
 		m_file_size = st.st_size;
 		// These checks could trigger on valid input, but that would mean we have
 		// lost a race (e.g. a file was truncated while we were building a
 		// matching range pair with it).  In such cases we should probably stop
 		// whatever we were doing and backtrack to some higher level anyway.
-		THROW_CHECK1(invalid_argument, m_file_size, m_file_size > 0);
+		// Well, OK, but we call this function from exception handlers...
+		THROW_CHECK1(invalid_argument, m_file_size, m_file_size >= 0);
 		// THROW_CHECK2(invalid_argument, m_file_size, m_end, m_end <= m_file_size || m_end == numeric_limits<off_t>::max());
 	}
 	return m_file_size;
@@ -178,31 +187,21 @@ BeesFileRange::grow_begin(off_t delta)
 BeesFileRange::BeesFileRange(const BeesBlockData &bbd) :
 	m_fd(bbd.fd()),
 	m_begin(bbd.begin()),
-	m_end(bbd.end()),
-	m_file_size(-1)
+	m_end(bbd.end())
 {
 }

 BeesFileRange::BeesFileRange(Fd fd, off_t begin, off_t end) :
 	m_fd(fd),
 	m_begin(begin),
-	m_end(end),
-	m_file_size(-1)
+	m_end(end)
 {
 }

 BeesFileRange::BeesFileRange(const BeesFileId &fid, off_t begin, off_t end) :
 	m_fid(fid),
 	m_begin(begin),
-	m_end(end),
-	m_file_size(-1)
-{
-}
-
-BeesFileRange::BeesFileRange() :
-	m_begin(0),
-	m_end(0),
-	m_file_size(-1)
+	m_end(end)
 {
 }

@@ -285,22 +284,18 @@ BeesFileRange::operator BeesBlockData() const
 Fd
 BeesFileRange::fd() const
 {
-	unique_lock<mutex> lock(s_mutex);
 	return m_fd;
 }

 Fd
 BeesFileRange::fd(const shared_ptr<BeesContext> &ctx) const
 {
-	unique_lock<mutex> lock(s_mutex);
 	// If we don't have a fid we can't do much here
 	if (m_fid) {
 		if (!m_fd) {
 			// If we don't have a fd, open by fid
 			if (m_fid && ctx) {
-				lock.unlock();
 				Fd new_fd = ctx->roots()->open_root_ino(m_fid);
-				lock.lock();
 				m_fd = new_fd;
 			}
 		} else {
@@ -374,6 +369,7 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
 	BEESTOOLONG("grow constrained = " << constrained << " *this = " << *this);
 	BEESTRACE("grow constrained = " << constrained << " *this = " << *this);
 	bool rv = false;
+	Timer grow_backward_timer;

 	THROW_CHECK1(invalid_argument, first.begin(), (first.begin() & BLOCK_MASK_CLONE) == 0);
 	THROW_CHECK1(invalid_argument, second.begin(), (second.begin() & BLOCK_MASK_CLONE) == 0);
@@ -390,8 +386,8 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
 	BEESTRACE("e_second " << e_second);

 	// Preread entire extent
-	posix_fadvise(second.fd(), e_second.begin(), e_second.size(), POSIX_FADV_WILLNEED);
-	posix_fadvise(first.fd(), e_second.begin() + first.begin() - second.begin(), e_second.size(), POSIX_FADV_WILLNEED);
+	readahead(second.fd(), e_second.begin(), e_second.size());
+	readahead(first.fd(), e_second.begin() + first.begin() - second.begin(), e_second.size());

 	auto hash_table = ctx->hash_table();

@@ -410,7 +406,7 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
 				BEESCOUNT(pairbackward_hole);
 				break;
 			}
-			posix_fadvise(second.fd(), e_second.begin(), e_second.size(), POSIX_FADV_WILLNEED);
+			readahead(second.fd(), e_second.begin(), e_second.size());
 #else
 			// This tends to repeatedly process extents that were recently processed.
 			// We tend to catch duplicate blocks early since we scan them forwards.
@@ -434,7 +430,7 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
 		if (!first_addr.is_magic()) {
 			auto first_resolved = ctx->resolve_addr(first_addr);
 			if (first_resolved.is_toxic()) {
-				BEESLOG("WORKAROUND: not growing matching pair backward because src addr is toxic:\n" << *this);
+				BEESLOGWARN("WORKAROUND: not growing matching pair backward because src addr is toxic:\n" << *this);
 				BEESCOUNT(pairbackward_toxic_addr);
 				break;
 			}
@@ -490,7 +486,7 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
 			}
 		}
 		if (found_toxic) {
-			BEESLOG("WORKAROUND: found toxic hash in " << first_bbd << " while extending backward:\n" << *this);
+			BEESLOGWARN("WORKAROUND: found toxic hash in " << first_bbd << " while extending backward:\n" << *this);
 			BEESCOUNT(pairbackward_toxic_hash);
 			break;
 		}
@@ -502,9 +498,11 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
 		BEESCOUNT(pairbackward_hit);
 	}
 	BEESCOUNT(pairbackward_stop);
+	BEESCOUNTADD(pairbackward_ms, grow_backward_timer.age() * 1000);

 	// Look forward
 	BEESTRACE("grow_forward " << *this);
+	Timer grow_forward_timer;
 	while (first.size() < BLOCK_SIZE_MAX_EXTENT) {
 		if (second.end() >= e_second.end()) {
 			if (constrained) {
@@ -517,7 +515,7 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
 				BEESCOUNT(pairforward_hole);
 				break;
 			}
-			posix_fadvise(second.fd(), e_second.begin(), e_second.size(), POSIX_FADV_WILLNEED);
+			readahead(second.fd(), e_second.begin(), e_second.size());
 		}
 		BEESCOUNT(pairforward_try);

@@ -535,7 +533,7 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
 		if (!first_addr.is_magic()) {
 			auto first_resolved = ctx->resolve_addr(first_addr);
 			if (first_resolved.is_toxic()) {
-				BEESLOG("WORKAROUND: not growing matching pair forward because src is toxic:\n" << *this);
+				BEESLOGWARN("WORKAROUND: not growing matching pair forward because src is toxic:\n" << *this);
 				BEESCOUNT(pairforward_toxic);
 				break;
 			}
@@ -599,7 +597,7 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
 			}
 		}
 		if (found_toxic) {
-			BEESLOG("WORKAROUND: found toxic hash in " << first_bbd << " while extending forward:\n" << *this);
+			BEESLOGWARN("WORKAROUND: found toxic hash in " << first_bbd << " while extending forward:\n" << *this);
 			BEESCOUNT(pairforward_toxic_hash);
 			break;
 		}
@@ -618,6 +616,7 @@ BeesRangePair::grow(shared_ptr<BeesContext> ctx, bool constrained)
 	}

 	BEESCOUNT(pairforward_stop);
+	BEESCOUNTADD(pairforward_ms, grow_forward_timer.age() * 1000);
 	return rv;
 }

@@ -878,6 +877,9 @@ operator<<(ostream &os, const BeesBlockData &bbd)
 		os << ", hash = " << bbd.m_hash;
 	}
 	if (!bbd.m_data.empty()) {
+// Turn this on to debug BeesBlockData, but leave it off otherwise.
+// It's a massive data leak that is only interesting to developers.
+#if 0
 		os << ", data[" << bbd.m_data.size() << "] = '";

 		size_t max_print = 12;
@@ -894,6 +896,9 @@ operator<<(ostream &os, const BeesBlockData &bbd)
 			}
 		}
 		os << "...'";
+#else
+		os << ", data[" << bbd.m_data.size() << "]";
+#endif
 	}
 	return os << " }";
 }
@@ -936,12 +941,13 @@ BeesBlockData::data() const
 {
 	if (m_data.empty()) {
 		THROW_CHECK1(invalid_argument, size(), size() > 0);
+		BEESNOTE("Reading BeesBlockData " << *this);
 		BEESTOOLONG("Reading BeesBlockData " << *this);
 		Timer read_timer;

-		Blob rv(m_length);
+		Blob rv(size());
 		pread_or_die(m_fd, rv, m_offset);
-		THROW_CHECK2(runtime_error, rv.size(), m_length, ranged_cast<off_t>(rv.size()) == m_length);
+		THROW_CHECK2(runtime_error, rv.size(), size(), ranged_cast<off_t>(rv.size()) == size());
 		m_data = rv;
 		BEESCOUNT(block_read);
 		BEESCOUNTADD(block_bytes, rv.size());
--- a/src/bees.cc
+++ b/src/bees.cc
@@ -1,15 +1,16 @@
 #include "bees.h"

-#include "crucible/interp.h"
 #include "crucible/limits.h"
 #include "crucible/process.h"
 #include "crucible/string.h"
+#include "crucible/task.h"

 #include <cctype>
 #include <cmath>

 #include <iostream>
 #include <memory>
+#include <sstream>

 // PRIx64
 #include <inttypes.h>
@@ -20,110 +21,166 @@
 #include <linux/fs.h>
 #include <sys/ioctl.h>

+// setrlimit
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include <getopt.h>
+
 using namespace crucible;
 using namespace std;

-int
-do_cmd_help(const ArgList &argv)
+int bees_log_level = 8;
+
+void
+do_cmd_help(char *argv[])
 {
-	cerr << "Usage: " << argv[0] << " fs-root-path [fs-root-path-2...]\n"
+	// 80col 01234567890123456789012345678901234567890123456789012345678901234567890123456789
+	cerr << "Usage: " << argv[0] << " [options] fs-root-path [fs-root-path-2...]\n"
 		"Performs best-effort extent-same deduplication on btrfs.\n"
 		"\n"
 		"fs-root-path MUST be the root of a btrfs filesystem tree (id 5).\n"
 		"Other directories will be rejected.\n"
 		"\n"
-		"Multiple filesystems can share a single hash table (BEESHOME)\n"
-		"but this only works well if the content of each filesystem\n"
-		"is distinct from all the others.\n"
+		"Options:\n"
+		"    -h, --help            Show this help\n"
 		"\n"
-		"Required environment variables:\n"
-		"\tBEESHOME\tPath to hash table and configuration files\n"
+		"Load management options:\n"
+		"    -c, --thread-count    Worker thread count (default CPU count * factor)\n"
+		"    -C, --thread-factor   Worker thread factor (default " << BEES_DEFAULT_THREAD_FACTOR << ")\n"
+		"    -G, --thread-min      Minimum worker thread count (default 0)\n"
+		"    -g, --loadavg-target  Target load average for worker threads (default none)\n"
+		"\n"
+		"Filesystem tree traversal options:\n"
+		"    -m, --scan-mode       Scanning mode (0..2, default 0)\n"
+		"\n"
+		"Workarounds:\n"
+		"    -a, --workaround-btrfs-send    Workaround for btrfs send\n"
+		"\n"
+		"Logging options:\n"
+		"    -t, --timestamps      Show timestamps in log output (default)\n"
+		"    -T, --no-timestamps   Omit timestamps in log output\n"
+		"    -p, --absolute-paths  Show absolute paths (default)\n"
+		"    -P, --strip-paths     Strip $CWD from beginning of all paths in the log\n"
+		"    -v, --verbose         Set maximum log level (0..8, default 8)\n"
 		"\n"
 		"Optional environment variables:\n"
-		"\tBEESSTATUS\tFile to write status to (tmpfs recommended, e.g. /run)\n"
+		"    BEESHOME    Path to hash table and configuration files\n"
+		"                (default is .beeshome/ in the root of each filesystem).\n"
 		"\n"
+		"    BEESSTATUS  File to write status to (tmpfs recommended, e.g. /run).\n"
+		"                No status is written if this variable is unset.\n"
+		"\n"
+	// 80col 01234567890123456789012345678901234567890123456789012345678901234567890123456789
 	<< endl;
-	return 0;
 }

 // tracing ----------------------------------------

-RateLimiter bees_info_rate_limit(BEES_INFO_RATE, BEES_INFO_BURST);
-
-thread_local BeesTracer *BeesTracer::s_next_tracer = nullptr;
+thread_local BeesTracer *BeesTracer::tl_next_tracer = nullptr;

 BeesTracer::~BeesTracer()
 {
 	if (uncaught_exception()) {
-		m_func();
+		try {
+			m_func();
+		} catch (exception &e) {
+			BEESLOGERR("Nested exception: " << e.what());
+		} catch (...) {
+			BEESLOGERR("Nested exception ...");
+		}
 		if (!m_next_tracer) {
-			BEESLOG("---  END  TRACE --- exception ---");
+			BEESLOGERR("---  END  TRACE --- exception ---");
 		}
 	}
-	s_next_tracer = m_next_tracer;
+	tl_next_tracer = m_next_tracer;
 }

 BeesTracer::BeesTracer(function<void()> f) :
 	m_func(f)
 {
-	m_next_tracer = s_next_tracer;
-	s_next_tracer = this;
+	m_next_tracer = tl_next_tracer;
+	tl_next_tracer = this;
 }

 void
 BeesTracer::trace_now()
 {
-	BeesTracer *tp = s_next_tracer;
-	BEESLOG("--- BEGIN TRACE ---");
+	BeesTracer *tp = tl_next_tracer;
+	BEESLOGERR("--- BEGIN TRACE ---");
 	while (tp) {
 		tp->m_func();
 		tp = tp->m_next_tracer;
 	}
-	BEESLOG("---  END  TRACE ---");
+	BEESLOGERR("---  END  TRACE ---");
 }

-thread_local BeesNote *BeesNote::s_next = nullptr;
+thread_local BeesNote *BeesNote::tl_next = nullptr;
 mutex BeesNote::s_mutex;
 map<pid_t, BeesNote*> BeesNote::s_status;
-thread_local string BeesNote::s_name;
+thread_local string BeesNote::tl_name;

 BeesNote::~BeesNote()
 {
+	tl_next = m_prev;
 	unique_lock<mutex> lock(s_mutex);
-	s_next = m_prev;
-	if (s_next) {
-		s_status[gettid()] = s_next;
+	if (tl_next) {
+		s_status[crucible::gettid()] = tl_next;
 	} else {
-		s_status.erase(gettid());
+		s_status.erase(crucible::gettid());
 	}
 }

 BeesNote::BeesNote(function<void(ostream &os)> f) :
 	m_func(f)
 {
+	m_name = get_name();
+	m_prev = tl_next;
+	tl_next = this;
 	unique_lock<mutex> lock(s_mutex);
-	m_name = s_name;
-	m_prev = s_next;
-	s_next = this;
-	s_status[gettid()] = s_next;
+	s_status[crucible::gettid()] = tl_next;
 }

 void
 BeesNote::set_name(const string &name)
 {
-	unique_lock<mutex> lock(s_mutex);
-	s_name = name;
+	tl_name = name;
+	catch_all([&]() {
+		DIE_IF_MINUS_ERRNO(pthread_setname_np(pthread_self(), name.c_str()));
+	});
 }

 string
 BeesNote::get_name()
 {
-	unique_lock<mutex> lock(s_mutex);
-	if (s_name.empty()) {
-		return "bees";
-	} else {
-		return s_name;
+	// Use explicit name if given
+	if (!tl_name.empty()) {
+		return tl_name;
 	}
+
+	// Try a Task name.  If there is one, return it, but do not
+	// remember it.  Each output message may be a different Task.
+	// The current task is thread_local so we don't need to worry
+	// about it being destroyed under us.
+	auto current_task = Task::current_task();
+	if (current_task) {
+		return current_task.title();
+	}
+
+	// OK try the pthread name next.
+	char buf[24];
+	memset(buf, '\0', sizeof(buf));
+	int err = pthread_getname_np(pthread_self(), buf, sizeof(buf));
+	if (err) {
+		return string("pthread_getname_np: ") + strerror(err);
+	}
+	buf[sizeof(buf) - 1] = '\0';
+
+	// thread_getname_np returns process name
+	// ...by default?  ...for the main thread?
+	// ...except during exception handling?
+	// ...randomly?
+	return buf;
 }

 BeesNote::ThreadStatusMap
@@ -147,20 +204,6 @@ BeesNote::get_status()

 // static inline helpers ----------------------------------------

-static inline
-bool
-bees_addr_check(uint64_t v)
-{
-	return !(v & (1ULL << 63));
-}
-
-static inline
-bool
-bees_addr_check(int64_t v)
-{
-	return !(v & (1ULL << 63));
-}
-
 string
 pretty(double d)
 {
@@ -208,11 +251,10 @@ template <class T>
 T&
 BeesStatTmpl<T>::at(string idx)
 {
-	unique_lock<mutex> lock(m_mutex);
-    if (!m_stats_map.count(idx)) {
+	if (!m_stats_map.count(idx)) {
 		m_stats_map[idx] = 0;
 	}
-    return m_stats_map[idx];
+	return m_stats_map[idx];
 }

 template <class T>
@@ -220,7 +262,8 @@ T
 BeesStatTmpl<T>::at(string idx) const
 {
 	unique_lock<mutex> lock(m_mutex);
-    return m_stats_map.at(idx);
+	auto rv = m_stats_map.at(idx);
+	return rv;
 }

 template <class T>
@@ -228,7 +271,7 @@ void
 BeesStatTmpl<T>::add_count(string idx, size_t amount)
 {
 	unique_lock<mutex> lock(m_mutex);
-    if (!m_stats_map.count(idx)) {
+	if (!m_stats_map.count(idx)) {
 		m_stats_map[idx] = 0;
 	}
 	m_stats_map.at(idx) += amount;
@@ -260,14 +303,17 @@ BeesStats
 BeesStats::operator-(const BeesStats &that) const
 {
 	if (&that == this) return BeesStats();
+
 	unique_lock<mutex> this_lock(m_mutex);
 	BeesStats this_copy;
 	this_copy.m_stats_map = m_stats_map;
+	this_lock.unlock();
+
 	unique_lock<mutex> that_lock(that.m_mutex);
 	BeesStats that_copy;
 	that_copy.m_stats_map = that.m_stats_map;
-	this_lock.unlock();
 	that_lock.unlock();
+
 	for (auto i : that.m_stats_map) {
 		if (i.second != 0) {
 			this_copy.at(i.first) -= i.second;
@@ -316,7 +362,7 @@ BeesTooLong::check() const
 	if (age() > m_limit) {
 		ostringstream oss;
 		m_func(oss);
-		BEESLOG("PERFORMANCE: " << *this << " sec: " << oss.str());
+		BEESLOGWARN("PERFORMANCE: " << *this << " sec: " << oss.str());
 	}
 }

@@ -348,7 +394,19 @@ BeesStringFile::BeesStringFile(Fd dir_fd, string name, size_t limit) :
 	m_name(name),
 	m_limit(limit)
 {
-	BEESLOG("BeesStringFile " << name_fd(m_dir_fd) << "/" << m_name << " max size " << pretty(m_limit));
+	BEESLOGINFO("BeesStringFile " << name_fd(m_dir_fd) << "/" << m_name << " max size " << pretty(m_limit));
+}
+
+void
+BeesStringFile::name(const string &new_name)
+{
+	m_name = new_name;
+}
+
+string
+BeesStringFile::name() const
+{
+	return m_name;
 }

 string
@@ -384,8 +442,19 @@ BeesStringFile::write(string contents)
 		Fd ofd = openat_or_die(m_dir_fd, tmpname, FLAGS_CREATE_FILE, S_IRUSR | S_IWUSR);
 		BEESNOTE("writing " << tmpname << " in " << name_fd(m_dir_fd));
 		write_or_die(ofd, contents);
+#if 0
+		// This triggers too many btrfs bugs.  I wish I was kidding.
+		// Forget snapshots, balance, compression, and dedup:
+		// the system call you have to fear on btrfs is fsync().
+		// Also note that when bees renames a temporary over an
+		// existing file, it flushes the temporary, so we get
+		// the right behavior if we just do nothing here
+		// (except when the file is first created; however,
+		// in that case the result is the same as if the file
+		// did not exist, was empty, or was filled with garbage).
 		BEESNOTE("fsyncing " << tmpname << " in " << name_fd(m_dir_fd));
 		DIE_IF_NON_ZERO(fsync(ofd));
+#endif
 	}
 	BEESNOTE("renaming " << tmpname << " to " << m_name << " in FD " << name_fd(m_dir_fd));
 	BEESTRACE("renaming " << tmpname << " to " << m_name << " in FD " << name_fd(m_dir_fd));
@@ -399,6 +468,7 @@ BeesTempFile::create()
 	BEESNOTE("creating temporary file in " << m_ctx->root_path());
 	BEESTOOLONG("creating temporary file in " << m_ctx->root_path());

+	Timer create_timer;
 	DIE_IF_MINUS_ONE(m_fd = openat(m_ctx->root_fd(), ".", FLAGS_OPEN_TMPFILE, S_IRUSR | S_IWUSR));
 	BEESCOUNT(tmp_create);

@@ -406,18 +476,22 @@ BeesTempFile::create()
 	// Resolves won't work there anyway.  There are lots of tempfiles
 	// and they're short-lived, so this ends up being just a memory leak
 	// m_ctx->blacklist_add(BeesFileId(m_fd));
+
+	// Put this inode in the cache so we can resolve it later
 	m_ctx->insert_root_ino(m_fd);

 	// Set compression attribute
-	int flags = 0;
-	BEESTRACE("Getting FS_COMPR_FL on m_fd " << name_fd(m_fd) << " flags " << to_hex(flags));
-	DIE_IF_MINUS_ONE(ioctl(m_fd, FS_IOC_GETFLAGS, &flags));
+	BEESTRACE("Getting FS_COMPR_FL on m_fd " << name_fd(m_fd));
+	int flags = ioctl_iflags_get(m_fd);
 	flags |= FS_COMPR_FL;
 	BEESTRACE("Setting FS_COMPR_FL on m_fd " << name_fd(m_fd) << " flags " << to_hex(flags));
-	DIE_IF_MINUS_ONE(ioctl(m_fd, FS_IOC_SETFLAGS, &flags));
+	ioctl_iflags_set(m_fd, flags);

 	// Always leave first block empty to avoid creating a file with an inline extent
 	m_end_offset = BLOCK_SIZE_CLONE;
+
+	// Count time spent here
+	BEESCOUNTADD(tmp_create_ms, create_timer.age() * 1000);
 }

 void
@@ -431,11 +505,15 @@ BeesTempFile::resize(off_t offset)
 	THROW_CHECK2(invalid_argument, m_end_offset, offset, m_end_offset < offset);

 	// Truncate
+	Timer resize_timer;
 	DIE_IF_NON_ZERO(ftruncate(m_fd, offset));
 	BEESCOUNT(tmp_resize);

 	// Success
 	m_end_offset = offset;
+
+	// Count time spent here
+	BEESCOUNTADD(tmp_resize_ms, resize_timer.age() * 1000);
 }

 BeesTempFile::BeesTempFile(shared_ptr<BeesContext> ctx) :
@@ -447,9 +525,9 @@ BeesTempFile::BeesTempFile(shared_ptr<BeesContext> ctx) :

 void
 BeesTempFile::realign()
-{ 
+{
 	if (m_end_offset > BLOCK_SIZE_MAX_TEMP_FILE) {
-		BEESLOG("temporary file size " << to_hex(m_end_offset) << " > max " << BLOCK_SIZE_MAX_TEMP_FILE);
+		BEESLOGINFO("temporary file size " << to_hex(m_end_offset) << " > max " << BLOCK_SIZE_MAX_TEMP_FILE);
 		BEESCOUNT(tmp_trunc);
 		return create();
 	}
@@ -483,14 +561,19 @@ BeesTempFile::make_hole(off_t count)
 BeesFileRange
 BeesTempFile::make_copy(const BeesFileRange &src)
 {
-	BEESLOG("copy: " << src);
+	BEESLOGINFO("copy: " << src);
 	BEESNOTE("Copying " << src);
 	BEESTRACE("Copying " << src);

 	THROW_CHECK1(invalid_argument, src, src.size() > 0);

-	// FIXME:  don't know where these come from, but we can't handle them.
-	// Grab a trace for the log.
+	// FIEMAP used to give us garbage data, e.g. distinct adjacent
+	// extents merged into a single entry in the FIEMAP output.
+	// FIEMAP didn't stop giving us garbage data, we just stopped
+	// using FIEMAP.
+	// We shouldn't get absurdly large extents any more; however,
+	// it's still a problem if we do, so bail out and leave a trace
+	// in the log.
 	THROW_CHECK1(invalid_argument, src, src.size() < BLOCK_SIZE_MAX_TEMP_FILE);

 	realign();
@@ -499,6 +582,7 @@ BeesTempFile::make_copy(const BeesFileRange &src)
 	auto end = m_end_offset + src.size();
 	resize(end);

+	Timer copy_timer;
 	BeesFileRange rv(m_fd, begin, end);
 	BEESTRACE("copying to: " << rv);
 	BEESNOTE("copying " << src << " to " << rv);
@@ -524,10 +608,17 @@ BeesTempFile::make_copy(const BeesFileRange &src)
 		src_p += len;
 		dst_p += len;
 	}
+	BEESCOUNTADD(tmp_copy_ms, copy_timer.age() * 1000);

 	// We seem to get lockups without this!
 	if (did_block_write) {
+#if 0
+		// Is this fixed by "Btrfs: fix deadlock between dedup on same file and starting writeback"?
+		// No.
+		// Is this fixed in kernel 4.14.34?
+		// No.
 		bees_sync(m_fd);
+#endif
 	}

 	BEESCOUNT(tmp_copy);
@@ -535,33 +626,184 @@ BeesTempFile::make_copy(const BeesFileRange &src)
 }

 int
-bees_main(ArgList args)
+bees_main(int argc, char *argv[])
 {
 	set_catch_explainer([&](string s) {
-		BEESLOG("\n\n*** EXCEPTION ***\n\t" << s << "\n***\n");
+		BEESLOGERR("\n\n*** EXCEPTION ***\n\t" << s << "\n***\n");
 		BEESCOUNT(exception_caught);
 	});

+	// The thread name for the main function is also what the kernel
+	// Oops messages call the entire process.  So even though this
+	// thread's proper title is "main", let's call it "bees".
+	BeesNote::set_name("bees");
 	BEESNOTE("main");
-	BeesNote::set_name("main");

-	list<shared_ptr<BeesContext>> all_contexts;
-	shared_ptr<BeesContext> bc;
+	THROW_CHECK1(invalid_argument, argc, argc >= 0);

-	// Subscribe to fanotify events
-	bool did_subscription = false;
-	for (string arg : args) {
-		catch_all([&]() {
-			bc = make_shared<BeesContext>(bc);
-			bc->set_root_path(arg);
-			did_subscription = true;
-		});
+	// Create a context so we can apply configuration to it
+	shared_ptr<BeesContext> bc = make_shared<BeesContext>();
+
+	string cwd(readlink_or_die("/proc/self/cwd"));
+
+	// Defaults
+	bool chatter_prefix_timestamp = true;
+	double thread_factor = 0;
+	unsigned thread_count = 0;
+	unsigned thread_min = 0;
+	double load_target = 0;
+	bool workaround_btrfs_send = false;
+	BeesRoots::ScanMode root_scan_mode = BeesRoots::SCAN_MODE_ZERO;
+
+	// Configure getopt_long
+	static const struct option long_options[] = {
+		{ "thread-factor",         required_argument, NULL, 'C' },
+		{ "thread-min",            required_argument, NULL, 'G' },
+		{ "strip-paths",           no_argument,       NULL, 'P' },
+		{ "no-timestamps",         no_argument,       NULL, 'T' },
+		{ "workaround-btrfs-send", no_argument,       NULL, 'a' },
+		{ "thread-count",          required_argument, NULL, 'c' },
+		{ "loadavg-target",        required_argument, NULL, 'g' },
+		{ "help",                  no_argument,       NULL, 'h' },
+		{ "scan-mode",             required_argument, NULL, 'm' },
+		{ "absolute-paths",        no_argument,       NULL, 'p' },
+		{ "timestamps",            no_argument,       NULL, 't' },
+		{ "verbose",               required_argument, NULL, 'v' },
+		{ 0, 0, 0, 0 },
+	};
+
+	// Build getopt_long's short option list from the long_options table.
+	// While we're at it, make sure we didn't duplicate any options.
+	string getopt_list;
+	set<decltype(option::val)> option_vals;
+	for (const struct option *op = long_options; op->val; ++op) {
+		THROW_CHECK1(runtime_error, op->val, !option_vals.count(op->val));
+		option_vals.insert(op->val);
+		if ((op->val & 0xff) != op->val) {
+			continue;
+		}
+		getopt_list += op->val;
+		if (op->has_arg == required_argument) {
+			getopt_list += ':';
+		}
 	}

-	if (!did_subscription) {
-		BEESLOG("WARNING: no filesystems added");
+	// Parse options
+	int c;
+	while (true) {
+		int option_index = 0;
+
+		c = getopt_long(argc, argv, getopt_list.c_str(), long_options, &option_index);
+		if (-1 == c) {
+			break;
+		}
+
+		switch (c) {
+
+			case 'C':
+				thread_factor = stod(optarg);
+				break;
+			case 'G':
+				thread_min = stoul(optarg);
+				break;
+			case 'P':
+				crucible::set_relative_path(cwd);
+				break;
+			case 'T':
+				chatter_prefix_timestamp = false;
+				break;
+			case 'a':
+				workaround_btrfs_send = true;
+				break;
+			case 'c':
+				thread_count = stoul(optarg);
+				break;
+			case 'g':
+				load_target = stod(optarg);
+				break;
+			case 'm':
+				root_scan_mode = static_cast<BeesRoots::ScanMode>(stoul(optarg));
+				break;
+			case 'p':
+				crucible::set_relative_path("");
+				break;
+			case 't':
+				chatter_prefix_timestamp = true;
+				break;
+			case 'v':
+				{
+					int new_log_level = stoul(optarg);
+					THROW_CHECK1(out_of_range, new_log_level, new_log_level <= 8);
+					THROW_CHECK1(out_of_range, new_log_level, new_log_level >= 0);
+					bees_log_level = new_log_level;
+					BEESLOGNOTICE("log level set to " << bees_log_level);
+				}
+				break;
+
+			case 'h':
+			default:
+				do_cmd_help(argv);
+				return EXIT_FAILURE;
+		}
 	}

+	if (optind + 1 != argc) {
+		BEESLOGERR("Only one filesystem path per bees process");
+		return EXIT_FAILURE;
+	}
+
+	Chatter::enable_timestamp(chatter_prefix_timestamp);
+
+	if (!relative_path().empty()) {
+		BEESLOGINFO("using relative path " << relative_path() << "\n");
+	}
+
+	BEESLOGINFO("setting rlimit NOFILE to " << BEES_OPEN_FILE_LIMIT);
+
+	struct rlimit lim = {
+		.rlim_cur = BEES_OPEN_FILE_LIMIT,
+		.rlim_max = BEES_OPEN_FILE_LIMIT,
+	};
+	int rv = setrlimit(RLIMIT_NOFILE, &lim);
+	if (rv) {
+		BEESLOGINFO("setrlimit(RLIMIT_NOFILE, { " << lim.rlim_cur << " }): " << strerror(errno));
+	};
+
+	// Set up worker thread pool
+	THROW_CHECK1(out_of_range, thread_factor, thread_factor >= 0);
+	if (thread_count < 1) {
+		if (thread_factor == 0) {
+			thread_factor = BEES_DEFAULT_THREAD_FACTOR;
+		}
+		thread_count = max(1U, static_cast<unsigned>(ceil(thread::hardware_concurrency() * thread_factor)));
+		if (thread_count > BEES_DEFAULT_THREAD_LIMIT) {
+			BEESLOGNOTICE("Limiting computed thread count to " << BEES_DEFAULT_THREAD_LIMIT);
+			BEESLOGNOTICE("Use --thread-count to override this limit");
+			thread_count = BEES_DEFAULT_THREAD_LIMIT;
+		}
+	}
+
+	if (load_target != 0) {
+		BEESLOGNOTICE("setting load average target to " << load_target);
+		BEESLOGNOTICE("setting worker thread pool minimum size to " << thread_min);
+		TaskMaster::set_thread_min_count(thread_min);
+	}
+	TaskMaster::set_loadavg_target(load_target);
+
+	BEESLOGNOTICE("setting worker thread pool maximum size to " << thread_count);
+	TaskMaster::set_thread_count(thread_count);
+
+	// Set root path
+	string root_path = argv[optind++];
+	BEESLOGNOTICE("setting root path to '" << root_path << "'");
+	bc->set_root_path(root_path);
+
+	// Workaround for btrfs send
+	bc->roots()->set_workaround_btrfs_send(workaround_btrfs_send);
+
+	// Set root scan mode
+	bc->roots()->set_scan_mode(root_scan_mode);
+
 	BeesThread status_thread("status", [&]() {
 		bc->dump_status();
 	});
@@ -570,22 +812,22 @@ bees_main(ArgList args)
 	bc->show_progress();

 	// That is all.
-	return 0;
+	return EXIT_SUCCESS;
 }

 int
-main(int argc, const char **argv)
+main(int argc, char *argv[])
 {
+	cerr << "bees version " << BEES_VERSION << endl;
+
 	if (argc < 2) {
 		do_cmd_help(argv);
-		return 2;
+		return EXIT_FAILURE;
 	}

-	ArgList args(argv + 1);
-
 	int rv = 1;
 	catch_and_explain([&]() {
-		rv = bees_main(args);
+		rv = bees_main(argc, argv);
 	});
 	return rv;
 }
--- a/src/bees.h
+++ b/src/bees.h
@@ -1,7 +1,6 @@
 #ifndef BEES_H
 #define BEES_H

-#include "crucible/bool.h"
 #include "crucible/cache.h"
 #include "crucible/chatter.h"
 #include "crucible/error.h"
@@ -9,17 +8,18 @@
 #include "crucible/fd.h"
 #include "crucible/fs.h"
 #include "crucible/lockset.h"
+#include "crucible/progress.h"
 #include "crucible/time.h"
-#include "crucible/timequeue.h"
-#include "crucible/workqueue.h"
+#include "crucible/task.h"

-#include <array>
+#include <atomic>
 #include <functional>
 #include <list>
 #include <mutex>
 #include <string>
 #include <thread>

+#include <syslog.h>
 #include <endian.h>

 using namespace crucible;
@@ -40,13 +40,6 @@ const off_t BLOCK_SIZE_MAX_EXTENT_SAME = 4096 * 4096;
 // Maximum length of a compressed extent in bytes
 const off_t BLOCK_SIZE_MAX_COMPRESSED_EXTENT = 128 * 1024;

-// Try to combine smaller extents into larger ones
-const off_t BLOCK_SIZE_MIN_EXTENT_DEFRAG = BLOCK_SIZE_MAX_COMPRESSED_EXTENT;
-
-// Avoid splitting extents that are already too small
-const off_t BLOCK_SIZE_MIN_EXTENT_SPLIT = BLOCK_SIZE_MAX_COMPRESSED_EXTENT;
-// const off_t BLOCK_SIZE_MIN_EXTENT_SPLIT = 1024LL * 1024 * 1024 * 1024;
-
 // Maximum length of any extent in bytes
 // except we've seen 1.03G extents...
 // ...FIEMAP is slow and full of lies
@@ -55,8 +48,6 @@ const off_t BLOCK_SIZE_MAX_EXTENT = 128 * 1024 * 1024;
 // Masks, so we don't have to write "(BLOCK_SIZE_CLONE - 1)" everywhere
 const off_t BLOCK_MASK_CLONE = BLOCK_SIZE_CLONE - 1;
 const off_t BLOCK_MASK_SUMS = BLOCK_SIZE_SUMS - 1;
-const off_t BLOCK_MASK_MMAP = BLOCK_SIZE_MMAP - 1;
-const off_t BLOCK_MASK_MAX_COMPRESSED_EXTENT = BLOCK_SIZE_MAX_COMPRESSED_EXTENT * 2 - 1;

 // Maximum temporary file size
 const off_t BLOCK_SIZE_MAX_TEMP_FILE = 1024 * 1024 * 1024;
@@ -70,45 +61,65 @@ const off_t BLOCK_SIZE_HASHTAB_EXTENT = 16 * 1024 * 1024;
 // Bytes per second we want to flush (8GB every two hours)
 const double BEES_FLUSH_RATE = 8.0 * 1024 * 1024 * 1024 / 7200.0;

-// Interval between writing non-hash-table things to disk (15 minutes)
+// Interval between writing crawl state to disk
 const int BEES_WRITEBACK_INTERVAL = 900;

 // Statistics reports while scanning
 const int BEES_STATS_INTERVAL = 3600;

 // Progress shows instantaneous rates and thread status
-const int BEES_PROGRESS_INTERVAL = 3600;
+const int BEES_PROGRESS_INTERVAL = BEES_STATS_INTERVAL;

 // Status is output every freakin second.  Use a ramdisk.
 const int BEES_STATUS_INTERVAL = 1;

+// Number of file FDs to cache when not in active use
+const size_t BEES_FILE_FD_CACHE_SIZE = 4096;
+
+// Number of root FDs to cache when not in active use
+const size_t BEES_ROOT_FD_CACHE_SIZE = 1024;
+
+// Number of FDs to open (rlimit)
+const size_t BEES_OPEN_FILE_LIMIT = (BEES_FILE_FD_CACHE_SIZE + BEES_ROOT_FD_CACHE_SIZE) * 2 + 100;
+
+// Worker thread factor (multiplied by detected number of CPU cores)
+const double BEES_DEFAULT_THREAD_FACTOR = 1.0;
+
+// Don't use more than this number of threads unless explicitly configured
+const size_t BEES_DEFAULT_THREAD_LIMIT = 8;
+
 // Log warnings when an operation takes too long
-const double BEES_TOO_LONG = 2.5;
+const double BEES_TOO_LONG = 5.0;

 // Avoid any extent where LOGICAL_INO takes this long
 const double BEES_TOXIC_DURATION = 9.9;
-
-// How long we should wait for new btrfs transactions
-const double BEES_COMMIT_INTERVAL = 900;
+// EXPERIMENT:  Kernel v4.14+ may let us ignore toxicity
+// NOPE:  kernel 4.14 has the same toxicity problems as any previous kernel
+// const double BEES_TOXIC_DURATION = 99.9;

 // How long between hash table histograms
-const double BEES_HASH_TABLE_ANALYZE_INTERVAL = 3600;
+const double BEES_HASH_TABLE_ANALYZE_INTERVAL = BEES_STATS_INTERVAL;

-// Rate limiting of informational messages
-const double BEES_INFO_RATE = 10.0;
-const double BEES_INFO_BURST = 1.0;
-
-// After we have this many events queued, wait
-const size_t BEES_MAX_QUEUE_SIZE = 1024;
+// Stop growing the work queue after we have this many tasks queued
+const size_t BEES_MAX_QUEUE_SIZE = 128;

 // Read this many items at a time in SEARCHv2
-const size_t BEES_MAX_CRAWL_SIZE = 4096;
+const size_t BEES_MAX_CRAWL_SIZE = 1024;
+
+// Insert this many items before switching to a new subvol
+const size_t BEES_MAX_CRAWL_BATCH = 128;
+
+// Wait this many transids between crawls
+const size_t BEES_TRANSID_FACTOR = 10;

 // If an extent has this many refs, pretend it does not exist
 // to avoid a crippling btrfs performance bug
 // The actual limit in LOGICAL_INO seems to be 2730, but let's leave a little headroom
 const size_t BEES_MAX_EXTENT_REF_COUNT = 2560;

+// Wait this long for a balance to stop
+const double BEES_BALANCE_POLL_INTERVAL = 60.0;
+
 // Flags
 const int FLAGS_OPEN_COMMON   = O_NOFOLLOW | O_NONBLOCK | O_CLOEXEC | O_NOATIME | O_LARGEFILE | O_NOCTTY;
 const int FLAGS_OPEN_DIR      = FLAGS_OPEN_COMMON | O_RDONLY | O_DIRECTORY;
@@ -122,19 +133,18 @@ const int FLAGS_OPEN_FANOTIFY = O_RDWR | O_NOATIME | O_CLOEXEC | O_LARGEFILE;

 // macros ----------------------------------------

-#define BEESLOG(x)      do { Chatter c(BeesNote::get_name()); c << x; } while (0)
-#define BEESLOGTRACE(x) do { BEESLOG(x); BeesTracer::trace_now(); } while (0)
+#define BEESLOG(lv,x)   do { if (lv < bees_log_level) { Chatter c(lv, BeesNote::get_name()); c << x; } } while (0)
+#define BEESLOGTRACE(x) do { BEESLOG(LOG_DEBUG, x); BeesTracer::trace_now(); } while (0)

-#define BEESTRACE(x)   BeesTracer  SRSLY_WTF_C(beesTracer_,  __LINE__) ([&]()                 { BEESLOG(x);   })
+#define BEESTRACE(x)   BeesTracer  SRSLY_WTF_C(beesTracer_,  __LINE__) ([&]()                 { BEESLOG(LOG_ERR, x);   })
 #define BEESTOOLONG(x) BeesTooLong SRSLY_WTF_C(beesTooLong_, __LINE__) ([&](ostream &_btl_os) { _btl_os << x; })
 #define BEESNOTE(x)    BeesNote    SRSLY_WTF_C(beesNote_,    __LINE__) ([&](ostream &_btl_os) { _btl_os << x; })
-#define BEESINFO(x) do { \
-	if (bees_info_rate_limit.is_ready()) { \
-		bees_info_rate_limit.borrow(1); \
-		Chatter c(BeesNote::get_name()); \
-		c << x; \
-	} \
-} while (0)
+
+#define BEESLOGERR(x)    BEESLOG(LOG_ERR, x)
+#define BEESLOGWARN(x)   BEESLOG(LOG_WARNING, x)
+#define BEESLOGNOTICE(x) BEESLOG(LOG_NOTICE, x)
+#define BEESLOGINFO(x)   BEESLOG(LOG_INFO, x)
+#define BEESLOGDEBUG(x)  BEESLOG(LOG_DEBUG, x)

 #define BEESCOUNT(stat) do { \
 	BeesStats::s_global.add_count(#stat); \
@@ -154,16 +164,16 @@ class BeesStatTmpl {
 	map<string, T>	m_stats_map;
 	mutable mutex	m_mutex;

+	T& at(string idx);
 public:
 	BeesStatTmpl() = default;
 	BeesStatTmpl(const BeesStatTmpl &that);
 	BeesStatTmpl &operator=(const BeesStatTmpl &that);
 	void add_count(string idx, size_t amount = 1);
-	T& at(string idx);
 	T at(string idx) const;

 friend ostream& operator<< <>(ostream &os, const BeesStatTmpl<T> &bs);
-friend class BeesStats;
+friend struct BeesStats;
 };

 using BeesRates = BeesStatTmpl<double>;
@@ -182,8 +192,8 @@ class BeesBlockData;
 class BeesTracer {
 	function<void()> m_func;
 	BeesTracer *m_next_tracer = 0;
-	
-	thread_local static BeesTracer *s_next_tracer;
+
+	thread_local static BeesTracer *tl_next_tracer;
 public:
 	BeesTracer(function<void()> f);
 	~BeesTracer();
@@ -199,8 +209,8 @@ class BeesNote {
 	static mutex			s_mutex;
 	static map<pid_t, BeesNote*>	s_status;

-	thread_local static BeesNote	*s_next;
-	thread_local static string	s_name;
+	thread_local static BeesNote	*tl_next;
+	thread_local static string	tl_name;

 public:
 	BeesNote(function<void(ostream &)> f);
@@ -250,15 +260,14 @@ ostream& operator<<(ostream &os, const BeesFileId &bfi);

 class BeesFileRange {
 protected:
-	static mutex		s_mutex;
 	mutable Fd		m_fd;
 	mutable BeesFileId	m_fid;
-	off_t			m_begin, m_end;
-	mutable off_t		m_file_size;
+	off_t			m_begin = 0, m_end = 0;
+	mutable off_t		m_file_size = -1;

 public:

-	BeesFileRange();
+	BeesFileRange() = default;
 	BeesFileRange(Fd fd, off_t begin, off_t end);
 	BeesFileRange(const BeesFileId &fid, off_t begin, off_t end);
 	BeesFileRange(const BeesBlockData &bbd);
@@ -374,6 +383,8 @@ public:
 	BeesStringFile(Fd dir_fd, string name, size_t limit = 1024 * 1024);
 	string read();
 	void write(string contents);
+	void name(const string &new_name);
+	string name() const;
 };

 class BeesHashTable {
@@ -407,7 +418,7 @@ public:
 		uint8_t	p_byte[BLOCK_SIZE_HASHTAB_EXTENT];
 	} __attribute__((packed));

-	BeesHashTable(shared_ptr<BeesContext> ctx, string filename);
+	BeesHashTable(shared_ptr<BeesContext> ctx, string filename, off_t size = BLOCK_SIZE_HASHTAB_EXTENT);
 	~BeesHashTable();

 	vector<Cell>	find_cell(HashType hash);
@@ -415,8 +426,6 @@ public:
 	void		erase_hash_addr(HashType hash, AddrType addr);
 	bool		push_front_hash_addr(HashType hash, AddrType addr);

-	void		set_shared(bool shared);
-
 private:
 	string		m_filename;
 	Fd		m_fd;
@@ -438,33 +447,41 @@ private:
 	uint64_t		m_buckets;
 	uint64_t		m_extents;
 	uint64_t		m_cells;
-	set<uint64_t>		m_buckets_dirty;
-	set<uint64_t>		m_buckets_missing;
 	BeesThread  		m_writeback_thread;
 	BeesThread	        m_prefetch_thread;
 	RateLimiter		m_flush_rate_limit;
-	RateLimiter		m_prefetch_rate_limit;
-	mutex			m_extent_mutex;
-	mutex			m_bucket_mutex;
-	condition_variable	m_condvar;
 	set<HashType>		m_toxic_hashes;
 	BeesStringFile		m_stats_file;

-	LockSet<uint64_t> 	m_extent_lock_set;
+	// Mutex/condvar for the writeback thread
+	mutex			m_dirty_mutex;
+	condition_variable	m_dirty_condvar;

-	DefaultBool		m_shared;
+	// Per-extent structures
+	struct ExtentMetaData {
+		shared_ptr<mutex> m_mutex_ptr;		// Access serializer
+		bool	m_dirty = false;	// Needs to be written back to disk
+		bool	m_missing = true;	// Needs to be read from disk
+		ExtentMetaData();
+	};
+	vector<ExtentMetaData>	m_extent_metadata;

+	void open_file();
 	void writeback_loop();
 	void prefetch_loop();
 	void try_mmap_flags(int flags);
 	pair<Cell *, Cell *> get_cell_range(HashType hash);
 	pair<uint8_t *, uint8_t *> get_extent_range(HashType hash);
-	void fetch_missing_extent(HashType hash);
-	void set_extent_dirty(HashType hash);
+	void fetch_missing_extent_by_hash(HashType hash);
+	void fetch_missing_extent_by_index(uint64_t extent_index);
+	void set_extent_dirty_locked(uint64_t extent_index);
 	void flush_dirty_extents();
+	bool flush_dirty_extent(uint64_t extent_index);
 	bool is_toxic_hash(HashType h) const;

-	bool using_shared_map() const { return false; }
+	size_t			hash_to_extent_index(HashType ht);
+	unique_lock<mutex>	lock_extent_by_hash(HashType ht);
+	unique_lock<mutex>	lock_extent_by_index(uint64_t extent_index);

 	BeesHashTable(const BeesHashTable &) = delete;
 	BeesHashTable &operator=(const BeesHashTable &) = delete;
@@ -488,10 +505,11 @@ class BeesCrawl {

 	mutex					m_mutex;
 	set<BeesFileRange>			m_extents;
-	DefaultBool				m_deferred;
+	bool					m_deferred = false;
+	bool					m_finished = false;

 	mutex					m_state_mutex;
-	BeesCrawlState				m_state;
+	ProgressTracker<BeesCrawlState>		m_state;

 	bool fetch_extents();
 	void fetch_extents_harder();
@@ -501,40 +519,51 @@ public:
 	BeesCrawl(shared_ptr<BeesContext> ctx, BeesCrawlState initial_state);
 	BeesFileRange peek_front();
 	BeesFileRange pop_front();
-	BeesCrawlState get_state();
+	ProgressTracker<BeesCrawlState>::ProgressHolder hold_state(const BeesFileRange &bfr);
+	BeesCrawlState get_state_begin();
+	BeesCrawlState get_state_end();
 	void set_state(const BeesCrawlState &bcs);
+	void deferred(bool def_setting);
 };

-class BeesRoots {
+class BeesRoots : public enable_shared_from_this<BeesRoots> {
 	shared_ptr<BeesContext>			m_ctx;

 	BeesStringFile				m_crawl_state_file;
-	BeesCrawlState				m_crawl_current;
 	map<uint64_t, shared_ptr<BeesCrawl>>	m_root_crawl_map;
 	mutex					m_mutex;
-	condition_variable			m_condvar;
-	DefaultBool				m_crawl_dirty;
+	bool					m_crawl_dirty = false;
 	Timer					m_crawl_timer;
 	BeesThread				m_crawl_thread;
 	BeesThread				m_writeback_thread;
+	RateEstimator				m_transid_re;
+	size_t					m_transid_factor = BEES_TRANSID_FACTOR;
+	Task					m_crawl_task;
+	bool					m_workaround_btrfs_send = false;
+	LRUCache<bool, uint64_t>		m_root_ro_cache;

 	void insert_new_crawl();
 	void insert_root(const BeesCrawlState &bcs);
 	Fd open_root_nocache(uint64_t root);
 	Fd open_root_ino_nocache(uint64_t root, uint64_t ino);
+	bool is_root_ro_nocache(uint64_t root);
 	uint64_t transid_min();
 	uint64_t transid_max();
+	uint64_t transid_max_nocache();
 	void state_load();
+	ostream &state_to_stream(ostream &os);
 	void state_save();
-	void crawl_roots();
+	bool crawl_roots();
 	string crawl_state_filename() const;
-	BeesCrawlState crawl_state_get(uint64_t root);
 	void crawl_state_set_dirty();
 	void crawl_state_erase(const BeesCrawlState &bcs);
 	void crawl_thread();
 	void writeback_thread();
 	uint64_t next_root(uint64_t root = 0);
 	void current_state_set(const BeesCrawlState &bcs);
+	RateEstimator& transid_re();
+	size_t crawl_batch(shared_ptr<BeesCrawl> crawl);
+	void clear_caches();

 friend class BeesFdCache;
 friend class BeesCrawl;
@@ -544,6 +573,24 @@ public:
 	Fd open_root(uint64_t root);
 	Fd open_root_ino(uint64_t root, uint64_t ino);
 	Fd open_root_ino(const BeesFileId &bfi) { return open_root_ino(bfi.root(), bfi.ino()); }
+	bool is_root_ro(uint64_t root);
+
+	// TODO:  think of better names for these.
+	// or TODO:  do extent-tree scans instead
+	enum ScanMode {
+		SCAN_MODE_ZERO,
+		SCAN_MODE_ONE,
+		SCAN_MODE_TWO,
+		SCAN_MODE_COUNT, // must be last
+	};
+
+	void set_scan_mode(ScanMode new_mode);
+	void set_workaround_btrfs_send(bool do_avoid);
+
+private:
+	ScanMode m_scan_mode = SCAN_MODE_ZERO;
+	static string scan_mode_ntoa(ScanMode new_mode);
+
 };

 struct BeesHash {
@@ -555,13 +602,13 @@ struct BeesHash {
 	BeesHash& operator=(const Type that) { m_hash = that; return *this; }
 private:
 	Type	m_hash;
-	
+
 };

 ostream & operator<<(ostream &os, const BeesHash &bh);

 class BeesBlockData {
-	using Blob = vector<char>;
+	using Blob = vector<uint8_t>;

 	mutable Fd		m_fd;
 	off_t			m_offset;
@@ -569,7 +616,7 @@ class BeesBlockData {
 	mutable BeesAddress	m_addr;
 	mutable Blob		m_data;
 	mutable BeesHash	m_hash;
-	mutable DefaultBool	m_hash_done;
+	mutable bool		m_hash_done = false;

 public:
 	// Constructor with the immutable fields
@@ -607,42 +654,6 @@ public:
 friend ostream & operator<<(ostream &os, const BeesRangePair &brp);
 };

-class BeesWorkQueueBase {
-	string 				m_name; 
-
-protected:
-	static mutex			s_mutex;
-	static set<BeesWorkQueueBase *>	s_all_workers;
-
-public:
-	virtual ~BeesWorkQueueBase();
-	BeesWorkQueueBase(const string &name);
-
-	string name() const;
-	void name(const string &new_name);
-
-	virtual size_t active_size() const = 0;
-	virtual list<string> peek_active(size_t count) const = 0;
- 
-	static void for_each_work_queue(function<void(BeesWorkQueueBase *)> f);
-};
-
-template <class Task>
-class BeesWorkQueue : public BeesWorkQueueBase {
-	WorkQueue<Task>				m_active_queue;
-
-public:
-	BeesWorkQueue(const string &name);
-	~BeesWorkQueue();
-	void push_active(const Task &task, size_t limit);
-	void push_active(const Task &task);
-
-	size_t active_size() const override;
-	list<string> peek_active(size_t count) const override;
-
-	Task pop();
-};
-
 class BeesTempFile {
 	shared_ptr<BeesContext> m_ctx;
 	Fd			m_fd;
@@ -662,18 +673,20 @@ class BeesFdCache {
 	LRUCache<Fd, shared_ptr<BeesContext>, uint64_t>			m_root_cache;
 	LRUCache<Fd, shared_ptr<BeesContext>, uint64_t, uint64_t>	m_file_cache;
 	Timer								m_root_cache_timer;
+	Timer								m_file_cache_timer;

 public:
 	BeesFdCache();
 	Fd open_root(shared_ptr<BeesContext> ctx, uint64_t root);
 	Fd open_root_ino(shared_ptr<BeesContext> ctx, uint64_t root, uint64_t ino);
 	void insert_root_ino(shared_ptr<BeesContext> ctx, Fd fd);
+	void clear();
 };

 struct BeesResolveAddrResult {
 	BeesResolveAddrResult();
 	vector<BtrfsInodeOffsetRoot> m_biors;
-	DefaultBool m_is_toxic;
+	bool m_is_toxic = false;
 	bool is_toxic() const { return m_is_toxic; }
 };

@@ -701,9 +714,12 @@ class BeesContext : public enable_shared_from_this<BeesContext> {

 	Timer						m_total_timer;

+	LockSet<uint64_t>				m_extent_lock_set;
+
 	void set_root_fd(Fd fd);

 	BeesResolveAddrResult resolve_addr_uncached(BeesAddress addr);
+	void wait_for_balance();

 	BeesFileRange scan_one_extent(const BeesFileRange &bfr, const Extent &e);
 	void rewrite_file_range(const BeesFileRange &bfr);
@@ -714,12 +730,13 @@ public:
 	void set_root_path(string path);

 	Fd root_fd() const { return m_root_fd; }
-	Fd home_fd() const { return m_home_fd; }
+	Fd home_fd();
 	string root_path() const { return m_root_path; }
 	string root_uuid() const { return m_root_uuid; }

 	BeesFileRange scan_forward(const BeesFileRange &bfr);

+	bool is_root_ro(uint64_t root);
 	BeesRangePair dup_extent(const BeesFileRange &src);
 	bool dedup(const BeesRangePair &brp);

@@ -738,6 +755,7 @@ public:
 	shared_ptr<BeesTempFile> tmpfile();

 	const Timer &total_timer() const { return m_total_timer; }
+	LockSet<uint64_t> &extent_lock_set() { return m_extent_lock_set; }

 	// TODO: move the rest of the FD cache methods here
 	void insert_root_ino(Fd fd);
@@ -751,22 +769,22 @@ class BeesResolver {
 	unsigned				m_bior_count;

 	// We found matching data, so we can dedup
-	DefaultBool				m_found_data;
+	bool					m_found_data = false;

 	// We found matching data, so we *did* dedup
-	DefaultBool				m_found_dup;
+	bool					m_found_dup = false;

 	// We found matching hash, so the hash table is still correct
-	DefaultBool				m_found_hash;
+	bool					m_found_hash = false;

 	// We found matching physical address, so the hash table isn't totally wrong
-	DefaultBool				m_found_addr;
+	bool					m_found_addr = false;

 	// We found matching physical address, but data did not match
-	DefaultBool				m_wrong_data;
+	bool					m_wrong_data = false;

 	// The whole thing is a placebo to avoid crippling btrfs performance bugs
-	DefaultBool				m_is_toxic;
+	bool					m_is_toxic = false;

 	BeesFileRange chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &needle_bbd);
 	BeesBlockData adjust_offset(const BeesFileRange &haystack, const BeesBlockData &needle);
@@ -820,8 +838,9 @@ public:
 };

 // And now, a giant pile of extern declarations
+extern int bees_log_level;
+extern const char *BEES_VERSION;
 string pretty(double d);
-extern RateLimiter bees_info_rate_limit;
 void bees_sync(int fd);
 string format_time(time_t t);

--- a/src/fanotify-watch.cc
+++ b/src/fanotify-watch.cc
@@ -1,91 +0,0 @@
-#include <crucible/error.h>
-#include <crucible/fd.h>
-#include <crucible/ntoa.h>
-
-#include <iostream>
-#include <iomanip>
-#include <sstream>
-#include <string>
-
-#include <unistd.h>
-#include <sys/fanotify.h>
-
-using namespace crucible;
-using namespace std;
-
-static
-void
-usage(const char *name)
-{
-	cerr << "Usage: " << name << " directory" << endl;
-	cerr << "Reports fanotify events from directory" << endl;
-}
-
-struct fan_read_block {
-	struct fanotify_event_metadata fem;
-	// more here in the future.  Maybe.
-};
-
-static inline
-string
-fan_flag_ntoa(uint64_t ui)
-{
-	static const bits_ntoa_table flag_names[] = {
-		NTOA_TABLE_ENTRY_BITS(FAN_ACCESS),
-		NTOA_TABLE_ENTRY_BITS(FAN_OPEN),
-		NTOA_TABLE_ENTRY_BITS(FAN_MODIFY),
-		NTOA_TABLE_ENTRY_BITS(FAN_CLOSE),
-		NTOA_TABLE_ENTRY_BITS(FAN_CLOSE_WRITE),
-		NTOA_TABLE_ENTRY_BITS(FAN_CLOSE_NOWRITE),
-		NTOA_TABLE_ENTRY_BITS(FAN_Q_OVERFLOW),
-		NTOA_TABLE_ENTRY_BITS(FAN_ACCESS_PERM),
-		NTOA_TABLE_ENTRY_BITS(FAN_OPEN_PERM),
-		NTOA_TABLE_ENTRY_END()
-	};
-	return bits_ntoa(ui, flag_names);
-}
-
-int
-main(int argc, char **argv)
-{
-	if (argc < 1) {
-		usage(argv[0]);
-		exit(EXIT_FAILURE);
-	}
-
-	Fd fd;
-
-	DIE_IF_MINUS_ONE(fd = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY | O_LARGEFILE | O_CLOEXEC | O_NOATIME));
-
-	for (char **argvp = argv + 1; *argvp; ++argvp) {
-		cerr << "fanotify_mark(" << *argvp << ")..." << flush;
-		DIE_IF_MINUS_ONE(fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MOUNT, FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE | FAN_OPEN, FAN_NOFD, *argvp));
-		cerr << endl;
-	}
-
-	while (1) {
-		struct fan_read_block frb;
-		read_or_die(fd, frb);
-
-#if 0
-		cout << "event_len\t= " << frb.fem.event_len << endl;
-		cout << "vers\t= " << static_cast<int>(frb.fem.vers) << endl;
-		cout << "reserved\t= " << static_cast<int>(frb.fem.reserved) << endl;
-		cout << "metadata_len\t= " << frb.fem.metadata_len << endl;
-		cout << "mask\t= " << hex << frb.fem.mask << dec << "\t" << fan_flag_ntoa(frb.fem.mask) << endl;
-		cout << "fd\t= " << frb.fem.fd << endl;
-		cout << "pid\t= " << frb.fem.pid << endl;
-#endif
-
-		cout << "flags " << fan_flag_ntoa(frb.fem.mask) << " pid " << frb.fem.pid << ' ' << flush;
-
-		Fd event_fd(frb.fem.fd);
-		ostringstream oss;
-		oss << "/proc/self/fd/" << event_fd;
-		cout << "file " << readlink_or_die(oss.str()) << endl;
-
-		// cout << endl;
-	}
-	
-	return EXIT_SUCCESS;
-}
--- a/test/Makefile
+++ b/test/Makefile
@@ -1,36 +1,44 @@
 PROGRAMS = \
 	chatter \
 	crc64 \
-	execpipe \
 	fd \
-	interp \
 	limits \
 	path \
 	process \
+	progress \
+	task \

 all: test

-test: $(PROGRAMS)
-	set -x; for prog in $(PROGRAMS); do ./$$prog || exit 1; done
+test: $(PROGRAMS:%=%.txt) Makefile
+FORCE:

 include ../makeflags
+-include ../localconf

-LIBS = -lcrucible
+LIBS = -lcrucible -lpthread
 LDFLAGS = -L../lib -Wl,-rpath=$(shell realpath ../lib)

-depends.mk: *.cc
-	for x in *.cc; do $(CXX) $(CXXFLAGS) -M "$$x"; done >> depends.mk.new
-	mv -fv depends.mk.new depends.mk
-                                       
-include depends.mk
+.depends/%.dep: %.cc tests.h Makefile
+	@mkdir -p .depends
+	$(CXX) $(CXXFLAGS) -M -MF $@ -MT $(<:.cc=.o) $<

-%.o: %.cc %.h ../makeflags
-	-echo "Implicit rule %.o: %.cc" >&2
-	$(CXX) $(CXXFLAGS) -o "$@" -c "$<"
+depends.mk: $(PROGRAMS:%=.depends/%.dep)
+	cat $^ > $@.new
+	mv -f $@.new $@

-%: %.o ../makeflags
-	-echo "Implicit rule %: %.o" >&2
-	$(CXX) $(CXXFLAGS) -o "$@" "$<" $(LDFLAGS) $(LIBS)
+include depends.mk
+
+%.o: %.cc %.h ../makeflags Makefile
+	@echo "Implicit rule %.o: %.cc"
+	$(CXX) $(CXXFLAGS) -o $@ -c $<
+
+$(PROGRAMS): %: %.o ../makeflags Makefile
+	@echo "Implicit rule %: %.o"
+	$(CXX) $(CXXFLAGS) $(LDFLAGS) -o $@ $< $(LIBS)
+
+%.txt: % Makefile FORCE
+	./$< >$@ 2>&1 || (RC=$$?; cat $@; exit $$RC)

 clean:
-	-rm -fv *.o
+	rm -fv $(PROGRAMS:%=%.o) $(PROGRAMS:%=%.txt) $(PROGRAMS)
--- a/test/chatter.cc
+++ b/test/chatter.cc
@@ -32,7 +32,7 @@ void
 test_chatter_three()
 {
 	cerr << endl;
-	Chatter c("tct");
+	Chatter c(0, "tct");
 	c << "More complicated";
 	c << "\ncase with\n";
 	c << "some \\ns";
--- a/test/crc64.cc
+++ b/test/crc64.cc
@@ -5,18 +5,6 @@

 using namespace crucible;

-static
-void
-test_getcrc64_strings()
-{
-	assert(Digest::CRC::crc64("John") == 5942451273432301568);
-	assert(Digest::CRC::crc64("Paul") == 5838402100630913024);
-	assert(Digest::CRC::crc64("George") == 6714394476893704192);
-	assert(Digest::CRC::crc64("Ringo") == 6038837226071130112);
-	assert(Digest::CRC::crc64("") == 0);
-	assert(Digest::CRC::crc64("\377\277\300\200") == 15615382887346470912ULL);
-}
-
 static
 void
 test_getcrc64_byte_arrays()
@@ -32,7 +20,6 @@ test_getcrc64_byte_arrays()
 int
 main(int, char**)
 {
-	RUN_A_TEST(test_getcrc64_strings());
 	RUN_A_TEST(test_getcrc64_byte_arrays());

 	exit(EXIT_SUCCESS);
--- a/test/execpipe.cc
+++ b/test/execpipe.cc
@@ -1,64 +0,0 @@
-#include "tests.h"
-
-#include "crucible/execpipe.h"
-
-#include <ios>
-#include <cassert>
-#include <cstring>
-#include <cstdlib>
-#include <stdexcept>
-
-#include <unistd.h>
-
-using namespace crucible;
-using namespace std;
-
-#if 1 // Needs rework
-static inline
-void
-test_hello_world()
-{
-	// alarm(9);
-	Fd fd = popen([]() { return system("echo Hello, World!"); });
-	char buf[1024];
-	size_t rv = -1;
-	read_partial_or_die(fd, buf, rv);
-	assert(rv > 0);
-	string b(buf, buf + rv - 1);
-	// cerr << "hello_world says: '" << b << "'" << endl;
-	assert(b == "Hello, World!");
-}
-
-static inline
-void
-test_read_limit(size_t limit = 4096)
-{
-	alarm(9);
-	Fd fd = popen([]() { return system("yes Hello!"); });
-	try {
-		string b = read_all(fd, limit);
-	} catch (out_of_range &re) {
-		return;
-	}
-	assert(!"no exception thrown by read_all");
-}
-#endif
-
-namespace crucible {
-	extern bool assert_no_leaked_fds();
-};
-
-int
-main(int, char**)
-{
-#if 1
-	RUN_A_TEST(test_hello_world());
-	assert(assert_no_leaked_fds());
-	RUN_A_TEST(test_read_limit(4095));
-	RUN_A_TEST(test_read_limit(4096));
-	RUN_A_TEST(test_read_limit(4097));
-	assert(assert_no_leaked_fds());
-#endif
-
-	exit(EXIT_SUCCESS);
-}
--- a/test/interp.cc
+++ b/test/interp.cc
@@ -1,88 +0,0 @@
-#include "tests.h"
-
-#include "crucible/interp.h"
-
-using namespace crucible;
-using namespace std;
-
-/***********************************************************************
-
-How this should work:
-
-Interpreter reads an arg list:
-
-	argv[0] --method0args --method1arg arg1 --method1arg=arg1 -- args...
-
-argv[0] should look up a shared_ptr<Command> which creates an object of
-type shared_ptr<Process>.  This object is used to receive args by
-method calls or one at a time.
-
-<Command> and <Process> can be the same object, or not.
-
-Process p methods:
-
-	p->spawn(Interp*) -> Process
-	p->exec(ArgList) -> Process / Result
-	p->method (from ArgParser<>)
-		p->finish() -> void (destroys object without early destruction warnings...?)
-		p->~Process() -> complains loudly if finish() not called first...?
-
-Result might be a pair of Process, string.  Or just string.
-
-ArgParser should be more like GetOpt:
-
-	build a dictionary and an arg list from arguments
-	Process methods should interrogate ArgParser
-	ArgParser might have a table of boolean and string option names so it can reject invalid options
-		but if it had that, we could also pass in Process and have it call methods on it
-		...but that is a _lot_ of pointer-hiding when we could KISS
-		...but if we had that solved, argparser tables look like lists of method names
-	ArgParser<T> has a table of names and methods on object of type T
-		ArgParser hides everything behind void* and hands off to a compiled implementation to do callbacks
-
-Extreme simplification:  arguments are themselves executable
-
-	so '--method_foo arg' really means construct MethodFoo(arg) and cast to shared_ptr<ProcArg>
-	then Process->invokeSomething(ProcArg)
-	too extreme, use argparser instead
-
-***********************************************************************/
-
-void
-test_arg_parser()
-{
-	ArgParser ap;
-	ArgList al( { "abc", "--def", "ghi" } );
-	ap.parse(NULL, al);
-}
-
-struct Thing {
-	int m_i;
-	double m_d;
-	string m_s;
-
-	void set_i(int i) { cerr << "i = " << i << endl; m_i = i; }
-	void set_d(double d) { cerr << "d = " << d << endl; m_d = d; }
-	void set_s(string s) { cerr << "s = " << s << endl; m_s = s; }
-};
-
-template <typename F, typename T, typename A>
-void
-assign(T& t, F f, A a)
-{
-	cerr << __PRETTY_FUNCTION__ << " - a = " << a << endl;
-	(t.*f)(a);
-}
-
-int
-main(int, char**)
-{
-	RUN_A_TEST(test_arg_parser());
-
-	Thing p;
-	assign(p, &Thing::set_i, 5);
-
-	cerr << "p.m_i = " << p.m_i << endl;
-
-	exit(EXIT_SUCCESS);
-}
--- a/test/limits.cc
+++ b/test/limits.cc
@@ -141,7 +141,13 @@ test_cast_0x80000000_to_things()
 	SHOULD_FAIL(ranged_cast<unsigned short>(uv));
 	SHOULD_FAIL(ranged_cast<unsigned char>(uv));
 	SHOULD_PASS(ranged_cast<signed long long>(sv), sv);
-	SHOULD_PASS(ranged_cast<signed long>(sv), sv);
+	if (sizeof(long) == 4) {
+		SHOULD_FAIL(ranged_cast<signed long>(sv));
+	} else if (sizeof(long) == 8) {
+		SHOULD_PASS(ranged_cast<signed long>(sv), sv);
+	} else {
+		assert(!"unhandled case, please add code for long here");
+	}
 	SHOULD_FAIL(ranged_cast<signed short>(sv));
 	SHOULD_FAIL(ranged_cast<signed char>(sv));
 	if (sizeof(int) == 4) {
@@ -149,7 +155,7 @@ test_cast_0x80000000_to_things()
 	} else if (sizeof(int) == 8) {
 		SHOULD_PASS(ranged_cast<signed int>(sv), sv);
 	} else {
-		assert(!"unhandled case, please add code here");
+		assert(!"unhandled case, please add code for int here");
 	}
 }

@@ -174,7 +180,13 @@ test_cast_0xffffffff_to_things()
 	SHOULD_FAIL(ranged_cast<unsigned short>(uv));
 	SHOULD_FAIL(ranged_cast<unsigned char>(uv));
 	SHOULD_PASS(ranged_cast<signed long long>(sv), sv);
-	SHOULD_PASS(ranged_cast<signed long>(sv), sv);
+	if (sizeof(long) == 4) {
+		SHOULD_FAIL(ranged_cast<signed long>(sv));
+	} else if (sizeof(long) == 8) {
+		SHOULD_PASS(ranged_cast<signed long>(sv), sv);
+	} else {
+		assert(!"unhandled case, please add code for long here");
+	}
 	SHOULD_FAIL(ranged_cast<signed short>(sv));
 	SHOULD_FAIL(ranged_cast<signed char>(sv));
 	if (sizeof(int) == 4) {
@@ -182,7 +194,7 @@ test_cast_0xffffffff_to_things()
 	} else if (sizeof(int) == 8) {
 		SHOULD_PASS(ranged_cast<signed int>(sv), sv);
 	} else {
-		assert(!"unhandled case, please add code here");
+		assert(!"unhandled case, please add code for int here");
 	}
 }

--- a/test/progress.cc
+++ b/test/progress.cc
@@ -0,0 +1,40 @@
+#include "tests.h"
+
+#include "crucible/progress.h"
+
+#include <cassert>
+
+#include <unistd.h>
+
+using namespace crucible;
+using namespace std;
+
+void
+test_progress()
+{
+	ProgressTracker<uint64_t> pt(123);
+	auto hold = pt.hold(234);
+	auto hold2 = pt.hold(345);
+	assert(pt.begin() == 123);
+	assert(pt.end() == 345);
+	auto hold3 = pt.hold(456);
+	assert(pt.begin() == 123);
+	assert(pt.end() == 456);
+	hold2.reset();
+	assert(pt.begin() == 123);
+	assert(pt.end() == 456);
+	hold.reset();
+	assert(pt.begin() == 345);
+	assert(pt.end() == 456);
+	hold3.reset();
+	assert(pt.begin() == 456);
+	assert(pt.end() == 456);
+}
+
+int
+main(int, char**)
+{
+	RUN_A_TEST(test_progress());
+
+	exit(EXIT_SUCCESS);
+}
--- a/test/task.cc
+++ b/test/task.cc
@@ -0,0 +1,227 @@
+#include "tests.h"
+
+#include "crucible/task.h"
+#include "crucible/time.h"
+
+#include <cassert>
+#include <condition_variable>
+#include <mutex>
+#include <sstream>
+#include <vector>
+
+#include <unistd.h>
+
+using namespace crucible;
+using namespace std;
+
+void
+test_tasks(size_t count)
+{
+	TaskMaster::set_thread_count();
+
+	vector<bool> task_done(count, false);
+
+	mutex mtx;
+	condition_variable cv;
+
+	unique_lock<mutex> lock(mtx);
+
+	// Run several tasks in parallel
+	for (size_t c = 0; c < count; ++c) {
+		ostringstream oss;
+		oss << "task #" << c;
+		Task t(
+			oss.str(),
+			[c, &task_done, &mtx, &cv]() {
+				unique_lock<mutex> lock(mtx);
+				// cerr << "Task #" << c << endl;
+				task_done.at(c) = true;
+				cv.notify_one();
+			}
+		);
+		t.run();
+	}
+
+	// Get current status
+	ostringstream oss;
+	TaskMaster::print_queue(oss);
+	TaskMaster::print_workers(oss);
+
+	while (true) {
+		size_t tasks_done = 0;
+		for (auto i : task_done) {
+			if (i) {
+				++tasks_done;
+			}
+		}
+		if (tasks_done == count) {
+			return;
+		}
+		// cerr << "Tasks done: " << tasks_done << endl;
+
+		cv.wait(lock);
+	}
+}
+
+void
+test_finish()
+{
+	ostringstream oss;
+	TaskMaster::print_queue(oss);
+	TaskMaster::print_workers(oss);
+	TaskMaster::set_thread_count(0);
+	// cerr << "finish done" << endl;
+}
+
+void
+test_unfinish()
+{
+	TaskMaster::set_thread_count();
+}
+
+
+void
+test_barrier(size_t count)
+{
+	vector<bool> task_done(count, false);
+
+	mutex mtx;
+	condition_variable cv;
+
+	unique_lock<mutex> lock(mtx);
+
+	auto b = make_shared<Barrier>();
+
+	// Run several tasks in parallel
+	for (size_t c = 0; c < count; ++c) {
+		auto bl = b->lock();
+		ostringstream oss;
+		oss << "task #" << c;
+		Task t(
+			oss.str(),
+			[c, &task_done, &mtx, bl]() mutable {
+				// cerr << "Task #" << c << endl;
+				unique_lock<mutex> lock(mtx);
+				task_done.at(c) = true;
+				bl.release();
+			}
+		);
+		t.run();
+	}
+
+	// Get current status
+	ostringstream oss;
+	TaskMaster::print_queue(oss);
+	TaskMaster::print_workers(oss);
+
+	bool done_flag = false;
+
+	Task completed(
+		"Waiting for Barrier",
+		[&mtx, &cv, &done_flag]() {
+			unique_lock<mutex> lock(mtx);
+			// cerr << "Running cv notify" << endl;
+			done_flag = true;
+			cv.notify_all();
+		}
+	);
+	b->insert_task(completed);
+
+	b.reset();
+
+	while (true) {
+		size_t tasks_done = 0;
+		for (auto i : task_done) {
+			if (i) {
+				++tasks_done;
+			}
+		}
+		// cerr << "Tasks done: " << tasks_done << " done_flag " << done_flag << endl;
+		if (tasks_done == count && done_flag) {
+			break;
+		}
+
+		cv.wait(lock);
+	}
+	// cerr << "test_barrier return" << endl;
+}
+
+void
+test_exclusion(size_t count)
+{
+	mutex only_one;
+	Exclusion excl;
+
+	mutex mtx;
+	condition_variable cv;
+
+	unique_lock<mutex> lock(mtx);
+
+	auto b = make_shared<Barrier>();
+
+	// Run several tasks in parallel
+	for (size_t c = 0; c < count; ++c) {
+		auto bl = b->lock();
+		ostringstream oss;
+		oss << "task #" << c;
+		Task t(
+			oss.str(),
+			[c, &only_one, &excl, bl]() mutable {
+				// cerr << "Task #" << c << endl;
+				(void)c;
+				auto lock = excl.try_lock();
+				if (!lock) {
+					excl.insert_task(Task::current_task());
+					return;
+				}
+				bool locked = only_one.try_lock();
+				assert(locked);
+				nanosleep(0.0001);
+				only_one.unlock();
+				bl.release();
+			}
+		);
+		t.run();
+	}
+
+	bool done_flag = false;
+
+	Task completed(
+		"Waiting for Barrier",
+		[&mtx, &cv, &done_flag]() {
+			unique_lock<mutex> lock(mtx);
+			// cerr << "Running cv notify" << endl;
+			done_flag = true;
+			cv.notify_all();
+		}
+	);
+	b->insert_task(completed);
+
+	b.reset();
+
+	while (true) {
+		if (done_flag) {
+			break;
+		}
+
+		cv.wait(lock);
+	}
+}
+
+int
+main(int, char**)
+{
+	// in case of deadlock
+	alarm(9);
+
+	RUN_A_TEST(test_tasks(256));
+	RUN_A_TEST(test_finish());
+	RUN_A_TEST(test_unfinish());
+	RUN_A_TEST(test_barrier(256));
+	RUN_A_TEST(test_finish());
+	RUN_A_TEST(test_unfinish());
+	RUN_A_TEST(test_exclusion(256));
+	RUN_A_TEST(test_finish());
+
+	exit(EXIT_SUCCESS);
+}