Merge branch 'master' into subvol-threads

Makefile: Fail gracefully if markdown is not installed
Previously, MARKDOWN may end up empty. This commit should fix it. Signed-off-by: Kai Krakow <kai@kaishome.de>
2025-12-01 09:13:38 +01:00 · 2018-01-11 21:26:34 -05:00 · 2018-01-11 21:25:12 +01:00 · 2018-01-11 20:56:48 +01:00 · 2018-01-11 20:56:34 +01:00 · 2018-01-10 23:43:09 -05:00
48 changed files with 1294 additions and 1408 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,6 @@ html/
 latex/
 make.log
 make.log.new
+localconf
+scripts/beesd
+scripts/beesd@.service
--- a/51
+++ b/51
@@ -1,19 +1,58 @@
-default install all: lib src test README.html
+PREFIX ?= /
+LIBDIR ?= lib
+USR_PREFIX ?= $(PREFIX)/usr
+USRLIB_PREFIX ?= $(USR_PREFIX)/$(LIBDIR)
+SYSTEMD_LIB_PREFIX ?= $(PREFIX)/lib/systemd
+LIBEXEC_PREFIX ?= $(USRLIB_PREFIX)/bees

-clean:
-	git clean -dfx
+MARKDOWN := $(firstword $(shell which markdown markdown2 markdown_py 2>/dev/null || echo markdown))

-.PHONY: lib src
+# allow local configuration to override above variables
+-include localconf

-lib:
+default all: lib src scripts test README.html
+
+clean: ## Cleanup
+	git clean -dfx -e localconf
+
+.PHONY: default all clean lib src test scripts install install_bees install_scripts help bees fly # command targets, never files: always run them even if a same-named file or directory exists
+
+lib: ## Build libs
 	$(MAKE) -C lib

+src: ## Build bins
 src: lib
 	$(MAKE) -C src

+test: ## Run tests
 test: lib src
 	$(MAKE) -C test

+scripts/%: scripts/%.in # expand the @LIBEXEC_PREFIX@ and @PREFIX@ placeholders of a template; g flag so every occurrence on a line is replaced
+	sed -e's#@LIBEXEC_PREFIX@#$(LIBEXEC_PREFIX)#g' -e's#@PREFIX@#$(PREFIX)#g' "$<" >"$@"
+
+scripts: scripts/beesd scripts/beesd@.service
+
 README.html: README.md
-	markdown README.md > README.html.new
+	$(MARKDOWN) README.md > README.html.new
 	mv -f README.html.new README.html
+
+install_bees: ## Install bees + libs
+install_bees: lib src test
+	install -Dm644 lib/libcrucible.so $(DESTDIR)$(USRLIB_PREFIX)/libcrucible.so
+	install -Dm755 bin/bees	$(DESTDIR)$(LIBEXEC_PREFIX)/bees
+
+install_scripts: ## Install scripts
+install_scripts: scripts
+	install -Dm755 scripts/beesd $(DESTDIR)$(USR_PREFIX)/sbin/beesd
+	install -Dm644 scripts/beesd.conf.sample $(DESTDIR)$(PREFIX)/etc/bees/beesd.conf.sample
+	install -Dm644 scripts/beesd@.service $(DESTDIR)$(SYSTEMD_LIB_PREFIX)/system/beesd@.service
+
+install: ## Install distribution
+install: install_bees install_scripts
+
+help: ## Show help
+	@fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/\t/'
+
+bees: all
+fly: install
--- a/README.md
+++ b/README.md
@@ -107,9 +107,9 @@ fresh full-filesystem rescan, and restart `bees'.
 Things You Might Expect That Bees Doesn't Have
 ----------------------------------------------

-* There's no configuration file or getopt command line option processing
-(patches welcome!).  There are some tunables hardcoded in the source
-that could eventually become configuration options.
+* There's no configuration file (patches welcome!).  There are some tunables
+hardcoded in the source that could eventually become configuration options.
+There's also an incomplete option parser (patches welcome!).

 * There's no way to *stop* the Bees daemon.  Use SIGKILL, SIGTERM, or
 Ctrl-C for now.  Some of the destructors are unreachable and have never
@@ -134,11 +134,6 @@ performance by caching, but really fixing this requires rewriting the
 crawler to scan the btrfs extent tree directly instead of the subvol
 FS trees.

-* Bees had support for multiple worker threads in the past; however,
-this was removed because it made Bees too aggressive to coexist with
-other applications on the same machine.  It also hit the *slow backrefs*
-on N CPU cores instead of just one.
-
 * Block reads are currently more allocation- and CPU-intensive than they
 should be, especially for filesystems on SSD where the IO overhead is
 much smaller.  This is a problem for power-constrained environments
@@ -171,6 +166,7 @@ Bees has been tested in combination with the following:
 * Large (>16M) extents
 * Huge files (>1TB--although Btrfs performance on such files isn't great in general)
 * filesystems up to 25T bytes, 100M+ files
+* btrfs read-only snapshots

 Bad Btrfs Feature Interactions
 ------------------------------
@@ -179,14 +175,13 @@ Bees has not been tested with the following, and undesirable interactions may oc

 * Non-4K filesystem data block size (should work if recompiled)
 * Non-equal hash (SUM) and filesystem data block (CLONE) sizes (probably never will work)
-* btrfs read-only snapshots (never tested, probably wouldn't work well)
-* btrfs send/receive (receive is probably OK, but send requires RO snapshots.  See above)
+* btrfs send/receive (receive is probably OK, but send could be confused?)
 * btrfs qgroups (never tested, no idea what might happen)
 * btrfs seed filesystems (does anyone even use those?)
 * btrfs autodefrag mount option (never tested, could fight with Bees)
-* btrfs nodatacow mount option or inode attribute (*could* work, but might not)
+* btrfs nodatacow/nodatasum inode attribute or mount option (bees skips all nodatasum files)
 * btrfs out-of-tree kernel patches (e.g. in-band dedup or encryption)
-* btrfs-convert from ext2/3/4 (never tested)
+* btrfs-convert from ext2/3/4 (never tested, might run out of space or ignore significant portions of the filesystem due to sanity checks)
 * btrfs mixed block groups (don't know a reason why it would *not* work, but never tested)
 * open(O_DIRECT)
 * Filesystems mounted *without* the flushoncommit option
@@ -194,7 +189,7 @@ Bees has not been tested with the following, and undesirable interactions may oc
 Other Caveats
 -------------

-* btrfs balance will invalidate parts of the dedup table.  Bees will
+* btrfs balance will invalidate parts of the dedup hash table.  Bees will
  happily rebuild the table, but it will have to scan all the blocks
  again.

@@ -205,17 +200,35 @@ Other Caveats

 * Bees creates temporary files (with O_TMPFILE) and uses them to split
  and combine extents elsewhere in btrfs.  These will take up to 2GB
-  during normal operation.
+  of disk space per thread during normal operation.

 * Like all deduplicators, Bees will replace data blocks with metadata
-  references.  It is a good idea to ensure there are several GB of
-  unallocated space (see `btrfs fi df`) on the filesystem before running
-  Bees for the first time.  Use
+  references.  It is a good idea to ensure there is sufficient unallocated
+  space (see `btrfs fi usage`) on the filesystem to allow the metadata
+  to multiply in size by the number of snapshots before running Bees
+  for the first time.  Use

-        btrfs balance start -dusage=100,limit=1 /your/filesystem
+        btrfs balance start -dusage=100,limit=N /your/filesystem

-  If possible, raise the `limit` parameter to the current size of metadata
-  usage (from `btrfs fi df`) plus 1.
+  where the `limit` parameter 'N' should be calculated as follows:
+
+	* start with the current size of metadata usage (from `btrfs fi
+	  df`) in GB, plus 1
+
+	* multiply by the proportion of disk space in subvols with
+	  snapshots (i.e. if there are no snapshots, multiply by 0;
+	  if all of the data is shared between at least one origin
+	  and one snapshot subvol, multiply by 1)
+
+	* multiply by the number of snapshots (i.e. if there is only
+	  one subvol, multiply by 0; if there are 3 snapshots and one
+	  origin subvol, multiply by 3)
+
+  `limit = (GB_metadata + 1) * (disk_space_in_snapshots / total_disk_space) * number_of_snapshots`
+
+  Monitor unallocated space to ensure that the filesystem never runs out
+  of metadata space (whether Bees is running or not--this is a general
+  btrfs requirement).


 A Brief List Of Btrfs Kernel Bugs
@@ -228,28 +241,38 @@ Missing features (usually not available in older LTS kernels):
 * 3.16: `SEARCH_V2` ioctl added.  Bees could use `SEARCH` instead.
 * 4.2: `FILE_EXTENT_SAME` no longer updates mtime, can be used at EOF.

+Future features (kernel features Bees does not yet use, but may rely on
+in the future):
+
+* 4.14: `LOGICAL_INO_V2` allows userspace to create forward and backward
+  reference maps to entire physical extents with a single ioctl call,
+  and raises the limit of 2730 references per extent.  Bees has not yet
+  been rewritten to take full advantage of these features.
+
 Bug fixes (sometimes included in older LTS kernels):

+* Bugs fixed prior to 4.4.3 are not listed here.
 * 4.5: hang in the `INO_PATHS` ioctl used by Bees.
 * 4.5: use-after-free in the `FILE_EXTENT_SAME` ioctl used by Bees.
+* 4.6: lost inodes after a rename, crash, and log tree replay
+  (triggered by the fsync() while writing `beescrawl.dat`).
 * 4.7: *slow backref* bug no longer triggers a softlockup panic.  It still
-  too long to resolve a block address to a root/inode/offset triple.
+  takes too long to resolve a block address to a root/inode/offset triple.
+* 4.10: reduced CPU time cost of the LOGICAL_INO ioctl and dedup
+  backref processing in general.
+* 4.11: yet another dedup deadlock case is fixed.
+* 4.14: backref performance improvements make LOGICAL_INO even faster.

-Fixed bugs not yet integrated in mainline Linux:
+Unfixed kernel bugs (as of 4.11.9) with workarounds in Bees:

-* 7f8e406 ("btrfs: improve delayed refs iterations"): significantly
-  reduces the CPU time cost of the LOGICAL_INO ioctl (from 30-70% of
-  bees running time to under 5%).
-
-Unfixed kernel bugs (as of 4.5.7) with workarounds in Bees:
-
-* *slow backref*: If the number of references to a single shared extent
-  within a single file grows above a few thousand, the kernel consumes CPU
-  for up to 40 uninterruptible minutes while holding various locks that
-  block access to the filesystem.  Bees avoids this bug by measuring the
-  time the kernel spends performing certain operations and permanently
-  blacklisting any extent or hash where the kernel starts to get slow.
-  Inside Bees, such blocks are marked as 'toxic' hash/block addresses.
+* *slow backrefs* (aka toxic extents): If the number of references to a
+  single shared extent within a single file grows above a few thousand,
+  the kernel consumes CPU for minutes at a time while holding various
+  locks that block access to the filesystem.  Bees avoids this bug by
+  measuring the time the kernel spends performing certain operations
+  and permanently blacklisting any extent or hash where the kernel
+  starts to get slow.  Inside Bees, such blocks are marked as 'toxic'
+  hash/block addresses.  *Needs to be retested after v4.14.*

 * `LOGICAL_INO` output is arbitrarily limited to 2730 references
  even if more buffer space is provided for results.  Once this number
@@ -260,35 +283,29 @@ Unfixed kernel bugs (as of 4.5.7) with workarounds in Bees:
  This places an obvious limit on dedup efficiency for extremely common
  blocks or filesystems with many snapshots (although this limit is
  far greater than the effective limit imposed by the *slow backref* bug).
+  *Fixed in v4.14.*
+
+* `LOGICAL_INO` on compressed extents returns a list of root/inode/offset
+  tuples matching the extent bytenr of its argument.  On uncompressed
+  extents, any r/i/o tuple whose extent offset does not match the
+  argument's extent offset is discarded, i.e. only the single 4K block
+  matching the argument is returned, so a complete map of the extent
+  references requires calling `LOGICAL_INO` for every single block of
+  the extent.  This is undesirable behavior for Bees, which wants a
+  list of all extent refs referencing a data extent (i.e. Bees wants
+  the compressed-extent behavior in all cases).  *Fixed in v4.14.*
+
+* `LOGICAL_INO` is only called from one thread at any time per process.
+  This means at most one core is irretrievably stuck in this ioctl.

 * `FILE_EXTENT_SAME` is arbitrarily limited to 16MB.  This is less than
  128MB which is the maximum extent size that can be created by defrag
  or prealloc.  Bees avoids feedback loops this can generate while
  attempting to replace extents over 16MB in length.

-* `DEFRAG_RANGE` is useless.  The ioctl attempts to implement `btrfs
-  fi defrag` in the kernel, and will arbitrarily defragment more or
-  less than the range requested to match the behavior expected from the
-  userspace tool.  Bees implements its own defrag instead, copying data
-  to a temporary file and using the `FILE_EXTENT_SAME` ioctl to replace
-  precisely the specified range of offending fragmented blocks.
-
-* When writing BeesStringFile, a crash can cause the directory entry
-  `beescrawl.dat.tmp` to exist without a corresponding inode.
-  This directory entry cannot be renamed or removed; however, it does
-  not prevent the creation of a second directory entry with the same
-  name that functions normally, so it doesn't prevent Bees operation.
-
-  The orphan directory entry can be removed by deleting its subvol,
-  so place BEESHOME on a separate subvol so you can delete these orphan
-  directory entries when they occur (or use btrfs zero-log before mounting
-  the filesystem after a crash).  Alternatively, place BEESHOME on a
-  non-btrfs filesystem.
-
 * If the `fsync()` in `BeesTempFile::make_copy` is removed, the filesystem
  hangs within a few hours, requiring a reboot to recover.  On the other
-  hand, there may be net performance benefits to calling `fsync()` before
-  or after each dedup.  This needs further investigation.
+  hand, the `fsync()` only costs about 8% of overall performance.

 Not really a bug, but a gotcha nonetheless:

@@ -299,9 +316,54 @@ Not really a bug, but a gotcha nonetheless:
  children* until the FD is closed.  Bees avoids this gotcha by closing
  all of the FDs in its directory FD cache every 15 minutes.

+* If a file is deleted while Bees is caching an open FD to the file,
+  Bees continues to scan the file.  For very large files (e.g. VM
+  images), the deletion of the file can be delayed indefinitely.
+  To limit this delay, Bees closes all FDs in its file FD cache every
+  15 minutes.

+Installation
+============

-Requirements
+Bees can be installed by following one of these instructions:
+
+Arch package
+------------
+
+Bees is available in the Arch Linux AUR. Install with:
+
+`$ pacaur -S bees-git`
+
+Gentoo ebuild
+-------------
+
+Bees is available as a Gentoo ebuild. Just copy `bees-9999.ebuild` from
+`contrib/gentoo` including the `files` subdirectory to your local
+overlay category `sys-fs`.
+
+You can copy the ebuild to match a Bees version number, and it will
+build that tagged version. This is partly supported since v0.5;
+previous versions won't work.
+
+Build from source
+-----------------
+
+Build with `make`. The build produces `bin/bees` and `lib/libcrucible.so`,
+which must be copied to somewhere in `$PATH` and `$LD_LIBRARY_PATH`
+on the target system respectively.
+
+It will also generate `scripts/beesd@.service` for systemd users. This
+service makes use of a helper script `scripts/beesd` to boot the service.
+Both of the latter use the filesystem UUID to mount the root subvolume
+within a temporary runtime directory.
+
+### Ubuntu 16.04 - 17.04:
+`$ apt -y install build-essential btrfs-tools uuid-dev markdown && make`
+
+### Ubuntu 14.04:
+You can try to carry on the work done here: https://gist.github.com/dagelf/99ee07f5638b346adb8c058ab3d57492
+
+Dependencies
 ------------

 * C++11 compiler (tested with GCC 4.9 and 6.2.0)
@@ -316,34 +378,25 @@ Requirements

 * libuuid-dev

-  TODO: remove the one function used from this library.
-  It supports a feature Bees no longer implements.
+  This library is only required for a feature that was removed after v0.1.
+  The lingering support code can be removed.

 * Linux kernel 4.4.3 or later

  Don't bother trying to make Bees work with older kernels.
  It won't end well.

-* 64-bit host and target CPU
+* markdown

-  This code has never been tested on a 32-bit target CPU.
-
-  A 64-bit host CPU may be required for the self-tests.
-  Some of the ioctls don't work properly with a 64-bit
-  kernel and 32-bit userspace.
-
-Build
-----
-
-Build with `make`.
-
-The build produces `bin/bees` and `lib/libcrucible.so`, which must be
-copied to somewhere in `$PATH` and `$LD_LIBRARY_PATH` on the target
-system respectively.
+* util-linux version that provides `blkid` command for the helper
+  script `scripts/beesd` to work

 Setup
 -----

+If you don't want to use the helper script `scripts/beesd` to set up and
+configure bees, here's how to set up bees manually.
+
 Create a directory for bees state files:

        export BEESHOME=/some/path
@@ -441,6 +494,6 @@ You can also use Github:
 Copyright & License
 ===================

-Copyright 2015-2016 Zygo Blaxell <bees@furryterror.org>.
+Copyright 2015-2017 Zygo Blaxell <bees@furryterror.org>.

 GPL (version 3 or later).
--- a/contrib/gentoo/bees-9999.ebuild
+++ b/contrib/gentoo/bees-9999.ebuild
@@ -0,0 +1,44 @@
+# Copyright 1999-2018 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+
+EAPI=6
+
+inherit git-r3 eutils multilib
+
+DESCRIPTION="Best-Effort Extent-Same, a btrfs dedup agent"
+HOMEPAGE="https://github.com/Zygo/bees"
+
+if [[ ${PV} == "9999" ]] ; then
+	EGIT_REPO_URI="https://github.com/kakra/bees.git"
+	EGIT_BRANCH="integration"
+else
+	SRC_URI="https://github.com/Zygo/bees/archive/v${PV}.tar.gz -> ${P}.tar.gz"
+fi
+
+PATCHES="
+	${FILESDIR}/v0.5-gentoo_build.patch
+"
+
+LICENSE="GPL-3"
+SLOT="0"
+KEYWORDS=""
+IUSE=""
+
+COMMON_DEPEND="
+	>=sys-apps/util-linux-2.30.2
+	>=sys-devel/gcc-4.9
+	>=sys-fs/btrfs-progs-4.1
+"
+DEPEND="
+	${COMMON_DEPEND}
+	|| ( dev-python/markdown dev-python/markdown2 )
+"
+RDEPEND="${COMMON_DEPEND}"
+
+DOCS="README.md COPYING"
+HTML_DOCS="README.html"
+
+src_prepare() { # apply patches, then record the profile's libdir in localconf
+	default # default phase implementation; applies PATCHES (the bundled patch creates localconf)
+	echo "LIBDIR=$(get_libdir)" >>"${S}/localconf" || die # quoted so a path with spaces stays one word; abort if the write fails
+}
--- a/contrib/gentoo/files/v0.5-gentoo_build.patch
+++ b/contrib/gentoo/files/v0.5-gentoo_build.patch
@@ -0,0 +1,20 @@
+diff --git a/localconf b/localconf
+new file mode 100644
+index 0000000..7705cbb
+--- /dev/null
+++ b/localconf
+@@ -0,0 +1,2 @@
+PREFIX=/
+LIBEXEC_PREFIX=/usr/libexec
+diff --git a/makeflags b/makeflags
+index f5983cb..0348623 100644
+--- a/makeflags
+++ b/makeflags
+@@ -1,4 +1,3 @@
+-CCFLAGS  = -Wall -Wextra -Werror -O3 -march=native -I../include -ggdb -D_FILE_OFFSET_BITS=64
+-# CCFLAGS  = -Wall -Wextra -Werror -O0 -I../include -ggdb -fpic -D_FILE_OFFSET_BITS=64
+-CFLAGS   = $(CCFLAGS) -std=c99
+-CXXFLAGS = $(CCFLAGS) -std=c++11 -Wold-style-cast
+CCFLAGS  = -O3 -I../include -fpic -D_FILE_OFFSET_BITS=64
+CFLAGS   += $(CCFLAGS) -std=c99
+CXXFLAGS += $(CCFLAGS) -std=c++11 -Wold-style-cast
--- a/include/crucible/bool.h
+++ b/include/crucible/bool.h
@@ -1,13 +0,0 @@
-#ifndef CRUCIBLE_BOOL_H
-#define CRUCIBLE_BOOL_H
-
-namespace crucible {
-	struct DefaultBool {
-		bool m_b;
-		DefaultBool(bool init = false) : m_b(init) {}
-		operator bool() const { return m_b; }
-		bool &operator=(const bool &that) { return m_b = that; }
-	};
-}
-
-#endif // CRUCIBLE_BOOL_H
--- a/include/crucible/btrfs.h
+++ b/include/crucible/btrfs.h
@@ -130,7 +130,7 @@
 	};

 #endif
-	 
+
 #ifndef BTRFS_IOC_CLONE_RANGE

 	struct btrfs_ioctl_clone_range_args {
--- a/include/crucible/cache.h
+++ b/include/crucible/cache.h
@@ -18,7 +18,7 @@ namespace crucible {
 	public:
 		using Key = tuple<Arguments...>;
 		using Func = function<Return(Arguments...)>;
-		using Time = unsigned;
+		using Time = size_t;
 		using Value = pair<Time, Return>;
 	private:
 		Func		m_fn;
@@ -28,7 +28,7 @@ namespace crucible {
 		size_t		m_max_size;
 		mutex		m_mutex;

-		void check_overflow();
+		bool check_overflow();
 	public:
 		LRUCache(Func f = Func(), size_t max_size = 100);

@@ -52,21 +52,24 @@ namespace crucible {
 	}

 	template <class Return, class... Arguments>
-	void
+	bool
 	LRUCache<Return, Arguments...>::check_overflow()
 	{
-		if (m_map.size() <= m_max_size) return;
-		vector<pair<Key, Time>> map_contents;
-		map_contents.reserve(m_map.size());
-		for (auto i : m_map) {
-			map_contents.push_back(make_pair(i.first, i.second.first));
+		if (m_map.size() <= m_max_size) {
+			return false;
 		}
-		sort(map_contents.begin(), map_contents.end(), [](const pair<Key, Time> &a, const pair<Key, Time> &b) {
+		vector<pair<Key, Time>> key_times;
+		key_times.reserve(m_map.size());
+		for (auto i : m_map) {
+			key_times.push_back(make_pair(i.first, i.second.first));
+		}
+		sort(key_times.begin(), key_times.end(), [](const pair<Key, Time> &a, const pair<Key, Time> &b) {
 			return a.second < b.second;
 		});
-		for (size_t i = 0; i < map_contents.size() / 2; ++i) {
-			m_map.erase(map_contents[i].first);
+		for (size_t i = 0; i < key_times.size() / 2; ++i) {
+			m_map.erase(key_times[i].first);
 		}
+		return true;
 	}

 	template <class Return, class... Arguments>
@@ -121,7 +124,7 @@ namespace crucible {
 		if (found == m_map.end()) {
 			// No, release cache lock and acquire key lock
 			lock.unlock();
-			typename LockSet<Key>::Lock key_lock(m_lockset, k);
+			auto key_lock = m_lockset.make_lock(k);

 			// Did item appear in cache while we were waiting for key?
 			lock.lock();
@@ -141,9 +144,14 @@ namespace crucible {
 				// We hold a lock on this key so we are the ones to insert it
 				THROW_CHECK0(runtime_error, inserted);

-				// Release key lock and clean out overflow
+				// Release key lock, keep the cache lock
 				key_lock.unlock();
-				check_overflow();
+
+				// Check to see if we have too many items and reduce if so.
+				if (check_overflow()) {
+					// Reset iterator
+					found = m_map.find(k);
+				}
 			}
 		}

@@ -154,7 +162,9 @@ namespace crucible {
 		if (!inserted) {
 			found->second.first = m_ctr++;
 		}
-		return found->second.second;
+		// Make copy before releasing lock
+		auto rv = found->second.second;
+		return rv;
 	}

 	template<class Return, class... Arguments>
@@ -187,7 +197,7 @@ namespace crucible {
 		if (found == m_map.end()) {
 			// No, release cache lock and acquire key lock
 			lock.unlock();
-			typename LockSet<Key>::Lock key_lock(m_lockset, k);
+			auto key_lock = m_lockset.make_lock(k);

 			// Did item appear in cache while we were waiting for key?
 			lock.lock();
@@ -205,7 +215,12 @@ namespace crucible {

 				// Release key lock and clean out overflow
 				key_lock.unlock();
-				check_overflow();
+
+				// Check to see if we have too many items and reduce if so.
+				if (check_overflow()) {
+					// Reset iterator
+					found = m_map.find(k);
+				}
 			}
 		}

--- a/include/crucible/chatter.h
+++ b/include/crucible/chatter.h
@@ -45,6 +45,8 @@ namespace crucible {
 		template <class T> Chatter &operator<<(const T& arg);

 		~Chatter();
+
+		static void enable_timestamp(bool prefix_timestamp);
 	};

 	template <class Argument>
--- a/include/crucible/cleanup.h
+++ b/include/crucible/cleanup.h
@@ -0,0 +1,18 @@
+#ifndef CRUCIBLE_CLEANUP_H
+#define CRUCIBLE_CLEANUP_H
+
+#include <functional>
+
+namespace crucible {
+	using namespace std;
+
+	class Cleanup { // wraps a void() callable; both members below are defined out of line
+		function<void()> m_cleaner; // the callable held by this object
+	public:
+		Cleanup(function<void()> func); // presumably stores func in m_cleaner — TODO confirm in the implementation file
+		~Cleanup(); // presumably invokes m_cleaner at scope exit — TODO confirm in the implementation file
+	};
+
+}
+
+#endif // CRUCIBLE_CLEANUP_H
--- a/include/crucible/error.h
+++ b/include/crucible/error.h
@@ -100,12 +100,6 @@ namespace crucible {
 } while (0)

 // macros for checking a constraint
-#define CHECK_CONSTRAINT(value, expr) do { \
-	if (!(expr)) { \
-		THROW_ERROR(out_of_range, #value << " = " << value << " failed constraint check (" << #expr << ")"); \
-	} \
-} while(0)
-
 #define THROW_CHECK0(type, expr) do { \
 	if (!(expr)) { \
 		THROW_ERROR(type, "failed constraint check (" << #expr << ")"); \
--- a/include/crucible/execpipe.h
+++ b/include/crucible/execpipe.h
@@ -1,28 +0,0 @@
-#ifndef CRUCIBLE_EXECPIPE_H
-#define CRUCIBLE_EXECPIPE_H
-
-#include "crucible/fd.h"
-
-#include <functional>
-#include <limits>
-#include <string>
-
-namespace crucible {
-	using namespace std;
-
-	void redirect_stdin(const Fd &child_fd);
-	void redirect_stdin_stdout(const Fd &child_fd);
-	void redirect_stdin_stdout_stderr(const Fd &child_fd);
-	void redirect_stdout(const Fd &child_fd);
-	void redirect_stdout_stderr(const Fd &child_fd);
-
-	// Open a pipe (actually socketpair) to child process, then execute code in that process.
-	// e.g. popen([] () { system("echo Hello, World!"); });
-	// Forked process will exit when function returns.
-	Fd popen(function<int()> f, function<void(const Fd &child_fd)> import_fd_fn = redirect_stdin_stdout);
-
-	// Read all the data from fd into a string
-        string read_all(Fd fd, size_t max_bytes = numeric_limits<size_t>::max(), size_t chunk_bytes = 4096);
-};
-
-#endif // CRUCIBLE_EXECPIPE_H
--- a/include/crucible/extentwalker.h
+++ b/include/crucible/extentwalker.h
@@ -8,15 +8,15 @@ namespace crucible {

 	// FIXME:  ExtentCursor is probably a better name
 	struct Extent {
-		off_t		m_begin;
-		off_t		m_end;
-		uint64_t	m_physical;
-		uint64_t	m_flags;
+		off_t		m_begin = 0;
+		off_t		m_end = 0;
+		uint64_t	m_physical = 0;
+		uint64_t	m_flags = 0;

 		// Btrfs extent reference details
-		off_t		m_physical_len;
-		off_t		m_logical_len;
-		off_t		m_offset;
+		off_t		m_physical_len = 0;
+		off_t		m_logical_len = 0;
+		off_t		m_offset = 0;

 		// fiemap flags are uint32_t, so bits 32..63 are OK for us

@@ -38,10 +38,12 @@ namespace crucible {
 		off_t physical_len() const { return m_physical_len; }
 		off_t logical_len() const { return m_logical_len; }
 		off_t offset() const { return m_offset; }
+		bool compressed() const;
+		uint64_t bytenr() const;
 		bool operator==(const Extent &that) const;
 		bool operator!=(const Extent &that) const { return !(*this == that); }

-		Extent();
+		Extent() = default;
 		Extent(const Extent &e) = default;
 	};

--- a/include/crucible/fd.h
+++ b/include/crucible/fd.h
@@ -13,6 +13,10 @@
 #include <sys/stat.h>
 #include <fcntl.h>

+// ioctl
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+
 // socket
 #include <sys/socket.h>

@@ -53,6 +57,10 @@ namespace crucible {

        typedef ResourceHandle<int, IOHandle> Fd;

+	static string __relative_path;
+	void set_relative_path(string path);
+	string relative_path();
+
 	// Functions named "foo_or_die" throw exceptions on failure.

 	// Attempt to open the file with the given mode
@@ -141,6 +149,9 @@ namespace crucible {
 		Stat &lstat(const string &filename);
 	};

+	int ioctl_iflags_get(int fd);
+	void ioctl_iflags_set(int fd, int attr);
+
 	string st_mode_ntoa(mode_t mode);

 	// Because it's not trivial to do correctly
--- a/include/crucible/fs.h
+++ b/include/crucible/fs.h
@@ -112,8 +112,8 @@ namespace crucible {
 		BTRFS_COMPRESS_NONE  = 0,
 		BTRFS_COMPRESS_ZLIB  = 1,
 		BTRFS_COMPRESS_LZO   = 2,
-		BTRFS_COMPRESS_TYPES = 2,
-		BTRFS_COMPRESS_LAST  = 3,
+		BTRFS_COMPRESS_ZSTD  = 3,
+		BTRFS_COMPRESS_TYPES = 3
 	} btrfs_compression_type;

 	struct FiemapExtent : public fiemap_extent {
--- a/include/crucible/interp.h
+++ b/include/crucible/interp.h
@@ -1,106 +0,0 @@
-#ifndef CRUCIBLE_INTERP_H
-#define CRUCIBLE_INTERP_H
-
-#include "crucible/error.h"
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace crucible {
-	using namespace std;
-
-	struct ArgList : public vector<string> {
-		ArgList(const char **argv);
-		// using vector<string>::vector ... doesn't work:
-		// error: ‘std::vector<std::basic_string<char> >::vector’ names constructor
-		// Still doesn't work in 4.9 because it can't manage a conversion
-		ArgList(const vector<string> &&that);
-	};
-
-	struct ArgActor {
-		struct ArgActorBase {
-			virtual void predicate(void *obj, string arg);
-		};
-
-		template <class T>
-		struct ArgActorDerived {
-			function<void(T, string)> m_func;
-
-			ArgActorDerived(decltype(m_func) func) :
-				m_func(func)
-			{
-			}
-
-			void predicate(void *obj, string arg) override
-			{
-				T &op = *(reinterpret_cast<T*>(obj));
-				m_func(op, obj);
-			}
-		};
-
-		template <class T>
-		ArgActor(T, function<void(T, string)> func) :
-			m_actor(make_shared(ArgActorDerived<T>(func)))
-		{
-		}
-
-		ArgActor() = default;
-
-		void predicate(void *t, string arg)
-		{
-			if (m_actor) {
-				m_actor->predicate(t, arg);
-			} else {
-				THROW_ERROR(invalid_argument, "null m_actor for predicate arg '" << arg << "'");
-			}
-		}
-
-	private:
-		shared_ptr<ArgActorBase> m_actor;
-	};
-
-	struct ArgParser {
-		~ArgParser();
-		ArgParser();
-
-		void add_opt(string opt, ArgActor actor);
-
-		template <class T>
-		void
-		parse(T t, const ArgList &args)
-		{
-			void *vt = &t;
-			parse_backend(vt, args);
-		}
-		
-	private:
-		void parse_backend(void *t, const ArgList &args);
-		map<string, ArgActor>	m_string_opts;
-	};
-
-	struct Command {
-		virtual ~Command();
-		virtual int exec(const ArgList &args) = 0;
-	};
-
-	struct Proc : public Command {
-		int exec(const ArgList &args) override;
-		Proc(const function<int(const ArgList &)> &f);
-	private:
-		function<int(const ArgList &)> m_cmd;
-	};
-
-	struct Interp {
-		virtual ~Interp();
-		Interp(const map<string, shared_ptr<Command> > &cmdlist);
-		void add_command(const string &name, const shared_ptr<Command> &command);
-		int exec(const ArgList &args);
-	private:
-		Interp(const Interp &) = delete;
-		map<string, shared_ptr<Command> > m_commands;
-	};
-
-};
-#endif // CRUCIBLE_INTERP_H
--- a/include/crucible/lockset.h
+++ b/include/crucible/lockset.h
@@ -1,14 +1,20 @@
 #ifndef CRUCIBLE_LOCKSET_H
 #define CRUCIBLE_LOCKSET_H

+#include <crucible/cleanup.h>
 #include <crucible/error.h>
+#include <crucible/process.h>

 #include <cassert>

 #include <condition_variable>
 #include <iostream>
+#include <limits>
+#include <map>
+#include <memory>
 #include <mutex>
 #include <set>
+#include <thread>

 namespace crucible {
 	using namespace std;
@@ -17,14 +23,39 @@ namespace crucible {
 	class LockSet {

 	public:
-		using key_type = T;
-		using set_type = set<T>;
+		using set_type = map<T, pid_t>;
+		using key_type = typename set_type::key_type;

 	private:

 		set_type			m_set;
 		mutex				m_mutex;
 		condition_variable		m_condvar;
+		size_t				m_max_size = numeric_limits<size_t>::max();	// limit compared against m_set.size() by full(); default is the largest size_t
+		set<uint64_t>			m_priorities;	// priority values inserted by lock() while a caller is waiting
+		uint64_t			m_priority_counter = 0;	// next priority value taken by lock(); initialized so the first m_priority_counter++ reads a defined value
+
+		bool full();
+		bool first_in_priority(uint64_t my_priority);
+		bool locked(const key_type &name);
+
+		class Lock {
+			LockSet		&m_lockset;
+			key_type	m_name;
+			bool		m_locked;
+
+			Lock() = delete;
+			Lock(const Lock &) = delete;
+			Lock& operator=(const Lock &) = delete;
+			Lock(Lock &&that) = delete;
+			Lock& operator=(Lock &&that) = delete;
+		public:
+			~Lock();
+			Lock(LockSet &lockset, const key_type &name, bool start_locked = true);
+			void lock();
+			void unlock();
+			bool try_lock();
+		};

 	public:
 		~LockSet();
@@ -36,26 +67,21 @@ namespace crucible {
 		size_t size();
 		bool empty();
 		set_type copy();
-		void wait_unlock(double interval);

-		class Lock {
-			LockSet		&m_lockset;
-			key_type	m_name;
-			bool		m_locked;
+		void max_size(size_t max);
+
+		class LockHandle {
+			shared_ptr<Lock> m_lock;

-			Lock() = delete;
-			Lock(const Lock &) = delete;
-			Lock& operator=(const Lock &) = delete;
 		public:
-			~Lock();
-			Lock(LockSet &lockset, const key_type &m_name, bool start_locked = true);
-			Lock(Lock &&that);
-			Lock& operator=(Lock &&that);
-			void lock();
-			void unlock();
-			bool try_lock();
+			LockHandle(LockSet &lockset, const key_type &name, bool start_locked = true) :
+				m_lock(make_shared<Lock>(lockset, name, start_locked)) {}
+			void lock() { m_lock->lock(); }
+			void unlock() { m_lock->unlock(); }
+			bool try_lock() { return m_lock->try_lock(); }
 		};

+		LockHandle make_lock(const key_type &name, bool start_locked = true);
 	};

 	template <class T>
@@ -68,16 +94,65 @@ namespace crucible {
 		assert(m_set.empty());
 	}

+	template <class T>
+	bool
+	LockSet<T>::full()	// true once m_set holds at least m_max_size entries
+	{
+		return m_set.size() >= m_max_size;	// takes no lock itself; lock() and try_lock() call it with m_mutex held
+	}
+
+	template <class T>
+	bool
+	LockSet<T>::first_in_priority(uint64_t my_priority)	// tells a waiter in lock() whether its ticket may proceed
+	{
+#if 1
+		auto counter = m_max_size;	// NOTE(review): starts at the limit rather than 0 — see the check below
+		for (auto i : m_priorities) {	// a std::set iterates its tickets in ascending order
+			if (i == my_priority) {
+				return true;
+			}
+			if (++counter > m_max_size) {	// NOTE(review): true on the first miss unless m_max_size is numeric_limits<size_t>::max() and the increment wraps to 0 — confirm intent
+				return false;
+			}
+		}
+		THROW_ERROR(runtime_error, "my_priority " << my_priority << " not in m_priorities (size " << m_priorities.size() << ")");	// reached only when the loop ends without finding my_priority
+#else
+		return *m_priorities.begin() == my_priority;
+#endif
+	}
+
+	template <class T>
+	bool
+	LockSet<T>::locked(const key_type &name)	// true when name is currently a key of m_set
+	{
+		return m_set.count(name);	// count() on a map is 0 or 1; takes no lock itself, lock() and try_lock() call it with m_mutex held
+	}
+
+	template <class T>
+	void
+	LockSet<T>::max_size(size_t new_max_size)	// replace the limit that full() compares against
+	{
+		THROW_CHECK1(out_of_range, new_max_size, new_max_size > 0);	// presumably throws out_of_range when new_max_size is 0 — macro is defined elsewhere, confirm
+		m_max_size = new_max_size;	// NOTE(review): written without taking m_mutex and without notifying m_condvar — confirm callers set this before use
+	}
+
 	template <class T>
 	void
 	LockSet<T>::lock(const key_type &name)
 	{
 		unique_lock<mutex> lock(m_mutex);
-		while (m_set.count(name)) {
+		auto my_priority = m_priority_counter++;	// take the next ticket while holding m_mutex
+		Cleanup cleanup([&]() {	// declared after `lock`, so it is destroyed first: the erase runs with m_mutex still held
+			m_priorities.erase(my_priority);
+		});
+		m_priorities.insert(my_priority);	// register as a waiter so first_in_priority() can see this ticket
+		while (full() || locked(name) || !first_in_priority(my_priority)) {	// wait for a free slot, a free name and our turn
 			m_condvar.wait(lock);
 		}
-		auto rv = m_set.insert(name);
+		auto rv = m_set.insert(make_pair(name, gettid()));	// record the name together with the value returned by gettid()
 		THROW_CHECK0(runtime_error, rv.second);
+		// We removed our priority slot so other threads have to check again
+		m_condvar.notify_all();	// woken waiters cannot re-check until m_mutex is released at scope exit
 	}

 	template <class T>
@@ -85,10 +160,10 @@ namespace crucible {
 	LockSet<T>::try_lock(const key_type &name)
 	{
 		unique_lock<mutex> lock(m_mutex);
-		if (m_set.count(name)) {
+		if (full() || locked(name)) {
 			return false;
 		}
-		auto rv = m_set.insert(name);
+		auto rv = m_set.insert(make_pair(name, gettid()));
 		THROW_CHECK1(runtime_error, name, rv.second);
 		return true;
 	}
@@ -98,20 +173,13 @@ namespace crucible {
 	LockSet<T>::unlock(const key_type &name)
 	{
 		unique_lock<mutex> lock(m_mutex);
-		m_condvar.notify_all();
 		auto erase_count = m_set.erase(name);
+		m_condvar.notify_all();
+		lock.unlock();
+		this_thread::yield();
 		THROW_CHECK1(invalid_argument, erase_count, erase_count == 1);
 	}

-	template <class T>
-	void
-	LockSet<T>::wait_unlock(double interval)
-	{
-		unique_lock<mutex> lock(m_mutex);
-		if (m_set.empty()) return;
-		m_condvar.wait_for(lock, chrono::duration<double>(interval));
-	}
-
 	template <class T>
 	size_t
 	LockSet<T>::size()
@@ -133,7 +201,10 @@ namespace crucible {
 	LockSet<T>::copy()
 	{
 		unique_lock<mutex> lock(m_mutex);
-		return m_set;
+		// Make temporary copy of set while protected by mutex
+		auto rv = m_set;
+		// Return temporary copy after releasing lock
+		return rv;
 	}

 	template <class T>
@@ -183,26 +254,10 @@ namespace crucible {
 	}

 	template <class T>
-	LockSet<T>::Lock::Lock(Lock &&that) :
-		m_lockset(that.lockset),
-		m_name(that.m_name),
-		m_locked(that.m_locked)
+	typename LockSet<T>::LockHandle
+	LockSet<T>::make_lock(const key_type &name, bool start_locked)
 	{
-		that.m_locked = false;
-	}
-
-	template <class T>
-	typename LockSet<T>::Lock &
-	LockSet<T>::Lock::operator=(Lock &&that)
-	{
-		THROW_CHECK2(invalid_argument, &m_lockset, &that.m_lockset, &m_lockset == &that.m_lockset);
-		if (m_locked && that.m_name != m_name) {
-			unlock();
-		}
-		m_name = that.m_name;
-		m_locked = that.m_locked;
-		that.m_locked = false;
-		return *this;
+		return LockHandle(*this, name, start_locked);
 	}

 }
--- a/include/crucible/resource.h
+++ b/include/crucible/resource.h
@@ -8,6 +8,7 @@
 #include <memory>
 #include <mutex>
 #include <iostream>
+#include <stdexcept>

 namespace crucible {
 	using namespace std;
@@ -44,36 +45,29 @@ namespace crucible {

 	private:
 		using traits_type = ResourceTraits<Key, Resource>;
-
-		class ResourceHolder {
-			resource_ptr_type m_ptr;
-		public:
-			~ResourceHolder();
-			ResourceHolder(resource_ptr_type that);
-			ResourceHolder(const ResourceHolder &that) = default;
-			ResourceHolder(ResourceHolder &&that) = default;
-			ResourceHolder& operator=(ResourceHolder &&that) = default;
-			ResourceHolder& operator=(const ResourceHolder &that) = default;
-			resource_ptr_type get_resource_ptr() const;
-		};
-
-		using holder_ptr_type = shared_ptr<ResourceHolder>;
-		using weak_holder_ptr_type = weak_ptr<ResourceHolder>;
-		using map_type = map<key_type, weak_holder_ptr_type>;
+		using weak_ptr_type = weak_ptr<Resource>;
+		using map_type = map<key_type, weak_ptr_type>;

 		// The only instance variable
-		holder_ptr_type m_ptr;
+		resource_ptr_type m_ptr;

 		// A bunch of static variables and functions
-		static mutex &s_mutex();
-		static shared_ptr<map_type> s_map();
-		static holder_ptr_type insert(const key_type &key);
-		static holder_ptr_type insert(const resource_ptr_type &res);
-		static void erase(const key_type &key);
+		static mutex s_map_mutex;
+		static map_type s_map;
+		static resource_ptr_type insert(const key_type &key);
+		static resource_ptr_type insert(const resource_ptr_type &res);
+		static void clean_locked();
 		static ResourceTraits<Key, Resource> s_traits;

 	public:

+		// Exceptions
+		struct duplicate_resource : public invalid_argument {
+			key_type m_key;
+			key_type get_key() const;
+			duplicate_resource(const key_type &key);
+		};
+
 		// test for resource.  A separate operator because key_type could be confused with bool.
 		bool operator!() const;

@@ -89,8 +83,15 @@ namespace crucible {
 		ResourceHandle(const resource_ptr_type &res);
 		ResourceHandle& operator=(const resource_ptr_type &res);

-		// default constructor is public
+		// default construct/assign/move is public and mostly harmless
 		ResourceHandle() = default;
+		ResourceHandle(const ResourceHandle &that) = default;
+		ResourceHandle(ResourceHandle &&that) = default;
+		ResourceHandle& operator=(const ResourceHandle &that) = default;
+		ResourceHandle& operator=(ResourceHandle &&that) = default;
+
+		// Nontrivial destructor
+		~ResourceHandle();

 		// forward anything else to the Resource constructor
 		// if we can do so unambiguously
@@ -109,7 +110,7 @@ namespace crucible {

 		// get pointer to Resource object (nothrow, result may be null)
 		resource_ptr_type get_resource_ptr() const;
-		// this version throws and is probably not thread safe
+		// this version throws
 		resource_ptr_type operator->() const;

 		// dynamic casting of the resource (throws if cast fails)
@@ -145,139 +146,94 @@ namespace crucible {
 	}

 	template <class Key, class Resource>
-	ResourceHandle<Key, Resource>::ResourceHolder::ResourceHolder(resource_ptr_type that) :
-		m_ptr(that)
+	ResourceHandle<Key, Resource>::duplicate_resource::duplicate_resource(const key_type &key) :
+		invalid_argument("duplicate resource"),
+		m_key(key)
 	{
-		// Cannot insert ourselves here since our shared_ptr does not exist yet.
 	}

 	template <class Key, class Resource>
-	mutex &
-	ResourceHandle<Key, Resource>::s_mutex()
+	auto
+	ResourceHandle<Key, Resource>::duplicate_resource::get_key() const -> key_type
 	{
-		static mutex gcc_won_t_instantiate_this_either;
-		return gcc_won_t_instantiate_this_either;
-	}
-
-	template <class Key, class Resource>
-	shared_ptr<typename ResourceHandle<Key, Resource>::map_type>
-	ResourceHandle<Key, Resource>::s_map()
-	{
-		static shared_ptr<map_type> gcc_won_t_instantiate_the_damn_static_vars;
-		if (!gcc_won_t_instantiate_the_damn_static_vars) {
-			gcc_won_t_instantiate_the_damn_static_vars = make_shared<map_type>();
-		}
-		return gcc_won_t_instantiate_the_damn_static_vars;
+		return m_key;
 	}

 	template <class Key, class Resource>
 	void
-	ResourceHandle<Key, Resource>::erase(const key_type &key)
+	ResourceHandle<Key, Resource>::clean_locked()
 	{
-		unique_lock<mutex> lock(s_mutex());
-		// Resources are allowed to set their Keys to null.
-		if (s_traits.is_null_key(key)) {
-			// Clean out any dead weak_ptr objects.
-			for (auto i = s_map()->begin(); i != s_map()->end(); ) {
-				if (! (*i).second.lock()) {
-					i = s_map()->erase(i);
-				} else {
-					++i;
-				}
+		// Must be called with lock held
+		for (auto i = s_map.begin(); i != s_map.end(); ) {
+			auto this_i = i;
+			++i;
+			if (this_i->second.expired()) {
+				s_map.erase(this_i);
 			}
-			return;
-		}
-		auto erased = s_map()->erase(key);
-		if (erased != 1) {
-			cerr << __PRETTY_FUNCTION__ << ": WARNING: s_map()->erase(" << key << ") returned " << erased << " != 1" << endl;
 		}
 	}

 	template <class Key, class Resource>
-	ResourceHandle<Key, Resource>::ResourceHolder::~ResourceHolder()
-	{
-		if (!m_ptr) {
-			// Probably something harmless like a failed constructor.
-			cerr << __PRETTY_FUNCTION__ << ": WARNING: destroying null m_ptr" << endl;
-			return;
-		}
-		Key key = s_traits.get_key(*m_ptr);
-		ResourceHandle::erase(key);
-	}
-
-	template <class Key, class Resource>
-	typename ResourceHandle<Key, Resource>::holder_ptr_type
+	typename ResourceHandle<Key, Resource>::resource_ptr_type
 	ResourceHandle<Key, Resource>::insert(const key_type &key)
 	{
 		// no Resources for null keys
 		if (s_traits.is_null_key(key)) {
-			return holder_ptr_type();
+			return resource_ptr_type();
 		}
-		unique_lock<mutex> lock(s_mutex());
-		// find ResourceHolder for non-null key
-		auto found = s_map()->find(key);
-		if (found != s_map()->end()) {
-			holder_ptr_type rv = (*found).second.lock();
-			// a weak_ptr may have expired
+		unique_lock<mutex> lock(s_map_mutex);
+		auto found = s_map.find(key);
+		if (found != s_map.end()) {
+			resource_ptr_type rv = found->second.lock();
 			if (rv) {
+				// Use existing Resource
 				return rv;
+			} else {
+				// It's OK for the map to temporarily contain an expired weak_ptr to some dead Resource
+				clean_locked();
 			}
 		}
 		// not found or expired, throw any existing ref away and make a new one
 		resource_ptr_type rpt = s_traits.make_resource(key);
-		holder_ptr_type hpt = make_shared<ResourceHolder>(rpt);
 		// store weak_ptr in map
-		(*s_map())[key] = hpt;
+		s_map[key] = rpt;
 		// return shared_ptr
-		return hpt;
+		return rpt;
 	};

 	template <class Key, class Resource>
-	typename ResourceHandle<Key, Resource>::holder_ptr_type
+	typename ResourceHandle<Key, Resource>::resource_ptr_type
 	ResourceHandle<Key, Resource>::insert(const resource_ptr_type &res)
 	{
-		// no Resource, no ResourceHolder.
+		// no Resources for null keys
 		if (!res) {
-			return holder_ptr_type();
+			return resource_ptr_type();
 		}
-		// no ResourceHolders for null keys either.
 		key_type key = s_traits.get_key(*res);
 		if (s_traits.is_null_key(key)) {
-			return holder_ptr_type();
+			return resource_ptr_type();
 		}
-		unique_lock<mutex> lock(s_mutex());
-		// find ResourceHolder for non-null key
-		auto found = s_map()->find(key);
-		if (found != s_map()->end()) {
-			holder_ptr_type rv = (*found).second.lock();
-			// The map doesn't own the ResourceHolders, the ResourceHandles do.
-			// It's OK for the map to contain an expired weak_ptr to some dead ResourceHolder...
+		unique_lock<mutex> lock(s_map_mutex);
+		// find Resource for non-null key
+		auto found = s_map.find(key);
+		if (found != s_map.end()) {
+			resource_ptr_type rv = found->second.lock();
+			// It's OK for the map to temporarily contain an expired weak_ptr to some dead Resource...
 			if (rv) {
-				// found ResourceHolder, look at pointer
-				resource_ptr_type rp = rv->get_resource_ptr();
-				// We do not store references to null Resources.
-				assert(rp);
-				// Key retrieved for an existing object must match key searched or be null.
-				key_type found_key = s_traits.get_key(*rp);
-				bool found_key_is_null = s_traits.is_null_key(found_key);
-				assert(found_key_is_null || found_key == key);
-				if (!found_key_is_null) {
-					// We do not store references to duplicate resources.
-					if (rp.owner_before(res) || res.owner_before(rp)) {
-						cerr << "inserting new Resource with existing Key " << key << " not allowed at " << __PRETTY_FUNCTION__ << endl;;
-						abort();
-						// THROW_ERROR(out_of_range, "inserting new Resource with existing Key " << key << " not allowed at " << __PRETTY_FUNCTION__);
-					}
-					// rv is good, return it
-					return rv;
+				// ...but not a duplicate Resource.
+				if (rv.owner_before(res) || res.owner_before(rv)) {
+					throw duplicate_resource(key);
 				}
+				// Use the existing Resource (discard the caller's).
+				return rv;
+			} else {
+				// Clean out expired weak_ptrs
+				clean_locked();
 			}
 		}
-		// not found or expired, make a new one
-		holder_ptr_type rv = make_shared<ResourceHolder>(res);
-		s_map()->insert(make_pair(key, weak_holder_ptr_type(rv)));
-		// no need to check s_map result, we are either replacing a dead weak_ptr or adding a new one
-		return rv;
+		// not found or expired, make a new one or replace old one
+		s_map[key] = res;
+		return res;
 	};

 	template <class Key, class Resource>
@@ -309,31 +265,47 @@ namespace crucible {
 	}

 	template <class Key, class Resource>
-	typename ResourceHandle<Key, Resource>::resource_ptr_type
-	ResourceHandle<Key, Resource>::ResourceHolder::get_resource_ptr() const
+	ResourceHandle<Key, Resource>::~ResourceHandle()
 	{
-		return m_ptr;
+		// No pointer, nothing to do
+		if (!m_ptr) {
+			return;
+		}
+		// Save key so we can clean the map
+		auto key = s_traits.get_key(*m_ptr);
+		// Save a weak_ptr so we can tell if we need to clean the map
+		weak_ptr_type wp = m_ptr;
+		// Drop shared_ptr
+		m_ptr.reset();
+		// If there are still other references to the shared_ptr, we can stop now
+		if (!wp.expired()) {
+			return;
+		}
+		// Remove weak_ptr from map if it has expired
+		// (and not been replaced in the meantime)
+		unique_lock<mutex> lock_map(s_map_mutex);
+		auto found = s_map.find(key);
+		// Map entry may have been replaced, so check for expiry again
+		if (found != s_map.end() && found->second.expired()) {
+			s_map.erase(key);
+		}
 	}

 	template <class Key, class Resource>
 	typename ResourceHandle<Key, Resource>::resource_ptr_type
 	ResourceHandle<Key, Resource>::get_resource_ptr() const
 	{
-		if (!m_ptr) {
-			return resource_ptr_type();
-		}
-		return m_ptr->get_resource_ptr();
+		return m_ptr;
 	}

 	template <class Key, class Resource>
 	typename ResourceHandle<Key, Resource>::resource_ptr_type
 	ResourceHandle<Key, Resource>::operator->() const
 	{
-		resource_ptr_type rp = get_resource_ptr();
-		if (!rp) {
+		if (!m_ptr) {
 			THROW_ERROR(out_of_range, __PRETTY_FUNCTION__ << " called on null Resource");
 		}
-		return rp;
+		return m_ptr;
 	}

 	template <class Key, class Resource>
@@ -342,11 +314,10 @@ namespace crucible {
 	ResourceHandle<Key, Resource>::cast() const
 	{
 		shared_ptr<T> dp;
-		resource_ptr_type rp = get_resource_ptr();
-		if (!rp) {
+		if (!m_ptr) {
 			return dp;
 		}
-		dp = dynamic_pointer_cast<T>(rp);
+		dp = dynamic_pointer_cast<T>(m_ptr);
 		if (!dp) {
 			throw bad_cast();
 		}
@@ -357,11 +328,10 @@ namespace crucible {
 	typename ResourceHandle<Key, Resource>::key_type
 	ResourceHandle<Key, Resource>::get_key() const
 	{
-		resource_ptr_type rp = get_resource_ptr();
-		if (!rp) {
+		if (!m_ptr) {
 			return s_traits.get_null_key();
 		} else {
-			return s_traits.get_key(*rp);
+			return s_traits.get_key(*m_ptr);
 		}
 	}

@@ -378,9 +348,15 @@ namespace crucible {
 		return s_traits.is_null_key(operator key_type());
 	}

+	// Apparently GCC wants these to be used before they are defined.
 	template <class Key, class Resource>
 	ResourceTraits<Key, Resource> ResourceHandle<Key, Resource>::s_traits;

+	template <class Key, class Resource>
+	mutex ResourceHandle<Key, Resource>::s_map_mutex;
+
+	template <class Key, class Resource>
+	typename ResourceHandle<Key, Resource>::map_type ResourceHandle<Key, Resource>::s_map;

 }

--- a/include/crucible/time.h
+++ b/include/crucible/time.h
@@ -32,10 +32,11 @@ namespace crucible {
 		Timer	m_timer;
 		double	m_rate;
 		double	m_burst;
-		double  m_tokens;
+		double  m_tokens = 0.0;
 		mutex	m_mutex;

 		void update_tokens();
+		RateLimiter() = delete;
 	public:
 		RateLimiter(double rate, double burst);
 		RateLimiter(double rate);
--- a/include/crucible/timequeue.h
+++ b/include/crucible/timequeue.h
@@ -77,7 +77,7 @@ namespace crucible {
 	void
 	TimeQueue<Task>::push(const Task &task, double delay)
 	{
-		Timestamp time = chrono::high_resolution_clock::now() + 
+		Timestamp time = chrono::high_resolution_clock::now() +
 			chrono::duration_cast<chrono::high_resolution_clock::duration>(chrono::duration<double>(delay));
 		unique_lock<mutex> lock(m_mutex);
 		while (m_set.size() > m_max_queue_depth) {
@@ -91,7 +91,7 @@ namespace crucible {
 	void
 	TimeQueue<Task>::push_nowait(const Task &task, double delay)
 	{
-		Timestamp time = chrono::high_resolution_clock::now() + 
+		Timestamp time = chrono::high_resolution_clock::now() +
 			chrono::duration_cast<chrono::high_resolution_clock::duration>(chrono::duration<double>(delay));
 		unique_lock<mutex> lock(m_mutex);
 		m_set.insert(Item(time, task));
--- a/include/crucible/version.h
+++ b/include/crucible/version.h
@@ -0,0 +1,8 @@
+#ifndef CRUCIBLE_VERSION_H
+#define CRUCIBLE_VERSION_H
+
+namespace crucible {
+	extern const char *VERSION;	// version string; lib/Makefile generates the definition in .version.cc
+}
+
+#endif // CRUCIBLE_VERSION_H
--- a/include/crucible/workqueue.h
+++ b/include/crucible/workqueue.h
@@ -124,7 +124,9 @@ namespace crucible {
 		if (m_set.empty()) {
 			return key_type();
 		} else {
-			return *m_set.begin();
+			// Make copy with lock held
+			auto rv = *m_set.begin();
+			return rv;
 		}
 	}

@@ -149,7 +151,8 @@ namespace crucible {
 	WorkQueue<Task>::copy()
 	{
 		unique_lock<mutex> lock(m_mutex);
-		return m_set;
+		auto rv = m_set;
+		return rv;
 	}

 	template <class Task>
--- a/lib/.gitignore
+++ b/lib/.gitignore
@@ -0,0 +1 @@
+.version.*
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -1,35 +1,37 @@
+TAG := $(shell git describe --always --dirty || echo UNKNOWN)
+
 default: libcrucible.so

 OBJS = \
-	crc64.o \
 	chatter.o \
+	cleanup.o \
+	crc64.o \
 	error.o \
-	execpipe.o \
 	extentwalker.o \
 	fd.o \
 	fs.o \
-	interp.o \
 	ntoa.o \
 	path.o \
 	process.o \
 	string.o \
 	time.o \
 	uuid.o \
+	.version.o \

 include ../makeflags

-depends.mk: *.c *.cc
-	for x in *.c; do $(CC) $(CFLAGS) -M "$$x"; done > depends.mk.new
+depends.mk: *.cc
-	for x in *.cc; do $(CXX) $(CXXFLAGS) -M "$$x"; done >> depends.mk.new
+	for x in *.cc; do $(CXX) $(CXXFLAGS) -M "$$x"; done > depends.mk.new
 	mv -fv depends.mk.new depends.mk

+.version.cc: Makefile ../makeflags *.cc ../include/crucible/*.h
+	echo "namespace crucible { const char *VERSION = \"$(TAG)\"; }" > .version.new.cc
+	mv -f .version.new.cc .version.cc
+
 -include depends.mk

-%.o: %.c
-	$(CC) $(CFLAGS) -o $@ -c $<
-
 %.o: %.cc ../include/crucible/%.h
-	$(CXX) $(CXXFLAGS) -o $@ -c $<
+	$(CXX) $(CXXFLAGS) -fPIC -o $@ -c $<

 libcrucible.so: $(OBJS) Makefile
-	$(CXX) $(LDFLAGS) -o $@ $(OBJS) -shared -luuid
+	$(CXX) $(LDFLAGS) -fPIC -o $@ $(OBJS) -shared -Wl,-soname,$@ -luuid
--- a/lib/chatter.cc
+++ b/lib/chatter.cc
@@ -17,6 +17,7 @@ namespace crucible {

 	static shared_ptr<set<string>> chatter_names;
 	static const char *SPACETAB = " \t";
+	static bool add_prefix_timestamp = true;

 	static
 	void
@@ -48,20 +49,31 @@ namespace crucible {
 	{
 	}

+	void
+	Chatter::enable_timestamp(bool prefix_timestamp)
+	{
+		add_prefix_timestamp = prefix_timestamp;
+	}
+
 	Chatter::~Chatter()
 	{
 		ostringstream header_stream;

-		time_t ltime;
-		DIE_IF_MINUS_ONE(time(&ltime));
-		struct tm ltm;
-		DIE_IF_ZERO(localtime_r(&ltime, &ltm));
+		if (add_prefix_timestamp) {
+			time_t ltime;
+			DIE_IF_MINUS_ONE(time(&ltime));
+			struct tm ltm;
+			DIE_IF_ZERO(localtime_r(&ltime, &ltm));

-		char buf[1024];
-		DIE_IF_ZERO(strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &ltm));
+			char buf[1024];
+			DIE_IF_ZERO(strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &ltm));
+
+			header_stream << buf;
+			header_stream << " " << getpid() << "." << gettid();
+		} else {
+			header_stream << "tid " << gettid();
+		}

-		header_stream << buf;
-		header_stream << " " << getpid() << "." << gettid();
 		if (!m_name.empty()) {
 			header_stream << " " << m_name;
 		}
--- a/lib/cleanup.cc
+++ b/lib/cleanup.cc
@@ -0,0 +1,17 @@
+#include <crucible/cleanup.h>
+
+namespace crucible {
+
+	Cleanup::Cleanup(function<void()> func) :	// keep func so the destructor can call it
+		m_cleaner(func)
+	{
+	}
+
+	Cleanup::~Cleanup()	// call the stored callable when the object is destroyed
+	{
+		if (m_cleaner) {	// skipped when m_cleaner tests false (e.g. an empty function object)
+			m_cleaner();
+		}
+	}
+
+}
--- a/lib/execpipe.cc
+++ b/lib/execpipe.cc
@@ -1,100 +0,0 @@
-#include "crucible/execpipe.h"
-
-#include "crucible/chatter.h"
-#include "crucible/error.h"
-#include "crucible/process.h"
-
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <sys/wait.h>
-#include <unistd.h>
-
-namespace crucible {
-	using namespace std;
-
-	void
-	redirect_stdin(const Fd &child_fd)
-	{
-		dup2_or_die(child_fd, STDIN_FILENO);
-	}
-
-	void
-	redirect_stdin_stdout(const Fd &child_fd)
-	{
-		dup2_or_die(child_fd, STDOUT_FILENO);
-		dup2_or_die(child_fd, STDIN_FILENO);
-	}
-
-	void
-	redirect_stdin_stdout_stderr(const Fd &child_fd)
-	{
-		dup2_or_die(child_fd, STDERR_FILENO);
-		dup2_or_die(child_fd, STDOUT_FILENO);
-		dup2_or_die(child_fd, STDIN_FILENO);
-	}
-
-	void
-	redirect_stdout_stderr(const Fd &child_fd)
-	{
-		dup2_or_die(child_fd, STDERR_FILENO);
-		dup2_or_die(child_fd, STDOUT_FILENO);
-	}
-
-	void
-	redirect_stdout(const Fd &child_fd)
-	{
-		dup2_or_die(child_fd, STDOUT_FILENO);
-	}
-
-	void
-	redirect_stderr(const Fd &child_fd)
-	{
-		dup2_or_die(child_fd, STDERR_FILENO);
-	}
-
-	Fd popen(function<int()> f, function<void(const Fd &child_fd)> import_fd_fn)
-	{
-		Fd parent_fd, child_fd;
-		{
-			pair<Fd, Fd> fd_pair = socketpair_or_die();
-			parent_fd = fd_pair.first;
-			child_fd = fd_pair.second;
-		}
-
-		pid_t fv;
-		DIE_IF_MINUS_ONE(fv = fork());
-
-		if (fv) {
-			child_fd->close();
-			return parent_fd;
-		} else {
-			int rv = EXIT_FAILURE;
-			catch_all([&]() {
-				parent_fd->close();
-				import_fd_fn(child_fd);
-
-				rv = f();
-			});
-			_exit(rv);
-		}
-	}
-
-	string
-	read_all(Fd fd, size_t max_bytes, size_t chunk_bytes)
-	{
-		char buf[chunk_bytes];
-		string str;
-		size_t rv;
-		while (1) {
-			read_partial_or_die(fd, static_cast<void *>(buf), chunk_bytes, rv);
-			if (rv == 0) {
-				break;
-			}
-			if (max_bytes - str.size() < rv) {
-				THROW_ERROR(out_of_range, "Output size limit " << max_bytes << " exceeded by appending " << rv << " bytes read to " << str.size() << " already in string");
-			}
-			str.append(buf, rv);
-		}
-		return str;
-	}
-}
--- a/lib/extentwalker.cc
+++ b/lib/extentwalker.cc
@@ -79,17 +79,6 @@ namespace crucible {
 			<< "] }";
 	}

-	Extent::Extent() :
-		m_begin(0),
-		m_end(0),
-		m_physical(0),
-		m_flags(0),
-		m_physical_len(0),
-		m_logical_len(0),
-		m_offset(0)
-	{
-	}
-
 	Extent::operator bool() const
 	{
 		THROW_CHECK2(invalid_argument, m_begin, m_end, m_end >= m_begin);
@@ -109,6 +98,18 @@ namespace crucible {
 		return m_begin == that.m_begin && m_end == that.m_end && m_physical == that.m_physical && m_flags == that.m_flags;
 	}

+	bool
+	Extent::compressed() const
+	{
+		return m_flags & FIEMAP_EXTENT_ENCODED;
+	}
+
+	uint64_t
+	Extent::bytenr() const
+	{
+		return compressed() ? m_physical : m_physical - m_offset;
+	}
+
 	ExtentWalker::ExtentWalker(Fd fd) :
 		m_fd(fd),
 		m_current(m_extents.begin())
@@ -519,25 +520,26 @@ namespace crucible {

 			auto type = call_btrfs_get(btrfs_stack_file_extent_type, i.m_data);
 			off_t len = -1;
-                        switch (type) {
-                                default:
+			switch (type) {
+				default:
 					cerr << "Unhandled file extent type " << type << " in root " << m_tree_id << " ino " << m_stat.st_ino << endl;
 					break;
-                                case BTRFS_FILE_EXTENT_INLINE:
+				case BTRFS_FILE_EXTENT_INLINE:
 					len = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data));
 					e.m_flags |= FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED;
 					// Inline extents are never obscured, so don't bother filling in m_physical_len, etc.
-                                        break;
-                                case BTRFS_FILE_EXTENT_PREALLOC:
+					break;
+				case BTRFS_FILE_EXTENT_PREALLOC:
 					e.m_flags |= Extent::PREALLOC;
-                                case BTRFS_FILE_EXTENT_REG: {
+					// fallthrough
+				case BTRFS_FILE_EXTENT_REG: {
 					e.m_physical = call_btrfs_get(btrfs_stack_file_extent_disk_bytenr, i.m_data);

 					// This is the length of the full extent (decompressed)
-                                        off_t ram = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data));
+					off_t ram = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data));

 					// This is the length of the part of the extent appearing in the file (decompressed)
-                                        len = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_num_bytes, i.m_data));
+					len = ranged_cast<off_t>(call_btrfs_get(btrfs_stack_file_extent_num_bytes, i.m_data));

 					// This is the offset from start of on-disk extent to the part we see in the file (decompressed)
 					// May be negative due to the kind of bug we're stuck with forever, so no cast range check
--- a/lib/fd.cc
+++ b/lib/fd.cc
@@ -488,6 +488,20 @@ namespace crucible {
 		lstat(filename);
 	}

+	int
+	ioctl_iflags_get(int fd)
+	{
+		int attr = 0;
+		DIE_IF_MINUS_ONE(ioctl(fd, FS_IOC_GETFLAGS, &attr));
+		return attr;
+	}
+
+	void
+	ioctl_iflags_set(int fd, int attr)
+	{
+		DIE_IF_MINUS_ONE(ioctl(fd, FS_IOC_SETFLAGS, &attr));
+	}
+
 	string
 	readlink_or_die(const string &path)
 	{
@@ -513,6 +527,22 @@ namespace crucible {
 		THROW_ERROR(runtime_error, "readlink: maximum buffer size exceeded");
 	}

+	string
+	relative_path()
+	{
+		return __relative_path;
+	}
+
+	void
+	set_relative_path(string path)	// store path, normalised, as the prefix that name_fd() strips from names
+	{
+		path = path + "/";	// always end with a separator; NOTE(review): an empty argument becomes "/" — confirm callers never pass one
+		for (string::size_type i = path.find("//"); i != string::npos; i = path.find("//")) {	// collapse each "//" into "/" until none remain
+			path.erase(i, 1);
+		}
+		__relative_path = path;
+	}
+
 	// Turn a FD into a human-recognizable filename OR an error message.
 	string
 	name_fd(int fd)
@@ -520,7 +550,12 @@ namespace crucible {
 		try {
 			ostringstream oss;
 			oss << "/proc/self/fd/" << fd;
-			return readlink_or_die(oss.str());
+			string path = readlink_or_die(oss.str());
+			if (!__relative_path.empty() && 0 == path.find(__relative_path))
+			{
+				path.erase(0, __relative_path.length());
+			}
+			return path;
 		} catch (exception &e) {
 			return string(e.what());
 		}
--- a/lib/fs.cc
+++ b/lib/fs.cc
@@ -468,6 +468,7 @@ namespace crucible {
 		static const bits_ntoa_table table[] = {
 			NTOA_TABLE_ENTRY_ENUM(BTRFS_COMPRESS_ZLIB),
 			NTOA_TABLE_ENTRY_ENUM(BTRFS_COMPRESS_LZO),
+			NTOA_TABLE_ENTRY_ENUM(BTRFS_COMPRESS_ZSTD),
 			NTOA_TABLE_ENTRY_END()
 		};
 		return bits_ntoa(compress_type, table);
@@ -625,7 +626,7 @@ namespace crucible {
 	void
 	Fiemap::do_ioctl(int fd)
 	{
-		CHECK_CONSTRAINT(m_min_count, m_min_count <= m_max_count);
+		THROW_CHECK1(out_of_range, m_min_count, m_min_count <= m_max_count);

 		auto extent_count = m_min_count;
 		vector<char> ioctl_arg = vector_copy_struct<fiemap>(this);
@@ -716,10 +717,20 @@ namespace crucible {
 	bool
 	BtrfsIoctlSearchKey::do_ioctl_nothrow(int fd)
 	{
-		vector<char> ioctl_arg = vector_copy_struct<btrfs_ioctl_search_key>(this);
 		// Normally we like to be paranoid and fill empty bytes with zero,
 		// but these buffers can be huge.  80% of a 4GHz CPU huge.
-		ioctl_arg.resize(sizeof(btrfs_ioctl_search_args_v2) + m_buf_size);
+
+		// Keep the ioctl buffer from one run to the next to save on malloc costs
+		size_t target_buf_size = sizeof(btrfs_ioctl_search_args_v2) + m_buf_size;
+
+		thread_local vector<char> ioctl_arg;
+		if (ioctl_arg.size() < target_buf_size) {	// args header plus m_buf_size must both fit, not just m_buf_size
+			ioctl_arg = vector_copy_struct<btrfs_ioctl_search_key>(this);
+			ioctl_arg.resize(target_buf_size);
+		} else {
+			memcpy(ioctl_arg.data(), static_cast<btrfs_ioctl_search_key*>(this), sizeof(btrfs_ioctl_search_key));	// buffer is big enough: refresh only the key at its start
+		}
+
 		btrfs_ioctl_search_args_v2 *ioctl_ptr = reinterpret_cast<btrfs_ioctl_search_args_v2 *>(ioctl_arg.data());

 		ioctl_ptr->buf_size = m_buf_size;
@@ -913,7 +924,7 @@ namespace crucible {
 	ostream &
 	operator<<(ostream &os, const BtrfsIoctlSearchHeader &hdr)
 	{
-		os << "BtrfsIoctlSearchHeader { " 
+		os << "BtrfsIoctlSearchHeader { "
 			<< static_cast<const btrfs_ioctl_search_header &>(hdr)
 			<< ", data = ";
 		hexdump(os, hdr.m_data);
@@ -923,7 +934,7 @@ namespace crucible {
 	ostream &
 	operator<<(ostream &os, const BtrfsIoctlSearchKey &key)
 	{
-		os << "BtrfsIoctlSearchKey { " 
+		os << "BtrfsIoctlSearchKey { "
 			<< static_cast<const btrfs_ioctl_search_key &>(key)
 			<< ", buf_size = " << key.m_buf_size
 			<< ", buf[" << key.m_result.size() << "] = {";
--- a/lib/interp.cc
+++ b/lib/interp.cc
@@ -1,96 +0,0 @@
-#include "crucible/interp.h"
-
-#include "crucible/chatter.h"
-
-namespace crucible {
-	using namespace std;
-
-	int
-	Proc::exec(const ArgList &args)
-	{
-		return m_cmd(args);
-	}
-
-	Proc::Proc(const function<int(const ArgList &)> &f) :
-		m_cmd(f)
-	{
-	}
-
-	Command::~Command()
-	{
-	}
-
-	ArgList::ArgList(const char **argv)
-	{
-		while (argv && *argv) {
-			push_back(*argv++);
-		}
-	}
-
-	ArgList::ArgList(const vector<string> &&that) :
-		vector<string>(that)
-	{
-	}
-
-	Interp::~Interp()
-	{
-	}
-
-	Interp::Interp(const map<string, shared_ptr<Command> > &cmdlist) :
-		m_commands(cmdlist)
-	{
-	}
-
-	void
-	Interp::add_command(const string &name, const shared_ptr<Command> &command)
-	{
-		m_commands[name] = command;
-	}
-
-	int
-	Interp::exec(const ArgList &args)
-	{
-		auto next_arg = args.begin();
-		++next_arg;
-		return m_commands.at(args[0])->exec(vector<string>(next_arg, args.end()));
-	}
-
-	ArgParser::~ArgParser()
-	{
-	}
-
-	ArgParser::ArgParser()
-	{
-	}
-
-	void
-	ArgParser::add_opt(string opt, ArgActor actor)
-	{
-		m_string_opts[opt] = actor;
-	}
-
-	void
-	ArgParser::parse_backend(void *t, const ArgList &args)
-	{
-		bool quote_args = false;
-		for (string arg : args) {
-			if (quote_args) {
-				cerr << "arg: '" << arg << "'" << endl;
-				continue;
-			}
-			if (arg == "--") {
-				quote_args = true;
-				continue;
-			}
-			if (arg.compare(0, 2, "--") == 0) {
-				auto found = m_string_opts.find(arg.substr(2, string::npos));
-				if (found != m_string_opts.end()) {
-					found->second.predicate(t, "foo");
-				}
-				(void)t;
-			}
-		}
-	}
-
-
-};
--- a/4
+++ b/4
@@ -1,4 +1,4 @@
-CCFLAGS  = -Wall -Wextra -Werror -O3 -march=native -I../include -ggdb -fpic -D_FILE_OFFSET_BITS=64
-# CCFLAGS  = -Wall -Wextra -Werror -O0 -I../include -ggdb -fpic
+CCFLAGS  = -Wall -Wextra -Werror -O3 -march=native -I../include -ggdb -D_FILE_OFFSET_BITS=64
+# CCFLAGS  = -Wall -Wextra -Werror -O0 -I../include -ggdb -fpic -D_FILE_OFFSET_BITS=64
 CFLAGS   = $(CCFLAGS) -std=c99
 CXXFLAGS = $(CCFLAGS) -std=c++11 -Wold-style-cast
--- a/scripts/beesd.conf.sample
+++ b/scripts/beesd.conf.sample
@@ -2,8 +2,11 @@
 ## https://github.com/Zygo/bees
 ## It's a default values, change it, if needed

+# How to use?
+# Copy this file to a new file name and adjust the UUID below
+
 # Which FS will be used
-UUID=5d3c0ad5-bedf-463d-8235-b4d4f6f99476
+UUID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx

 ## System Vars
 # Change carefully
@@ -12,8 +15,8 @@ UUID=5d3c0ad5-bedf-463d-8235-b4d4f6f99476
 # BEESHOME="$MNT_DIR/.beeshome"
 # BEESSTATUS="$WORK_DIR/$UUID.status"

-## Make path shorter in logs
-# LOG_SHORT_PATH=N
+## Default options to apply, see --help for details
+# OPTIONS="--relative-paths --notimestamps"

 ## Bees DB size
 # Hash Table Sizing
--- a/scripts/beesd.in
+++ b/scripts/beesd.in
@@ -1,5 +1,4 @@
 #!/bin/bash
-# /usr/bin/beesd

 ## Helpful functions
 INFO(){ echo "INFO:" "$@"; }
@@ -13,18 +12,34 @@ export CONFIG_FILE
 export UUID AL16M

 readonly AL16M="$((16*1024*1024))"
-readonly CONFIG_DIR=/etc/bees/
+readonly CONFIG_DIR=@PREFIX@/etc/bees/

 ## Pre checks
 {
    [ ! -d "$CONFIG_DIR" ] && ERRO "Missing: $CONFIG_DIR"
-    [ "$UID" == "0" ] || ERRO "Must be runned as root"
+    [ "$UID" == "0" ] || ERRO "Must be run as root"
 }

-command -v bees &> /dev/null || ERRO "Missing 'bees' command"
+command -v @LIBEXEC_PREFIX@/bees &> /dev/null || ERRO "Missing 'bees' agent"

 ## Parse args
-UUID="$1"
+ARGUMENTS=()  # options (words starting with "-") collected for the bees binary
+while [ $# -gt 0 ]; do
+    case "$1" in
+        -*)
+            ARGUMENTS+=("$1")  # quoted: store the option verbatim, no word splitting or globbing
+        ;;
+        *)
+            if [ -z "$UUID" ]; then
+                UUID="$1"
+            else
+                ERRO "Only one filesystem may be supplied"
+            fi
+        ;;
+    esac
+    shift
+done
+
 case "$UUID" in
    *-*-*-*-*)
        FILE_CONFIG=""
@@ -39,7 +54,7 @@ case "$UUID" in
        source "$FILE_CONFIG"
    ;;
    *)
-        echo "beesd <btrfs_uuid>"
+        echo "beesd [options] <btrfs_uuid>"
        exit 1
    ;;
 esac
@@ -49,18 +64,19 @@ MNT_DIR="${MNT_DIR:-$WORK_DIR/mnt/$UUID}"
 BEESHOME="${BEESHOME:-$MNT_DIR/.beeshome}"
 BEESSTATUS="${BEESSTATUS:-$WORK_DIR/$UUID.status}"
 DB_SIZE="${DB_SIZE:-$((64*AL16M))}"
-LOG_SHORT_PATH="${LOG_SHORT_PATH:-N}"
-
-INFO "Check: BTRFS UUID exists"
-if [ ! -d "/sys/fs/btrfs/$UUID" ]; then
-    ERRO "Can't find BTRFS UUID: $UUID"
-fi

 INFO "Check: Disk exists"
 if [ ! -b "/dev/disk/by-uuid/$UUID" ]; then
    ERRO "Missing disk: /dev/disk/by-uuid/$UUID"
 fi

+is_btrfs(){ [ "$(blkid -s TYPE -o value "$1")" == "btrfs" ]; }
+
+INFO "Check: Disk with btrfs"
+if ! is_btrfs "/dev/disk/by-uuid/$UUID"; then
+    ERRO "Disk not contain btrfs: /dev/disk/by-uuid/$UUID"
+fi
+
 INFO "WORK DIR: $WORK_DIR"
 mkdir -p "$WORK_DIR" || exit 1

@@ -97,10 +113,7 @@ fi
    chmod 700 "$DB_PATH"
 }

-if YN "$LOG_SHORT_PATH"; then
-    cd "$MNT_DIR" || exit 1
-    bees .
-else
-    bees "$MNT_DIR"
-fi
-exit 0
+MNT_DIR="${MNT_DIR//\/\//\/}"  # collapse each "//" in the path to a single "/"
+
+cd "$MNT_DIR" || exit 1
+@LIBEXEC_PREFIX@/bees "${ARGUMENTS[@]}" $OPTIONS "$MNT_DIR"  # $OPTIONS is deliberately unquoted so it splits into separate words
--- a/scripts/beesd@.service
+++ b/scripts/beesd@.service
@@ -1,14 +0,0 @@
-[Unit]
-Description=Bees - Best-Effort Extent-Same, a btrfs deduplicator daemon: %i
-After=local-fs.target
-
-[Service]
-ExecStart=/usr/bin/beesd %i
-Nice=19
-IOSchedulingClass=idle
-CPUAccounting=true
-MemoryAccounting=true
-# CPUQuota=95%
-
-[Install]
-WantedBy=local-fs.target
--- a/scripts/beesd@.service.in
+++ b/scripts/beesd@.service.in
@@ -0,0 +1,25 @@
+[Unit]
+Description=Bees (%i)
+Documentation=https://github.com/Zygo/bees
+After=sysinit.target
+
+[Service]
+Type=simple
+ExecStart=/usr/sbin/beesd %i
+# Lowest CPU niceness, batch CPU policy and idle I/O class: bees runs as a background job
+Nice=19
+KillMode=control-group
+KillSignal=SIGTERM
+CPUShares=128
+StartupCPUShares=256
+BlockIOWeight=100
+StartupBlockIOWeight=250
+IOSchedulingClass=idle
+IOSchedulingPriority=7
+CPUSchedulingPolicy=batch
+Restart=on-abnormal
+CPUAccounting=true
+MemoryAccounting=true
+
+[Install]
+WantedBy=basic.target
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -1 +1 @@
-bees-version.h
+bees-version.[ch]
--- a/src/Makefile
+++ b/src/Makefile
@@ -8,14 +8,16 @@ all: $(PROGRAMS) depends.mk
 include ../makeflags

 LIBS = -lcrucible -lpthread
-LDFLAGS = -L../lib -Wl,-rpath=$(shell realpath ../lib)
+LDFLAGS = -L../lib

 depends.mk: Makefile *.cc
-	echo "#define BEES_VERSION \"$(shell git describe --always --dirty || echo UNKNOWN)\"" > bees-version.new.h
-	mv -f bees-version.new.h bees-version.h
 	for x in *.cc; do $(CXX) $(CXXFLAGS) -M "$$x"; done > depends.mk.new
 	mv -fv depends.mk.new depends.mk

+bees-version.c: Makefile *.cc *.h
+	echo "const char *BEES_VERSION = \"$(shell git describe --always --dirty || echo UNKNOWN)\";" > bees-version.new.c
+	mv -f bees-version.new.c bees-version.c
+
 -include depends.mk

 %.o: %.cc %.h
@@ -33,9 +35,11 @@ BEES_OBJS = \
 	bees-roots.o \
 	bees-thread.o \
 	bees-types.o \
+	bees-version.o \

 ../bin/bees: $(BEES_OBJS)
 	$(CXX) $(CXXFLAGS) -o "$@" $(BEES_OBJS) $(LDFLAGS) $(LIBS)

 clean:
-	-rm -fv *.o bees-version.h
+	-rm -fv bees-version.h
+	-rm -fv *.o bees-version.c
--- a/src/bees-context.cc
+++ b/src/bees-context.cc
@@ -1,6 +1,7 @@
 #include "bees.h"

 #include "crucible/limits.h"
+#include "crucible/process.h"
 #include "crucible/string.h"

 #include <fstream>
@@ -29,12 +30,14 @@ BeesFdCache::BeesFdCache()
 		BEESCOUNTADD(open_root_ms, open_timer.age() * 1000);
 		return rv;
 	});
+	m_root_cache.max_size(BEES_ROOT_FD_CACHE_SIZE);
 	m_file_cache.func([&](shared_ptr<BeesContext> ctx, uint64_t root, uint64_t ino) -> Fd {
 		Timer open_timer;
 		auto rv = ctx->roots()->open_root_ino_nocache(root, ino);
 		BEESCOUNTADD(open_ino_ms, open_timer.age() * 1000);
 		return rv;
 	});
+	m_file_cache.max_size(BEES_FILE_FD_CACHE_SIZE);
 }

 Fd
@@ -55,6 +58,12 @@ BeesFdCache::open_root(shared_ptr<BeesContext> ctx, uint64_t root)
 Fd
 BeesFdCache::open_root_ino(shared_ptr<BeesContext> ctx, uint64_t root, uint64_t ino)
 {
+	if (m_file_cache_timer.age() > BEES_COMMIT_INTERVAL) {
+		BEESINFO("Clearing open FD cache to enable file delete");
+		m_file_cache.clear();
+		m_file_cache_timer.reset();
+		BEESCOUNT(open_clear);
+	}
 	return m_file_cache(ctx, root, ino);
 }

@@ -65,97 +74,6 @@ BeesFdCache::insert_root_ino(shared_ptr<BeesContext> ctx, Fd fd)
 	return m_file_cache.insert(fd, ctx, fid.root(), fid.ino());
 }

-mutex BeesWorkQueueBase::s_mutex;
-set<BeesWorkQueueBase*> BeesWorkQueueBase::s_all_workers;
-
-BeesWorkQueueBase::BeesWorkQueueBase(const string &name) :
-	m_name(name)
-{
-}
-
-BeesWorkQueueBase::~BeesWorkQueueBase()
-{
-	unique_lock<mutex> lock(s_mutex);
-	s_all_workers.erase(this);
-}
-
-void
-BeesWorkQueueBase::for_each_work_queue(std::function<void (BeesWorkQueueBase*)> f)
-{
-	unique_lock<mutex> lock(s_mutex);
-	for (auto i : s_all_workers) {
-		f(i);
-	}
-}
-
-string
-BeesWorkQueueBase::name() const
-{
-	return m_name;
-}
-
-void
-BeesWorkQueueBase::name(const string &new_name)
-{
-	m_name = new_name;
-}
-
-template <class Task>
-BeesWorkQueue<Task>::~BeesWorkQueue()
-{
-}
-
-template <class Task>
-BeesWorkQueue<Task>::BeesWorkQueue(const string &name) :
-	BeesWorkQueueBase(name)
-{
-	unique_lock<mutex> lock(s_mutex);
-	s_all_workers.insert(this);
-}
-
-template <class Task>
-void
-BeesWorkQueue<Task>::push_active(const Task &t)
-{
-	BEESNOTE("pushing task " << t);
-	m_active_queue.push(t);
-}
-
-template <class Task>
-void
-BeesWorkQueue<Task>::push_active(const Task &t, size_t limit)
-{
-	// BEESNOTE("pushing limit " << limit << " task " << t);
-	m_active_queue.push_wait(t, limit);
-}
-
-template <class Task>
-size_t
-BeesWorkQueue<Task>::active_size() const
-{
-	return m_active_queue.size();
-}
-
-template <class Task>
-list<string>
-BeesWorkQueue<Task>::peek_active(size_t count) const
-{
-	list<string> rv;
-	for (auto i : m_active_queue.peek(count)) {
-		ostringstream oss;
-		oss << i;
-		rv.push_back(oss.str());
-	}
-	return rv;
-}
-
-template <class Task>
-Task
-BeesWorkQueue<Task>::pop()
-{
-	return m_active_queue.pop();
-}
-
 void
 BeesContext::dump_status()
 {
@@ -182,12 +100,6 @@ BeesContext::dump_status()
 			ofs << "\ttid " << t.first << ": " << t.second << "\n";
 		}

-		BeesWorkQueueBase::for_each_work_queue([&](BeesWorkQueueBase *worker) {
-			ofs << "QUEUE: " << worker->name() << " active: " << worker->active_size() << "\n";
-			for (auto t : worker->peek_active(10)) {
-				ofs << "\t" << t << "\n";
-			}
-		});
 		ofs.close();

 		BEESNOTE("renaming status file '" << status_file << "'");
@@ -223,10 +135,6 @@ BeesContext::show_progress()
 		};
 		lastProgressStats = thisStats;

-		BeesWorkQueueBase::for_each_work_queue([&](BeesWorkQueueBase *worker) {
-			BEESLOG("QUEUE: " << worker->name() << " active: " << worker->active_size());
-		});
-
 		BEESLOG("THREADS:");

 		for (auto t : BeesNote::get_status()) {	
@@ -252,6 +160,7 @@ BeesContext::home_fd()
 BeesContext::BeesContext(shared_ptr<BeesContext> parent) :
 	m_parent_ctx(parent)
 {
+	// m_extent_lock_set.max_size(bees_worker_thread_count());;
 	if (m_parent_ctx) {
 		m_fd_cache = m_parent_ctx->fd_cache();
 	}
@@ -260,44 +169,23 @@ BeesContext::BeesContext(shared_ptr<BeesContext> parent) :
 bool
 BeesContext::dedup(const BeesRangePair &brp)
 {
-	// TOOLONG and NOTE can retroactively fill in the filename details, but LOG can't
-	BEESNOTE("dedup " << brp);
-
+	// Open the files
 	brp.first.fd(shared_from_this());
 	brp.second.fd(shared_from_this());

-#if 0
-	// This avoids some sort of kernel race condition;
-	// however, it also doubles our dedup times.
-	// Is avoiding a crash every few weeks worth it?
-	bees_sync(brp.first.fd());
-#endif
-
+	BEESNOTE("dedup " << brp);
 	BEESTOOLONG("dedup " << brp);

-	thread_local BeesFileId tl_first_fid, tl_second_fid;
-	if (tl_first_fid != brp.first.fid()) {
-		BEESLOG("dedup: src " << name_fd(brp.first.fd()));
-		tl_first_fid = brp.first.fid();
-		tl_second_fid = BeesFileId();
-	}
-	ostringstream dst_line;
-	dst_line << "       dst " << pretty(brp.first.size()) << " [" << to_hex(brp.first.begin()) << ".." << to_hex(brp.first.end()) << "]";
-	if (brp.first.begin() != brp.second.begin()) {
-		dst_line << " [" << to_hex(brp.second.begin()) << ".." << to_hex(brp.second.end()) << "]";
-	}
 	BeesAddress first_addr(brp.first.fd(), brp.first.begin());
 	BeesAddress second_addr(brp.second.fd(), brp.second.begin());
-	dst_line << " (" << first_addr << "->" << second_addr << ")";
+
+	BEESLOG("dedup: src " << pretty(brp.first.size())  << " [" << to_hex(brp.first.begin())  << ".." << to_hex(brp.first.end())  << "] {" << first_addr  << "} " << name_fd(brp.first.fd()));
+	BEESLOG("       dst " << pretty(brp.second.size()) << " [" << to_hex(brp.second.begin()) << ".." << to_hex(brp.second.end()) << "] {" << second_addr << "} " << name_fd(brp.second.fd()));
+
 	if (first_addr.get_physical_or_zero() == second_addr.get_physical_or_zero()) {
 		BEESLOGTRACE("equal physical addresses in dedup");
 		BEESCOUNT(bug_dedup_same_physical);
 	}
-	if (tl_second_fid != brp.second.fid()) {
-		dst_line << " " << name_fd(brp.second.fd());
-		tl_second_fid = brp.second.fid();
-	}
-	BEESLOG(dst_line.str());

 	THROW_CHECK1(invalid_argument, brp, !brp.first.overlaps(brp.second));
 	THROW_CHECK1(invalid_argument, brp, brp.first.size() == brp.second.size());
@@ -342,6 +230,7 @@ BeesContext::rewrite_file_range(const BeesFileRange &bfr)
 	// BEESLOG("\torig_bbd " << orig_bbd);
 	BeesBlockData dup_bbd(dup_brp.first.fd(), dup_brp.first.begin(), min(BLOCK_SIZE_SUMS, dup_brp.first.size()));
 	// BEESLOG("BeesResolver br(..., " << bfr << ")");
+	BEESTRACE("BeesContext::rewrite_file_range calling BeesResolver " << bfr);
 	BeesResolver br(m_ctx, BeesAddress(bfr.fd(), bfr.begin()));
 	// BEESLOG("\treplace_src " << dup_bbd);
 	br.replace_src(dup_bbd);
@@ -537,6 +426,7 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
 			if (found_addr.is_toxic()) {
 				BEESINFO("WORKAROUND: abandoned toxic match for hash " << hash << " addr " << found_addr);
 				// Don't push these back in because we'll never delete them.
+				// Extents may become non-toxic so give them a chance to expire.
 				// hash_table->push_front_hash_addr(hash, found_addr);
 				BEESCOUNT(scan_toxic_hash);
 				return bfr;
@@ -547,17 +437,16 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)
 			catch_all([&]() {
 				BEESNOTE("resolving " << found_addr << " matched " << bbd);
 				BEESTRACE("resolving " << found_addr << " matched " << bbd);
+				BEESTRACE("BeesContext::scan_one_extent calling BeesResolver " << found_addr);
 				BeesResolver resolved(m_ctx, found_addr);
 				// Toxic extents are really toxic
 				if (resolved.is_toxic()) {
 					BEESINFO("WORKAROUND: abandoned toxic match at found_addr " << found_addr << " matching bbd " << bbd);
 					BEESCOUNT(scan_toxic_match);
-#if 0
-					// Don't push these back in because we'll never delete them.
-					// Make sure we never see this hash again
+					// Make sure we never see this hash again.
+					// It has become toxic since it was inserted into the hash table.
 					found_addr.set_toxic();
 					hash_table->push_front_hash_addr(hash, found_addr);
-#endif
 					abandon_extent = true;
 				} else if (!resolved.count()) {
 					BEESCOUNT(scan_resolve_zero);
@@ -779,13 +668,7 @@ BeesContext::scan_one_extent(const BeesFileRange &bfr, const Extent &e)

 	// Visualize
 	if (bar != string(block_count, '.')) {
-		thread_local BeesFileId last_fid;
-		string file_name;
-		if (bfr.fid() != last_fid) {
-			last_fid = bfr.fid();
-			file_name = " " + name_fd(bfr.fd());
-		}
-		BEESLOG("scan: " << pretty(e.size()) << " " << to_hex(e.begin()) << " [" << bar << "] " << to_hex(e.end()) << file_name);
+		BEESLOG("scan: " << pretty(e.size()) << " " << to_hex(e.begin()) << " [" << bar << "] " << to_hex(e.end()) << ' ' << name_fd(bfr.fd()));
 	}

 	return bfr;
@@ -837,6 +720,9 @@ BeesContext::scan_forward(const BeesFileRange &bfr)
 			e = ew.current();

 			catch_all([&]() {
+				uint64_t extent_bytenr = e.bytenr();
+				BEESNOTE("waiting for extent bytenr " << to_hex(extent_bytenr));
+				auto extent_lock = m_extent_lock_set.make_lock(extent_bytenr);
 				Timer one_extent_timer;
 				return_bfr = scan_one_extent(bfr, e);
 				BEESCOUNTADD(scanf_extent_ms, one_extent_timer.age() * 1000);
@@ -868,12 +754,19 @@ BeesContext::resolve_addr_uncached(BeesAddress addr)
 {
 	THROW_CHECK1(invalid_argument, addr, !addr.is_magic());
 	THROW_CHECK0(invalid_argument, !!root_fd());
+
+	// To avoid hammering all the cores with long-running ioctls,
+	// only do one resolve at any given time.
+	BEESNOTE("waiting to resolve addr " << addr);
+	auto lock = bees_ioctl_lock_set.make_lock(gettid());
+
 	Timer resolve_timer;

 	// There is no performance benefit if we restrict the buffer size.
        BtrfsIoctlLogicalInoArgs log_ino(addr.get_physical_or_zero());

 	{
+		BEESNOTE("resolving addr " << addr);
 		BEESTOOLONG("Resolving addr " << addr << " in " << root_path() << " refs " << log_ino.m_iors.size());
 		if (log_ino.do_ioctl_nothrow(root_fd())) {
 			BEESCOUNT(resolve_ok);
@@ -928,8 +821,9 @@ BeesContext::set_root_fd(Fd fd)
 	m_root_uuid = fsinfo.uuid();
 	BEESLOG("Filesystem UUID is " << m_root_uuid);

-	// 65536 is big enough for two max-sized extents
-	m_resolve_cache.max_size(65536);
+	// 65536 is big enough for two max-sized extents.
+	// Need enough total space in the cache for the maximum number of active threads.
+	m_resolve_cache.max_size(65536 * bees_worker_thread_count());
 	m_resolve_cache.func([&](BeesAddress addr) -> BeesResolveAddrResult {
 		return resolve_addr_uncached(addr);
 	});
@@ -969,7 +863,8 @@ BeesContext::tmpfile()
 	if (!m_tmpfiles[this_thread::get_id()]) {
 		m_tmpfiles[this_thread::get_id()] = make_shared<BeesTempFile>(shared_from_this());
 	}
-	return m_tmpfiles[this_thread::get_id()];
+	auto rv = m_tmpfiles[this_thread::get_id()];
+	return rv;
 }

 shared_ptr<BeesFdCache>
@@ -980,7 +875,8 @@ BeesContext::fd_cache()
 	if (!m_fd_cache) {
 		m_fd_cache = make_shared<BeesFdCache>();
 	}
-	return m_fd_cache;
+	auto rv = m_fd_cache;
+	return rv;
 }

 shared_ptr<BeesRoots>
@@ -991,7 +887,8 @@ BeesContext::roots()
 	if (!m_roots) {
 		m_roots = make_shared<BeesRoots>(shared_from_this());
 	}
-	return m_roots;
+	auto rv = m_roots;
+	return rv;
 }

 shared_ptr<BeesHashTable>
@@ -1002,7 +899,8 @@ BeesContext::hash_table()
 	if (!m_hash_table) {
 		m_hash_table = make_shared<BeesHashTable>(shared_from_this(), "beeshash.dat");
 	}
-	return m_hash_table;
+	auto rv = m_hash_table;
+	return rv;
 }

 void
@@ -1018,8 +916,3 @@ BeesContext::insert_root_ino(Fd fd)
 {
 	fd_cache()->insert_root_ino(shared_from_this(), fd);
 }
-
-// instantiate templates for linkage ----------------------------------------
-
-template class BeesWorkQueue<BeesFileRange>;
-template class BeesWorkQueue<BeesRangePair>;
--- a/src/bees-hash.cc
+++ b/src/bees-hash.cc
@@ -1,4 +1,3 @@
-#include "bees-version.h"
 #include "bees.h"

 #include "crucible/crc64.h"
@@ -25,14 +24,16 @@ operator<<(ostream &os, const BeesHashTable::Cell &bhte)
 		  << BeesAddress(bhte.e_addr) << " }";
 }

+#if 0
+static
 void
-dump_bucket(BeesHashTable::Cell *p, BeesHashTable::Cell *q)
+dump_bucket_locked(BeesHashTable::Cell *p, BeesHashTable::Cell *q)
 {
-	// Must be called while holding m_bucket_mutex
 	for (auto i = p; i < q; ++i) {
 		BEESLOG("Entry " << i - p << " " << *i);
 	}
 }
+#endif

 const bool VERIFY_CLEARS_BUGS = false;

@@ -92,52 +93,74 @@ BeesHashTable::get_extent_range(HashType hash)
 	return make_pair(bp, ep);
 }

+bool
+BeesHashTable::flush_dirty_extent(uint64_t extent_index)
+{
+	BEESNOTE("flushing extent #" << extent_index << " of " << m_extents << " extents");
+
+	auto lock = lock_extent_by_index(extent_index);
+
+	// Not dirty, nothing to do
+	if (!m_extent_metadata.at(extent_index).m_dirty) {
+		return false;
+	}
+
+	bool wrote_extent = false;
+
+	catch_all([&]() {
+		uint8_t *dirty_extent     = m_extent_ptr[extent_index].p_byte;
+		uint8_t *dirty_extent_end = m_extent_ptr[extent_index + 1].p_byte;
+		THROW_CHECK1(out_of_range, dirty_extent,     dirty_extent     >= m_byte_ptr);
+		THROW_CHECK1(out_of_range, dirty_extent_end, dirty_extent_end <= m_byte_ptr_end);
+		THROW_CHECK2(out_of_range, dirty_extent_end, dirty_extent, dirty_extent_end - dirty_extent == BLOCK_SIZE_HASHTAB_EXTENT);
+		BEESTOOLONG("pwrite(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
+		// Copy the extent because we might be stuck writing for a while
+		vector<uint8_t> extent_copy(dirty_extent, dirty_extent_end);
+
+		// Mark extent non-dirty while we still hold the lock
+		m_extent_metadata.at(extent_index).m_dirty = false;
+
+		// Release the lock
+		lock.unlock();
+
+		// Write the extent (or not)
+		pwrite_or_die(m_fd, extent_copy, dirty_extent - m_byte_ptr);
+		BEESCOUNT(hash_extent_out);
+
+		wrote_extent = true;
+	});
+
+	BEESNOTE("flush rate limited after extent #" << extent_index << " of " << m_extents << " extents");
+	m_flush_rate_limit.sleep_for(BLOCK_SIZE_HASHTAB_EXTENT);
+	return wrote_extent;
+}
+
 void
 BeesHashTable::flush_dirty_extents()
 {
 	THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);

-	unique_lock<mutex> lock(m_extent_mutex);
-	auto dirty_extent_copy = m_buckets_dirty;
-	m_buckets_dirty.clear();
-	if (dirty_extent_copy.empty()) {
-		BEESNOTE("idle");
-		m_condvar.wait(lock);
-		return; // please call later, i.e. immediately
+	uint64_t wrote_extents = 0;
+	for (size_t extent_index = 0; extent_index < m_extents; ++extent_index) {
+		if (flush_dirty_extent(extent_index)) {
+			++wrote_extents;
+		}
 	}
-	lock.unlock();

-	size_t extent_counter = 0;
-	for (auto extent_number : dirty_extent_copy) {
-		++extent_counter;
-		BEESNOTE("flush extent #" << extent_number << " (" << extent_counter << " of " << dirty_extent_copy.size() << ")");
-		catch_all([&]() {
-			uint8_t *dirty_extent     = m_extent_ptr[extent_number].p_byte;
-			uint8_t *dirty_extent_end = m_extent_ptr[extent_number + 1].p_byte;
-			THROW_CHECK1(out_of_range, dirty_extent,     dirty_extent     >= m_byte_ptr);
-			THROW_CHECK1(out_of_range, dirty_extent_end, dirty_extent_end <= m_byte_ptr_end);
-			THROW_CHECK2(out_of_range, dirty_extent_end, dirty_extent, dirty_extent_end - dirty_extent == BLOCK_SIZE_HASHTAB_EXTENT);
-			BEESTOOLONG("pwrite(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
-			// Page locks slow us down more than copying the data does
-			vector<uint8_t> extent_copy(dirty_extent, dirty_extent_end);
-			pwrite_or_die(m_fd, extent_copy, dirty_extent - m_byte_ptr);
-			BEESCOUNT(hash_extent_out);
-		});
-		BEESNOTE("flush rate limited at extent #" << extent_number << " (" << extent_counter << " of " << dirty_extent_copy.size() << ")");
-		m_flush_rate_limit.sleep_for(BLOCK_SIZE_HASHTAB_EXTENT);
-	}
+	BEESNOTE("idle after writing " << wrote_extents << " of " << m_extents << " extents");
+	unique_lock<mutex> lock(m_dirty_mutex);
+	m_dirty_condvar.wait(lock);
 }

 void
-BeesHashTable::set_extent_dirty(HashType hash)
+BeesHashTable::set_extent_dirty_locked(uint64_t extent_index)
 {
-	THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);
-	auto pr = get_extent_range(hash);
-	uint64_t extent_number = reinterpret_cast<Extent *>(pr.first) - m_extent_ptr;
-	THROW_CHECK1(runtime_error, extent_number, extent_number < m_extents);
-	unique_lock<mutex> lock(m_extent_mutex);
-	m_buckets_dirty.insert(extent_number);
-	m_condvar.notify_one();
+	// Must already be locked
+	m_extent_metadata.at(extent_index).m_dirty = true;
+
+	// Signal writeback thread
+	unique_lock<mutex> dirty_lock(m_dirty_mutex);
+	m_dirty_condvar.notify_one();
 }

 void
@@ -180,13 +203,13 @@ BeesHashTable::prefetch_loop()
 		size_t unaligned_eof_count = 0;

 		for (uint64_t ext = 0; ext < m_extents; ++ext) {
-			BEESNOTE("prefetching hash table extent " << ext << " of " << m_extent_ptr_end - m_extent_ptr);
+			BEESNOTE("prefetching hash table extent " << ext << " of " << m_extents);
 			catch_all([&]() {
-				fetch_missing_extent(ext * c_buckets_per_extent);
+				fetch_missing_extent_by_index(ext);

-				BEESNOTE("analyzing hash table extent " << ext << " of " << m_extent_ptr_end - m_extent_ptr);
+				BEESNOTE("analyzing hash table extent " << ext << " of " << m_extents);
 				bool duplicate_bugs_found = false;
-				unique_lock<mutex> lock(m_bucket_mutex);
+				auto lock = lock_extent_by_index(ext);
 				for (Bucket *bucket = m_extent_ptr[ext].p_buckets; bucket < m_extent_ptr[ext + 1].p_buckets; ++bucket) {
 					if (verify_cell_range(bucket[0].p_cells, bucket[1].p_cells)) {
 						duplicate_bugs_found = true;
@@ -215,9 +238,8 @@ BeesHashTable::prefetch_loop()
 					// Count these instead of calculating the number so we get better stats in case of exceptions
 					occupied_count += this_bucket_occupied_count;
 				}
-				lock.unlock();
 				if (duplicate_bugs_found) {
-					set_extent_dirty(ext);
+					set_extent_dirty_locked(ext);
 				}
 			});
 		}
@@ -262,8 +284,8 @@ BeesHashTable::prefetch_loop()
 		graph_blob << "Uptime:  " << m_ctx->total_timer().age() << " seconds\n";
 		graph_blob << "Version: " << BEES_VERSION << "\n";

-		graph_blob 
-			<< "\nHash table page occupancy histogram (" << occupied_count << "/" << total_count << " cells occupied, " << (occupied_count * 100 / total_count) << "%)\n" 
+		graph_blob
+			<< "\nHash table page occupancy histogram (" << occupied_count << "/" << total_count << " cells occupied, " << (occupied_count * 100 / total_count) << "%)\n"
 			<< out.str() << "0%      |      25%      |      50%      |      75%      |   100% page fill\n"
 			<< "compressed " << compressed_count << " (" << percent(compressed_count, occupied_count) << ")"
 			<< " new-style " << compressed_offset_count << " (" << percent(compressed_offset_count, occupied_count) << ")"
@@ -292,55 +314,70 @@ BeesHashTable::prefetch_loop()
 	}
 }

-void
-BeesHashTable::fetch_missing_extent(HashType hash)
+size_t
+BeesHashTable::hash_to_extent_index(HashType hash)
+{
+	auto pr = get_extent_range(hash);
+	uint64_t extent_index = reinterpret_cast<const Extent *>(pr.first) - m_extent_ptr;
+	THROW_CHECK2(runtime_error, extent_index, m_extents, extent_index < m_extents);
+	return extent_index;
+}
+
+BeesHashTable::ExtentMetaData::ExtentMetaData() :
+	m_mutex_ptr(make_shared<mutex>())
+{
+}
+
+unique_lock<mutex>
+BeesHashTable::lock_extent_by_index(uint64_t extent_index)
+{
+	THROW_CHECK2(out_of_range, extent_index, m_extents, extent_index < m_extents);
+	return unique_lock<mutex>(*m_extent_metadata.at(extent_index).m_mutex_ptr);
+}
+
+unique_lock<mutex>
+BeesHashTable::lock_extent_by_hash(HashType hash)
 {
 	BEESTOOLONG("fetch_missing_extent for hash " << to_hex(hash));
-	THROW_CHECK1(runtime_error, m_buckets, m_buckets > 0);
-	auto pr = get_extent_range(hash);
-	uint64_t extent_number = reinterpret_cast<Extent *>(pr.first) - m_extent_ptr;
-	THROW_CHECK1(runtime_error, extent_number, extent_number < m_extents);
+	return lock_extent_by_index(hash_to_extent_index(hash));
+}

-	unique_lock<mutex> lock(m_extent_mutex);
-	if (!m_buckets_missing.count(extent_number)) {
+void
+BeesHashTable::fetch_missing_extent_by_index(uint64_t extent_index)
+{
+	BEESNOTE("checking hash extent #" << extent_index << " of " << m_extents << " extents");
+	auto lock = lock_extent_by_index(extent_index);
+	if (!m_extent_metadata.at(extent_index).m_missing) {
 		return;
 	}

-	size_t missing_buckets = m_buckets_missing.size();
-	lock.unlock();
-
-	BEESNOTE("fetch waiting for hash extent #" << extent_number << ", " << missing_buckets << " left to fetch");
-
-	// Acquire blocking lock on this extent only
-	LockSet<uint64_t>::Lock extent_lock(m_extent_lock_set, extent_number);
-
-	// Check missing again because someone else might have fetched this
-	// extent for us while we didn't hold any locks
-	lock.lock();
-	if (!m_buckets_missing.count(extent_number)) {
-		BEESCOUNT(hash_extent_in_twice);
-		return;
-	}
-	lock.unlock();
-
 	// OK we have to read this extent
-	BEESNOTE("fetching hash extent #" << extent_number << ", " << missing_buckets << " left to fetch");
+	BEESNOTE("fetching hash extent #" << extent_index << " of " << m_extents << " extents");
+	BEESTRACE("Fetching hash extent #" << extent_index << " of " << m_extents << " extents");
+	BEESTOOLONG("Fetching hash extent #" << extent_index << " of " << m_extents << " extents");

-	BEESTRACE("Fetching missing hash extent " << extent_number);
-	uint8_t *dirty_extent     = m_extent_ptr[extent_number].p_byte;
-	uint8_t *dirty_extent_end = m_extent_ptr[extent_number + 1].p_byte;
+	uint8_t *dirty_extent     = m_extent_ptr[extent_index].p_byte;
+	uint8_t *dirty_extent_end = m_extent_ptr[extent_index + 1].p_byte;

-	{
+	// If the read fails don't retry, just go with whatever data we have
+	m_extent_metadata.at(extent_index).m_missing = false;
+
+	catch_all([&]() {
 		BEESTOOLONG("pread(fd " << m_fd << " '" << name_fd(m_fd)<< "', length " << to_hex(dirty_extent_end - dirty_extent) << ", offset " << to_hex(dirty_extent - m_byte_ptr) << ")");
 		pread_or_die(m_fd, dirty_extent, dirty_extent_end - dirty_extent, dirty_extent - m_byte_ptr);
-	}
+	});

+	// NOTE(review): this runs after catch_all returns, so it appears to count failed reads too - confirm intent
 	BEESCOUNT(hash_extent_in);
-	// We don't block when fetching an extent but we do slow down the
-	// prefetch thread.
-	m_prefetch_rate_limit.borrow(BLOCK_SIZE_HASHTAB_EXTENT);
-	lock.lock();
-	m_buckets_missing.erase(extent_number);
+}
+
+void
+BeesHashTable::fetch_missing_extent_by_hash(HashType hash)
+{
+	uint64_t extent_index = hash_to_extent_index(hash);
+	BEESNOTE("waiting to fetch hash extent #" << extent_index << " of " << m_extents << " extents");
+
+	fetch_missing_extent_by_index(extent_index);
 }

 bool
@@ -362,10 +399,10 @@ BeesHashTable::find_cell(HashType hash)
 		rv.push_back(toxic_cell);
 		return rv;
 	}
-	fetch_missing_extent(hash);
+	fetch_missing_extent_by_hash(hash);
 	BEESTOOLONG("find_cell hash " << BeesHash(hash));
 	vector<Cell> rv;
-	unique_lock<mutex> lock(m_bucket_mutex);
+	auto lock = lock_extent_by_hash(hash);
 	auto er = get_cell_range(hash);
 	// FIXME:  Weed out zero addresses in the table due to earlier bugs
 	copy_if(er.first, er.second, back_inserter(rv), [=](const Cell &ip) { return ip.e_hash == hash && ip.e_addr >= 0x1000; });
@@ -381,9 +418,9 @@ BeesHashTable::find_cell(HashType hash)
 void
 BeesHashTable::erase_hash_addr(HashType hash, AddrType addr)
 {
-	fetch_missing_extent(hash);
+	fetch_missing_extent_by_hash(hash);
 	BEESTOOLONG("erase hash " << to_hex(hash) << " addr " << addr);
-	unique_lock<mutex> lock(m_bucket_mutex);
+	auto lock = lock_extent_by_hash(hash);
 	auto er = get_cell_range(hash);
 	Cell mv(hash, addr);
 	Cell *ip = find(er.first, er.second, mv);
@@ -391,7 +428,7 @@ BeesHashTable::erase_hash_addr(HashType hash, AddrType addr)
 	if (found) {
 		// Lookups on invalid addresses really hurt us.  Kill it with fire!
 		*ip = Cell(0, 0);
-		set_extent_dirty(hash);
+		set_extent_dirty_locked(hash_to_extent_index(hash));
 		BEESCOUNT(hash_erase);
 #if 0
 		if (verify_cell_range(er.first, er.second)) {
@@ -409,9 +446,9 @@ BeesHashTable::erase_hash_addr(HashType hash, AddrType addr)
 bool
 BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
 {
-	fetch_missing_extent(hash);
+	fetch_missing_extent_by_hash(hash);
 	BEESTOOLONG("push_front_hash_addr hash " << BeesHash(hash) <<" addr " << BeesAddress(addr));
-	unique_lock<mutex> lock(m_bucket_mutex);
+	auto lock = lock_extent_by_hash(hash);
 	auto er = get_cell_range(hash);
 	Cell mv(hash, addr);
 	Cell *ip = find(er.first, er.second, mv);
@@ -441,7 +478,7 @@ BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
 	// There is now a space at the front, insert there if different
 	if (er.first[0] != mv) {
 		er.first[0] = mv;
-		set_extent_dirty(hash);
+		set_extent_dirty_locked(hash_to_extent_index(hash));
 		BEESCOUNT(hash_front);
 	}
 #if 0
@@ -460,9 +497,9 @@ BeesHashTable::push_front_hash_addr(HashType hash, AddrType addr)
 bool
 BeesHashTable::push_random_hash_addr(HashType hash, AddrType addr)
 {
-	fetch_missing_extent(hash);
+	fetch_missing_extent_by_hash(hash);
 	BEESTOOLONG("push_random_hash_addr hash " << BeesHash(hash) << " addr " << BeesAddress(addr));
-	unique_lock<mutex> lock(m_bucket_mutex);
+	auto lock = lock_extent_by_hash(hash);
 	auto er = get_cell_range(hash);
 	Cell mv(hash, addr);
 	Cell *ip = find(er.first, er.second, mv);
@@ -525,14 +562,14 @@ BeesHashTable::push_random_hash_addr(HashType hash, AddrType addr)
 	case_cond = 5;
 ret_dirty:
 	BEESCOUNT(hash_insert);
-	set_extent_dirty(hash);
+	set_extent_dirty_locked(hash_to_extent_index(hash));
 ret:
 #if 0
 	if (verify_cell_range(er.first, er.second, false)) {
 		BEESLOG("while push_randoming (case " << case_cond << ") pos " << pos
 			<< " ip " << (ip - er.first) << " " << mv);
-		// dump_bucket(saved.data(), saved.data() + saved.size());
-		// dump_bucket(er.first, er.second);
+		// dump_bucket_locked(saved.data(), saved.data() + saved.size());
+		// dump_bucket_locked(er.first, er.second);
 	}
 #else
 	(void)case_cond;
@@ -597,7 +634,6 @@ BeesHashTable::BeesHashTable(shared_ptr<BeesContext> ctx, string filename, off_t
 	m_writeback_thread("hash_writeback"),
 	m_prefetch_thread("hash_prefetch"),
 	m_flush_rate_limit(BEES_FLUSH_RATE),
-	m_prefetch_rate_limit(BEES_FLUSH_RATE),
 	m_stats_file(m_ctx->home_fd(), "beesstats.txt")
 {
 	// Sanity checks to protect the implementation from its weaknesses
@@ -640,18 +676,24 @@ BeesHashTable::BeesHashTable(shared_ptr<BeesContext> ctx, string filename, off_t
 	THROW_CHECK2(runtime_error, m_void_ptr, m_bucket_ptr, m_void_ptr == m_bucket_ptr);
 	THROW_CHECK2(runtime_error, m_void_ptr, m_extent_ptr, m_void_ptr == m_extent_ptr);

-	{
-		// It's OK if this fails (e.g. kernel not built with CONFIG_TRANSPARENT_HUGEPAGE)
-		// We don't fork any more so DONTFORK isn't really needed
-		BEESTOOLONG("madvise(MADV_HUGEPAGE | MADV_DONTFORK)");
-		if (madvise(m_byte_ptr, m_size, MADV_HUGEPAGE | MADV_DONTFORK)) {
-			BEESLOG("mostly harmless: madvise(MADV_HUGEPAGE | MADV_DONTFORK) failed: " << strerror(errno));
+	// Give all the madvise hints that the kernel understands
+	const struct madv_flag {
+		const char *name;
+		int value;
+	} madv_flags[] = {
+		{ .name = "MADV_HUGEPAGE", .value = MADV_HUGEPAGE },
+		{ .name = "MADV_DONTFORK", .value = MADV_DONTFORK },
+		{ .name = "MADV_DONTDUMP", .value = MADV_DONTDUMP },
+		{ .name = "", .value = 0 },
+	};
+	for (auto fp = madv_flags; fp->value; ++fp) {
+		BEESTOOLONG("madvise(" << fp->name << ")");
+		if (madvise(m_byte_ptr, m_size, fp->value)) {
+			BEESLOG("madvise(..., " << fp->name << "): " << strerror(errno) << " (ignored)");
 		}
 	}

-	for (uint64_t i = 0; i < m_size / sizeof(Extent); ++i) {
-		m_buckets_missing.insert(i);
-	}
+	m_extent_metadata.resize(m_extents);

 	m_writeback_thread.exec([&]() {
 		writeback_loop();
--- a/src/bees-resolve.cc
+++ b/src/bees-resolve.cc
@@ -105,7 +105,7 @@ BeesResolver::adjust_offset(const BeesFileRange &haystack, const BeesBlockData &
 	bool is_legacy = false;
 	if (m_addr.is_compressed()) {
 		BtrfsExtentWalker ew(haystack.fd(), haystack.begin(), m_ctx->root_fd());
-		BEESTRACE("haystack extent data " << ew); 
+		BEESTRACE("haystack extent data " << ew);
 		Extent e = ew.current();
 		if (m_addr.has_compressed_offset()) {
 			off_t coff = m_addr.get_compressed_offset();
--- a/src/bees-roots.cc
+++ b/src/bees-roots.cc
@@ -1,11 +1,14 @@
 #include "bees.h"

 #include "crucible/cache.h"
+#include "crucible/process.h"
 #include "crucible/string.h"

 #include <fstream>
 #include <tuple>

+#include <inttypes.h>
+
 using namespace crucible;
 using namespace std;

@@ -150,9 +153,12 @@ BeesRoots::crawl_state_erase(const BeesCrawlState &bcs)
 		return;
 	}

-	if (m_root_crawl_map.count(bcs.m_root)) {
-		m_root_crawl_map.erase(bcs.m_root);
+	auto found = m_root_crawl_map.find(bcs.m_root);
+	if (found != m_root_crawl_map.end()) {
+		auto hold_this_until_unlocked = found->second;
+		m_root_crawl_map.erase(found);
 		m_crawl_dirty = true;
+		lock.unlock();
 	}
 }

@@ -174,9 +180,9 @@ BeesRoots::transid_min()
 uint64_t
 BeesRoots::transid_max()
 {
-	BEESNOTE("Calculating transid_max");
 	uint64_t rv = 0;
 	uint64_t root = 0;
+	BEESNOTE("Calculating transid_max (" << rv << " as of root " << root << ")");
 	BEESTRACE("Calculating transid_max...");
 	do {
 		root = next_root(root);
@@ -191,97 +197,12 @@ BeesRoots::transid_max()
 	return rv;
 }

-void
-BeesRoots::crawl_roots()
-{
-	BEESNOTE("Crawling roots");
-
-	unique_lock<mutex> lock(m_mutex);
-	if (m_root_crawl_map.empty()) {
-		BEESNOTE("idle, crawl map is empty");
-		m_condvar.wait(lock);
-		// Don't count the time we were waiting as part of the crawl time
-		m_crawl_timer.reset();
-	}
-
-	// Work from a copy because BeesCrawl might change the world under us
-	auto crawl_map_copy = m_root_crawl_map;
-	lock.unlock();
-
-#if 0
-	// Scan the same inode/offset tuple in each subvol (good for snapshots)
-	BeesFileRange first_range;
-	shared_ptr<BeesCrawl> first_crawl;
-	for (auto i : crawl_map_copy) {
-		auto this_crawl = i.second;
-		auto this_range = this_crawl->peek_front();
-		if (this_range) {
-			if (!first_range || this_range < first_range) {
-				first_crawl = this_crawl;
-				first_range = this_range;
-			}
-		}
-	}
-
-	if (first_range) {
-		catch_all([&]() {
-			// BEESINFO("scan_forward " << first_range);
-			m_ctx->scan_forward(first_range);
-		});
-		BEESCOUNT(crawl_scan);
-		m_crawl_current = first_crawl->get_state();
-		auto first_range_popped = first_crawl->pop_front();
-		THROW_CHECK2(runtime_error, first_range, first_range_popped, first_range == first_range_popped);
-		return;
-	}
-#else
-	// Scan each subvol one extent at a time (good for continuous forward progress)
-	bool crawled = false;
-	for (auto i : crawl_map_copy) {
-		auto this_crawl = i.second;
-		auto this_range = this_crawl->peek_front();
-		if (this_range) {
-			catch_all([&]() {
-				// BEESINFO("scan_forward " << this_range);
-				m_ctx->scan_forward(this_range);
-			});
-			crawled = true;
-			BEESCOUNT(crawl_scan);
-			m_crawl_current = this_crawl->get_state();
-			auto this_range_popped = this_crawl->pop_front();
-			THROW_CHECK2(runtime_error, this_range, this_range_popped, this_range == this_range_popped);
-		}
-	}
-
-	if (crawled) return;
-#endif
-
-	BEESLOG("Crawl ran out of data after " << m_crawl_timer.lap() << "s, waiting for more...");
-	BEESCOUNT(crawl_done);
-	BEESNOTE("idle, waiting for more data");
-	lock.lock();
-	m_condvar.wait(lock);
-
-	// Don't count the time we were waiting as part of the crawl time
-	m_crawl_timer.reset();
-}
-
-void
-BeesRoots::crawl_thread()
-{
-	BEESNOTE("crawling");
-	while (1) {
-		catch_all([&]() {
-			crawl_roots();
-		});
-	}
-}
-
 void
 BeesRoots::writeback_thread()
 {
-	while (1) {
-		BEESNOTE(m_crawl_current << (m_crawl_dirty ? " (dirty)" : ""));
+	while (true) {
+		// BEESNOTE(m_crawl_current << (m_crawl_dirty ? " (dirty)" : ""));
+		BEESNOTE((m_crawl_dirty ? "dirty" : "clean") << ", interval " << BEES_WRITEBACK_INTERVAL << "s");

 		catch_all([&]() {
 			BEESNOTE("saving crawler state");
@@ -379,17 +300,16 @@ BeesRoots::state_load()
 BeesRoots::BeesRoots(shared_ptr<BeesContext> ctx) :
 	m_ctx(ctx),
 	m_crawl_state_file(ctx->home_fd(), crawl_state_filename()),
-	m_crawl_thread("crawl"),
 	m_writeback_thread("crawl_writeback")
 {
-	m_crawl_thread.exec([&]() {
-		catch_all([&]() {
-			state_load();
-		});
-		m_writeback_thread.exec([&]() {
-			writeback_thread();
-		});
-		crawl_thread();
+	// This is a sanity check to prevent us from running out of FDs
+	m_lock_set.max_size(BEES_WORKER_THREAD_LIMIT);
+
+	catch_all([&]() {
+		state_load();
+	});
+	m_writeback_thread.exec([&]() {
+		writeback_thread();
 	});
 }

@@ -470,7 +390,7 @@ BeesRoots::open_root_nocache(uint64_t rootid)
 				THROW_CHECK2(runtime_error, new_root_id, rootid, new_root_id == rootid);
 				Stat st(rv);
 				THROW_CHECK1(runtime_error, st.st_ino, st.st_ino == BTRFS_FIRST_FREE_OBJECTID);
-				BEESINFO("open_root_nocache " << rootid << ": " << name_fd(rv));
+				// BEESINFO("open_root_nocache " << rootid << ": " << name_fd(rv));
 				return rv;
 			}
 		}
@@ -594,6 +514,27 @@ BeesRoots::open_root_ino_nocache(uint64_t root, uint64_t ino)
 			break;
 		}

+		// As of 4.12 the kernel rejects dedup requests with
+		// src and dst that have different datasum flags.
+		//
+		// We can't detect those from userspace reliably, but
+		// we can detect the common case where one file is
+		// marked nodatacow (which implies nodatasum)
+		// on a filesystem that is mounted with datacow.
+		// These are arguably out of scope for dedup.
+		//
+		// To fix this properly, we have to keep track of which
+		// pairs of inodes failed to dedup, guess that the reason
+		// for failure was a mismatch of datasum flags, and
+		// create temporary files with the right flags somehow.
+		int attr = ioctl_iflags_get(rv);
+		if (attr & FS_NOCOW_FL) {
+			BEESLOG("Opening " << name_fd(rv) << " found FS_NOCOW_FL flag in " << to_hex(attr));
+			rv = Fd();
+			BEESCOUNT(open_wrong_flags);
+			break;
+		}
+
 		BEESTRACE("mapped " << BeesFileId(root, ino));
 		BEESTRACE("\tto " << name_fd(rv));
 		BEESCOUNT(open_hit);
@@ -612,8 +553,54 @@ BeesRoots::open_root_ino(uint64_t root, uint64_t ino)

 BeesCrawl::BeesCrawl(shared_ptr<BeesContext> ctx, BeesCrawlState initial_state) :
 	m_ctx(ctx),
-	m_state(initial_state)
+	m_state(initial_state),
+	m_thread(astringprintf("crawl_%" PRIu64, m_state.m_root))
 {
+	m_thread.exec([&]() {
+		crawl_thread();
+	});
+}
+
+BeesCrawl::~BeesCrawl()
+{
+	BEESLOGNOTE("Stopping crawl thread " << m_state);
+	unique_lock<mutex> lock(m_mutex);
+	m_stopped = true;
+	m_cond_stopped.notify_all();
+	lock.unlock();
+	BEESLOGNOTE("Joining crawl thread " << m_state);
+	m_thread.join();
+	BEESLOG("Stopped crawl thread " << m_state);
+}
+
+void
+BeesCrawl::crawl_thread()
+{
+	Timer crawl_timer;
+	while (!m_stopped) {
+		BEESNOTE("pop_front " << m_state);
+		auto this_range = pop_front();
+		if (this_range) {
+			catch_all([&]() {
+				BEESNOTE("waiting for scan thread limit " << m_state);
+				auto crawl_lock = m_ctx->roots()->lock_set().make_lock(m_state.m_root);
+
+				BEESNOTE("scan_forward " << this_range);
+				m_ctx->scan_forward(this_range);
+			});
+			BEESCOUNT(crawl_scan);
+		} else {
+			auto crawl_time = crawl_timer.age();
+			BEESLOGNOTE("Crawl ran out of data after " << crawl_time << "s, waiting for more...");
+			unique_lock<mutex> lock(m_mutex);
+			if (m_stopped) {
+				break;
+			}
+			m_cond_stopped.wait_for(lock, chrono::duration<double>(BEES_COMMIT_INTERVAL));
+			crawl_timer.reset();
+		}
+	}
+	BEESLOG("Crawl thread stopped");
 }

 bool
@@ -661,7 +648,7 @@ BeesCrawl::fetch_extents()
 	}

 	BEESNOTE("crawling " << get_state());
-	BEESLOG("Crawling " << get_state());
+	// BEESLOG("Crawling " << get_state());

 	Timer crawl_timer;

@@ -680,6 +667,11 @@ BeesCrawl::fetch_extents()
 	BEESTRACE("Searching crawl sk " << static_cast<btrfs_ioctl_search_key&>(sk));
 	bool ioctl_ok = false;
 	{
+#if 0
+		BEESNOTE("waiting to search crawl sk " << static_cast<btrfs_ioctl_search_key&>(sk));
+		auto lock = bees_ioctl_lock_set.make_lock(gettid());
+#endif
+
 		BEESNOTE("searching crawl sk " << static_cast<btrfs_ioctl_search_key&>(sk));
 		BEESTOOLONG("Searching crawl sk " << static_cast<btrfs_ioctl_search_key&>(sk));
 		Timer crawl_timer;
@@ -700,7 +692,7 @@ BeesCrawl::fetch_extents()
 		return next_transid();
 	}

-	BEESLOG("Crawling " << sk.m_result.size() << " results from " << get_state());
+	// BEESLOG("Crawling " << sk.m_result.size() << " results from " << get_state());
 	auto results_left = sk.m_result.size();
 	BEESNOTE("crawling " << results_left << " results from " << get_state());
 	size_t count_other = 0;
@@ -717,7 +709,6 @@ BeesCrawl::fetch_extents()

 		BEESTRACE("i = " << i);

-#if 1
 		// We need the "+ 1" and objectid rollover that next_min does.
 		auto new_state = get_state();
 		new_state.m_objectid = sk.min_objectid;
@@ -729,7 +720,6 @@ BeesCrawl::fetch_extents()
 		// is a lot of metadata we can't process.  Favor forward
 		// progress over losing search results.
 		set_state(new_state);
-#endif

 		// Ignore things that aren't EXTENT_DATA_KEY
 		if (i.type != BTRFS_EXTENT_DATA_KEY) {
@@ -742,13 +732,24 @@ BeesCrawl::fetch_extents()
 		if (gen < get_state().m_min_transid) {
 			BEESCOUNT(crawl_gen_low);
 			++count_low;
-			// We probably want (need?) to scan these anyway.
-			// continue;
+			// We want (need?) to scan these anyway?
+			// The header generation refers to the transid
+			// of the metadata page holding the current ref.
+			// This includes anything else in that page that
+			// happened to be modified, regardless of how
+			// old it is.
+			// The file_extent_generation refers to the
+			// transid of the extent item's page, which is
+			// a different approximation of what we want.
+			// Combine both of these filters to minimize
+			// the number of times we unnecessarily re-read
+			// an extent.
+			continue;
 		}
 		if (gen > get_state().m_max_transid) {
 			BEESCOUNT(crawl_gen_high);
 			++count_high;
-			// This shouldn't ever happen
+			// This shouldn't ever happen...and so far, doesn't.
 			// continue;
 		}

@@ -768,6 +769,7 @@ BeesCrawl::fetch_extents()
 				break;
 			case BTRFS_FILE_EXTENT_PREALLOC:
 				BEESCOUNT(crawl_prealloc);
+				// fallthrough
 			case BTRFS_FILE_EXTENT_REG: {
 				auto physical = call_btrfs_get(btrfs_stack_file_extent_disk_bytenr, i.m_data);
 				auto ram = call_btrfs_get(btrfs_stack_file_extent_ram_bytes, i.m_data);
@@ -797,7 +799,7 @@ BeesCrawl::fetch_extents()
 			}
 		}
 	}
-	BEESLOG("Crawled inline " << count_inline << " data " << count_data << " other " << count_other << " unknown " << count_unknown << " gen_low " << count_low << " gen_high " << count_high << " " << get_state() << " in " << crawl_timer << "s");
+	// BEESLOG("Crawled inline " << count_inline << " data " << count_data << " other " << count_other << " unknown " << count_unknown << " gen_low " << count_low << " gen_high " << count_high << " " << get_state() << " in " << crawl_timer << "s");

 	return true;
 }
@@ -822,7 +824,8 @@ BeesCrawl::peek_front()
 	if (m_extents.empty()) {
 		return BeesFileRange();
 	}
-	return *m_extents.begin();
+	auto rv = *m_extents.begin();
+	return rv;
 }

 BeesFileRange
@@ -835,12 +838,6 @@ BeesCrawl::pop_front()
 	}
 	auto rv = *m_extents.begin();
 	m_extents.erase(m_extents.begin());
-#if 0
-	auto state = get_state();
-	state.m_objectid = rv.fid().ino();
-	state.m_offset = rv.begin();
-	set_state(state);
-#endif
 	return rv;
 }

@@ -848,7 +845,8 @@ BeesCrawlState
 BeesCrawl::get_state()
 {
 	unique_lock<mutex> lock(m_state_mutex);
-	return m_state;
+	auto rv = m_state;
+	return rv;
 }

 void
--- a/src/bees-types.cc
+++ b/src/bees-types.cc
@@ -71,7 +71,18 @@ operator<<(ostream &os, const BeesFileRange &bfr)
 	if (bfr.end() == numeric_limits<off_t>::max()) {
 		os << "- [" << to_hex(bfr.begin()) << "..eof]";
 	} else {
-		os << pretty(bfr.size()) << " [" << to_hex(bfr.begin()) << ".." << to_hex(bfr.end()) << "]";
+		os << pretty(bfr.size()) << " ";
+		if (bfr.begin() != 0) {
+			os << "[" << to_hex(bfr.begin());
+		} else {
+			os << "(";
+		}
+		os << ".." << to_hex(bfr.end());
+		if (!!bfr.m_fd && bfr.end() >= bfr.file_size()) {
+			os << ")";
+		} else {
+			os << "]";
+		}
 	}
 	if (bfr.m_fid) {
 		os << " fid = " << bfr.m_fid;
@@ -92,8 +103,6 @@ operator<<(ostream &os, const BeesRangePair &brp)
 		<< "\ndst = " << brp.second.fd() << " " << name_fd(brp.second.fd());
 }

-mutex BeesFileRange::s_mutex;
-
 bool
 BeesFileRange::operator<(const BeesFileRange &that) const
 {
@@ -145,7 +154,6 @@ off_t
 BeesFileRange::file_size() const
 {
 	if (m_file_size <= 0) {
-		// Use method fd() not member m_fd() so we hold lock
 		Stat st(fd());
 		m_file_size = st.st_size;
 		// These checks could trigger on valid input, but that would mean we have
@@ -178,31 +186,21 @@ BeesFileRange::grow_begin(off_t delta)
 BeesFileRange::BeesFileRange(const BeesBlockData &bbd) :
 	m_fd(bbd.fd()),
 	m_begin(bbd.begin()),
-	m_end(bbd.end()),
-	m_file_size(-1)
+	m_end(bbd.end())
 {
 }

 BeesFileRange::BeesFileRange(Fd fd, off_t begin, off_t end) :
 	m_fd(fd),
 	m_begin(begin),
-	m_end(end),
-	m_file_size(-1)
+	m_end(end)
 {
 }

 BeesFileRange::BeesFileRange(const BeesFileId &fid, off_t begin, off_t end) :
 	m_fid(fid),
 	m_begin(begin),
-	m_end(end),
-	m_file_size(-1)
-{
-}
-
-BeesFileRange::BeesFileRange() :
-	m_begin(0),
-	m_end(0),
-	m_file_size(-1)
+	m_end(end)
 {
 }

@@ -285,22 +283,18 @@ BeesFileRange::operator BeesBlockData() const
 Fd
 BeesFileRange::fd() const
 {
-	unique_lock<mutex> lock(s_mutex);
 	return m_fd;
 }

 Fd
 BeesFileRange::fd(const shared_ptr<BeesContext> &ctx) const
 {
-	unique_lock<mutex> lock(s_mutex);
 	// If we don't have a fid we can't do much here
 	if (m_fid) {
 		if (!m_fd) {
 			// If we don't have a fd, open by fid
 			if (m_fid && ctx) {
-				lock.unlock();
 				Fd new_fd = ctx->roots()->open_root_ino(m_fid);
-				lock.lock();
 				m_fd = new_fd;
 			}
 		} else {
@@ -936,6 +930,7 @@ BeesBlockData::data() const
 {
 	if (m_data.empty()) {
 		THROW_CHECK1(invalid_argument, size(), size() > 0);
+		BEESNOTE("Reading BeesBlockData " << *this);
 		BEESTOOLONG("Reading BeesBlockData " << *this);
 		Timer read_timer;

--- a/src/bees.cc
+++ b/src/bees.cc
@@ -1,7 +1,5 @@
-#include "bees-version.h"
 #include "bees.h"

-#include "crucible/interp.h"
 #include "crucible/limits.h"
 #include "crucible/process.h"
 #include "crucible/string.h"
@@ -21,18 +19,31 @@
 #include <linux/fs.h>
 #include <sys/ioctl.h>

+// setrlimit
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include <getopt.h>
+
 using namespace crucible;
 using namespace std;

 int
-do_cmd_help(const ArgList &argv)
+do_cmd_help(char *argv[])
 {
-	cerr << "Usage: " << argv[0] << " fs-root-path [fs-root-path-2...]\n"
+	cerr << "Usage: " << argv[0] << " [options] fs-root-path [fs-root-path-2...]\n"
 		"Performs best-effort extent-same deduplication on btrfs.\n"
 		"\n"
 		"fs-root-path MUST be the root of a btrfs filesystem tree (id 5).\n"
 		"Other directories will be rejected.\n"
 		"\n"
+		"Options:\n"
+		"\t-h, --help\t\tShow this help\n"
+		"\t-t, --timestamps\tShow timestamps in log output (default)\n"
+		"\t-T, --notimestamps\tOmit timestamps in log output\n"
+		"\t-p, --absolute-paths\tShow absolute paths (default)\n"
+		"\t-P, --relative-paths\tShow paths relative to $CWD\n"
+		"\n"
 		"Optional environment variables:\n"
 		"\tBEESHOME\tPath to hash table and configuration files\n"
 		"\t\t\t(default is .beeshome/ in the root of each filesystem).\n"
@@ -48,30 +59,36 @@ do_cmd_help(const ArgList &argv)

 RateLimiter bees_info_rate_limit(BEES_INFO_RATE, BEES_INFO_BURST);

-thread_local BeesTracer *BeesTracer::s_next_tracer = nullptr;
+thread_local BeesTracer *BeesTracer::tl_next_tracer = nullptr;

 BeesTracer::~BeesTracer()
 {
 	if (uncaught_exception()) {
-		m_func();
+		try {
+			m_func();
+		} catch (exception &e) {
+			BEESLOG("Nested exception: " << e.what());
+		} catch (...) {
+			BEESLOG("Nested exception ...");
+		}
 		if (!m_next_tracer) {
 			BEESLOG("---  END  TRACE --- exception ---");
 		}
 	}
-	s_next_tracer = m_next_tracer;
+	tl_next_tracer = m_next_tracer;
 }

 BeesTracer::BeesTracer(function<void()> f) :
 	m_func(f)
 {
-	m_next_tracer = s_next_tracer;
-	s_next_tracer = this;
+	m_next_tracer = tl_next_tracer;
+	tl_next_tracer = this;
 }

 void
 BeesTracer::trace_now()
 {
-	BeesTracer *tp = s_next_tracer;
+	BeesTracer *tp = tl_next_tracer;
 	BEESLOG("--- BEGIN TRACE ---");
 	while (tp) {
 		tp->m_func();
@@ -80,17 +97,17 @@ BeesTracer::trace_now()
 	BEESLOG("---  END  TRACE ---");
 }

-thread_local BeesNote *BeesNote::s_next = nullptr;
+thread_local BeesNote *BeesNote::tl_next = nullptr;
 mutex BeesNote::s_mutex;
 map<pid_t, BeesNote*> BeesNote::s_status;
-thread_local string BeesNote::s_name;
+thread_local string BeesNote::tl_name;

 BeesNote::~BeesNote()
 {
+	tl_next = m_prev;
 	unique_lock<mutex> lock(s_mutex);
-	s_next = m_prev;
-	if (s_next) {
-		s_status[gettid()] = s_next;
+	if (tl_next) {
+		s_status[gettid()] = tl_next;
 	} else {
 		s_status.erase(gettid());
 	}
@@ -99,28 +116,26 @@ BeesNote::~BeesNote()
 BeesNote::BeesNote(function<void(ostream &os)> f) :
 	m_func(f)
 {
+	m_name = tl_name;
+	m_prev = tl_next;
+	tl_next = this;
 	unique_lock<mutex> lock(s_mutex);
-	m_name = s_name;
-	m_prev = s_next;
-	s_next = this;
-	s_status[gettid()] = s_next;
+	s_status[gettid()] = tl_next;
 }

 void
 BeesNote::set_name(const string &name)
 {
-	unique_lock<mutex> lock(s_mutex);
-	s_name = name;
+	tl_name = name;
 }

 string
 BeesNote::get_name()
 {
-	unique_lock<mutex> lock(s_mutex);
-	if (s_name.empty()) {
+	if (tl_name.empty()) {
 		return "bees";
 	} else {
-		return s_name;
+		return tl_name;
 	}
 }

@@ -202,15 +217,21 @@ operator<<(ostream &os, const BeesStatTmpl<T> &bs)

 // other ----------------------------------------

+/**
+ * Don't allow two threads to use some btrfs ioctls at the same time.
+ * Some of them consume egregious amounts of kernel CPU time and are
+ * not interruptible, so if we have more threads than cores we will
+ * effectively crash the kernel. */
+LockSet<pid_t> bees_ioctl_lock_set;
+
 template <class T>
 T&
 BeesStatTmpl<T>::at(string idx)
 {
-	unique_lock<mutex> lock(m_mutex);
-    if (!m_stats_map.count(idx)) {
+	if (!m_stats_map.count(idx)) {
 		m_stats_map[idx] = 0;
 	}
-    return m_stats_map[idx];
+	return m_stats_map[idx];
 }

 template <class T>
@@ -218,7 +239,8 @@ T
 BeesStatTmpl<T>::at(string idx) const
 {
 	unique_lock<mutex> lock(m_mutex);
-    return m_stats_map.at(idx);
+	auto rv = m_stats_map.at(idx);
+	return rv;
 }

 template <class T>
@@ -226,7 +248,7 @@ void
 BeesStatTmpl<T>::add_count(string idx, size_t amount)
 {
 	unique_lock<mutex> lock(m_mutex);
-    if (!m_stats_map.count(idx)) {
+	if (!m_stats_map.count(idx)) {
 		m_stats_map[idx] = 0;
 	}
 	m_stats_map.at(idx) += amount;
@@ -258,14 +280,17 @@ BeesStats
 BeesStats::operator-(const BeesStats &that) const
 {
 	if (&that == this) return BeesStats();
+
 	unique_lock<mutex> this_lock(m_mutex);
 	BeesStats this_copy;
 	this_copy.m_stats_map = m_stats_map;
+	this_lock.unlock();
+
 	unique_lock<mutex> that_lock(that.m_mutex);
 	BeesStats that_copy;
 	that_copy.m_stats_map = that.m_stats_map;
-	this_lock.unlock();
 	that_lock.unlock();
+
 	for (auto i : that.m_stats_map) {
 		if (i.second != 0) {
 			this_copy.at(i.first) -= i.second;
@@ -414,6 +439,7 @@ BeesTempFile::create()
 	BEESNOTE("creating temporary file in " << m_ctx->root_path());
 	BEESTOOLONG("creating temporary file in " << m_ctx->root_path());

+	Timer create_timer;
 	DIE_IF_MINUS_ONE(m_fd = openat(m_ctx->root_fd(), ".", FLAGS_OPEN_TMPFILE, S_IRUSR | S_IWUSR));
 	BEESCOUNT(tmp_create);

@@ -421,18 +447,22 @@ BeesTempFile::create()
 	// Resolves won't work there anyway.  There are lots of tempfiles
 	// and they're short-lived, so this ends up being just a memory leak
 	// m_ctx->blacklist_add(BeesFileId(m_fd));
+
+	// Put this inode in the cache so we can resolve it later
 	m_ctx->insert_root_ino(m_fd);

 	// Set compression attribute
-	int flags = 0;
-	BEESTRACE("Getting FS_COMPR_FL on m_fd " << name_fd(m_fd) << " flags " << to_hex(flags));
-	DIE_IF_MINUS_ONE(ioctl(m_fd, FS_IOC_GETFLAGS, &flags));
+	BEESTRACE("Getting FS_COMPR_FL on m_fd " << name_fd(m_fd));
+	int flags = ioctl_iflags_get(m_fd);
 	flags |= FS_COMPR_FL;
 	BEESTRACE("Setting FS_COMPR_FL on m_fd " << name_fd(m_fd) << " flags " << to_hex(flags));
-	DIE_IF_MINUS_ONE(ioctl(m_fd, FS_IOC_SETFLAGS, &flags));
+	ioctl_iflags_set(m_fd, flags);

 	// Always leave first block empty to avoid creating a file with an inline extent
 	m_end_offset = BLOCK_SIZE_CLONE;
+
+	// Count time spent here
+	BEESCOUNTADD(tmp_create_ms, create_timer.age() * 1000);
 }

 void
@@ -446,11 +476,15 @@ BeesTempFile::resize(off_t offset)
 	THROW_CHECK2(invalid_argument, m_end_offset, offset, m_end_offset < offset);

 	// Truncate
+	Timer resize_timer;
 	DIE_IF_NON_ZERO(ftruncate(m_fd, offset));
 	BEESCOUNT(tmp_resize);

 	// Success
 	m_end_offset = offset;
+
+	// Count time spent here
+	BEESCOUNTADD(tmp_resize_ms, resize_timer.age() * 1000);
 }

 BeesTempFile::BeesTempFile(shared_ptr<BeesContext> ctx) :
@@ -462,7 +496,7 @@ BeesTempFile::BeesTempFile(shared_ptr<BeesContext> ctx) :

 void
 BeesTempFile::realign()
-{ 
+{
 	if (m_end_offset > BLOCK_SIZE_MAX_TEMP_FILE) {
 		BEESLOG("temporary file size " << to_hex(m_end_offset) << " > max " << BLOCK_SIZE_MAX_TEMP_FILE);
 		BEESCOUNT(tmp_trunc);
@@ -519,6 +553,7 @@ BeesTempFile::make_copy(const BeesFileRange &src)
 	auto end = m_end_offset + src.size();
 	resize(end);

+	Timer copy_timer;
 	BeesFileRange rv(m_fd, begin, end);
 	BEESTRACE("copying to: " << rv);
 	BEESNOTE("copying " << src << " to " << rv);
@@ -544,18 +579,30 @@ BeesTempFile::make_copy(const BeesFileRange &src)
 		src_p += len;
 		dst_p += len;
 	}
+	BEESCOUNTADD(tmp_copy_ms, copy_timer.age() * 1000);

 	// We seem to get lockups without this!
 	if (did_block_write) {
+#if 1
+		// Is this fixed by "Btrfs: fix deadlock between dedup on same file and starting writeback"?
+		// No.
 		bees_sync(m_fd);
+#endif
 	}

 	BEESCOUNT(tmp_copy);
 	return rv;
 }

+unsigned
+bees_worker_thread_count()
+{
+	// Maybe # of cores * (scalar from 0.25..4)?
+	return max(1U, thread::hardware_concurrency() * 4);
+}
+
 int
-bees_main(ArgList args)
+bees_main(int argc, char *argv[])
 {
 	set_catch_explainer([&](string s) {
 		BEESLOG("\n\n*** EXCEPTION ***\n\t" << s << "\n***\n");
@@ -568,12 +615,79 @@ bees_main(ArgList args)
 	list<shared_ptr<BeesContext>> all_contexts;
 	shared_ptr<BeesContext> bc;

+	THROW_CHECK1(invalid_argument, argc, argc >= 0);
+
+	string cwd(readlink_or_die("/proc/self/cwd"));
+
+	// Defaults
+	bool chatter_prefix_timestamp = true;
+
+	// Parse options
+	int c;
+	while (1) {
+		int option_index = 0;
+		static struct option long_options[] = {
+			{ "timestamps",     no_argument, NULL, 't' },
+			{ "notimestamps",   no_argument, NULL, 'T' },
+			{ "absolute-paths", no_argument, NULL, 'p' },
+			{ "relative-paths", no_argument, NULL, 'P' },
+			{ "help",           no_argument, NULL, 'h' },
+			// getopt_long() requires the table to end with an all-zero entry
+			{ NULL,             0,           NULL, 0   }
+		};
+
+		c = getopt_long(argc, argv, "TtPph", long_options, &option_index);
+		if (-1 == c) {
+			break;
+		}
+
+		switch (c) {
+			case 'T':
+				chatter_prefix_timestamp = false;
+				break;
+			case 't':
+				chatter_prefix_timestamp = true;
+				break;
+			case 'P':
+				crucible::set_relative_path(cwd);
+				break;
+			case 'p':
+				crucible::set_relative_path("");
+				break;
+			case 'h':
+				do_cmd_help(argv); // fallthrough
+			default:
+				return 2;
+		}
+	}
+
+	Chatter::enable_timestamp(chatter_prefix_timestamp);
+
+	if (!relative_path().empty()) {
+		BEESLOG("using relative path " << relative_path() << "\n");
+	}
+
+	// There can be only one because we measure running time with it
+	// EXPERIMENT:  don't try this on kernels before v4.14
+	// bees_ioctl_lock_set.max_size(1);
+
+	BEESLOG("setting rlimit NOFILE to " << BEES_OPEN_FILE_LIMIT);
+
+	struct rlimit lim = {
+		.rlim_cur = BEES_OPEN_FILE_LIMIT,
+		.rlim_max = BEES_OPEN_FILE_LIMIT,
+	};
+	int rv = setrlimit(RLIMIT_NOFILE, &lim);
+	if (rv) {
+		BEESLOG("setrlimit(RLIMIT_NOFILE, { " << lim.rlim_cur << " }): " << strerror(errno));
+	};
+
 	// Create a context and start crawlers
 	bool did_subscription = false;
-	for (string arg : args) {
+	while (optind < argc) {
 		catch_all([&]() {
 			bc = make_shared<BeesContext>(bc);
-			bc->set_root_path(arg);
+			bc->set_root_path(argv[optind++]);
 			did_subscription = true;
 		});
 	}
@@ -594,7 +706,7 @@ bees_main(ArgList args)
 }

 int
-main(int argc, const char **argv)
+main(int argc, char *argv[])
 {
 	cerr << "bees version " << BEES_VERSION << endl;

@@ -603,11 +715,9 @@ main(int argc, const char **argv)
 		return 2;
 	}

-	ArgList args(argv + 1);
-
 	int rv = 1;
 	catch_and_explain([&]() {
-		rv = bees_main(args);
+		rv = bees_main(argc, argv);
 	});
 	return rv;
 }
--- a/src/bees.h
+++ b/src/bees.h
@@ -1,7 +1,6 @@
 #ifndef BEES_H
 #define BEES_H

-#include "crucible/bool.h"
 #include "crucible/cache.h"
 #include "crucible/chatter.h"
 #include "crucible/error.h"
@@ -40,13 +39,6 @@ const off_t BLOCK_SIZE_MAX_EXTENT_SAME = 4096 * 4096;
 // Maximum length of a compressed extent in bytes
 const off_t BLOCK_SIZE_MAX_COMPRESSED_EXTENT = 128 * 1024;

-// Try to combine smaller extents into larger ones
-const off_t BLOCK_SIZE_MIN_EXTENT_DEFRAG = BLOCK_SIZE_MAX_COMPRESSED_EXTENT;
-
-// Avoid splitting extents that are already too small
-const off_t BLOCK_SIZE_MIN_EXTENT_SPLIT = BLOCK_SIZE_MAX_COMPRESSED_EXTENT;
-// const off_t BLOCK_SIZE_MIN_EXTENT_SPLIT = 1024LL * 1024 * 1024 * 1024;
-
 // Maximum length of any extent in bytes
 // except we've seen 1.03G extents...
 // ...FIEMAP is slow and full of lies
@@ -55,8 +47,6 @@ const off_t BLOCK_SIZE_MAX_EXTENT = 128 * 1024 * 1024;
 // Masks, so we don't have to write "(BLOCK_SIZE_CLONE - 1)" everywhere
 const off_t BLOCK_MASK_CLONE = BLOCK_SIZE_CLONE - 1;
 const off_t BLOCK_MASK_SUMS = BLOCK_SIZE_SUMS - 1;
-const off_t BLOCK_MASK_MMAP = BLOCK_SIZE_MMAP - 1;
-const off_t BLOCK_MASK_MAX_COMPRESSED_EXTENT = BLOCK_SIZE_MAX_COMPRESSED_EXTENT * 2 - 1;

 // Maximum temporary file size
 const off_t BLOCK_SIZE_MAX_TEMP_FILE = 1024 * 1024 * 1024;
@@ -70,29 +60,43 @@ const off_t BLOCK_SIZE_HASHTAB_EXTENT = 16 * 1024 * 1024;
 // Bytes per second we want to flush (8GB every two hours)
 const double BEES_FLUSH_RATE = 8.0 * 1024 * 1024 * 1024 / 7200.0;

-// Interval between writing non-hash-table things to disk (15 minutes)
-const int BEES_WRITEBACK_INTERVAL = 900;
+// How long we should wait for new btrfs transactions
+const double BEES_COMMIT_INTERVAL = 900;
+
+// Interval between writing non-hash-table things to disk, and starting new subvol crawlers
+const int BEES_WRITEBACK_INTERVAL = BEES_COMMIT_INTERVAL;

 // Statistics reports while scanning
 const int BEES_STATS_INTERVAL = 3600;

 // Progress shows instantaneous rates and thread status
-const int BEES_PROGRESS_INTERVAL = 3600;
+const int BEES_PROGRESS_INTERVAL = BEES_STATS_INTERVAL;

 // Status is output every freakin second.  Use a ramdisk.
 const int BEES_STATUS_INTERVAL = 1;

+// Number of file FDs to cache when not in active use
+const size_t BEES_FILE_FD_CACHE_SIZE = 4096;
+
+// Number of root FDs to cache when not in active use
+const size_t BEES_ROOT_FD_CACHE_SIZE = 1024;
+
+// Number of FDs to open (rlimit)
+const size_t BEES_OPEN_FILE_LIMIT = (BEES_FILE_FD_CACHE_SIZE + BEES_ROOT_FD_CACHE_SIZE) * 2 + 100;
+
+// Worker thread limit (more threads may be created, but only this number will be active concurrently)
+const size_t BEES_WORKER_THREAD_LIMIT = 128;
+
 // Log warnings when an operation takes too long
 const double BEES_TOO_LONG = 2.5;

 // Avoid any extent where LOGICAL_INO takes this long
-const double BEES_TOXIC_DURATION = 9.9;
-
-// How long we should wait for new btrfs transactions
-const double BEES_COMMIT_INTERVAL = 900;
+// const double BEES_TOXIC_DURATION = 9.9;
+// EXPERIMENT:  Kernel v4.14+ may let us ignore toxicity
+const double BEES_TOXIC_DURATION = BEES_COMMIT_INTERVAL;

 // How long between hash table histograms
-const double BEES_HASH_TABLE_ANALYZE_INTERVAL = 3600;
+const double BEES_HASH_TABLE_ANALYZE_INTERVAL = BEES_STATS_INTERVAL;

 // Rate limiting of informational messages
 const double BEES_INFO_RATE = 10.0;
@@ -102,7 +106,7 @@ const double BEES_INFO_BURST = 1.0;
 const size_t BEES_MAX_QUEUE_SIZE = 1024;

 // Read this many items at a time in SEARCHv2
-const size_t BEES_MAX_CRAWL_SIZE = 4096;
+const size_t BEES_MAX_CRAWL_SIZE = 1024;

 // If an extent has this many refs, pretend it does not exist
 // to avoid a crippling btrfs performance bug
@@ -156,12 +160,12 @@ class BeesStatTmpl {
 	map<string, T>	m_stats_map;
 	mutable mutex	m_mutex;

+	T& at(string idx);
 public:
 	BeesStatTmpl() = default;
 	BeesStatTmpl(const BeesStatTmpl &that);
 	BeesStatTmpl &operator=(const BeesStatTmpl &that);
 	void add_count(string idx, size_t amount = 1);
-	T& at(string idx);
 	T at(string idx) const;

 friend ostream& operator<< <>(ostream &os, const BeesStatTmpl<T> &bs);
@@ -185,7 +189,7 @@ class BeesTracer {
 	function<void()> m_func;
 	BeesTracer *m_next_tracer = 0;
 	
-	thread_local static BeesTracer *s_next_tracer;
+	thread_local static BeesTracer *tl_next_tracer;
 public:
 	BeesTracer(function<void()> f);
 	~BeesTracer();
@@ -201,8 +205,8 @@ class BeesNote {
 	static mutex			s_mutex;
 	static map<pid_t, BeesNote*>	s_status;

-	thread_local static BeesNote	*s_next;
-	thread_local static string	s_name;
+	thread_local static BeesNote	*tl_next;
+	thread_local static string	tl_name;

 public:
 	BeesNote(function<void(ostream &)> f);
@@ -252,15 +256,14 @@ ostream& operator<<(ostream &os, const BeesFileId &bfi);

 class BeesFileRange {
 protected:
-	static mutex		s_mutex;
 	mutable Fd		m_fd;
 	mutable BeesFileId	m_fid;
-	off_t			m_begin, m_end;
-	mutable off_t		m_file_size;
+	off_t			m_begin = 0, m_end = 0;
+	mutable off_t		m_file_size = -1;

 public:

-	BeesFileRange();
+	BeesFileRange() = default;
 	BeesFileRange(Fd fd, off_t begin, off_t end);
 	BeesFileRange(const BeesFileId &fid, off_t begin, off_t end);
 	BeesFileRange(const BeesBlockData &bbd);
@@ -440,19 +443,24 @@ private:
 	uint64_t		m_buckets;
 	uint64_t		m_extents;
 	uint64_t		m_cells;
-	set<uint64_t>		m_buckets_dirty;
-	set<uint64_t>		m_buckets_missing;
 	BeesThread  		m_writeback_thread;
 	BeesThread	        m_prefetch_thread;
 	RateLimiter		m_flush_rate_limit;
-	RateLimiter		m_prefetch_rate_limit;
-	mutex			m_extent_mutex;
-	mutex			m_bucket_mutex;
-	condition_variable	m_condvar;
 	set<HashType>		m_toxic_hashes;
 	BeesStringFile		m_stats_file;

-	LockSet<uint64_t> 	m_extent_lock_set;
+	// Mutex/condvar for the writeback thread
+	mutex			m_dirty_mutex;
+	condition_variable	m_dirty_condvar;
+
+	// Per-extent structures
+	struct ExtentMetaData {
+		shared_ptr<mutex> m_mutex_ptr;		// Access serializer
+		bool	m_dirty = false;	// Needs to be written back to disk
+		bool	m_missing = true;	// Needs to be read from disk
+		ExtentMetaData();
+	};
+	vector<ExtentMetaData>	m_extent_metadata;

 	void open_file();
 	void writeback_loop();
@@ -460,11 +468,17 @@ private:
 	void try_mmap_flags(int flags);
 	pair<Cell *, Cell *> get_cell_range(HashType hash);
 	pair<uint8_t *, uint8_t *> get_extent_range(HashType hash);
-	void fetch_missing_extent(HashType hash);
-	void set_extent_dirty(HashType hash);
+	void fetch_missing_extent_by_hash(HashType hash);
+	void fetch_missing_extent_by_index(uint64_t extent_index);
+	void set_extent_dirty_locked(uint64_t extent_index);
 	void flush_dirty_extents();
+	bool flush_dirty_extent(uint64_t extent_index);
 	bool is_toxic_hash(HashType h) const;

+	size_t			hash_to_extent_index(HashType ht);
+	unique_lock<mutex>	lock_extent_by_hash(HashType ht);
+	unique_lock<mutex>	lock_extent_by_index(uint64_t extent_index);
+
 	BeesHashTable(const BeesHashTable &) = delete;
 	BeesHashTable &operator=(const BeesHashTable &) = delete;
 };
@@ -487,35 +501,40 @@ class BeesCrawl {

 	mutex					m_mutex;
 	set<BeesFileRange>			m_extents;
-	DefaultBool				m_deferred;
+	bool					m_deferred = false;

 	mutex					m_state_mutex;
 	BeesCrawlState				m_state;

+	BeesThread				m_thread;
+	bool					m_stopped = false;
+	condition_variable			m_cond_stopped;
+
 	bool fetch_extents();
 	void fetch_extents_harder();
 	bool next_transid();

 public:
+	~BeesCrawl();
 	BeesCrawl(shared_ptr<BeesContext> ctx, BeesCrawlState initial_state);
 	BeesFileRange peek_front();
 	BeesFileRange pop_front();
 	BeesCrawlState get_state();
 	void set_state(const BeesCrawlState &bcs);
+	void crawl_thread();
 };

 class BeesRoots {
 	shared_ptr<BeesContext>			m_ctx;

 	BeesStringFile				m_crawl_state_file;
-	BeesCrawlState				m_crawl_current;
 	map<uint64_t, shared_ptr<BeesCrawl>>	m_root_crawl_map;
 	mutex					m_mutex;
 	condition_variable			m_condvar;
-	DefaultBool				m_crawl_dirty;
+	bool					m_crawl_dirty = false;
 	Timer					m_crawl_timer;
-	BeesThread				m_crawl_thread;
 	BeesThread				m_writeback_thread;
+	LockSet<uint64_t>			m_lock_set;

 	void insert_new_crawl();
 	void insert_root(const BeesCrawlState &bcs);
@@ -530,7 +549,6 @@ class BeesRoots {
 	BeesCrawlState crawl_state_get(uint64_t root);
 	void crawl_state_set_dirty();
 	void crawl_state_erase(const BeesCrawlState &bcs);
-	void crawl_thread();
 	void writeback_thread();
 	uint64_t next_root(uint64_t root = 0);
 	void current_state_set(const BeesCrawlState &bcs);
@@ -543,6 +561,7 @@ public:
 	Fd open_root(uint64_t root);
 	Fd open_root_ino(uint64_t root, uint64_t ino);
 	Fd open_root_ino(const BeesFileId &bfi) { return open_root_ino(bfi.root(), bfi.ino()); }
+	LockSet<uint64_t> &lock_set() { return m_lock_set; }
 };

 struct BeesHash {
@@ -568,7 +587,7 @@ class BeesBlockData {
 	mutable BeesAddress	m_addr;
 	mutable Blob		m_data;
 	mutable BeesHash	m_hash;
-	mutable DefaultBool	m_hash_done;
+	mutable bool		m_hash_done = false;

 public:
 	// Constructor with the immutable fields
@@ -606,42 +625,6 @@ public:
 friend ostream & operator<<(ostream &os, const BeesRangePair &brp);
 };

-class BeesWorkQueueBase {
-	string 				m_name; 
-
-protected:
-	static mutex			s_mutex;
-	static set<BeesWorkQueueBase *>	s_all_workers;
-
-public:
-	virtual ~BeesWorkQueueBase();
-	BeesWorkQueueBase(const string &name);
-
-	string name() const;
-	void name(const string &new_name);
-
-	virtual size_t active_size() const = 0;
-	virtual list<string> peek_active(size_t count) const = 0;
- 
-	static void for_each_work_queue(function<void(BeesWorkQueueBase *)> f);
-};
-
-template <class Task>
-class BeesWorkQueue : public BeesWorkQueueBase {
-	WorkQueue<Task>				m_active_queue;
-
-public:
-	BeesWorkQueue(const string &name);
-	~BeesWorkQueue();
-	void push_active(const Task &task, size_t limit);
-	void push_active(const Task &task);
-
-	size_t active_size() const override;
-	list<string> peek_active(size_t count) const override;
-
-	Task pop();
-};
-
 class BeesTempFile {
 	shared_ptr<BeesContext> m_ctx;
 	Fd			m_fd;
@@ -661,6 +644,7 @@ class BeesFdCache {
 	LRUCache<Fd, shared_ptr<BeesContext>, uint64_t>			m_root_cache;
 	LRUCache<Fd, shared_ptr<BeesContext>, uint64_t, uint64_t>	m_file_cache;
 	Timer								m_root_cache_timer;
+	Timer								m_file_cache_timer;

 public:
 	BeesFdCache();
@@ -672,7 +656,7 @@ public:
 struct BeesResolveAddrResult {
 	BeesResolveAddrResult();
 	vector<BtrfsInodeOffsetRoot> m_biors;
-	DefaultBool m_is_toxic;
+	bool m_is_toxic = false;
 	bool is_toxic() const { return m_is_toxic; }
 };

@@ -700,6 +684,8 @@ class BeesContext : public enable_shared_from_this<BeesContext> {

 	Timer						m_total_timer;

+	LockSet<uint64_t>			m_extent_lock_set;
+
 	void set_root_fd(Fd fd);

 	BeesResolveAddrResult resolve_addr_uncached(BeesAddress addr);
@@ -737,6 +723,7 @@ public:
 	shared_ptr<BeesTempFile> tmpfile();

 	const Timer &total_timer() const { return m_total_timer; }
+	LockSet<uint64_t> &extent_lock_set() { return m_extent_lock_set; }

 	// TODO: move the rest of the FD cache methods here
 	void insert_root_ino(Fd fd);
@@ -750,22 +737,22 @@ class BeesResolver {
 	unsigned				m_bior_count;

 	// We found matching data, so we can dedup
-	DefaultBool				m_found_data;
+	bool					m_found_data = false;

 	// We found matching data, so we *did* dedup
-	DefaultBool				m_found_dup;
+	bool					m_found_dup = false;

 	// We found matching hash, so the hash table is still correct
-	DefaultBool				m_found_hash;
+	bool					m_found_hash = false;

 	// We found matching physical address, so the hash table isn't totally wrong
-	DefaultBool				m_found_addr;
+	bool					m_found_addr = false;

 	// We found matching physical address, but data did not match
-	DefaultBool				m_wrong_data;
+	bool					m_wrong_data = false;

 	// The whole thing is a placebo to avoid crippling btrfs performance bugs
-	DefaultBool				m_is_toxic;
+	bool					m_is_toxic = false;

 	BeesFileRange chase_extent_ref(const BtrfsInodeOffsetRoot &bior, BeesBlockData &needle_bbd);
 	BeesBlockData adjust_offset(const BeesFileRange &haystack, const BeesBlockData &needle);
@@ -819,9 +806,12 @@ public:
 };

 // And now, a giant pile of extern declarations
+extern const char *BEES_VERSION;
 string pretty(double d);
 extern RateLimiter bees_info_rate_limit;
 void bees_sync(int fd);
 string format_time(time_t t);
+extern LockSet<pid_t> bees_ioctl_lock_set;
+extern unsigned bees_worker_thread_count();

 #endif
--- a/test/Makefile
+++ b/test/Makefile
@@ -1,9 +1,7 @@
 PROGRAMS = \
 	chatter \
 	crc64 \
-	execpipe \
 	fd \
-	interp \
 	limits \
 	path \
 	process \
@@ -21,7 +19,7 @@ LDFLAGS = -L../lib -Wl,-rpath=$(shell realpath ../lib)
 depends.mk: *.cc
 	for x in *.cc; do $(CXX) $(CXXFLAGS) -M "$$x"; done >> depends.mk.new
 	mv -fv depends.mk.new depends.mk
-                                       
+
 -include depends.mk

 %.o: %.cc %.h ../makeflags
--- a/test/execpipe.cc
+++ b/test/execpipe.cc
@@ -1,64 +0,0 @@
-#include "tests.h"
-
-#include "crucible/execpipe.h"
-
-#include <ios>
-#include <cassert>
-#include <cstring>
-#include <cstdlib>
-#include <stdexcept>
-
-#include <unistd.h>
-
-using namespace crucible;
-using namespace std;
-
-#if 1 // Needs rework
-static inline
-void
-test_hello_world()
-{
-	// alarm(9);
-	Fd fd = popen([]() { return system("echo Hello, World!"); });
-	char buf[1024];
-	size_t rv = -1;
-	read_partial_or_die(fd, buf, rv);
-	assert(rv > 0);
-	string b(buf, buf + rv - 1);
-	// cerr << "hello_world says: '" << b << "'" << endl;
-	assert(b == "Hello, World!");
-}
-
-static inline
-void
-test_read_limit(size_t limit = 4096)
-{
-	alarm(9);
-	Fd fd = popen([]() { return system("yes Hello!"); });
-	try {
-		string b = read_all(fd, limit);
-	} catch (out_of_range &re) {
-		return;
-	}
-	assert(!"no exception thrown by read_all");
-}
-#endif
-
-namespace crucible {
-	extern bool assert_no_leaked_fds();
-};
-
-int
-main(int, char**)
-{
-#if 1
-	RUN_A_TEST(test_hello_world());
-	assert(assert_no_leaked_fds());
-	RUN_A_TEST(test_read_limit(4095));
-	RUN_A_TEST(test_read_limit(4096));
-	RUN_A_TEST(test_read_limit(4097));
-	assert(assert_no_leaked_fds());
-#endif
-
-	exit(EXIT_SUCCESS);
-}
--- a/test/interp.cc
+++ b/test/interp.cc
@@ -1,88 +0,0 @@
-#include "tests.h"
-
-#include "crucible/interp.h"
-
-using namespace crucible;
-using namespace std;
-
-/***********************************************************************
-
-How this should work:
-
-Interpreter reads an arg list:
-
-	argv[0] --method0args --method1arg arg1 --method1arg=arg1 -- args...
-
-argv[0] should look up a shared_ptr<Command> which creates an object of
-type shared_ptr<Process>.  This object is used to receive args by
-method calls or one at a time.
-
-<Command> and <Process> can be the same object, or not.
-
-Process p methods:
-
-	p->spawn(Interp*) -> Process
-	p->exec(ArgList) -> Process / Result
-	p->method (from ArgParser<>)
-		p->finish() -> void (destroys object without early destruction warnings...?)
-		p->~Process() -> complains loudly if finish() not called first...?
-
-Result might be a pair of Process, string.  Or just string.
-
-ArgParser should be more like GetOpt:
-
-	build a dictionary and an arg list from arguments
-	Process methods should interrogate ArgParser
-	ArgParser might have a table of boolean and string option names so it can reject invalid options
-		but if it had that, we could also pass in Process and have it call methods on it
-		...but that is a _lot_ of pointer-hiding when we could KISS
-		...but if we had that solved, argparser tables look like lists of method names
-	ArgParser<T> has a table of names and methods on object of type T
-		ArgParser hides everything behind void* and hands off to a compiled implementation to do callbacks
-
-Extreme simplification:  arguments are themselves executable
-
-	so '--method_foo arg' really means construct MethodFoo(arg) and cast to shared_ptr<ProcArg>
-	then Process->invokeSomething(ProcArg)
-	too extreme, use argparser instead
-
-***********************************************************************/
-
-void
-test_arg_parser()
-{
-	ArgParser ap;
-	ArgList al( { "abc", "--def", "ghi" } );
-	ap.parse(NULL, al);
-}
-
-struct Thing {
-	int m_i;
-	double m_d;
-	string m_s;
-
-	void set_i(int i) { cerr << "i = " << i << endl; m_i = i; }
-	void set_d(double d) { cerr << "d = " << d << endl; m_d = d; }
-	void set_s(string s) { cerr << "s = " << s << endl; m_s = s; }
-};
-
-template <typename F, typename T, typename A>
-void
-assign(T& t, F f, A a)
-{
-	cerr << __PRETTY_FUNCTION__ << " - a = " << a << endl;
-	(t.*f)(a);
-}
-
-int
-main(int, char**)
-{
-	RUN_A_TEST(test_arg_parser());
-
-	Thing p;
-	assign(p, &Thing::set_i, 5);
-
-	cerr << "p.m_i = " << p.m_i << endl;
-
-	exit(EXIT_SUCCESS);
-}