From e8298570ed09341b2531c7952b0f341604d39472 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Sun, 30 Sep 2018 01:08:29 -0400 Subject: [PATCH] README: split into sections, reformat for github.io Split the rather large README into smaller sections with a pitch and a ToC at the top. Move the sections into docs/ so that Github Pages can read them. 'make doc' produces a local HTML tree. Update the kernel bugs and gotchas list. Add some information that has been accumulating in Github comments. Remove information about bugs in kernels earlier than 4.14. Signed-off-by: Zygo Blaxell --- Makefile | 14 +- README.md | 617 +++---------------------------------------- docs/.gitignore | 1 + docs/Makefile | 8 + docs/_config.yml | 1 + docs/btrfs-kernel.md | 56 ++++ docs/btrfs-other.md | 53 ++++ docs/config.md | 151 +++++++++++ docs/gotchas.md | 113 ++++++++ docs/how-it-works.md | 100 +++++++ docs/index.md | 73 +++++ docs/install.md | 91 +++++++ docs/missing.md | 50 ++++ docs/options.md | 52 ++++ docs/running.md | 92 +++++++ 15 files changed, 888 insertions(+), 584 deletions(-) create mode 100644 docs/.gitignore create mode 100644 docs/Makefile create mode 100644 docs/_config.yml create mode 100644 docs/btrfs-kernel.md create mode 100644 docs/btrfs-other.md create mode 100644 docs/config.md create mode 100644 docs/gotchas.md create mode 100644 docs/how-it-works.md create mode 100644 docs/index.md create mode 100644 docs/install.md create mode 100644 docs/missing.md create mode 100644 docs/options.md create mode 100644 docs/running.md diff --git a/Makefile b/Makefile index a065a97..49f397b 100644 --- a/Makefile +++ b/Makefile @@ -7,8 +7,6 @@ LIBEXEC_PREFIX ?= $(LIB_PREFIX)/bees SYSTEMD_SYSTEM_UNIT_DIR ?= $(shell pkg-config systemd --variable=systemdsystemunitdir) -MARKDOWN := $(firstword $(shell type -P markdown markdown2 markdown_py 2>/dev/null || echo markdown)) - BEES_VERSION ?= $(shell git describe --always --dirty || echo UNKNOWN) # allow local configuration to override above variables @@ -25,13 +23,12 @@ include Defines.mk default: $(DEFAULT_MAKE_TARGET) all: lib src scripts -docs: README.html -reallyall: all docs test +reallyall: all doc test clean: ## Cleanup git clean -dfx -e localconf -.PHONY: lib src test +.PHONY: lib src test doc lib: ## Build libs $(MAKE) -C lib @@ -44,15 +41,14 @@ test: ## Run tests test: lib src $(MAKE) -C test +doc: ## Build docs + $(MAKE) -C docs + scripts/%: scripts/%.in $(TEMPLATE_COMPILER) scripts: scripts/beesd scripts/beesd@.service -README.html: README.md - $(MARKDOWN) README.md > README.html.new - mv -f README.html.new README.html - install_libs: lib install -Dm644 lib/libcrucible.so $(DESTDIR)$(LIB_PREFIX)/libcrucible.so diff --git a/README.md b/README.md index 4a20d39..7243b94 100644 --- a/README.md +++ b/README.md @@ -1,591 +1,60 @@ BEES ==== -Best-Effort Extent-Same, a btrfs dedup agent. +Best-Effort Extent-Same, a btrfs deduplication agent. -About Bees +About bees ---------- -Bees is a block-oriented userspace dedup agent designed to avoid -scalability problems on large filesystems. +bees is a block-oriented userspace deduplication agent designed for large +btrfs filesystems. It is an offline dedupe combined with an incremental +data scan capability to minimize time data spends on disk from write +to dedupe. -Bees is designed to degrade gracefully when underprovisioned with RAM. -Bees does not use more RAM or storage as filesystem data size increases. -The dedup hash table size is fixed at creation time and does not change. 
-The effective dedup block size is dynamic and adjusts automatically to -fit the hash table into the configured RAM limit. Hash table overflow -is not implemented to eliminate the IO overhead of hash table overflow. -Hash table entries are only 16 bytes per dedup block to keep the average -dedup block size small. - -Bees does not require alignment between dedup blocks or extent boundaries -(i.e. it can handle any multiple-of-4K offset between dup block pairs). -Bees rearranges blocks into shared and unique extents if required to -work within current btrfs kernel dedup limitations. - -Bees can dedup any combination of compressed and uncompressed extents. - -Bees operates in a single pass which removes duplicate extents immediately -during scan. There are no separate scanning and dedup phases. - -Bees uses only data-safe btrfs kernel operations, so it can dedup live -data (e.g. build servers, sqlite databases, VM disk images). It does -not modify file attributes or timestamps. - -Bees does not store any information about filesystem structure, so it is -not affected by the number or size of files (except to the extent that -these cause performance problems for btrfs in general). It retrieves such -information on demand through btrfs SEARCH_V2 and LOGICAL_INO ioctls. -This eliminates the storage required to maintain the equivalents of -these functions in userspace. It's also why bees has no XFS support. - -Bees is a daemon designed to run continuously and maintain its state -across crahes and reboots. Bees uses checkpoints for persistence to -eliminate the IO overhead of a transactional data store. On restart, -bees will dedup any data that was added to the filesystem since the -last checkpoint. - -Bees is used to dedup filesystems ranging in size from 16GB to 35TB, with -hash tables ranging in size from 128MB to 11GB. - -How Bees Works --------------- - -Bees uses a fixed-size persistent dedup hash table with a variable dedup -block size. Any size of hash table can be dedicated to dedup. Bees will -scale the dedup block size to fit the filesystem's unique data size -using a weighted sampling algorithm. This allows Bees to adapt itself -to its filesystem size without forcing admins to do math at install time. -At the same time, the duplicate block alignment constraint can be as low -as 4K, allowing efficient deduplication of files with narrowly-aligned -duplicate block offsets (e.g. compiled binaries and VM/disk images) -even if the effective block size is much larger. - -The Bees hash table is loaded into RAM at startup (using hugepages if -available), mlocked, and synced to persistent storage by trickle-writing -over a period of several hours. This avoids issues related to seeking -or fragmentation, and enables the hash table to be efficiently stored -on Btrfs with compression (or an ext4 filesystem, or a raw disk, or -on CIFS...). - -Once a duplicate block is identified, Bees examines the nearby blocks -in the files where block appears. This allows Bees to find long runs -of adjacent duplicate block pairs if it has an entry for any one of -the blocks in its hash table. The stored hash entry plus the block -recently scanned from disk form a duplicate pair. On typical data sets, -this means most of the blocks in the hash table are redundant and can -be discarded without significant performance impact. - -Hash table entries are grouped together into LRU lists. As each block -is scanned, its hash table entry is inserted into the LRU list at a -random position. 
If the LRU list is full, the entry at the end of the -list is deleted. If a hash table entry is used to discover duplicate -blocks, the entry is moved to the beginning of the list. This makes Bees -unable to detect a small number of duplicates (less than 1% on typical -filesystems), but it dramatically improves efficiency on filesystems -with many small files. Bees has found a net 13% more duplicate bytes -than a naive fixed-block-size algorithm with a 64K block size using the -same size of hash table, even after discarding 1% of the duplicate bytes. - -Hash Table Sizing ------------------ - -Hash table entries are 16 bytes each (64-bit hash, 52-bit block number, -and some metadata bits). Each entry represents a minimum of 4K on disk. - - unique data size hash table size average dedup block size - 1TB 4GB 4K - 1TB 1GB 16K - 1TB 256MB 64K - 1TB 16MB 1024K - 64TB 1GB 1024K - -To change the size of the hash table, use 'truncate' to change the hash -table size, delete `beescrawl.dat` so that bees will start over with a -fresh full-filesystem rescan, and restart `bees`. - -Things You Might Expect That Bees Doesn't Have ----------------------------------------------- - -* There's no configuration file (patches welcome!). There are some tunables -hardcoded in the source that could eventually become configuration options. -There's also an incomplete option parser (patches welcome!). - -* There's no way to *stop* the Bees daemon. Use SIGKILL, SIGTERM, or -Ctrl-C for now. Some of the destructors are unreachable and have never -been tested. Bees will repeat some work when restarted. - -* The Bees process doesn't fork and writes its log to stdout/stderr. -A shell wrapper is required to make it behave more like a daemon. - -* There's no facility to exclude any part of a filesystem (patches -welcome). - -* PREALLOC extents and extents containing blocks filled with zeros will -be replaced by holes unconditionally. - -* Duplicate block groups that are less than 12K in length can take 30% -of the run time while saving only 3% of the disk space. There should -be an option to just not bother with those. - -* There is a lot of duplicate reading of blocks in snapshots. Bees will -scan all snapshots at close to the same time to try to get better -performance by caching, but really fixing this requires rewriting the -crawler to scan the btrfs extent tree directly instead of the subvol -FS trees. - -* Block reads are currently more allocation- and CPU-intensive than they -should be, especially for filesystems on SSD where the IO overhead is -much smaller. This is a problem for power-constrained environments -(e.g. laptops with slow CPU). - -* Bees can currently fragment extents when required to remove duplicate -blocks, but has no defragmentation capability yet. When possible, Bees -will attempt to work with existing extent boundaries, but it will not -aggregate blocks together from multiple extents to create larger ones. - -* It is possible to resize the hash table without starting over with -a new full-filesystem scan; however, this has not been implemented yet. 
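That resize limitation aside, the truncate-based resize procedure from the
'Hash Table Sizing' section above is easy to script. A minimal sketch,
assuming bees has already been stopped and `/some/path` stands in for the
real `BEESHOME` (the 2GB size is arbitrary):

    # Resize the hash table (the size must remain a multiple of 16M),
    # then delete the crawl state so bees starts a fresh
    # full-filesystem scan when restarted.
    export BEESHOME=/some/path
    truncate -s 2g "$BEESHOME/beeshash.dat"
    rm "$BEESHOME/beescrawl.dat"
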
- -Good Btrfs Feature Interactions -------------------------------- - -Bees has been tested in combination with the following: - -* btrfs compression (zlib, lzo, zstd), mixtures of compressed and uncompressed extents -* PREALLOC extents (unconditionally replaced with holes) -* HOLE extents and btrfs no-holes feature -* Other deduplicators, reflink copies (though Bees may decide to redo their work) -* btrfs snapshots and non-snapshot subvols (RW and RO) -* Concurrent file modification (e.g. PostgreSQL and sqlite databases, build daemons) -* all btrfs RAID profiles (people ask about this, but it's irrelevant to bees) -* IO errors during dedup (read errors will throw exceptions, Bees will catch them and skip over the affected extent) -* Filesystems mounted *with* the flushoncommit option -* 4K filesystem data block size / clone alignment -* 64-bit and 32-bit host CPUs (amd64, x86, arm) -* Large (>16M) extents -* Huge files (>1TB--although Btrfs performance on such files isn't great in general) -* filesystems up to 25T bytes, 100M+ files -* btrfs receive -* btrfs nodatacow/nodatasum inode attribute or mount option (bees skips all nodatasum files) -* open(O_DIRECT) (seems to work as well--or as poorly--with bees as with any other btrfs feature) - -Bad Btrfs Feature Interactions ------------------------------- - -Bees has been tested in combination with the following, and various problems are known: - -* bcache, lvmcache: *severe (filesystem-destroying) metadata corruption - issues* observed in testing and reported by users, apparently only when - used with bees. Plain SSD and HDD seem to be OK. -* btrfs send: sometimes aborts with an I/O error when bees changes the - data layout during a send. The send can be restarted and will work - if bees has finished processing the snapshot being sent. No data - corruption observed other than the truncated send. -* btrfs qgroups: very slow, sometimes hangs -* btrfs autodefrag mount option: hangs and high CPU usage problems - reported by users. bees cannot distinguish autodefrag activity from - normal filesystem activity and will likely try to undo the autodefrag, - so it should probably be turned off for bees in any case. - -Untested Btrfs Feature Interactions ------------------------------------ - -Bees has not been tested with the following, and undesirable interactions may occur: - -* Non-4K filesystem data block size (should work if recompiled) -* Non-equal hash (SUM) and filesystem data block (CLONE) sizes (probably never will work) -* btrfs seed filesystems (does anyone even use those?) -* btrfs out-of-tree kernel patches (e.g. in-band dedup or encryption) -* btrfs-convert from ext2/3/4 (never tested, might run out of space or ignore significant portions of the filesystem due to sanity checks) -* btrfs mixed block groups (don't know a reason why it would *not* work, but never tested) -* Filesystems mounted *without* the flushoncommit option (don't know the impact of crashes during dedup writes vs. ordinary writes) - -Other Caveats -------------- - -* btrfs balance will invalidate parts of the dedup hash table. Bees will - happily rebuild the table, but it will have to scan all the blocks - again. - -* btrfs defrag will cause Bees to rescan the defragmented file. If it - contained duplicate blocks and other references to the original - fragmented duplicates still exist, Bees will replace the defragmented - extents with the original fragmented ones. 
- -* Bees creates temporary files (with O_TMPFILE) and uses them to split - and combine extents elsewhere in btrfs. These will take up to 2GB - of disk space per thread during normal operation. - -* Like all deduplicators, Bees will replace data blocks with metadata - references. It is a good idea to ensure there is sufficient unallocated - space (see `btrfs fi usage`) on the filesystem to allow the metadata - to multiply in size by the number of snapshots before running Bees - for the first time. Use - - btrfs balance start -dusage=100,limit=N /your/filesystem - - where the `limit` parameter 'N' should be calculated as follows: - - * start with the current size of metadata usage (from `btrfs fi - df`) in GB, plus 1 - - * multiply by the proportion of disk space in subvols with - snapshots (i.e. if there are no snapshots, multiply by 0; - if all of the data is shared between at least one origin - and one snapshot subvol, multiply by 1) - - * multiply by the number of snapshots (i.e. if there is only - one subvol, multiply by 0; if there are 3 snapshots and one - origin subvol, multiply by 3) - - `limit = GB_metadata * (disk_space_in_snapshots / total_disk_space) * number_of_snapshots` - - Monitor unallocated space to ensure that the filesystem never runs out - of metadata space (whether Bees is running or not--this is a general - btrfs requirement). - - -A Brief List Of Btrfs Kernel Bugs ---------------------------------- - -Missing features (usually not available in older LTS kernels): - -* 3.13: `FILE_EXTENT_SAME` ioctl added. No way to reliably dedup with - concurrent modifications before this. -* 3.16: `SEARCH_V2` ioctl added. Bees could use `SEARCH` instead. -* 4.2: `FILE_EXTENT_SAME` no longer updates mtime, can be used at EOF. - -Future features (kernel features Bees does not yet use, but may rely on -in the future): - -* 4.14: `LOGICAL_INO_V2` allows userspace to create forward and backward - reference maps to entire physical extents with a single ioctl call, - and raises the limit of 2730 references per extent. Bees has not yet - been rewritten to take full advantage of these features. - -Bug fixes (sometimes included in older LTS kernels): - -* Bugs fixed prior to 4.4.107 are not listed here. -* 4.5: hang in the `INO_PATHS` ioctl used by Bees. -* 4.5: use-after-free in the `FILE_EXTENT_SAME` ioctl used by Bees. -* 4.6: lost inodes after a rename, crash, and log tree replay - (triggered by the fsync() while writing `beescrawl.dat`). -* 4.7: *slow backref* bug no longer triggers a softlockup panic. It still - takes too long to resolve a block address to a root/inode/offset triple. -* 4.10: reduced CPU time cost of the LOGICAL_INO ioctl and dedup - backref processing in general. -* 4.11: yet another dedup deadlock case is fixed. Alas, it is not the - last one. -* 4.14: backref performance improvements make LOGICAL_INO even faster - in the worst cases (but possibly slower in the best cases?). -* 4.14.29: WARN_ON(ref->count < 0) in fs/btrfs/backref.c triggers - almost once per second. The WARN_ON is incorrect and can be removed. - -Unfixed kernel bugs (as of 4.14.34) with workarounds in Bees: - -* *Deadlocks* in the kernel dedup ioctl when files are modified - immediately before dedup. `BeesTempFile::make_copy` calls `fsync()` - immediately before dedup to work around this. If the `fsync()` is - removed, the filesystem hangs within a few hours, requiring a reboot - to recover. 
Even with the `fsync()`, it is possible to lose the - kernel race condition and encounter a deadlock within a machine-year. - VM image workloads may trigger this faster. Over the past years - several specific deadlock cases have been fixed, but at least one - remains. - -* *Bad interactions* with other Linux block layers: bcache and lvmcache - can fail spectacularly, and apparently only while running bees. - This is definitely a kernel bug, either in btrfs or the lower block - layers. Avoid using bees with these tools, or test very carefully - before deployment. - -* *slow backrefs* (aka toxic extents): If the number of references to a - single shared extent within a single file grows above a few thousand, - the kernel consumes CPU for minutes at a time while holding various - locks that block access to the filesystem. Bees avoids this bug by - measuring the time the kernel spends performing certain operations - and permanently blacklisting any extent or hash where the kernel - starts to get slow. Inside Bees, such blocks are marked as 'toxic' - hash/block addresses. Linux kernel v4.14 is better but can still - have problems. - -* `LOGICAL_INO` output is arbitrarily limited to 2730 references - even if more buffer space is provided for results. Once this number - has been reached, Bees can no longer replace the extent since it can't - find and remove all existing references. Bees refrains from adding - any more references after the first 2560. Offending blocks are - marked 'toxic' even if there is no corresponding performance problem. - This places an obvious limit on dedup efficiency for extremely common - blocks or filesystems with many snapshots (although this limit is - far greater than the effective limit imposed by the *slow backref* bug). - *Fixed in v4.14.* - -* `LOGICAL_INO` on compressed extents returns a list of root/inode/offset - tuples matching the extent bytenr of its argument. On uncompressed - extents, any r/i/o tuple whose extent offset does not match the - argument's extent offset is discarded, i.e. only the single 4K block - matching the argument is returned, so a complete map of the extent - references requires calling `LOGICAL_INO` for every single block of - the extent. This is undesirable behavior for Bees, which wants a - list of all extent refs referencing a data extent (i.e. Bees wants - the compressed-extent behavior in all cases). *Fixed in v4.14.* - -* `FILE_EXTENT_SAME` is arbitrarily limited to 16MB. This is less than - 128MB which is the maximum extent size that can be created by defrag - or prealloc. Bees avoids feedback loops this can generate while - attempting to replace extents over 16MB in length. - -Not really bugs, but gotchas nonetheless: - -* If a process holds a directory FD open, the subvol containing the - directory cannot be deleted (`btrfs sub del` will start the deletion - process, but it will not proceed past the first open directory FD). - `btrfs-cleaner` will simply skip over the directory *and all of its - children* until the FD is closed. Bees avoids this gotcha by closing - all of the FDs in its directory FD cache every 10 btrfs transactions. - -* If a file is deleted while Bees is caching an open FD to the file, - Bees continues to scan the file. For very large files (e.g. VM - images), the deletion of the file can be delayed indefinitely. - To limit this delay, Bees closes all FDs in its file FD cache every - 10 btrfs transactions. 
- -* If a snapshot is deleted, bees will generate a burst of exceptions - for references to files in the snapshot that no longer exist. This - lasts until the FD caches are cleared. - -Installation -============ - -Bees can be installed by following one these instructions: - -Arch package ------------- - -Bees is available in Arch Linux AUR. Install with: - -`$ pacaur -S bees-git` - -Gentoo package --------------- - -Bees is officially available in Gentoo Portage. Just emerge a stable -version: - -`$ emerge --ask bees` - -or build a live version from git master: - -`$ emerge --ask =bees-9999` - -You can opt-out of building the support tools with - -`USE="-tools" emerge ...` - -If you want to start hacking on bees and contribute changes, just emerge -the live version which automatically pulls in all required development -packages. - -Build from source ------------------ - -Build with `make`. The build produces `bin/bees` and `lib/libcrucible.so`, -which must be copied to somewhere in `$PATH` and `$LD_LIBRARY_PATH` -on the target system respectively. - -It will also generate `scripts/beesd@.service` for systemd users. This -service makes use of a helper script `scripts/beesd` to boot the service. -Both of the latter use the filesystem UUID to mount the root subvolume -within a temporary runtime directory. - -### Ubuntu 16.04 - 17.04: -`$ apt -y install build-essential btrfs-tools uuid-dev markdown && make` - -### Ubuntu 14.04: -You can try to carry on the work done here: https://gist.github.com/dagelf/99ee07f5638b346adb8c058ab3d57492 - -Packaging +Strengths --------- -See 'Dependencies' below. Package maintainers can pick ideas for building and -configuring the source package from the Gentoo ebuild: + * Space-efficient hash table and matching algorithms - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB) + * Incremental realtime dedupe of new data using btrfs tree search + * Works with btrfs compression - dedupe any combination of compressed and uncompressed files + * Works around btrfs filesystem structure to free more disk space + * Persistent hash table for rapid restart after shutdown + * Whole-filesystem dedupe - including snapshots + * Constant hash table size - no increased RAM usage if data set becomes larger + * Works on live data - no scheduled downtime required + * Automatic self-throttling based on system load -https://github.com/gentoo/gentoo/tree/master/sys-fs/bees +Weaknesses +---------- -You can configure some build options by creating a file `localconf` and -adjust settings for your distribution environment there. + * Whole-filesystem dedupe - has no include/exclude filters, does not accept file lists + * Runs continuously as a daemon - no quick start/stop + * Requires root privilege (or `CAP_SYS_ADMIN`) + * First run may require temporary disk space for extent reorganization + * [First run may increase metadata space usage if many snapshots exist](docs/gotchas.md) + * Constant hash table size - no decreased RAM usage if data set becomes smaller + * btrfs only -Please also review the Makefile for additional hints. +Installation and Usage +---------------------- -Dependencies ------------- + * [Installation](docs/install.md) + * [Configuration](docs/config.md) + * [Running](docs/running.md) + * [Command Line Options](docs/options.md) -* C++11 compiler (tested with GCC 4.9, 6.2.0, 8.1.0) +Recommended Reading +------------------- - Sorry. I really like closures and shared_ptr, so support - for earlier compiler versions is unlikely. 
+ * [bees Gotchas](docs/gotchas.md) + * [btrfs kernel bugs](docs/btrfs-kernel.md) + * [bees vs. other btrfs features](docs/btrfs-other.md) -* btrfs-progs (tested with 4.1..4.15.1) or libbtrfs-dev - (tested with version 4.16.1) - - Needed for btrfs.h and ctree.h during compile. - Also needed by the service wrapper script. - -* libuuid-dev - - This library is only required for a feature that was removed after v0.1. - The lingering support code can be removed. - -* Linux kernel version: *minimum* 4.4.107, *4.14.29 or later recommended* - - Don't bother trying to make Bees work with kernel versions older than - 4.4.107. It may appear to work, but it won't end well: there are - too many missing features and bugs (including data corruption bugs) - to work around in older kernels. - - Kernel versions between 4.4.107 and 4.14.29 are usable with bees, - but bees can trigger known performance bugs and hangs in dedup-related - functions. - -* markdown - -* util-linux version that provides `blkid` command for the helper - script `scripts/beesd` to work - -Setup ------ - -If you don't want to use the helper script `scripts/beesd` to setup and -configure bees, here's how you manually setup bees. - -Create a directory for bees state files: - - export BEESHOME=/some/path - mkdir -p "$BEESHOME" - -Create an empty hash table (your choice of size, but it must be a multiple -of 16M). This example creates a 1GB hash table: - - truncate -s 1g "$BEESHOME/beeshash.dat" - chmod 700 "$BEESHOME/beeshash.dat" - -bees can only process the root subvol of a btrfs (seriously--if the -argument is not the root subvol directory, Bees will just throw an -exception and stop). - -Use a bind mount, and let only bees access it: - - UUID=3399e413-695a-4b0b-9384-1b0ef8f6c4cd - mkdir -p /var/lib/bees/$UUID - mount /dev/disk/by-uuid/$UUID /var/lib/bees/$UUID -osubvol=/ - -If you don't set BEESHOME, the path ".beeshome" will be used relative -to the root subvol of the filesystem. For example: - - btrfs sub create /var/lib/bees/$UUID/.beeshome - truncate -s 1g /var/lib/bees/$UUID/.beeshome/beeshash.dat - chmod 700 /var/lib/bees/$UUID/.beeshome/beeshash.dat - -You can use any relative path in BEESHOME. The path will be taken -relative to the root of the deduped filesystem (in other words it can -be the name of a subvol): - - export BEESHOME=@my-beeshome - btrfs sub create /var/lib/bees/$UUID/$BEESHOME - truncate -s 1g /var/lib/bees/$UUID/$BEESHOME/beeshash.dat - chmod 700 /var/lib/bees/$UUID/$BEESHOME/beeshash.dat - -Configuration -------------- - -There are some runtime configurable options using environment variables: - -* BEESHOME: Directory containing Bees state files: - * beeshash.dat | persistent hash table. Must be a multiple of 16M. - This contains 16-byte records: 8 bytes for CRC64, - 8 bytes for physical address and some metadata bits. - * beescrawl.dat | state of SEARCH_V2 crawlers. ASCII text. - * beesstats.txt | statistics and performance counters. ASCII text. -* BEESSTATUS: File containing a snapshot of current Bees state: performance - counters and current status of each thread. The file is meant to be - human readable, but understanding it probably requires reading the source. - You can watch bees run in realtime with a command like: - - watch -n1 cat $BEESSTATUS - -Other options (e.g. interval between filesystem crawls) can be configured -in src/bees.h or on the cmdline (see 'Command Line Options' below). 
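To make the `BEESSTATUS` mechanism above concrete, here is a launch sketch
(paths are illustrative, and the UUID is the example value from the setup
steps; the root subvol is assumed to be already mounted as shown there):

    # Run bees against the mounted root subvol with an explicit state
    # directory and status file, then watch the status snapshot update.
    export BEESHOME=/some/path
    export BEESSTATUS=/run/bees.status
    UUID=3399e413-695a-4b0b-9384-1b0ef8f6c4cd
    bees /var/lib/bees/$UUID >> /var/log/bees.log 2>&1 &
    watch -n1 cat "$BEESSTATUS"
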
- -Running -------- - -Reduce CPU and IO priority to be kinder to other applications sharing -this host (or raise them for more aggressive disk space recovery). If you -use cgroups, put `bees` in its own cgroup, then reduce the `blkio.weight` -and `cpu.shares` parameters. You can also use `schedtool` and `ionice` -in the shell script that launches `bees`: - - schedtool -D -n20 $$ - ionice -c3 -p $$ - -Let the bees fly: - - for fs in /var/lib/bees/*-*-*-*-*/; do - bees "$fs" >> "$fs/.beeshome/bees.log" 2>&1 & - done - -You'll probably want to arrange for /var/log/bees.log to be rotated -periodically. You may also want to set umask to 077 to prevent disclosure -of information about the contents of the filesystem through the log file. - -There are also some shell wrappers in the `scripts/` directory. - - - -Command Line Options --------------------- - -* --thread-count (-c) COUNT - * Specify maximum number of worker threads for scanning. Overrides - --thread-factor (-C) and default/autodetected values. -* --thread-factor (-C) FACTOR - * Specify ratio of worker threads to CPU cores. Overridden by --thread-count (-c). - Default is 1.0, i.e. 1 worker thread per detected CPU. Use values - below 1.0 to leave some cores idle, or above 1.0 if there are more - disks than CPUs in the filesystem. -* --loadavg-target (-g) LOADAVG - * Specify load average target for dynamic worker threads. - Threads will be started or stopped subject to the upper limit imposed - by thread-factor, thread-min and thread-count until the load average - is within +/- 0.5 of LOADAVG. -* --thread-min (-G) COUNT - * Specify minimum number of worker threads for scanning. - Ignored unless -g option is used to specify a target load. - -* --scan-mode (-m) MODE - * Specify extent scanning algorithm. Default mode is 0. - _EXPERIMENTAL_ feature that may go away. - * Mode 0: scan extents in ascending order of (inode, subvol, offset). - Keeps shared extents between snapshots together. Reads files sequentially. - Minimizes temporary space usage. - * Mode 1: scan extents from all subvols in parallel. Good performance - on non-spinning media when subvols are unrelated. - * Mode 2: scan all extents from one subvol at a time. Good sequential - read performance for spinning media. Maximizes temporary space usage. - -* --timestamps (-t) - * Enable timestamps in log output. -* --no-timestamps (-T) - * Disable timestamps in log output. -* --absolute-paths (-p) - * Paths in log output will be absolute. -* --strip-paths (-P) - * Paths in log output will have the working directory at Bees startup - stripped. -* --verbose (-v) - * Set log verbosity (0 = no output, 8 = all output, default 8). +More Information +---------------- + * [How bees works](docs/how-it-works.md) + * [Missing bees features](docs/missing.md) Bug Reports and Contributions ----------------------------- @@ -596,11 +65,9 @@ You can also use Github: https://github.com/Zygo/bees - - Copyright & License -=================== +------------------- -Copyright 2015-2017 Zygo Blaxell . +Copyright 2015-2018 Zygo Blaxell . GPL (version 3 or later). 
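The `make doc` target added by this patch drives the per-file rules in
`docs/Makefile`, which appears in the next hunk. A quick sketch of building
and viewing the local HTML tree (assuming one of the markdown renderers
probed by the Makefile is installed; `xdg-open` is just an example viewer):

    # Build the local HTML documentation tree from the markdown
    # sources in docs/, then open the landing page in a browser.
    make doc
    xdg-open docs/index.html
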
diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..2d19fc7 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..2a3cdf7 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,8 @@ +MARKDOWN := $(firstword $(shell type -P markdown markdown2 markdown_py 2>/dev/null || echo markdown)) +.PHONY: docs + +docs: $(subst .md,.html,$(wildcard *.md)) ../README.html + +%.html: %.md + $(MARKDOWN) $< | sed -e 's/\.md/\.html/g' > $@.new + mv -f $@.new $@ diff --git a/docs/_config.yml b/docs/_config.yml new file mode 100644 index 0000000..277f1f2 --- /dev/null +++ b/docs/_config.yml @@ -0,0 +1 @@ +theme: jekyll-theme-cayman diff --git a/docs/btrfs-kernel.md b/docs/btrfs-kernel.md new file mode 100644 index 0000000..f111a85 --- /dev/null +++ b/docs/btrfs-kernel.md @@ -0,0 +1,56 @@ +Recommended kernel version +========================== + +Linux **4.14.34** or later. + +A Brief List Of Btrfs Kernel Bugs +--------------------------------- + +Recent kernel bug fixes: + +* 4.14.29: `WARN_ON(ref->count < 0)` in fs/btrfs/backref.c triggers + almost once per second. The `WARN_ON` is incorrect, and is now removed. + +Unfixed kernel bugs (as of 4.14.71): + +* **Bad _filesystem destroying_ interactions** with other Linux block + layers: `bcache` and `lvmcache` can fail spectacularly, and apparently + only do so while running bees. This is definitely a kernel bug, + either in btrfs or the lower block layers. **Avoid using bees with + these tools unless your filesystem is disposable and you intend to + debug the kernel.** + +* **Compressed data corruption** is possible when using the `fallocate` + system call to punch holes into compressed extents that contain long + runs of zeros. The [bug results in intermittent corruption during + reads](https://www.spinics.net/lists/linux-btrfs/msg81293.html), but + due to the bug, the kernel might sometimes mistakenly determine data + is duplicate, and deduplication will corrupt the data permanently. + This bug also affects compressed `kvm` raw images with the `discard` + feature on btrfs or any compressed file where `fallocate -d` or + `fallocate -p` has been used. + +* **Deadlock** when [simultaneously using the same files in dedupe and + `rename`](https://www.spinics.net/lists/linux-btrfs/msg81109.html). + There is no way for bees to reliably know when another process is + about to rename a file while bees is deduping it. In the `rsync` case, + bees will dedupe the new file `rsync` is creating using the old file + `rsync` is copying from, while `rsync` will rename the new file over + the old file to replace it. + +Minor kernel problems with workarounds: + +* **Slow backrefs** (aka toxic extents): If the number of references to a + single shared extent within a single file grows above a few thousand, + the kernel consumes CPU for minutes at a time while holding various + locks that block access to the filesystem. bees avoids this bug + by measuring the time the kernel spends performing `LOGICAL_INO` + operations and permanently blacklisting any extent or hash involved + where the kernel starts to get slow. Inside bees, such blocks are + known as 'toxic' hash/block addresses. + +* **`FILE_EXTENT_SAME` is arbitrarily limited to 16MB**. This is + less than 128MB which is the maximum extent size that can be created + by defrag, prealloc, or filesystems without the `compress-force` + mount option. 
bees avoids feedback loops this can generate while + attempting to replace extents over 16MB in length. diff --git a/docs/btrfs-other.md b/docs/btrfs-other.md new file mode 100644 index 0000000..a89cb09 --- /dev/null +++ b/docs/btrfs-other.md @@ -0,0 +1,53 @@ +Good Btrfs Feature Interactions +------------------------------- + +bees has been tested in combination with the following: + +* btrfs compression (zlib, lzo, zstd), mixtures of compressed and uncompressed extents +* PREALLOC extents (unconditionally replaced with holes) +* HOLE extents and btrfs no-holes feature +* Other deduplicators, reflink copies (though bees may decide to redo their work) +* btrfs snapshots and non-snapshot subvols (RW and RO) +* Concurrent file modification (e.g. PostgreSQL and sqlite databases, build daemons) +* all btrfs RAID profiles +* IO errors during dedupe (read errors will throw exceptions, bees will catch them and skip over the affected extent) +* Filesystems mounted *with* the flushoncommit option (system crashes, power failures OK) +* 4K filesystem data block size / clone alignment +* 64-bit and 32-bit host CPUs (amd64, x86, arm) +* Huge files (>1TB--although Btrfs performance on such files isn't great in general) +* filesystems up to 30T+ bytes, 100M+ files +* btrfs receive +* btrfs nodatacow/nodatasum inode attribute or mount option (bees skips all nodatasum files) +* open(O_DIRECT) (seems to work as well--or as poorly--with bees as with any other btrfs feature) + +Bad Btrfs Feature Interactions +------------------------------ + +bees has been tested in combination with the following, and various problems are known: + +* bcache, lvmcache: **severe (filesystem-destroying) metadata corruption + issues** observed in testing and reported by users, apparently only when + used with bees. Plain SSD and HDD seem to be OK. +* btrfs send: some kernel versions have bugs in btrfs send that can be + triggered by bees. The send can be restarted and will work if bees + has finished processing the snapshot being sent. No data corruption + observed other than the truncated send. +* btrfs qgroups: very slow, sometimes hangs...and it's even worse when + bees is running. +* btrfs autodefrag mount option: hangs and high CPU usage problems + reported by users. bees cannot distinguish autodefrag activity from + normal filesystem activity and will likely try to undo the autodefrag + if duplicate copies of the defragmented data exist. + +Untested Btrfs Feature Interactions +----------------------------------- + +bees has not been tested with the following, and undesirable interactions may occur: + +* Non-4K filesystem data block size (should work if recompiled) +* Non-equal hash (SUM) and filesystem data block (CLONE) sizes (need to fix that eventually) +* btrfs seed filesystems (does anyone even use those?) +* btrfs out-of-tree kernel patches (e.g. in-kernel dedupe or encryption) +* btrfs-convert from ext2/3/4 (never tested, might run out of space or ignore significant portions of the filesystem due to sanity checks) +* btrfs mixed block groups (don't know a reason why it would *not* work, but never tested) +* Filesystems mounted *without* the flushoncommit option (don't know the data integrity impact of crashes during dedupe writes vs. ordinary writes) diff --git a/docs/config.md b/docs/config.md new file mode 100644 index 0000000..11b4497 --- /dev/null +++ b/docs/config.md @@ -0,0 +1,151 @@ +bees Configuration +================== + +The only configuration parameter that *must* be provided is the hash +table size. 
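For example, a minimal hand-setup sketch, adapted from the setup steps this
patch removes from the README (the 1GB size and `/some/path` location are
illustrative):

    # bees requires the hash table file to exist before startup,
    # and its size must be a multiple of 16M.
    export BEESHOME=/some/path
    mkdir -p "$BEESHOME"
    truncate -s 1g "$BEESHOME/beeshash.dat"
    chmod 700 "$BEESHOME/beeshash.dat"
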
Other parameters are optional or hardcoded, and the defaults +are reasonable in most cases. + +Hash Table Sizing +----------------- + +Hash table entries are 16 bytes per data block. The hash table stores +the most recently read unique hashes. Once the hash table is full, +each new entry in the table evicts an old entry. + +Here are some numbers to estimate appropriate hash table sizes: + + unique data size | hash table size |average dedupe extent size + 1TB | 4GB | 4K + 1TB | 1GB | 16K + 1TB | 256MB | 64K + 1TB | 128MB | 128K <- recommended + 1TB | 16MB | 1024K + 64TB | 1GB | 1024K + +Notes: + + * If the hash table is too large, no extra dedupe efficiency is +obtained, and the extra space just wastes RAM. Extra space can also slow +bees down by preventing old data from being evicted, so bees wastes time +looking for matching data that is no longer present on the filesystem. + + * If the hash table is too small, bees extrapolates from matching +blocks to find matching adjacent blocks in the filesystem that have been +evicted from the hash table. In other words, bees only needs to find +one block in common between two extents in order to be able to dedupe +the entire extents. This provides significantly more dedupe hit rate +per hash table byte than other dedupe tools. + + * When counting unique data in compressed data blocks to estimate +optimum hash table size, count the *uncompressed* size of the data. + + * Another way to approach the hash table size is to simply decide how much +RAM can be spared without too much discomfort, give bees that amount of +RAM, and accept whatever dedupe hit rate occurs as a result. bees will +do the best job it can with the RAM it is given. + +Factors affecting optimal hash table size +----------------------------------------- + +It is difficult to predict the net effect of data layout and access +patterns on dedupe effectiveness without performing deep inspection of +both the filesystem data and its structure--a task that is as expensive +as performing the deduplication. + +* **Compression** on the filesystem reduces the average extent length +compared to uncompressed filesystems. The maximum compressed extent +length on btrfs is 128KB, while the maximum uncompressed extent length +is 128MB. Longer extents decrease the optimum hash table size while +shorter extents increase the optimum hash table size because the +probability of a hash table entry being present (i.e. unevicted) in +each extent is proportional to the extent length. + + As a rule of thumb, the optimal hash table size for a compressed +filesystem is 2-4x larger than the optimal hash table size for the same +data on an uncompressed filesystem. Dedupe efficiency falls dramatically +with hash tables smaller than 128MB/TB as the average dedupe extent size +is larger than the largest possible compressed extent size (128KB). + +* **Short writes** also shorten the average extent length and increase +optimum hash table size. If a database writes to files randomly using +4K page writes, all of these extents will be 4K in length, and the hash +table size must be increased to retain each one (or the user must accept +a lower dedupe hit rate). + + Defragmenting files that have had many short writes increases the +extent length and therefore reduces the optimum hash table size. + +* **Time between duplicate writes** also affects the optimum hash table +size. 
bees reads data blocks in logical order during its first pass, +and after that new data blocks are read incrementally a few seconds or +minutes after they are written. bees finds more matching blocks if there +is a smaller amount of data between the matching reads, i.e. there are +fewer blocks evicted from the hash table. If most identical writes to +the filesystem occur near the same time, the optimum hash table size is +smaller. If most identical writes occur over longer intervals of time, +the optimum hash table size must be larger to avoid evicting hashes from +the table before matches are found. + + For example, a build server normally writes out very similar source +code files over and over, so it will need a smaller hash table than a +backup server which has to refer to the oldest data on the filesystem +every time a new client machine's data is added to the server. + +Scanning modes for multiple subvols +----------------------------------- + +The `--scan-mode` option affects how bees divides resources between +subvolumes. This is particularly relevant when there are snapshots, +as there are tradeoffs to be made depending on how snapshots are used +on the filesystem. + +Note that if a filesystem has only one subvolume (i.e. the root, +subvol ID 5) then the `--scan-mode` option has no effect, as there is +only one subvolume to scan. + +The default mode is mode 0, "lockstep". In this mode, each inode of each +subvol is scanned at the same time, before moving to the next inode in +each subvol. This maximizes the likelihood that all of the references to +a snapshot of a file are scanned at the same time, which takes advantage +of VFS caching in the Linux kernel. If snapshots are created very often, +bees will not make very good progress as it constantly restarts the +filesystem scan from the beginning each time a new snapshot is created. + +Scan mode 1, "independent", simply scans every subvol independently +in parallel. Each subvol's scanner shares time equally with all other +subvol scanners. Whenever a new subvol appears, a new scanner is +created and the new subvol scanner doesn't affect the behavior of any +existing subvol scanner. + +Scan mode 2, "sequential", processes each subvol completely before +proceeding to the next subvol. This is a good mode when using bees for +the first time on a filesystem that already has many existing snapshots +and a high rate of new snapshot creation. Short-lived snapshots +(e.g. those used for `btrfs send`) are effectively ignored, and bees +directs its efforts toward older subvols that are more likely to be +origin subvols for snapshots. By deduping origin subvols first, bees +ensures that future snapshots will already be deduplicated and do not +need to be deduplicated again. + +If you are using bees for the first time on a filesystem with many +existing snapshots, you should read about [snapshot gotchas](gotchas.md). + +Threads and load management +--------------------------- + +By default, bees creates one worker thread for each CPU detected. +These threads then perform scanning and dedupe operations. The number of +worker threads can be set with the [`--thread-count` and `--thread-factor` +options](options.md). + +If desired, bees can automatically increase or decrease the number +of worker threads in response to system load. This reduces impact on +the rest of the system by pausing bees when other CPU and IO intensive +loads are active on the system, and resumes bees when the other loads +are inactive. 
This is configured with the [`--loadavg-target` and +`--thread-min` options](options.md). + +Log verbosity +------------- + +bees can be made less chatty with the [`--verbose` option](options.md). diff --git a/docs/gotchas.md b/docs/gotchas.md new file mode 100644 index 0000000..2712ac7 --- /dev/null +++ b/docs/gotchas.md @@ -0,0 +1,113 @@ +bees Gotchas +============ + +Snapshots +--------- + +bees can dedupe filesystems with many snapshots, but bees only does +well in this situation if bees was running on the filesystem from +the beginning. + +Each time bees dedupes an extent that is referenced by a snapshot, +the entire metadata page in the snapshot subvol (16KB by default) must +be CoWed in btrfs. This can result in a substantial increase in btrfs +metadata size if there are many snapshots on a filesystem. + +Normally, metadata is small (less than 1% of the filesystem) and dedupe +hit rates are large (10-40% of the filesystem), so the increase in +metadata size is offset by much larger reductions in data size and the +total space used by the entire filesystem is reduced. + +If a subvol is deduped _before_ a snapshot is created, the snapshot will +have the same deduplication as the subvol. This does _not_ result in +unusually large metadata sizes. If a snapshot is made after bees has +fully scanned the origin subvol, bees can avoid scanning most of the +data in the snapshot subvol, as it will be provably identical to the +origin subvol that was already scanned. + +If a subvol is deduped _after_ a snapshot is created, the origin and +snapshot subvols must be deduplicated separately. In the worst case, this +will double the amount of reading the bees scanner must perform, and will +also double the amount of btrfs metadata used for the snapshot; however, +the "worst case" is a dedupe hit rate of 1% or more, so a doubling of +metadata size is certain for all but the most unique data sets. Also, +bees will not be able to free any space until the last snapshot has been +scanned and deduped, so payoff in data space savings is deferred until +the metadata has almost finished expanding. + +If a subvol is deduped after _many_ snapshots have been created, all +subvols must be deduplicated individually. In the worst case, this will +multiply the scanning work and metadata size by the number of snapshots. +For 100 snapshots this can mean a 100x growth in metadata size and +bees scanning time, which typically exceeds the possible savings from +reducing the data size by dedupe. In such cases using bees will result +in a net increase in disk space usage that persists until the snapshots +are deleted. + +Snapshot case studies +--------------------- + + * bees running on an empty filesystem + * filesystem is mkfsed + * bees is installed and starts running + * data is written to the filesystem + * bees dedupes the data as it appears + * a snapshot is made of the data + * The snapshot will already be 99% deduped, so the metadata will + not expand very much because only 1% of the data in the snapshot + must be deduped. + * more snapshots are made of the data + * as long as dedupe has been completed on the origin subvol, + bees will quickly scan each new snapshot because it can skip + all the previously scanned data. Metadata usage remains low + (it may even shrink because there are fewer csums). 
+
+ * bees installed on a non-empty filesystem with snapshots
+   * filesystem is mkfsed
+   * data is written to the filesystem
+   * multiple snapshots are made of the data
+   * bees is installed and starts running
+   * bees dedupes each snapshot individually
+     * The snapshot metadata will no longer be shared, resulting in
+       substantial growth of metadata usage.
+     * Disk space savings do not occur until bees processes the
+       last snapshot reference to data.
+
+
+Other Gotchas
+-------------
+
+* bees avoids the [slow backrefs kernel bug](btrfs-kernel.md) by
+  measuring the time required to perform `LOGICAL_INO` operations.
+  If an extent requires over 10 seconds to perform a `LOGICAL_INO`
+  then bees blacklists the extent and avoids referencing it in future
+  operations.  In most cases, fewer than 0.1% of extents in a filesystem
+  must be avoided this way.  This results in short write-latency spikes
+  of ten seconds or slightly more, as btrfs will not allow writes to the
+  filesystem while `LOGICAL_INO` is running.  Generally the CPU spends
+  most of the runtime of the `LOGICAL_INO` ioctl executing kernel code,
+  so on a single-core CPU the entire system can freeze up for a few
+  seconds at a time.
+
+* Load managers that send a `SIGSTOP` to the bees process to throttle
+  CPU usage may affect the `LOGICAL_INO` timing mechanism, causing extents
+  to be incorrectly labelled 'toxic'.  This will cause a small reduction
+  of dedupe hit rate.  Slow and heavily loaded disks can trigger the same
+  effect if `LOGICAL_INO` takes too long due to IO latency.
+
+* If a process holds a directory FD open, the subvol containing the
+  directory cannot be deleted (`btrfs sub del` will start the deletion
+  process, but it will not proceed past the first open directory FD).
+  `btrfs-cleaner` will simply skip over the directory *and all of its
+  children* until the FD is closed.  bees avoids this gotcha by closing
+  all of the FDs in its directory FD cache every 10 btrfs transactions.
+
+* If a file is deleted while bees is caching an open FD to the file,
+  bees continues to scan the file.  For very large files (e.g. VM
+  images), the deletion of the file can be delayed indefinitely.
+  To limit this delay, bees closes all FDs in its file FD cache every
+  10 btrfs transactions.
+
+* If a snapshot is deleted, bees will generate a burst of exceptions
+  for references to files in the snapshot that no longer exist.  This
+  lasts until the FD caches are cleared.
diff --git a/docs/how-it-works.md b/docs/how-it-works.md
new file mode 100644
index 0000000..b3fa62c
--- /dev/null
+++ b/docs/how-it-works.md
@@ -0,0 +1,100 @@
+How bees Works
+--------------
+
+bees is a daemon designed to run continuously and maintain its state
+across crashes and reboots.
+
+bees uses checkpoints for persistence to eliminate the IO overhead of a
+transactional data store.  On restart, bees will dedupe any data that
+was added to the filesystem since the last checkpoint.  Checkpoints
+occur every 15 minutes for scan progress, stored in `beescrawl.dat`.
+The hash table trickle-writes to disk at 4GB/hour to `beeshash.dat`.
+An hourly performance report is written to `beesstats.txt`.  There are
+no special requirements for bees hash table storage--`.beeshome` could
+be stored on a different btrfs filesystem, ext4, or even CIFS.
+
+bees uses a persistent dedupe hash table with a fixed size configured
+by the user.  Any size of hash table can be dedicated to dedupe.  If a
+fast dedupe with low hit rate is desired, bees can use a hash table as
+small as 16MB.
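
To put that 16MB lower bound in perspective, here is a back-of-the-envelope
sketch using the 16-bytes-per-entry figure from the configuration page (the
numbers are plain arithmetic, not measurements):

    # A 16MB table holds 1Mi 16-byte entries; at one entry per 4K
    # block, that tracks at least 4GiB of unique data.  Larger average
    # dedupe extent sizes stretch the same table across much more data
    # (see the sizing table in docs/config.md).
    table_bytes=$((16 * 1024 * 1024))
    entries=$((table_bytes / 16))
    min_coverage_bytes=$((entries * 4096))
    echo "$entries entries covering at least $min_coverage_bytes bytes"
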
+
+The bees hash table is loaded into RAM at startup and `mlock`ed so it
+will not be swapped out by the kernel (if swap is permitted, performance
+degrades to nearly zero).
+
+bees scans the filesystem in a single pass which removes duplicate
+extents immediately after they are detected.  There are no distinct
+scanning and dedupe phases, so bees can start recovering free space
+immediately after startup.
+
+Once a filesystem scan has been completed, bees uses the `min_transid`
+parameter of the `TREE_SEARCH_V2` ioctl to avoid rescanning old data
+on future scans and quickly scan new data.  An incremental data scan
+can complete in less than a millisecond on an idle filesystem.
+
+Once a duplicate data block is identified, bees examines the nearby
+blocks in the files where the matched block appears.  This allows bees
+to find long runs of adjacent duplicate block pairs if it has an entry
+for any one of the blocks in its hash table.  On typical data sets,
+this means most of the blocks in the hash table are redundant and can
+be discarded without significant impact on dedupe hit rate.
+
+Hash table entries are grouped together into LRU lists.  As each block
+is scanned, its hash table entry is inserted into the LRU list at a
+random position.  If the LRU list is full, the entry at the end of the
+list is deleted.  If a hash table entry is used to discover duplicate
+blocks, the entry is moved to the beginning of the list.  This makes bees
+unable to detect a small number of duplicates, but it dramatically
+improves efficiency on filesystems with many small files.
+
+Once the hash table fills up, old entries are evicted by new entries.
+This means that the optimum hash table size is determined by the
+distance between duplicate blocks on the filesystem rather than the
+filesystem unique data size.  Even if the hash table is too small
+to find all duplicates, it may still find _most_ of them, especially
+during incremental scans where the data in many workloads tends to be
+more similar.
+
+When a duplicate block pair is found in two btrfs extents, bees will
+attempt to match all other blocks in the newer extent with blocks in
+the older extent (i.e. the goal is to keep the extent referenced in the
+hash table and remove the most recently scanned extent).  If this is
+possible, then the new extent will be replaced with a reference to the
+old extent.  If this is not possible, then bees will create a temporary
+copy of the unmatched data in the new extent so that the entire new
+extent can be removed by deduplication.  This must be done because btrfs
+cannot partially overwrite extents--the _entire_ extent must be replaced.
+The temporary copy is then scanned during the next pass bees makes over
+the filesystem for potential duplication of other extents.
+
+When a block containing all-zero bytes is found, bees dedupes the extent
+against a temporary file containing a hole, possibly creating temporary
+copies of any non-zero data in the extent for later deduplication as
+described above.  If the extent is compressed, bees avoids splitting
+the extent in the middle as this generally has a negative impact on
+compression ratio (and also triggers a [kernel bug](btrfs-kernel.md)).
+
+bees does not store any information about filesystem structure, so its
+memory and storage requirements do not grow with the number or size of
+files.  The hash table stores physical block numbers which are converted
+into paths and FDs on demand through btrfs `SEARCH_V2` and `LOGICAL_INO`
+ioctls.
+This eliminates the storage required to maintain the equivalents +of these functions in userspace, at the expense of encountering [some +kernel bugs in `LOGICAL_INO` performance](btrfs-kernel.md). + +bees uses only the data-safe `FILE_EXTENT_SAME` (aka `FIDEDUPERANGE`) +kernel operations to manipulate user data, so it can dedupe live data +(e.g. build servers, sqlite databases, VM disk images). It does not +modify file attributes or timestamps. + +When bees has scanned all of the data, bees will pause until 10 +transactions have been completed in the btrfs filesystem. bees tracks +the current btrfs transaction ID over time so that it polls less often +on quiescent filesystems and more often on busy filesystems. + +Scanning and deduplication work is performed by worker threads. If the +[`--loadavg-target` option](options.md) is used, bees adjusts the number +of worker threads up or down as required to have a user-specified load +impact on the system. The maximum and minimum number of threads is +configurable. If the system load is too high then bees will stop until +the load falls to acceptable levels. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..13c5626 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,73 @@ +BEES +==== + +Best-Effort Extent-Same, a btrfs deduplication agent. + +About bees +---------- + +bees is a block-oriented userspace deduplication agent designed for large +btrfs filesystems. It is an offline dedupe combined with an incremental +data scan capability to minimize time data spends on disk from write +to dedupe. + +Strengths +--------- + + * Space-efficient hash table and matching algorithms - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB) + * Incremental realtime dedupe of new data using btrfs tree search + * Works with btrfs compression - dedupe any combination of compressed and uncompressed files + * Works around btrfs filesystem structure to free more disk space + * Persistent hash table for rapid restart after shutdown + * Whole-filesystem dedupe - including snapshots + * Constant hash table size - no increased RAM usage if data set becomes larger + * Works on live data - no scheduled downtime required + * Automatic self-throttling based on system load + +Weaknesses +---------- + + * Whole-filesystem dedupe - has no include/exclude filters, does not accept file lists + * Runs continuously as a daemon - no quick start/stop + * Requires root privilege (or `CAP_SYS_ADMIN`) + * First run may require temporary disk space for extent reorganization + * [First run may increase metadata space usage if many snapshots exist](gotchas.md) + * Constant hash table size - no decreased RAM usage if data set becomes smaller + * btrfs only + +Installation and Usage +---------------------- + + * [Installation](install.md) + * [Configuration](config.md) + * [Running](running.md) + * [Command Line Options](options.md) + +Recommended Reading +------------------- + + * [bees Gotchas](gotchas.md) + * [btrfs kernel bugs](btrfs-kernel.md) + * [bees vs. other btrfs features](btrfs-other.md) + +More Information +---------------- + + * [How bees works](how-it-works.md) + * [Missing bees features](missing.md) + +Bug Reports and Contributions +----------------------------- + +Email bug reports and patches to Zygo Blaxell . + +You can also use Github: + + https://github.com/Zygo/bees + +Copyright & License +------------------- + +Copyright 2015-2018 Zygo Blaxell . + +GPL (version 3 or later). 
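Tying the thread and load management description in `how-it-works.md` above
to concrete flags (a sketch; the values are arbitrary, the flag names are
taken from the command-line options list, and the UUID is the example value
used in the setup steps):

    # Aim for a load average of 5, keep at least 2 worker threads even
    # under load, and scan subvols independently in parallel (mode 1).
    UUID=3399e413-695a-4b0b-9384-1b0ef8f6c4cd
    bees --loadavg-target 5 --thread-min 2 --scan-mode 1 \
        /var/lib/bees/$UUID
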
diff --git a/docs/install.md b/docs/install.md
new file mode 100644
index 0000000..3f75057
--- /dev/null
+++ b/docs/install.md
@@ -0,0 +1,91 @@
+Building bees
+=============
+
+Dependencies
+------------
+
+* C++11 compiler (tested with GCC 4.9, 6.2.0, 8.1.0)
+
+  Sorry. I really like closures and shared_ptr, so support
+  for earlier compiler versions is unlikely.
+
+* btrfs-progs (tested with 4.1..4.15.1) or libbtrfs-dev
+  (tested with version 4.16.1)
+
+  Needed for btrfs.h and ctree.h during compile.
+  Also needed by the service wrapper script.
+
+* libuuid-dev
+
+  This library is only required for a feature that was removed after v0.1.
+  The lingering support code can be removed.
+
+* [Linux kernel version](btrfs-kernel.md) gets its own page.
+
+* markdown for documentation
+
+* util-linux version that provides the `blkid` command, which the helper
+  script `scripts/beesd` needs to work
+
+Installation
+============
+
+bees can be installed by following one of these instructions:
+
+Arch package
+------------
+
+bees is available in the Arch Linux AUR. Install with:
+
+`$ pacaur -S bees-git`
+
+Gentoo package
+--------------
+
+bees is officially available in Gentoo Portage. Just emerge a stable
+version:
+
+`$ emerge --ask bees`
+
+or build a live version from git master:
+
+`$ emerge --ask =bees-9999`
+
+You can opt out of building the support tools with
+
+`USE="-tools" emerge ...`
+
+If you want to start hacking on bees and contribute changes, just emerge
+the live version which automatically pulls in all required development
+packages.
+
+Build from source
+-----------------
+
+Build with `make`. The build produces `bin/bees` and `lib/libcrucible.so`,
+which must be copied to somewhere in `$PATH` and `$LD_LIBRARY_PATH`
+on the target system respectively.
+
+It will also generate `scripts/beesd@.service` for systemd users. This
+service makes use of the helper script `scripts/beesd` to start the
+service. Both of the latter use the filesystem UUID to mount the root
+subvolume within a temporary runtime directory.
+
+### Ubuntu 16.04 - 17.04:
+`$ apt -y install build-essential btrfs-tools uuid-dev markdown && make`
+
+### Ubuntu 14.04:
+You can try to carry on the work done here:
+
+Packaging
+---------
+
+See 'Dependencies' above. Package maintainers can pick ideas for building
+and configuring the source package from the Gentoo ebuild:
+
+
+You can configure some build options by creating a file `localconf` and
+adjusting settings for your distribution environment there.
+
+Please also review the Makefile for additional hints.
diff --git a/docs/missing.md b/docs/missing.md
new file mode 100644
index 0000000..e17dacc
--- /dev/null
+++ b/docs/missing.md
@@ -0,0 +1,50 @@
+Features You Might Expect That bees Doesn't Have
+------------------------------------------------
+
+* There's no configuration file (patches welcome!). There are
+some tunables hardcoded in the source that could eventually become
+configuration options. There's also an incomplete option parser
+(patches welcome!).
+
+* There's no way to *stop* the bees daemon. Use SIGKILL, SIGTERM, or
+Ctrl-C for now. Some of the destructors are unreachable and have never
+been tested. bees checkpoints its progress every 15 minutes (also not
+configurable, patches welcome) and will repeat some work when restarted.
+
+* The bees process doesn't fork and writes its log to stdout/stderr.
+A shell wrapper is required to make it behave more like a daemon.
+
+* There's no facility to exclude any part of a filesystem or focus on
+specific files (patches welcome).
+
+* PREALLOC extents and extents containing blocks filled with zeros will
+be replaced by holes. There is no way to turn this off.
+
+* Consecutive runs of duplicate blocks that are less than 12K in length
+can take 30% of the processing time while saving only 3% of the disk
+space. There should be an option to just not bother with those, but it's
+complicated by the btrfs requirement to always dedupe complete extents.
+
+* There is a lot of duplicate reading of blocks in snapshots. bees will
+scan all snapshots at close to the same time to try to get better
+performance by caching, but really fixing this requires rewriting the
+crawler to scan the btrfs extent tree directly instead of the subvol
+FS trees.
+
+* Block reads are currently more allocation- and CPU-intensive than they
+should be, especially for filesystems on SSD where the IO overhead is
+much smaller. This is a problem for CPU-power-constrained environments
+(e.g. laptops running from battery, or ARM devices with slow CPU).
+
+* bees can currently fragment extents when required to remove duplicate
+blocks, but has no defragmentation capability yet. When possible, bees
+will attempt to work with existing extent boundaries, but it will not
+aggregate blocks together from multiple extents to create larger ones.
+
+* When bees fragments an extent, the copied data is compressed. There
+is currently no way (other than by modifying the source) to select a
+compression method or not compress the data (patches welcome!).
+
+* It is theoretically possible to resize the hash table without starting
+over with a new full-filesystem scan; however, this feature has not been
+implemented yet.
diff --git a/docs/options.md b/docs/options.md
new file mode 100644
index 0000000..741f680
--- /dev/null
+++ b/docs/options.md
@@ -0,0 +1,52 @@
+# bees Command Line Options
+
+| Option | Short | Description |
+| ------ | ----- | ----------- |
+| `--thread-count COUNT` | `-c` | Specify maximum number of worker threads for scanning. Overrides `--thread-factor` (`-C`) and default/autodetected values. |
+| `--thread-factor FACTOR` | `-C` | Specify ratio of worker threads to CPU cores. Overridden by `--thread-count` (`-c`). Default is 1.0, i.e. 1 worker thread per detected CPU. Use values below 1.0 to leave some cores idle, or above 1.0 if there are more disks than CPUs in the filesystem. |
+| `--loadavg-target LOADAVG` | `-g` | Specify load average target for dynamic worker threads. Threads will be started or stopped subject to the upper limit imposed by `--thread-factor`, `--thread-min` and `--thread-count` until the load average is within +/- 0.5 of LOADAVG. |
+| `--thread-min COUNT` | `-G` | Specify minimum number of worker threads for scanning. Ignored unless `-g` is used to specify a target load. |
+| `--scan-mode MODE` | `-m` | Specify extent scanning algorithm. Default mode is 0. EXPERIMENTAL feature that may go away. See the list of scan modes below. |
+| `--timestamps` | `-t` | Enable timestamps in log output. |
+| `--no-timestamps` | `-T` | Disable timestamps in log output. |
+| `--absolute-paths` | `-p` | Paths in log output will be absolute. |
+| `--strip-paths` | `-P` | Paths in log output will have the working directory at bees startup stripped. |
+| `--verbose` | `-v` | Set log verbosity (0 = no output, 8 = all output, default 8). |
+
+The scan modes for `--scan-mode` are:
+
+* Mode 0: scan extents in ascending order of (inode, subvol, offset).
+Keeps shared extents between snapshots together. Reads files sequentially.
+Minimizes temporary space usage.
+
+* Mode 1: scan extents from all subvols in parallel. Good performance
+on non-spinning media when subvols are unrelated.
+
+* Mode 2: scan all extents from one subvol at a time. Good sequential
+read performance for spinning media. Maximizes temporary space usage.
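+
+For illustration, a hypothetical invocation combining several of these
+options might look like this (the thread counts, load target, and mount
+point are example values, not recommendations):
+
+    # At most 4 scanning threads, at least 1, throttled to keep the
+    # load average near 5.0, with timestamped log output.
+    bees --thread-count 4 --thread-min 1 --loadavg-target 5.0 \
+        --timestamps /var/lib/bees/3399e413-695a-4b0b-9384-1b0ef8f6c4cd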
diff --git a/docs/running.md b/docs/running.md
new file mode 100644
index 0000000..d604891
--- /dev/null
+++ b/docs/running.md
@@ -0,0 +1,92 @@
+Running bees
+============
+
+Setup
+-----
+
+If you don't want to use the helper script `scripts/beesd` to set up and
+configure bees, here's how to set up bees manually.
+
+Create a directory for bees state files:
+
+    export BEESHOME=/some/path
+    mkdir -p "$BEESHOME"
+
+Create an empty hash table ([your choice of size](config.md), but it
+must be a multiple of 16MB). This example creates a 1GB hash table:
+
+    truncate -s 1g "$BEESHOME/beeshash.dat"
+    chmod 700 "$BEESHOME/beeshash.dat"
+
+bees can _only_ process the root subvol of a btrfs filesystem with
+nothing mounted over top. If the bees argument is not the root subvol
+directory, bees will just throw an exception and stop.
+
+Use a separate mount point, and let only bees access it:
+
+    UUID=3399e413-695a-4b0b-9384-1b0ef8f6c4cd
+    mkdir -p /var/lib/bees/$UUID
+    mount /dev/disk/by-uuid/$UUID /var/lib/bees/$UUID -osubvol=/
+
+If you don't set BEESHOME, the path "`.beeshome`" will be used relative
+to the root subvol of the filesystem. For example:
+
+    btrfs sub create /var/lib/bees/$UUID/.beeshome
+    truncate -s 1g /var/lib/bees/$UUID/.beeshome/beeshash.dat
+    chmod 700 /var/lib/bees/$UUID/.beeshome/beeshash.dat
+
+You can use any relative path in `BEESHOME`. The path will be taken
+relative to the root of the deduped filesystem (in other words it can
+be the name of a subvol):
+
+    export BEESHOME=@my-beeshome
+    btrfs sub create /var/lib/bees/$UUID/$BEESHOME
+    truncate -s 1g /var/lib/bees/$UUID/$BEESHOME/beeshash.dat
+    chmod 700 /var/lib/bees/$UUID/$BEESHOME/beeshash.dat
+
+Configuration
+-------------
+
+There are some runtime-configurable options using environment variables:
+
+* BEESHOME: Directory containing bees state files:
+  * `beeshash.dat` | persistent hash table. Must be a multiple of 16MB, and must be created before bees starts.
+  * `beescrawl.dat` | state of SEARCH_V2 crawlers. ASCII text. bees will create this.
+  * `beesstats.txt` | statistics and performance counters. ASCII text. bees will create this.
+* BEESSTATUS: File containing a snapshot of current bees state: performance
+  counters and current status of each thread. The file is meant to be
+  human readable, but understanding it probably requires reading the source.
+  You can watch bees run in realtime with a command like:
+
+      watch -n1 cat $BEESSTATUS
+
+Other options (e.g. interval between filesystem crawls) can be configured
+in `src/bees.h` or [on the command line](options.md).
+
+Running
+-------
+
+Reduce CPU and IO priority to be kinder to other applications sharing
+this host (or raise them for more aggressive disk space recovery). If you
+use cgroups, put `bees` in its own cgroup, then reduce the `blkio.weight`
+and `cpu.shares` parameters. You can also use `schedtool` and `ionice`
+in the shell script that launches `bees`:
+
+    schedtool -D -n20 $$
+    ionice -c3 -p $$
+
+You can also use the [`--loadavg-target` and `--thread-min`
+options](options.md) to further control the impact of bees on the rest
+of the system.
+
+Let the bees fly:
+
+    for fs in /var/lib/bees/*-*-*-*-*/; do
+        bees "$fs" >> "$fs/.beeshome/bees.log" 2>&1 &
+    done
+
+You'll probably want to arrange for the log file to be rotated
+periodically. You may also want to set umask to 077 to prevent disclosure
+of information about the contents of the filesystem through the log file.
+
+There are also some shell wrappers in the `scripts/` directory.
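+
+Putting the steps above together, a minimal launch wrapper might look
+like the following sketch. It assumes the mount point, `.beeshome`
+subvol, and hash table have already been created as described in the
+Setup section; `scripts/beesd` is the supported version of this idea:
+
+    #!/bin/sh
+    # Reduce CPU and IO priority for this script and its children.
+    schedtool -D -n20 $$
+    ionice -c3 -p $$
+    # Prevent disclosure of filesystem contents through the log file.
+    umask 077
+    UUID=3399e413-695a-4b0b-9384-1b0ef8f6c4cd
+    MNT=/var/lib/bees/$UUID
+    # Mount the root subvol (subvol=/) if it is not already mounted.
+    mountpoint -q "$MNT" || mount /dev/disk/by-uuid/$UUID "$MNT" -osubvol=/
+    exec bees "$MNT" >> "$MNT/.beeshome/bees.log" 2>&1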