From e8298570ed09341b2531c7952b0f341604d39472 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Sun, 30 Sep 2018 01:08:29 -0400 Subject: [PATCH] README: split into sections, reformat for github.io Split the rather large README into smaller sections with a pitch and a ToC at the top. Move the sections into docs/ so that Github Pages can read them. 'make doc' produces a local HTML tree. Update the kernel bugs and gotchas list. Add some information that has been accumulating in Github comments. Remove information about bugs in kernels earlier than 4.14. Signed-off-by: Zygo Blaxell --- Makefile | 14 +- README.md | 617 +++---------------------------------------- docs/.gitignore | 1 + docs/Makefile | 8 + docs/_config.yml | 1 + docs/btrfs-kernel.md | 56 ++++ docs/btrfs-other.md | 53 ++++ docs/config.md | 151 +++++++++++ docs/gotchas.md | 113 ++++++++ docs/how-it-works.md | 100 +++++++ docs/index.md | 73 +++++ docs/install.md | 91 +++++++ docs/missing.md | 50 ++++ docs/options.md | 52 ++++ docs/running.md | 92 +++++++ 15 files changed, 888 insertions(+), 584 deletions(-) create mode 100644 docs/.gitignore create mode 100644 docs/Makefile create mode 100644 docs/_config.yml create mode 100644 docs/btrfs-kernel.md create mode 100644 docs/btrfs-other.md create mode 100644 docs/config.md create mode 100644 docs/gotchas.md create mode 100644 docs/how-it-works.md create mode 100644 docs/index.md create mode 100644 docs/install.md create mode 100644 docs/missing.md create mode 100644 docs/options.md create mode 100644 docs/running.md diff --git a/Makefile b/Makefile index a065a97..49f397b 100644 --- a/Makefile +++ b/Makefile @@ -7,8 +7,6 @@ LIBEXEC_PREFIX ?= $(LIB_PREFIX)/bees SYSTEMD_SYSTEM_UNIT_DIR ?= $(shell pkg-config systemd --variable=systemdsystemunitdir) -MARKDOWN := $(firstword $(shell type -P markdown markdown2 markdown_py 2>/dev/null || echo markdown)) - BEES_VERSION ?= $(shell git describe --always --dirty || echo UNKNOWN) # allow local configuration to override above variables @@ -25,13 +23,12 @@ include Defines.mk default: $(DEFAULT_MAKE_TARGET) all: lib src scripts -docs: README.html -reallyall: all docs test +reallyall: all doc test clean: ## Cleanup git clean -dfx -e localconf -.PHONY: lib src test +.PHONY: lib src test doc lib: ## Build libs $(MAKE) -C lib @@ -44,15 +41,14 @@ test: ## Run tests test: lib src $(MAKE) -C test +doc: ## Build docs + $(MAKE) -C docs + scripts/%: scripts/%.in $(TEMPLATE_COMPILER) scripts: scripts/beesd scripts/beesd@.service -README.html: README.md - $(MARKDOWN) README.md > README.html.new - mv -f README.html.new README.html - install_libs: lib install -Dm644 lib/libcrucible.so $(DESTDIR)$(LIB_PREFIX)/libcrucible.so diff --git a/README.md b/README.md index 4a20d39..7243b94 100644 --- a/README.md +++ b/README.md @@ -1,591 +1,60 @@ BEES ==== -Best-Effort Extent-Same, a btrfs dedup agent. +Best-Effort Extent-Same, a btrfs deduplication agent. -About Bees +About bees ---------- -Bees is a block-oriented userspace dedup agent designed to avoid -scalability problems on large filesystems. +bees is a block-oriented userspace deduplication agent designed for large +btrfs filesystems. It is an offline dedupe combined with an incremental +data scan capability to minimize time data spends on disk from write +to dedupe. -Bees is designed to degrade gracefully when underprovisioned with RAM. -Bees does not use more RAM or storage as filesystem data size increases. -The dedup hash table size is fixed at creation time and does not change. 
-The effective dedup block size is dynamic and adjusts automatically to -fit the hash table into the configured RAM limit. Hash table overflow -is not implemented to eliminate the IO overhead of hash table overflow. -Hash table entries are only 16 bytes per dedup block to keep the average -dedup block size small. - -Bees does not require alignment between dedup blocks or extent boundaries -(i.e. it can handle any multiple-of-4K offset between dup block pairs). -Bees rearranges blocks into shared and unique extents if required to -work within current btrfs kernel dedup limitations. - -Bees can dedup any combination of compressed and uncompressed extents. - -Bees operates in a single pass which removes duplicate extents immediately -during scan. There are no separate scanning and dedup phases. - -Bees uses only data-safe btrfs kernel operations, so it can dedup live -data (e.g. build servers, sqlite databases, VM disk images). It does -not modify file attributes or timestamps. - -Bees does not store any information about filesystem structure, so it is -not affected by the number or size of files (except to the extent that -these cause performance problems for btrfs in general). It retrieves such -information on demand through btrfs SEARCH_V2 and LOGICAL_INO ioctls. -This eliminates the storage required to maintain the equivalents of -these functions in userspace. It's also why bees has no XFS support. - -Bees is a daemon designed to run continuously and maintain its state -across crahes and reboots. Bees uses checkpoints for persistence to -eliminate the IO overhead of a transactional data store. On restart, -bees will dedup any data that was added to the filesystem since the -last checkpoint. - -Bees is used to dedup filesystems ranging in size from 16GB to 35TB, with -hash tables ranging in size from 128MB to 11GB. - -How Bees Works --------------- - -Bees uses a fixed-size persistent dedup hash table with a variable dedup -block size. Any size of hash table can be dedicated to dedup. Bees will -scale the dedup block size to fit the filesystem's unique data size -using a weighted sampling algorithm. This allows Bees to adapt itself -to its filesystem size without forcing admins to do math at install time. -At the same time, the duplicate block alignment constraint can be as low -as 4K, allowing efficient deduplication of files with narrowly-aligned -duplicate block offsets (e.g. compiled binaries and VM/disk images) -even if the effective block size is much larger. - -The Bees hash table is loaded into RAM at startup (using hugepages if -available), mlocked, and synced to persistent storage by trickle-writing -over a period of several hours. This avoids issues related to seeking -or fragmentation, and enables the hash table to be efficiently stored -on Btrfs with compression (or an ext4 filesystem, or a raw disk, or -on CIFS...). - -Once a duplicate block is identified, Bees examines the nearby blocks -in the files where block appears. This allows Bees to find long runs -of adjacent duplicate block pairs if it has an entry for any one of -the blocks in its hash table. The stored hash entry plus the block -recently scanned from disk form a duplicate pair. On typical data sets, -this means most of the blocks in the hash table are redundant and can -be discarded without significant performance impact. - -Hash table entries are grouped together into LRU lists. As each block -is scanned, its hash table entry is inserted into the LRU list at a -random position. 
If the LRU list is full, the entry at the end of the -list is deleted. If a hash table entry is used to discover duplicate -blocks, the entry is moved to the beginning of the list. This makes Bees -unable to detect a small number of duplicates (less than 1% on typical -filesystems), but it dramatically improves efficiency on filesystems -with many small files. Bees has found a net 13% more duplicate bytes -than a naive fixed-block-size algorithm with a 64K block size using the -same size of hash table, even after discarding 1% of the duplicate bytes. - -Hash Table Sizing ------------------ - -Hash table entries are 16 bytes each (64-bit hash, 52-bit block number, -and some metadata bits). Each entry represents a minimum of 4K on disk. - - unique data size hash table size average dedup block size - 1TB 4GB 4K - 1TB 1GB 16K - 1TB 256MB 64K - 1TB 16MB 1024K - 64TB 1GB 1024K - -To change the size of the hash table, use 'truncate' to change the hash -table size, delete `beescrawl.dat` so that bees will start over with a -fresh full-filesystem rescan, and restart `bees`. - -Things You Might Expect That Bees Doesn't Have ----------------------------------------------- - -* There's no configuration file (patches welcome!). There are some tunables -hardcoded in the source that could eventually become configuration options. -There's also an incomplete option parser (patches welcome!). - -* There's no way to *stop* the Bees daemon. Use SIGKILL, SIGTERM, or -Ctrl-C for now. Some of the destructors are unreachable and have never -been tested. Bees will repeat some work when restarted. - -* The Bees process doesn't fork and writes its log to stdout/stderr. -A shell wrapper is required to make it behave more like a daemon. - -* There's no facility to exclude any part of a filesystem (patches -welcome). - -* PREALLOC extents and extents containing blocks filled with zeros will -be replaced by holes unconditionally. - -* Duplicate block groups that are less than 12K in length can take 30% -of the run time while saving only 3% of the disk space. There should -be an option to just not bother with those. - -* There is a lot of duplicate reading of blocks in snapshots. Bees will -scan all snapshots at close to the same time to try to get better -performance by caching, but really fixing this requires rewriting the -crawler to scan the btrfs extent tree directly instead of the subvol -FS trees. - -* Block reads are currently more allocation- and CPU-intensive than they -should be, especially for filesystems on SSD where the IO overhead is -much smaller. This is a problem for power-constrained environments -(e.g. laptops with slow CPU). - -* Bees can currently fragment extents when required to remove duplicate -blocks, but has no defragmentation capability yet. When possible, Bees -will attempt to work with existing extent boundaries, but it will not -aggregate blocks together from multiple extents to create larger ones. - -* It is possible to resize the hash table without starting over with -a new full-filesystem scan; however, this has not been implemented yet. 
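That resize limitation aside, the truncate-based resize procedure from the
'Hash Table Sizing' section above is easy to script. A minimal sketch,
assuming bees has already been stopped and `/some/path` stands in for the
real `BEESHOME` (the 2GB size is arbitrary):

    # Resize the hash table (the size must remain a multiple of 16M),
    # then delete the crawl state so bees starts a fresh
    # full-filesystem scan when restarted.
    export BEESHOME=/some/path
    truncate -s 2g "$BEESHOME/beeshash.dat"
    rm "$BEESHOME/beescrawl.dat"
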
- -Good Btrfs Feature Interactions -------------------------------- - -Bees has been tested in combination with the following: - -* btrfs compression (zlib, lzo, zstd), mixtures of compressed and uncompressed extents -* PREALLOC extents (unconditionally replaced with holes) -* HOLE extents and btrfs no-holes feature -* Other deduplicators, reflink copies (though Bees may decide to redo their work) -* btrfs snapshots and non-snapshot subvols (RW and RO) -* Concurrent file modification (e.g. PostgreSQL and sqlite databases, build daemons) -* all btrfs RAID profiles (people ask about this, but it's irrelevant to bees) -* IO errors during dedup (read errors will throw exceptions, Bees will catch them and skip over the affected extent) -* Filesystems mounted *with* the flushoncommit option -* 4K filesystem data block size / clone alignment -* 64-bit and 32-bit host CPUs (amd64, x86, arm) -* Large (>16M) extents -* Huge files (>1TB--although Btrfs performance on such files isn't great in general) -* filesystems up to 25T bytes, 100M+ files -* btrfs receive -* btrfs nodatacow/nodatasum inode attribute or mount option (bees skips all nodatasum files) -* open(O_DIRECT) (seems to work as well--or as poorly--with bees as with any other btrfs feature) - -Bad Btrfs Feature Interactions ------------------------------- - -Bees has been tested in combination with the following, and various problems are known: - -* bcache, lvmcache: *severe (filesystem-destroying) metadata corruption - issues* observed in testing and reported by users, apparently only when - used with bees. Plain SSD and HDD seem to be OK. -* btrfs send: sometimes aborts with an I/O error when bees changes the - data layout during a send. The send can be restarted and will work - if bees has finished processing the snapshot being sent. No data - corruption observed other than the truncated send. -* btrfs qgroups: very slow, sometimes hangs -* btrfs autodefrag mount option: hangs and high CPU usage problems - reported by users. bees cannot distinguish autodefrag activity from - normal filesystem activity and will likely try to undo the autodefrag, - so it should probably be turned off for bees in any case. - -Untested Btrfs Feature Interactions ------------------------------------ - -Bees has not been tested with the following, and undesirable interactions may occur: - -* Non-4K filesystem data block size (should work if recompiled) -* Non-equal hash (SUM) and filesystem data block (CLONE) sizes (probably never will work) -* btrfs seed filesystems (does anyone even use those?) -* btrfs out-of-tree kernel patches (e.g. in-band dedup or encryption) -* btrfs-convert from ext2/3/4 (never tested, might run out of space or ignore significant portions of the filesystem due to sanity checks) -* btrfs mixed block groups (don't know a reason why it would *not* work, but never tested) -* Filesystems mounted *without* the flushoncommit option (don't know the impact of crashes during dedup writes vs. ordinary writes) - -Other Caveats -------------- - -* btrfs balance will invalidate parts of the dedup hash table. Bees will - happily rebuild the table, but it will have to scan all the blocks - again. - -* btrfs defrag will cause Bees to rescan the defragmented file. If it - contained duplicate blocks and other references to the original - fragmented duplicates still exist, Bees will replace the defragmented - extents with the original fragmented ones. 
- -* Bees creates temporary files (with O_TMPFILE) and uses them to split - and combine extents elsewhere in btrfs. These will take up to 2GB - of disk space per thread during normal operation. - -* Like all deduplicators, Bees will replace data blocks with metadata - references. It is a good idea to ensure there is sufficient unallocated - space (see `btrfs fi usage`) on the filesystem to allow the metadata - to multiply in size by the number of snapshots before running Bees - for the first time. Use - - btrfs balance start -dusage=100,limit=N /your/filesystem - - where the `limit` parameter 'N' should be calculated as follows: - - * start with the current size of metadata usage (from `btrfs fi - df`) in GB, plus 1 - - * multiply by the proportion of disk space in subvols with - snapshots (i.e. if there are no snapshots, multiply by 0; - if all of the data is shared between at least one origin - and one snapshot subvol, multiply by 1) - - * multiply by the number of snapshots (i.e. if there is only - one subvol, multiply by 0; if there are 3 snapshots and one - origin subvol, multiply by 3) - - `limit = GB_metadata * (disk_space_in_snapshots / total_disk_space) * number_of_snapshots` - - Monitor unallocated space to ensure that the filesystem never runs out - of metadata space (whether Bees is running or not--this is a general - btrfs requirement). - - -A Brief List Of Btrfs Kernel Bugs ---------------------------------- - -Missing features (usually not available in older LTS kernels): - -* 3.13: `FILE_EXTENT_SAME` ioctl added. No way to reliably dedup with - concurrent modifications before this. -* 3.16: `SEARCH_V2` ioctl added. Bees could use `SEARCH` instead. -* 4.2: `FILE_EXTENT_SAME` no longer updates mtime, can be used at EOF. - -Future features (kernel features Bees does not yet use, but may rely on -in the future): - -* 4.14: `LOGICAL_INO_V2` allows userspace to create forward and backward - reference maps to entire physical extents with a single ioctl call, - and raises the limit of 2730 references per extent. Bees has not yet - been rewritten to take full advantage of these features. - -Bug fixes (sometimes included in older LTS kernels): - -* Bugs fixed prior to 4.4.107 are not listed here. -* 4.5: hang in the `INO_PATHS` ioctl used by Bees. -* 4.5: use-after-free in the `FILE_EXTENT_SAME` ioctl used by Bees. -* 4.6: lost inodes after a rename, crash, and log tree replay - (triggered by the fsync() while writing `beescrawl.dat`). -* 4.7: *slow backref* bug no longer triggers a softlockup panic. It still - takes too long to resolve a block address to a root/inode/offset triple. -* 4.10: reduced CPU time cost of the LOGICAL_INO ioctl and dedup - backref processing in general. -* 4.11: yet another dedup deadlock case is fixed. Alas, it is not the - last one. -* 4.14: backref performance improvements make LOGICAL_INO even faster - in the worst cases (but possibly slower in the best cases?). -* 4.14.29: WARN_ON(ref->count < 0) in fs/btrfs/backref.c triggers - almost once per second. The WARN_ON is incorrect and can be removed. - -Unfixed kernel bugs (as of 4.14.34) with workarounds in Bees: - -* *Deadlocks* in the kernel dedup ioctl when files are modified - immediately before dedup. `BeesTempFile::make_copy` calls `fsync()` - immediately before dedup to work around this. If the `fsync()` is - removed, the filesystem hangs within a few hours, requiring a reboot - to recover. 
Even with the `fsync()`, it is possible to lose the - kernel race condition and encounter a deadlock within a machine-year. - VM image workloads may trigger this faster. Over the past years - several specific deadlock cases have been fixed, but at least one - remains. - -* *Bad interactions* with other Linux block layers: bcache and lvmcache - can fail spectacularly, and apparently only while running bees. - This is definitely a kernel bug, either in btrfs or the lower block - layers. Avoid using bees with these tools, or test very carefully - before deployment. - -* *slow backrefs* (aka toxic extents): If the number of references to a - single shared extent within a single file grows above a few thousand, - the kernel consumes CPU for minutes at a time while holding various - locks that block access to the filesystem. Bees avoids this bug by - measuring the time the kernel spends performing certain operations - and permanently blacklisting any extent or hash where the kernel - starts to get slow. Inside Bees, such blocks are marked as 'toxic' - hash/block addresses. Linux kernel v4.14 is better but can still - have problems. - -* `LOGICAL_INO` output is arbitrarily limited to 2730 references - even if more buffer space is provided for results. Once this number - has been reached, Bees can no longer replace the extent since it can't - find and remove all existing references. Bees refrains from adding - any more references after the first 2560. Offending blocks are - marked 'toxic' even if there is no corresponding performance problem. - This places an obvious limit on dedup efficiency for extremely common - blocks or filesystems with many snapshots (although this limit is - far greater than the effective limit imposed by the *slow backref* bug). - *Fixed in v4.14.* - -* `LOGICAL_INO` on compressed extents returns a list of root/inode/offset - tuples matching the extent bytenr of its argument. On uncompressed - extents, any r/i/o tuple whose extent offset does not match the - argument's extent offset is discarded, i.e. only the single 4K block - matching the argument is returned, so a complete map of the extent - references requires calling `LOGICAL_INO` for every single block of - the extent. This is undesirable behavior for Bees, which wants a - list of all extent refs referencing a data extent (i.e. Bees wants - the compressed-extent behavior in all cases). *Fixed in v4.14.* - -* `FILE_EXTENT_SAME` is arbitrarily limited to 16MB. This is less than - 128MB which is the maximum extent size that can be created by defrag - or prealloc. Bees avoids feedback loops this can generate while - attempting to replace extents over 16MB in length. - -Not really bugs, but gotchas nonetheless: - -* If a process holds a directory FD open, the subvol containing the - directory cannot be deleted (`btrfs sub del` will start the deletion - process, but it will not proceed past the first open directory FD). - `btrfs-cleaner` will simply skip over the directory *and all of its - children* until the FD is closed. Bees avoids this gotcha by closing - all of the FDs in its directory FD cache every 10 btrfs transactions. - -* If a file is deleted while Bees is caching an open FD to the file, - Bees continues to scan the file. For very large files (e.g. VM - images), the deletion of the file can be delayed indefinitely. - To limit this delay, Bees closes all FDs in its file FD cache every - 10 btrfs transactions. 
- -* If a snapshot is deleted, bees will generate a burst of exceptions - for references to files in the snapshot that no longer exist. This - lasts until the FD caches are cleared. - -Installation -============ - -Bees can be installed by following one these instructions: - -Arch package ------------- - -Bees is available in Arch Linux AUR. Install with: - -`$ pacaur -S bees-git` - -Gentoo package --------------- - -Bees is officially available in Gentoo Portage. Just emerge a stable -version: - -`$ emerge --ask bees` - -or build a live version from git master: - -`$ emerge --ask =bees-9999` - -You can opt-out of building the support tools with - -`USE="-tools" emerge ...` - -If you want to start hacking on bees and contribute changes, just emerge -the live version which automatically pulls in all required development -packages. - -Build from source ------------------ - -Build with `make`. The build produces `bin/bees` and `lib/libcrucible.so`, -which must be copied to somewhere in `$PATH` and `$LD_LIBRARY_PATH` -on the target system respectively. - -It will also generate `scripts/beesd@.service` for systemd users. This -service makes use of a helper script `scripts/beesd` to boot the service. -Both of the latter use the filesystem UUID to mount the root subvolume -within a temporary runtime directory. - -### Ubuntu 16.04 - 17.04: -`$ apt -y install build-essential btrfs-tools uuid-dev markdown && make` - -### Ubuntu 14.04: -You can try to carry on the work done here: https://gist.github.com/dagelf/99ee07f5638b346adb8c058ab3d57492 - -Packaging +Strengths --------- -See 'Dependencies' below. Package maintainers can pick ideas for building and -configuring the source package from the Gentoo ebuild: + * Space-efficient hash table and matching algorithms - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB) + * Incremental realtime dedupe of new data using btrfs tree search + * Works with btrfs compression - dedupe any combination of compressed and uncompressed files + * Works around btrfs filesystem structure to free more disk space + * Persistent hash table for rapid restart after shutdown + * Whole-filesystem dedupe - including snapshots + * Constant hash table size - no increased RAM usage if data set becomes larger + * Works on live data - no scheduled downtime required + * Automatic self-throttling based on system load -https://github.com/gentoo/gentoo/tree/master/sys-fs/bees +Weaknesses +---------- -You can configure some build options by creating a file `localconf` and -adjust settings for your distribution environment there. + * Whole-filesystem dedupe - has no include/exclude filters, does not accept file lists + * Runs continuously as a daemon - no quick start/stop + * Requires root privilege (or `CAP_SYS_ADMIN`) + * First run may require temporary disk space for extent reorganization + * [First run may increase metadata space usage if many snapshots exist](docs/gotchas.md) + * Constant hash table size - no decreased RAM usage if data set becomes smaller + * btrfs only -Please also review the Makefile for additional hints. +Installation and Usage +---------------------- -Dependencies ------------- + * [Installation](docs/install.md) + * [Configuration](docs/config.md) + * [Running](docs/running.md) + * [Command Line Options](docs/options.md) -* C++11 compiler (tested with GCC 4.9, 6.2.0, 8.1.0) +Recommended Reading +------------------- - Sorry. I really like closures and shared_ptr, so support - for earlier compiler versions is unlikely. 
+ * [bees Gotchas](docs/gotchas.md) + * [btrfs kernel bugs](docs/btrfs-kernel.md) + * [bees vs. other btrfs features](docs/btrfs-other.md) -* btrfs-progs (tested with 4.1..4.15.1) or libbtrfs-dev - (tested with version 4.16.1) - - Needed for btrfs.h and ctree.h during compile. - Also needed by the service wrapper script. - -* libuuid-dev - - This library is only required for a feature that was removed after v0.1. - The lingering support code can be removed. - -* Linux kernel version: *minimum* 4.4.107, *4.14.29 or later recommended* - - Don't bother trying to make Bees work with kernel versions older than - 4.4.107. It may appear to work, but it won't end well: there are - too many missing features and bugs (including data corruption bugs) - to work around in older kernels. - - Kernel versions between 4.4.107 and 4.14.29 are usable with bees, - but bees can trigger known performance bugs and hangs in dedup-related - functions. - -* markdown - -* util-linux version that provides `blkid` command for the helper - script `scripts/beesd` to work - -Setup ------ - -If you don't want to use the helper script `scripts/beesd` to setup and -configure bees, here's how you manually setup bees. - -Create a directory for bees state files: - - export BEESHOME=/some/path - mkdir -p "$BEESHOME" - -Create an empty hash table (your choice of size, but it must be a multiple -of 16M). This example creates a 1GB hash table: - - truncate -s 1g "$BEESHOME/beeshash.dat" - chmod 700 "$BEESHOME/beeshash.dat" - -bees can only process the root subvol of a btrfs (seriously--if the -argument is not the root subvol directory, Bees will just throw an -exception and stop). - -Use a bind mount, and let only bees access it: - - UUID=3399e413-695a-4b0b-9384-1b0ef8f6c4cd - mkdir -p /var/lib/bees/$UUID - mount /dev/disk/by-uuid/$UUID /var/lib/bees/$UUID -osubvol=/ - -If you don't set BEESHOME, the path ".beeshome" will be used relative -to the root subvol of the filesystem. For example: - - btrfs sub create /var/lib/bees/$UUID/.beeshome - truncate -s 1g /var/lib/bees/$UUID/.beeshome/beeshash.dat - chmod 700 /var/lib/bees/$UUID/.beeshome/beeshash.dat - -You can use any relative path in BEESHOME. The path will be taken -relative to the root of the deduped filesystem (in other words it can -be the name of a subvol): - - export BEESHOME=@my-beeshome - btrfs sub create /var/lib/bees/$UUID/$BEESHOME - truncate -s 1g /var/lib/bees/$UUID/$BEESHOME/beeshash.dat - chmod 700 /var/lib/bees/$UUID/$BEESHOME/beeshash.dat - -Configuration -------------- - -There are some runtime configurable options using environment variables: - -* BEESHOME: Directory containing Bees state files: - * beeshash.dat | persistent hash table. Must be a multiple of 16M. - This contains 16-byte records: 8 bytes for CRC64, - 8 bytes for physical address and some metadata bits. - * beescrawl.dat | state of SEARCH_V2 crawlers. ASCII text. - * beesstats.txt | statistics and performance counters. ASCII text. -* BEESSTATUS: File containing a snapshot of current Bees state: performance - counters and current status of each thread. The file is meant to be - human readable, but understanding it probably requires reading the source. - You can watch bees run in realtime with a command like: - - watch -n1 cat $BEESSTATUS - -Other options (e.g. interval between filesystem crawls) can be configured -in src/bees.h or on the cmdline (see 'Command Line Options' below). 
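To make the `BEESSTATUS` mechanism above concrete, here is a launch sketch
(paths are illustrative, and the UUID is the example value from the setup
steps; the root subvol is assumed to be already mounted as shown there):

    # Run bees against the mounted root subvol with an explicit state
    # directory and status file, then watch the status snapshot update.
    export BEESHOME=/some/path
    export BEESSTATUS=/run/bees.status
    UUID=3399e413-695a-4b0b-9384-1b0ef8f6c4cd
    bees /var/lib/bees/$UUID >> /var/log/bees.log 2>&1 &
    watch -n1 cat "$BEESSTATUS"
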
- -Running -------- - -Reduce CPU and IO priority to be kinder to other applications sharing -this host (or raise them for more aggressive disk space recovery). If you -use cgroups, put `bees` in its own cgroup, then reduce the `blkio.weight` -and `cpu.shares` parameters. You can also use `schedtool` and `ionice` -in the shell script that launches `bees`: - - schedtool -D -n20 $$ - ionice -c3 -p $$ - -Let the bees fly: - - for fs in /var/lib/bees/*-*-*-*-*/; do - bees "$fs" >> "$fs/.beeshome/bees.log" 2>&1 & - done - -You'll probably want to arrange for /var/log/bees.log to be rotated -periodically. You may also want to set umask to 077 to prevent disclosure -of information about the contents of the filesystem through the log file. - -There are also some shell wrappers in the `scripts/` directory. - - - -Command Line Options --------------------- - -* --thread-count (-c) COUNT - * Specify maximum number of worker threads for scanning. Overrides - --thread-factor (-C) and default/autodetected values. -* --thread-factor (-C) FACTOR - * Specify ratio of worker threads to CPU cores. Overridden by --thread-count (-c). - Default is 1.0, i.e. 1 worker thread per detected CPU. Use values - below 1.0 to leave some cores idle, or above 1.0 if there are more - disks than CPUs in the filesystem. -* --loadavg-target (-g) LOADAVG - * Specify load average target for dynamic worker threads. - Threads will be started or stopped subject to the upper limit imposed - by thread-factor, thread-min and thread-count until the load average - is within +/- 0.5 of LOADAVG. -* --thread-min (-G) COUNT - * Specify minimum number of worker threads for scanning. - Ignored unless -g option is used to specify a target load. - -* --scan-mode (-m) MODE - * Specify extent scanning algorithm. Default mode is 0. - _EXPERIMENTAL_ feature that may go away. - * Mode 0: scan extents in ascending order of (inode, subvol, offset). - Keeps shared extents between snapshots together. Reads files sequentially. - Minimizes temporary space usage. - * Mode 1: scan extents from all subvols in parallel. Good performance - on non-spinning media when subvols are unrelated. - * Mode 2: scan all extents from one subvol at a time. Good sequential - read performance for spinning media. Maximizes temporary space usage. - -* --timestamps (-t) - * Enable timestamps in log output. -* --no-timestamps (-T) - * Disable timestamps in log output. -* --absolute-paths (-p) - * Paths in log output will be absolute. -* --strip-paths (-P) - * Paths in log output will have the working directory at Bees startup - stripped. -* --verbose (-v) - * Set log verbosity (0 = no output, 8 = all output, default 8). +More Information +---------------- + * [How bees works](docs/how-it-works.md) + * [Missing bees features](docs/missing.md) Bug Reports and Contributions ----------------------------- @@ -596,11 +65,9 @@ You can also use Github: https://github.com/Zygo/bees - - Copyright & License -=================== +------------------- -Copyright 2015-2017 Zygo Blaxell . +Copyright 2015-2018 Zygo Blaxell . GPL (version 3 or later). 
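The `make doc` target added by this patch drives the per-file rules in
`docs/Makefile`, which appears in the next hunk. A quick sketch of building
and viewing the local HTML tree (assuming one of the markdown renderers
probed by the Makefile is installed; `xdg-open` is just an example viewer):

    # Build the local HTML documentation tree from the markdown
    # sources in docs/, then open the landing page in a browser.
    make doc
    xdg-open docs/index.html
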
diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..2d19fc7 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..2a3cdf7 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,8 @@ +MARKDOWN := $(firstword $(shell type -P markdown markdown2 markdown_py 2>/dev/null || echo markdown)) +.PHONY: docs + +docs: $(subst .md,.html,$(wildcard *.md)) ../README.html + +%.html: %.md + $(MARKDOWN) $< | sed -e 's/\.md/\.html/g' > $@.new + mv -f $@.new $@ diff --git a/docs/_config.yml b/docs/_config.yml new file mode 100644 index 0000000..277f1f2 --- /dev/null +++ b/docs/_config.yml @@ -0,0 +1 @@ +theme: jekyll-theme-cayman diff --git a/docs/btrfs-kernel.md b/docs/btrfs-kernel.md new file mode 100644 index 0000000..f111a85 --- /dev/null +++ b/docs/btrfs-kernel.md @@ -0,0 +1,56 @@ +Recommended kernel version +========================== + +Linux **4.14.34** or later. + +A Brief List Of Btrfs Kernel Bugs +--------------------------------- + +Recent kernel bug fixes: + +* 4.14.29: `WARN_ON(ref->count < 0)` in fs/btrfs/backref.c triggers + almost once per second. The `WARN_ON` is incorrect, and is now removed. + +Unfixed kernel bugs (as of 4.14.71): + +* **Bad _filesystem destroying_ interactions** with other Linux block + layers: `bcache` and `lvmcache` can fail spectacularly, and apparently + only do so while running bees. This is definitely a kernel bug, + either in btrfs or the lower block layers. **Avoid using bees with + these tools unless your filesystem is disposable and you intend to + debug the kernel.** + +* **Compressed data corruption** is possible when using the `fallocate` + system call to punch holes into compressed extents that contain long + runs of zeros. The [bug results in intermittent corruption during + reads](https://www.spinics.net/lists/linux-btrfs/msg81293.html), but + due to the bug, the kernel might sometimes mistakenly determine data + is duplicate, and deduplication will corrupt the data permanently. + This bug also affects compressed `kvm` raw images with the `discard` + feature on btrfs or any compressed file where `fallocate -d` or + `fallocate -p` has been used. + +* **Deadlock** when [simultaneously using the same files in dedupe and + `rename`](https://www.spinics.net/lists/linux-btrfs/msg81109.html). + There is no way for bees to reliably know when another process is + about to rename a file while bees is deduping it. In the `rsync` case, + bees will dedupe the new file `rsync` is creating using the old file + `rsync` is copying from, while `rsync` will rename the new file over + the old file to replace it. + +Minor kernel problems with workarounds: + +* **Slow backrefs** (aka toxic extents): If the number of references to a + single shared extent within a single file grows above a few thousand, + the kernel consumes CPU for minutes at a time while holding various + locks that block access to the filesystem. bees avoids this bug + by measuring the time the kernel spends performing `LOGICAL_INO` + operations and permanently blacklisting any extent or hash involved + where the kernel starts to get slow. Inside bees, such blocks are + known as 'toxic' hash/block addresses. + +* **`FILE_EXTENT_SAME` is arbitrarily limited to 16MB**. This is + less than 128MB which is the maximum extent size that can be created + by defrag, prealloc, or filesystems without the `compress-force` + mount option. 
bees avoids feedback loops this can generate while + attempting to replace extents over 16MB in length. diff --git a/docs/btrfs-other.md b/docs/btrfs-other.md new file mode 100644 index 0000000..a89cb09 --- /dev/null +++ b/docs/btrfs-other.md @@ -0,0 +1,53 @@ +Good Btrfs Feature Interactions +------------------------------- + +bees has been tested in combination with the following: + +* btrfs compression (zlib, lzo, zstd), mixtures of compressed and uncompressed extents +* PREALLOC extents (unconditionally replaced with holes) +* HOLE extents and btrfs no-holes feature +* Other deduplicators, reflink copies (though bees may decide to redo their work) +* btrfs snapshots and non-snapshot subvols (RW and RO) +* Concurrent file modification (e.g. PostgreSQL and sqlite databases, build daemons) +* all btrfs RAID profiles +* IO errors during dedupe (read errors will throw exceptions, bees will catch them and skip over the affected extent) +* Filesystems mounted *with* the flushoncommit option (system crashes, power failures OK) +* 4K filesystem data block size / clone alignment +* 64-bit and 32-bit host CPUs (amd64, x86, arm) +* Huge files (>1TB--although Btrfs performance on such files isn't great in general) +* filesystems up to 30T+ bytes, 100M+ files +* btrfs receive +* btrfs nodatacow/nodatasum inode attribute or mount option (bees skips all nodatasum files) +* open(O_DIRECT) (seems to work as well--or as poorly--with bees as with any other btrfs feature) + +Bad Btrfs Feature Interactions +------------------------------ + +bees has been tested in combination with the following, and various problems are known: + +* bcache, lvmcache: **severe (filesystem-destroying) metadata corruption + issues** observed in testing and reported by users, apparently only when + used with bees. Plain SSD and HDD seem to be OK. +* btrfs send: some kernel versions have bugs in btrfs send that can be + triggered by bees. The send can be restarted and will work if bees + has finished processing the snapshot being sent. No data corruption + observed other than the truncated send. +* btrfs qgroups: very slow, sometimes hangs...and it's even worse when + bees is running. +* btrfs autodefrag mount option: hangs and high CPU usage problems + reported by users. bees cannot distinguish autodefrag activity from + normal filesystem activity and will likely try to undo the autodefrag + if duplicate copies of the defragmented data exist. + +Untested Btrfs Feature Interactions +----------------------------------- + +bees has not been tested with the following, and undesirable interactions may occur: + +* Non-4K filesystem data block size (should work if recompiled) +* Non-equal hash (SUM) and filesystem data block (CLONE) sizes (need to fix that eventually) +* btrfs seed filesystems (does anyone even use those?) +* btrfs out-of-tree kernel patches (e.g. in-kernel dedupe or encryption) +* btrfs-convert from ext2/3/4 (never tested, might run out of space or ignore significant portions of the filesystem due to sanity checks) +* btrfs mixed block groups (don't know a reason why it would *not* work, but never tested) +* Filesystems mounted *without* the flushoncommit option (don't know the data integrity impact of crashes during dedupe writes vs. ordinary writes) diff --git a/docs/config.md b/docs/config.md new file mode 100644 index 0000000..11b4497 --- /dev/null +++ b/docs/config.md @@ -0,0 +1,151 @@ +bees Configuration +================== + +The only configuration parameter that *must* be provided is the hash +table size. 
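For example, a minimal hand-setup sketch, adapted from the setup steps this
patch removes from the README (the 1GB size and `/some/path` location are
illustrative):

    # bees requires the hash table file to exist before startup,
    # and its size must be a multiple of 16M.
    export BEESHOME=/some/path
    mkdir -p "$BEESHOME"
    truncate -s 1g "$BEESHOME/beeshash.dat"
    chmod 700 "$BEESHOME/beeshash.dat"
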
Other parameters are optional or hardcoded, and the defaults +are reasonable in most cases. + +Hash Table Sizing +----------------- + +Hash table entries are 16 bytes per data block. The hash table stores +the most recently read unique hashes. Once the hash table is full, +each new entry in the table evicts an old entry. + +Here are some numbers to estimate appropriate hash table sizes: + + unique data size | hash table size |average dedupe extent size + 1TB | 4GB | 4K + 1TB | 1GB | 16K + 1TB | 256MB | 64K + 1TB | 128MB | 128K <- recommended + 1TB | 16MB | 1024K + 64TB | 1GB | 1024K + +Notes: + + * If the hash table is too large, no extra dedupe efficiency is +obtained, and the extra space just wastes RAM. Extra space can also slow +bees down by preventing old data from being evicted, so bees wastes time +looking for matching data that is no longer present on the filesystem. + + * If the hash table is too small, bees extrapolates from matching +blocks to find matching adjacent blocks in the filesystem that have been +evicted from the hash table. In other words, bees only needs to find +one block in common between two extents in order to be able to dedupe +the entire extents. This provides significantly more dedupe hit rate +per hash table byte than other dedupe tools. + + * When counting unique data in compressed data blocks to estimate +optimum hash table size, count the *uncompressed* size of the data. + + * Another way to approach the hash table size is to simply decide how much +RAM can be spared without too much discomfort, give bees that amount of +RAM, and accept whatever dedupe hit rate occurs as a result. bees will +do the best job it can with the RAM it is given. + +Factors affecting optimal hash table size +----------------------------------------- + +It is difficult to predict the net effect of data layout and access +patterns on dedupe effectiveness without performing deep inspection of +both the filesystem data and its structure--a task that is as expensive +as performing the deduplication. + +* **Compression** on the filesystem reduces the average extent length +compared to uncompressed filesystems. The maximum compressed extent +length on btrfs is 128KB, while the maximum uncompressed extent length +is 128MB. Longer extents decrease the optimum hash table size while +shorter extents increase the optimum hash table size because the +probability of a hash table entry being present (i.e. unevicted) in +each extent is proportional to the extent length. + + As a rule of thumb, the optimal hash table size for a compressed +filesystem is 2-4x larger than the optimal hash table size for the same +data on an uncompressed filesystem. Dedupe efficiency falls dramatically +with hash tables smaller than 128MB/TB as the average dedupe extent size +is larger than the largest possible compressed extent size (128KB). + +* **Short writes** also shorten the average extent length and increase +optimum hash table size. If a database writes to files randomly using +4K page writes, all of these extents will be 4K in length, and the hash +table size must be increased to retain each one (or the user must accept +a lower dedupe hit rate). + + Defragmenting files that have had many short writes increases the +extent length and therefore reduces the optimum hash table size. + +* **Time between duplicate writes** also affects the optimum hash table +size. 
bees reads data blocks in logical order during its first pass, +and after that new data blocks are read incrementally a few seconds or +minutes after they are written. bees finds more matching blocks if there +is a smaller amount of data between the matching reads, i.e. there are +fewer blocks evicted from the hash table. If most identical writes to +the filesystem occur near the same time, the optimum hash table size is +smaller. If most identical writes occur over longer intervals of time, +the optimum hash table size must be larger to avoid evicting hashes from +the table before matches are found. + + For example, a build server normally writes out very similar source +code files over and over, so it will need a smaller hash table than a +backup server which has to refer to the oldest data on the filesystem +every time a new client machine's data is added to the server. + +Scanning modes for multiple subvols +----------------------------------- + +The `--scan-mode` option affects how bees divides resources between +subvolumes. This is particularly relevant when there are snapshots, +as there are tradeoffs to be made depending on how snapshots are used +on the filesystem. + +Note that if a filesystem has only one subvolume (i.e. the root, +subvol ID 5) then the `--scan-mode` option has no effect, as there is +only one subvolume to scan. + +The default mode is mode 0, "lockstep". In this mode, each inode of each +subvol is scanned at the same time, before moving to the next inode in +each subvol. This maximizes the likelihood that all of the references to +a snapshot of a file are scanned at the same time, which takes advantage +of VFS caching in the Linux kernel. If snapshots are created very often, +bees will not make very good progress as it constantly restarts the +filesystem scan from the beginning each time a new snapshot is created. + +Scan mode 1, "independent", simply scans every subvol independently +in parallel. Each subvol's scanner shares time equally with all other +subvol scanners. Whenever a new subvol appears, a new scanner is +created and the new subvol scanner doesn't affect the behavior of any +existing subvol scanner. + +Scan mode 2, "sequential", processes each subvol completely before +proceeding to the next subvol. This is a good mode when using bees for +the first time on a filesystem that already has many existing snapshots +and a high rate of new snapshot creation. Short-lived snapshots +(e.g. those used for `btrfs send`) are effectively ignored, and bees +directs its efforts toward older subvols that are more likely to be +origin subvols for snapshots. By deduping origin subvols first, bees +ensures that future snapshots will already be deduplicated and do not +need to be deduplicated again. + +If you are using bees for the first time on a filesystem with many +existing snapshots, you should read about [snapshot gotchas](gotchas.md). + +Threads and load management +--------------------------- + +By default, bees creates one worker thread for each CPU detected. +These threads then perform scanning and dedupe operations. The number of +worker threads can be set with the [`--thread-count` and `--thread-factor` +options](options.md). + +If desired, bees can automatically increase or decrease the number +of worker threads in response to system load. This reduces impact on +the rest of the system by pausing bees when other CPU and IO intensive +loads are active on the system, and resumes bees when the other loads +are inactive. 
This is configured with the [`--loadavg-target` and +`--thread-min` options](options.md). + +Log verbosity +------------- + +bees can be made less chatty with the [`--verbose` option](options.md). diff --git a/docs/gotchas.md b/docs/gotchas.md new file mode 100644 index 0000000..2712ac7 --- /dev/null +++ b/docs/gotchas.md @@ -0,0 +1,113 @@ +bees Gotchas +============ + +Snapshots +--------- + +bees can dedupe filesystems with many snapshots, but bees only does +well in this situation if bees was running on the filesystem from +the beginning. + +Each time bees dedupes an extent that is referenced by a snapshot, +the entire metadata page in the snapshot subvol (16KB by default) must +be CoWed in btrfs. This can result in a substantial increase in btrfs +metadata size if there are many snapshots on a filesystem. + +Normally, metadata is small (less than 1% of the filesystem) and dedupe +hit rates are large (10-40% of the filesystem), so the increase in +metadata size is offset by much larger reductions in data size and the +total space used by the entire filesystem is reduced. + +If a subvol is deduped _before_ a snapshot is created, the snapshot will +have the same deduplication as the subvol. This does _not_ result in +unusually large metadata sizes. If a snapshot is made after bees has +fully scanned the origin subvol, bees can avoid scanning most of the +data in the snapshot subvol, as it will be provably identical to the +origin subvol that was already scanned. + +If a subvol is deduped _after_ a snapshot is created, the origin and +snapshot subvols must be deduplicated separately. In the worst case, this +will double the amount of reading the bees scanner must perform, and will +also double the amount of btrfs metadata used for the snapshot; however, +the "worst case" is a dedupe hit rate of 1% or more, so a doubling of +metadata size is certain for all but the most unique data sets. Also, +bees will not be able to free any space until the last snapshot has been +scanned and deduped, so payoff in data space savings is deferred until +the metadata has almost finished expanding. + +If a subvol is deduped after _many_ snapshots have been created, all +subvols must be deduplicated individually. In the worst case, this will +multiply the scanning work and metadata size by the number of snapshots. +For 100 snapshots this can mean a 100x growth in metadata size and +bees scanning time, which typically exceeds the possible savings from +reducing the data size by dedupe. In such cases using bees will result +in a net increase in disk space usage that persists until the snapshots +are deleted. + +Snapshot case studies +--------------------- + + * bees running on an empty filesystem + * filesystem is mkfsed + * bees is installed and starts running + * data is written to the filesystem + * bees dedupes the data as it appears + * a snapshot is made of the data + * The snapshot will already be 99% deduped, so the metadata will + not expand very much because only 1% of the data in the snapshot + must be deduped. + * more snapshots are made of the data + * as long as dedupe has been completed on the origin subvol, + bees will quickly scan each new snapshot because it can skip + all the previously scanned data. Metadata usage remains low + (it may even shrink because there are fewer csums). 
+
+ * bees installed on a non-empty filesystem with snapshots
+   * filesystem is mkfsed
+   * data is written to the filesystem
+   * multiple snapshots are made of the data
+   * bees is installed and starts running
+   * bees dedupes each snapshot individually
+     * The snapshot metadata will no longer be shared, resulting in
+       substantial growth of metadata usage.
+     * Disk space savings do not occur until bees processes the
+       last snapshot reference to data.
+
+
+Other Gotchas
+-------------
+
+* bees avoids the [slow backrefs kernel bug](btrfs-kernel.md) by
+  measuring the time required to perform `LOGICAL_INO` operations.
+  If an extent requires over 10 seconds to perform a `LOGICAL_INO`
+  then bees blacklists the extent and avoids referencing it in future
+  operations.  In most cases, fewer than 0.1% of extents in a filesystem
+  must be avoided this way.  This results in short write-latency spikes
+  of ten seconds or slightly more, as btrfs will not allow writes to the
+  filesystem while `LOGICAL_INO` is running.  Generally the CPU spends
+  most of the runtime of the `LOGICAL_INO` ioctl executing kernel code,
+  so on a single-core CPU the entire system can freeze up for a few
+  seconds at a time.
+
+* Load managers that send a `SIGSTOP` to the bees process to throttle
+  CPU usage may affect the `LOGICAL_INO` timing mechanism, causing extents
+  to be incorrectly labelled 'toxic'.  This will cause a small reduction
+  of dedupe hit rate.  Slow and heavily loaded disks can trigger the same
+  effect if `LOGICAL_INO` takes too long due to IO latency.
+
+* If a process holds a directory FD open, the subvol containing the
+  directory cannot be deleted (`btrfs sub del` will start the deletion
+  process, but it will not proceed past the first open directory FD).
+  `btrfs-cleaner` will simply skip over the directory *and all of its
+  children* until the FD is closed.  bees avoids this gotcha by closing
+  all of the FDs in its directory FD cache every 10 btrfs transactions.
+
+* If a file is deleted while bees is caching an open FD to the file,
+  bees continues to scan the file.  For very large files (e.g. VM
+  images), the deletion of the file can be delayed indefinitely.
+  To limit this delay, bees closes all FDs in its file FD cache every
+  10 btrfs transactions.
+
+* If a snapshot is deleted, bees will generate a burst of exceptions
+  for references to files in the snapshot that no longer exist.  This
+  lasts until the FD caches are cleared.
diff --git a/docs/how-it-works.md b/docs/how-it-works.md
new file mode 100644
index 0000000..b3fa62c
--- /dev/null
+++ b/docs/how-it-works.md
@@ -0,0 +1,100 @@
+How bees Works
+--------------
+
+bees is a daemon designed to run continuously and maintain its state
+across crashes and reboots.
+
+bees uses checkpoints for persistence to eliminate the IO overhead of a
+transactional data store.  On restart, bees will dedupe any data that
+was added to the filesystem since the last checkpoint.  Checkpoints
+occur every 15 minutes for scan progress, stored in `beescrawl.dat`.
+The hash table trickle-writes to disk at 4GB/hour to `beeshash.dat`.
+An hourly performance report is written to `beesstats.txt`.  There are
+no special requirements for bees hash table storage--`.beeshome` could
+be stored on a different btrfs filesystem, ext4, or even CIFS.
+
+bees uses a persistent dedupe hash table with a fixed size configured
+by the user.  Any size of hash table can be dedicated to dedupe.  If a
+fast dedupe with low hit rate is desired, bees can use a hash table as
+small as 16MB.
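
To put that 16MB lower bound in perspective, here is a back-of-the-envelope
sketch using the 16-bytes-per-entry figure from the configuration page (the
numbers are plain arithmetic, not measurements):

    # A 16MB table holds 1Mi 16-byte entries; at one entry per 4K
    # block, that tracks at least 4GiB of unique data.  Larger average
    # dedupe extent sizes stretch the same table across much more data
    # (see the sizing table in docs/config.md).
    table_bytes=$((16 * 1024 * 1024))
    entries=$((table_bytes / 16))
    min_coverage_bytes=$((entries * 4096))
    echo "$entries entries covering at least $min_coverage_bytes bytes"
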
+
+The bees hash table is loaded into RAM at startup and `mlock`ed so it
+will not be swapped out by the kernel (if swap is permitted, performance
+degrades to nearly zero).
+
+bees scans the filesystem in a single pass which removes duplicate
+extents immediately after they are detected.  There are no distinct
+scanning and dedupe phases, so bees can start recovering free space
+immediately after startup.
+
+Once a filesystem scan has been completed, bees uses the `min_transid`
+parameter of the `TREE_SEARCH_V2` ioctl to avoid rescanning old data
+on future scans and quickly scan new data.  An incremental data scan
+can complete in less than a millisecond on an idle filesystem.
+
+Once a duplicate data block is identified, bees examines the nearby
+blocks in the files where the matched block appears.  This allows bees
+to find long runs of adjacent duplicate block pairs if it has an entry
+for any one of the blocks in its hash table.  On typical data sets,
+this means most of the blocks in the hash table are redundant and can
+be discarded without significant impact on dedupe hit rate.
+
+Hash table entries are grouped together into LRU lists.  As each block
+is scanned, its hash table entry is inserted into the LRU list at a
+random position.  If the LRU list is full, the entry at the end of the
+list is deleted.  If a hash table entry is used to discover duplicate
+blocks, the entry is moved to the beginning of the list.  This makes bees
+unable to detect a small number of duplicates, but it dramatically
+improves efficiency on filesystems with many small files.
+
+Once the hash table fills up, old entries are evicted by new entries.
+This means that the optimum hash table size is determined by the
+distance between duplicate blocks on the filesystem rather than the
+filesystem unique data size.  Even if the hash table is too small
+to find all duplicates, it may still find _most_ of them, especially
+during incremental scans where the data in many workloads tends to be
+more similar.
+
+When a duplicate block pair is found in two btrfs extents, bees will
+attempt to match all other blocks in the newer extent with blocks in
+the older extent (i.e. the goal is to keep the extent referenced in the
+hash table and remove the most recently scanned extent).  If this is
+possible, then the new extent will be replaced with a reference to the
+old extent.  If this is not possible, then bees will create a temporary
+copy of the unmatched data in the new extent so that the entire new
+extent can be removed by deduplication.  This must be done because btrfs
+cannot partially overwrite extents--the _entire_ extent must be replaced.
+The temporary copy is then scanned during the next pass bees makes over
+the filesystem for potential duplication of other extents.
+
+When a block containing all-zero bytes is found, bees dedupes the extent
+against a temporary file containing a hole, possibly creating temporary
+copies of any non-zero data in the extent for later deduplication as
+described above.  If the extent is compressed, bees avoids splitting
+the extent in the middle as this generally has a negative impact on
+compression ratio (and also triggers a [kernel bug](btrfs-kernel.md)).
+
+bees does not store any information about filesystem structure, so its
+memory and storage requirements do not grow with the number or size of
+files.  The hash table stores physical block numbers which are converted
+into paths and FDs on demand through btrfs `SEARCH_V2` and `LOGICAL_INO`
+ioctls.
+This eliminates the storage required to maintain the equivalents +of these functions in userspace, at the expense of encountering [some +kernel bugs in `LOGICAL_INO` performance](btrfs-kernel.md). + +bees uses only the data-safe `FILE_EXTENT_SAME` (aka `FIDEDUPERANGE`) +kernel operations to manipulate user data, so it can dedupe live data +(e.g. build servers, sqlite databases, VM disk images). It does not +modify file attributes or timestamps. + +When bees has scanned all of the data, bees will pause until 10 +transactions have been completed in the btrfs filesystem. bees tracks +the current btrfs transaction ID over time so that it polls less often +on quiescent filesystems and more often on busy filesystems. + +Scanning and deduplication work is performed by worker threads. If the +[`--loadavg-target` option](options.md) is used, bees adjusts the number +of worker threads up or down as required to have a user-specified load +impact on the system. The maximum and minimum number of threads is +configurable. If the system load is too high then bees will stop until +the load falls to acceptable levels. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..13c5626 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,73 @@ +BEES +==== + +Best-Effort Extent-Same, a btrfs deduplication agent. + +About bees +---------- + +bees is a block-oriented userspace deduplication agent designed for large +btrfs filesystems. It is an offline dedupe combined with an incremental +data scan capability to minimize time data spends on disk from write +to dedupe. + +Strengths +--------- + + * Space-efficient hash table and matching algorithms - can use as little as 1 GB hash table per 10 TB unique data (0.1GB/TB) + * Incremental realtime dedupe of new data using btrfs tree search + * Works with btrfs compression - dedupe any combination of compressed and uncompressed files + * Works around btrfs filesystem structure to free more disk space + * Persistent hash table for rapid restart after shutdown + * Whole-filesystem dedupe - including snapshots + * Constant hash table size - no increased RAM usage if data set becomes larger + * Works on live data - no scheduled downtime required + * Automatic self-throttling based on system load + +Weaknesses +---------- + + * Whole-filesystem dedupe - has no include/exclude filters, does not accept file lists + * Runs continuously as a daemon - no quick start/stop + * Requires root privilege (or `CAP_SYS_ADMIN`) + * First run may require temporary disk space for extent reorganization + * [First run may increase metadata space usage if many snapshots exist](gotchas.md) + * Constant hash table size - no decreased RAM usage if data set becomes smaller + * btrfs only + +Installation and Usage +---------------------- + + * [Installation](install.md) + * [Configuration](config.md) + * [Running](running.md) + * [Command Line Options](options.md) + +Recommended Reading +------------------- + + * [bees Gotchas](gotchas.md) + * [btrfs kernel bugs](btrfs-kernel.md) + * [bees vs. other btrfs features](btrfs-other.md) + +More Information +---------------- + + * [How bees works](how-it-works.md) + * [Missing bees features](missing.md) + +Bug Reports and Contributions +----------------------------- + +Email bug reports and patches to Zygo Blaxell . + +You can also use Github: + + https://github.com/Zygo/bees + +Copyright & License +------------------- + +Copyright 2015-2018 Zygo Blaxell . + +GPL (version 3 or later). 
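Tying the thread and load management description in `how-it-works.md` above
to concrete flags (a sketch; the values are arbitrary, the flag names are
taken from the command-line options list, and the UUID is the example value
used in the setup steps):

    # Aim for a load average of 5, keep at least 2 worker threads even
    # under load, and scan subvols independently in parallel (mode 1).
    UUID=3399e413-695a-4b0b-9384-1b0ef8f6c4cd
    bees --loadavg-target 5 --thread-min 2 --scan-mode 1 \
        /var/lib/bees/$UUID
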
diff --git a/docs/install.md b/docs/install.md
new file mode 100644
index 0000000..3f75057
--- /dev/null
+++ b/docs/install.md
@@ -0,0 +1,91 @@
+Building bees
+=============
+
+Dependencies
+------------
+
+* C++11 compiler (tested with GCC 4.9, 6.2.0, 8.1.0)
+
+  Sorry. I really like closures and shared_ptr, so support
+  for earlier compiler versions is unlikely.
+
+* btrfs-progs (tested with 4.1..4.15.1) or libbtrfs-dev
+  (tested with version 4.16.1)
+
+  Needed for btrfs.h and ctree.h during compile.
+  Also needed by the service wrapper script.
+
+* libuuid-dev
+
+  This library is only required for a feature that was removed after v0.1.
+  The lingering support code can be removed.
+
+* [Linux kernel version](btrfs-kernel.md) gets its own page.
+
+* markdown for documentation
+
+* util-linux version that provides the `blkid` command, which the helper
+  script `scripts/beesd` needs to work
+
+Installation
+============
+
+bees can be installed by following one of these instructions:
+
+Arch package
+------------
+
+bees is available in the Arch Linux AUR. Install with:
+
+`$ pacaur -S bees-git`
+
+Gentoo package
+--------------
+
+bees is officially available in Gentoo Portage. Just emerge a stable
+version:
+
+`$ emerge --ask bees`
+
+or build a live version from git master:
+
+`$ emerge --ask =bees-9999`
+
+You can opt out of building the support tools with
+
+`USE="-tools" emerge ...`
+
+If you want to start hacking on bees and contribute changes, just emerge
+the live version which automatically pulls in all required development
+packages.
+
+Build from source
+-----------------
+
+Build with `make`. The build produces `bin/bees` and `lib/libcrucible.so`,
+which must be copied to somewhere in `$PATH` and `$LD_LIBRARY_PATH`
+on the target system respectively.
+
+It will also generate `scripts/beesd@.service` for systemd users. This
+service makes use of the helper script `scripts/beesd` to start the
+service. Both of the latter use the filesystem UUID to mount the root
+subvolume within a temporary runtime directory.
+
+### Ubuntu 16.04 - 17.04:
+`$ apt -y install build-essential btrfs-tools uuid-dev markdown && make`
+
+### Ubuntu 14.04:
+You can try to carry on the work done here:
+
+Packaging
+---------
+
+See 'Dependencies' above. Package maintainers can pick ideas for building
+and configuring the source package from the Gentoo ebuild:
+
+
+You can configure some build options by creating a file `localconf` and
+adjusting settings for your distribution environment there.
+
+Please also review the Makefile for additional hints.
diff --git a/docs/missing.md b/docs/missing.md
new file mode 100644
index 0000000..e17dacc
--- /dev/null
+++ b/docs/missing.md
@@ -0,0 +1,50 @@
+Features You Might Expect That bees Doesn't Have
+------------------------------------------------
+
+* There's no configuration file (patches welcome!). There are
+some tunables hardcoded in the source that could eventually become
+configuration options. There's also an incomplete option parser
+(patches welcome!).
+
+* There's no way to *stop* the bees daemon. Use SIGKILL, SIGTERM, or
+Ctrl-C for now. Some of the destructors are unreachable and have never
+been tested. bees checkpoints its progress every 15 minutes (also not
+configurable, patches welcome) and will repeat some work when restarted.
+
+* The bees process doesn't fork and writes its log to stdout/stderr.
+A shell wrapper is required to make it behave more like a daemon.
+
+* There's no facility to exclude any part of a filesystem or focus on
+specific files (patches welcome).
+
+* PREALLOC extents and extents containing blocks filled with zeros will
+be replaced by holes. There is no way to turn this off.
+
+* Consecutive runs of duplicate blocks that are less than 12K in length
+can take 30% of the processing time while saving only 3% of the disk
+space. There should be an option to just not bother with those, but it's
+complicated by the btrfs requirement to always dedupe complete extents.
+
+* There is a lot of duplicate reading of blocks in snapshots. bees will
+scan all snapshots at close to the same time to try to get better
+performance by caching, but really fixing this requires rewriting the
+crawler to scan the btrfs extent tree directly instead of the subvol
+FS trees.
+
+* Block reads are currently more allocation- and CPU-intensive than they
+should be, especially for filesystems on SSD where the IO overhead is
+much smaller. This is a problem for CPU-power-constrained environments
+(e.g. laptops running from battery, or ARM devices with slow CPU).
+
+* bees can currently fragment extents when required to remove duplicate
+blocks, but has no defragmentation capability yet. When possible, bees
+will attempt to work with existing extent boundaries, but it will not
+aggregate blocks together from multiple extents to create larger ones.
+
+* When bees fragments an extent, the copied data is compressed. There
+is currently no way (other than by modifying the source) to select a
+compression method or not compress the data (patches welcome!).
+
+* It is theoretically possible to resize the hash table without starting
+over with a new full-filesystem scan; however, this feature has not been
+implemented yet.
diff --git a/docs/options.md b/docs/options.md
new file mode 100644
index 0000000..741f680
--- /dev/null
+++ b/docs/options.md
@@ -0,0 +1,52 @@
+# bees Command Line Options
+
+| Option | Short | Description |
+| ------ | ----- | ----------- |
+| `--thread-count COUNT` | `-c` | Specify maximum number of worker threads for scanning. Overrides `--thread-factor` (`-C`) and default/autodetected values. |
+| `--thread-factor FACTOR` | `-C` | Specify ratio of worker threads to CPU cores. Overridden by `--thread-count` (`-c`). Default is 1.0, i.e. 1 worker thread per detected CPU. Use values below 1.0 to leave some cores idle, or above 1.0 if there are more disks than CPUs in the filesystem. |
+| `--loadavg-target LOADAVG` | `-g` | Specify load average target for dynamic worker threads. Threads will be started or stopped subject to the upper limit imposed by `--thread-factor`, `--thread-min` and `--thread-count` until the load average is within +/- 0.5 of LOADAVG. |
+| `--thread-min COUNT` | `-G` | Specify minimum number of worker threads for scanning. Ignored unless `-g` is used to specify a target load. |
+| `--scan-mode MODE` | `-m` | Specify extent scanning algorithm. Default mode is 0. EXPERIMENTAL feature that may go away. See the list of scan modes below. |
+| `--timestamps` | `-t` | Enable timestamps in log output. |
+| `--no-timestamps` | `-T` | Disable timestamps in log output. |
+| `--absolute-paths` | `-p` | Paths in log output will be absolute. |
+| `--strip-paths` | `-P` | Paths in log output will have the working directory at bees startup stripped. |
+| `--verbose` | `-v` | Set log verbosity (0 = no output, 8 = all output, default 8). |
+
+The scan modes for `--scan-mode` are:
+
+* Mode 0: scan extents in ascending order of (inode, subvol, offset).
+Keeps shared extents between snapshots together. Reads files sequentially.
+Minimizes temporary space usage.
+
+* Mode 1: scan extents from all subvols in parallel. Good performance
+on non-spinning media when subvols are unrelated.
+
+* Mode 2: scan all extents from one subvol at a time. Good sequential
+read performance for spinning media. Maximizes temporary space usage.
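+
+For illustration, a hypothetical invocation combining several of these
+options might look like this (the thread counts, load target, and mount
+point are example values, not recommendations):
+
+    # At most 4 scanning threads, at least 1, throttled to keep the
+    # load average near 5.0, with timestamped log output.
+    bees --thread-count 4 --thread-min 1 --loadavg-target 5.0 \
+        --timestamps /var/lib/bees/3399e413-695a-4b0b-9384-1b0ef8f6c4cd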
diff --git a/docs/running.md b/docs/running.md
new file mode 100644
index 0000000..d604891
--- /dev/null
+++ b/docs/running.md
@@ -0,0 +1,92 @@
+Running bees
+============
+
+Setup
+-----
+
+If you don't want to use the helper script `scripts/beesd` to set up and
+configure bees, here's how to set up bees manually.
+
+Create a directory for bees state files:
+
+    export BEESHOME=/some/path
+    mkdir -p "$BEESHOME"
+
+Create an empty hash table ([your choice of size](config.md), but it
+must be a multiple of 16MB). This example creates a 1GB hash table:
+
+    truncate -s 1g "$BEESHOME/beeshash.dat"
+    chmod 700 "$BEESHOME/beeshash.dat"
+
+bees can _only_ process the root subvol of a btrfs filesystem with
+nothing mounted over top. If the bees argument is not the root subvol
+directory, bees will just throw an exception and stop.
+
+Use a separate mount point, and let only bees access it:
+
+    UUID=3399e413-695a-4b0b-9384-1b0ef8f6c4cd
+    mkdir -p /var/lib/bees/$UUID
+    mount /dev/disk/by-uuid/$UUID /var/lib/bees/$UUID -osubvol=/
+
+If you don't set BEESHOME, the path "`.beeshome`" will be used relative
+to the root subvol of the filesystem. For example:
+
+    btrfs sub create /var/lib/bees/$UUID/.beeshome
+    truncate -s 1g /var/lib/bees/$UUID/.beeshome/beeshash.dat
+    chmod 700 /var/lib/bees/$UUID/.beeshome/beeshash.dat
+
+You can use any relative path in `BEESHOME`. The path will be taken
+relative to the root of the deduped filesystem (in other words it can
+be the name of a subvol):
+
+    export BEESHOME=@my-beeshome
+    btrfs sub create /var/lib/bees/$UUID/$BEESHOME
+    truncate -s 1g /var/lib/bees/$UUID/$BEESHOME/beeshash.dat
+    chmod 700 /var/lib/bees/$UUID/$BEESHOME/beeshash.dat
+
+Configuration
+-------------
+
+There are some runtime-configurable options using environment variables:
+
+* BEESHOME: Directory containing bees state files:
+  * `beeshash.dat` | persistent hash table. Must be a multiple of 16MB, and must be created before bees starts.
+  * `beescrawl.dat` | state of SEARCH_V2 crawlers. ASCII text. bees will create this.
+  * `beesstats.txt` | statistics and performance counters. ASCII text. bees will create this.
+* BEESSTATUS: File containing a snapshot of current bees state: performance
+  counters and current status of each thread. The file is meant to be
+  human readable, but understanding it probably requires reading the source.
+  You can watch bees run in realtime with a command like:
+
+      watch -n1 cat $BEESSTATUS
+
+Other options (e.g. interval between filesystem crawls) can be configured
+in `src/bees.h` or [on the command line](options.md).
+
+Running
+-------
+
+Reduce CPU and IO priority to be kinder to other applications sharing
+this host (or raise them for more aggressive disk space recovery). If you
+use cgroups, put `bees` in its own cgroup, then reduce the `blkio.weight`
+and `cpu.shares` parameters. You can also use `schedtool` and `ionice`
+in the shell script that launches `bees`:
+
+    schedtool -D -n20 $$
+    ionice -c3 -p $$
+
+You can also use the [`--loadavg-target` and `--thread-min`
+options](options.md) to further control the impact of bees on the rest
+of the system.
+
+Let the bees fly:
+
+    for fs in /var/lib/bees/*-*-*-*-*/; do
+        bees "$fs" >> "$fs/.beeshome/bees.log" 2>&1 &
+    done
+
+You'll probably want to arrange for the log file to be rotated
+periodically. You may also want to set umask to 077 to prevent disclosure
+of information about the contents of the filesystem through the log file.
+
+There are also some shell wrappers in the `scripts/` directory.
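+
+Putting the steps above together, a minimal launch wrapper might look
+like the following sketch. It assumes the mount point, `.beeshome`
+subvol, and hash table have already been created as described in the
+Setup section; `scripts/beesd` is the supported version of this idea:
+
+    #!/bin/sh
+    # Reduce CPU and IO priority for this script and its children.
+    schedtool -D -n20 $$
+    ionice -c3 -p $$
+    # Prevent disclosure of filesystem contents through the log file.
+    umask 077
+    UUID=3399e413-695a-4b0b-9384-1b0ef8f6c4cd
+    MNT=/var/lib/bees/$UUID
+    # Mount the root subvol (subvol=/) if it is not already mounted.
+    mountpoint -q "$MNT" || mount /dev/disk/by-uuid/$UUID "$MNT" -osubvol=/
+    exec bees "$MNT" >> "$MNT/.beeshome/bees.log" 2>&1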